@shd101wyy/yo 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,366 @@
1
+ // Punycode codec (RFC 3492)
2
+ //
3
+ // Provides punycode encoding/decoding and IDN hostname conversion.
4
+ //
5
+ // Example:
6
+ // { punycode_decode, punycode_encode, to_unicode, to_ascii } :: import "std/encoding/punycode";
7
+ //
8
+ // encoded := punycode_encode(`München`);
9
+ // decoded := punycode_decode(encoded);
10
+ // ascii_domain := to_ascii(`münchen.de`); // "xn--mnchen-3ya.de"
11
+ // unicode_domain := to_unicode(ascii_domain); // "münchen.de"
12
+
13
+ open import "../string";
14
+ { ArrayList } :: import "../collections/array_list";
15
+
16
// Punycode constants (RFC 3492 section 5)
// Number of distinct digit values: _decode_digit maps letters to 0..25 and
// '0'-'9' to 26..35.
_BASE :: i32(36);
// Lower clamp for the per-digit threshold `t` in the encoder and decoder.
_TMIN :: i32(1);
// Upper clamp for the per-digit threshold `t`.
_TMAX :: i32(26);
// Added to the denominator of the final bias term computed by _adapt.
_SKEW :: i32(38);
// Divisor applied to the delta on the first _adapt call (first_time == true).
_DAMP :: i32(700);
// Bias in effect before the first adaptation.
_INITIAL_BIAS :: i32(72);
// Starting value of `n` (0x80, the first non-basic code point) in both the
// encoder and the decoder.
_INITIAL_N :: i32(128);
24
+
25
// Map one ASCII punycode digit to its numeric value.
//   'a'..'z' and 'A'..'Z' -> 0..25
//   '0'..'9'              -> 26..35
// Any other input yields -1 so callers can reject the string.
_decode_digit :: (fn(cp: i32) -> i32)({
  // Lowercase letters.
  if(((cp >= i32(0x61)) && (cp <= i32(0x7A))), { return (cp - i32(0x61)); });
  // Uppercase letters decode to the same values as lowercase.
  if(((cp >= i32(0x41)) && (cp <= i32(0x5A))), { return (cp - i32(0x41)); });
  // Decimal digits follow the letters: '0' (0x30) - 22 == 26.
  if(((cp >= i32(0x30)) && (cp <= i32(0x39))), { return (cp - i32(22)); });
  i32(-1)
});
34
+
35
// Map a digit value to its lowercase ASCII punycode character.
//   0..25 -> 'a'..'z'; every other value is offset from '0' (26 -> '0').
// The caller is expected to pass a value in 0..35.
_encode_digit :: (fn(d: i32) -> u8)({
  if((d < i32(26)), { return u8((d + i32(0x61))); });
  u8(((d - i32(26)) + i32(0x30)))
});
42
+
43
// Bias adaptation (RFC 3492 section 3.4).
//
// Parameters:
//   delta_val  - delta that was just encoded or decoded.
//   num_points - number of code points handled so far, including the new one.
//   first_time - true for the very first adaptation, which damps by _DAMP
//                instead of halving.
// Returns the bias to use for the next variable-length integer.
_adapt :: (fn(delta_val: i32, num_points: i32, first_time: bool) -> i32)({
  span := (_BASE - _TMIN);
  limit := ((span * _TMAX) / i32(2));
  divisor := cond(
    first_time => _DAMP,
    true => i32(2)
  );

  (scaled : i32) = (delta_val / divisor);
  scaled = (scaled + (scaled / num_points));

  // Repeatedly shrink the delta, counting one _BASE per division.
  (offset : i32) = i32(0);
  while (scaled > limit), {
    scaled = (scaled / span);
    offset = (offset + _BASE);
  };

  (offset + (((span + i32(1)) * scaled) / (scaled + _SKEW)))
});
57
+
58
// Append the UTF-8 encoding of `cp` to `out`.
//
// Parameters:
//   cp  - code point to encode; the value is not validated here.
//   out - destination byte list, extended in place by 1 to 4 bytes.
_encode_codepoint :: (fn(cp: i32, out: *(ArrayList(u8))) -> unit)({
  // Continuation bytes (10xxxxxx), named by how far the payload is shifted.
  tail_0 := u8((i32(0x80) | (cp & i32(0x3F))));
  tail_6 := u8((i32(0x80) | ((cp >> i32(6)) & i32(0x3F))));
  tail_12 := u8((i32(0x80) | ((cp >> i32(12)) & i32(0x3F))));

  cond(
    // 1 byte: 0xxxxxxx
    (cp < i32(0x80)) => {
      out.*.push(u8(cp));
    },
    // 2 bytes: 110xxxxx 10xxxxxx
    (cp < i32(0x800)) => {
      out.*.push(u8((i32(0xC0) | (cp >> i32(6)))));
      out.*.push(tail_0);
    },
    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    (cp < i32(0x10000)) => {
      out.*.push(u8((i32(0xE0) | (cp >> i32(12)))));
      out.*.push(tail_6);
      out.*.push(tail_0);
    },
    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    true => {
      out.*.push(u8((i32(0xF0) | (cp >> i32(18)))));
      out.*.push(tail_12);
      out.*.push(tail_6);
      out.*.push(tail_0);
    }
  )
});
81
+
82
// Decode the UTF-8 bytes of `s` into a list of code points.
//
// Decoding is lenient: a byte that cannot start a sequence (0x80..0xBF or
// >= 0xF8) becomes U+FFFD, continuation bytes are not validated, and a
// sequence cut short by the end of input uses only the bytes that exist.
_string_to_codepoints :: (fn(s: String) -> ArrayList(i32))({
  raw := s.as_bytes();
  total := raw.len();
  decoded := ArrayList(i32).new();

  (pos : usize) = usize(0);
  while (pos < total), {
    lead := i32(raw.get(pos).unwrap());

    // Sequence length announced by the lead byte.
    (seq_len : usize) = cond(
      ((lead >= i32(0xC0)) && (lead < i32(0xE0))) => usize(2),
      ((lead >= i32(0xE0)) && (lead < i32(0xF0))) => usize(3),
      ((lead >= i32(0xF0)) && (lead < i32(0xF8))) => usize(4),
      true => usize(1)
    );

    // Payload bits carried by the lead byte itself.
    (value : i32) = cond(
      (lead < i32(0x80)) => lead,
      (seq_len == usize(2)) => (lead & i32(0x1F)),
      (seq_len == usize(3)) => (lead & i32(0x0F)),
      (seq_len == usize(4)) => (lead & i32(0x07)),
      true => i32(0xFFFD)
    );

    // Fold in six bits from each continuation byte that is present.
    (offset : usize) = usize(1);
    while (offset < seq_len), (offset = (offset + usize(1))), {
      if(((pos + offset) < total), {
        value = ((value << i32(6)) | (i32(raw.get((pos + offset)).unwrap()) & i32(0x3F)));
      });
    };

    decoded.push(value);
    pos = (pos + seq_len);
  };
  decoded
});
124
+
125
// Decode a punycode-encoded string (without the xn-- prefix), following the
// decoding procedure of RFC 3492 section 6.2.
//
// Parameters:
//   input - ASCII punycode text, e.g. `mnchen-3ya`.
// Returns:
//   .Some(decoded) with the UTF-8 result on success. .None when the basic
//   part contains a non-ASCII byte, a digit is invalid, the input ends in the
//   middle of a variable-length integer, the arithmetic would overflow i32,
//   or a decoded value is not a Unicode scalar value (above U+10FFFF or a
//   surrogate).
punycode_decode :: (fn(input: String) -> Option(String))({
  (bytes : ArrayList(u8)) = input.as_bytes();
  (input_len : i32) = i32(bytes.len());
  (max_int : i32) = i32(0x7FFFFFFF);
  (max_code_point : i32) = i32(0x10FFFF);

  // Find the last '-' separator
  (basic_end : i32) = i32(-1);
  (j : i32) = (input_len - i32(1));
  while ((j >= i32(0)) && (basic_end < i32(0))), {
    if((i32(bytes.get(usize(j)).unwrap()) == i32(0x2D)), {
      basic_end = j;
    });
    j = (j - i32(1));
  };

  // The delimiter only counts when at least one basic code point precedes it
  // (RFC 3492 section 6.2). A '-' at index 0 is left in the extended part,
  // where it is rejected as an invalid digit.
  (output : ArrayList(i32)) = ArrayList(i32).new();
  (basic_length : i32) = cond(
    (basic_end > i32(0)) => basic_end,
    true => i32(0)
  );

  // Copy the basic (ASCII) code points verbatim.
  (bi : i32) = i32(0);
  while (bi < basic_length), {
    (cp : i32) = i32(bytes.get(usize(bi)).unwrap());
    if((cp >= i32(0x80)), {
      return .None;
    });
    output.push(cp);
    bi = (bi + i32(1));
  };

  (idx : i32) = cond(
    (basic_length > i32(0)) => (basic_length + i32(1)),
    true => i32(0)
  );
  (n : i32) = _INITIAL_N;
  (bias : i32) = _INITIAL_BIAS;
  (i_val : i32) = i32(0);

  while (idx < input_len), {
    (old_i : i32) = i_val;
    (w : i32) = i32(1);
    (k : i32) = _BASE;
    (decode_done : bool) = false;

    // Decode one generalized variable-length integer into i_val.
    while (!(decode_done)), {
      if((idx >= input_len), {
        return .None;
      });
      (digit : i32) = _decode_digit(i32(bytes.get(usize(idx)).unwrap()));
      idx = (idx + i32(1));
      if((digit < i32(0)), {
        return .None;
      });

      // Overflow guard: i_val + digit * w must fit in i32.
      if((digit > ((max_int - i_val) / w)), {
        return .None;
      });
      i_val = (i_val + (digit * w));

      (t : i32) = cond(
        (k <= bias) => _TMIN,
        (k >= (bias + _TMAX)) => _TMAX,
        true => (k - bias)
      );

      if((digit < t), {
        decode_done = true;
      }, {
        // Overflow guard: w * (_BASE - t) must fit in i32.
        if((w > (max_int / (_BASE - t))), {
          return .None;
        });
        w = (w * (_BASE - t));
        k = (k + _BASE);
      });
    };

    (out_len : i32) = (i32(output.len()) + i32(1));
    bias = _adapt((i_val - old_i), out_len, (old_i == i32(0)));

    // Reject anything that would push n past U+10FFFF. Because n never
    // exceeds that bound, this also rules out i32 overflow of n.
    if(((i_val / out_len) > (max_code_point - n)), {
      return .None;
    });
    n = (n + (i_val / out_len));
    // Surrogates cannot be represented in well-formed UTF-8.
    if(((n >= i32(0xD800)) && (n <= i32(0xDFFF))), {
      return .None;
    });
    i_val = (i_val % out_len);

    // Insert code point at position i_val: grow by one slot, then shift the
    // tail right to open a gap.
    output.push(i32(0));
    (shift_idx : i32) = (i32(output.len()) - i32(1));
    while (shift_idx > i_val), {
      _ := output.set(usize(shift_idx), output.get(usize((shift_idx - i32(1)))).unwrap());
      shift_idx = (shift_idx - i32(1));
    };
    _ := output.set(usize(i_val), n);
    i_val = (i_val + i32(1));
  };

  // Convert code points to UTF-8
  (result_bytes : ArrayList(u8)) = ArrayList(u8).new();
  (ri : usize) = usize(0);
  while (ri < output.len()), {
    _encode_codepoint(output.get(ri).unwrap(), (&result_bytes));
    ri = (ri + usize(1));
  };
  .Some(String.from_bytes(result_bytes))
});
222
+
223
// Encode a Unicode string to punycode (without the xn-- prefix), following
// the encoding procedure of RFC 3492 section 6.3.
//
// Parameters:
//   input - UTF-8 text to encode.
// Returns the ASCII punycode form: the basic (ASCII) code points in their
// original order, a '-' delimiter if there were any, then the encoded deltas.
// NOTE(review): the delta arithmetic is plain i32 with no overflow check, so
// extremely long inputs are not guarded against.
punycode_encode :: (fn(input: String) -> String)({
  code_points := _string_to_codepoints(input);
  total := i32(code_points.len());
  encoded := ArrayList(u8).new();

  // Pass 1: copy the basic code points straight to the output.
  (ascii_count : i32) = i32(0);
  (scan : i32) = i32(0);
  while (scan < total), (scan = (scan + i32(1))), {
    ch := code_points.get(usize(scan)).unwrap();
    if((ch < i32(0x80)), {
      encoded.push(u8(ch));
      ascii_count = (ascii_count + i32(1));
    });
  };

  // The delimiter is only written when a basic prefix exists.
  if((ascii_count > i32(0)), {
    encoded.push(u8(0x2D));
  });

  (done_count : i32) = ascii_count;
  (current : i32) = _INITIAL_N;
  (pending : i32) = i32(0);
  (bias : i32) = _INITIAL_BIAS;

  // Pass 2: handle the remaining code points in increasing numeric order.
  while (done_count < total), {
    // Smallest code point that has not been handled yet (>= current).
    (next_cp : i32) = i32(0x7FFFFFFF);
    (probe : i32) = i32(0);
    while (probe < total), (probe = (probe + i32(1))), {
      candidate := code_points.get(usize(probe)).unwrap();
      if(((candidate >= current) && (candidate < next_cp)), {
        next_cp = candidate;
      });
    };

    // Advance the decoder state machine from `current` to `next_cp`.
    pending = (pending + ((next_cp - current) * (done_count + i32(1))));
    current = next_cp;

    (walk : i32) = i32(0);
    while (walk < total), (walk = (walk + i32(1))), {
      item := code_points.get(usize(walk)).unwrap();
      if((item < current), {
        pending = (pending + i32(1));
      });
      if((item == current), {
        // Emit `pending` as a generalized variable-length integer.
        (remaining : i32) = pending;
        (step : i32) = _BASE;
        (emitted : bool) = false;
        while (!(emitted)), {
          threshold := cond(
            (step <= bias) => _TMIN,
            (step >= (bias + _TMAX)) => _TMAX,
            true => (step - bias)
          );
          if((remaining < threshold), {
            encoded.push(_encode_digit(remaining));
            emitted = true;
          }, {
            radix := (_BASE - threshold);
            excess := (remaining - threshold);
            encoded.push(_encode_digit((threshold + (excess % radix))));
            remaining = (excess / radix);
            step = (step + _BASE);
          });
        };
        bias = _adapt(pending, (done_count + i32(1)), (done_count == ascii_count));
        pending = i32(0);
        done_count = (done_count + i32(1));
      });
    };

    pending = (pending + i32(1));
    current = (current + i32(1));
  };

  String.from_bytes(encoded)
});
302
+
303
// Convert an IDN hostname to its Unicode display form.
//
// The hostname is split on '.'; each label whose lowercase form starts with
// `xn--` is punycode-decoded, and a label that fails to decode is kept
// verbatim (prefix included). All other labels pass through unchanged.
to_unicode :: (fn(hostname: String) -> String)({
  labels := hostname.split(`.`);
  (joined : String) = ``;
  (index : usize) = usize(0);
  while (index < labels.len()), (index = (index + usize(1))), {
    (label : String) = labels.get(index).unwrap();

    // Labels after the first are preceded by the separator.
    if((index > usize(0)), {
      joined = `${joined}.`;
    });

    // Choose what to show for this label, then append it once.
    (shown : String) = cond(
      (label.to_lowercase().starts_with(`xn--`)) => match(punycode_decode(label.substring(usize(4), label.len())),
        .Some(decoded) => decoded,
        // Keep the original label including xn-- prefix on decode failure
        .None => label
      ),
      true => label
    );
    joined = `${joined}${shown}`;
  };
  joined
});
333
+
334
// Convert a Unicode hostname to its ASCII (punycode) form.
//
// The hostname is split on '.'; a label containing any byte >= 0x80 is
// replaced by `xn--` followed by its punycode encoding, and pure-ASCII labels
// are copied unchanged.
to_ascii :: (fn(hostname: String) -> String)({
  labels := hostname.split(`.`);
  (joined : String) = ``;
  (index : usize) = usize(0);
  while (index < labels.len()), (index = (index + usize(1))), {
    (label : String) = labels.get(index).unwrap();

    // Labels after the first are preceded by the separator.
    if((index > usize(0)), {
      joined = `${joined}.`;
    });

    // Scan the raw bytes; one byte with the high bit set is enough to decide.
    raw := label.as_bytes();
    (needs_encoding : bool) = false;
    (offset : usize) = usize(0);
    while ((offset < raw.len()) && (!(needs_encoding))), (offset = (offset + usize(1))), {
      if((i32(raw.get(offset).unwrap()) >= i32(0x80)), {
        needs_encoding = true;
      });
    };

    if(needs_encoding, {
      (encoded : String) = punycode_encode(label);
      joined = `${joined}xn--${encoded}`;
    }, {
      joined = `${joined}${label}`;
    });
  };
  joined
});
365
+
366
+ export punycode_decode, punycode_encode, to_unicode, to_ascii;
@@ -203,24 +203,25 @@ impl(rune, ToString(
203
203
  // 0x80-0x7FF: 2 bytes
204
204
  // 0x800-0xFFFF: 3 bytes (excluding surrogates 0xD800-0xDFFF)
205
205
  // 0x10000-0x10FFFF: 4 bytes
206
+ // Use 5-byte buffer to always have space for null terminator
206
207
 
207
208
  buffer := cond(
208
209
  (code <= 0x7F) => {
209
210
  // 1-byte encoding: 0xxxxxxx
210
- arr := Array(u8, usize(4)).fill(0);
211
+ arr := Array(u8, usize(5)).fill(0);
211
212
  arr(0) = u8(code);
212
213
  arr
213
214
  },
214
215
  (code <= 0x7FF) => {
215
216
  // 2-byte encoding: 110xxxxx 10xxxxxx
216
- arr := Array(u8, usize(4)).fill(0);
217
+ arr := Array(u8, usize(5)).fill(0);
217
218
  arr(0) = u8(u32(0xC0) | ((code >> 6) & 0x1F));
218
219
  arr(1) = u8(u32(0x80) | (code & 0x3F));
219
220
  arr
220
221
  },
221
222
  (code <= 0xFFFF) => {
222
223
  // 3-byte encoding: 1110xxxx 10xxxxxx 10xxxxxx
223
- arr := Array(u8, usize(4)).fill(0);
224
+ arr := Array(u8, usize(5)).fill(0);
224
225
  arr(0) = u8(u32(0xE0) | ((code >> 12) & 0x0F));
225
226
  arr(1) = u8(u32(0x80) | ((code >> 6) & 0x3F));
226
227
  arr(2) = u8(u32(0x80) | (code & 0x3F));
@@ -228,7 +229,7 @@ impl(rune, ToString(
228
229
  },
229
230
  true => {
230
231
  // 4-byte encoding: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
231
- arr := Array(u8, usize(4)).fill(0);
232
+ arr := Array(u8, usize(5)).fill(0);
232
233
  arr(0) = u8(u32(0xF0) | ((code >> 18) & 0x07));
233
234
  arr(1) = u8(u32(0x80) | ((code >> 12) & 0x3F));
234
235
  arr(2) = u8(u32(0x80) | ((code >> 6) & 0x3F));
package/std/glob/index.yo CHANGED
@@ -187,7 +187,7 @@ _glob_match_impl :: (fn(pb: ArrayList(u8), pi: usize, tb: ArrayList(u8), ti: usi
187
187
  });
188
188
 
189
189
  glob_match :: (fn(pattern: String, text: String) -> bool)(
190
- _glob_match_impl(pattern._bytes, usize(0), text._bytes, usize(0))
190
+ _glob_match_impl(pattern.as_bytes(), usize(0), text.as_bytes(), usize(0))
191
191
  );
192
192
 
193
193
  GlobPattern :: object(
@@ -199,7 +199,7 @@ impl(GlobPattern,
199
199
  Self(_pattern: pattern)
200
200
  ),
201
201
  matches : (fn(self: Self, text: String) -> bool)(
202
- _glob_match_impl(self._pattern._bytes, usize(0), text._bytes, usize(0))
202
+ _glob_match_impl(self._pattern.as_bytes(), usize(0), text.as_bytes(), usize(0))
203
203
  )
204
204
  );
205
205
 
@@ -0,0 +1,55 @@
1
+ // C11 wctype.h - Wide character classification and conversion functions
2
+ // Provides functions to test and convert wide character types
3
+
4
+ { wint_t } :: import "./stdint";
5
+
6
c_include "<wctype.h>",
  // Classification functions: each takes a wide character and returns an int.
  iswalnum : fn(wc : wint_t) -> int,
  iswalpha : fn(wc : wint_t) -> int,
  iswblank : fn(wc : wint_t) -> int,
  iswcntrl : fn(wc : wint_t) -> int,
  iswdigit : fn(wc : wint_t) -> int,
  iswgraph : fn(wc : wint_t) -> int,
  iswlower : fn(wc : wint_t) -> int,
  iswprint : fn(wc : wint_t) -> int,
  iswpunct : fn(wc : wint_t) -> int,
  iswspace : fn(wc : wint_t) -> int,
  iswupper : fn(wc : wint_t) -> int,
  iswxdigit : fn(wc : wint_t) -> int,

  // Case-conversion functions: each takes and returns a wide character.
  towlower : fn(wc : wint_t) -> wint_t,
  towupper : fn(wc : wint_t) -> wint_t
;
39
+
40
+ export
41
+ iswalnum,
42
+ iswalpha,
43
+ iswblank,
44
+ iswcntrl,
45
+ iswdigit,
46
+ iswgraph,
47
+ iswlower,
48
+ iswprint,
49
+ iswpunct,
50
+ iswspace,
51
+ iswupper,
52
+ iswxdigit,
53
+ towlower,
54
+ towupper
55
+ ;
package/std/path.yo CHANGED
@@ -67,7 +67,7 @@ impl(Path,
67
67
  // Check if path is absolute
68
68
  // Unix: starts with '/'
69
69
  // Windows: starts with drive letter like 'C:' or UNC path '\\'
70
- bytes := normalized._bytes;
70
+ bytes := normalized.as_bytes();
71
71
  cond(
72
72
  (bytes.len() > usize(0)) => {
73
73
  first_byte := bytes.get(usize(0));
@@ -128,7 +128,7 @@ impl(Path,
128
128
  true => {
129
129
  // Check if it's "." (current directory)
130
130
  is_dot := ((part.len() == usize(1)) && {
131
- byte_opt := part._bytes.get(usize(0));
131
+ byte_opt := part.as_bytes().get(usize(0));
132
132
  match(byte_opt,
133
133
  .Some(b) => (b == u8(46)),
134
134
  .None => false
@@ -137,8 +137,8 @@ impl(Path,
137
137
 
138
138
  // Check if it's ".." (parent directory)
139
139
  is_dotdot := ((part.len() == usize(2)) && {
140
- b0_opt := part._bytes.get(usize(0));
141
- b1_opt := part._bytes.get(usize(1));
140
+ b0_opt := part.as_bytes().get(usize(0));
141
+ b1_opt := part.as_bytes().get(usize(1));
142
142
  match(b0_opt,
143
143
  .Some(b0) => match(b1_opt,
144
144
  .Some(b1) => ((b0 == u8(46)) && (b1 == u8(46))),
@@ -605,8 +605,8 @@ impl(Path, ToString(
605
605
  // Check if it's a drive letter like "C:"
606
606
  cond(
607
607
  (first_seg.len() == usize(2)) => {
608
- b0_opt := first_seg._bytes.get(usize(0));
609
- b1_opt := first_seg._bytes.get(usize(1));
608
+ b0_opt := first_seg.as_bytes().get(usize(0));
609
+ b1_opt := first_seg.as_bytes().get(usize(1));
610
610
  match(b0_opt,
611
611
  .Some(b0) => match(b1_opt,
612
612
  .Some(b1) => {
package/std/prelude.yo CHANGED
@@ -96,6 +96,8 @@ extern "Yo",
96
96
  fn(forall(T: Type), slice: Slice(T)) -> usize,
97
97
  __yo_slice_new :
98
98
  fn(forall(T: Type), ptr: *(T), length: usize) -> Slice(T),
99
+ __yo_slice_ptr :
100
+ fn(forall(T: Type), slice: Slice(T)) -> *(T),
99
101
 
100
102
  // C macro related
101
103
  __yo_c_macro_defined : (fn(comptime(name) : comptime_string) -> comptime(bool)),
@@ -3196,6 +3198,9 @@ impl(forall(T : Type), Slice(T),
3196
3198
  ),
3197
3199
  len : (fn(self : Self) -> usize)(
3198
3200
  __yo_slice_len(self)
3201
+ ),
3202
+ ptr : (fn(self : Self) -> *(T))(
3203
+ __yo_slice_ptr(self)
3199
3204
  )
3200
3205
  );
3201
3206
 
@@ -3209,6 +3214,9 @@ impl(str,
3209
3214
  ),
3210
3215
  len : (fn(self : Self) -> usize)(
3211
3216
  __yo_slice_len(self.bytes)
3217
+ ),
3218
+ ptr : (fn(self : Self) -> *(u8))(
3219
+ __yo_slice_ptr(self.bytes)
3212
3220
  )
3213
3221
  );
3214
3222
 
@@ -189,6 +189,27 @@ impl(RegexParser,
189
189
  r
190
190
  }),
191
191
 
192
// Parse the two hex digits of a \xHH escape, starting at the current position.
//
// Returns .Some(codepoint) and advances the position by two on success.
// Returns .None, leaving the position untouched, when fewer than two bytes
// remain or either byte is not a hex digit.
_parse_hex_byte : (fn(self : Self) -> Option(u32))({
  start := self._pos;
  if(((start + usize(2)) > self._bytes.len()), { return .None; });

  (hi_byte : u8) = self._bytes.get(start).unwrap();
  (lo_byte : u8) = self._bytes.get((start + usize(1))).unwrap();

  // High nibble: '0'-'9', 'A'-'F' or 'a'-'f'.
  (hi : i32) = cond(
    ((hi_byte >= u8(0x30)) && (hi_byte <= u8(0x39))) => (i32(hi_byte) - i32(0x30)),
    ((hi_byte >= u8(0x41)) && (hi_byte <= u8(0x46))) => ((i32(hi_byte) - i32(0x41)) + i32(10)),
    ((hi_byte >= u8(0x61)) && (hi_byte <= u8(0x66))) => ((i32(hi_byte) - i32(0x61)) + i32(10)),
    true => { return .None; }
  );
  // Low nibble, same alphabet.
  (lo : i32) = cond(
    ((lo_byte >= u8(0x30)) && (lo_byte <= u8(0x39))) => (i32(lo_byte) - i32(0x30)),
    ((lo_byte >= u8(0x41)) && (lo_byte <= u8(0x46))) => ((i32(lo_byte) - i32(0x41)) + i32(10)),
    ((lo_byte >= u8(0x61)) && (lo_byte <= u8(0x66))) => ((i32(lo_byte) - i32(0x61)) + i32(10)),
    true => { return .None; }
  );

  // Both digits are valid: consume them and combine the nibbles.
  self._pos = (start + usize(2));
  .Some(u32(((hi * i32(16)) + lo)))
}),
212
+
192
213
  _parse_class_escape : (fn(self : Self) -> Result(ArrayList(CharRange), String))({
193
214
  b := self._advance();
194
215
  match(b,
@@ -219,6 +240,14 @@ impl(RegexParser,
219
240
  r.push(CharRange(low: u32(33), high: u32(0x10FFFF)));
220
241
  .Ok(r)
221
242
  },
243
+ (ch == u8(120)) => {
244
+ r := ArrayList(CharRange).new();
245
+ match(self._parse_hex_byte(),
246
+ .Some(v) => r.push(CharRange(low: v, high: v)),
247
+ .None => r.push(CharRange(low: u32(ch), high: u32(ch)))
248
+ );
249
+ .Ok(r)
250
+ },
222
251
  true => {
223
252
  r := ArrayList(CharRange).new();
224
253
  codepoint := self._escape_char_codepoint(ch);
@@ -246,6 +275,31 @@ impl(RegexParser,
246
275
  (end_first == u8(93)) => {
247
276
  ranges.push(CharRange(low: low, high: low));
248
277
  },
278
+ (end_first == u8(92)) => {
279
+ // High end is an escape sequence (e.g. \x20, \0, \n)
280
+ self._pos = (self._pos + usize(1));
281
+ self._pos = (self._pos + usize(1));
282
+ esc := self._parse_class_escape();
283
+ match(esc,
284
+ .Ok(esc_ranges) => {
285
+ if(((esc_ranges.len() == usize(1)) && (esc_ranges.get(usize(0)).unwrap().low == esc_ranges.get(usize(0)).unwrap().high)), {
286
+ (high : u32) = esc_ranges.get(usize(0)).unwrap().low;
287
+ ranges.push(CharRange(low: low, high: high));
288
+ }, {
289
+ // Multi-range escape like \d can't be range endpoint; treat dash as literal
290
+ ranges.push(CharRange(low: low, high: low));
291
+ ranges.push(CharRange(low: u32(45), high: u32(45)));
292
+ j := usize(0);
293
+ while (j < esc_ranges.len()), (j = (j + usize(1))), {
294
+ ranges.push(esc_ranges.get(j).unwrap());
295
+ };
296
+ });
297
+ },
298
+ .Err(_e) => {
299
+ ranges.push(CharRange(low: low, high: low));
300
+ }
301
+ );
302
+ },
249
303
  true => {
250
304
  // Consume dash
251
305
  self._pos = (self._pos + usize(1));
@@ -284,10 +338,16 @@ impl(RegexParser,
284
338
  esc := self._parse_class_escape();
285
339
  match(esc,
286
340
  .Ok(esc_ranges) => {
287
- j := usize(0);
288
- while (j < esc_ranges.len()), (j = (j + usize(1))), {
289
- ranges.push(esc_ranges.get(j).unwrap());
290
- };
341
+ // If escape produced a single codepoint, check for range (e.g. \0-\x20)
342
+ if(((esc_ranges.len() == usize(1)) && (esc_ranges.get(usize(0)).unwrap().low == esc_ranges.get(usize(0)).unwrap().high)), {
343
+ (low : u32) = esc_ranges.get(usize(0)).unwrap().low;
344
+ self._try_parse_char_range(ranges, low);
345
+ }, {
346
+ j := usize(0);
347
+ while (j < esc_ranges.len()), (j = (j + usize(1))), {
348
+ ranges.push(esc_ranges.get(j).unwrap());
349
+ };
350
+ });
291
351
  },
292
352
  .Err(e) => { return .Err(e); }
293
353
  );
@@ -452,6 +512,11 @@ impl(RegexParser,
452
512
  (ch == u8(112)) => self._parse_unicode_property(false),
453
513
  // Negated unicode property \P{Name}
454
514
  (ch == u8(80)) => self._parse_unicode_property(true),
515
+ // Hex escape \xHH
516
+ (ch == u8(120)) => match(self._parse_hex_byte(),
517
+ .Some(v) => .Ok(RegexNode.literal(v)),
518
+ .None => .Ok(RegexNode.literal(u32(ch)))
519
+ ),
455
520
  true => .Ok(RegexNode.literal(self._escape_char_codepoint(ch)))
456
521
  ),
457
522
  .None => .Err(`Unexpected end of pattern after backslash`)