@shd101wyy/yo 0.0.28 → 0.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,847 @@
1
+ // std/regex/parser.yo - Regex pattern parser
2
+ //
3
+ // Parses a regex pattern string into an AST of RegexNode objects.
4
+ // Uses an iterative stack-based approach to avoid mutual recursion.
5
+
6
+ open import "std/collections/array_list";
7
+ open import "std/string";
8
+ { RegexNode, NodeKind, CharRange, AnchorKind, GroupNameEntry } :: import "./node.yo";
9
+ { unicode_property_ranges } :: import "./unicode.yo";
10
+
11
+ // A parse frame for tracking alternation/sequence state
12
+ ParseFrame :: struct( // Snapshot of the enclosing level's parse state; parse pushes one on '(' and restores it on ')'
13
+ alternatives : ArrayList(ArrayList(RegexNode)), // completed '|' branches of the saved level
14
+ current : ArrayList(RegexNode), // nodes of the branch that was still being built
15
+ is_non_capturing : bool, // saved level is non-capturing (parse also sets this for lookarounds)
16
+ group_index : usize, // capture index of the saved level; 0 when it has none
17
+ is_lookahead : bool, // saved level is a lookahead group
18
+ is_lookbehind : bool, // saved level is a lookbehind group
19
+ is_positive : bool // polarity of the saved lookaround (true = positive)
20
+ );
21
+
22
+ // Parser state object
23
+ RegexParser :: object( // Mutable parser state for a single regex pattern
24
+ _source : String, // the pattern exactly as passed to new
25
+ _bytes : ArrayList(u8), // pattern.as_bytes(); every scanning method indexes into this
26
+ _pos : usize, // index of the next unread byte in _bytes
27
+ _group_count : usize, // number of capturing groups opened so far
28
+ _group_names : ArrayList(GroupNameEntry) // name/index entries recorded for (?<name>...) groups
29
+ );
30
+
31
+ // First impl block: utility + leaf parsers (defined bottom-up)
32
+ impl(RegexParser,
33
+ new : (fn(pattern : String) -> Self)( // Build a parser positioned at byte 0 of pattern with no groups recorded
34
+ Self(
35
+ _source: pattern,
36
+ _bytes: pattern.as_bytes(), // byte view used by all scanning methods
37
+ _pos: usize(0),
38
+ _group_count: usize(0),
39
+ _group_names: ArrayList(GroupNameEntry).new()
40
+ )
41
+ ),
42
+
43
+ _peek : (fn(self : Self) -> Option(u8))( // Look at the byte under the cursor without consuming it; .None at end of input
44
+ cond(
45
+ (!(self._pos >= self._bytes.len())) => self._bytes.get(self._pos),
46
+ true => .None // cursor is past the last byte
47
+ )
48
+ ),
49
+
50
+ _advance : (fn(self : Self) -> Option(u8))({ // Consume one byte: .None at end of input, otherwise the byte read (cursor moves by one)
51
+ cond(
52
+ (!(self._pos < self._bytes.len())) => {
53
+ return .None;
54
+ },
55
+ true => ()
56
+ );
57
+ consumed := self._bytes.get(self._pos);
58
+ self._pos = (usize(1) + self._pos);
59
+ consumed
60
+ }),
61
+
62
+ // Turn a UTF-8 lead byte (already consumed by the caller) into a full codepoint.
63
+ // Consumes 0, 1, 2 or 3 further bytes at self._pos; those bytes are unwrap()ped without any validation.
64
+ _read_codepoint : (fn(self : Self, first : u8) -> u32)(
65
+ cond(
66
+ (first < u8(0x80)) => u32(first), // single-byte (ASCII) form
67
+ ((first >= u8(0xC0)) && (first < u8(0xE0))) => { // two-byte form
68
+ cont1 := self._bytes.get(self._pos).unwrap();
69
+ self._pos = (self._pos + usize(1));
70
+ ((u32(cont1) & u32(0x3F)) | ((u32(first) & u32(0x1F)) << u32(6)))
71
+ },
72
+ ((first >= u8(0xE0)) && (first < u8(0xF0))) => { // three-byte form
73
+ cont1 := self._bytes.get(self._pos).unwrap();
74
+ cont2 := self._bytes.get((self._pos + usize(1))).unwrap();
75
+ self._pos = (self._pos + usize(2));
76
+ ((u32(cont2) & u32(0x3F)) | (((u32(cont1) & u32(0x3F)) << u32(6)) | ((u32(first) & u32(0x0F)) << u32(12))))
77
+ },
78
+ true => { // every other lead byte is decoded as the four-byte form
79
+ cont1 := self._bytes.get(self._pos).unwrap();
80
+ cont2 := self._bytes.get((self._pos + usize(1))).unwrap();
81
+ cont3 := self._bytes.get((self._pos + usize(2))).unwrap();
82
+ self._pos = (self._pos + usize(3));
83
+ ((u32(cont3) & u32(0x3F)) | (((u32(cont2) & u32(0x3F)) << u32(6)) | (((u32(cont1) & u32(0x3F)) << u32(12)) | ((u32(first) & u32(0x07)) << u32(18)))))
84
+ }
85
+ )
86
+ ),
87
+
88
+ _at_end : (fn(self : Self) -> bool)( // True once the cursor has moved past the last pattern byte
89
+ (!(self._pos < self._bytes.len()))
90
+ ),
91
+
92
+ group_count : (fn(self : Self) -> usize)( // Number of capturing groups counted so far (parse increments it as groups open)
93
+ self._group_count
94
+ ),
95
+
96
+ group_names : (fn(self : Self) -> ArrayList(GroupNameEntry))( // Entries recorded for named groups; returns the parser's own list, not a copy made here
97
+ self._group_names
98
+ ),
99
+
100
+ _lookup_group_name : (fn(self : Self, name : String) -> Option(usize))({ // Linear search of the recorded named groups; first entry whose name equals name wins
101
+ slot := usize(0);
102
+ while (slot < self._group_names.len()), (slot = (slot + usize(1))), {
103
+ candidate := self._group_names.get(slot).unwrap();
104
+ cond(
105
+ (candidate.name == name) => { return .Some(candidate.index); },
106
+ true => ()
107
+ );
108
+ };
109
+ .None // no group with that name has been recorded yet
110
+ }),
111
+
112
+ _escape_char_codepoint : (fn(self : Self, ch : u8) -> u32)( // Codepoint denoted by the escape whose first byte ch the caller has just consumed
113
+ cond(
114
+ (ch == u8(110)) => u32(10), // \n -> line feed
115
+ (ch == u8(116)) => u32(9), // \t -> horizontal tab
116
+ (ch == u8(114)) => u32(13), // \r -> carriage return
117
+ (ch == u8(102)) => u32(12), // \f -> form feed
118
+ (ch == u8(118)) => u32(11), // \v -> vertical tab
119
+ (ch == u8(48)) => u32(0), // \0 -> NUL
120
+ true => self._read_codepoint(ch) // any other char stands for itself; decoding consumes the continuation bytes of a multi-byte char instead of returning only its lead byte (ASCII still yields u32(ch))
121
+ )
122
+ ),
123
+
124
+ _parse_number : (fn(self : Self) -> Option(usize))({ // Read a run of ASCII digits at the cursor as a decimal usize; .None (nothing consumed) when no digit is present
125
+ digits_start := self._pos;
126
+ while (!(self._at_end())), { // first pass: move the cursor past every digit
127
+ upcoming := self._peek();
128
+ saw_digit := match(upcoming,
129
+ .Some(d) => ((d <= u8(57)) && (d >= u8(48))),
130
+ .None => false
131
+ );
132
+ cond(
133
+ saw_digit => {
134
+ self._pos = (self._pos + usize(1));
135
+ },
136
+ true => { break; }
137
+ );
138
+ };
139
+ cond(
140
+ (digits_start == self._pos) => {
141
+ return .None;
142
+ },
143
+ true => ()
144
+ );
145
+ total := usize(0);
146
+ scan := digits_start;
147
+ while (scan < self._pos), (scan = (scan + usize(1))), { // second pass: accumulate the value; no overflow check is performed
148
+ digit_byte := self._bytes.get(scan).unwrap();
149
+ total = ((total * usize(10)) + usize((digit_byte - u8(48))));
150
+ };
151
+ .Some(total)
152
+ }),
153
+
154
+ _parse_greedy_modifier : (fn(self : Self) -> bool)({ // Returns the greedy flag for a quantifier: a trailing '?' is consumed and yields false, otherwise true
155
+ upcoming := self._peek();
156
+ lazy_mark := match(upcoming,
157
+ .Some(sym) => (sym == u8(63)),
158
+ .None => false
159
+ );
160
+ cond(
161
+ lazy_mark => {
162
+ self._pos = (self._pos + usize(1)); // step over the '?'
163
+ return false;
164
+ },
165
+ true => ()
166
+ );
167
+ true
168
+ }),
169
+
170
+ _make_digit_ranges : (fn(self : Self) -> ArrayList(CharRange))({ // Fresh range list for \d: '0'..'9'
171
+ digit_set := ArrayList(CharRange).new();
172
+ digit_set.push(CharRange(low: u32(48), high: u32(57)));
173
+ digit_set
174
+ }),
175
+
176
+ _make_word_ranges : (fn(self : Self) -> ArrayList(CharRange))({ // Fresh range list for \w: ASCII digits, upper-case letters, '_' and lower-case letters
177
+ word_set := ArrayList(CharRange).new();
178
+ word_set.push(CharRange(low: u32(48), high: u32(57))); // '0'..'9'
179
+ word_set.push(CharRange(low: u32(65), high: u32(90))); // 'A'..'Z'
180
+ word_set.push(CharRange(low: u32(95), high: u32(95))); // '_'
181
+ word_set.push(CharRange(low: u32(97), high: u32(122))); // 'a'..'z'
182
+ word_set
183
+ }),
184
+
185
+ _make_space_ranges : (fn(self : Self) -> ArrayList(CharRange))({ // Fresh range list for \s: codepoints 9..13 and the space character
186
+ space_set := ArrayList(CharRange).new();
187
+ space_set.push(CharRange(low: u32(9), high: u32(13))); // tab through carriage return
188
+ space_set.push(CharRange(low: u32(32), high: u32(32))); // ' '
189
+ space_set
190
+ }),
191
+
192
+ _parse_class_escape : (fn(self : Self) -> Result(ArrayList(CharRange), String))({ // Translate the escape following a '\' inside [...] into the ranges it contributes
193
+ escaped := self._advance();
194
+ match(escaped,
195
+ .Some(code) =>
196
+ cond(
197
+ (code == u8(100)) => .Ok(self._make_digit_ranges()), // \d
198
+ (code == u8(68)) => { // \D: everything outside '0'..'9'
199
+ not_digit := ArrayList(CharRange).new();
200
+ not_digit.push(CharRange(low: u32(0), high: u32(47)));
201
+ not_digit.push(CharRange(low: u32(58), high: u32(0x10FFFF)));
202
+ .Ok(not_digit)
203
+ },
204
+ (code == u8(119)) => .Ok(self._make_word_ranges()), // \w
205
+ (code == u8(87)) => { // \W: the gaps between the \w ranges
206
+ not_word := ArrayList(CharRange).new();
207
+ not_word.push(CharRange(low: u32(0), high: u32(47)));
208
+ not_word.push(CharRange(low: u32(58), high: u32(64)));
209
+ not_word.push(CharRange(low: u32(91), high: u32(94)));
210
+ not_word.push(CharRange(low: u32(96), high: u32(96)));
211
+ not_word.push(CharRange(low: u32(123), high: u32(0x10FFFF)));
212
+ .Ok(not_word)
213
+ },
214
+ (code == u8(115)) => .Ok(self._make_space_ranges()), // \s
215
+ (code == u8(83)) => { // \S: the gaps between the \s ranges
216
+ not_space := ArrayList(CharRange).new();
217
+ not_space.push(CharRange(low: u32(0), high: u32(8)));
218
+ not_space.push(CharRange(low: u32(14), high: u32(31)));
219
+ not_space.push(CharRange(low: u32(33), high: u32(0x10FFFF)));
220
+ .Ok(not_space)
221
+ },
222
+ true => { // any other escape contributes exactly one codepoint
223
+ single := ArrayList(CharRange).new();
224
+ point := self._escape_char_codepoint(code);
225
+ single.push(CharRange(low: point, high: point));
226
+ .Ok(single)
227
+ }
228
+ ),
229
+ .None => .Err(`Unexpected end of pattern after backslash`)
230
+ )
231
+ }),
232
+
233
+ _try_parse_char_range : (fn(self : Self, ranges : ArrayList(CharRange), low : u32) -> unit)({ // Push low..high when a '-' plus an upper bound follow, otherwise push low as a one-element range
234
+ upcoming := self._peek();
235
+ dash_follows := match(upcoming,
236
+ .Some(sym) => (sym == u8(45)),
237
+ .None => false
238
+ );
239
+ cond(
240
+ dash_follows => {
241
+ bound_exists := ((self._pos + usize(1)) < self._bytes.len());
242
+ cond(
243
+ bound_exists => {
244
+ upper_lead := self._bytes.get((self._pos + usize(1))).unwrap();
245
+ cond(
246
+ (upper_lead == u8(93)) => { // "-]": the dash is left unconsumed for the caller to handle
247
+ ranges.push(CharRange(low: low, high: low));
248
+ },
249
+ true => {
250
+ // Step over the '-' and over the lead byte of the upper bound
251
+ self._pos = (self._pos + usize(1));
252
+ // (two single-byte moves, performed before decoding)
253
+ self._pos = (self._pos + usize(1));
254
+ // _read_codepoint consumes any continuation bytes of the bound
255
+ upper := self._read_codepoint(upper_lead);
256
+ ranges.push(CharRange(low: low, high: upper));
257
+ }
258
+ );
259
+ },
260
+ true => { // '-' is the last byte of the pattern
261
+ ranges.push(CharRange(low: low, high: low));
262
+ }
263
+ );
264
+ },
265
+ true => { // no dash: plain single character
266
+ ranges.push(CharRange(low: low, high: low));
267
+ }
268
+ );
269
+ }),
270
+
271
+ _parse_char_class_content : (fn(self : Self, ranges : ArrayList(CharRange)) -> Result(unit, String))({ // Parse one class item (escape, single char or a-b range) and append its ranges to ranges
272
+ upcoming := self._peek();
273
+ item_lead := match(upcoming,
274
+ .Some(v) => v,
275
+ .None => { return .Err(`Unterminated character class`); }
276
+ );
277
+
278
+ cond(
279
+ (item_lead == u8(93)) => { // ']' is left for the caller to consume
280
+ return .Ok(());
281
+ },
282
+ (item_lead == u8(92)) => { // '\': delegate to the class-escape parser
283
+ self._pos = (self._pos + usize(1));
284
+ escape_result := self._parse_class_escape();
285
+ match(escape_result,
286
+ .Ok(extra) => {
287
+ k := usize(0);
288
+ while (k < extra.len()), (k = (k + usize(1))), {
289
+ ranges.push(extra.get(k).unwrap());
290
+ };
291
+ },
292
+ .Err(msg) => { return .Err(msg); }
293
+ );
294
+ },
295
+ true => { // literal character, possibly the start of a range
296
+ self._pos = (self._pos + usize(1));
297
+ lower := self._read_codepoint(item_lead);
298
+ self._try_parse_char_range(ranges, lower);
299
+ }
300
+ );
301
+ .Ok(())
302
+ }),
303
+
304
+ _parse_char_class : (fn(self : Self) -> Result(RegexNode, String))({ // Parse the body of [...] (the '[' is already consumed) into a char_class node
305
+ collected := ArrayList(CharRange).new();
306
+ inverted := false;
307
+
308
+ // A leading '^' marks the class as negated
309
+ first_peek := self._peek();
310
+ caret_first := match(first_peek,
311
+ .Some(sym) => (sym == u8(94)),
312
+ .None => false
313
+ );
314
+ cond(
315
+ caret_first => {
316
+ inverted = true;
317
+ self._pos = (self._pos + usize(1));
318
+ },
319
+ true => ()
320
+ );
321
+
322
+ // Consume items one at a time until the closing ']' shows up
323
+ while (!(self._at_end())), {
324
+ upcoming := self._peek();
325
+ closes := match(upcoming,
326
+ .Some(sym) => (sym == u8(93)),
327
+ .None => false
328
+ );
329
+ cond(
330
+ closes => {
331
+ self._pos = (self._pos + usize(1));
332
+ return .Ok(RegexNode.char_class(collected, inverted));
333
+ },
334
+ true => ()
335
+ );
336
+
337
+ item := self._parse_char_class_content(collected);
338
+ match(item,
339
+ .Ok(_) => (),
340
+ .Err(msg) => { return .Err(msg); }
341
+ );
342
+ };
343
+ .Err(`Unterminated character class`)
344
+ }),
345
+
346
+ // Handle the {PropertyName} part of a \p{...} or \P{...} escape (the letter is already consumed).
347
+ // negated is forwarded to char_class: callers pass true for \P.
348
+ _parse_unicode_property : (fn(self : Self, negated : bool) -> Result(RegexNode, String))({
349
+ // The property name must be introduced by '{'
350
+ first_peek := self._peek();
351
+ opens := match(first_peek,
352
+ .Some(v) => (v == u8(123)),
353
+ .None => false
354
+ );
355
+ cond(
356
+ (!(opens)) => {
357
+ return .Err(`Expected '{' after \\p or \\P`);
358
+ },
359
+ true => ()
360
+ );
361
+ self._pos = (self._pos + usize(1));
362
+
363
+ // Collect raw bytes up to the closing '}'
364
+ collected := ArrayList(u8).new();
365
+ while (!(self._at_end())), {
366
+ peeked := self._peek();
367
+ name_byte := match(peeked,
368
+ .Some(v) => v,
369
+ .None => { break; }
370
+ );
371
+ cond(
372
+ (name_byte == u8(125)) => {
373
+ self._pos = (self._pos + usize(1));
374
+ name_str := String.from_bytes(collected);
375
+ found := unicode_property_ranges(name_str);
376
+ return match(found,
377
+ .Some(ranges) => .Ok(RegexNode.char_class(ranges, negated)),
378
+ .None => .Err(`Unknown Unicode property: \\p{${name_str}}`)
379
+ );
380
+ },
381
+ true => {
382
+ collected.push(name_byte);
383
+ self._pos = (self._pos + usize(1));
384
+ }
385
+ );
386
+ };
387
+ .Err(`Unterminated Unicode property \\p{...}`)
388
+ }),
389
+
390
+ _parse_escape : (fn(self : Self) -> Result(RegexNode, String))({ // Parse the escape following a '\' outside a character class (the '\' is already consumed)
391
+ escaped := self._advance();
392
+ match(escaped,
393
+ .Some(code) =>
394
+ cond(
395
+ (code == u8(100)) => .Ok(RegexNode.char_class(self._make_digit_ranges(), false)), // \d
396
+ (code == u8(68)) => .Ok(RegexNode.char_class(self._make_digit_ranges(), true)), // \D
397
+ (code == u8(119)) => .Ok(RegexNode.char_class(self._make_word_ranges(), false)), // \w
398
+ (code == u8(87)) => .Ok(RegexNode.char_class(self._make_word_ranges(), true)), // \W
399
+ (code == u8(115)) => .Ok(RegexNode.char_class(self._make_space_ranges(), false)), // \s
400
+ (code == u8(83)) => .Ok(RegexNode.char_class(self._make_space_ranges(), true)), // \S
401
+ (code == u8(98)) => .Ok(RegexNode.anchor_node(.WordBoundary)), // \b
402
+ (code == u8(66)) => .Ok(RegexNode.anchor_node(.NonWordBoundary)), // \B
403
+ // Single-digit numeric backreference: \1 .. \9
404
+ ((code >= u8(49)) && (code <= u8(57))) => {
405
+ group_idx := usize((code - u8(48)));
406
+ cond(
407
+ (group_idx > self._group_count) =>
408
+ .Err(`Backreference \\${group_idx} exceeds number of groups`),
409
+ true => .Ok(RegexNode.backreference(group_idx))
410
+ )
411
+ },
412
+ // \k<name>: backreference to a previously recorded named group
413
+ (code == u8(107)) => {
414
+ after_k := self._peek();
415
+ opens_name := match(after_k,
416
+ .Some(v) => (v == u8(60)),
417
+ .None => false
418
+ );
419
+ cond(
420
+ (!(opens_name)) => {
421
+ return .Err(`Expected '<' after \\k`);
422
+ },
423
+ true => ()
424
+ );
425
+ self._pos = (self._pos + usize(1));
426
+ collected := ArrayList(u8).new();
427
+ while (!(self._at_end())), {
428
+ peeked := self._peek();
429
+ name_byte := match(peeked,
430
+ .Some(v) => v,
431
+ .None => { break; }
432
+ );
433
+ cond(
434
+ (name_byte == u8(62)) => {
435
+ self._pos = (self._pos + usize(1));
436
+ name_str := String.from_bytes(collected);
437
+ found := self._lookup_group_name(name_str);
438
+ return match(found,
439
+ .Some(idx) => .Ok(RegexNode.backreference(idx)),
440
+ .None => .Err(`Unknown named group in backreference`)
441
+ );
442
+ },
443
+ true => {
444
+ collected.push(name_byte);
445
+ self._pos = (self._pos + usize(1));
446
+ }
447
+ );
448
+ };
449
+ .Err(`Unterminated named backreference \\k<...>`)
450
+ },
451
+ // \p{Name}: Unicode property class
452
+ (code == u8(112)) => self._parse_unicode_property(false),
453
+ // \P{Name}: same lookup, negated class
454
+ (code == u8(80)) => self._parse_unicode_property(true),
455
+ true => .Ok(RegexNode.literal(self._escape_char_codepoint(code)))
456
+ ),
457
+ .None => .Err(`Unexpected end of pattern after backslash`)
458
+ )
459
+ }),
460
+
461
+ _parse_counted_quantifier : (fn(self : Self, atom : RegexNode) -> Result(RegexNode, String))({ // Parse {n}, {n,} or {n,m} (cursor on '{') and wrap atom; an optional trailing '?' makes it non-greedy
462
+ self._pos = (self._pos + usize(1)); // step over '{'
463
+ min_opt := self._parse_number();
464
+ mn := match(min_opt,
465
+ .Some(v) => v,
466
+ .None => { return .Err(`Expected number after '{'`); }
467
+ );
468
+
469
+ pk := self._peek();
470
+ b := match(pk,
471
+ .Some(v) => v,
472
+ .None => { return .Err(`Unexpected end of pattern in quantifier`); }
473
+ );
474
+
475
+ cond(
476
+ (b == u8(125)) => {
477
+ // {n} — max == 0 is the "unbounded" sentinel, so an explicit {0} must not be encoded as (0, 0)
478
+ self._pos = (self._pos + usize(1));
479
+ greedy := self._parse_greedy_modifier();
480
+ return cond((mn == usize(0)) => .Ok(RegexNode.sequence(ArrayList(RegexNode).new())), true => .Ok(RegexNode.quantifier(atom, mn, mn, greedy)));
481
+ },
482
+ (b != u8(44)) => {
483
+ return .Err(`Expected ',' or '}' in quantifier`);
484
+ },
485
+ true => ()
486
+ );
487
+
488
+ // Consume ','
489
+ self._pos = (self._pos + usize(1));
490
+
491
+ pk2 := self._peek();
492
+ b2 := match(pk2,
493
+ .Some(v) => v,
494
+ .None => { return .Err(`Unexpected end of pattern in quantifier`); }
495
+ );
496
+
497
+ cond(
498
+ (b2 == u8(125)) => {
499
+ // {n,} — unbounded (encoded as max == 0)
500
+ self._pos = (self._pos + usize(1));
501
+ greedy := self._parse_greedy_modifier();
502
+ return .Ok(RegexNode.quantifier(atom, mn, usize(0), greedy));
503
+ },
504
+ true => ()
505
+ );
506
+
507
+ // {n,m}
508
+ max_opt := self._parse_number();
509
+ mx := match(max_opt,
510
+ .Some(v) => v,
511
+ .None => { return .Err(`Expected number after ',' in quantifier`); }
512
+ );
513
+
514
+ pk3 := self._peek();
515
+ b3 := match(pk3,
516
+ .Some(v) => v,
517
+ .None => { return .Err(`Unexpected end of pattern in quantifier`); }
518
+ );
519
+
520
+ cond(
521
+ (b3 != u8(125)) => {
522
+ return .Err(`Expected '}' in quantifier`);
523
+ },
524
+ true => ()
525
+ );
526
+ self._pos = (self._pos + usize(1));
527
+
528
+ cond(
529
+ (mx < mn) => {
530
+ return .Err(`Invalid quantifier: max less than min`);
531
+ },
532
+ true => ()
533
+ );
534
+
535
+ greedy := self._parse_greedy_modifier(); // still consumed when the result is the empty sequence below
536
+ cond((mx == usize(0)) => .Ok(RegexNode.sequence(ArrayList(RegexNode).new())), true => .Ok(RegexNode.quantifier(atom, mn, mx, greedy))) // {0,0}: zero repetitions, i.e. an empty sequence rather than the unbounded (0, 0) encoding
537
+ }),
538
+
539
+ _parse_atom : (fn(self : Self) -> Result(RegexNode, String))({ // Consume one atom: '.', '^', '$', a [...] class, a '\' escape, or a literal codepoint
540
+ consumed := self._advance();
541
+ match(consumed,
542
+ .Some(lead) =>
543
+ cond(
544
+ (lead == u8(46)) => .Ok(RegexNode.dot()), // '.'
545
+ (lead == u8(94)) => .Ok(RegexNode.anchor_node(.Start)), // '^'
546
+ (lead == u8(36)) => .Ok(RegexNode.anchor_node(.End)), // '$'
547
+ (lead == u8(91)) => self._parse_char_class(), // '['
548
+ (lead == u8(92)) => self._parse_escape(), // '\'
549
+ true => .Ok(RegexNode.literal(self._read_codepoint(lead))) // anything else is taken literally
550
+ ),
551
+ .None => .Err(`Unexpected end of pattern`)
552
+ )
553
+ }),
554
+
555
+ _maybe_quantify : (fn(self : Self, a : RegexNode) -> Result(RegexNode, String))({ // Wrap a in a quantifier when '*', '+', '?' or '{' follows; otherwise return a unchanged
556
+ upcoming := self._peek();
557
+ symbol := match(upcoming,
558
+ .Some(v) => v,
559
+ .None => { return .Ok(a); }
560
+ );
561
+
562
+ cond(
563
+ (symbol == u8(42)) => { // '*': min 0, max 0 (0 is passed as the max for the unbounded forms)
564
+ self._pos = (self._pos + usize(1));
565
+ is_greedy := self._parse_greedy_modifier();
566
+ .Ok(RegexNode.quantifier(a, usize(0), usize(0), is_greedy))
567
+ },
568
+ (symbol == u8(43)) => { // '+': min 1, max 0
569
+ self._pos = (self._pos + usize(1));
570
+ is_greedy := self._parse_greedy_modifier();
571
+ .Ok(RegexNode.quantifier(a, usize(1), usize(0), is_greedy))
572
+ },
573
+ (symbol == u8(63)) => { // '?': min 0, max 1
574
+ self._pos = (self._pos + usize(1));
575
+ is_greedy := self._parse_greedy_modifier();
576
+ .Ok(RegexNode.quantifier(a, usize(0), usize(1), is_greedy))
577
+ },
578
+ (symbol == u8(123)) => self._parse_counted_quantifier(a), // '{': counted form, parsed from the brace
579
+ true => .Ok(a)
580
+ )
581
+ }),
582
+
583
+ _parse_quantified : (fn(self : Self) -> Result(RegexNode, String))({ // Parse one atom, then any quantifier that directly follows it
584
+ parsed := self._parse_atom();
585
+ match(parsed,
586
+ .Ok(node) => self._maybe_quantify(node),
587
+ .Err(msg) => .Err(msg)
588
+ )
589
+ })
590
+ );
591
+
592
+ // Second impl block: helper methods first, then main parse loop
593
+ impl(RegexParser,
594
+ _make_sequence : (fn(self : Self, nodes : ArrayList(RegexNode)) -> RegexNode)( // Collapse a node list: a lone node is returned unwrapped, anything else becomes a sequence node
595
+ cond(
596
+ (nodes.len() == usize(1)) => nodes.get(usize(0)).unwrap(),
597
+ (nodes.len() == usize(0)) => RegexNode.sequence(ArrayList(RegexNode).new()),
598
+ true => RegexNode.sequence(nodes)
599
+ )
600
+ ),
601
+
602
+ _finalize_frame : (fn(self : Self, alts : ArrayList(ArrayList(RegexNode)), seq : ArrayList(RegexNode)) -> RegexNode)({ // Turn one nesting level (finished '|' branches plus the open branch) into a single node
603
+ cond(
604
+ (alts.len() == usize(0)) => { // no '|' was seen at this level
605
+ return self._make_sequence(seq);
606
+ },
607
+ true => ()
608
+ );
609
+ alts.push(seq); // the open branch becomes the last alternative (alts is mutated)
610
+ folded := self._make_sequence(alts.get(usize(0)).unwrap());
611
+ branch_idx := usize(1);
612
+ while (branch_idx < alts.len()), (branch_idx = (branch_idx + usize(1))), { // chain alternation nodes left to right
613
+ branch := self._make_sequence(alts.get(branch_idx).unwrap());
614
+ folded = RegexNode.alternation(folded, branch);
615
+ };
616
+ folded
617
+ }),
618
+
619
+ parse : (fn(self : Self) -> Result(RegexNode, String))({ // Parse the whole pattern into one RegexNode, or return an error message; groups are tracked on an explicit stack instead of recursion
620
+ stack := ArrayList(ParseFrame).new(); // saved state of every enclosing level
621
+ cur_alts := ArrayList(ArrayList(RegexNode)).new(); // finished '|' branches of the current level
622
+ cur_seq := ArrayList(RegexNode).new(); // branch currently being built
623
+ cur_non_cap := false;
624
+ cur_group_idx := usize(0);
625
+ cur_is_la := false;
626
+ cur_is_lb := false;
627
+ cur_is_pos := true;
628
+
629
+ while (!(self._at_end())), {
630
+ pk := self._peek();
631
+ b := match(pk,
632
+ .Some(v) => v,
633
+ .None => { break; }
634
+ );
635
+
636
+ cond(
637
+ (b == u8(40)) => {
638
+ // '(' — open group
639
+ self._pos = (self._pos + usize(1));
640
+ stack.push(ParseFrame( // remember the enclosing level so ')' can restore it
641
+ alternatives: cur_alts,
642
+ current: cur_seq,
643
+ is_non_capturing: cur_non_cap,
644
+ group_index: cur_group_idx,
645
+ is_lookahead: cur_is_la,
646
+ is_lookbehind: cur_is_lb,
647
+ is_positive: cur_is_pos
648
+ ));
649
+
650
+ // Detect group type: (?: (?<name> (?= (?! (?<= (?<!
651
+ is_nc := false;
652
+ g_idx := usize(0);
653
+ is_la := false;
654
+ is_lb := false;
655
+ is_pos := true;
656
+ pk2 := self._peek();
657
+ is_question := match(pk2,
658
+ .Some(v) => (v == u8(63)),
659
+ .None => false
660
+ );
661
+ cond(
662
+ is_question => {
663
+ has_next := ((self._pos + usize(1)) < self._bytes.len());
664
+ cond(
665
+ has_next => {
666
+ nb := self._bytes.get((self._pos + usize(1)));
667
+ next_ch := match(nb,
668
+ .Some(v) => v,
669
+ .None => u8(0)
670
+ );
671
+ cond(
672
+ // (?: — non-capturing group
673
+ (next_ch == u8(58)) => {
674
+ is_nc = true;
675
+ self._pos = (self._pos + usize(2));
676
+ },
677
+ // (?= — positive lookahead
678
+ (next_ch == u8(61)) => {
679
+ is_la = true;
680
+ is_pos = true;
681
+ is_nc = true;
682
+ self._pos = (self._pos + usize(2));
683
+ },
684
+ // (?! — negative lookahead
685
+ (next_ch == u8(33)) => {
686
+ is_la = true;
687
+ is_pos = false;
688
+ is_nc = true;
689
+ self._pos = (self._pos + usize(2));
690
+ },
691
+ // (?< — could be lookbehind or named group
692
+ (next_ch == u8(60)) => {
693
+ // Check char after '<'
694
+ has_third := ((self._pos + usize(2)) < self._bytes.len());
695
+ third_ch := cond(
696
+ has_third => match(self._bytes.get((self._pos + usize(2))),
697
+ .Some(v) => v,
698
+ .None => u8(0)
699
+ ),
700
+ true => u8(0)
701
+ );
702
+ cond(
703
+ // (?<= — positive lookbehind
704
+ (third_ch == u8(61)) => {
705
+ is_lb = true;
706
+ is_pos = true;
707
+ is_nc = true;
708
+ self._pos = (self._pos + usize(3));
709
+ },
710
+ // (?<! — negative lookbehind
711
+ (third_ch == u8(33)) => {
712
+ is_lb = true;
713
+ is_pos = false;
714
+ is_nc = true;
715
+ self._pos = (self._pos + usize(3));
716
+ },
717
+ // (?<name> — named capturing group
718
+ true => {
719
+ self._pos = (self._pos + usize(2));
720
+ name_bytes := ArrayList(u8).new();
721
+ while (!(self._at_end())), { // collect the name up to '>'
722
+ name_b := self._peek();
723
+ name_ch := match(name_b,
724
+ .Some(v) => v,
725
+ .None => { break; }
726
+ );
727
+ cond(
728
+ (name_ch == u8(62)) => {
729
+ self._pos = (self._pos + usize(1));
730
+ break;
731
+ },
732
+ true => {
733
+ name_bytes.push(name_ch);
734
+ self._pos = (self._pos + usize(1));
735
+ }
736
+ );
737
+ };
738
+ self._group_count = (self._group_count + usize(1));
739
+ g_idx = self._group_count;
740
+ group_name := String.from_bytes(name_bytes);
741
+ self._group_names.push(GroupNameEntry(name: group_name, index: g_idx));
742
+ }
743
+ );
744
+ },
745
+ true => { return .Err(`Invalid group syntax after '(?'`); } // unknown (?X construct: reject it instead of silently opening a capturing group whose '?' becomes a literal
746
+ );
747
+ },
748
+ true => ()
749
+ );
750
+ },
751
+ true => ()
752
+ );
753
+
754
+ cond( // a plain '(' that was not given an index above becomes the next numbered capture
755
+ (((!(is_nc)) && (!(is_la))) && ((!(is_lb)) && (g_idx == usize(0)))) => {
756
+ self._group_count = (self._group_count + usize(1));
757
+ g_idx = self._group_count;
758
+ },
759
+ true => ()
760
+ );
761
+
762
+ cur_alts = ArrayList(ArrayList(RegexNode)).new(); // start a fresh level for the group body
763
+ cur_seq = ArrayList(RegexNode).new();
764
+ cur_non_cap = is_nc;
765
+ cur_group_idx = g_idx;
766
+ cur_is_la = is_la;
767
+ cur_is_lb = is_lb;
768
+ cur_is_pos = is_pos;
769
+ },
770
+ (b == u8(41)) => {
771
+ // ')' — close group
772
+ self._pos = (self._pos + usize(1));
773
+ cond(
774
+ (stack.len() == usize(0)) => {
775
+ return .Err(`Unexpected ')' without matching '('`);
776
+ },
777
+ true => ()
778
+ );
779
+
780
+ inner := self._finalize_frame(cur_alts, cur_seq);
781
+
782
+ // Save this group's type before restoring parent frame
783
+ this_is_la := cur_is_la;
784
+ this_is_lb := cur_is_lb;
785
+
786
+ group_node := cond( // lookaround flags are tested first because lookarounds also set the non-capturing flag
787
+ cur_is_la => RegexNode.lookahead(inner, cur_is_pos),
788
+ cur_is_lb => RegexNode.lookbehind(inner, cur_is_pos),
789
+ cur_non_cap => RegexNode.non_capturing_group(inner),
790
+ true => RegexNode.group(inner, cur_group_idx)
791
+ );
792
+
793
+ parent := stack.get((stack.len() - usize(1))).unwrap();
794
+ stack.pop();
795
+ cur_alts = parent.alternatives;
796
+ cur_seq = parent.current;
797
+ cur_non_cap = parent.is_non_capturing;
798
+ cur_group_idx = parent.group_index;
799
+ cur_is_la = parent.is_lookahead;
800
+ cur_is_lb = parent.is_lookbehind;
801
+ cur_is_pos = parent.is_positive;
802
+
803
+ // Lookahead/lookbehind should not be quantified
804
+ cond(
805
+ (this_is_la || this_is_lb) => {
806
+ cur_seq.push(group_node); // pushed as-is; _maybe_quantify is not called for lookarounds
807
+ },
808
+ true => {
809
+ quantified := self._maybe_quantify(group_node);
810
+ match(quantified,
811
+ .Ok(q) => { cur_seq.push(q); },
812
+ .Err(e) => { return .Err(e); }
813
+ );
814
+ }
815
+ );
816
+ },
817
+ (b == u8(124)) => {
818
+ // '|' — alternation
819
+ self._pos = (self._pos + usize(1));
820
+ cur_alts.push(cur_seq);
821
+ cur_seq = ArrayList(RegexNode).new();
822
+ },
823
+ true => { // anything else: one atom plus its optional quantifier
824
+ atom := self._parse_quantified();
825
+ match(atom,
826
+ .Ok(a) => { cur_seq.push(a); },
827
+ .Err(e) => { return .Err(e); }
828
+ );
829
+ }
830
+ );
831
+ };
832
+
833
+ cond(
834
+ (stack.len() > usize(0)) => { // input ended while a group was still open
835
+ return .Err(`Unterminated group — expected ')'`);
836
+ },
837
+ true => ()
838
+ );
839
+
840
+ result := self._finalize_frame(cur_alts, cur_seq);
841
+ .Ok(result)
842
+ })
843
+ );
844
+
845
+ export
846
+ RegexParser
847
+ ;