@shd101wyy/yo 0.0.28 → 0.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@shd101wyy/yo",
3
3
  "displayName": "Yo",
4
- "version": "0.0.28",
4
+ "version": "0.0.29",
5
5
  "main": "./out/cjs/index.cjs",
6
6
  "module": "./out/esm/index.mjs",
7
7
  "types": "./out/types/src/index.d.ts",
@@ -0,0 +1,355 @@
1
+ // std/regex/compiler.yo - NFA compiler
2
+ //
3
+ // Compiles a RegexNode AST into a flat list of NFA instructions
4
+ // using Thompson's construction algorithm.
5
+
6
+ open import "std/collections/array_list";
7
+ open import "std/string";
8
+ { RegexNode, NodeKind, CharRange, AnchorKind, GroupNameEntry } :: import "./node.yo";
9
+
10
+ // NFA instruction kinds: the operation tag stored in Instr.kind. Each variant has a matching constructor in impl(Instr).
11
+ InstrKind :: enum(
12
+ Char,
13
+ CharClass,
14
+ AnyChar,
15
+ Split,
16
+ Jump,
17
+ Save,
18
+ Match,
19
+ AssertStart,
20
+ AssertEnd,
21
+ AssertWordBoundary,
22
+ AssertNonWordBoundary,
23
+ Backref,
24
+ Lookahead,
25
+ Lookbehind
26
+ );
27
+
28
+ // A single NFA instruction. `kind` selects the operation; the other fields are operands (codepoint: Char; class_idx: CharClass; target_a/target_b: Split targets and Lookahead/Lookbehind sub-program bounds; target: Jump; slot: Save slot, Backref group index, or lookaround polarity). The Instr constructors set every operand a kind does not use to 0.
29
+ Instr :: struct(
30
+ kind : InstrKind,
31
+ codepoint : u32,
32
+ class_idx : usize,
33
+ target_a : usize,
34
+ target_b : usize,
35
+ target : usize,
36
+ slot : usize
37
+ );
38
+ // Instr constructors, one per InstrKind variant. Each sets `kind`, fills in only the operands that kind uses, and zeroes the rest.
39
+ impl(Instr,
40
+ char_instr : (fn(cp : u32) -> Self)(
41
+ Self(kind: .Char, codepoint: cp, class_idx: usize(0),
42
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
43
+ ),
44
+ any_char_instr : (fn() -> Self)(
45
+ Self(kind: .AnyChar, codepoint: u32(0), class_idx: usize(0),
46
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
47
+ ),
48
+ char_class_instr : (fn(idx : usize) -> Self)(
49
+ Self(kind: .CharClass, codepoint: u32(0), class_idx: idx,
50
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
51
+ ),
52
+ split_instr : (fn(a : usize, b : usize) -> Self)(
53
+ Self(kind: .Split, codepoint: u32(0), class_idx: usize(0),
54
+ target_a: a, target_b: b, target: usize(0), slot: usize(0))
55
+ ),
56
+ jump_instr : (fn(t : usize) -> Self)(
57
+ Self(kind: .Jump, codepoint: u32(0), class_idx: usize(0),
58
+ target_a: usize(0), target_b: usize(0), target: t, slot: usize(0))
59
+ ),
60
+ save_instr : (fn(s : usize) -> Self)(
61
+ Self(kind: .Save, codepoint: u32(0), class_idx: usize(0),
62
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: s)
63
+ ),
64
+ match_instr : (fn() -> Self)(
65
+ Self(kind: .Match, codepoint: u32(0), class_idx: usize(0),
66
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
67
+ ),
68
+ assert_start_instr : (fn() -> Self)(
69
+ Self(kind: .AssertStart, codepoint: u32(0), class_idx: usize(0),
70
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
71
+ ),
72
+ assert_end_instr : (fn() -> Self)(
73
+ Self(kind: .AssertEnd, codepoint: u32(0), class_idx: usize(0),
74
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
75
+ ),
76
+ assert_word_boundary_instr : (fn() -> Self)(
77
+ Self(kind: .AssertWordBoundary, codepoint: u32(0), class_idx: usize(0),
78
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
79
+ ),
80
+ assert_non_word_boundary_instr : (fn() -> Self)(
81
+ Self(kind: .AssertNonWordBoundary, codepoint: u32(0), class_idx: usize(0),
82
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: usize(0))
83
+ ),
84
+ backref_instr : (fn(group_idx : usize) -> Self)(
85
+ Self(kind: .Backref, codepoint: u32(0), class_idx: usize(0),
86
+ target_a: usize(0), target_b: usize(0), target: usize(0), slot: group_idx)
87
+ ),
88
+ // Lookahead: target_a = sub_start, target_b = sub_end (bounds of the sub-program), slot = 1 for a positive assertion, 0 for a negative one
89
+ lookahead_instr : (fn(sub_start : usize, sub_end : usize, positive : bool) -> Self)(
90
+ Self(kind: .Lookahead, codepoint: u32(0), class_idx: usize(0),
91
+ target_a: sub_start, target_b: sub_end, target: usize(0),
92
+ slot: cond(positive => usize(1), true => usize(0)))
93
+ ),
94
+ // Lookbehind: same operand layout as Lookahead (target_a = sub_start, target_b = sub_end, slot = 1 for positive, 0 for negative)
95
+ lookbehind_instr : (fn(sub_start : usize, sub_end : usize, positive : bool) -> Self)(
96
+ Self(kind: .Lookbehind, codepoint: u32(0), class_idx: usize(0),
97
+ target_a: sub_start, target_b: sub_end, target: usize(0),
98
+ slot: cond(positive => usize(1), true => usize(0)))
99
+ )
100
+ );
101
+
102
+ // Character class table entry: the ranges of one class and whether the class is negated. A CharClass instruction refers to an entry by its index in NfaProgram.classes (Instr.class_idx).
103
+ ClassEntry :: struct(
104
+ ranges : ArrayList(CharRange),
105
+ negated : bool
106
+ );
107
+
108
+ // The compiled NFA program returned by NfaCompiler.compile: the flat instruction list, the character class table, the capture group count and names, and the ASCII literal prefix filled in by _extract_literal_prefix.
109
+ NfaProgram :: object(
110
+ instructions : ArrayList(Instr),
111
+ classes : ArrayList(ClassEntry),
112
+ n_groups : usize,
113
+ group_names : ArrayList(GroupNameEntry),
114
+ literal_prefix : ArrayList(u8)
115
+ );
116
+
117
+ // The NFA compiler: holds the NfaProgram under construction; compile() fills it in and returns it.
118
+ NfaCompiler :: object(
119
+ _program : NfaProgram
120
+ );
121
+
122
+ // Constructor and emit helpers. Defined first because the compile methods below call them (bottom-up ordering required). `new` starts from an empty program: no instructions, classes, group names or prefix, and n_groups = 0.
123
+ impl(NfaCompiler,
124
+ new : (fn() -> Self)(
125
+ Self(
126
+ _program: NfaProgram(
127
+ instructions: ArrayList(Instr).new(),
128
+ classes: ArrayList(ClassEntry).new(),
129
+ n_groups: usize(0),
130
+ group_names: ArrayList(GroupNameEntry).new(),
131
+ literal_prefix: ArrayList(u8).new()
132
+ )
133
+ )
134
+ ),
135
+ // Append `instr` to the program and return the index (pc) it was stored at.
136
+ _emit : (fn(self : Self, instr : Instr) -> usize)({
137
+ idx := self._program.instructions.len();
138
+ self._program.instructions.push(instr);
139
+ idx
140
+ }),
141
+ // Index the next emitted instruction will receive (the current instruction count).
142
+ _current_pc : (fn(self : Self) -> usize)(
143
+ self._program.instructions.len()
144
+ ),
145
+ // Append a ClassEntry to the class table and return its index, for use as Instr.class_idx.
146
+ _add_class : (fn(self : Self, ranges : ArrayList(CharRange), negated : bool) -> usize)({
147
+ idx := self._program.classes.len();
148
+ self._program.classes.push(ClassEntry(ranges: ranges, negated: negated));
149
+ idx
150
+ })
151
+ );
152
+
153
+ // Compile methods: _compile_node is self-recursive (via recur), quantifier logic inlined. It appends the instructions for `node` at the current pc; forward branch targets are first emitted as 0 and then patched with instructions.set once the target pc is known.
154
+ impl(NfaCompiler,
155
+ _compile_node : (fn(self : Self, node : RegexNode) -> unit)({
156
+ match(node.kind,
157
+ .Literal => {
158
+ self._emit(Instr.char_instr(node.codepoint));
159
+ },
160
+ .Dot => {
161
+ self._emit(Instr.any_char_instr());
162
+ },
163
+ .CharClass => {
164
+ idx := self._add_class(node.ranges, node.negated);
165
+ self._emit(Instr.char_class_instr(idx));
166
+ },
167
+ .Anchor =>
168
+ match(node.anchor,
169
+ .Start => { self._emit(Instr.assert_start_instr()); },
170
+ .End => { self._emit(Instr.assert_end_instr()); },
171
+ .WordBoundary => { self._emit(Instr.assert_word_boundary_instr()); },
172
+ .NonWordBoundary => { self._emit(Instr.assert_non_word_boundary_instr()); }
173
+ ),
174
+ .Sequence => {
175
+ i := usize(0);
176
+ while (i < node.children.len()), (i = (i + usize(1))), {
177
+ child := node.children.get(i).unwrap();
178
+ recur(self, child);
179
+ };
180
+ },
181
+ .Alternation => {
182
+ left := node.children.get(usize(0)).unwrap();
183
+ right := node.children.get(usize(1)).unwrap();
184
+ split_pc := self._emit(Instr.split_instr(usize(0), usize(0)));
185
+ left_start := self._current_pc();
186
+ self._program.instructions.set(split_pc, Instr.split_instr(left_start, usize(0)));
187
+ recur(self, left);
188
+ jump_pc := self._emit(Instr.jump_instr(usize(0)));
189
+ right_start := self._current_pc();
190
+ self._program.instructions.set(split_pc, Instr.split_instr(left_start, right_start));
191
+ recur(self, right);
192
+ end_pc := self._current_pc();
193
+ self._program.instructions.set(jump_pc, Instr.jump_instr(end_pc));
194
+ },
195
+ .Quantifier => {
196
+ child := node.children.get(usize(0)).unwrap();
197
+ min_val := node.q_min;
198
+ max_val := node.q_max;
199
+ greedy := node.q_greedy;
200
+
201
+ // Emit the min required copies back to back, remembering where the last copy starts so the unbounded case can loop back to it
202
+ (last_body_start : usize) = self._current_pc();
203
+ qi := usize(0);
204
+ while (qi < min_val), (qi = (qi + usize(1))), {
205
+ last_body_start = self._current_pc();
206
+ recur(self, child);
207
+ };
208
+
209
+ cond(
210
+ ((max_val == usize(0)) && (min_val == usize(0))) => {
211
+ // * — zero or more (max=0 means unbounded when min=0). Layout: l1: Split, l2: body, Jump l1, l3: exit; greedy lists l2 first, lazy lists l3 first. NOTE(review): an explicit {0} would share this min=0/max=0 encoding — confirm how the parser represents it.
212
+ l1 := self._current_pc();
213
+ split_pc := self._emit(Instr.split_instr(usize(0), usize(0)));
214
+ l2 := self._current_pc();
215
+ recur(self, child);
216
+ self._emit(Instr.jump_instr(l1));
217
+ l3 := self._current_pc();
218
+ cond(
219
+ greedy => {
220
+ self._program.instructions.set(split_pc, Instr.split_instr(l2, l3));
221
+ },
222
+ true => {
223
+ self._program.instructions.set(split_pc, Instr.split_instr(l3, l2));
224
+ }
225
+ );
226
+ },
227
+ ((max_val == usize(0)) && (min_val > usize(0))) => {
228
+ // + beyond min copies — a single Split loops back to the last min copy (no extra body); greedy lists the loop-back target first, lazy lists the exit first
229
+ split_pc := self._emit(Instr.split_instr(usize(0), usize(0)));
230
+ l2 := self._current_pc();
231
+ cond(
232
+ greedy => {
233
+ self._program.instructions.set(split_pc, Instr.split_instr(last_body_start, l2));
234
+ },
235
+ true => {
236
+ self._program.instructions.set(split_pc, Instr.split_instr(l2, last_body_start));
237
+ }
238
+ );
239
+ },
240
+ true => {
241
+ // {min, max} — emit (max - min) optional copies, each guarded by its own Split that can skip just that copy. NOTE(review): assumes max_val >= min_val; the usize subtraction would underflow otherwise — confirm the parser rejects such ranges.
242
+ remaining := (max_val - min_val);
243
+ qj := usize(0);
244
+ while (qj < remaining), (qj = (qj + usize(1))), {
245
+ split_pc := self._emit(Instr.split_instr(usize(0), usize(0)));
246
+ body_start := self._current_pc();
247
+ recur(self, child);
248
+ after := self._current_pc();
249
+ cond(
250
+ greedy => {
251
+ self._program.instructions.set(split_pc, Instr.split_instr(body_start, after));
252
+ },
253
+ true => {
254
+ self._program.instructions.set(split_pc, Instr.split_instr(after, body_start));
255
+ }
256
+ );
257
+ };
258
+ }
259
+ );
260
+ },
261
+ .Group => {
262
+ child := node.children.get(usize(0)).unwrap();
263
+ start_slot := (node.group_index * usize(2));
264
+ end_slot := ((node.group_index * usize(2)) + usize(1));
265
+ self._emit(Instr.save_instr(start_slot));
266
+ recur(self, child);
267
+ self._emit(Instr.save_instr(end_slot));
268
+ },
269
+ .NonCapturingGroup => {
270
+ child := node.children.get(usize(0)).unwrap();
271
+ recur(self, child);
272
+ },
273
+ .Backreference => {
274
+ self._emit(Instr.backref_instr(node.group_index));
275
+ },
276
+ .Lookahead => {
277
+ child := node.children.get(usize(0)).unwrap();
278
+ positive := (!(node.negated));
279
+ // Emit the lookahead instruction with placeholder bounds; it is patched below once the sub-program (child followed by its own Match) has been emitted
280
+ la_pc := self._emit(Instr.lookahead_instr(usize(0), usize(0), positive));
281
+ sub_start := self._current_pc();
282
+ recur(self, child);
283
+ self._emit(Instr.match_instr());
284
+ sub_end := self._current_pc();
285
+ self._program.instructions.set(la_pc, Instr.lookahead_instr(sub_start, sub_end, positive));
286
+ },
287
+ .Lookbehind => {
288
+ child := node.children.get(usize(0)).unwrap();
289
+ positive := (!(node.negated));
290
+ lb_pc := self._emit(Instr.lookbehind_instr(usize(0), usize(0), positive));
291
+ sub_start := self._current_pc();
292
+ recur(self, child);
293
+ self._emit(Instr.match_instr());
294
+ sub_end := self._current_pc();
295
+ self._program.instructions.set(lb_pc, Instr.lookbehind_instr(sub_start, sub_end, positive));
296
+ }
297
+ );
298
+ })
299
+ );
300
+
301
+ // Literal prefix extraction (must be defined before compile, which calls it)
302
+ impl(NfaCompiler,
303
+ // Extract literal bytes from the start of the compiled program for fast scanning; the result replaces _program.literal_prefix (empty when the program does not begin with ASCII literals).
304
+ // Scans from pc 0: Save/AssertStart are skipped wherever they appear, each Char with codepoint < 0x80 is appended as one byte, and the scan stops at the first non-ASCII Char or any other instruction kind.
305
+ _extract_literal_prefix : (fn(self : Self) -> unit)({
306
+ prefix := ArrayList(u8).new();
307
+ pc := usize(0);
308
+ instrs := self._program.instructions;
309
+ (done : bool) = false;
310
+
311
+ while ((pc < instrs.len()) && (!(done))), {
312
+ instr := instrs.get(pc).unwrap();
313
+ match(instr.kind,
314
+ .Save => { pc = (pc + usize(1)); },
315
+ .AssertStart => { pc = (pc + usize(1)); },
316
+ .Char => {
317
+ cp := instr.codepoint;
318
+ cond(
319
+ (cp < u32(0x80)) => {
320
+ prefix.push(u8(cp));
321
+ pc = (pc + usize(1));
322
+ },
323
+ true => { done = true; }
324
+ );
325
+ },
326
+ _ => { done = true; }
327
+ );
328
+ };
329
+
330
+ self._program.literal_prefix = prefix;
331
+ })
332
+ );
333
+
334
+ // Top-level compile method: stores n_groups and group_names, emits Save 0, the instructions for `root`, Save 1 and Match (so slots 0 and 1 bracket the whole pattern), extracts the literal prefix, and returns the finished program.
335
+ impl(NfaCompiler,
336
+ compile : (fn(self : Self, root : RegexNode, n_groups : usize, group_names : ArrayList(GroupNameEntry)) -> NfaProgram)({
337
+ self._program.n_groups = n_groups;
338
+ self._program.group_names = group_names;
339
+ self._emit(Instr.save_instr(usize(0)));
340
+ self._compile_node(root);
341
+ self._emit(Instr.save_instr(usize(1)));
342
+ self._emit(Instr.match_instr());
343
+ self._extract_literal_prefix();
344
+ self._program
345
+ })
346
+ );
347
+
348
+ export
349
+ NfaCompiler,
350
+ NfaProgram,
351
+ Instr,
352
+ InstrKind,
353
+ ClassEntry,
354
+ GroupNameEntry
355
+ ;
@@ -0,0 +1,104 @@
1
+ // std/regex/flags.yo - RegexFlags parsing and representation
2
+ //
3
+ // Regex flags follow JavaScript syntax: "gi", "ms", "iu", etc.
4
+ //
5
+ // Supported flags:
6
+ // g - global: match all occurrences
7
+ // i - ignoreCase: case-insensitive matching
8
+ // m - multiline: ^ and $ match line boundaries
9
+ // s - dotAll: . matches newline characters
10
+ // u - unicode: full Unicode matching
11
+ // y - sticky: match from lastIndex only
12
+
13
+ open import "std/string";
14
+ open import "std/collections/array_list";
15
+ // Parsed regex flags: one boolean per supported flag letter (g, i, m, s, u, y), set by RegexFlags.parse.
16
+ RegexFlags :: struct(
17
+ global : bool,
18
+ ignore_case : bool,
19
+ multiline : bool,
20
+ dot_all : bool,
21
+ unicode : bool,
22
+ sticky : bool
23
+ );
24
+
25
+ impl(RegexFlags,
26
+ // Create default flags (every field false)
27
+ default : (fn() -> Self)(
28
+ Self(
29
+ global: false,
30
+ ignore_case: false,
31
+ multiline: false,
32
+ dot_all: false,
33
+ unicode: false,
34
+ sticky: false
35
+ )
36
+ ),
37
+
38
+ // Parse flags from a string like `gi`, `ms`, etc., one byte at a time. Returns .Err(`Duplicate flag: x`) when a flag letter repeats and .Err(`Invalid flag character`) for any byte other than g/i/m/s/u/y; an empty string yields .Ok with the defaults.
39
+ parse : (fn(flags_str: String) -> Result(Self, String))({
40
+ result := Self.default();
41
+ bytes := flags_str.as_bytes();
42
+ i := usize(0);
43
+ while (i < bytes.len()), (i = (i + usize(1))), {
44
+ byte_opt := bytes.get(i);
45
+ match(byte_opt,
46
+ .Some(b) => {
47
+ cond(
48
+ (b == u8(103)) => {
49
+ // 'g' (ASCII 103): global
50
+ cond(
51
+ result.global => { return .Err(`Duplicate flag: g`); },
52
+ true => { result.global = true; }
53
+ );
54
+ },
55
+ (b == u8(105)) => {
56
+ // 'i' (ASCII 105): ignore_case
57
+ cond(
58
+ result.ignore_case => { return .Err(`Duplicate flag: i`); },
59
+ true => { result.ignore_case = true; }
60
+ );
61
+ },
62
+ (b == u8(109)) => {
63
+ // 'm' (ASCII 109): multiline
64
+ cond(
65
+ result.multiline => { return .Err(`Duplicate flag: m`); },
66
+ true => { result.multiline = true; }
67
+ );
68
+ },
69
+ (b == u8(115)) => {
70
+ // 's' (ASCII 115): dot_all
71
+ cond(
72
+ result.dot_all => { return .Err(`Duplicate flag: s`); },
73
+ true => { result.dot_all = true; }
74
+ );
75
+ },
76
+ (b == u8(117)) => {
77
+ // 'u' (ASCII 117): unicode
78
+ cond(
79
+ result.unicode => { return .Err(`Duplicate flag: u`); },
80
+ true => { result.unicode = true; }
81
+ );
82
+ },
83
+ (b == u8(121)) => {
84
+ // 'y' (ASCII 121): sticky
85
+ cond(
86
+ result.sticky => { return .Err(`Duplicate flag: y`); },
87
+ true => { result.sticky = true; }
88
+ );
89
+ },
90
+ true => {
91
+ return .Err(`Invalid flag character`);
92
+ }
93
+ );
94
+ },
95
+ .None => ()
96
+ );
97
+ };
98
+ .Ok(result)
99
+ })
100
+ );
101
+
102
+ export
103
+ RegexFlags
104
+ ;
@@ -0,0 +1,83 @@
1
+ // std/regex/match.yo - Match result type
2
+ //
3
+ // Represents the result of a regex match, including the matched text,
4
+ // position, and captured groups.
5
+
6
+ open import "std/collections/array_list";
7
+ open import "std/string";
8
+ { GroupNameEntry } :: import "./node.yo";
9
+
10
+ // A single regex match result: the matched text, its start index, the original input, the capture groups (each an Option(String), stored without group 0), and the group name table used by named_group.
11
+ RegexMatch :: object(
12
+ _value : String,
13
+ _index : usize,
14
+ _input : String,
15
+ _groups : ArrayList(Option(String)),
16
+ _group_names : ArrayList(GroupNameEntry)
17
+ );
18
+
19
+ impl(RegexMatch,
20
+ // Create a match result; the arguments are stored as-is in the corresponding fields
21
+ new : (fn(value : String, index : usize, input : String, groups : ArrayList(Option(String)), group_names : ArrayList(GroupNameEntry)) -> Self)(
22
+ Self(
23
+ _value: value,
24
+ _index: index,
25
+ _input: input,
26
+ _groups: groups,
27
+ _group_names: group_names
28
+ )
29
+ ),
30
+
31
+ // Get the full matched text
32
+ value : (fn(self : Self) -> String)(
33
+ self._value
34
+ ),
35
+
36
+ // Get the start position (character index) of the match
37
+ index : (fn(self : Self) -> usize)(
38
+ self._index
39
+ ),
40
+
41
+ // Get the original input string
42
+ input : (fn(self : Self) -> String)(
43
+ self._input
44
+ ),
45
+
46
+ // Get capture group by index (1-based, group 0 is the full match). Index 0 returns .Some of the matched text; index k >= 1 returns the stored entry _groups[k - 1] (itself an Option), or .None when k is past the last group.
47
+ group : (fn(self : Self, idx : usize) -> Option(String))(
48
+ cond(
49
+ (idx == usize(0)) => .Some(self._value),
50
+ true => {
51
+ actual_idx := (idx - usize(1));
52
+ cond(
53
+ (actual_idx < self._groups.len()) => self._groups.get(actual_idx).unwrap(),
54
+ true => .None
55
+ )
56
+ }
57
+ )
58
+ ),
59
+
60
+ // Get capture group by name: linear search of _group_names; the first entry whose name equals `name` is resolved through group(entry.index). Returns .None when no entry has that name.
61
+ named_group : (fn(self : Self, name : String) -> Option(String))({
62
+ i := usize(0);
63
+ while (i < self._group_names.len()), (i = (i + usize(1))), {
64
+ entry := self._group_names.get(i).unwrap();
65
+ cond(
66
+ (entry.name == name) => {
67
+ return self.group(entry.index);
68
+ },
69
+ true => ()
70
+ );
71
+ };
72
+ .None
73
+ }),
74
+
75
+ // Get number of capture groups (not counting group 0), i.e. the length of _groups
76
+ group_count : (fn(self : Self) -> usize)(
77
+ self._groups.len()
78
+ )
79
+ );
80
+
81
+ export
82
+ RegexMatch
83
+ ;