lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,718 @@
1
+ //! Opcode definitions and handlers
2
+ //!
3
+ //! This module defines all opcodes and their execution handlers.
4
+ //! Must match lib/lexer_kit/ir/opcode.rb and ext/lexer_kit/lexer_kit_ext.h
5
+
6
+ use crate::dfa::{match_literal, scan_until, scan_until_escape};
7
+ use crate::trie;
8
+ use crate::types::{
9
+ CompiledProgram, EmitResult, VmResult, VmState, MAX_MODE_STACK, MAX_ZERO_PROGRESS_TOKENS,
10
+ };
11
+
12
+ // =============================================================================
13
+ // Opcode Definitions
14
+ // =============================================================================
15
+
16
// -- DFA operations -----------------------------------------------------------

/// Run the DFA table selected by the argument at the current position.
pub const OP_DFA_RUN: u8 = 0x01;
/// Run a DFA; the argument packs `dfa_id` (10 bits) and a fail target (14 bits).
pub const OP_DFA_RUN_IF_MATCH: u8 = 0x02;

// -- Delimiter/literal operations ---------------------------------------------

/// Scan forward using the delimiter stored in the constant pool.
pub const OP_SCAN_UNTIL: u8 = 0x10;
/// Match the literal stored in the constant pool at the current position.
pub const OP_MATCH_LITERAL: u8 = 0x12;
/// Scan forward using a close/escape configuration stored in the constant pool.
pub const OP_SCAN_UNTIL_ESCAPE: u8 = 0x13;
/// Match one byte in the inclusive range `[lo, hi]` (`lo` = bits 15:8, `hi` = bits 7:0).
pub const OP_MATCH_RANGE: u8 = 0x14;
/// Match a literal; the argument packs `const_id` (10 bits) and a fail target (14 bits).
pub const OP_MATCH_LITERAL_OR_JUMP: u8 = 0x16;

// -- Branch/control operations ------------------------------------------------

/// Jump through the jump table selected by the argument, keyed on the current byte.
pub const OP_SWITCH_BYTE: u8 = 0x20;
/// Unconditional jump to the instruction index in the argument.
pub const OP_JUMP: u8 = 0x21;
/// Jump to the instruction index in the argument when at end of input.
pub const OP_JUMP_IF_EOF: u8 = 0x24;

// -- Mode operations ----------------------------------------------------------

/// Push the current mode and switch to the mode given by the argument.
pub const OP_PUSH_MODE: u8 = 0x30;
/// Pop the mode stack (if non-empty) and continue at that mode's entry point.
pub const OP_POP_MODE: u8 = 0x31;

// -- Token operations ---------------------------------------------------------

/// Emit a token covering `mark..pos`.
pub const OP_EMIT: u8 = 0x40;
/// Advance `mark` to `pos` without emitting a token.
pub const OP_EMIT_SKIP: u8 = 0x41;
/// Emit an error token; dispatched to the same handler as `OP_EMIT`.
pub const OP_EMIT_ERROR: u8 = 0x42;
/// Record the current position as the token start (`mark`).
pub const OP_MARK: u8 = 0x43;
/// Emit a token and jump; the argument packs `token_id` (10 bits) and a target (14 bits).
pub const OP_EMIT_AND_JUMP: u8 = 0x44;
/// Look up the text `mark..pos` in a keyword table and stage the resulting token id.
pub const OP_KEYWORD_LOOKUP: u8 = 0x45;
/// Run a literal trie from the constant pool and update the best-match candidate.
pub const OP_LITERAL_TRIE_RUN: u8 = 0x46;
/// Reset best-match tracking.
pub const OP_CLEAR_BEST: u8 = 0x49;
/// Jump to the best match's action, or to the argument when there is no match.
pub const OP_COMMIT_BEST: u8 = 0x4C;
/// Record a candidate match; the argument packs `order` (10 bits) and `action_ip` (14 bits).
pub const OP_SET_MATCH: u8 = 0x4D;
/// Combined CLEAR_BEST + LITERAL_TRIE_RUN + COMMIT_BEST.
pub const OP_LITERAL_TRIE_COMMIT: u8 = 0x4E;
/// Advance `mark` to `pos` without emitting, then jump to the argument.
pub const OP_EMIT_SKIP_AND_JUMP: u8 = 0x4F;

// -- Special ------------------------------------------------------------------

/// Stop execution.
pub const OP_HALT: u8 = 0xFF;
52
+
53
+ // =============================================================================
54
+ // Helper Types
55
+ // =============================================================================
56
+
57
/// Two values packed into one 24-bit instruction argument.
///
/// The argument is split into a 10-bit upper field and a 14-bit lower field.
/// Opcodes in this module that use the layout:
/// - DFA_RUN_IF_MATCH: dfa_id (upper) + fail_target (lower)
/// - MATCH_LITERAL_OR_JUMP: const_id (upper) + fail_target (lower)
/// - EMIT_AND_JUMP: token_id (upper) + jump_target (lower)
/// - SET_MATCH: order (upper) + action_ip (lower)
/// - LITERAL_TRIE_COMMIT: const_id (upper) + fail_target (lower)
#[derive(Debug, Clone, Copy)]
pub struct PackedArg {
    /// Bits 23:14 of the argument (10 bits).
    pub upper: u16,
    /// Bits 13:0 of the argument (14 bits).
    pub lower: u32,
}

impl PackedArg {
    /// Width of the lower field in bits.
    const LOWER_BITS: u32 = 14;
    /// Mask selecting the 14-bit lower field.
    const LOWER_MASK: u32 = (1 << 14) - 1;
    /// Mask selecting the 10-bit upper field once it has been shifted down.
    const UPPER_MASK: u32 = (1 << 10) - 1;

    /// Split `val` into its upper and lower fields.
    ///
    /// Any bits above bit 23 are ignored.
    #[inline]
    pub fn from_u32(val: u32) -> Self {
        let lower = val & Self::LOWER_MASK;
        let upper = ((val >> Self::LOWER_BITS) & Self::UPPER_MASK) as u16;
        Self { upper, lower }
    }
}
83
+
84
+ // =============================================================================
85
+ // Helper Functions
86
+ // =============================================================================
87
+
88
+ /// Validate jump target
89
+ #[inline]
90
+ fn validate_jump(target: u32, prog: &CompiledProgram) -> VmResult {
91
+ if (target as usize) < prog.instructions.len() {
92
+ VmResult::Continue
93
+ } else {
94
+ VmResult::Halt
95
+ }
96
+ }
97
+
98
+ /// Load constant from pool, setting failure state if not found
99
+ ///
100
+ /// Returns Some(data) if constant exists, None if not found (and sets vm.failed)
101
+ #[inline]
102
+ fn load_constant_or_fail<'a>(vm: &mut VmState<'a>, const_id: usize) -> Option<&'a [u8]> {
103
+ match vm.prog.constant_pool.get(const_id) {
104
+ Some(data) => Some(data),
105
+ None => {
106
+ vm.failed = true;
107
+ vm.ip += 1;
108
+ None
109
+ }
110
+ }
111
+ }
112
+
113
+ /// Check for zero-progress after emit (infinite loop detection)
114
+ #[inline]
115
+ fn check_zero_progress(vm: &mut VmState) -> VmResult {
116
+ if vm.pos == vm.last_emit_pos {
117
+ vm.zero_progress_count += 1;
118
+ if vm.zero_progress_count >= MAX_ZERO_PROGRESS_TOKENS {
119
+ return VmResult::Halt;
120
+ }
121
+ } else {
122
+ vm.zero_progress_count = 0;
123
+ vm.last_emit_pos = vm.pos;
124
+ }
125
+ VmResult::Continue
126
+ }
127
+
128
/// Decode a big-endian `u16` from the first two bytes of `bytes`.
///
/// # Panics
///
/// Panics if `bytes` holds fewer than two bytes.
#[inline]
fn read_u16(bytes: &[u8]) -> u16 {
    u16::from_be_bytes([bytes[0], bytes[1]])
}
133
+
134
+ // =============================================================================
135
+ // Opcode Handlers
136
+ // =============================================================================
137
+
138
+ /// Execute a single opcode
139
+ ///
140
+ /// Returns (VmResult, Option<EmitResult>)
141
+ pub fn execute(vm: &mut VmState, opcode: u8, arg: u32) -> (VmResult, Option<EmitResult>) {
142
+ match opcode {
143
+ OP_HALT => (VmResult::Halt, None),
144
+
145
+ // DFA operations
146
+ OP_DFA_RUN => {
147
+ op_dfa_run(vm, arg);
148
+ (VmResult::Continue, None)
149
+ }
150
+ OP_DFA_RUN_IF_MATCH => (op_dfa_run_if_match(vm, arg), None),
151
+
152
+ // Delimiter/literal operations
153
+ OP_SCAN_UNTIL => {
154
+ op_scan_until(vm, arg);
155
+ (VmResult::Continue, None)
156
+ }
157
+ OP_MATCH_LITERAL => {
158
+ op_match_literal(vm, arg);
159
+ (VmResult::Continue, None)
160
+ }
161
+ OP_SCAN_UNTIL_ESCAPE => {
162
+ op_scan_until_escape(vm, arg);
163
+ (VmResult::Continue, None)
164
+ }
165
+ OP_MATCH_RANGE => {
166
+ op_match_range(vm, arg);
167
+ (VmResult::Continue, None)
168
+ }
169
+ OP_MATCH_LITERAL_OR_JUMP => (op_match_literal_or_jump(vm, arg), None),
170
+
171
+ // Branch/control operations
172
+ OP_SWITCH_BYTE => (op_switch_byte(vm, arg), None),
173
+ OP_JUMP => (op_jump(vm, arg), None),
174
+ OP_JUMP_IF_EOF => (op_jump_if_eof(vm, arg), None),
175
+
176
+ // Mode operations
177
+ OP_PUSH_MODE => (op_push_mode(vm, arg), None),
178
+ OP_POP_MODE => (op_pop_mode(vm), None),
179
+
180
+ // Token operations
181
+ OP_EMIT | OP_EMIT_ERROR => op_emit(vm, arg),
182
+ OP_EMIT_SKIP => {
183
+ let result = op_emit_skip(vm);
184
+ (result, None)
185
+ }
186
+ OP_MARK => {
187
+ op_mark(vm);
188
+ (VmResult::Continue, None)
189
+ }
190
+ OP_EMIT_AND_JUMP => op_emit_and_jump(vm, arg),
191
+ OP_KEYWORD_LOOKUP => {
192
+ op_keyword_lookup(vm, arg);
193
+ (VmResult::Continue, None)
194
+ }
195
+ OP_LITERAL_TRIE_RUN => {
196
+ op_literal_trie_run(vm, arg);
197
+ (VmResult::Continue, None)
198
+ }
199
+ OP_CLEAR_BEST => {
200
+ op_clear_best(vm);
201
+ (VmResult::Continue, None)
202
+ }
203
+ OP_COMMIT_BEST => (op_commit_best(vm, arg), None),
204
+ OP_SET_MATCH => {
205
+ op_set_match(vm, arg);
206
+ (VmResult::Continue, None)
207
+ }
208
+ OP_LITERAL_TRIE_COMMIT => (op_literal_trie_commit(vm, arg), None),
209
+ OP_EMIT_SKIP_AND_JUMP => (op_emit_skip_and_jump(vm, arg), None),
210
+
211
+ // Unknown opcode
212
+ _ => (VmResult::Halt, None),
213
+ }
214
+ }
215
+
216
+ // =============================================================================
217
+ // Individual Opcode Implementations
218
+ // =============================================================================
219
+
220
+ /// OP_MARK: Set mark position
221
+ fn op_mark(vm: &mut VmState) {
222
+ vm.mark = vm.pos;
223
+ vm.ip += 1;
224
+ }
225
+
226
+ /// OP_DFA_RUN: Execute DFA and advance position
227
+ fn op_dfa_run(vm: &mut VmState, arg: u32) {
228
+ let dfa_id = arg as usize;
229
+ if dfa_id >= vm.prog.dfa_tables.len() {
230
+ vm.failed = true;
231
+ vm.last_match_len = 0;
232
+ vm.ip += 1;
233
+ return;
234
+ }
235
+
236
+ let dfa = &vm.prog.dfa_tables[dfa_id];
237
+ if let Some(m) = dfa.run(vm.remaining_bytes()) {
238
+ vm.pos += m.length;
239
+ vm.failed = false;
240
+ vm.last_match_len = m.length;
241
+ } else {
242
+ vm.failed = true;
243
+ vm.last_match_len = 0;
244
+ }
245
+ vm.ip += 1;
246
+ }
247
+
248
+ /// OP_DFA_RUN_IF_MATCH: Run DFA with integrated failure jump
249
+ ///
250
+ /// Argument encoding:
251
+ /// - arg[23:14] = dfa_id (10 bits)
252
+ /// - arg[13:0] = fail_target (14 bits)
253
+ fn op_dfa_run_if_match(vm: &mut VmState, arg: u32) -> VmResult {
254
+ let packed = PackedArg::from_u32(arg);
255
+ let dfa_id = packed.upper as usize;
256
+ let fail_target = packed.lower;
257
+
258
+ if dfa_id >= vm.prog.dfa_tables.len() {
259
+ vm.failed = true;
260
+ vm.last_match_len = 0;
261
+ vm.ip = fail_target;
262
+ return validate_jump(fail_target, vm.prog);
263
+ }
264
+
265
+ let dfa = &vm.prog.dfa_tables[dfa_id];
266
+ if let Some(m) = dfa.run(vm.remaining_bytes()) {
267
+ vm.pos += m.length;
268
+ vm.failed = false;
269
+ vm.last_match_len = m.length;
270
+ vm.ip += 1;
271
+ VmResult::Continue
272
+ } else {
273
+ vm.failed = true;
274
+ vm.last_match_len = 0;
275
+ vm.ip = fail_target;
276
+ validate_jump(fail_target, vm.prog)
277
+ }
278
+ }
279
+
280
+ /// OP_SCAN_UNTIL: Scan until delimiter found
281
+ fn op_scan_until(vm: &mut VmState, arg: u32) {
282
+ let Some(delim) = load_constant_or_fail(vm, arg as usize) else {
283
+ return;
284
+ };
285
+
286
+ let remaining = vm.remaining_bytes();
287
+ let scanned = scan_until(remaining, delim);
288
+ vm.pos += scanned;
289
+ vm.failed = scanned >= remaining.len();
290
+ vm.ip += 1;
291
+ }
292
+
293
+ /// OP_SCAN_UNTIL_ESCAPE: Scan until delimiter found, skipping escape sequences
294
+ fn op_scan_until_escape(vm: &mut VmState, arg: u32) {
295
+ let Some(config) = load_constant_or_fail(vm, arg as usize) else {
296
+ return;
297
+ };
298
+ if config.len() < 4 {
299
+ vm.failed = true;
300
+ vm.ip += 1;
301
+ return;
302
+ }
303
+
304
+ let close_len = read_u16(&config[0..2]) as usize;
305
+ let escape_len = read_u16(&config[2..4]) as usize;
306
+
307
+ if 4 + close_len + escape_len > config.len() {
308
+ vm.failed = true;
309
+ vm.ip += 1;
310
+ return;
311
+ }
312
+
313
+ let close = &config[4..4 + close_len];
314
+ let escape = &config[4 + close_len..4 + close_len + escape_len];
315
+
316
+ let (found, end_pos) = scan_until_escape(vm.remaining_bytes(), close, escape);
317
+
318
+ if found {
319
+ vm.pos += end_pos;
320
+ vm.failed = false;
321
+ } else {
322
+ vm.pos = vm.bytes.len();
323
+ vm.failed = true;
324
+ }
325
+ vm.ip += 1;
326
+ }
327
+
328
+ /// OP_MATCH_LITERAL: Match literal string
329
+ fn op_match_literal(vm: &mut VmState, arg: u32) {
330
+ let Some(lit) = load_constant_or_fail(vm, arg as usize) else {
331
+ vm.last_match_len = 0;
332
+ return;
333
+ };
334
+
335
+ if match_literal(vm.remaining_bytes(), lit) {
336
+ vm.pos += lit.len();
337
+ vm.failed = false;
338
+ vm.last_match_len = lit.len();
339
+ } else {
340
+ vm.failed = true;
341
+ vm.last_match_len = 0;
342
+ }
343
+ vm.ip += 1;
344
+ }
345
+
346
+ /// OP_MATCH_LITERAL_OR_JUMP: Match literal with integrated failure jump
347
+ ///
348
+ /// Argument encoding:
349
+ /// - arg[23:14] = const_id (10 bits)
350
+ /// - arg[13:0] = fail_target (14 bits)
351
+ fn op_match_literal_or_jump(vm: &mut VmState, arg: u32) -> VmResult {
352
+ let packed = PackedArg::from_u32(arg);
353
+ let const_id = packed.upper as usize;
354
+ let fail_target = packed.lower;
355
+
356
+ let lit = match vm.prog.constant_pool.get(const_id) {
357
+ Some(l) => l,
358
+ None => {
359
+ vm.failed = true;
360
+ vm.last_match_len = 0;
361
+ vm.ip = fail_target;
362
+ return validate_jump(fail_target, vm.prog);
363
+ }
364
+ };
365
+
366
+ if match_literal(vm.remaining_bytes(), lit) {
367
+ vm.pos += lit.len();
368
+ vm.failed = false;
369
+ vm.last_match_len = lit.len();
370
+ vm.ip += 1;
371
+ VmResult::Continue
372
+ } else {
373
+ vm.failed = true;
374
+ vm.last_match_len = 0;
375
+ vm.ip = fail_target;
376
+ validate_jump(fail_target, vm.prog)
377
+ }
378
+ }
379
+
380
+ /// OP_MATCH_RANGE: Match byte in range [lo, hi]
381
+ fn op_match_range(vm: &mut VmState, arg: u32) {
382
+ let lo = ((arg >> 8) & 0xFF) as u8;
383
+ let hi = (arg & 0xFF) as u8;
384
+
385
+ if lo > hi {
386
+ vm.failed = true;
387
+ vm.last_match_len = 0;
388
+ } else if let Some(byte) = vm.current_byte() {
389
+ if byte >= lo && byte <= hi {
390
+ vm.pos += 1;
391
+ vm.failed = false;
392
+ vm.last_match_len = 1;
393
+ } else {
394
+ vm.failed = true;
395
+ vm.last_match_len = 0;
396
+ }
397
+ } else {
398
+ vm.failed = true;
399
+ vm.last_match_len = 0;
400
+ }
401
+ vm.ip += 1;
402
+ }
403
+
404
+ /// OP_SWITCH_BYTE: Jump based on current byte
405
+ fn op_switch_byte(vm: &mut VmState, arg: u32) -> VmResult {
406
+ let jt_id = arg as usize;
407
+ if jt_id >= vm.prog.jump_tables.len() {
408
+ vm.ip += 1;
409
+ return VmResult::Continue;
410
+ }
411
+
412
+ let jt = &vm.prog.jump_tables[jt_id];
413
+ let target = if let Some(byte) = vm.current_byte() {
414
+ jt.lookup[byte as usize]
415
+ } else {
416
+ jt.default_offset
417
+ };
418
+
419
+ if validate_jump(target, vm.prog) == VmResult::Halt {
420
+ return VmResult::Halt;
421
+ }
422
+ vm.ip = target;
423
+ VmResult::Continue
424
+ }
425
+
426
+ /// OP_JUMP: Unconditional jump
427
+ fn op_jump(vm: &mut VmState, arg: u32) -> VmResult {
428
+ if validate_jump(arg, vm.prog) == VmResult::Halt {
429
+ return VmResult::Halt;
430
+ }
431
+ vm.ip = arg;
432
+ VmResult::Continue
433
+ }
434
+
435
+ /// OP_JUMP_IF_EOF: Jump if at end of input
436
+ fn op_jump_if_eof(vm: &mut VmState, arg: u32) -> VmResult {
437
+ if vm.is_eof() {
438
+ if validate_jump(arg, vm.prog) == VmResult::Halt {
439
+ return VmResult::Halt;
440
+ }
441
+ vm.ip = arg;
442
+ } else {
443
+ vm.ip += 1;
444
+ }
445
+ VmResult::Continue
446
+ }
447
+
448
+ /// OP_PUSH_MODE: Push current mode and switch to new mode
449
+ fn op_push_mode(vm: &mut VmState, arg: u32) -> VmResult {
450
+ if vm.mode_sp as usize >= MAX_MODE_STACK {
451
+ vm.error = Some("mode stack overflow: exceeded maximum depth of 64");
452
+ return VmResult::Halt;
453
+ }
454
+ vm.mode_stack[vm.mode_sp as usize] = vm.mode;
455
+ vm.mode_sp += 1;
456
+ vm.mode = arg as u16;
457
+
458
+ let offset = vm.prog.find_mode_offset(arg as u16);
459
+ if validate_jump(offset, vm.prog) == VmResult::Halt {
460
+ return VmResult::Halt;
461
+ }
462
+ vm.ip = offset;
463
+ VmResult::Continue
464
+ }
465
+
466
+ /// OP_POP_MODE: Pop mode from stack and return to it
467
+ fn op_pop_mode(vm: &mut VmState) -> VmResult {
468
+ if vm.mode_sp > 0 {
469
+ vm.mode_sp -= 1;
470
+ vm.mode = vm.mode_stack[vm.mode_sp as usize];
471
+ }
472
+
473
+ let offset = vm.prog.find_mode_offset(vm.mode);
474
+ if validate_jump(offset, vm.prog) == VmResult::Halt {
475
+ return VmResult::Halt;
476
+ }
477
+ vm.ip = offset;
478
+ VmResult::Continue
479
+ }
480
+
481
+ /// OP_EMIT: Emit token
482
+ fn op_emit(vm: &mut VmState, arg: u32) -> (VmResult, Option<EmitResult>) {
483
+ let token_id = if vm.has_pending_token {
484
+ let id = vm.pending_token_id;
485
+ vm.has_pending_token = false;
486
+ id
487
+ } else {
488
+ arg as u16
489
+ };
490
+
491
+ let start = vm.mark;
492
+ let length = vm.pos.saturating_sub(vm.mark);
493
+ vm.mark = vm.pos;
494
+ vm.ip += 1;
495
+
496
+ if check_zero_progress(vm) == VmResult::Halt {
497
+ return (VmResult::Halt, None);
498
+ }
499
+
500
+ (
501
+ VmResult::Emit,
502
+ Some(EmitResult {
503
+ token_id,
504
+ start,
505
+ length,
506
+ }),
507
+ )
508
+ }
509
+
510
+ /// OP_EMIT_SKIP: Skip matched content without emitting
511
+ fn op_emit_skip(vm: &mut VmState) -> VmResult {
512
+ vm.mark = vm.pos;
513
+ vm.ip += 1;
514
+ check_zero_progress(vm)
515
+ }
516
+
517
+ /// OP_EMIT_AND_JUMP: Emit token and jump
518
+ ///
519
+ /// Argument encoding:
520
+ /// - arg[23:14] = token_id (10 bits)
521
+ /// - arg[13:0] = jump_target (14 bits)
522
+ fn op_emit_and_jump(vm: &mut VmState, arg: u32) -> (VmResult, Option<EmitResult>) {
523
+ let packed = PackedArg::from_u32(arg);
524
+ let token_id_arg = packed.upper;
525
+ let target = packed.lower;
526
+
527
+ let token_id = if vm.has_pending_token {
528
+ let id = vm.pending_token_id;
529
+ vm.has_pending_token = false;
530
+ id
531
+ } else {
532
+ token_id_arg
533
+ };
534
+
535
+ let start = vm.mark;
536
+ let length = vm.pos.saturating_sub(vm.mark);
537
+ vm.mark = vm.pos;
538
+
539
+ if check_zero_progress(vm) == VmResult::Halt {
540
+ return (VmResult::Halt, None);
541
+ }
542
+
543
+ if validate_jump(target, vm.prog) == VmResult::Halt {
544
+ return (VmResult::Halt, None);
545
+ }
546
+ vm.ip = target;
547
+
548
+ (
549
+ VmResult::Emit,
550
+ Some(EmitResult {
551
+ token_id,
552
+ start,
553
+ length,
554
+ }),
555
+ )
556
+ }
557
+
558
+ /// OP_EMIT_SKIP_AND_JUMP: Skip content and jump
559
+ fn op_emit_skip_and_jump(vm: &mut VmState, arg: u32) -> VmResult {
560
+ vm.mark = vm.pos;
561
+
562
+ if check_zero_progress(vm) == VmResult::Halt {
563
+ return VmResult::Halt;
564
+ }
565
+
566
+ if validate_jump(arg, vm.prog) == VmResult::Halt {
567
+ return VmResult::Halt;
568
+ }
569
+ vm.ip = arg;
570
+ VmResult::Continue
571
+ }
572
+
573
+ /// OP_KEYWORD_LOOKUP: Look up matched text in keyword table
574
+ fn op_keyword_lookup(vm: &mut VmState, arg: u32) {
575
+ let kt_id = arg as usize;
576
+ if kt_id >= vm.prog.keyword_tables.len() {
577
+ vm.ip += 1;
578
+ return;
579
+ }
580
+
581
+ let kt = &vm.prog.keyword_tables[kt_id];
582
+
583
+ // Guard against mark being beyond input bounds
584
+ let mark = vm.mark.min(vm.bytes.len());
585
+ let match_len = vm.pos.saturating_sub(mark);
586
+ let match_text = &vm.bytes[mark..mark + match_len];
587
+
588
+ vm.pending_token_id = kt.base_token_id;
589
+ vm.has_pending_token = true;
590
+
591
+ for entry in &kt.entries {
592
+ if entry.keyword.len() == match_len && entry.keyword == match_text {
593
+ vm.pending_token_id = entry.token_id;
594
+ break;
595
+ }
596
+ }
597
+
598
+ vm.ip += 1;
599
+ }
600
+
601
+ /// OP_LITERAL_TRIE_RUN: Run literal trie with integrated best-match update
602
+ fn op_literal_trie_run(vm: &mut VmState, arg: u32) {
603
+ let Some(trie_data) = load_constant_or_fail(vm, arg as usize) else {
604
+ vm.last_match_len = 0;
605
+ vm.reset_pos_to_mark();
606
+ return;
607
+ };
608
+
609
+ if let Some(m) = trie::run(trie_data, vm.remaining_bytes()) {
610
+ vm.pos += m.length;
611
+ vm.failed = false;
612
+ vm.last_match_len = m.length;
613
+ vm.last_match_order = m.order;
614
+ vm.last_match_ip = m.action_ip;
615
+
616
+ // Update best match if longer or same length with earlier order
617
+ vm.update_best_if_better();
618
+ } else {
619
+ vm.failed = true;
620
+ vm.last_match_len = 0;
621
+ }
622
+
623
+ // Reset to mark for next candidate
624
+ vm.reset_pos_to_mark();
625
+ vm.ip += 1;
626
+ }
627
+
628
+ /// OP_SET_MATCH: Combined regex candidate match update
629
+ ///
630
+ /// Argument encoding:
631
+ /// - arg[23:14] = order (10 bits)
632
+ /// - arg[13:0] = action_ip (14 bits)
633
+ fn op_set_match(vm: &mut VmState, arg: u32) {
634
+ let packed = PackedArg::from_u32(arg);
635
+ vm.last_match_order = packed.upper;
636
+ vm.last_match_ip = packed.lower;
637
+
638
+ // Update best match if longer or same length with earlier order
639
+ if vm.last_match_len > 0 {
640
+ vm.update_best_if_better();
641
+ }
642
+
643
+ // Reset to mark for next candidate
644
+ vm.reset_pos_to_mark();
645
+ vm.ip += 1;
646
+ }
647
+
648
+ /// OP_CLEAR_BEST: Reset best match tracking
649
+ fn op_clear_best(vm: &mut VmState) {
650
+ vm.clear_best();
651
+ vm.ip += 1;
652
+ }
653
+
654
+ /// OP_COMMIT_BEST: Jump to best match action or default
655
+ fn op_commit_best(vm: &mut VmState, arg: u32) -> VmResult {
656
+ if vm.best_match_len > 0 {
657
+ vm.pos = vm.mark + vm.best_match_len;
658
+ if validate_jump(vm.best_match_ip, vm.prog) == VmResult::Halt {
659
+ return VmResult::Halt;
660
+ }
661
+ vm.ip = vm.best_match_ip;
662
+ } else {
663
+ if validate_jump(arg, vm.prog) == VmResult::Halt {
664
+ return VmResult::Halt;
665
+ }
666
+ vm.ip = arg;
667
+ }
668
+ VmResult::Continue
669
+ }
670
+
671
+ /// OP_LITERAL_TRIE_COMMIT: Combined CLEAR_BEST + LITERAL_TRIE_RUN + COMMIT_BEST
672
+ ///
673
+ /// Argument encoding:
674
+ /// - arg[23:14] = const_id (10 bits)
675
+ /// - arg[13:0] = fail_target (14 bits)
676
+ fn op_literal_trie_commit(vm: &mut VmState, arg: u32) -> VmResult {
677
+ let packed = PackedArg::from_u32(arg);
678
+ let const_id = packed.upper as usize;
679
+ let fail_target = packed.lower;
680
+
681
+ // CLEAR_BEST
682
+ vm.clear_best();
683
+
684
+ // LITERAL_TRIE_RUN logic
685
+ let trie_data = match vm.prog.constant_pool.get(const_id) {
686
+ Some(t) => t,
687
+ None => {
688
+ if validate_jump(fail_target, vm.prog) == VmResult::Halt {
689
+ return VmResult::Halt;
690
+ }
691
+ vm.ip = fail_target;
692
+ return VmResult::Continue;
693
+ }
694
+ };
695
+
696
+ if let Some(m) = trie::run(trie_data, vm.remaining_bytes()) {
697
+ // Match found - update best_match (always better since we just cleared)
698
+ vm.best_match_len = m.length;
699
+ vm.best_match_order = m.order;
700
+ vm.best_match_ip = m.action_ip;
701
+ }
702
+
703
+ // COMMIT_BEST
704
+ if vm.best_match_len > 0 {
705
+ vm.pos = vm.mark + vm.best_match_len;
706
+ if validate_jump(vm.best_match_ip, vm.prog) == VmResult::Halt {
707
+ return VmResult::Halt;
708
+ }
709
+ vm.ip = vm.best_match_ip;
710
+ } else {
711
+ if validate_jump(fail_target, vm.prog) == VmResult::Halt {
712
+ return VmResult::Halt;
713
+ }
714
+ vm.ip = fail_target;
715
+ }
716
+
717
+ VmResult::Continue
718
+ }