lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,718 @@
|
|
|
1
|
+
//! Opcode definitions and handlers
|
|
2
|
+
//!
|
|
3
|
+
//! This module defines all opcodes and their execution handlers.
|
|
4
|
+
//! Must match lib/lexer_kit/ir/opcode.rb and ext/lexer_kit/lexer_kit_ext.h
|
|
5
|
+
|
|
6
|
+
use crate::dfa::{match_literal, scan_until, scan_until_escape};
|
|
7
|
+
use crate::trie;
|
|
8
|
+
use crate::types::{
|
|
9
|
+
CompiledProgram, EmitResult, VmResult, VmState, MAX_MODE_STACK, MAX_ZERO_PROGRESS_TOKENS,
|
|
10
|
+
};
|
|
11
|
+
|
|
12
|
+
// =============================================================================
|
|
13
|
+
// Opcode Definitions
|
|
14
|
+
// =============================================================================
|
|
15
|
+
|
|
16
|
+
// --- DFA operations ----------------------------------------------------------

/// Execute a DFA at the current position and advance past the match.
pub const OP_DFA_RUN: u8 = 0x01;
/// Run a DFA; on failure jump to the fail target packed into the argument.
pub const OP_DFA_RUN_IF_MATCH: u8 = 0x02;

// --- Delimiter/literal operations --------------------------------------------

/// Scan forward until a delimiter taken from the constant pool is found.
pub const OP_SCAN_UNTIL: u8 = 0x10;
/// Match a literal taken from the constant pool.
pub const OP_MATCH_LITERAL: u8 = 0x12;
/// Scan forward until a delimiter is found, skipping escape sequences.
pub const OP_SCAN_UNTIL_ESCAPE: u8 = 0x13;
/// Match a single byte against the inclusive range `[lo, hi]` encoded in the argument.
pub const OP_MATCH_RANGE: u8 = 0x14;
/// Match a literal; on failure jump to the fail target packed into the argument.
pub const OP_MATCH_LITERAL_OR_JUMP: u8 = 0x16;

// --- Branch/control operations -----------------------------------------------

/// Jump through a jump table indexed by the current byte.
pub const OP_SWITCH_BYTE: u8 = 0x20;
/// Unconditional jump.
pub const OP_JUMP: u8 = 0x21;
/// Jump when the input is exhausted, otherwise fall through.
pub const OP_JUMP_IF_EOF: u8 = 0x24;

// --- Mode operations ---------------------------------------------------------

/// Push the current mode and switch to a new one.
pub const OP_PUSH_MODE: u8 = 0x30;
/// Pop a mode from the stack and return to it.
pub const OP_POP_MODE: u8 = 0x31;

// --- Token operations --------------------------------------------------------

/// Emit a token covering the text between the mark and the current position.
pub const OP_EMIT: u8 = 0x40;
/// Skip the matched content without emitting a token.
pub const OP_EMIT_SKIP: u8 = 0x41;
/// Emit an error token; `execute` dispatches it to the same handler as `OP_EMIT`.
pub const OP_EMIT_ERROR: u8 = 0x42;
/// Set the mark to the current position.
pub const OP_MARK: u8 = 0x43;
/// Emit a token and jump to the target packed into the argument.
pub const OP_EMIT_AND_JUMP: u8 = 0x44;
/// Look up the matched text in a keyword table.
pub const OP_KEYWORD_LOOKUP: u8 = 0x45;
/// Run a literal trie and update the best-match tracking.
pub const OP_LITERAL_TRIE_RUN: u8 = 0x46;
/// Reset the best-match tracking.
pub const OP_CLEAR_BEST: u8 = 0x49;
/// Jump to the best match's action, or to the default target when there is none.
pub const OP_COMMIT_BEST: u8 = 0x4C;
/// Record a regex candidate match (order + action ip) and update the best match.
pub const OP_SET_MATCH: u8 = 0x4D;
/// Combined `CLEAR_BEST` + `LITERAL_TRIE_RUN` + `COMMIT_BEST`.
pub const OP_LITERAL_TRIE_COMMIT: u8 = 0x4E;
/// Skip the matched content and jump.
pub const OP_EMIT_SKIP_AND_JUMP: u8 = 0x4F;

// --- Special -----------------------------------------------------------------

/// Stop execution.
pub const OP_HALT: u8 = 0xFF;
|
|
52
|
+
|
|
53
|
+
// =============================================================================
|
|
54
|
+
// Helper Types
|
|
55
|
+
// =============================================================================
|
|
56
|
+
|
|
57
|
+
/// Two-field view of a 24-bit instruction argument.
///
/// Several opcodes pack a 10-bit value and a 14-bit value into one argument:
/// - DFA_RUN_IF_MATCH: dfa_id (10) + fail_target (14)
/// - MATCH_LITERAL_OR_JUMP: const_id (10) + fail_target (14)
/// - EMIT_AND_JUMP: token_id (10) + jump_target (14)
/// - SET_MATCH: order (10) + action_ip (14)
/// - LITERAL_TRIE_COMMIT: const_id (10) + fail_target (14)
#[derive(Debug, Clone, Copy)]
pub struct PackedArg {
    /// The 10-bit field stored in bits 23:14.
    pub upper: u16,
    /// The 14-bit field stored in bits 13:0.
    pub lower: u32,
}

impl PackedArg {
    /// Split `val` into its 10-bit upper field and 14-bit lower field.
    ///
    /// Any bits above bit 23 are masked away.
    #[inline]
    pub fn from_u32(val: u32) -> Self {
        const LOWER_BITS: u32 = 14;
        const LOWER_MASK: u32 = (1 << LOWER_BITS) - 1;
        const UPPER_MASK: u32 = (1 << 10) - 1;

        let lower = val & LOWER_MASK;
        let upper = ((val >> LOWER_BITS) & UPPER_MASK) as u16;
        Self { upper, lower }
    }
}
|
|
83
|
+
|
|
84
|
+
// =============================================================================
|
|
85
|
+
// Helper Functions
|
|
86
|
+
// =============================================================================
|
|
87
|
+
|
|
88
|
+
/// Validate jump target
|
|
89
|
+
#[inline]
|
|
90
|
+
fn validate_jump(target: u32, prog: &CompiledProgram) -> VmResult {
|
|
91
|
+
if (target as usize) < prog.instructions.len() {
|
|
92
|
+
VmResult::Continue
|
|
93
|
+
} else {
|
|
94
|
+
VmResult::Halt
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
/// Load constant from pool, setting failure state if not found
|
|
99
|
+
///
|
|
100
|
+
/// Returns Some(data) if constant exists, None if not found (and sets vm.failed)
|
|
101
|
+
#[inline]
|
|
102
|
+
fn load_constant_or_fail<'a>(vm: &mut VmState<'a>, const_id: usize) -> Option<&'a [u8]> {
|
|
103
|
+
match vm.prog.constant_pool.get(const_id) {
|
|
104
|
+
Some(data) => Some(data),
|
|
105
|
+
None => {
|
|
106
|
+
vm.failed = true;
|
|
107
|
+
vm.ip += 1;
|
|
108
|
+
None
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/// Check for zero-progress after emit (infinite loop detection)
|
|
114
|
+
#[inline]
|
|
115
|
+
fn check_zero_progress(vm: &mut VmState) -> VmResult {
|
|
116
|
+
if vm.pos == vm.last_emit_pos {
|
|
117
|
+
vm.zero_progress_count += 1;
|
|
118
|
+
if vm.zero_progress_count >= MAX_ZERO_PROGRESS_TOKENS {
|
|
119
|
+
return VmResult::Halt;
|
|
120
|
+
}
|
|
121
|
+
} else {
|
|
122
|
+
vm.zero_progress_count = 0;
|
|
123
|
+
vm.last_emit_pos = vm.pos;
|
|
124
|
+
}
|
|
125
|
+
VmResult::Continue
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/// Decode a big-endian `u16` from the first two bytes of `bytes`.
///
/// Panics if `bytes` holds fewer than two bytes.
#[inline]
fn read_u16(bytes: &[u8]) -> u16 {
    u16::from_be_bytes([bytes[0], bytes[1]])
}
|
|
133
|
+
|
|
134
|
+
// =============================================================================
|
|
135
|
+
// Opcode Handlers
|
|
136
|
+
// =============================================================================
|
|
137
|
+
|
|
138
|
+
/// Execute a single opcode
|
|
139
|
+
///
|
|
140
|
+
/// Returns (VmResult, Option<EmitResult>)
|
|
141
|
+
pub fn execute(vm: &mut VmState, opcode: u8, arg: u32) -> (VmResult, Option<EmitResult>) {
|
|
142
|
+
match opcode {
|
|
143
|
+
OP_HALT => (VmResult::Halt, None),
|
|
144
|
+
|
|
145
|
+
// DFA operations
|
|
146
|
+
OP_DFA_RUN => {
|
|
147
|
+
op_dfa_run(vm, arg);
|
|
148
|
+
(VmResult::Continue, None)
|
|
149
|
+
}
|
|
150
|
+
OP_DFA_RUN_IF_MATCH => (op_dfa_run_if_match(vm, arg), None),
|
|
151
|
+
|
|
152
|
+
// Delimiter/literal operations
|
|
153
|
+
OP_SCAN_UNTIL => {
|
|
154
|
+
op_scan_until(vm, arg);
|
|
155
|
+
(VmResult::Continue, None)
|
|
156
|
+
}
|
|
157
|
+
OP_MATCH_LITERAL => {
|
|
158
|
+
op_match_literal(vm, arg);
|
|
159
|
+
(VmResult::Continue, None)
|
|
160
|
+
}
|
|
161
|
+
OP_SCAN_UNTIL_ESCAPE => {
|
|
162
|
+
op_scan_until_escape(vm, arg);
|
|
163
|
+
(VmResult::Continue, None)
|
|
164
|
+
}
|
|
165
|
+
OP_MATCH_RANGE => {
|
|
166
|
+
op_match_range(vm, arg);
|
|
167
|
+
(VmResult::Continue, None)
|
|
168
|
+
}
|
|
169
|
+
OP_MATCH_LITERAL_OR_JUMP => (op_match_literal_or_jump(vm, arg), None),
|
|
170
|
+
|
|
171
|
+
// Branch/control operations
|
|
172
|
+
OP_SWITCH_BYTE => (op_switch_byte(vm, arg), None),
|
|
173
|
+
OP_JUMP => (op_jump(vm, arg), None),
|
|
174
|
+
OP_JUMP_IF_EOF => (op_jump_if_eof(vm, arg), None),
|
|
175
|
+
|
|
176
|
+
// Mode operations
|
|
177
|
+
OP_PUSH_MODE => (op_push_mode(vm, arg), None),
|
|
178
|
+
OP_POP_MODE => (op_pop_mode(vm), None),
|
|
179
|
+
|
|
180
|
+
// Token operations
|
|
181
|
+
OP_EMIT | OP_EMIT_ERROR => op_emit(vm, arg),
|
|
182
|
+
OP_EMIT_SKIP => {
|
|
183
|
+
let result = op_emit_skip(vm);
|
|
184
|
+
(result, None)
|
|
185
|
+
}
|
|
186
|
+
OP_MARK => {
|
|
187
|
+
op_mark(vm);
|
|
188
|
+
(VmResult::Continue, None)
|
|
189
|
+
}
|
|
190
|
+
OP_EMIT_AND_JUMP => op_emit_and_jump(vm, arg),
|
|
191
|
+
OP_KEYWORD_LOOKUP => {
|
|
192
|
+
op_keyword_lookup(vm, arg);
|
|
193
|
+
(VmResult::Continue, None)
|
|
194
|
+
}
|
|
195
|
+
OP_LITERAL_TRIE_RUN => {
|
|
196
|
+
op_literal_trie_run(vm, arg);
|
|
197
|
+
(VmResult::Continue, None)
|
|
198
|
+
}
|
|
199
|
+
OP_CLEAR_BEST => {
|
|
200
|
+
op_clear_best(vm);
|
|
201
|
+
(VmResult::Continue, None)
|
|
202
|
+
}
|
|
203
|
+
OP_COMMIT_BEST => (op_commit_best(vm, arg), None),
|
|
204
|
+
OP_SET_MATCH => {
|
|
205
|
+
op_set_match(vm, arg);
|
|
206
|
+
(VmResult::Continue, None)
|
|
207
|
+
}
|
|
208
|
+
OP_LITERAL_TRIE_COMMIT => (op_literal_trie_commit(vm, arg), None),
|
|
209
|
+
OP_EMIT_SKIP_AND_JUMP => (op_emit_skip_and_jump(vm, arg), None),
|
|
210
|
+
|
|
211
|
+
// Unknown opcode
|
|
212
|
+
_ => (VmResult::Halt, None),
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
// =============================================================================
|
|
217
|
+
// Individual Opcode Implementations
|
|
218
|
+
// =============================================================================
|
|
219
|
+
|
|
220
|
+
/// OP_MARK: Set mark position
|
|
221
|
+
fn op_mark(vm: &mut VmState) {
|
|
222
|
+
vm.mark = vm.pos;
|
|
223
|
+
vm.ip += 1;
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/// OP_DFA_RUN: Execute DFA and advance position
|
|
227
|
+
fn op_dfa_run(vm: &mut VmState, arg: u32) {
|
|
228
|
+
let dfa_id = arg as usize;
|
|
229
|
+
if dfa_id >= vm.prog.dfa_tables.len() {
|
|
230
|
+
vm.failed = true;
|
|
231
|
+
vm.last_match_len = 0;
|
|
232
|
+
vm.ip += 1;
|
|
233
|
+
return;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
let dfa = &vm.prog.dfa_tables[dfa_id];
|
|
237
|
+
if let Some(m) = dfa.run(vm.remaining_bytes()) {
|
|
238
|
+
vm.pos += m.length;
|
|
239
|
+
vm.failed = false;
|
|
240
|
+
vm.last_match_len = m.length;
|
|
241
|
+
} else {
|
|
242
|
+
vm.failed = true;
|
|
243
|
+
vm.last_match_len = 0;
|
|
244
|
+
}
|
|
245
|
+
vm.ip += 1;
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
/// OP_DFA_RUN_IF_MATCH: Run DFA with integrated failure jump
|
|
249
|
+
///
|
|
250
|
+
/// Argument encoding:
|
|
251
|
+
/// - arg[23:14] = dfa_id (10 bits)
|
|
252
|
+
/// - arg[13:0] = fail_target (14 bits)
|
|
253
|
+
fn op_dfa_run_if_match(vm: &mut VmState, arg: u32) -> VmResult {
|
|
254
|
+
let packed = PackedArg::from_u32(arg);
|
|
255
|
+
let dfa_id = packed.upper as usize;
|
|
256
|
+
let fail_target = packed.lower;
|
|
257
|
+
|
|
258
|
+
if dfa_id >= vm.prog.dfa_tables.len() {
|
|
259
|
+
vm.failed = true;
|
|
260
|
+
vm.last_match_len = 0;
|
|
261
|
+
vm.ip = fail_target;
|
|
262
|
+
return validate_jump(fail_target, vm.prog);
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
let dfa = &vm.prog.dfa_tables[dfa_id];
|
|
266
|
+
if let Some(m) = dfa.run(vm.remaining_bytes()) {
|
|
267
|
+
vm.pos += m.length;
|
|
268
|
+
vm.failed = false;
|
|
269
|
+
vm.last_match_len = m.length;
|
|
270
|
+
vm.ip += 1;
|
|
271
|
+
VmResult::Continue
|
|
272
|
+
} else {
|
|
273
|
+
vm.failed = true;
|
|
274
|
+
vm.last_match_len = 0;
|
|
275
|
+
vm.ip = fail_target;
|
|
276
|
+
validate_jump(fail_target, vm.prog)
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
/// OP_SCAN_UNTIL: Scan until delimiter found
|
|
281
|
+
fn op_scan_until(vm: &mut VmState, arg: u32) {
|
|
282
|
+
let Some(delim) = load_constant_or_fail(vm, arg as usize) else {
|
|
283
|
+
return;
|
|
284
|
+
};
|
|
285
|
+
|
|
286
|
+
let remaining = vm.remaining_bytes();
|
|
287
|
+
let scanned = scan_until(remaining, delim);
|
|
288
|
+
vm.pos += scanned;
|
|
289
|
+
vm.failed = scanned >= remaining.len();
|
|
290
|
+
vm.ip += 1;
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
/// OP_SCAN_UNTIL_ESCAPE: Scan until delimiter found, skipping escape sequences
|
|
294
|
+
fn op_scan_until_escape(vm: &mut VmState, arg: u32) {
|
|
295
|
+
let Some(config) = load_constant_or_fail(vm, arg as usize) else {
|
|
296
|
+
return;
|
|
297
|
+
};
|
|
298
|
+
if config.len() < 4 {
|
|
299
|
+
vm.failed = true;
|
|
300
|
+
vm.ip += 1;
|
|
301
|
+
return;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
let close_len = read_u16(&config[0..2]) as usize;
|
|
305
|
+
let escape_len = read_u16(&config[2..4]) as usize;
|
|
306
|
+
|
|
307
|
+
if 4 + close_len + escape_len > config.len() {
|
|
308
|
+
vm.failed = true;
|
|
309
|
+
vm.ip += 1;
|
|
310
|
+
return;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
let close = &config[4..4 + close_len];
|
|
314
|
+
let escape = &config[4 + close_len..4 + close_len + escape_len];
|
|
315
|
+
|
|
316
|
+
let (found, end_pos) = scan_until_escape(vm.remaining_bytes(), close, escape);
|
|
317
|
+
|
|
318
|
+
if found {
|
|
319
|
+
vm.pos += end_pos;
|
|
320
|
+
vm.failed = false;
|
|
321
|
+
} else {
|
|
322
|
+
vm.pos = vm.bytes.len();
|
|
323
|
+
vm.failed = true;
|
|
324
|
+
}
|
|
325
|
+
vm.ip += 1;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
/// OP_MATCH_LITERAL: Match literal string
|
|
329
|
+
fn op_match_literal(vm: &mut VmState, arg: u32) {
|
|
330
|
+
let Some(lit) = load_constant_or_fail(vm, arg as usize) else {
|
|
331
|
+
vm.last_match_len = 0;
|
|
332
|
+
return;
|
|
333
|
+
};
|
|
334
|
+
|
|
335
|
+
if match_literal(vm.remaining_bytes(), lit) {
|
|
336
|
+
vm.pos += lit.len();
|
|
337
|
+
vm.failed = false;
|
|
338
|
+
vm.last_match_len = lit.len();
|
|
339
|
+
} else {
|
|
340
|
+
vm.failed = true;
|
|
341
|
+
vm.last_match_len = 0;
|
|
342
|
+
}
|
|
343
|
+
vm.ip += 1;
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
/// OP_MATCH_LITERAL_OR_JUMP: Match literal with integrated failure jump
|
|
347
|
+
///
|
|
348
|
+
/// Argument encoding:
|
|
349
|
+
/// - arg[23:14] = const_id (10 bits)
|
|
350
|
+
/// - arg[13:0] = fail_target (14 bits)
|
|
351
|
+
fn op_match_literal_or_jump(vm: &mut VmState, arg: u32) -> VmResult {
|
|
352
|
+
let packed = PackedArg::from_u32(arg);
|
|
353
|
+
let const_id = packed.upper as usize;
|
|
354
|
+
let fail_target = packed.lower;
|
|
355
|
+
|
|
356
|
+
let lit = match vm.prog.constant_pool.get(const_id) {
|
|
357
|
+
Some(l) => l,
|
|
358
|
+
None => {
|
|
359
|
+
vm.failed = true;
|
|
360
|
+
vm.last_match_len = 0;
|
|
361
|
+
vm.ip = fail_target;
|
|
362
|
+
return validate_jump(fail_target, vm.prog);
|
|
363
|
+
}
|
|
364
|
+
};
|
|
365
|
+
|
|
366
|
+
if match_literal(vm.remaining_bytes(), lit) {
|
|
367
|
+
vm.pos += lit.len();
|
|
368
|
+
vm.failed = false;
|
|
369
|
+
vm.last_match_len = lit.len();
|
|
370
|
+
vm.ip += 1;
|
|
371
|
+
VmResult::Continue
|
|
372
|
+
} else {
|
|
373
|
+
vm.failed = true;
|
|
374
|
+
vm.last_match_len = 0;
|
|
375
|
+
vm.ip = fail_target;
|
|
376
|
+
validate_jump(fail_target, vm.prog)
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
/// OP_MATCH_RANGE: Match byte in range [lo, hi]
|
|
381
|
+
fn op_match_range(vm: &mut VmState, arg: u32) {
|
|
382
|
+
let lo = ((arg >> 8) & 0xFF) as u8;
|
|
383
|
+
let hi = (arg & 0xFF) as u8;
|
|
384
|
+
|
|
385
|
+
if lo > hi {
|
|
386
|
+
vm.failed = true;
|
|
387
|
+
vm.last_match_len = 0;
|
|
388
|
+
} else if let Some(byte) = vm.current_byte() {
|
|
389
|
+
if byte >= lo && byte <= hi {
|
|
390
|
+
vm.pos += 1;
|
|
391
|
+
vm.failed = false;
|
|
392
|
+
vm.last_match_len = 1;
|
|
393
|
+
} else {
|
|
394
|
+
vm.failed = true;
|
|
395
|
+
vm.last_match_len = 0;
|
|
396
|
+
}
|
|
397
|
+
} else {
|
|
398
|
+
vm.failed = true;
|
|
399
|
+
vm.last_match_len = 0;
|
|
400
|
+
}
|
|
401
|
+
vm.ip += 1;
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
/// OP_SWITCH_BYTE: Jump based on current byte
|
|
405
|
+
fn op_switch_byte(vm: &mut VmState, arg: u32) -> VmResult {
|
|
406
|
+
let jt_id = arg as usize;
|
|
407
|
+
if jt_id >= vm.prog.jump_tables.len() {
|
|
408
|
+
vm.ip += 1;
|
|
409
|
+
return VmResult::Continue;
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
let jt = &vm.prog.jump_tables[jt_id];
|
|
413
|
+
let target = if let Some(byte) = vm.current_byte() {
|
|
414
|
+
jt.lookup[byte as usize]
|
|
415
|
+
} else {
|
|
416
|
+
jt.default_offset
|
|
417
|
+
};
|
|
418
|
+
|
|
419
|
+
if validate_jump(target, vm.prog) == VmResult::Halt {
|
|
420
|
+
return VmResult::Halt;
|
|
421
|
+
}
|
|
422
|
+
vm.ip = target;
|
|
423
|
+
VmResult::Continue
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
/// OP_JUMP: Unconditional jump
|
|
427
|
+
fn op_jump(vm: &mut VmState, arg: u32) -> VmResult {
|
|
428
|
+
if validate_jump(arg, vm.prog) == VmResult::Halt {
|
|
429
|
+
return VmResult::Halt;
|
|
430
|
+
}
|
|
431
|
+
vm.ip = arg;
|
|
432
|
+
VmResult::Continue
|
|
433
|
+
}
|
|
434
|
+
|
|
435
|
+
/// OP_JUMP_IF_EOF: Jump if at end of input
|
|
436
|
+
fn op_jump_if_eof(vm: &mut VmState, arg: u32) -> VmResult {
|
|
437
|
+
if vm.is_eof() {
|
|
438
|
+
if validate_jump(arg, vm.prog) == VmResult::Halt {
|
|
439
|
+
return VmResult::Halt;
|
|
440
|
+
}
|
|
441
|
+
vm.ip = arg;
|
|
442
|
+
} else {
|
|
443
|
+
vm.ip += 1;
|
|
444
|
+
}
|
|
445
|
+
VmResult::Continue
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
/// OP_PUSH_MODE: Push current mode and switch to new mode
|
|
449
|
+
fn op_push_mode(vm: &mut VmState, arg: u32) -> VmResult {
|
|
450
|
+
if vm.mode_sp as usize >= MAX_MODE_STACK {
|
|
451
|
+
vm.error = Some("mode stack overflow: exceeded maximum depth of 64");
|
|
452
|
+
return VmResult::Halt;
|
|
453
|
+
}
|
|
454
|
+
vm.mode_stack[vm.mode_sp as usize] = vm.mode;
|
|
455
|
+
vm.mode_sp += 1;
|
|
456
|
+
vm.mode = arg as u16;
|
|
457
|
+
|
|
458
|
+
let offset = vm.prog.find_mode_offset(arg as u16);
|
|
459
|
+
if validate_jump(offset, vm.prog) == VmResult::Halt {
|
|
460
|
+
return VmResult::Halt;
|
|
461
|
+
}
|
|
462
|
+
vm.ip = offset;
|
|
463
|
+
VmResult::Continue
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
/// OP_POP_MODE: Pop mode from stack and return to it
|
|
467
|
+
fn op_pop_mode(vm: &mut VmState) -> VmResult {
|
|
468
|
+
if vm.mode_sp > 0 {
|
|
469
|
+
vm.mode_sp -= 1;
|
|
470
|
+
vm.mode = vm.mode_stack[vm.mode_sp as usize];
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
let offset = vm.prog.find_mode_offset(vm.mode);
|
|
474
|
+
if validate_jump(offset, vm.prog) == VmResult::Halt {
|
|
475
|
+
return VmResult::Halt;
|
|
476
|
+
}
|
|
477
|
+
vm.ip = offset;
|
|
478
|
+
VmResult::Continue
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
/// OP_EMIT: Emit token
|
|
482
|
+
fn op_emit(vm: &mut VmState, arg: u32) -> (VmResult, Option<EmitResult>) {
|
|
483
|
+
let token_id = if vm.has_pending_token {
|
|
484
|
+
let id = vm.pending_token_id;
|
|
485
|
+
vm.has_pending_token = false;
|
|
486
|
+
id
|
|
487
|
+
} else {
|
|
488
|
+
arg as u16
|
|
489
|
+
};
|
|
490
|
+
|
|
491
|
+
let start = vm.mark;
|
|
492
|
+
let length = vm.pos.saturating_sub(vm.mark);
|
|
493
|
+
vm.mark = vm.pos;
|
|
494
|
+
vm.ip += 1;
|
|
495
|
+
|
|
496
|
+
if check_zero_progress(vm) == VmResult::Halt {
|
|
497
|
+
return (VmResult::Halt, None);
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
(
|
|
501
|
+
VmResult::Emit,
|
|
502
|
+
Some(EmitResult {
|
|
503
|
+
token_id,
|
|
504
|
+
start,
|
|
505
|
+
length,
|
|
506
|
+
}),
|
|
507
|
+
)
|
|
508
|
+
}
|
|
509
|
+
|
|
510
|
+
/// OP_EMIT_SKIP: Skip matched content without emitting
|
|
511
|
+
fn op_emit_skip(vm: &mut VmState) -> VmResult {
|
|
512
|
+
vm.mark = vm.pos;
|
|
513
|
+
vm.ip += 1;
|
|
514
|
+
check_zero_progress(vm)
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
/// OP_EMIT_AND_JUMP: Emit token and jump
|
|
518
|
+
///
|
|
519
|
+
/// Argument encoding:
|
|
520
|
+
/// - arg[23:14] = token_id (10 bits)
|
|
521
|
+
/// - arg[13:0] = jump_target (14 bits)
|
|
522
|
+
fn op_emit_and_jump(vm: &mut VmState, arg: u32) -> (VmResult, Option<EmitResult>) {
|
|
523
|
+
let packed = PackedArg::from_u32(arg);
|
|
524
|
+
let token_id_arg = packed.upper;
|
|
525
|
+
let target = packed.lower;
|
|
526
|
+
|
|
527
|
+
let token_id = if vm.has_pending_token {
|
|
528
|
+
let id = vm.pending_token_id;
|
|
529
|
+
vm.has_pending_token = false;
|
|
530
|
+
id
|
|
531
|
+
} else {
|
|
532
|
+
token_id_arg
|
|
533
|
+
};
|
|
534
|
+
|
|
535
|
+
let start = vm.mark;
|
|
536
|
+
let length = vm.pos.saturating_sub(vm.mark);
|
|
537
|
+
vm.mark = vm.pos;
|
|
538
|
+
|
|
539
|
+
if check_zero_progress(vm) == VmResult::Halt {
|
|
540
|
+
return (VmResult::Halt, None);
|
|
541
|
+
}
|
|
542
|
+
|
|
543
|
+
if validate_jump(target, vm.prog) == VmResult::Halt {
|
|
544
|
+
return (VmResult::Halt, None);
|
|
545
|
+
}
|
|
546
|
+
vm.ip = target;
|
|
547
|
+
|
|
548
|
+
(
|
|
549
|
+
VmResult::Emit,
|
|
550
|
+
Some(EmitResult {
|
|
551
|
+
token_id,
|
|
552
|
+
start,
|
|
553
|
+
length,
|
|
554
|
+
}),
|
|
555
|
+
)
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
/// OP_EMIT_SKIP_AND_JUMP: Skip content and jump
|
|
559
|
+
fn op_emit_skip_and_jump(vm: &mut VmState, arg: u32) -> VmResult {
|
|
560
|
+
vm.mark = vm.pos;
|
|
561
|
+
|
|
562
|
+
if check_zero_progress(vm) == VmResult::Halt {
|
|
563
|
+
return VmResult::Halt;
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
if validate_jump(arg, vm.prog) == VmResult::Halt {
|
|
567
|
+
return VmResult::Halt;
|
|
568
|
+
}
|
|
569
|
+
vm.ip = arg;
|
|
570
|
+
VmResult::Continue
|
|
571
|
+
}
|
|
572
|
+
|
|
573
|
+
/// OP_KEYWORD_LOOKUP: Look up matched text in keyword table
|
|
574
|
+
fn op_keyword_lookup(vm: &mut VmState, arg: u32) {
|
|
575
|
+
let kt_id = arg as usize;
|
|
576
|
+
if kt_id >= vm.prog.keyword_tables.len() {
|
|
577
|
+
vm.ip += 1;
|
|
578
|
+
return;
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
let kt = &vm.prog.keyword_tables[kt_id];
|
|
582
|
+
|
|
583
|
+
// Guard against mark being beyond input bounds
|
|
584
|
+
let mark = vm.mark.min(vm.bytes.len());
|
|
585
|
+
let match_len = vm.pos.saturating_sub(mark);
|
|
586
|
+
let match_text = &vm.bytes[mark..mark + match_len];
|
|
587
|
+
|
|
588
|
+
vm.pending_token_id = kt.base_token_id;
|
|
589
|
+
vm.has_pending_token = true;
|
|
590
|
+
|
|
591
|
+
for entry in &kt.entries {
|
|
592
|
+
if entry.keyword.len() == match_len && entry.keyword == match_text {
|
|
593
|
+
vm.pending_token_id = entry.token_id;
|
|
594
|
+
break;
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
vm.ip += 1;
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
/// OP_LITERAL_TRIE_RUN: Run literal trie with integrated best-match update
|
|
602
|
+
fn op_literal_trie_run(vm: &mut VmState, arg: u32) {
|
|
603
|
+
let Some(trie_data) = load_constant_or_fail(vm, arg as usize) else {
|
|
604
|
+
vm.last_match_len = 0;
|
|
605
|
+
vm.reset_pos_to_mark();
|
|
606
|
+
return;
|
|
607
|
+
};
|
|
608
|
+
|
|
609
|
+
if let Some(m) = trie::run(trie_data, vm.remaining_bytes()) {
|
|
610
|
+
vm.pos += m.length;
|
|
611
|
+
vm.failed = false;
|
|
612
|
+
vm.last_match_len = m.length;
|
|
613
|
+
vm.last_match_order = m.order;
|
|
614
|
+
vm.last_match_ip = m.action_ip;
|
|
615
|
+
|
|
616
|
+
// Update best match if longer or same length with earlier order
|
|
617
|
+
vm.update_best_if_better();
|
|
618
|
+
} else {
|
|
619
|
+
vm.failed = true;
|
|
620
|
+
vm.last_match_len = 0;
|
|
621
|
+
}
|
|
622
|
+
|
|
623
|
+
// Reset to mark for next candidate
|
|
624
|
+
vm.reset_pos_to_mark();
|
|
625
|
+
vm.ip += 1;
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
/// OP_SET_MATCH: Combined regex candidate match update
|
|
629
|
+
///
|
|
630
|
+
/// Argument encoding:
|
|
631
|
+
/// - arg[23:14] = order (10 bits)
|
|
632
|
+
/// - arg[13:0] = action_ip (14 bits)
|
|
633
|
+
fn op_set_match(vm: &mut VmState, arg: u32) {
|
|
634
|
+
let packed = PackedArg::from_u32(arg);
|
|
635
|
+
vm.last_match_order = packed.upper;
|
|
636
|
+
vm.last_match_ip = packed.lower;
|
|
637
|
+
|
|
638
|
+
// Update best match if longer or same length with earlier order
|
|
639
|
+
if vm.last_match_len > 0 {
|
|
640
|
+
vm.update_best_if_better();
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
// Reset to mark for next candidate
|
|
644
|
+
vm.reset_pos_to_mark();
|
|
645
|
+
vm.ip += 1;
|
|
646
|
+
}
|
|
647
|
+
|
|
648
|
+
/// OP_CLEAR_BEST: Reset best match tracking
|
|
649
|
+
fn op_clear_best(vm: &mut VmState) {
|
|
650
|
+
vm.clear_best();
|
|
651
|
+
vm.ip += 1;
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
/// OP_COMMIT_BEST: Jump to best match action or default
|
|
655
|
+
fn op_commit_best(vm: &mut VmState, arg: u32) -> VmResult {
|
|
656
|
+
if vm.best_match_len > 0 {
|
|
657
|
+
vm.pos = vm.mark + vm.best_match_len;
|
|
658
|
+
if validate_jump(vm.best_match_ip, vm.prog) == VmResult::Halt {
|
|
659
|
+
return VmResult::Halt;
|
|
660
|
+
}
|
|
661
|
+
vm.ip = vm.best_match_ip;
|
|
662
|
+
} else {
|
|
663
|
+
if validate_jump(arg, vm.prog) == VmResult::Halt {
|
|
664
|
+
return VmResult::Halt;
|
|
665
|
+
}
|
|
666
|
+
vm.ip = arg;
|
|
667
|
+
}
|
|
668
|
+
VmResult::Continue
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
/// OP_LITERAL_TRIE_COMMIT: Combined CLEAR_BEST + LITERAL_TRIE_RUN + COMMIT_BEST
|
|
672
|
+
///
|
|
673
|
+
/// Argument encoding:
|
|
674
|
+
/// - arg[23:14] = const_id (10 bits)
|
|
675
|
+
/// - arg[13:0] = fail_target (14 bits)
|
|
676
|
+
fn op_literal_trie_commit(vm: &mut VmState, arg: u32) -> VmResult {
|
|
677
|
+
let packed = PackedArg::from_u32(arg);
|
|
678
|
+
let const_id = packed.upper as usize;
|
|
679
|
+
let fail_target = packed.lower;
|
|
680
|
+
|
|
681
|
+
// CLEAR_BEST
|
|
682
|
+
vm.clear_best();
|
|
683
|
+
|
|
684
|
+
// LITERAL_TRIE_RUN logic
|
|
685
|
+
let trie_data = match vm.prog.constant_pool.get(const_id) {
|
|
686
|
+
Some(t) => t,
|
|
687
|
+
None => {
|
|
688
|
+
if validate_jump(fail_target, vm.prog) == VmResult::Halt {
|
|
689
|
+
return VmResult::Halt;
|
|
690
|
+
}
|
|
691
|
+
vm.ip = fail_target;
|
|
692
|
+
return VmResult::Continue;
|
|
693
|
+
}
|
|
694
|
+
};
|
|
695
|
+
|
|
696
|
+
if let Some(m) = trie::run(trie_data, vm.remaining_bytes()) {
|
|
697
|
+
// Match found - update best_match (always better since we just cleared)
|
|
698
|
+
vm.best_match_len = m.length;
|
|
699
|
+
vm.best_match_order = m.order;
|
|
700
|
+
vm.best_match_ip = m.action_ip;
|
|
701
|
+
}
|
|
702
|
+
|
|
703
|
+
// COMMIT_BEST
|
|
704
|
+
if vm.best_match_len > 0 {
|
|
705
|
+
vm.pos = vm.mark + vm.best_match_len;
|
|
706
|
+
if validate_jump(vm.best_match_ip, vm.prog) == VmResult::Halt {
|
|
707
|
+
return VmResult::Halt;
|
|
708
|
+
}
|
|
709
|
+
vm.ip = vm.best_match_ip;
|
|
710
|
+
} else {
|
|
711
|
+
if validate_jump(fail_target, vm.prog) == VmResult::Halt {
|
|
712
|
+
return VmResult::Halt;
|
|
713
|
+
}
|
|
714
|
+
vm.ip = fail_target;
|
|
715
|
+
}
|
|
716
|
+
|
|
717
|
+
VmResult::Continue
|
|
718
|
+
}
|