lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
//! Safety tests that can be run under Miri
|
|
2
|
+
//!
|
|
3
|
+
//! Run with: cargo +nightly miri test
|
|
4
|
+
//!
|
|
5
|
+
//! These tests verify memory safety for:
|
|
6
|
+
//! - FastTokenBuffer (Vec-based token buffer)
|
|
7
|
+
//! - DFA execution (boundary conditions)
|
|
8
|
+
//! - Trie execution (malformed data handling)
|
|
9
|
+
//! - VM execution (infinite loop detection, stack overflow)
|
|
10
|
+
|
|
11
|
+
#[cfg(test)]
mod tests {
    use crate::dfa::{match_literal, scan_until, scan_until_escape};
    use crate::fast_stream::FastTokenBuffer;
    use crate::trie;
    use crate::types::{
        CompiledProgram, DfaTable as DfaTableType, Instruction, KeywordEntry, KeywordTable, Mode,
        DFA_NO_ACCEPT,
    };
    use crate::vm;

    // =========================================================================
    // FastTokenBuffer tests
    // =========================================================================

    /// Walks a three-token buffer to EOF, checking the current token at each step.
    #[test]
    fn test_token_buffer_basic() {
        let mut buffer = FastTokenBuffer::from_tokens(vec![
            (1u16, 0usize, 5usize),
            (2u16, 5usize, 3usize),
            (3u16, 8usize, 2usize),
        ])
        .unwrap();

        // The first token is current right after construction.
        assert!(!buffer.eof());
        assert_eq!(buffer.token_id(), 1);
        assert_eq!(buffer.start(), 0);
        assert_eq!(buffer.len(), 5);

        buffer.advance();
        assert_eq!(buffer.token_id(), 2);

        buffer.advance();
        assert_eq!(buffer.token_id(), 3);

        // Stepping off the last token reports EOF and the -1 sentinel.
        buffer.advance();
        assert!(buffer.eof());
        assert_eq!(buffer.token_id(), -1);
    }

    /// An empty buffer is at EOF immediately and every accessor yields -1.
    #[test]
    fn test_empty_buffer() {
        let empty = FastTokenBuffer::from_tokens(vec![]).unwrap();

        assert!(empty.eof());
        assert_eq!(empty.token_id(), -1);
        assert_eq!(empty.start(), -1);
        assert_eq!(empty.len(), -1);
    }

    /// Peeking at offsets 0 and 1 returns token data; offset 2 is out of bounds (-1).
    #[test]
    fn test_peek() {
        let buffer =
            FastTokenBuffer::from_tokens(vec![(1u16, 0usize, 5usize), (2u16, 5usize, 3usize)])
                .unwrap();

        // Offset 0: the current token.
        assert_eq!(buffer.peek_token_id(0), 1);
        assert_eq!(buffer.peek_start(0), 0);
        assert_eq!(buffer.peek_len(0), 5);

        // Offset 1: the following token.
        assert_eq!(buffer.peek_token_id(1), 2);
        assert_eq!(buffer.peek_start(1), 5);
        assert_eq!(buffer.peek_len(1), 3);

        // Offset 2: out of bounds.
        assert_eq!(buffer.peek_token_id(2), -1);
        assert_eq!(buffer.peek_start(2), -1);
        assert_eq!(buffer.peek_len(2), -1);
    }

    /// A single-token buffer exposes that token, then reaches EOF with index 1.
    #[test]
    fn test_single_token() {
        let mut buffer = FastTokenBuffer::from_tokens(vec![(42u16, 100usize, 50usize)]).unwrap();

        assert_eq!(buffer.count(), 1);
        assert_eq!(buffer.index(), 0);
        assert_eq!(buffer.token_id(), 42);
        assert_eq!(buffer.start(), 100);
        assert_eq!(buffer.len(), 50);

        buffer.advance();
        assert_eq!(buffer.index(), 1);
        assert!(buffer.eof());
    }

    /// Iterates 1000 tokens and checks id/start of each one in order.
    #[test]
    fn test_many_tokens() {
        let tokens: Vec<_> = (0..1000).map(|i| (i as u16, i * 10, 10usize)).collect();
        let mut buffer = FastTokenBuffer::from_tokens(tokens).unwrap();
        assert_eq!(buffer.count(), 1000);

        for i in 0..1000 {
            assert_eq!(buffer.token_id(), i as i32);
            assert_eq!(buffer.start(), (i * 10) as i32);
            buffer.advance();
        }

        assert!(buffer.eof());
    }

    /// Repeated `advance` calls after EOF must stay at EOF.
    #[test]
    fn test_advance_past_end() {
        let mut buffer = FastTokenBuffer::from_tokens(vec![(1u16, 0usize, 1usize)]).unwrap();

        buffer.advance();
        assert!(buffer.eof());

        // Advancing past end should be safe
        for _ in 0..3 {
            buffer.advance();
        }
        assert!(buffer.eof());
        assert_eq!(buffer.token_id(), -1);
    }

    // =========================================================================
    // DFA tests
    // =========================================================================

    /// Empty input either fails to match or produces a zero-length match.
    #[test]
    fn test_dfa_empty_input() {
        let outcome = create_simple_dfa().run(b"");
        // Empty input may or may not match depending on DFA (zero-length match)
        assert!(outcome.map_or(true, |m| m.length == 0));
    }

    /// A freshly constructed (empty) DFA table never matches.
    #[test]
    fn test_dfa_empty_dfa() {
        let empty = DfaTableType::new();
        assert!(empty.run(b"hello").is_none());
    }

    /// The sample DFA matches exactly one byte of "a".
    #[test]
    fn test_dfa_single_byte_match() {
        let outcome = create_simple_dfa().run(b"a");
        assert!(outcome.is_some());
        assert_eq!(outcome.unwrap().length, 1);
    }

    /// Input that does not start with 'a' yields no match.
    #[test]
    fn test_dfa_no_match() {
        assert!(create_simple_dfa().run(b"xyz").is_none());
    }

    /// Builds the three-state DFA shared by the DFA and VM tests; it matches 'a'.
    ///
    /// Byte classes: class 1 is `b'a'`, class 0 is every other byte.
    /// Rows of `transitions` are laid out per state, one column per class.
    fn create_simple_dfa() -> DfaTableType {
        let mut dfa = DfaTableType::new();
        dfa.state_count = 3;
        dfa.class_count = 2;

        dfa.byte_class = vec![0; 256];
        dfa.byte_class[b'a' as usize] = 1;

        dfa.transitions = vec![
            0, 0, // state 0 (dead)
            0, 2, // state 1: class 0 -> dead, class 1 ('a') -> state 2
            0, 0, // state 2 (accept)
        ];
        dfa.accept_tokens = vec![
            DFA_NO_ACCEPT, // state 0
            DFA_NO_ACCEPT, // state 1
            1,             // state 2: token 1
        ];
        dfa
    }

    // =========================================================================
    // scan_until / match_literal tests
    // =========================================================================

    /// Empty haystack returns 0; empty needle returns the haystack length.
    #[test]
    fn test_scan_until_empty() {
        assert_eq!(scan_until(b"", b"x"), 0);
        assert_eq!(scan_until(b"hello", b""), 5);
    }

    /// Single-byte and multi-byte delimiters are both located at offset 5.
    #[test]
    fn test_scan_until_found() {
        assert_eq!(scan_until(b"hello world", b" "), 5);
        assert_eq!(scan_until(b"hello-->world", b"-->"), 5);
    }

    /// A missing delimiter reports the full input length.
    #[test]
    fn test_scan_until_not_found() {
        assert_eq!(scan_until(b"hello", b"x"), 5);
    }

    /// Without any escape sequence present, the first quote is found at offset 5.
    #[test]
    fn test_scan_until_escape_basic() {
        let (found, pos) = scan_until_escape(b"hello\"world", b"\"", b"\\\"");
        assert_eq!((found, pos), (true, 5));
    }

    /// An escaped quote is skipped; the unescaped one at offset 12 is reported.
    #[test]
    fn test_scan_until_escape_with_escape() {
        let (found, pos) = scan_until_escape(b"hello\\\"world\"end", b"\"", b"\\\"");
        assert_eq!((found, pos), (true, 12));
    }

    /// `match_literal` accepts a prefix and rejects a mismatch or an over-long literal.
    #[test]
    fn test_match_literal_basic() {
        assert!(match_literal(b"hello", b"hel"));
        assert!(!match_literal(b"hello", b"world"));
        assert!(!match_literal(b"hi", b"hello"));
    }

    // =========================================================================
    // Trie tests
    // =========================================================================

    /// An empty trie blob never matches.
    #[test]
    fn test_trie_empty_data() {
        assert!(trie::run(&[], b"hello").is_none());
    }

    /// Empty input never matches (the root itself is not accepting).
    #[test]
    fn test_trie_empty_input() {
        let trie_data = build_simple_trie();
        assert!(trie::run(&trie_data, b"").is_none());
    }

    /// "a" matches with length 1.
    #[test]
    fn test_trie_match() {
        let trie_data = build_simple_trie();
        let found = trie::run(&trie_data, b"a");
        assert!(found.is_some());
        assert_eq!(found.unwrap().length, 1);
    }

    /// "ab" prefers the longer literal, length 2.
    #[test]
    fn test_trie_longer_match() {
        let trie_data = build_simple_trie();
        let found = trie::run(&trie_data, b"ab");
        assert!(found.is_some());
        assert_eq!(found.unwrap().length, 2);
    }

    /// Input with no matching first edge yields no match.
    #[test]
    fn test_trie_no_match() {
        let trie_data = build_simple_trie();
        assert!(trie::run(&trie_data, b"xyz").is_none());
    }

    /// A blob shorter than the 8-byte header is rejected.
    #[test]
    fn test_trie_malformed_short() {
        // Too short to be valid
        assert!(trie::run(&[0, 0, 0, 1], b"a").is_none());
    }

    /// Serializes a trie for "a" (action=10) and "ab" (action=20).
    ///
    /// Layout: 8-byte header (node count, edge count), then 12-byte nodes
    /// (edge_start, edge_len, order, action), then 5-byte edges (byte, child).
    /// All integers are big-endian.
    fn build_simple_trie() -> Vec<u8> {
        // (edge_start, edge_len, order, action)
        let nodes: [(u32, u16, u16, u32); 3] = [
            (0, 1, 0xFFFF, 0xFFFFFFFF), // node 0: root, not accepting
            (1, 1, 1, 10),              // node 1: after 'a'
            (0, 0, 0, 20),              // node 2: after 'ab'
        ];
        // (byte, child node index)
        let edges: [(u8, u32); 2] = [
            (b'a', 1), // edge 0: 'a' -> node 1
            (b'b', 2), // edge 1: 'b' -> node 2
        ];

        let mut trie = Vec::new();

        // Header: 3 nodes, 2 edges
        trie.extend_from_slice(&(nodes.len() as u32).to_be_bytes());
        trie.extend_from_slice(&(edges.len() as u32).to_be_bytes());

        for &(edge_start, edge_len, order, action) in &nodes {
            trie.extend_from_slice(&edge_start.to_be_bytes());
            trie.extend_from_slice(&edge_len.to_be_bytes());
            trie.extend_from_slice(&order.to_be_bytes());
            trie.extend_from_slice(&action.to_be_bytes());
        }

        for &(byte, child) in &edges {
            trie.push(byte);
            trie.extend_from_slice(&child.to_be_bytes());
        }

        trie
    }

    // =========================================================================
    // VM tests
    // =========================================================================

    /// A program with no instructions produces no tokens and no error.
    #[test]
    fn test_vm_empty_program() {
        let prog = CompiledProgram::new();
        let result = vm::collect_tokens(&prog, b"hello");
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    /// A HALT-only program succeeds on empty input.
    #[test]
    fn test_vm_empty_input() {
        let prog = create_halt_program();
        assert!(vm::collect_tokens(&prog, b"").is_ok());
    }

    /// One 'a' produces exactly one token `(1, 0, 1)`.
    #[test]
    fn test_vm_single_token() {
        let prog = create_simple_vm_program();
        let result = vm::collect_tokens(&prog, b"a");
        assert!(result.is_ok());

        let tokens = result.unwrap();
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0], (1, 0, 1));
    }

    /// Three 'a' bytes produce three tokens.
    #[test]
    fn test_vm_multiple_tokens() {
        let prog = create_simple_vm_program();
        let result = vm::collect_tokens(&prog, b"aaa");
        assert!(result.is_ok());
        assert_eq!(result.unwrap().len(), 3);
    }

    /// Test that mode operations don't cause memory issues.
    #[test]
    fn test_vm_mode_stack() {
        let mut prog = CompiledProgram::new();

        // Two modes, declared with start offsets 0 and 2.
        prog.modes.push(Mode { start_offset: 0 });
        prog.modes.push(Mode { start_offset: 2 });

        // Instructions: PUSH_MODE 1, HALT, HALT.
        // NOTE(review): mode 1 declares start_offset 2; presumably that addresses
        // the third instruction below - confirm the VM's offset units.
        prog.instructions = vec![
            Instruction {
                opcode: 0x30, // PUSH_MODE
                arg_hi: 0,
                arg_mid: 0,
                arg_lo: 1,
            },
            halt_instruction(),
            halt_instruction(),
        ];

        let result = vm::collect_tokens(&prog, b"x");
        assert!(result.is_ok());
    }

    /// A program carrying a keyword table (but only a HALT instruction) runs cleanly.
    #[test]
    fn test_vm_keyword_lookup() {
        let mut prog = CompiledProgram::new();

        // Add a keyword table
        let mut kt = KeywordTable::new(1); // base token = 1
        kt.entries.push(KeywordEntry {
            keyword: b"if".to_vec(),
            token_id: 2,
        });
        prog.keyword_tables.push(kt);

        // Simple program that just halts
        prog.instructions = vec![halt_instruction()];

        let result = vm::collect_tokens(&prog, b"if");
        assert!(result.is_ok());
    }

    /// Returns a HALT instruction (raw opcode 0xFF, all argument bytes zero).
    fn halt_instruction() -> Instruction {
        Instruction {
            opcode: 0xFF, // HALT
            arg_hi: 0,
            arg_mid: 0,
            arg_lo: 0,
        }
    }

    /// Builds a one-instruction program (HALT) with a single mode at offset 0.
    fn create_halt_program() -> CompiledProgram {
        let mut prog = CompiledProgram::new();
        prog.instructions = vec![halt_instruction()];
        prog.modes.push(Mode { start_offset: 0 });
        prog
    }

    /// Builds a looping program that emits token 1 for every 'a' it can match.
    fn create_simple_vm_program() -> CompiledProgram {
        use crate::opcodes::{OP_DFA_RUN_IF_MATCH, OP_EMIT, OP_HALT, OP_JUMP, OP_MARK};

        let mut prog = CompiledProgram::new();

        // DFA that matches 'a' (same table the DFA tests use)
        prog.dfa_tables.push(create_simple_dfa());

        // Instructions:
        // 0: MARK
        // 1: DFA_RUN_IF_MATCH (dfa=0, fail->4)
        // 2: EMIT 1
        // 3: JUMP 0
        // 4: HALT
        prog.instructions = vec![
            Instruction {
                opcode: OP_MARK,
                arg_hi: 0,
                arg_mid: 0,
                arg_lo: 0,
            },
            Instruction {
                opcode: OP_DFA_RUN_IF_MATCH,
                arg_hi: 0,
                arg_mid: 0,
                arg_lo: 4,
            },
            Instruction {
                opcode: OP_EMIT,
                arg_hi: 0,
                arg_mid: 0,
                arg_lo: 1,
            },
            Instruction {
                opcode: OP_JUMP,
                arg_hi: 0,
                arg_mid: 0,
                arg_lo: 0,
            },
            Instruction {
                opcode: OP_HALT,
                arg_hi: 0,
                arg_mid: 0,
                arg_lo: 0,
            },
        ];

        prog.modes.push(Mode { start_offset: 0 });
        prog.default_mode_offset = 0;

        prog
    }

    // =========================================================================
    // Stress tests (smaller under Miri)
    // =========================================================================

    /// Fills a buffer with many one-byte tokens and walks all of them.
    #[test]
    fn test_stress_token_buffer() {
        // Miri is slow, so keep this reasonable
        let count = if cfg!(miri) { 100 } else { 10000 };

        let tokens: Vec<_> = (0..count).map(|i| (i as u16, i, 1usize)).collect();
        let mut buffer = FastTokenBuffer::from_tokens(tokens).unwrap();

        for i in 0..count {
            assert_eq!(buffer.token_id(), i as i32);
            buffer.advance();
        }

        assert!(buffer.eof());
    }
}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
//! Literal Trie matching
|
|
2
|
+
//!
|
|
3
|
+
//! This module implements trie-based literal matching for the lexer VM.
|
|
4
|
+
//! The trie format is a binary serialization format:
|
|
5
|
+
//!
|
|
6
|
+
//! Header (8 bytes):
|
|
7
|
+
//! - node_count: u32 (big-endian)
|
|
8
|
+
//! - edge_count: u32 (big-endian)
|
|
9
|
+
//!
|
|
10
|
+
//! Nodes (12 bytes each):
|
|
11
|
+
//! - edge_start: u32 (big-endian) - index into edges array
|
|
12
|
+
//! - edge_len: u16 (big-endian) - number of outgoing edges
|
|
13
|
+
//! - order: u16 (big-endian) - match priority
|
|
14
|
+
//! - action: u32 (big-endian) - action IP (0xFFFFFFFF = no action)
|
|
15
|
+
//!
|
|
16
|
+
//! Edges (5 bytes each):
|
|
17
|
+
//! - byte: u8 - the byte to match
|
|
18
|
+
//! - child_idx: u32 (big-endian) - index of child node
|
|
19
|
+
|
|
20
|
+
/// Trie match result
#[derive(Debug, Clone, Copy)]
pub struct TrieMatch {
    /// Number of input bytes consumed by the match.
    pub length: usize,
    /// `order` field of the matched node (match priority).
    pub order: u16,
    /// `action` field of the matched node (action IP).
    pub action_ip: u32,
}

/// Size of the header: node_count (u32) + edge_count (u32).
const HEADER_SIZE: usize = 8;
/// Size of one serialized node: edge_start (u32), edge_len (u16), order (u16), action (u32).
const NODE_SIZE: usize = 12;
/// Size of one serialized edge: byte (u8) + child_idx (u32).
const EDGE_SIZE: usize = 5;
/// `action` value marking a node that does not accept.
const NO_ACTION: u32 = 0xFFFF_FFFF;

/// Read big-endian u32 from the first four bytes of `bytes`.
///
/// Panics if `bytes` holds fewer than four bytes.
#[inline]
fn read_u32(bytes: &[u8]) -> u32 {
    u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
}

/// Read big-endian u16 from the first two bytes of `bytes`.
///
/// Panics if `bytes` holds fewer than two bytes.
#[inline]
fn read_u16(bytes: &[u8]) -> u16 {
    u16::from_be_bytes([bytes[0], bytes[1]])
}

/// Run literal trie and return best match
///
/// Walks the serialized `trie` from the root (node 0), consuming one byte of
/// `input` per edge, and remembers the last accepting node reached.
///
/// Returns None if no match, Some(TrieMatch) on match. Malformed data never
/// panics: a blob shorter than the header, a blob whose declared node and edge
/// tables do not fit inside it, or a blob with zero nodes yields `None`. A node
/// whose edge range leaves the declared edge table ends the walk with the best
/// match found so far.
pub fn run(trie: &[u8], input: &[u8]) -> Option<TrieMatch> {
    // The header must be present before the counts can be read.
    if trie.len() < HEADER_SIZE {
        return None;
    }

    let node_count = read_u32(&trie[0..4]);
    let edge_count = read_u32(&trie[4..8]);

    // Without a root node there is nothing to walk.
    if node_count == 0 {
        return None;
    }

    // Checked arithmetic: the counts come from the blob, so the table sizes
    // could otherwise overflow `usize` on 32-bit targets.
    let nodes_offset = HEADER_SIZE;
    let edges_offset = (node_count as usize)
        .checked_mul(NODE_SIZE)?
        .checked_add(nodes_offset)?;
    let edges_end = (edge_count as usize)
        .checked_mul(EDGE_SIZE)?
        .checked_add(edges_offset)?;

    if edges_end > trie.len() {
        return None;
    }

    let mut best: Option<TrieMatch> = None;
    let mut node_idx: u32 = 0;
    let mut pos = 0;

    while pos < input.len() {
        // `node_idx < node_count` holds here (root checked above, children
        // checked below), so the node lies inside the validated node table.
        let node_ptr = nodes_offset + (node_idx as usize) * NODE_SIZE;
        let edge_start = read_u32(&trie[node_ptr..node_ptr + 4]);
        let edge_len = read_u16(&trie[node_ptr + 4..node_ptr + 6]);

        let byte = input[pos];
        let mut child_idx: Option<u32> = None;

        // Search for matching edge. Indices are widened to u64 so that
        // `edge_start + i` cannot wrap.
        for i in 0..u64::from(edge_len) {
            let edge_index = u64::from(edge_start) + i;
            if edge_index >= u64::from(edge_count) {
                // Edge range runs past the declared edge table: malformed.
                return best;
            }
            // In range: `edge_index < edge_count`, whose byte size was validated.
            let edge_ptr = edges_offset + (edge_index as usize) * EDGE_SIZE;
            if trie[edge_ptr] == byte {
                child_idx = Some(read_u32(&trie[edge_ptr + 1..edge_ptr + EDGE_SIZE]));
                break;
            }
        }

        match child_idx {
            Some(idx) if idx < node_count => {
                node_idx = idx;
                pos += 1;

                // Check if this node has an action
                let child_ptr = nodes_offset + (idx as usize) * NODE_SIZE;
                let order = read_u16(&trie[child_ptr + 6..child_ptr + 8]);
                let action = read_u32(&trie[child_ptr + 8..child_ptr + 12]);

                if action != NO_ACTION {
                    // Update best if longer or same length with earlier order
                    let should_update = best.map_or(true, |b| {
                        pos > b.length || (pos == b.length && order < b.order)
                    });
                    if should_update {
                        best = Some(TrieMatch {
                            length: pos,
                            order,
                            action_ip: action,
                        });
                    }
                }
            }
            // No edge for this byte, or the edge points at a non-existent node.
            _ => break,
        }
    }

    best
}
|
|
125
|
+
|
|
126
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes a small trie recognising "a" (action 10) and "ab" (action 20).
    ///
    /// Node 0 is the non-accepting root with one edge 'a' -> node 1;
    /// node 1 accepts "a" and has one edge 'b' -> node 2; node 2 accepts "ab".
    fn build_simple_trie() -> Vec<u8> {
        // (edge_start, edge_len, order, action)
        let nodes: [(u32, u16, u16, u32); 3] = [
            (0, 1, 0xFFFF, 0xFFFFFFFF),
            (1, 1, 1, 10),
            (0, 0, 0, 20),
        ];
        // (byte, child)
        let edges: [(u8, u32); 2] = [(b'a', 1), (b'b', 2)];

        let mut bytes = Vec::new();

        // Header: 3 nodes, 2 edges
        bytes.extend_from_slice(&(nodes.len() as u32).to_be_bytes());
        bytes.extend_from_slice(&(edges.len() as u32).to_be_bytes());

        for &(edge_start, edge_len, order, action) in &nodes {
            bytes.extend_from_slice(&edge_start.to_be_bytes());
            bytes.extend_from_slice(&edge_len.to_be_bytes());
            bytes.extend_from_slice(&order.to_be_bytes());
            bytes.extend_from_slice(&action.to_be_bytes());
        }

        for &(byte, child) in &edges {
            bytes.push(byte);
            bytes.extend_from_slice(&child.to_be_bytes());
        }

        bytes
    }

    /// Runs the sample trie on `input` and returns `(length, action_ip)` of the match.
    fn lookup(input: &[u8]) -> Option<(usize, u32)> {
        run(&build_simple_trie(), input).map(|m| (m.length, m.action_ip))
    }

    /// "a" matches the one-byte literal.
    #[test]
    fn test_trie_match_single() {
        assert_eq!(lookup(b"a"), Some((1, 10)));
    }

    /// "ab" matches the two-byte literal rather than stopping at "a".
    #[test]
    fn test_trie_match_longer() {
        assert_eq!(lookup(b"ab"), Some((2, 20)));
    }

    /// "abc" matches its longest known prefix, "ab".
    #[test]
    fn test_trie_match_prefix() {
        assert_eq!(lookup(b"abc"), Some((2, 20)));
    }

    /// Input with no edge from the root yields no match.
    #[test]
    fn test_trie_no_match() {
        assert_eq!(lookup(b"xyz"), None);
    }
}
|