lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,498 @@
1
+ //! Safety tests that can be run under Miri
2
+ //!
3
+ //! Run with: cargo +nightly miri test
4
+ //!
5
+ //! These tests verify memory safety for:
6
+ //! - FastTokenBuffer (Vec-based token buffer)
7
+ //! - DFA execution (boundary conditions)
8
+ //! - Trie execution (malformed data handling)
9
+ //! - VM execution (infinite loop detection, stack overflow)
10
+
11
#[cfg(test)]
mod tests {
    use crate::dfa::{match_literal, scan_until, scan_until_escape};
    use crate::fast_stream::FastTokenBuffer;
    use crate::trie;
    use crate::types::{
        CompiledProgram, DfaTable as DfaTableType, Instruction, KeywordEntry, KeywordTable, Mode,
        DFA_NO_ACCEPT,
    };
    use crate::vm;

    // =========================================================================
    // FastTokenBuffer tests
    // =========================================================================

    /// Walks a three-token buffer from start to EOF and checks the accessors.
    #[test]
    fn test_token_buffer_basic() {
        let tokens = vec![
            (1u16, 0usize, 5usize),
            (2u16, 5usize, 3usize),
            (3u16, 8usize, 2usize),
        ];
        let mut buffer = FastTokenBuffer::from_tokens(tokens).unwrap();

        assert!(!buffer.eof());
        assert_eq!(buffer.token_id(), 1);
        assert_eq!(buffer.start(), 0);
        assert_eq!(buffer.len(), 5);

        buffer.advance();
        assert_eq!(buffer.token_id(), 2);

        buffer.advance();
        assert_eq!(buffer.token_id(), 3);

        // Past the last token every accessor reports the -1 sentinel.
        buffer.advance();
        assert!(buffer.eof());
        assert_eq!(buffer.token_id(), -1);
    }

    /// An empty buffer is at EOF immediately and only ever yields sentinels.
    #[test]
    fn test_empty_buffer() {
        let buffer = FastTokenBuffer::from_tokens(vec![]).unwrap();
        assert!(buffer.eof());
        assert_eq!(buffer.token_id(), -1);
        assert_eq!(buffer.start(), -1);
        assert_eq!(buffer.len(), -1);
    }

    /// Peeking at in-range offsets returns token data; out-of-range yields -1.
    #[test]
    fn test_peek() {
        let tokens = vec![(1u16, 0usize, 5usize), (2u16, 5usize, 3usize)];
        let buffer = FastTokenBuffer::from_tokens(tokens).unwrap();

        assert_eq!(buffer.peek_token_id(0), 1);
        assert_eq!(buffer.peek_token_id(1), 2);
        assert_eq!(buffer.peek_token_id(2), -1); // out of bounds
        assert_eq!(buffer.peek_start(0), 0);
        assert_eq!(buffer.peek_start(1), 5);
        assert_eq!(buffer.peek_start(2), -1);
        assert_eq!(buffer.peek_len(0), 5);
        assert_eq!(buffer.peek_len(1), 3);
        assert_eq!(buffer.peek_len(2), -1);
    }

    /// A one-token buffer reports count/index correctly before and after advancing.
    #[test]
    fn test_single_token() {
        let tokens = vec![(42u16, 100usize, 50usize)];
        let mut buffer = FastTokenBuffer::from_tokens(tokens).unwrap();

        assert_eq!(buffer.count(), 1);
        assert_eq!(buffer.index(), 0);
        assert_eq!(buffer.token_id(), 42);
        assert_eq!(buffer.start(), 100);
        assert_eq!(buffer.len(), 50);

        buffer.advance();
        assert!(buffer.eof());
        assert_eq!(buffer.index(), 1);
    }

    /// Iterates a 1000-token buffer and checks id/start of every entry.
    #[test]
    fn test_many_tokens() {
        let tokens: Vec<_> = (0..1000)
            .map(|i| (i as u16, i * 10, 10usize))
            .collect();
        let mut buffer = FastTokenBuffer::from_tokens(tokens).unwrap();

        assert_eq!(buffer.count(), 1000);

        for i in 0..1000 {
            assert_eq!(buffer.token_id(), i as i32);
            assert_eq!(buffer.start(), (i * 10) as i32);
            buffer.advance();
        }

        assert!(buffer.eof());
    }

    /// Repeatedly advancing beyond EOF must stay at EOF and keep returning -1.
    #[test]
    fn test_advance_past_end() {
        let tokens = vec![(1u16, 0usize, 1usize)];
        let mut buffer = FastTokenBuffer::from_tokens(tokens).unwrap();

        buffer.advance();
        assert!(buffer.eof());

        // Advancing past end should be safe
        for _ in 0..3 {
            buffer.advance();
        }
        assert!(buffer.eof());
        assert_eq!(buffer.token_id(), -1);
    }

    // =========================================================================
    // DFA tests
    // =========================================================================

    /// Empty input may or may not match depending on the DFA; if it does, the
    /// match must be zero-length.
    #[test]
    fn test_dfa_empty_input() {
        let dfa = create_simple_dfa();
        if let Some(m) = dfa.run(b"") {
            assert_eq!(m.length, 0);
        }
    }

    /// A freshly constructed table has nothing to match with.
    #[test]
    fn test_dfa_empty_dfa() {
        let dfa = DfaTableType::new();
        assert!(dfa.run(b"hello").is_none());
    }

    #[test]
    fn test_dfa_single_byte_match() {
        let dfa = create_simple_dfa();
        let m = dfa.run(b"a").expect("simple DFA should match \"a\"");
        assert_eq!(m.length, 1);
    }

    #[test]
    fn test_dfa_no_match() {
        let dfa = create_simple_dfa();
        assert!(dfa.run(b"xyz").is_none());
    }

    /// Builds a three-state DFA that matches the single byte `'a'` as token 1.
    ///
    /// Byte class 1 is `'a'`, class 0 is everything else; the transition table is
    /// laid out as one row of `class_count` entries per state.
    fn create_simple_dfa() -> DfaTableType {
        let mut dfa = DfaTableType::new();
        dfa.state_count = 3;
        dfa.class_count = 2;
        dfa.byte_class = vec![0; 256];
        dfa.byte_class[b'a' as usize] = 1;
        dfa.transitions = vec![
            0, 0, // state 0 (dead)
            0, 2, // state 1: class 0 -> dead, class 1 ('a') -> state 2
            0, 0, // state 2 (accept)
        ];
        dfa.accept_tokens = vec![
            DFA_NO_ACCEPT, // state 0
            DFA_NO_ACCEPT, // state 1
            1,             // state 2: token 1
        ];
        dfa
    }

    // =========================================================================
    // scan_until / match_literal tests
    // =========================================================================

    #[test]
    fn test_scan_until_empty() {
        assert_eq!(scan_until(b"", b"x"), 0);
        assert_eq!(scan_until(b"hello", b""), 5);
    }

    #[test]
    fn test_scan_until_found() {
        assert_eq!(scan_until(b"hello world", b" "), 5);
        assert_eq!(scan_until(b"hello-->world", b"-->"), 5);
    }

    /// When the delimiter is absent the scan runs to the end of the input.
    #[test]
    fn test_scan_until_not_found() {
        assert_eq!(scan_until(b"hello", b"x"), 5);
    }

    #[test]
    fn test_scan_until_escape_basic() {
        let (found, pos) = scan_until_escape(b"hello\"world", b"\"", b"\\\"");
        assert!(found);
        assert_eq!(pos, 5);
    }

    /// The escaped quote at offset 5..7 is skipped; the real one is at 12.
    #[test]
    fn test_scan_until_escape_with_escape() {
        let (found, pos) = scan_until_escape(b"hello\\\"world\"end", b"\"", b"\\\"");
        assert!(found);
        assert_eq!(pos, 12);
    }

    #[test]
    fn test_match_literal_basic() {
        assert!(match_literal(b"hello", b"hel"));
        assert!(!match_literal(b"hello", b"world"));
        assert!(!match_literal(b"hi", b"hello"));
    }

    // =========================================================================
    // Trie tests
    // =========================================================================

    #[test]
    fn test_trie_empty_data() {
        assert!(trie::run(&[], b"hello").is_none());
    }

    #[test]
    fn test_trie_empty_input() {
        let trie_data = build_simple_trie();
        assert!(trie::run(&trie_data, b"").is_none());
    }

    #[test]
    fn test_trie_match() {
        let trie_data = build_simple_trie();
        let m = trie::run(&trie_data, b"a").expect("trie should match \"a\"");
        assert_eq!(m.length, 1);
    }

    #[test]
    fn test_trie_longer_match() {
        let trie_data = build_simple_trie();
        let m = trie::run(&trie_data, b"ab").expect("trie should match \"ab\"");
        assert_eq!(m.length, 2);
    }

    #[test]
    fn test_trie_no_match() {
        let trie_data = build_simple_trie();
        assert!(trie::run(&trie_data, b"xyz").is_none());
    }

    /// Four bytes cannot even hold the 8-byte trie header.
    #[test]
    fn test_trie_malformed_short() {
        assert!(trie::run(&[0, 0, 0, 1], b"a").is_none());
    }

    /// Serializes a trie for "a" (action=10) and "ab" (action=20).
    fn build_simple_trie() -> Vec<u8> {
        let mut trie = Vec::new();

        // Header: 3 nodes, 2 edges
        trie.extend_from_slice(&3u32.to_be_bytes());
        trie.extend_from_slice(&2u32.to_be_bytes());

        // Node 0: root
        trie.extend_from_slice(&0u32.to_be_bytes()); // edge_start
        trie.extend_from_slice(&1u16.to_be_bytes()); // edge_len
        trie.extend_from_slice(&0xFFFFu16.to_be_bytes()); // order (not accepting)
        trie.extend_from_slice(&0xFFFFFFFFu32.to_be_bytes()); // action (not accepting)

        // Node 1: after 'a'
        trie.extend_from_slice(&1u32.to_be_bytes()); // edge_start
        trie.extend_from_slice(&1u16.to_be_bytes()); // edge_len
        trie.extend_from_slice(&1u16.to_be_bytes()); // order
        trie.extend_from_slice(&10u32.to_be_bytes()); // action

        // Node 2: after 'ab'
        trie.extend_from_slice(&0u32.to_be_bytes()); // edge_start
        trie.extend_from_slice(&0u16.to_be_bytes()); // edge_len
        trie.extend_from_slice(&0u16.to_be_bytes()); // order
        trie.extend_from_slice(&20u32.to_be_bytes()); // action

        // Edge 0: 'a' -> node 1
        trie.push(b'a');
        trie.extend_from_slice(&1u32.to_be_bytes());

        // Edge 1: 'b' -> node 2
        trie.push(b'b');
        trie.extend_from_slice(&2u32.to_be_bytes());

        trie
    }

    // =========================================================================
    // VM tests
    // =========================================================================

    /// Builds the HALT instruction (opcode 0xFF, all arguments zero) used by the
    /// hand-assembled programs below.
    fn halt_instruction() -> Instruction {
        Instruction {
            opcode: 0xFF, // HALT
            arg_hi: 0,
            arg_mid: 0,
            arg_lo: 0,
        }
    }

    /// A program with no instructions yields no tokens and no error.
    #[test]
    fn test_vm_empty_program() {
        let prog = CompiledProgram::new();
        let tokens =
            vm::collect_tokens(&prog, b"hello").expect("empty program should run cleanly");
        assert!(tokens.is_empty());
    }

    #[test]
    fn test_vm_empty_input() {
        let prog = create_halt_program();
        let result = vm::collect_tokens(&prog, b"");
        assert!(result.is_ok());
    }

    #[test]
    fn test_vm_single_token() {
        let prog = create_simple_vm_program();
        let tokens = vm::collect_tokens(&prog, b"a").expect("lexing \"a\" should succeed");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0], (1, 0, 1));
    }

    #[test]
    fn test_vm_multiple_tokens() {
        let prog = create_simple_vm_program();
        let tokens = vm::collect_tokens(&prog, b"aaa").expect("lexing \"aaa\" should succeed");
        assert_eq!(tokens.len(), 3);
    }

    #[test]
    fn test_vm_mode_stack() {
        // Test that mode operations don't cause memory issues
        let mut prog = CompiledProgram::new();

        // Add modes. Mode 1's start_offset (2) lines up with the last HALT below.
        // NOTE(review): assumes start_offset is an instruction index — confirm in vm.rs.
        prog.modes.push(Mode { start_offset: 0 });
        prog.modes.push(Mode { start_offset: 2 });

        // Instructions: PUSH_MODE 1, HALT, HALT
        prog.instructions = vec![
            Instruction {
                opcode: 0x30, // PUSH_MODE
                arg_hi: 0,
                arg_mid: 0,
                arg_lo: 1,
            },
            halt_instruction(), // index 1
            halt_instruction(), // index 2 (mode 1 start_offset)
        ];

        let result = vm::collect_tokens(&prog, b"x");
        assert!(result.is_ok());
    }

    #[test]
    fn test_vm_keyword_lookup() {
        let mut prog = CompiledProgram::new();

        // Add a keyword table
        let mut kt = KeywordTable::new(1); // base token = 1
        kt.entries.push(KeywordEntry {
            keyword: b"if".to_vec(),
            token_id: 2,
        });
        prog.keyword_tables.push(kt);

        // Simple program that just halts; the table is present but never consulted
        // by any instruction here.
        prog.instructions = vec![halt_instruction()];

        let result = vm::collect_tokens(&prog, b"if");
        assert!(result.is_ok());
    }

    /// Builds a one-instruction program (HALT) with a single mode at offset 0.
    fn create_halt_program() -> CompiledProgram {
        let mut prog = CompiledProgram::new();
        prog.instructions = vec![halt_instruction()];
        prog.modes.push(Mode { start_offset: 0 });
        prog
    }

    /// Builds a program that repeatedly runs the 'a'-matching DFA and emits
    /// token 1 for every match, halting on the first failure.
    fn create_simple_vm_program() -> CompiledProgram {
        use crate::opcodes::*;

        let mut prog = CompiledProgram::new();

        // DFA that matches 'a' (table index 0)
        prog.dfa_tables.push(create_simple_dfa());

        // Every instruction here only needs an opcode and a low argument byte.
        let make_instruction = |opcode, arg_lo| Instruction {
            opcode,
            arg_hi: 0,
            arg_mid: 0,
            arg_lo,
        };

        // Instructions:
        // 0: MARK
        // 1: DFA_RUN_IF_MATCH (dfa=0, fail->4)
        // 2: EMIT 1
        // 3: JUMP 0
        // 4: HALT
        prog.instructions = vec![
            make_instruction(OP_MARK, 0),
            make_instruction(OP_DFA_RUN_IF_MATCH, 4),
            make_instruction(OP_EMIT, 1),
            make_instruction(OP_JUMP, 0),
            make_instruction(OP_HALT, 0),
        ];

        prog.modes.push(Mode { start_offset: 0 });
        prog.default_mode_offset = 0;

        prog
    }

    // =========================================================================
    // Stress tests (smaller under Miri)
    // =========================================================================

    #[test]
    fn test_stress_token_buffer() {
        // Miri is slow, so keep this reasonable
        let count = if cfg!(miri) { 100 } else { 10000 };

        let tokens: Vec<_> = (0..count).map(|i| (i as u16, i, 1usize)).collect();

        let mut buffer = FastTokenBuffer::from_tokens(tokens).unwrap();

        for i in 0..count {
            assert_eq!(buffer.token_id(), i as i32);
            buffer.advance();
        }

        assert!(buffer.eof());
    }
}
@@ -0,0 +1,206 @@
1
+ //! Literal Trie matching
2
+ //!
3
+ //! This module implements trie-based literal matching for the lexer VM.
4
+ //! The trie format is a binary serialization format:
5
+ //!
6
+ //! Header (8 bytes):
7
+ //! - node_count: u32 (big-endian)
8
+ //! - edge_count: u32 (big-endian)
9
+ //!
10
+ //! Nodes (12 bytes each):
11
+ //! - edge_start: u32 (big-endian) - index into edges array
12
+ //! - edge_len: u16 (big-endian) - number of outgoing edges
13
+ //! - order: u16 (big-endian) - match priority
14
+ //! - action: u32 (big-endian) - action IP (0xFFFFFFFF = no action)
15
+ //!
16
+ //! Edges (5 bytes each):
17
+ //! - byte: u8 - the byte to match
18
+ //! - child_idx: u32 (big-endian) - index of child node
19
+
20
/// Result of a successful trie lookup.
///
/// Derives `PartialEq`/`Eq` so matches can be compared directly (for example with
/// `assert_eq!`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TrieMatch {
    /// Number of input bytes consumed by the match.
    pub length: usize,
    /// The `order` field of the matched node (documented as match priority).
    pub order: u16,
    /// The `action` field of the matched node (documented as the action IP).
    pub action_ip: u32,
}
27
+
28
/// Decode a big-endian `u32` from the first four bytes of `bytes`.
///
/// Panics if `bytes` holds fewer than four bytes.
#[inline]
fn read_u32(bytes: &[u8]) -> u32 {
    u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
}
36
+
37
/// Decode a big-endian `u16` from the first two bytes of `bytes`.
///
/// Panics if `bytes` holds fewer than two bytes.
#[inline]
fn read_u16(bytes: &[u8]) -> u16 {
    u16::from_be_bytes([bytes[0], bytes[1]])
}
42
+
43
+ /// Run literal trie and return best match
44
+ ///
45
+ /// Returns None if no match, Some(TrieMatch) on match.
46
+ pub fn run(trie: &[u8], input: &[u8]) -> Option<TrieMatch> {
47
+ // Minimum trie size: header (8) + at least one node (12)
48
+ if trie.len() < 8 {
49
+ return None;
50
+ }
51
+
52
+ let node_count = read_u32(&trie[0..4]);
53
+ let edge_count = read_u32(&trie[4..8]);
54
+
55
+ let nodes_offset = 8;
56
+ let nodes_size = (node_count as usize) * 12;
57
+ let edges_offset = nodes_offset + nodes_size;
58
+ let edges_size = (edge_count as usize) * 5;
59
+
60
+ if edges_offset + edges_size > trie.len() {
61
+ return None;
62
+ }
63
+
64
+ let mut best: Option<TrieMatch> = None;
65
+ let mut node_idx: u32 = 0;
66
+ let mut pos = 0;
67
+
68
+ while pos < input.len() {
69
+ // Read current node
70
+ let node_ptr = nodes_offset + (node_idx as usize) * 12;
71
+ if node_ptr + 12 > trie.len() {
72
+ break;
73
+ }
74
+
75
+ let edge_start = read_u32(&trie[node_ptr..node_ptr + 4]);
76
+ let edge_len = read_u16(&trie[node_ptr + 4..node_ptr + 6]);
77
+
78
+ let byte = input[pos];
79
+ let mut child_idx: Option<u32> = None;
80
+
81
+ // Search for matching edge
82
+ let edge_base = edges_offset + (edge_start as usize) * 5;
83
+ for i in 0..edge_len {
84
+ let edge_ptr = edge_base + (i as usize) * 5;
85
+ if edge_ptr + 5 > trie.len() {
86
+ return best;
87
+ }
88
+ if trie[edge_ptr] == byte {
89
+ child_idx = Some(read_u32(&trie[edge_ptr + 1..edge_ptr + 5]));
90
+ break;
91
+ }
92
+ }
93
+
94
+ match child_idx {
95
+ Some(idx) if idx < node_count => {
96
+ node_idx = idx;
97
+ pos += 1;
98
+
99
+ // Check if this node has an action
100
+ let child_ptr = nodes_offset + (node_idx as usize) * 12;
101
+ let order = read_u16(&trie[child_ptr + 6..child_ptr + 8]);
102
+ let action = read_u32(&trie[child_ptr + 8..child_ptr + 12]);
103
+
104
+ if action != 0xFFFFFFFF {
105
+ // Update best if longer or same length with earlier order
106
+ let should_update = match &best {
107
+ None => true,
108
+ Some(b) => pos > b.length || (pos == b.length && order < b.order),
109
+ };
110
+ if should_update {
111
+ best = Some(TrieMatch {
112
+ length: pos,
113
+ order,
114
+ action_ip: action,
115
+ });
116
+ }
117
+ }
118
+ }
119
+ _ => break,
120
+ }
121
+ }
122
+
123
+ best
124
+ }
125
+
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Appends one 12-byte node record (all fields big-endian).
    fn push_node(buf: &mut Vec<u8>, edge_start: u32, edge_len: u16, order: u16, action: u32) {
        buf.extend_from_slice(&edge_start.to_be_bytes());
        buf.extend_from_slice(&edge_len.to_be_bytes());
        buf.extend_from_slice(&order.to_be_bytes());
        buf.extend_from_slice(&action.to_be_bytes());
    }

    /// Appends one 5-byte edge record: the byte to match, then the child index.
    fn push_edge(buf: &mut Vec<u8>, byte: u8, child: u32) {
        buf.push(byte);
        buf.extend_from_slice(&child.to_be_bytes());
    }

    /// Builds a simple trie for "a" and "ab":
    /// - Node 0: root (edge 'a' -> node 1)
    /// - Node 1: after 'a' (action=10, edge 'b' -> node 2)
    /// - Node 2: after 'ab' (action=20)
    fn build_simple_trie() -> Vec<u8> {
        let mut trie = Vec::new();

        // Header: 3 nodes, 2 edges
        trie.extend_from_slice(&3u32.to_be_bytes());
        trie.extend_from_slice(&2u32.to_be_bytes());

        push_node(&mut trie, 0, 1, 0xFFFF, 0xFFFFFFFF); // node 0: root, no action
        push_node(&mut trie, 1, 1, 1, 10); // node 1: after 'a'
        push_node(&mut trie, 0, 0, 0, 20); // node 2: after 'ab'

        push_edge(&mut trie, b'a', 1); // edge 0
        push_edge(&mut trie, b'b', 2); // edge 1

        trie
    }

    /// Runs the simple trie on `input` and checks the reported length and action.
    fn assert_match(input: &[u8], length: usize, action_ip: u32) {
        let trie = build_simple_trie();
        let m = run(&trie, input).expect("expected the trie to match");
        assert_eq!(m.length, length);
        assert_eq!(m.action_ip, action_ip);
    }

    #[test]
    fn test_trie_match_single() {
        assert_match(b"a", 1, 10);
    }

    #[test]
    fn test_trie_match_longer() {
        assert_match(b"ab", 2, 20);
    }

    /// Trailing input after the longest literal does not affect the match.
    #[test]
    fn test_trie_match_prefix() {
        assert_match(b"abc", 2, 20);
    }

    #[test]
    fn test_trie_no_match() {
        let trie = build_simple_trie();
        assert!(run(&trie, b"xyz").is_none());
    }
}