lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9461615637319b9b54724de2d2b7597bd6e8351157b893460aac1039942e3a36
4
+ data.tar.gz: dda19a6134d27d3524ffccc44b63eef24bd5ca60a905d0ba36ef38cf7b872699
5
+ SHA512:
6
+ metadata.gz: bc120d92fe97d14f9dcd1522770c8872ad4722e6b8d40b471edffce25662507b5add3afb1a3424b73d5d2b4d30eaed3ef9d4a514d58fb2dbe761165fcfd876a8
7
+ data.tar.gz: d5c052b7e48111cc23fa5383fc37a9366368f8057c68d65293e421b1f933465b59de4a8f9ca3360d4eeffcb6efbdd15499669c488551a7e572f012bf9fc8676d
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Masayoshi Takahashi (takahashim)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,157 @@
1
+ # LexerKit
2
+
3
+ A high-performance lexer toolkit for Ruby.
4
+ Define tokenizers with a Ruby DSL and run them through a Rust native extension.
5
+
6
+ ## Features
7
+
8
+ - DSL-based lexer definition
9
+ - Fast stream lexing with minimal allocation
10
+ - On-demand token object creation for diagnostics
11
+ - Compiled lexer serialization
12
+ - Regex-based token patterns compiled to DFA
13
+
14
+ ## Installation
15
+
16
+ ```ruby
17
+ # Gemfile
18
+ gem "lexer_kit"
19
+ ```
20
+
21
+ ```bash
22
+ bundle install
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ```ruby
28
+ require "lexer_kit"
29
+
30
+ lexer = LexerKit.build do
31
+ token :NUMBER, /[0-9]+/
32
+ token :PLUS, "+"
33
+ token :MINUS, "-"
34
+ token :SPACE, /[ \t\r\n]+/, skip: true
35
+ end.compile
36
+
37
+ stream = lexer.stream("12 + 34 - 5")
38
+ until stream.eof?
39
+ puts "#{stream.token_name}: #{stream.text.inspect}"
40
+ stream.advance
41
+ end
42
+ ```
43
+
44
+ ## Core DSL
45
+
46
+ ### `token`
47
+
48
+ ```ruby
49
+ token :IDENT, /[a-zA-Z_][a-zA-Z0-9_]*/
50
+ token :ARROW, "->"
51
+ token :SPACE, /[ \t]+/, skip: true
52
+ token :DQUOTE, '"', push: :string
53
+ token :END_Q, '"', pop: true
54
+ ```
55
+
56
+ Options:
57
+
58
+ - `skip: true` skips emitting the token
59
+ - `push: :mode_name` pushes a mode
60
+ - `pop: true` pops the current mode
61
+
62
+ ### `keyword` / `define_keywords`
63
+
64
+ ```ruby
65
+ token :IDENT, /[a-z_]+/
66
+ keyword :IF, "if"
67
+ define_keywords :else, :while, :return
68
+ ```
69
+
70
+ ### `mode`
71
+
72
+ ```ruby
73
+ LexerKit.build do
74
+ token :DQUOTE, '"', push: :string
75
+ token :IDENT, /[a-z]+/
76
+
77
+ mode :string do
78
+ token :CONTENT, /[^"\\]+/
79
+ token :ESCAPE, /\\./
80
+ token :DQUOTE, '"', pop: true
81
+ end
82
+ end
83
+ ```
84
+
85
+ ### `scan_until` / `delimited`
86
+
87
+ ```ruby
88
+ scan_until :BLOCK_COMMENT, open: "/*", close: "*/", skip: true
89
+
90
+ delimited :TEXT, delimiter: "{{" do
91
+ token :IDENT, /[a-zA-Z_]+/
92
+ token :DOT, "."
93
+ token :CLOSE, "}}", pop: true
94
+ end
95
+ ```
96
+
97
+ ### `utf8_range`
98
+
99
+ ```ruby
100
+ token :HIRAGANA, LexerKit.utf8_range("ぁ".."ん")
101
+ token :CJK, LexerKit.utf8_range(0x4E00..0x9FFF)
102
+ ```
103
+
104
+ ## Regex Notes
105
+
106
+ - Most common regex syntax is supported (`[]`, quantifiers, groups, alternation, escapes, `/.../i`)
107
+ - Backtracking-dependent features are not supported (lookaround, backreference, etc.)
108
+ - Anchors and word-boundary assertions are not used in lexer matching
109
+ - `*?`, `+?`, `??` are parsed but behave as longest-match (DFA behavior)
110
+
111
+ ## Stream API and Error Handling
112
+
113
+ `stream.start` and `stream.len` are byte offsets.
114
+
115
+ ```ruby
116
+ stream = lexer.stream(input)
117
+ until stream.eof?
118
+ if stream.error?
119
+ token = stream.make_token
120
+ puts token.render_diagnostic("unexpected character")
121
+ end
122
+ stream.advance
123
+ end
124
+ ```
125
+
126
+ `LexerKit` always falls back to `:INVALID` for unmatched input.
127
+
128
+ ## Serialization
129
+
130
+ Pre-compile lexers for faster startup:
131
+
132
+ ```ruby
133
+ lexer = builder.compile
134
+ LexerKit::Format::LKT1.save(lexer, path: "lexer.lkt1")
135
+ LexerKit::Format::LKB1.save(lexer, path: "lexer.lkb1")
136
+ ```
137
+
138
+ ```bash
139
+ lexer_kit compile lexer.rb -o lexer.lkt1
140
+ ```
141
+
142
+ Load later:
143
+
144
+ ```ruby
145
+ lexer = LexerKit.load_lexer(File.expand_path("data/lexer.lkt1", __dir__))
146
+ ```
147
+
148
+ ## Performance Snapshot
149
+
150
+ JSON benchmark (600KB input, project benchmark script):
151
+
152
+ - LexerKit: `95.2 i/s`
153
+ - StringScanner: `4.8 i/s` (about `20x` slower)
154
+
155
+ ## License
156
+
157
+ MIT License
data/exe/lexer_kit ADDED
@@ -0,0 +1,7 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Executable entry point for the lexer_kit gem.

require "lexer_kit"
require "lexer_kit/cli"

# CLI.run handles argument parsing/dispatch; its return value is used
# directly as the process exit status.
exit(LexerKit::CLI.run(ARGV))
@@ -0,0 +1,17 @@
1
+ [package]
2
+ name = "lexer_kit_rust"
3
+ version = "0.1.0"
4
+ edition = "2021"
5
+ authors = ["LexerKit Authors"]
6
+ license = "MIT"
7
+ publish = false
8
+
9
+ [lib]
10
+ crate-type = ["cdylib"]
11
+
12
+ [dependencies]
13
+ magnus = "0.8"
14
+ rb-sys = "0.9"
15
+ libc = "0.2"
16
+
17
+ # Profile settings are in workspace root Cargo.toml
@@ -0,0 +1,6 @@
1
# frozen_string_literal: true

# Build script for the Rust native extension.

require "mkmf"
require "rb_sys/mkmf"

# rb_sys generates a Makefile that drives cargo to build the
# lexer_kit_rust cdylib and install it where RubyGems expects it.
create_rust_makefile("lexer_kit_rust/lexer_kit_rust")
@@ -0,0 +1,213 @@
1
+ //! Deserializer for CompiledProgram from Ruby data structures
2
+ //!
3
+ //! This module handles the conversion of Ruby Hash/Array data into
4
+ //! the Rust CompiledProgram structure used by the VM.
5
+
6
+ use magnus::{Error, RArray, RHash, RString, Ruby};
7
+
8
+ use crate::types::{
9
+ CompiledProgram, ConstantEntry, DfaTable, Instruction, JumpTable, KeywordEntry, KeywordTable,
10
+ Mode,
11
+ };
12
+
13
+ // =============================================================================
14
+ // Safety Helpers for Ruby String Access
15
+ // =============================================================================
16
+
17
+ /// Extract bytes from a Ruby String as a slice.
18
+ ///
19
+ /// # Safety Contract
20
+ /// The returned slice is valid only for the duration of the current function call.
21
+ /// The caller must:
22
+ /// - Use the slice immediately without storing it
23
+ /// - Not call any Ruby methods that could trigger GC while using the slice
24
+ /// - Not return the slice to the caller
25
+ ///
26
+ /// This is safe within deserialization functions because:
27
+ /// - The RString is rooted by Ruby's stack during the function call
28
+ /// - No Ruby code runs between obtaining the slice and consuming it
29
+ #[inline]
30
+ fn rstring_as_bytes(rb_str: &RString) -> &[u8] {
31
+ // SAFETY: See function documentation. The slice is used immediately
32
+ // within deserialization and no GC can occur.
33
+ unsafe { rb_str.as_slice() }
34
+ }
35
+
36
+ /// Copy bytes from a Ruby String into an owned Vec.
37
+ ///
38
+ /// This is the preferred method when the data needs to outlive the current scope.
39
+ /// The data is immediately copied, avoiding any lifetime concerns.
40
+ #[inline]
41
+ fn rstring_to_vec(rb_str: &RString) -> Vec<u8> {
42
+ // SAFETY: We immediately copy the data into an owned Vec,
43
+ // so the temporary slice lifetime is not an issue.
44
+ unsafe { rb_str.as_slice() }.to_vec()
45
+ }
46
+
47
+ /// Parse Ruby data hash into CompiledProgram
48
+ pub fn parse_ruby_data(rb_data: RHash) -> Result<CompiledProgram, Error> {
49
+ let ruby = Ruby::get().unwrap();
50
+ let mut prog = CompiledProgram::new();
51
+
52
+ // Instructions
53
+ let rb_instructions: RString = rb_data.fetch::<_, RString>(ruby.sym_new("instructions"))?;
54
+ let instr_bytes = rstring_as_bytes(&rb_instructions);
55
+ let instr_count = instr_bytes.len() / 4;
56
+ prog.instructions = (0..instr_count)
57
+ .map(|i| Instruction::from_bytes(&instr_bytes[i * 4..(i + 1) * 4]))
58
+ .collect();
59
+
60
+ // DFA tables
61
+ let rb_dfas: RArray = rb_data.fetch(ruby.sym_new("dfa_tables"))?;
62
+ for i in 0..rb_dfas.len() {
63
+ let rb_dfa: RHash = rb_dfas.entry(i as isize)?;
64
+ let dfa = parse_dfa_table(&ruby, rb_dfa)?;
65
+ prog.dfa_tables.push(dfa);
66
+ }
67
+
68
+ // Jump tables
69
+ let rb_jumps: RArray = rb_data.fetch(ruby.sym_new("jump_tables"))?;
70
+ for i in 0..rb_jumps.len() {
71
+ let rb_jt: RHash = rb_jumps.entry(i as isize)?;
72
+ let jt = parse_jump_table(&ruby, rb_jt)?;
73
+ prog.jump_tables.push(jt);
74
+ }
75
+
76
+ // Keyword tables
77
+ let rb_keywords: RArray = rb_data.fetch(ruby.sym_new("keyword_tables"))?;
78
+ for i in 0..rb_keywords.len() {
79
+ let rb_kt: RHash = rb_keywords.entry(i as isize)?;
80
+ let kt = parse_keyword_table(&ruby, rb_kt)?;
81
+ prog.keyword_tables.push(kt);
82
+ }
83
+
84
+ // Constant pool
85
+ let rb_pool: RArray = rb_data.fetch(ruby.sym_new("constant_pool"))?;
86
+ for i in 0..rb_pool.len() {
87
+ let rb_entry: RString = rb_pool.entry(i as isize)?;
88
+ let data = rstring_to_vec(&rb_entry);
89
+ prog.constant_pool.entries.push(ConstantEntry { data });
90
+ }
91
+
92
+ // Modes
93
+ let rb_modes: RArray = rb_data.fetch(ruby.sym_new("modes"))?;
94
+ for i in 0..rb_modes.len() {
95
+ let rb_mode: RArray = rb_modes.entry(i as isize)?;
96
+ let rb_name: RString = rb_mode.entry(0)?;
97
+ let rb_offset: u32 = rb_mode.entry(1)?;
98
+
99
+ let name = rstring_as_bytes(&rb_name);
100
+ prog.modes.push(Mode {
101
+ start_offset: rb_offset,
102
+ });
103
+
104
+ // Check for default mode
105
+ if name == b"default" {
106
+ prog.default_mode_offset = rb_offset;
107
+ }
108
+ }
109
+
110
+ Ok(prog)
111
+ }
112
+
113
+ /// Parse DFA table from Ruby hash
114
+ fn parse_dfa_table(ruby: &Ruby, rb_dfa: RHash) -> Result<DfaTable, Error> {
115
+ let mut dfa = DfaTable::new();
116
+
117
+ dfa.state_count = rb_dfa.fetch::<_, u16>(ruby.sym_new("state_count"))?;
118
+ dfa.class_count = rb_dfa.fetch::<_, u16>(ruby.sym_new("class_count"))?;
119
+
120
+ // Byte class
121
+ let rb_byte_class: RString = rb_dfa.fetch(ruby.sym_new("byte_class"))?;
122
+ let byte_class_data = rstring_as_bytes(&rb_byte_class);
123
+ if byte_class_data.len() < 256 {
124
+ return Err(Error::new(
125
+ ruby.exception_arg_error(),
126
+ "byte_class must be 256 bytes",
127
+ ));
128
+ }
129
+ dfa.byte_class = byte_class_data[..256].to_vec();
130
+
131
+ // Transitions (big-endian u16 packed) - validate size
132
+ let rb_trans: RString = rb_dfa.fetch(ruby.sym_new("transitions"))?;
133
+ let trans_data = rstring_as_bytes(&rb_trans);
134
+ let trans_size = (dfa.state_count as usize) * (dfa.class_count as usize);
135
+ let required_trans_bytes = trans_size * 2;
136
+ if trans_data.len() < required_trans_bytes {
137
+ return Err(Error::new(
138
+ ruby.exception_arg_error(),
139
+ "transitions data too short",
140
+ ));
141
+ }
142
+ dfa.transitions = (0..trans_size)
143
+ .map(|i| {
144
+ let off = i * 2;
145
+ ((trans_data[off] as u16) << 8) | (trans_data[off + 1] as u16)
146
+ })
147
+ .collect();
148
+
149
+ // Accept tokens (big-endian u16 packed) - validate size
150
+ let rb_accept: RString = rb_dfa.fetch(ruby.sym_new("accept_tokens"))?;
151
+ let accept_data = rstring_as_bytes(&rb_accept);
152
+ let required_accept_bytes = (dfa.state_count as usize) * 2;
153
+ if accept_data.len() < required_accept_bytes {
154
+ return Err(Error::new(
155
+ ruby.exception_arg_error(),
156
+ "accept_tokens must have state_count entries",
157
+ ));
158
+ }
159
+ dfa.accept_tokens = (0..dfa.state_count as usize)
160
+ .map(|i| {
161
+ let off = i * 2;
162
+ ((accept_data[off] as u16) << 8) | (accept_data[off + 1] as u16)
163
+ })
164
+ .collect();
165
+
166
+ Ok(dfa)
167
+ }
168
+
169
+ /// Parse jump table from Ruby hash
170
+ fn parse_jump_table(ruby: &Ruby, rb_jt: RHash) -> Result<JumpTable, Error> {
171
+ let mut jt = JumpTable::new();
172
+
173
+ // Lookup table (big-endian u32 packed, 256 entries) - validate size
174
+ let rb_lookup: RString = rb_jt.fetch(ruby.sym_new("lookup"))?;
175
+ let lookup_data = rstring_as_bytes(&rb_lookup);
176
+ if lookup_data.len() < 256 * 4 {
177
+ return Err(Error::new(
178
+ ruby.exception_arg_error(),
179
+ "jump table lookup must be 1024 bytes",
180
+ ));
181
+ }
182
+ for i in 0..256 {
183
+ let off = i * 4;
184
+ jt.lookup[i] = ((lookup_data[off] as u32) << 24)
185
+ | ((lookup_data[off + 1] as u32) << 16)
186
+ | ((lookup_data[off + 2] as u32) << 8)
187
+ | (lookup_data[off + 3] as u32);
188
+ }
189
+
190
+ // Default offset
191
+ let rb_default: Option<u32> = rb_jt.fetch(ruby.sym_new("default_offset")).ok();
192
+ jt.default_offset = rb_default.unwrap_or(0);
193
+
194
+ Ok(jt)
195
+ }
196
+
197
+ /// Parse keyword table from Ruby hash
198
+ fn parse_keyword_table(ruby: &Ruby, rb_kt: RHash) -> Result<KeywordTable, Error> {
199
+ let base_token_id: u16 = rb_kt.fetch(ruby.sym_new("base_token_id"))?;
200
+ let mut kt = KeywordTable::new(base_token_id);
201
+
202
+ let rb_keywords: RArray = rb_kt.fetch(ruby.sym_new("keywords"))?;
203
+ for i in 0..rb_keywords.len() {
204
+ let rb_entry: RArray = rb_keywords.entry(i as isize)?;
205
+ let rb_key: RString = rb_entry.entry(0)?;
206
+ let token_id: u16 = rb_entry.entry(1)?;
207
+
208
+ let keyword = rstring_to_vec(&rb_key);
209
+ kt.entries.push(KeywordEntry { keyword, token_id });
210
+ }
211
+
212
+ Ok(kt)
213
+ }
@@ -0,0 +1,217 @@
1
+ //! DFA (Deterministic Finite Automaton) execution
2
+ //!
3
+ //! This module implements DFA-based pattern matching for the lexer VM.
4
+
5
+ use crate::types::{DfaTable, DFA_DEAD_STATE, DFA_NO_ACCEPT};
6
+
7
/// DFA match result (length of the match)
///
/// Carries only the match length; the token id is recovered separately
/// from the accepting state's entry in `accept_tokens`.
#[derive(Debug, Clone, Copy)]
pub struct DfaMatch {
    /// Number of input bytes consumed by the match (0 for zero-length matches).
    pub length: usize,
}
12
+
13
+ impl DfaTable {
14
+ /// Run DFA on input bytes, returning the match length and token ID.
15
+ ///
16
+ /// Returns None if no match, Some((length, token_id)) on match.
17
+ /// Supports zero-length matches (e.g., "a*" matching empty string).
18
+ pub fn run(&self, bytes: &[u8]) -> Option<DfaMatch> {
19
+ // Safety check: verify DFA has valid dimensions
20
+ if self.state_count == 0 || self.class_count == 0 {
21
+ return None;
22
+ }
23
+ if self.byte_class.len() < 256 {
24
+ return None;
25
+ }
26
+ if self.transitions.is_empty() || self.accept_tokens.is_empty() {
27
+ return None;
28
+ }
29
+
30
+ let mut state: u16 = 1; // Start state (0 is dead state)
31
+ let mut last_accept: Option<DfaMatch> = None;
32
+
33
+ // Validate start state is within bounds
34
+ if state >= self.state_count {
35
+ return None;
36
+ }
37
+
38
+ // Check if start state is accepting (handles zero-length matches like "a*")
39
+ if self.accept_tokens[state as usize] != DFA_NO_ACCEPT {
40
+ last_accept = Some(DfaMatch { length: 0 });
41
+ }
42
+
43
+ for (i, &byte) in bytes.iter().enumerate() {
44
+ let cls = self.byte_class[byte as usize];
45
+ let idx = (state as usize) * (self.class_count as usize) + (cls as usize);
46
+
47
+ if idx >= self.transitions.len() {
48
+ break;
49
+ }
50
+
51
+ state = self.transitions[idx];
52
+
53
+ if state == DFA_DEAD_STATE || state >= self.state_count {
54
+ break;
55
+ }
56
+
57
+ // Check if accepting state
58
+ if self.accept_tokens[state as usize] != DFA_NO_ACCEPT {
59
+ last_accept = Some(DfaMatch { length: i + 1 });
60
+ }
61
+ }
62
+
63
+ last_accept
64
+ }
65
+ }
66
+
67
/// Scan until delimiter found (using memchr for optimization)
///
/// Returns the position where delimiter starts, or input length if not found.
/// An empty delimiter never matches and yields `bytes.len()`.
pub fn scan_until(bytes: &[u8], delim: &[u8]) -> usize {
    let Some((&first, tail)) = delim.split_first() else {
        return bytes.len();
    };

    // Single-byte delimiter: one memchr pass is enough.
    if tail.is_empty() {
        return memchr_single(bytes, first).unwrap_or(bytes.len());
    }

    // Multi-byte delimiter: locate candidate first bytes, then verify the
    // full delimiter at each candidate position.
    let mut from = 0;
    while let Some(rel) = memchr_single(&bytes[from..], first) {
        let start = from + rel;
        if start + delim.len() > bytes.len() {
            // Not enough bytes left for a full delimiter match.
            return bytes.len();
        }
        if bytes[start..].starts_with(delim) {
            return start;
        }
        from = start + 1;
    }

    bytes.len()
}

/// Simple memchr implementation
#[inline]
fn memchr_single(haystack: &[u8], needle: u8) -> Option<usize> {
    haystack
        .iter()
        .enumerate()
        .find_map(|(i, &b)| (b == needle).then_some(i))
}
108
+
109
/// Match literal string at current position
///
/// Returns true if the bytes at the start match the literal; an empty
/// literal always matches.
#[inline]
pub fn match_literal(bytes: &[u8], literal: &[u8]) -> bool {
    // `get` yields None when `bytes` is shorter than the literal, which
    // compares unequal to `Some(literal)` — equivalent to the explicit
    // length check plus prefix slice comparison.
    bytes.get(..literal.len()) == Some(literal)
}
119
+
120
+ /// Scan until delimiter found, skipping escape sequences
121
+ ///
122
+ /// Returns (found, position) where found is true if delimiter was found.
123
+ pub fn scan_until_escape(bytes: &[u8], close: &[u8], escape: &[u8]) -> (bool, usize) {
124
+ let mut pos = 0;
125
+
126
+ while pos < bytes.len() {
127
+ let remaining = &bytes[pos..];
128
+ let scanned = scan_until(remaining, close);
129
+
130
+ if scanned >= remaining.len() {
131
+ // Delimiter not found - reached EOF
132
+ return (false, bytes.len());
133
+ }
134
+
135
+ let hit = pos + scanned;
136
+
137
+ // Check if this delimiter is part of an escape sequence
138
+ if !escape.is_empty() {
139
+ // Case 1: escape sequence starts with the close delimiter
140
+ // e.g., escape = "{[{]}" with close = "{["
141
+ if escape.starts_with(close)
142
+ && hit + escape.len() <= bytes.len()
143
+ && &bytes[hit..hit + escape.len()] == escape
144
+ {
145
+ // Escape sequence found - skip past it and continue
146
+ pos = hit + escape.len();
147
+ continue;
148
+ }
149
+
150
+ // Case 2: escape sequence ends with the close delimiter
151
+ // e.g., escape = "\\\"" with close = "\""
152
+ if escape.len() > close.len() && escape.ends_with(close) {
153
+ let prefix_len = escape.len() - close.len();
154
+ if hit >= prefix_len && &bytes[hit - prefix_len..hit + close.len()] == escape {
155
+ // Escape sequence found - skip past the close delimiter
156
+ pos = hit + close.len();
157
+ continue;
158
+ }
159
+ }
160
+ }
161
+
162
+ // Delimiter found without escape
163
+ return (true, hit);
164
+ }
165
+
166
+ (false, bytes.len())
167
+ }
168
+
169
#[cfg(test)]
mod tests {
    //! Unit tests for the scanning helpers. `DfaTable::run` is not covered
    //! here because it needs a fully built table from the deserializer.
    use super::*;

    #[test]
    fn test_scan_until_single_byte() {
        // Found mid-string, not found (returns len), found at position 0.
        assert_eq!(scan_until(b"hello world", b" "), 5);
        assert_eq!(scan_until(b"hello", b" "), 5);
        assert_eq!(scan_until(b" hello", b" "), 0);
    }

    #[test]
    fn test_scan_until_multi_byte() {
        // Same three cases with a multi-byte delimiter.
        assert_eq!(scan_until(b"hello-->world", b"-->"), 5);
        assert_eq!(scan_until(b"-->world", b"-->"), 0);
        assert_eq!(scan_until(b"hello world", b"-->"), 11);
    }

    #[test]
    fn test_match_literal() {
        // Prefix match only; input shorter than literal never matches.
        assert!(match_literal(b"hello world", b"hello"));
        assert!(!match_literal(b"hello world", b"world"));
        assert!(!match_literal(b"hi", b"hello"));
    }

    #[test]
    fn test_scan_until_escape() {
        // Without escape - finds delimiter at position 5
        assert_eq!(
            scan_until_escape(b"hello\"world", b"\"", b"\\\""),
            (true, 5)
        );
        // With escape - escape starts at position 5, skips past it, finds real delimiter at 12
        assert_eq!(
            scan_until_escape(b"hello\\\"world\"end", b"\"", b"\\\""),
            (true, 12)
        );
        // No delimiter
        assert_eq!(
            scan_until_escape(b"hello world", b"\"", b"\\\""),
            (false, 11)
        );
        // Multi-byte delimiter with escape
        assert_eq!(
            scan_until_escape(b"hello{[{]}world{[test", b"{[", b"{[{]}"),
            (true, 15)
        );
    }
}