lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,304 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LexerKit
4
+ module DFA
5
+ # NFA represents a Non-deterministic Finite Automaton.
6
+ # Built using Thompson's construction from regex AST.
7
+ # Case folding is applied during NFA construction, not parsing.
8
+ class NFA
9
+ include CaseFolding
10
+
11
+ # Epsilon transition marker
12
+ EPSILON = :epsilon
13
+
14
+ attr_reader :start_state, :accept_state, :transitions, :token_id
15
+
16
# Set up an empty automaton: no states allocated, no edges recorded, and no
# start/accept/token information until finalize! fills them in.
# @param case_insensitive [Boolean] stored on the instance for later use
def initialize(case_insensitive: false)
  @start_state = nil
  @accept_state = nil
  @token_id = nil
  @states = 0
  # Every state lazily receives its own list of [input, target] pairs.
  @transitions = Hash.new { |table, state| table[state] = [] }
  @case_insensitive = case_insensitive
end
24
+
25
# Allocate the next unused state.
# @return [Integer] ID of the newly created state (IDs count up from 0)
def new_state
  @states += 1
  @states - 1
end
31
+
32
# Record an edge leaving +from+ and arriving at +to+, labelled with +input+.
# Edges are stored as [input, to] pairs in the list kept for the source state.
# @return [Array] the updated edge list of +from+
def add_transition(from, to, input)
  @transitions[from].push([input, to])
end
36
+
37
# Construct a finished NFA for a regex AST.
# @param ast [RegexAST::*] root AST node
# @param token_id [Integer] token ID recorded on the resulting NFA
# @param case_insensitive [Boolean] forwarded to the constructor
# @return [NFA] automaton with start state, accept state and token ID set
def self.from_ast(ast, token_id = 0, case_insensitive: false)
  new(case_insensitive: case_insensitive).tap do |automaton|
    entry, exit_state = automaton.build(ast)
    automaton.finalize!(start_state: entry, accept_state: exit_state, token_id: token_id)
  end
end
48
+
49
# Construct an NFA from a Regex value, which carries both the AST and the
# case-insensitivity flag.
# @param regex [RegexAST::Regex] regex pattern with flags
# @param token_id [Integer] token ID for the accept state
# @return [NFA]
def self.from_regex(regex, token_id = 0)
  tree = regex.ast
  fold_case = regex.case_insensitive
  from_ast(tree, token_id, case_insensitive: fold_case)
end
56
+
57
# Store the entry state, the accepting state and the token ID on the automaton.
# @param start_state [Integer]
# @param accept_state [Integer]
# @param token_id [Integer]
def finalize!(start_state:, accept_state:, token_id:)
  @start_state, @accept_state = start_state, accept_state
  @token_id = token_id
end
62
+
63
# Build the NFA fragment for one AST node by dispatching on the node type.
# Group nodes add no states of their own; their child is built directly.
# @param node [RegexAST::*] AST node
# @return [Array(Integer, Integer)] [start_state, accept_state] of the fragment
# @raise [RuntimeError] when the node type is not one of the known AST types
def build(node)
  case node
  when RegexAST::Literal     then build_literal(node)
  when RegexAST::CharClass   then build_char_class(node)
  when RegexAST::Any         then build_any
  when RegexAST::Concat      then build_concat(node)
  when RegexAST::Alternation then build_alternation(node)
  when RegexAST::Quantifier  then build_quantifier(node)
  when RegexAST::Group       then build(node.child)
  else raise "unknown AST node: #{node.class}"
  end
end
85
+
86
# Collect every state reachable from +states+ using only epsilon edges.
# The given states are always part of the result.
# @param states [Set<Integer>] initial states
# @return [Set<Integer>] epsilon closure
def epsilon_closure(states)
  reachable = Set.new(states)
  pending = states.to_a

  while (current = pending.pop)
    @transitions[current].each do |label, destination|
      next unless label == EPSILON

      # add? returns nil for a state that is already known, so each state is queued once.
      pending << destination if reachable.add?(destination)
    end
  end

  reachable
end
104
+
105
# Find the states reached from any of +states+ by one edge labelled +input+.
# Epsilon edges are not followed here.
# @param states [Set<Integer>] current states
# @param input [Integer] input byte
# @return [Set<Integer>] reachable states
def move(states, input)
  states.each_with_object(Set.new) do |state, reached|
    @transitions[state].each do |label, destination|
      reached << destination if label == input
    end
  end
end
118
+
119
# Collect the bytes that can begin a match: every Integer label on an edge
# leaving a state in the epsilon closure of the start state.
# @return [Set<Integer>] set of bytes (0-255)
def first_byte_set
  epsilon_closure(Set[@start_state]).each_with_object(Set.new) do |state, bytes|
    @transitions[state].each do |label, _destination|
      bytes << label if label.is_a?(Integer)
    end
  end
end
133
+
134
+ private
135
+
136
# Build the fragment for a literal byte.
#
# A node whose byte is nil stands for the empty literal. Its start and accept
# states are joined by an epsilon edge so the fragment matches the empty
# string; without that edge the accept state would be unreachable.
# @param node [RegexAST::Literal]
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_literal(node)
  start = new_state
  accept = new_state

  if node.byte.nil?
    add_transition(start, accept, EPSILON)
  else
    # case_fold_byte comes from CaseFolding (not shown here); it presumably
    # yields the byte plus its case variants when folding is on — confirm there.
    case_fold_byte(node.byte).each do |byte|
      add_transition(start, accept, byte)
    end
  end

  [start, accept]
end
149
+
150
# Build the fragment for a character class such as [a-z] or [^0-9].
#
# Member bytes are gathered in a Set first, so ranges that overlap (which
# happens easily once case folding expands them) produce one edge per byte
# rather than duplicates. A negated class gets an edge for every byte in
# 0..255 that is not a member.
# @param node [RegexAST::CharClass] ranges are [from, to] pairs; negated flips the class
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_char_class(node)
  start = new_state
  accept = new_state

  # case_fold_range comes from CaseFolding (not shown here); it is used as
  # returning a list of [from, to] pairs for each input range.
  members = Set.new
  node.ranges.flat_map { |from, to| case_fold_range(from, to) }.each do |from, to|
    members.merge(from..to)
  end

  bytes = node.negated ? (0..255).reject { |byte| members.include?(byte) } : members
  bytes.each { |byte| add_transition(start, accept, byte) }

  [start, accept]
end
179
+
180
# Build the fragment for the `.` pattern: one edge for every byte value
# except newline (0x0A).
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_any
  entry = new_state
  exit_state = new_state
  newline = 0x0A

  256.times do |byte|
    next if byte == newline

    add_transition(entry, exit_state, byte)
  end

  [entry, exit_state]
end
189
+
190
# Build the fragment for a concatenation by chaining the child fragments
# with epsilon edges.
#
# A concatenation with no children matches the empty string, so its two
# states are joined by an epsilon edge; without it the accept state would be
# unreachable.
# @param node [RegexAST::Concat]
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_concat(node)
  if node.children.empty?
    start = new_state
    accept = new_state
    add_transition(start, accept, EPSILON)
    return [start, accept]
  end

  fragments = node.children.map { |child| build(child) }

  # Link the accept state of each fragment to the start state of the next one.
  fragments.each_cons(2) do |(_, accept), (next_start, _)|
    add_transition(accept, next_start, EPSILON)
  end

  [fragments.first[0], fragments.last[1]]
end
202
+
203
# Build the fragment for an alternation: a shared entry state fans out to
# each branch with an epsilon edge, and each branch's accept state rejoins a
# shared exit state with another epsilon edge.
# @param node [RegexAST::Alternation]
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_alternation(node)
  fork = new_state
  join = new_state

  node.children.each do |branch|
    branch_entry, branch_exit = build(branch)
    add_transition(fork, branch_entry, EPSILON)
    add_transition(branch_exit, join, EPSILON)
  end

  [fork, join]
end
215
+
216
# Build the fragment for a quantifier, choosing the construction from its
# bounds: {0,} is *, {1,} is +, {0,1} is ?, anything else is a counted
# repetition. A max of nil means "no upper bound".
# @param node [RegexAST::Quantifier]
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_quantifier(node)
  case [node.min, node.max] # rubocop:disable Style/MinMax
  in [0, nil] then build_star(node.child)
  in [1, nil] then build_plus(node.child)
  in [0, 1]   then build_optional(node.child)
  else build_counted(node.child, node.min, node.max)
  end
end
228
+
229
# Build the fragment for zero-or-more repetitions of the child pattern.
# @param child_ast [RegexAST::*] pattern being repeated
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_star(child_ast)
  entry = new_state
  exit_state = new_state
  inner_entry, inner_exit = build(child_ast)

  # From the outer entry and from the end of an iteration alike, the automaton
  # may either go (back) into the child or leave the loop.
  [entry, inner_exit].each do |origin|
    add_transition(origin, inner_entry, EPSILON)
    add_transition(origin, exit_state, EPSILON)
  end

  [entry, exit_state]
end
242
+
243
# Build the fragment for one-or-more repetitions of the child pattern.
# The outer entry has no direct edge to the exit, so the child must be
# traversed at least once.
# @param child_ast [RegexAST::*] pattern being repeated
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_plus(child_ast)
  entry = new_state
  exit_state = new_state
  inner_entry, inner_exit = build(child_ast)

  # Both the first pass and every repeat enter the child at the same state.
  [entry, inner_exit].each { |origin| add_transition(origin, inner_entry, EPSILON) }
  add_transition(inner_exit, exit_state, EPSILON)

  [entry, exit_state]
end
255
+
256
# Build the fragment for an optional (zero-or-one) occurrence of the child.
# @param child_ast [RegexAST::*] pattern that may be skipped
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_optional(child_ast)
  entry = new_state
  exit_state = new_state
  inner_entry, inner_exit = build(child_ast)

  add_transition(entry, inner_entry, EPSILON)
  # The exit is reached either by skipping the child or by finishing it.
  [entry, inner_exit].each { |origin| add_transition(origin, exit_state, EPSILON) }

  [entry, exit_state]
end
268
+
269
# Build the fragment for a counted repetition: {n}, {n,} or {n,m}.
#
# The child is instantiated +min+ times in sequence. After that:
# - max nil ({n,}): one more copy is attached as a loop, so any number of
#   further repetitions is accepted;
# - max > min ({n,m}): (max - min) further copies follow, each of which may
#   be left for the accept state once it has been traversed.
# Each state gets at most one epsilon edge to the accept state.
# @param child_ast [RegexAST::*] pattern being repeated
# @param min [Integer] required number of repetitions
# @param max [Integer, nil] upper bound; nil means unbounded
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_counted(child_ast, min, max)
  start = new_state
  current = start

  # Required copies, chained one after another.
  min.times do
    child_start, child_accept = build(child_ast)
    add_transition(current, child_start, EPSILON)
    current = child_accept
  end

  accept = new_state
  add_transition(current, accept, EPSILON)

  if max.nil?
    # Unbounded tail: enter the extra copy, loop on it, or leave to accept.
    child_start, child_accept = build(child_ast)
    add_transition(current, child_start, EPSILON)
    add_transition(child_accept, child_start, EPSILON)
    add_transition(child_accept, accept, EPSILON)
  elsif max > min
    (max - min).times do
      child_start, child_accept = build(child_ast)
      add_transition(current, child_start, EPSILON)
      # `current` already has its edge to accept (added above or at the end of
      # the previous iteration), so only the new copy's end needs one.
      current = child_accept
      add_transition(current, accept, EPSILON)
    end
  end

  [start, accept]
end
302
+ end
303
+ end
304
+ end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LexerKit
4
+ module DFA
5
# RegexAST holds the node types of a parsed regular expression.
# Every node is a Data value object: immutable and compared by value.
module RegexAST
  # Per-node metadata (source span, original text, codepoint, bytes).
  Meta = Data.define(:span, :literal_text, :codepoint, :bytes) do
    # Render the span, text and codepoint fields that are set, e.g.
    # {text="a", cp=U+61}. The bytes field is not part of the output.
    def to_s
      rendered = [
        (span && "span=#{span.inspect}"),
        (literal_text && "text=#{literal_text.inspect}"),
        (codepoint && "cp=U+#{codepoint.to_s(16).upcase}")
      ].filter_map(&:itself)
      "{#{rendered.join(', ')}}"
    end
  end

  # A single literal byte.
  Literal = Data.define(:byte, :meta)

  # A character class such as [a-z] or [^0-9].
  # ranges is a list of [from, to] pairs; negated inverts the class.
  CharClass = Data.define(:ranges, :negated, :meta)

  # A sequence of patterns matched one after another.
  Concat = Data.define(:children, :meta)

  # A choice between patterns, e.g. a|b|c.
  Alternation = Data.define(:children, :meta)

  # A repetition (*, +, ?, {n,m}); max = nil means unlimited.
  Quantifier = Data.define(:child, :min, :max, :greedy, :meta)

  # A group, capturing or non-capturing.
  Group = Data.define(:child, :meta)

  # The `.` pattern: any byte except newline.
  Any = Data.define(:meta)

  # A complete pattern: the root AST node plus pattern-level flags.
  Regex = Data.define(:ast, :case_insensitive) do
    # Turn a pattern into a Regex.
    # A Regex is returned unchanged; a RegexAstProvider is asked for its
    # regex; anything else is parsed from its source text, taking the
    # case-insensitivity flag from a Regexp and defaulting to false otherwise.
    # @param pattern [Regexp, String, Regex, RegexAstProvider]
    # @return [Regex]
    def self.parse(pattern)
      case pattern
      when self then pattern
      when LexerKit::RegexAstProvider then pattern.to_regex
      else
        from_regexp = pattern.is_a?(Regexp)
        text = from_regexp ? pattern.source : pattern
        fold_case = from_regexp && pattern.casefold?
        tree = RegexParser.new(text).parse
        new(ast: tree, case_insensitive: fold_case)
      end
    end
  end
end
63
+ end
64
+ end