lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LexerKit
|
|
4
|
+
module DFA
|
|
5
|
+
# NFA represents a Non-deterministic Finite Automaton.
|
|
6
|
+
# Built using Thompson's construction from regex AST.
|
|
7
|
+
# Case folding is applied during NFA construction, not parsing.
|
|
8
|
+
class NFA
|
|
9
|
+
include CaseFolding
|
|
10
|
+
|
|
11
|
+
# Epsilon transition marker
|
|
12
|
+
EPSILON = :epsilon
|
|
13
|
+
|
|
14
|
+
attr_reader :start_state, :accept_state, :transitions, :token_id
|
|
15
|
+
|
|
16
|
+
# @param case_insensitive [Boolean] remembered in @case_insensitive
def initialize(case_insensitive: false)
  # Nothing has been built yet: endpoints and token ID stay unset until finalize!.
  @start_state = @accept_state = @token_id = nil
  @states = 0
  # Each state lazily gets its own outgoing-transition list.
  @transitions = Hash.new { |table, state| table[state] = [] }
  @case_insensitive = case_insensitive
end
|
|
24
|
+
|
|
25
|
+
# Create a new state and return its ID
|
|
26
|
+
# @return [Integer] the ID just allocated; IDs are handed out sequentially from @states
def new_state
  # Bump the counter, then report the value it held before the bump.
  (@states += 1) - 1
end
|
|
31
|
+
|
|
32
|
+
# Add a transition
|
|
33
|
+
# Record an edge as an [input, target] pair in the source state's list.
# @param from [Integer] source state
# @param to [Integer] target state
# @param input [Object] transition label (a byte or the epsilon marker in this class)
def add_transition(from, to, input)
  edge = [input, to]
  @transitions[from].push(edge)
end
|
|
36
|
+
|
|
37
|
+
# Build NFA from regex AST
|
|
38
|
+
# @param ast [RegexAST::*] AST node
|
|
39
|
+
# @param token_id [Integer] token ID for accept state
|
|
40
|
+
# @param case_insensitive [Boolean] whether to apply case folding
|
|
41
|
+
# @return [NFA]
|
|
42
|
+
def self.from_ast(ast, token_id = 0, case_insensitive: false)
  new(case_insensitive: case_insensitive).tap do |automaton|
    # build returns the fragment endpoints; finalize! stores them with the token ID.
    entry, exit_state = automaton.build(ast)
    automaton.finalize!(start_state: entry, accept_state: exit_state, token_id: token_id)
  end
end
|
|
48
|
+
|
|
49
|
+
# Build NFA from Regex (AST with flags)
|
|
50
|
+
# @param regex [RegexAST::Regex] regex pattern with flags
|
|
51
|
+
# @param token_id [Integer] token ID for accept state
|
|
52
|
+
# @return [NFA]
|
|
53
|
+
def self.from_regex(regex, token_id = 0)
  # Unpack the wrapper and delegate to from_ast.
  tree = regex.ast
  fold_case = regex.case_insensitive
  from_ast(tree, token_id, case_insensitive: fold_case)
end
|
|
56
|
+
|
|
57
|
+
# Store the endpoints of the finished automaton and the token it recognizes.
# @param start_state [Integer]
# @param accept_state [Integer]
# @param token_id [Integer]
def finalize!(start_state:, accept_state:, token_id:)
  @start_state, @accept_state = start_state, accept_state
  @token_id = token_id
end
|
|
62
|
+
|
|
63
|
+
# Build NFA fragment from AST node
|
|
64
|
+
# Returns [start_state, accept_state]
|
|
65
|
+
def build(node)
  # Dispatch on the node's class; a Group simply builds its child.
  case node
  in RegexAST::Literal then build_literal(node)
  in RegexAST::CharClass then build_char_class(node)
  in RegexAST::Any then build_any
  in RegexAST::Concat then build_concat(node)
  in RegexAST::Alternation then build_alternation(node)
  in RegexAST::Quantifier then build_quantifier(node)
  in RegexAST::Group then build(node.child)
  else
    raise "unknown AST node: #{node.class}"
  end
end
|
|
85
|
+
|
|
86
|
+
# Compute epsilon closure of a set of states
|
|
87
|
+
# @param states [Set<Integer>] initial states
|
|
88
|
+
# @return [Set<Integer>] epsilon closure
|
|
89
|
+
def epsilon_closure(states)
  reached = Set.new(states)
  pending = states.to_a

  # Depth-first expansion: pop a state, follow its epsilon edges, and queue
  # every target that was not already part of the closure.
  while (state = pending.pop)
    @transitions[state].each do |label, successor|
      next unless label == EPSILON

      # Set#add? returns nil when the element is already present.
      pending << successor if reached.add?(successor)
    end
  end

  reached
end
|
|
104
|
+
|
|
105
|
+
# Get all states reachable from a set of states on a given input
|
|
106
|
+
# @param states [Set<Integer>] current states
|
|
107
|
+
# @param input [Integer] input byte
|
|
108
|
+
# @return [Set<Integer>] reachable states
|
|
109
|
+
def move(states, input)
  # Gather the target of every edge whose label equals the requested input.
  states.each_with_object(Set.new) do |state, reachable|
    @transitions[state].each do |label, destination|
      reachable << destination if label == input
    end
  end
end
|
|
118
|
+
|
|
119
|
+
# Compute the set of possible first bytes this NFA can match
|
|
120
|
+
# @return [Set<Integer>] set of bytes (0-255)
|
|
121
|
+
def first_byte_set
  # Every state reachable from the start without consuming input can
  # contribute a first byte; only Integer labels count (epsilon is a Symbol).
  epsilon_closure(Set[@start_state]).each_with_object(Set.new) do |state, bytes|
    @transitions[state].each do |label, _destination|
      bytes << label if label.is_a?(Integer)
    end
  end
end
|
|
133
|
+
|
|
134
|
+
private
|
|
135
|
+
|
|
136
|
+
# Build a fragment for a literal node: one byte, or the empty string when
# the node carries no byte.
# @param node [RegexAST::Literal]
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_literal(node)
  start = new_state
  accept = new_state

  if node.byte.nil?
    # Empty literal: connect the two states with an epsilon edge. Without it
    # accept is unreachable and the fragment matches nothing at all.
    add_transition(start, accept, EPSILON)
  else
    # Apply case folding if enabled: one edge per byte returned by
    # case_fold_byte (presumably provided by the CaseFolding mixin).
    case_fold_byte(node.byte).each do |byte|
      add_transition(start, accept, byte)
    end
  end

  [start, accept]
end
|
|
149
|
+
|
|
150
|
+
# Build a fragment matching exactly one byte from a character class.
# @param node [RegexAST::CharClass] ranges are [from, to] byte pairs
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_char_class(node)
  start = new_state
  accept = new_state

  # Expand ranges with case folding and collect the covered bytes in a Set,
  # so overlapping or folded-onto-each-other ranges contribute each byte once
  # instead of producing duplicate transitions.
  members = Set.new
  node.ranges.each do |from, to|
    case_fold_range(from, to).each do |low, high|
      members.merge(low..high)
    end
  end

  # Negated: every byte value NOT in the class. Normal: the class itself.
  bytes = node.negated ? (0..255).reject { |byte| members.include?(byte) } : members
  bytes.each { |byte| add_transition(start, accept, byte) }

  [start, accept]
end
|
|
179
|
+
|
|
180
|
+
# Fragment for `.`: a single edge per byte value, newline excluded.
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_any
  start, accept = new_state, new_state
  256.times do |byte|
    next if byte == 0x0A # newline is the one byte `.` does not match

    add_transition(start, accept, byte)
  end
  [start, accept]
end
|
|
189
|
+
|
|
190
|
+
# Build a fragment matching the children one after another.
# @param node [RegexAST::Concat]
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_concat(node)
  if node.children.empty?
    # Empty sequence matches the empty string: the two states need an epsilon
    # edge between them, otherwise accept can never be reached.
    start = new_state
    accept = new_state
    add_transition(start, accept, EPSILON)
    return [start, accept]
  end

  fragments = node.children.map { |child| build(child) }

  # Chain fragments together with epsilon transitions
  fragments.each_cons(2) do |(_, accept), (next_start, _)|
    add_transition(accept, next_start, EPSILON)
  end

  [fragments.first[0], fragments.last[1]]
end
|
|
202
|
+
|
|
203
|
+
# Fragment for `a|b|...`: a shared entry state fans out to every branch and
# every branch funnels into a shared exit state, all via epsilon edges.
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_alternation(node)
  fork_state, join_state = new_state, new_state

  node.children.each do |branch|
    branch_entry, branch_exit = build(branch)
    [[fork_state, branch_entry], [branch_exit, join_state]].each do |source, target|
      add_transition(source, target, EPSILON)
    end
  end

  [fork_state, join_state]
end
|
|
215
|
+
|
|
216
|
+
# Pick the construction matching the quantifier's bounds; max = nil means
# unbounded. Anything other than *, + and ? goes through build_counted.
def build_quantifier(node)
  lower = node.min
  upper = node.max
  operand = node.child

  case [lower, upper]
  in [0, nil] then build_star(operand)      # *
  in [1, nil] then build_plus(operand)      # +
  in [0, 1] then build_optional(operand)    # ?
  else build_counted(operand, lower, upper) # {n}, {n,}, {n,m}
  end
end
|
|
228
|
+
|
|
229
|
+
# Fragment for `child*`: the child may be skipped entirely or repeated.
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_star(child_ast)
  entry, exit_state = new_state, new_state
  inner_entry, inner_exit = build(child_ast)

  [
    [entry, inner_entry],      # enter the child
    [entry, exit_state],       # or skip it (zero repetitions)
    [inner_exit, inner_entry], # loop back for another repetition
    [inner_exit, exit_state]   # or leave
  ].each { |source, target| add_transition(source, target, EPSILON) }

  [entry, exit_state]
end
|
|
242
|
+
|
|
243
|
+
# Fragment for `child+`: like star, but with no edge that skips the child.
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_plus(child_ast)
  entry, exit_state = new_state, new_state
  inner_entry, inner_exit = build(child_ast)

  [
    [entry, inner_entry],      # the child must be entered at least once
    [inner_exit, inner_entry], # loop back for another repetition
    [inner_exit, exit_state]   # or leave
  ].each { |source, target| add_transition(source, target, EPSILON) }

  [entry, exit_state]
end
|
|
255
|
+
|
|
256
|
+
# Fragment for `child?`: take the child once or bypass it.
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_optional(child_ast)
  entry, exit_state = new_state, new_state
  inner_entry, inner_exit = build(child_ast)

  [
    [entry, inner_entry],    # take the child
    [entry, exit_state],     # or bypass it
    [inner_exit, exit_state] # child finished
  ].each { |source, target| add_transition(source, target, EPSILON) }

  [entry, exit_state]
end
|
|
268
|
+
|
|
269
|
+
# Build a fragment for a counted repetition: {n}, {n,} or {n,m}.
# Every repetition is a freshly built copy of the child fragment.
# @param child_ast [Object] AST node handed to build for each copy
# @param min [Integer] number of mandatory copies
# @param max [Integer, nil] upper bound; nil means unbounded
# @return [Array(Integer, Integer)] [start_state, accept_state]
def build_counted(child_ast, min, max)
  start = new_state
  current = start

  # Build min required copies, chained one after another.
  min.times do
    child_start, child_accept = build(child_ast)
    add_transition(current, child_start, EPSILON)
    current = child_accept
  end

  # After the mandatory part the match may already end.
  accept = new_state
  add_transition(current, accept, EPSILON)

  if max.nil?
    # {n,} - min copies followed by star
    child_start, child_accept = build(child_ast)
    add_transition(current, child_start, EPSILON)
    add_transition(child_accept, child_start, EPSILON)
    add_transition(child_accept, accept, EPSILON)
  elsif max > min
    # {n,m} - min copies followed by (max-min) optional copies
    (max - min).times do
      child_start, child_accept = build(child_ast)
      # `current` already has its edge to accept (added above or at the end of
      # the previous iteration), so only the edge into the optional copy is
      # needed here; adding the exit edge again would just duplicate it.
      add_transition(current, child_start, EPSILON)
      current = child_accept
      add_transition(current, accept, EPSILON)
    end
  end

  [start, accept]
end
|
|
302
|
+
end
|
|
303
|
+
end
|
|
304
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LexerKit
|
|
4
|
+
module DFA
|
|
5
|
+
# RegexAST contains immutable AST node types for parsed regular expressions.
|
|
6
|
+
# Uses Ruby's Data class for immutability and value semantics.
|
|
7
|
+
module RegexAST
|
|
8
|
+
# Metadata for AST nodes (source location, original text, etc.)
|
|
9
|
+
Meta = Data.define(:span, :literal_text, :codepoint, :bytes) do
  # Compact rendering of the populated fields, e.g. {text="a", cp=U+61}.
  # Unset (nil/false) fields are omitted; `bytes` is never part of the output.
  # @return [String]
  def to_s
    fragments = [
      span && "span=#{span.inspect}",
      literal_text && "text=#{literal_text.inspect}",
      codepoint && "cp=U+#{codepoint.to_s(16).upcase}"
    ].select(&:itself)

    "{#{fragments.join(', ')}}"
  end
end
|
|
18
|
+
|
|
19
|
+
# Literal byte match
|
|
20
|
+
Literal = Data.define(:byte, :meta)

# Character class such as [a-z] or [^0-9].
# ranges is a list of [from, to] pairs; negated flips the class.
CharClass = Data.define(:ranges, :negated, :meta)

# Sequence of sub-patterns matched one after another.
Concat = Data.define(:children, :meta)

# Choice between sub-patterns, e.g. a|b|c.
Alternation = Data.define(:children, :meta)

# Repetition of a sub-pattern: *, +, ? or {n,m}.
# max = nil stands for "no upper bound".
Quantifier = Data.define(:child, :min, :max, :greedy, :meta)

# Parenthesized group (capturing or non-capturing) around a single child.
Group = Data.define(:child, :meta)

# The `.` pattern - any byte except newline.
Any = Data.define(:meta)
|
|
41
|
+
|
|
42
|
+
# Complete regex pattern with flags
|
|
43
|
+
# Wraps an AST node with pattern-level settings
|
|
44
|
+
Regex = Data.define(:ast, :case_insensitive) do
  # Normalize any supported pattern representation into a Regex.
  # Only the case-insensitivity flag of a Regexp is read here.
  # @param pattern [Regexp, String, Regex, RegexAstProvider]
  # @return [Regex]
  def self.parse(pattern)
    case pattern
    when self
      # Already a Regex: hand it back untouched.
      pattern
    when LexerKit::RegexAstProvider
      # Providers know how to convert themselves.
      pattern.to_regex
    when Regexp
      new(ast: RegexParser.new(pattern.source).parse, case_insensitive: pattern.casefold?)
    else
      # Anything else is passed to the parser as the pattern source itself.
      new(ast: RegexParser.new(pattern).parse, case_insensitive: false)
    end
  end
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|