lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LexerKit
|
|
4
|
+
module DFA
|
|
5
|
+
# RegexParser parses a subset of regular expressions into an AST.
|
|
6
|
+
# Supported features:
|
|
7
|
+
# - Literal characters
|
|
8
|
+
# - Character classes: [a-z], [^abc], ., \d, \w, \s, \D, \W, \S
|
|
9
|
+
# - Quantifiers: *, +, ?, {n}, {n,}, {n,m}
|
|
10
|
+
# - Alternation: |
|
|
11
|
+
# - Grouping: (), (?:...)
|
|
12
|
+
# - Escapes: \\, \., \[, etc.
|
|
13
|
+
class RegexParser
|
|
14
|
+
# Include AST types from RegexAST module
|
|
15
|
+
include RegexAST
|
|
16
|
+
|
|
17
|
+
# Build a parser over +source+, which may be a Regexp (its source text
# is used) or a plain pattern String. The cursor starts at character 0.
def initialize(source)
  @source = source.is_a?(Regexp) ? source.source : source
  @pos = 0
end

# Parse the whole pattern and return the root AST node.
# Raises LexerKit::ParseError when trailing input remains after the
# top-level alternation.
def parse
  ast = parse_alternation
  error("unexpected character at position #{@pos}") unless eof?
  ast
end

private

# Raise a ParseError that carries the pattern text and the current
# character position so callers can render source context.
def error(message)
  raise LexerKit::ParseError.new(message, source: Core::Source.new(@source), position: @pos)
end
|
|
35
|
+
|
|
36
|
+
# alternation := concat ("|" concat)*
# A single branch is returned as-is; multiple branches become an
# Alternation node spanning the whole run.
def parse_alternation
  start_pos = @pos
  branches = [parse_concat]

  loop do
    break unless peek == "|"

    advance
    branches << parse_concat
  end

  return branches.first if branches.size == 1

  Alternation.new(children: branches, meta: meta_for_children(branches, start_pos, @pos))
end
|
|
49
|
+
|
|
50
|
+
# concat := quantified*
# Stops at end of input, an alternation bar, or a closing paren.
# Zero atoms yields an epsilon node (empty Literal); one atom is
# returned unwrapped; otherwise a Concat node is built.
def parse_concat
  start_pos = @pos
  parts = []
  parts << parse_quantified until eof? || peek == "|" || peek == ")"

  if parts.empty?
    Literal.new(byte: nil, meta: nil) # empty (epsilon)
  elsif parts.size == 1
    parts.first
  else
    Concat.new(children: parts, meta: meta_for_children(parts, start_pos, @pos))
  end
end
|
|
64
|
+
|
|
65
|
+
# quantified := atom ("*" | "+" | "?" | "{...}")? "?"?
# A trailing "?" after a quantifier makes it non-greedy (lazy).
def parse_quantified
  start_pos = @pos
  atom = parse_atom

  # min/max bounds for the three simple quantifier tokens.
  bounds = { "*" => [0, nil], "+" => [1, nil], "?" => [0, 1] }[peek]
  if bounds
    advance
    lazy = peek == "?"
    advance if lazy
    min, max = bounds
    Quantifier.new(child: atom, min: min, max: max, greedy: !lazy, meta: meta_for_span(start_pos, @pos))
  elsif peek == "{"
    parse_counted_quantifier(atom, start_pos)
  else
    atom
  end
end
|
|
91
|
+
|
|
92
|
+
# Parse a counted quantifier {n}, {n,} or {n,m} applied to +child+.
# +start_pos+ is the character position where the atom began (used for
# the node's span). A trailing "?" makes the quantifier non-greedy.
#
# @param child [Object] the AST node being repeated
# @param start_pos [Integer] character index where the quantified atom starts
# @return [Quantifier]
def parse_counted_quantifier(child, start_pos)
  advance # skip {
  min = parse_number
  max = min

  if peek == ","
    advance
    # "{n,}" means unbounded; otherwise read the upper bound.
    max = peek == "}" ? nil : parse_number
  end

  expect("}")
  # Reject out-of-order ranges such as {5,2}: previously this silently
  # built an unsatisfiable quantifier (min > max). Ruby/PCRE treat this
  # as an error, so we do too.
  error("quantifier range {#{min},#{max}} is out of order at position #{start_pos}") if max && min > max

  greedy = peek != "?"
  advance if peek == "?"

  Quantifier.new(child: child, min: min, max: max, greedy: greedy, meta: meta_for_span(start_pos, @pos))
end
|
|
108
|
+
|
|
109
|
+
# Parse a non-negative decimal integer at the current position.
# Raises a ParseError when no digit is present: previously the empty
# slice was silently coerced with to_i ("".to_i == 0), so malformed
# quantifiers like "a{,3}" or "a{}" parsed as if they said 0.
#
# @return [Integer]
def parse_number
  start = @pos
  advance while peek =~ /\d/
  error("expected number at position #{@pos}") if @pos == start

  @source[start...@pos].to_i
end
|
|
114
|
+
|
|
115
|
+
# atom := group | char-class | escape | "." | anchor | literal-char
# Dispatches on the current character. Anchors are accepted but
# compiled to an epsilon node (the DFA backend cannot express them).
# Quantifier/alternation metacharacters with no preceding atom are a
# parse error.
def parse_atom
  start_pos = @pos
  case peek
  when "("
    advance
    # Check for non-capturing group (?:...) or other extensions
    if peek == "?"
      advance
      case peek
      when ":"
        # Non-capturing group (?:...) - same as regular group for DFA
        advance
        result = parse_alternation
        expect(")")
        Group.new(child: result, meta: meta_for_span(start_pos, @pos))
      else
        # Other extensions like (?=...), (?!...) etc - not supported
        error("unsupported group extension (?#{peek}) at position #{@pos}")
      end
    else
      # Regular capturing group (capturing semantics are irrelevant to
      # the DFA; it is kept only for grouping)
      result = parse_alternation
      expect(")")
      Group.new(child: result, meta: meta_for_span(start_pos, @pos))
    end
  when "["
    parse_char_class
  when "\\"
    parse_escape(start_pos)
  when "."
    advance
    Any.new(meta: meta_for_span(start_pos, @pos, literal_text: "."))
  when "^", "$"
    # Anchors - not supported in DFA, just skip (epsilon literal)
    advance
    Literal.new(byte: nil, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
  when nil, "|", ")", "*", "+", "?", "{"
    # Metacharacter with nothing to apply to, or premature end of input.
    error("unexpected '#{peek}' at position #{@pos}")
  else
    # Handle multi-byte UTF-8 characters
    char = peek
    advance
    bytes = char.bytes
    if bytes.size == 1
      make_literal_node(bytes[0], meta_for_span(start_pos, @pos, literal_text: char, codepoint: bytes[0], bytes: bytes))
    else
      # Multi-byte UTF-8: create concatenation of literals, one per byte
      # Note: case folding only applies to ASCII letters, so UTF-8 bytes are unchanged
      meta = meta_for_span(start_pos, @pos, literal_text: char, codepoint: char.ord, bytes: bytes)
      Concat.new(children: bytes.map { |b| Literal.new(byte: b, meta: nil) }, meta: meta)
    end
  end
end
|
|
168
|
+
|
|
169
|
+
# Parse a bracket expression "[...]" (optionally negated with a
# leading "^"). Items and ranges are accumulated into a
# CharClassCollector, which produces the final AST node.
# NOTE(review): the method-level rescue converts any ArgumentError
# raised inside the body (collector validation, e.g. reversed ranges)
# into a ParseError at the current position.
def parse_char_class
  start_pos = @pos
  advance # skip [
  negated = peek == "^"
  advance if negated

  collector = CharClassCollector.new

  while peek && peek != "]"
    start_item = parse_char_class_item

    if range_follows?
      advance # skip -
      end_item = parse_char_class_item
      collector.add_range(start_item, end_item)
    else
      collector.add_item(start_item)
    end
  end

  expect("]")
  meta = meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos])
  collector.to_ast(negated: negated, meta: meta)
rescue ArgumentError => e
  error(e.message)
end

# True when the next character starts a range ("a-z") rather than a
# literal "-": a trailing "-]" keeps the dash literal.
def range_follows?
  peek == "-" && @source[@pos + 1] != "]"
end
|
|
199
|
+
|
|
200
|
+
# Read one operand of a character class: an escape sequence or a
# literal character. Single-byte items are returned as
# { type: :byte, value: } and multi-byte UTF-8 items as
# { type: :codepoint, value: }.
def parse_char_class_item
  if peek == "\\"
    advance
    escape_bytes = parse_escape_char
    return { type: :byte, value: escape_bytes.first } if escape_bytes.length == 1

    # Re-decode the UTF-8 byte sequence back into a codepoint.
    decoded = escape_bytes.pack("C*").force_encoding("UTF-8").ord
    return { type: :codepoint, value: decoded }
  end

  char = peek
  advance
  char_bytes = char.bytes
  if char_bytes.length == 1
    { type: :byte, value: char_bytes.first }
  else
    { type: :codepoint, value: char.ord }
  end
end
|
|
221
|
+
|
|
222
|
+
# Parse an escape sequence in atom position (the "\" has been peeked,
# not consumed). Shorthand classes (\d \D \w \W \s \S) become
# CharClass nodes; control escapes and \xHH become single-byte
# Literals; \uHHHH becomes a Literal or a byte-wise Concat depending
# on the UTF-8 length; any other escaped character is taken literally.
def parse_escape(start_pos)
  advance # skip \

  case peek
  when "d"
    advance
    CharClass.new(ranges: [[0x30, 0x39]], negated: false, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
  when "D"
    advance
    CharClass.new(ranges: [[0x30, 0x39]], negated: true, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
  when "w"
    advance
    # digits, upper, lower, underscore
    CharClass.new(ranges: [[0x30, 0x39], [0x41, 0x5A], [0x61, 0x7A], [0x5F, 0x5F]], negated: false,
                  meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
  when "W"
    advance
    CharClass.new(ranges: [[0x30, 0x39], [0x41, 0x5A], [0x61, 0x7A], [0x5F, 0x5F]], negated: true,
                  meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
  when "s"
    advance
    CharClass.new(ranges: [[0x09, 0x0D], [0x20, 0x20]], negated: false,
                  meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos])) # \t\n\v\f\r and space
  when "S"
    advance
    CharClass.new(ranges: [[0x09, 0x0D], [0x20, 0x20]], negated: true,
                  meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
  when "n"
    advance
    Literal.new(byte: 0x0A, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0A, bytes: [0x0A])) # newline
  when "r"
    advance
    Literal.new(byte: 0x0D, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0D, bytes: [0x0D])) # carriage return
  when "t"
    advance
    Literal.new(byte: 0x09, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x09, bytes: [0x09])) # tab
  when "v"
    advance
    Literal.new(byte: 0x0B, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0B, bytes: [0x0B])) # vertical tab
  when "f"
    advance
    Literal.new(byte: 0x0C, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0C, bytes: [0x0C])) # form feed
  when "0"
    advance
    Literal.new(byte: 0x00, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x00, bytes: [0x00])) # null
  when "b"
    advance
    # NOTE(review): in atom position \b is treated as backspace, not a
    # word boundary (boundaries are not expressible in this DFA subset).
    Literal.new(byte: 0x08, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x08, bytes: [0x08])) # backspace
  when "x"
    advance
    hex = consume_hex_digits(2)
    Literal.new(byte: hex, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: hex, bytes: [hex]))
  when "u"
    advance
    codepoint = consume_hex_digits(4)
    # Convert Unicode codepoint to UTF-8 byte sequence
    utf8_bytes = [codepoint].pack("U").bytes
    if utf8_bytes.size == 1
      Literal.new(byte: utf8_bytes[0], meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: codepoint, bytes: utf8_bytes))
    else
      # Multi-byte UTF-8: create concatenation of literals
      meta = meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: codepoint, bytes: utf8_bytes)
      Concat.new(children: utf8_bytes.map { |b| Literal.new(byte: b, meta: nil) }, meta: meta)
    end
  when nil
    error("unexpected end of pattern after \\")
  else
    # Unknown escape: take the character literally (e.g. \., \[, \\).
    byte = peek.ord
    advance
    Literal.new(byte: byte, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: byte, bytes: [byte]))
  end
end
|
|
293
|
+
|
|
294
|
+
# Consume exactly +count+ hex digits and return their integer value.
# Raises a ParseError on the first non-hex character.
def consume_hex_digits(count)
  value = 0
  count.times do
    digit = peek
    error("expected hex digit at position #{@pos}") unless digit =~ /[0-9a-fA-F]/

    value = (value << 4) | digit.to_i(16)
    advance
  end
  value
end
|
|
304
|
+
|
|
305
|
+
# Returns an array of bytes (for multi-byte UTF-8 support).
# Character-class flavour of escape parsing: the "\" has already been
# consumed. Unlike parse_escape, shorthand classes are not handled
# here - every escape resolves to a concrete byte sequence.
def parse_escape_char
  case peek
  when "n"
    advance
    [0x0A] # newline
  when "r"
    advance
    [0x0D] # carriage return
  when "t"
    advance
    [0x09] # tab
  when "v"
    advance
    [0x0B] # vertical tab
  when "f"
    advance
    [0x0C] # form feed
  when "0"
    advance
    [0x00] # null
  when "b"
    advance
    [0x08] # backspace (inside a class \b is always backspace)
  when "x"
    advance
    [consume_hex_digits(2)]
  when "u"
    advance
    codepoint = consume_hex_digits(4)
    # UTF-8-encode the codepoint; may be 1-4 bytes.
    [codepoint].pack("U").bytes
  when nil then error("unexpected end of pattern after \\")
  else
    # Unknown escape: the character stands for itself.
    byte = peek.ord
    advance
    [byte]
  end
end
|
|
343
|
+
|
|
344
|
+
# Current character (not byte) at the cursor, or nil at end of input.
def peek
  @source[@pos]
end

# Move the cursor forward by one character.
def advance
  @pos += 1
end

# Consume +char+ or raise a ParseError naming what was found instead.
def expect(char)
  if peek != char
    error("expected '#{char}' at position #{@pos}, got '#{peek}'")
  end

  advance
end

# True when the cursor has passed the last character of the pattern.
def eof?
  @pos >= @source.length
end
|
|
363
|
+
|
|
364
|
+
# Create a literal node for a single byte
def make_literal_node(byte, meta)
  Literal.new(byte: byte, meta: meta)
end

# Build a Meta for the character span [start_pos, end_pos).
# Parser positions are CHARACTER indices, but the stored span is in
# BYTE offsets - hence the prefix-slice bytesize conversion.
def meta_for_span(start_pos, end_pos, literal_text: nil, codepoint: nil, bytes: nil)
  Meta.new(
    span: [@source[0...start_pos].bytesize, @source[0...end_pos].bytesize],
    literal_text: literal_text,
    codepoint: codepoint,
    bytes: bytes
  )
end

# Meta for a composite node covering its children; nil when the node
# has no children (nothing to span).
def meta_for_children(children, start_pos, end_pos)
  return nil if children.empty?

  meta_for_span(start_pos, end_pos)
end
|
|
383
|
+
end
|
|
384
|
+
end
|
|
385
|
+
end
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LexerKit
|
|
4
|
+
module DFA
|
|
5
|
+
# Build regex AST from UTF-8 codepoint ranges.
|
|
6
|
+
module Utf8Range
|
|
7
|
+
# Build a byte-level regex AST matching any codepoint in +ranges+
# (an array of [start_cp, end_cp] pairs). An empty expansion yields
# an epsilon literal; a single alternative is returned unwrapped.
def self.ast_for_ranges(ranges)
  alternatives = ranges.flat_map { |lo, hi| build_range(lo, hi) }

  case alternatives.size
  when 0 then RegexAST::Literal.new(byte: nil, meta: nil)
  when 1 then alternatives.first
  else RegexAST::Alternation.new(children: alternatives, meta: nil)
  end
end
|
|
17
|
+
|
|
18
|
+
# Expand one codepoint range into AST segments, partitioned by UTF-8
# encoded length (1 through 4 bytes). Raises ArgumentError for a
# reversed range.
def self.build_range(start_cp, end_cp)
  raise ArgumentError, "invalid range" if start_cp > end_cp

  (1..4).flat_map { |length| build_for_length(start_cp, end_cp, length) }
end
|
|
28
|
+
|
|
29
|
+
# Emit AST alternatives covering the part of [start_cp, end_cp] whose
# UTF-8 encoding is exactly +length+ bytes. Returns an array of
# Concat nodes (possibly empty when the range does not intersect the
# length's codepoint band).
def self.build_for_length(start_cp, end_cp, length)
  # The 3-byte band contains the surrogate block U+D800..U+DFFF; split
  # around it so no emitted sequence ever encodes a surrogate.
  if length == 3 && overlaps_surrogates?(start_cp, end_cp)
    segments = []
    segments.concat(build_for_length(start_cp, 0xD7FF, length)) if start_cp <= 0xD7FF
    segments.concat(build_for_length(0xE000, end_cp, length)) if end_cp >= 0xE000
    return segments
  end

  # Clamp the request to the codepoints encodable in +length+ bytes.
  min_cp, max_cp = codepoint_bounds(length)
  return [] if end_cp < min_cp || start_cp > max_cp

  s = [start_cp, min_cp].max
  e = [end_cp, max_cp].min
  return [] if s > e

  segments = []
  # Handle each valid lead byte separately: within a single lead the
  # continuation-byte bounds are uniform, which keeps the recursion in
  # build_sequences simple.
  lead_bytes_for_length(length).each do |lead|
    lead_min_cp, lead_max_cp = codepoint_range_for_lead(lead, length)
    next if e < lead_min_cp || s > lead_max_cp

    seg_start = [s, lead_min_cp].max
    seg_end = [e, lead_max_cp].min
    start_bytes = utf8_bytes(seg_start)
    end_bytes = utf8_bytes(seg_end)
    min_bytes, max_bytes = byte_bounds_for_lead(lead, length)
    # Sequences start at index 1: the lead byte is prepended explicitly.
    segments.concat(build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, 1).map do |seq|
      RegexAST::Concat.new(children: [byte_node(lead)] + seq, meta: nil)
    end)
  end

  segments
end
|
|
61
|
+
|
|
62
|
+
# Recursively build byte-node sequences matching every byte string
# between start_bytes and end_bytes (inclusive) from position +idx+
# onward, where min_bytes/max_bytes give the per-position valid byte
# bounds. Returns an array of node arrays (one node per byte position).
def self.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx)
  return [[]] if idx >= start_bytes.size

  # Endpoints agree at this position: fix the byte and recurse.
  if start_bytes[idx] == end_bytes[idx]
    tails = build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx + 1)
    return tails.map { |t| [byte_node(start_bytes[idx])] + t }
  end

  sequences = []

  # 1) Keep start_bytes[idx]: the tail is bounded below by start_bytes
  #    and above by the position-wise maximum bytes.
  end_first = max_bytes.dup
  end_first[0...idx] = start_bytes[0...idx]
  end_first[idx] = start_bytes[idx]
  first_tails = build_sequences(start_bytes, end_first, min_bytes, max_bytes, idx + 1)
  sequences.concat(first_tails.map { |t| [byte_node(start_bytes[idx])] + t })

  # 2) Byte at idx strictly between the endpoints: every later position
  #    may range over its full min..max span.
  if start_bytes[idx] + 1 <= end_bytes[idx] - 1
    middle = byte_range_node(start_bytes[idx] + 1, end_bytes[idx] - 1)
    sequences << ([middle] + full_range_nodes(min_bytes, max_bytes, idx + 1))
  end

  # 3) Keep end_bytes[idx]: the tail is bounded above by end_bytes and
  #    below by the position-wise minimum bytes.
  start_last = min_bytes.dup
  start_last[0...idx] = end_bytes[0...idx]
  start_last[idx] = end_bytes[idx]
  last_tails = build_sequences(start_last, end_bytes, min_bytes, max_bytes, idx + 1)
  sequences.concat(last_tails.map { |t| [byte_node(end_bytes[idx])] + t })

  sequences
end
|
|
91
|
+
|
|
92
|
+
# Nodes matching the full min..max byte span at every position from
# +idx+ to the end of the sequence.
def self.full_range_nodes(min_bytes, max_bytes, idx)
  min_bytes.zip(max_bytes).drop(idx).map { |lo, hi| byte_range_node(lo, hi) }
end

# Single-byte literal node.
def self.byte_node(byte)
  RegexAST::Literal.new(byte: byte, meta: nil)
end

# Node matching any byte in min_byte..max_byte; collapses to a plain
# literal when the span is a single byte.
def self.byte_range_node(min_byte, max_byte)
  return RegexAST::Literal.new(byte: min_byte, meta: nil) if min_byte == max_byte

  RegexAST::CharClass.new(ranges: [[min_byte, max_byte]], negated: false, meta: nil)
end
|
|
109
|
+
|
|
110
|
+
# UTF-8 encode +codepoint+ and return the encoded byte values.
def self.utf8_bytes(codepoint)
  codepoint.chr(Encoding::UTF_8).bytes
end
|
|
113
|
+
|
|
114
|
+
# Inclusive codepoint bounds encodable in +length+ UTF-8 bytes.
# An unsupported length yields the empty interval [1, 0].
def self.codepoint_bounds(length)
  { 1 => [0x0, 0x7F],
    2 => [0x80, 0x7FF],
    3 => [0x800, 0xFFFF],
    4 => [0x10000, 0x10FFFF] }.fetch(length, [1, 0])
end

# True when [start_cp, end_cp] intersects the UTF-16 surrogate block
# U+D800..U+DFFF (which must never be UTF-8 encoded).
def self.overlaps_surrogates?(start_cp, end_cp)
  start_cp <= 0xDFFF && end_cp >= 0xD800
end

# All well-formed UTF-8 lead bytes for a sequence of +length+ bytes
# (excluding the overlong leads C0/C1 and the invalid F5..FF).
def self.lead_bytes_for_length(length)
  { 1 => (0x00..0x7F),
    2 => (0xC2..0xDF),
    3 => (0xE0..0xEF),
    4 => (0xF0..0xF4) }.fetch(length, (0...0)).to_a
end
|
|
137
|
+
|
|
138
|
+
# Minimum and maximum byte sequences (lead byte included) for a
# well-formed UTF-8 sequence of the given +length+ starting with
# +lead+. The second byte is restricted for E0/ED (3-byte) and
# F0/F4 (4-byte) leads.
def self.byte_bounds_for_lead(lead, length)
  case length
  when 1
    [[lead], [lead]]
  when 2
    [[lead, 0x80], [lead, 0xBF]]
  when 3
    min_second, max_second =
      case lead
      when 0xE0 then [0xA0, 0xBF] # excludes overlong encodings
      when 0xED then [0x80, 0x9F] # excludes surrogates
      else [0x80, 0xBF]
      end
    [[lead, min_second, 0x80], [lead, max_second, 0xBF]]
  when 4
    min_second, max_second =
      case lead
      when 0xF0 then [0x90, 0xBF] # excludes overlong encodings
      when 0xF4 then [0x80, 0x8F] # excludes codepoints above U+10FFFF
      else [0x80, 0xBF]
      end
    [[lead, min_second, 0x80, 0x80], [lead, max_second, 0xBF, 0xBF]]
  else
    [[], []]
  end
end

# Inclusive codepoint interval [min_cp, max_cp] reachable by sequences
# starting with +lead+, derived by decoding the byte bounds.
def self.codepoint_range_for_lead(lead, length)
  byte_bounds_for_lead(lead, length).map { |b| b.pack("C*").force_encoding("UTF-8").ord }
end
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LexerKit
|
|
4
|
+
module DFA
|
|
5
|
+
# Adapter exposing a set of codepoint ranges through the
# RegexAstProvider interface; to_ast expands the ranges into a
# byte-level regex AST via Utf8Range.
class Utf8RangePattern
  include LexerKit::RegexAstProvider

  # @param ranges [Array<Array(Integer, Integer)>] [start_cp, end_cp] pairs
  def initialize(ranges)
    @ranges = ranges
  end

  # Build the byte-oriented AST matching any codepoint in the ranges.
  def to_ast
    Utf8Range.ast_for_ranges(@ranges)
  end
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "dfa/regex_ast"
|
|
4
|
+
require_relative "dfa/utf8_range"
|
|
5
|
+
require_relative "dfa/case_folding"
|
|
6
|
+
require_relative "dfa/char_class_collector"
|
|
7
|
+
require_relative "dfa/regex_parser"
|
|
8
|
+
require_relative "dfa/nfa"
|
|
9
|
+
require_relative "dfa/dfa_minimizer"
|
|
10
|
+
require_relative "dfa/byte_class_builder"
|
|
11
|
+
require_relative "dfa/dfa_builder"
|
|
12
|
+
|
|
13
|
+
module LexerKit
  # DFA module provides regex to DFA compilation.
  module DFA
    # Compile a Regex to DFA via an intermediate NFA (Thompson-style
    # construction in NFA.from_regex, then determinization in DFABuilder).
    # @param regex [RegexAST::Regex] regex pattern
    # @param token_id [Integer] token ID for accepting state
    # @return [IR::DFATable]
    # @raise [ArgumentError] when +regex+ is not a RegexAST::Regex
    def self.compile_regex(regex, token_id = 0)
      raise ArgumentError, "expected Regex, got #{regex.class}" unless regex.is_a?(RegexAST::Regex)

      nfa = NFA.from_regex(regex, token_id)
      DFABuilder.new(nfa).build
    end

    # Compute the set of possible first bytes for a Regex
    # @param regex [RegexAST::Regex] regex pattern
    # @return [Set<Integer>] set of bytes (0-255)
    # @raise [ArgumentError] when +regex+ is not a RegexAST::Regex
    def self.first_byte_set(regex)
      raise ArgumentError, "expected Regex, got #{regex.class}" unless regex.is_a?(RegexAST::Regex)

      # token_id 0 is a placeholder; only the transition structure matters here.
      nfa = NFA.from_regex(regex, 0)
      nfa.first_byte_set
    end
  end
end
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LexerKit
|
|
4
|
+
# Base error for all LexerKit errors
|
|
5
|
+
class Error < StandardError; end
|
|
6
|
+
|
|
7
|
+
# ===========================================================================
|
|
8
|
+
# Binary/File Errors
|
|
9
|
+
# ===========================================================================
|
|
10
|
+
|
|
11
|
+
# Raised when binary deserialization fails (malformed or truncated data)
class InvalidBinaryError < Error; end

# Raised when file integrity check fails (SHA256 mismatch, etc.)
class IntegrityError < Error; end
|
|
16
|
+
|
|
17
|
+
# ===========================================================================
|
|
18
|
+
# User Input Errors (with diagnostic information)
|
|
19
|
+
# ===========================================================================
|
|
20
|
+
|
|
21
|
+
# Base class for errors with diagnostic information (source location, span, etc.)
# Provides rich error messages with source code context.
class DiagnosticError < Error
  attr_reader :diagnostic, :source

  # @param message [String] plain message, always available via #message
  # @param source [Core::Source, nil] source text the error refers to
  # @param span [Object, nil] span within +source+
  # @param level [Symbol] diagnostic severity
  # @param notes [Array, nil] additional notes attached to the diagnostic
  # A Diagnostic is only built when BOTH source and span are given;
  # otherwise #render falls back to the bare message.
  def initialize(message, source: nil, span: nil, level: :error, notes: nil)
    @source = source
    @diagnostic = Core::Diagnostic.new(level: level, message: message, span: span, notes: notes) if source && span
    super(message)
  end

  # Render the error with source-code context; plain message when no
  # diagnostic could be built.
  def render(color: $stderr.tty?)
    return message unless @diagnostic && @source

    @diagnostic.render(@source, color: color)
  end

  # Build an error pointing at a file location (+location+ must respond
  # to #path and #lineno). Degrades to a bare-message error when the
  # file is absent or unreadable.
  def self.from_location(location, message, level: :error, notes: nil)
    return new(message) unless location&.path && File.file?(location.path)

    bytes = File.binread(location.path)
    source = Core::Source.new(bytes, filename: location.path)
    span = source.span_for_line(location.lineno)
    new(message, source: source, span: span, level: level, notes: notes)
  rescue Errno::ENOENT, Errno::EACCES
    # File vanished or became unreadable between the check and the read.
    new(message)
  end
end
|
|
49
|
+
|
|
50
|
+
# Raised during Builder DSL evaluation (invalid token definitions, etc.)
class BuildError < DiagnosticError; end

# Raised during compilation (IR generation, optimization, etc.)
class CompileError < DiagnosticError; end

# Raised during regex parsing
class ParseError < DiagnosticError
  # Character (codepoint) index into the pattern where parsing failed.
  attr_reader :position

  def initialize(message, source: nil, position: nil)
    @position = position # character position (codepoint-based)
    # Translate the character index into a span so DiagnosticError can
    # render source context around the failure point.
    span = source.span_for_char_index(position) if source && position
    super(message, source: source, span: span)
  end
end
|
|
66
|
+
|
|
67
|
+
# ===========================================================================
|
|
68
|
+
# Runtime Errors
|
|
69
|
+
# ===========================================================================
|
|
70
|
+
|
|
71
|
+
# Raised when C native extension is required but not loaded
|
|
72
|
+
class NativeExtensionError < Error; end
|
|
73
|
+
|
|
74
|
+
# Raised during VM execution (stack overflow, infinite loop, etc.)
|
|
75
|
+
class VMError < Error; end
|
|
76
|
+
end
|