lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
# frozen_string_literal: true

module LexerKit
  module DFA
    # RegexParser parses a subset of regular expressions into an AST.
    # Supported features:
    # - Literal characters (including multi-byte UTF-8)
    # - Character classes: [a-z], [^abc], ., \d, \w, \s, \D, \W, \S
    # - Quantifiers: *, +, ?, {n}, {n,}, {n,m} (each with optional lazy '?')
    # - Alternation: |
    # - Grouping: (), (?:...)
    # - Escapes: \\, \., \[, \n, \t, \xHH, \uHHHH, etc.
    #
    # Recursive-descent grammar, one method per precedence level:
    #   alternation := concat ("|" concat)*
    #   concat      := quantified*
    #   quantified  := atom ("*" | "+" | "?" | "{n[,m]}")? "?"?
    class RegexParser
      # AST node types (Literal, Concat, Alternation, Quantifier, Group,
      # CharClass, Any, Meta) are provided by the RegexAST module.
      include RegexAST

      # @param source [String, Regexp] pattern text; a Regexp is reduced to
      #   its source string (flags are ignored)
      def initialize(source)
        @source = source.is_a?(Regexp) ? source.source : source
        @pos = 0 # current character (codepoint) index into @source
      end

      # Parse the entire pattern.
      # @return [Object] root AST node
      # @raise [LexerKit::ParseError] on malformed or unsupported syntax
      def parse
        result = parse_alternation
        # Leftover input means we stopped at a character that cannot start
        # an atom (e.g. an unbalanced ')').
        error("unexpected character at position #{@pos}") unless eof?

        result
      end

      private

      # Raise a ParseError annotated with the pattern text and position.
      def error(message)
        source = Core::Source.new(@source)
        raise LexerKit::ParseError.new(message, source: source, position: @pos)
      end

      # alternation := concat ("|" concat)*
      def parse_alternation
        start_pos = @pos
        children = [parse_concat]

        while peek == "|"
          advance
          children << parse_concat
        end

        # Avoid a needless single-child Alternation wrapper.
        return children[0] if children.length == 1

        Alternation.new(children: children, meta: meta_for_children(children, start_pos, @pos))
      end

      # concat := quantified* — stops at '|', ')' or end of input.
      def parse_concat
        start_pos = @pos
        children = []

        while !eof? && peek != "|" && peek != ")"
          children << parse_quantified
        end

        case children.length
        when 0 then Literal.new(byte: nil, meta: nil) # empty (epsilon) branch
        when 1 then children[0]
        else Concat.new(children: children, meta: meta_for_children(children, start_pos, @pos))
        end
      end

      # quantified := atom optionally followed by *, +, ? or {..}; each
      # quantifier may carry a trailing '?' marking it lazy (non-greedy).
      def parse_quantified
        start_pos = @pos
        child = parse_atom

        case peek
        when "*"
          advance
          greedy = peek != "?"
          advance if peek == "?"
          Quantifier.new(child: child, min: 0, max: nil, greedy: greedy, meta: meta_for_span(start_pos, @pos))
        when "+"
          advance
          greedy = peek != "?"
          advance if peek == "?"
          Quantifier.new(child: child, min: 1, max: nil, greedy: greedy, meta: meta_for_span(start_pos, @pos))
        when "?"
          advance
          greedy = peek != "?"
          advance if peek == "?"
          Quantifier.new(child: child, min: 0, max: 1, greedy: greedy, meta: meta_for_span(start_pos, @pos))
        when "{"
          parse_counted_quantifier(child, start_pos)
        else
          child
        end
      end

      # Parse {n}, {n,} or {n,m} after an atom.
      # @raise [LexerKit::ParseError] on a missing count or an out-of-order
      #   range such as {3,1}
      def parse_counted_quantifier(child, start_pos)
        advance # skip {
        min = parse_number
        max = min

        if peek == ","
          advance
          max = peek == "}" ? nil : parse_number
        end

        expect("}")
        # {n,m} with m < n can never match; reject it here instead of
        # compiling an unsatisfiable quantifier.
        error("quantifier range {#{min},#{max}} is out of order at position #{@pos}") if max && max < min

        greedy = peek != "?"
        advance if peek == "?"

        Quantifier.new(child: child, min: min, max: max, greedy: greedy, meta: meta_for_span(start_pos, @pos))
      end

      # Consume one or more ASCII digits and return them as an Integer.
      # @raise [LexerKit::ParseError] if no digit is present (previously
      #   "".to_i silently yielded 0, turning "a{}" into a {0,0} quantifier)
      def parse_number
        start = @pos
        advance while peek =~ /\d/
        error("expected number at position #{@pos}") if @pos == start

        @source[start...@pos].to_i
      end

      # atom := group | char-class | escape | "." | anchor | literal
      def parse_atom
        start_pos = @pos
        case peek
        when "("
          advance
          # Check for non-capturing group (?:...) or other extensions
          if peek == "?"
            advance
            case peek
            when ":"
              # Non-capturing group (?:...) - same as regular group for DFA
              advance
              result = parse_alternation
              expect(")")
              Group.new(child: result, meta: meta_for_span(start_pos, @pos))
            else
              # Other extensions like (?=...), (?!...) etc - not supported
              error("unsupported group extension (?#{peek}) at position #{@pos}")
            end
          else
            # Regular capturing group (capturing is irrelevant for a DFA)
            result = parse_alternation
            expect(")")
            Group.new(child: result, meta: meta_for_span(start_pos, @pos))
          end
        when "["
          parse_char_class
        when "\\"
          parse_escape(start_pos)
        when "."
          advance
          Any.new(meta: meta_for_span(start_pos, @pos, literal_text: "."))
        when "^", "$"
          # Anchors - not supported in DFA, treated as an empty match
          advance
          Literal.new(byte: nil, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
        when nil, "|", ")", "*", "+", "?", "{"
          error("unexpected '#{peek}' at position #{@pos}")
        else
          # Plain literal character; may be multi-byte UTF-8
          char = peek
          advance
          bytes = char.bytes
          if bytes.size == 1
            make_literal_node(bytes[0], meta_for_span(start_pos, @pos, literal_text: char, codepoint: bytes[0], bytes: bytes))
          else
            # Multi-byte UTF-8: create concatenation of byte literals.
            # Note: case folding only applies to ASCII letters, so UTF-8
            # continuation bytes are unchanged.
            meta = meta_for_span(start_pos, @pos, literal_text: char, codepoint: char.ord, bytes: bytes)
            Concat.new(children: bytes.map { |b| Literal.new(byte: b, meta: nil) }, meta: meta)
          end
        end
      end

      # Parse a bracketed character class: [abc], [a-z0-9], [^...].
      # Items and ranges are accumulated in a CharClassCollector which
      # produces the final AST node (and validates range order).
      def parse_char_class
        start_pos = @pos
        advance # skip [
        negated = peek == "^"
        advance if negated

        collector = CharClassCollector.new

        while peek && peek != "]"
          start_item = parse_char_class_item

          if range_follows?
            advance # skip -
            end_item = parse_char_class_item
            collector.add_range(start_item, end_item)
          else
            collector.add_item(start_item)
          end
        end

        expect("]")
        meta = meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos])
        collector.to_ast(negated: negated, meta: meta)
      rescue ArgumentError => e
        # Collector raises ArgumentError for invalid ranges; surface it as
        # a ParseError with position info.
        error(e.message)
      end

      # A '-' introduces a range only when it is not the closing ']'s
      # immediate predecessor ("[a-]" keeps '-' literal).
      def range_follows?
        peek == "-" && @source[@pos + 1] != "]"
      end

      # Parse one class member (literal char or escape).
      # @return [Hash] {type: :byte, value: Integer} for single-byte items,
      #   {type: :codepoint, value: Integer} for multi-byte UTF-8 items
      def parse_char_class_item
        if peek == "\\"
          advance
          bytes = parse_escape_char
          if bytes.length == 1
            return { type: :byte, value: bytes[0] }
          end

          codepoint = bytes.pack("C*").force_encoding("UTF-8").ord
          return { type: :codepoint, value: codepoint }
        end

        char = peek
        advance
        bytes = char.bytes
        if bytes.length == 1
          { type: :byte, value: bytes[0] }
        else
          { type: :codepoint, value: char.ord }
        end
      end

      # Parse a top-level escape sequence (after seeing '\').
      # Shorthand classes expand to CharClass nodes over byte ranges;
      # control escapes and \xHH/\uHHHH become Literal/Concat nodes.
      def parse_escape(start_pos)
        advance # skip \

        case peek
        when "d"
          advance
          CharClass.new(ranges: [[0x30, 0x39]], negated: false, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
        when "D"
          advance
          CharClass.new(ranges: [[0x30, 0x39]], negated: true, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
        when "w"
          advance
          CharClass.new(ranges: [[0x30, 0x39], [0x41, 0x5A], [0x61, 0x7A], [0x5F, 0x5F]], negated: false,
                        meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
        when "W"
          advance
          CharClass.new(ranges: [[0x30, 0x39], [0x41, 0x5A], [0x61, 0x7A], [0x5F, 0x5F]], negated: true,
                        meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
        when "s"
          advance
          CharClass.new(ranges: [[0x09, 0x0D], [0x20, 0x20]], negated: false,
                        meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos])) # \t\n\v\f\r and space
        when "S"
          advance
          CharClass.new(ranges: [[0x09, 0x0D], [0x20, 0x20]], negated: true,
                        meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos]))
        when "n"
          advance
          Literal.new(byte: 0x0A, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0A, bytes: [0x0A])) # newline
        when "r"
          advance
          Literal.new(byte: 0x0D, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0D, bytes: [0x0D])) # carriage return
        when "t"
          advance
          Literal.new(byte: 0x09, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x09, bytes: [0x09])) # tab
        when "v"
          advance
          Literal.new(byte: 0x0B, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0B, bytes: [0x0B])) # vertical tab
        when "f"
          advance
          Literal.new(byte: 0x0C, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x0C, bytes: [0x0C])) # form feed
        when "0"
          advance
          Literal.new(byte: 0x00, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x00, bytes: [0x00])) # null
        when "b"
          advance
          Literal.new(byte: 0x08, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: 0x08, bytes: [0x08])) # backspace
        when "x"
          advance
          hex = consume_hex_digits(2)
          Literal.new(byte: hex, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: hex, bytes: [hex]))
        when "u"
          advance
          codepoint = consume_hex_digits(4)
          # Convert Unicode codepoint to UTF-8 byte sequence
          utf8_bytes = [codepoint].pack("U").bytes
          if utf8_bytes.size == 1
            Literal.new(byte: utf8_bytes[0], meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: codepoint, bytes: utf8_bytes))
          else
            # Multi-byte UTF-8: create concatenation of byte literals
            meta = meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: codepoint, bytes: utf8_bytes)
            Concat.new(children: utf8_bytes.map { |b| Literal.new(byte: b, meta: nil) }, meta: meta)
          end
        when nil
          error("unexpected end of pattern after \\")
        else
          # Any other escaped character stands for itself (\., \[, \\ ...)
          byte = peek.ord
          advance
          Literal.new(byte: byte, meta: meta_for_span(start_pos, @pos, literal_text: @source[start_pos...@pos], codepoint: byte, bytes: [byte]))
        end
      end

      # Read exactly +count+ hex digits and return their integer value.
      # @raise [LexerKit::ParseError] if a non-hex character is found
      def consume_hex_digits(count)
        digits = +""
        count.times do
          error("expected hex digit at position #{@pos}") unless peek =~ /[0-9a-fA-F]/

          digits << peek
          advance
        end
        digits.to_i(16)
      end

      # Escape handling inside a character class (after the '\').
      # Returns an array of bytes (for multi-byte UTF-8 support).
      def parse_escape_char
        case peek
        when "n"
          advance
          [0x0A] # newline
        when "r"
          advance
          [0x0D] # carriage return
        when "t"
          advance
          [0x09] # tab
        when "v"
          advance
          [0x0B] # vertical tab
        when "f"
          advance
          [0x0C] # form feed
        when "0"
          advance
          [0x00] # null
        when "b"
          advance
          [0x08] # backspace
        when "x"
          advance
          [consume_hex_digits(2)]
        when "u"
          advance
          codepoint = consume_hex_digits(4)
          [codepoint].pack("U").bytes
        when nil then error("unexpected end of pattern after \\")
        else
          byte = peek.ord
          advance
          [byte]
        end
      end

      # Current character, or nil at end of input.
      def peek
        @source[@pos]
      end

      # Move past the current character.
      def advance
        @pos += 1
      end

      # Consume +char+ or raise a ParseError.
      def expect(char)
        if peek != char
          error("expected '#{char}' at position #{@pos}, got '#{peek}'")
        end

        advance
      end

      def eof?
        @pos >= @source.length
      end

      # Create a literal node for a single byte
      def make_literal_node(byte, meta)
        Literal.new(byte: byte, meta: meta)
      end

      # Build a Meta with a byte-offset span (positions are codepoint
      # indices, so the prefix's bytesize gives the byte offset).
      def meta_for_span(start_pos, end_pos, literal_text: nil, codepoint: nil, bytes: nil)
        Meta.new(
          span: [@source[0...start_pos].bytesize, @source[0...end_pos].bytesize],
          literal_text: literal_text,
          codepoint: codepoint,
          bytes: bytes
        )
      end

      # Meta for a composite node; nil when it has no children.
      def meta_for_children(children, start_pos, end_pos)
        return nil if children.empty?

        meta_for_span(start_pos, end_pos)
      end
    end
  end
end
# frozen_string_literal: true

module LexerKit
  module DFA
    # Build regex AST from UTF-8 codepoint ranges.
    #
    # A codepoint range like [0x80, 0x7FF] cannot be expressed as a single
    # byte range in a byte-oriented DFA; it must be decomposed into an
    # alternation of byte sequences grouped by UTF-8 encoded length and
    # lead byte. This module performs that decomposition.
    module Utf8Range
      # Convert a list of inclusive [start_cp, end_cp] pairs into one AST:
      # an Alternation of per-segment Concat nodes (or a simpler node when
      # there are 0 or 1 segments).
      def self.ast_for_ranges(ranges)
        parts = ranges.flat_map do |start_cp, end_cp|
          build_range(start_cp, end_cp)
        end

        # Empty input matches nothing useful; byte: nil acts as an
        # empty/epsilon literal.
        return RegexAST::Literal.new(byte: nil, meta: nil) if parts.empty?
        return parts.first if parts.size == 1

        RegexAST::Alternation.new(children: parts, meta: nil)
      end

      # Split one codepoint range by UTF-8 encoded length (1..4 bytes) and
      # return the concatenation nodes for every resulting segment.
      def self.build_range(start_cp, end_cp)
        raise ArgumentError, "invalid range" if start_cp > end_cp

        segments = []
        segments.concat(build_for_length(start_cp, end_cp, 1))
        segments.concat(build_for_length(start_cp, end_cp, 2))
        segments.concat(build_for_length(start_cp, end_cp, 3))
        segments.concat(build_for_length(start_cp, end_cp, 4))
        segments
      end

      # Build segments for the portion of [start_cp, end_cp] that encodes
      # to exactly +length+ bytes. Returns [] when the range does not
      # intersect that length's codepoint window.
      def self.build_for_length(start_cp, end_cp, length)
        # Surrogates (U+D800..U+DFFF) are not valid UTF-8 scalar values;
        # they fall in the 3-byte window, so split around them.
        if length == 3 && overlaps_surrogates?(start_cp, end_cp)
          segments = []
          segments.concat(build_for_length(start_cp, 0xD7FF, length)) if start_cp <= 0xD7FF
          segments.concat(build_for_length(0xE000, end_cp, length)) if end_cp >= 0xE000
          return segments
        end

        min_cp, max_cp = codepoint_bounds(length)
        return [] if end_cp < min_cp || start_cp > max_cp

        # Clamp the requested range to this length's window.
        s = [start_cp, min_cp].max
        e = [end_cp, max_cp].min
        return [] if s > e

        segments = []
        # Process one lead byte at a time so continuation-byte bounds stay
        # simple (each lead byte covers a contiguous codepoint interval).
        lead_bytes_for_length(length).each do |lead|
          lead_min_cp, lead_max_cp = codepoint_range_for_lead(lead, length)
          next if e < lead_min_cp || s > lead_max_cp

          seg_start = [s, lead_min_cp].max
          seg_end = [e, lead_max_cp].min
          start_bytes = utf8_bytes(seg_start)
          end_bytes = utf8_bytes(seg_end)
          min_bytes, max_bytes = byte_bounds_for_lead(lead, length)
          # Index 1 skips the lead byte; it is prepended to every sequence.
          segments.concat(build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, 1).map do |seq|
            RegexAST::Concat.new(children: [byte_node(lead)] + seq, meta: nil)
          end)
        end

        segments
      end

      # Recursively enumerate byte-node sequences covering every encoding
      # between start_bytes and end_bytes (inclusive) from position +idx+
      # onward, given the per-position min/max continuation-byte bounds.
      #
      # Classic three-way split at each position:
      #   1. sequences starting with start_bytes[idx] (constrained tail)
      #   2. a single middle range (start+1 .. end-1) with free tails
      #   3. sequences ending with end_bytes[idx] (constrained tail)
      def self.build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx)
        return [[]] if idx >= start_bytes.size

        # Same byte at this position: fix it and recurse on the tail.
        if start_bytes[idx] == end_bytes[idx]
          tails = build_sequences(start_bytes, end_bytes, min_bytes, max_bytes, idx + 1)
          return tails.map { |t| [byte_node(start_bytes[idx])] + t }
        end

        sequences = []

        # (1) Lower boundary: keep start_bytes[idx]; the tail runs from
        # start_bytes up to the maximum for this prefix.
        end_first = max_bytes.dup
        end_first[0...idx] = start_bytes[0...idx]
        end_first[idx] = start_bytes[idx]
        first_tails = build_sequences(start_bytes, end_first, min_bytes, max_bytes, idx + 1)
        sequences.concat(first_tails.map { |t| [byte_node(start_bytes[idx])] + t })

        # (2) Middle: every byte strictly between the two boundaries takes
        # the full continuation range on all remaining positions.
        if start_bytes[idx] + 1 <= end_bytes[idx] - 1
          middle = byte_range_node(start_bytes[idx] + 1, end_bytes[idx] - 1)
          sequences << ([middle] + full_range_nodes(min_bytes, max_bytes, idx + 1))
        end

        # (3) Upper boundary: keep end_bytes[idx]; the tail runs from the
        # minimum for this prefix up to end_bytes.
        start_last = min_bytes.dup
        start_last[0...idx] = end_bytes[0...idx]
        start_last[idx] = end_bytes[idx]
        last_tails = build_sequences(start_last, end_bytes, min_bytes, max_bytes, idx + 1)
        sequences.concat(last_tails.map { |t| [byte_node(end_bytes[idx])] + t })

        sequences
      end

      # Unconstrained tail: one full min..max byte-range node per remaining
      # position.
      def self.full_range_nodes(min_bytes, max_bytes, idx)
        (idx...min_bytes.size).map do |pos|
          byte_range_node(min_bytes[pos], max_bytes[pos])
        end
      end

      # Single-byte literal node.
      def self.byte_node(byte)
        RegexAST::Literal.new(byte: byte, meta: nil)
      end

      # Byte-range node; collapses to a Literal when the range is one byte.
      def self.byte_range_node(min_byte, max_byte)
        if min_byte == max_byte
          RegexAST::Literal.new(byte: min_byte, meta: nil)
        else
          RegexAST::CharClass.new(ranges: [[min_byte, max_byte]], negated: false, meta: nil)
        end
      end

      # UTF-8 encode a codepoint ("U" pack directive) and return its bytes.
      def self.utf8_bytes(codepoint)
        [codepoint].pack("U").bytes
      end

      # Inclusive codepoint window for each UTF-8 encoded length.
      # The else branch returns an empty window ([1, 0]) for invalid lengths.
      def self.codepoint_bounds(length)
        case length
        when 1 then [0x0, 0x7F]
        when 2 then [0x80, 0x7FF]
        when 3 then [0x800, 0xFFFF]
        when 4 then [0x10000, 0x10FFFF]
        else [1, 0]
        end
      end

      # Does [start_cp, end_cp] intersect the surrogate block U+D800..U+DFFF?
      def self.overlaps_surrogates?(start_cp, end_cp)
        start_cp <= 0xDFFF && end_cp >= 0xD800
      end

      # Valid UTF-8 lead bytes per encoded length. Note 0xC0/0xC1 are
      # excluded (overlong encodings) and 4-byte leads stop at 0xF4
      # (codepoints above U+10FFFF are invalid).
      def self.lead_bytes_for_length(length)
        case length
        when 1 then (0x00..0x7F).to_a
        when 2 then (0xC2..0xDF).to_a
        when 3 then (0xE0..0xEF).to_a
        when 4 then (0xF0..0xF4).to_a
        else []
        end
      end

      # Minimum and maximum byte sequences for a given lead byte.
      # Special second-byte bounds encode the UTF-8 well-formedness table:
      # E0 (no overlongs), ED (no surrogates), F0 (no overlongs),
      # F4 (cap at U+10FFFF).
      def self.byte_bounds_for_lead(lead, length)
        case length
        when 1
          [[lead], [lead]]
        when 2
          [[lead, 0x80], [lead, 0xBF]]
        when 3
          min_second, max_second = if lead == 0xE0
            [0xA0, 0xBF]
          elsif lead == 0xED
            [0x80, 0x9F]
          else
            [0x80, 0xBF]
          end
          [[lead, min_second, 0x80], [lead, max_second, 0xBF]]
        when 4
          min_second, max_second = if lead == 0xF0
            [0x90, 0xBF]
          elsif lead == 0xF4
            [0x80, 0x8F]
          else
            [0x80, 0xBF]
          end
          [[lead, min_second, 0x80, 0x80], [lead, max_second, 0xBF, 0xBF]]
        else
          [[], []]
        end
      end

      # Inclusive codepoint interval covered by a single lead byte, derived
      # by decoding that lead's minimum and maximum byte sequences.
      def self.codepoint_range_for_lead(lead, length)
        min_bytes, max_bytes = byte_bounds_for_lead(lead, length)
        min_cp = min_bytes.pack("C*").force_encoding("UTF-8").ord
        max_cp = max_bytes.pack("C*").force_encoding("UTF-8").ord
        [min_cp, max_cp]
      end
    end
  end
end
# frozen_string_literal: true

module LexerKit
  module DFA
    # Adapter that exposes a list of Unicode codepoint ranges through the
    # RegexAstProvider interface (anything responding to #to_ast), so the
    # compiler can treat codepoint ranges like any other pattern source.
    class Utf8RangePattern
      include LexerKit::RegexAstProvider

      # @param codepoint_ranges [Array<Array(Integer, Integer)>] inclusive
      #   [start, end] codepoint pairs
      def initialize(codepoint_ranges)
        @ranges = codepoint_ranges
      end

      # Expand the stored codepoint ranges into a byte-level regex AST.
      # @return [Object] RegexAST node produced by Utf8Range
      def to_ast
        Utf8Range.ast_for_ranges(@ranges)
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "dfa/regex_ast"
4
+ require_relative "dfa/utf8_range"
5
+ require_relative "dfa/case_folding"
6
+ require_relative "dfa/char_class_collector"
7
+ require_relative "dfa/regex_parser"
8
+ require_relative "dfa/nfa"
9
+ require_relative "dfa/dfa_minimizer"
10
+ require_relative "dfa/byte_class_builder"
11
+ require_relative "dfa/dfa_builder"
12
+
13
+ module LexerKit
14
+ # DFA module provides regex to DFA compilation.
15
+ module DFA
16
+ # Compile a Regex to DFA
17
+ # @param regex [RegexAST::Regex] regex pattern
18
+ # @param token_id [Integer] token ID for accepting state
19
+ # @return [IR::DFATable]
20
+ def self.compile_regex(regex, token_id = 0)
21
+ raise ArgumentError, "expected Regex, got #{regex.class}" unless regex.is_a?(RegexAST::Regex)
22
+
23
+ nfa = NFA.from_regex(regex, token_id)
24
+ DFABuilder.new(nfa).build
25
+ end
26
+
27
+ # Compute the set of possible first bytes for a Regex
28
+ # @param regex [RegexAST::Regex] regex pattern
29
+ # @return [Set<Integer>] set of bytes (0-255)
30
+ def self.first_byte_set(regex)
31
+ raise ArgumentError, "expected Regex, got #{regex.class}" unless regex.is_a?(RegexAST::Regex)
32
+
33
+ nfa = NFA.from_regex(regex, 0)
34
+ nfa.first_byte_set
35
+ end
36
+ end
37
+ end
# frozen_string_literal: true

module LexerKit
  # Base error for all LexerKit errors.
  class Error < StandardError; end

  # ===========================================================================
  # Binary/File Errors
  # ===========================================================================

  # Raised when binary deserialization fails.
  class InvalidBinaryError < Error; end

  # Raised when file integrity check fails (SHA256 mismatch, etc.).
  class IntegrityError < Error; end

  # ===========================================================================
  # User Input Errors (with diagnostic information)
  # ===========================================================================

  # Base class for errors carrying diagnostic information (source location,
  # span, etc.). When both a source and a span are supplied, a Core::Diagnostic
  # is built so the error can render with source-code context; otherwise the
  # error degrades gracefully to a plain message.
  class DiagnosticError < Error
    attr_reader :diagnostic, :source

    # @param message [String] human-readable error text
    # @param source [Core::Source, nil] source the error refers to
    # @param span [Object, nil] span within +source+
    # @param level [Symbol] diagnostic severity (default :error)
    # @param notes [Array, nil] extra notes attached to the diagnostic
    def initialize(message, source: nil, span: nil, level: :error, notes: nil)
      @source = source
      if source && span
        @diagnostic = Core::Diagnostic.new(level: level, message: message, span: span, notes: notes)
      end
      super(message)
    end

    # Render the diagnostic with source context; falls back to the bare
    # message when no diagnostic was built.
    # @param color [Boolean] ANSI colors (defaults to stderr's TTY state)
    def render(color: $stderr.tty?)
      return message unless @diagnostic && @source

      @diagnostic.render(@source, color: color)
    end

    # Build a DiagnosticError from a caller location (e.g. a Thread::Backtrace
    # location), reading the referenced file to attach a line-level span.
    # Falls back to a context-free error when the file is missing/unreadable.
    def self.from_location(location, message, level: :error, notes: nil)
      path = location&.path
      return new(message) unless path && File.file?(path)

      source = Core::Source.new(File.binread(path), filename: path)
      span = source.span_for_line(location.lineno)
      new(message, source: source, span: span, level: level, notes: notes)
    rescue Errno::ENOENT, Errno::EACCES
      # File vanished or became unreadable between the check and the read.
      new(message)
    end
  end

  # Raised during Builder DSL evaluation (invalid token definitions, etc.).
  class BuildError < DiagnosticError; end

  # Raised during compilation (IR generation, optimization, etc.).
  class CompileError < DiagnosticError; end

  # Raised during regex parsing; tracks the character position where
  # parsing failed.
  class ParseError < DiagnosticError
    attr_reader :position

    # @param message [String]
    # @param source [Core::Source, nil]
    # @param position [Integer, nil] character position (codepoint-based)
    def initialize(message, source: nil, position: nil)
      @position = position
      span = source.span_for_char_index(position) if source && position
      super(message, source: source, span: span)
    end
  end

  # ===========================================================================
  # Runtime Errors
  # ===========================================================================

  # Raised when C native extension is required but not loaded.
  class NativeExtensionError < Error; end

  # Raised during VM execution (stack overflow, infinite loop, etc.).
  class VMError < Error; end
end
+ end