lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,596 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LexerKit
4
+ class Builder
5
+ # Compiler transforms Builder definitions into IR::CompiledProgram.
6
+ class Compiler
7
+ def initialize(builder)
8
+ @builder = builder
9
+ @instructions = []
10
+ @constant_pool = IR::ConstantPool.new
11
+ @jump_tables = []
12
+ @dfa_tables = []
13
+ @token_names = []
14
+ @token_ids = {}
15
+ @mode_names = []
16
+ @mode_ids = {}
17
+ @keyword_tables = []
18
+ @modes = {}
19
+ @labels = {}
20
+ @label_counter = 0
21
+ @pending_jumps = []
22
+ @pending_dfa_run_if_match = []
23
+ @pending_match_literal_or_jump = []
24
+ @pending_emit_and_jump = []
25
+ @pending_set_match = []
26
+ @pending_literal_trie_commit = []
27
+ @pending_trie_entries = []
28
+ @dfa_cache = {} # pattern => dfa_id (cache for regex DFAs)
29
+ @keyword_cache = {} # token_def => table_id (cache for keyword tables)
30
+ end
31
+
32
# Runs the full compilation pipeline and returns an IR::CompiledProgram.
#
# Pipeline: assign token ids, lower every mode to instructions, terminate
# with HALT, then backpatch every label reference recorded during emission.
def compile
  # Assign token IDs (reserved ids 0-7 first, then user tokens, then keywords).
  assign_token_ids

  # Compile each mode in definition order; each records its entry offset
  # into @modes.
  @builder.mode_defs.each do |_name, mode_def|
    compile_mode(mode_def)
  end

  # Add HALT at end as a terminal safety net.
  emit(IR::Opcode::HALT)

  # Resolve pending jumps now that all label offsets are known.
  resolve_jumps
  # Trie constants are encoded last because they embed resolved label offsets.
  finalize_literal_tries

  IR::CompiledProgram.new(
    instructions: @instructions,
    dfa_tables: @dfa_tables,
    jump_tables: @jump_tables,
    constant_pool: @constant_pool,
    modes: @modes,
    token_names: @token_names,
    mode_names: @mode_names,
    keyword_tables: @keyword_tables,
    token_meta: build_token_meta,
    version: @builder.version
  )
end
61
+
62
+ private
63
+
64
# Populates @token_names / @token_ids.
#
# Ids 0-7 are reserved (1 = :INVALID, the rest are placeholders); user
# tokens and keyword tokens are appended starting at id 8, skipping names
# that already have an id. Also writes the assigned id back onto each
# user token_def.
def assign_token_ids
  @token_names = [
    :__RESERVED_0__, # 0: internal sentinel (never emitted)
    :INVALID,        # 1: error token
    :__RESERVED_2__, # 2: reserved for future use
    :__RESERVED_3__, # 3: reserved for future use
    :__RESERVED_4__, # 4: reserved for future use
    :__RESERVED_5__, # 5: reserved for future use
    :__RESERVED_6__, # 6: reserved for future use
    :__RESERVED_7__  # 7: reserved for future use
  ]
  @token_ids = { INVALID: LexerKit::INVALID_TOKEN_ID }

  # User-defined tokens start from FIRST_USER_TOKEN_ID (8).
  @builder.token_defs.each do |token_def|
    next if @token_ids.key?(token_def.name)

    @token_ids[token_def.name] = @token_names.size
    token_def.token_id = @token_ids[token_def.name]
    @token_names << token_def.name
  end

  # Keywords become distinct tokens of their own.
  @builder.keywords.each_value do |name|
    next if @token_ids.key?(name)

    @token_ids[name] = @token_names.size
    @token_names << name
  end
end
102
+
103
# Builds token metadata hash { token_id => meta_hash } for every token
# that carries metadata and has been assigned an id.
def build_token_meta
  @builder.token_defs.each_with_object({}) do |token_def, meta|
    next unless token_def.meta

    id = @token_ids[token_def.name]
    meta[id] = token_def.meta if id
  end
end
114
+
115
# Registers the mode's numeric id, records its entry offset in @modes,
# then lowers the body via the delimited or regular strategy.
def compile_mode(mode_def)
  ensure_mode_id(mode_def.name)
  @modes[mode_def.name] = current_offset

  return compile_delimited_mode(mode_def) if mode_def.delimited

  compile_regular_mode(mode_def)
end
125
+
126
# Lowers a delimited mode (e.g. string or comment body scanning).
#
# Shape: MARK, scan up to the closing delimiter (honouring the escape
# sequence when configured), emit or skip the scanned body, then on the
# delimiter either enter an inner mode, pop the mode stack, or loop back
# to this mode's start. Hitting end-of-input instead of the delimiter
# halts.
def compile_delimited_mode(mode_def)
  delimited = mode_def.delimited
  token_id = @token_ids[delimited.name]

  # MARK current position (start of the body lexeme).
  emit(IR::Opcode::MARK)

  # SCAN up to (not past) the closing delimiter.
  close_const_id = @constant_pool.add(delimited.delimiter)
  if delimited.escape
    config_id = add_escape_config(delimited)
    emit(IR::Opcode::SCAN_UNTIL_ESCAPE, config_id)
  else
    emit(IR::Opcode::SCAN_UNTIL, close_const_id)
  end

  # EMIT the scanned body (or drop it when the token is marked skip).
  if delimited.skip
    emit(IR::Opcode::EMIT_SKIP)
  else
    emit(IR::Opcode::EMIT, token_id)
  end

  # Scanning may have stopped at end of input rather than the delimiter.
  eof_label = new_label
  emit_jump(IR::Opcode::JUMP_IF_EOF, eof_label)

  # Consume the closing delimiter itself.
  emit(IR::Opcode::MATCH_LITERAL, close_const_id)

  # After the delimiter: enter the inner mode, pop, or restart this mode.
  if delimited.inner_mode
    inner_mode_id = ensure_mode_id(delimited.inner_mode)
    emit(IR::Opcode::PUSH_MODE, inner_mode_id)
    emit(IR::Opcode::JUMP, @modes[mode_def.name])
  elsif delimited.pop
    # NOTE(review): POP_MODE falls through to the HALT below — presumably
    # the VM transfers control when the mode stack pops; confirm against vm.rs.
    emit(IR::Opcode::POP_MODE)
  else
    # Jump back to start of this mode.
    emit(IR::Opcode::JUMP, @modes[mode_def.name])
  end

  # EOF label: end of input inside the delimited region.
  mark_label(eof_label)
  emit(IR::Opcode::HALT)
end
172
+
173
# Lowers an ordinary mode: a loop of EOF-check, MARK, then first-byte
# token dispatch; EOF breaks the loop with HALT.
def compile_regular_mode(mode_def)
  loop_start = current_offset

  # Bail out of the token loop at end of input.
  eof_label = new_label
  emit_jump(IR::Opcode::JUMP_IF_EOF, eof_label)

  # Record the lexeme start.
  emit(IR::Opcode::MARK)

  has_tokens = mode_def.literal_tokens.any? || mode_def.regex_tokens.any?
  # First-byte dispatch over all literal/regex tokens of this mode.
  compile_token_dispatch(mode_def, loop_start) if has_tokens

  # EOF: HALT.
  mark_label(eof_label)
  emit(IR::Opcode::HALT)
end
196
+
197
# Builds and interns the escape-scan config blob for a delimited token.
# Format: [2 bytes: close_len][2 bytes: escape_len][close bytes][escape bytes]
# with big-endian u16 lengths. Returns the constant-pool id.
def add_escape_config(token_def)
  close_bytes = token_def.delimiter.b
  escape_bytes = token_def.escape.b
  header = [close_bytes.bytesize, escape_bytes.bytesize].pack("S>S>")
  @constant_pool.add(header + close_bytes + escape_bytes)
end
210
+
211
# Emits the first-byte dispatch for a regular mode's tokens.
#
# Strategy:
#   1. Bucket every literal/regex token by the byte(s) a match can start
#      with (literal: its first byte; regex: the DFA's first-byte set),
#      remembering definition order for tie-breaking.
#   2. Bytes with identical candidate sets share one handler, reached via
#      a SWITCH_BYTE jump table.
#   3. A single-candidate handler uses a fused match-or-fail opcode; a
#      multi-candidate handler runs a best-match tournament (literal trie
#      and/or DFAs) and commits the winner.
#   4. The default handler consumes one byte, emits :INVALID, and jumps
#      back to loop_start (the mode loop head).
def compile_token_dispatch(mode_def, loop_start)
  # Collect candidates by first byte, preserving definition order.
  candidates_by_byte = {}
  first_bytes_cache = {}
  order_map = {}

  mode_def.tokens.each_with_index do |token_def, idx|
    next unless token_def.literal? || token_def.regex?

    # Definition index doubles as match priority for SET_MATCH ordering.
    order_map[token_def] = idx

    if token_def.literal?
      byte = token_def.pattern.getbyte(0)
      (candidates_by_byte[byte] ||= []) << token_def
    else
      regex = DFA::RegexAST::Regex.parse(token_def.pattern)
      # first_byte_set is cached per pattern source to avoid re-deriving it.
      first_bytes = first_bytes_cache[token_def.pattern] ||= DFA.first_byte_set(regex)
      first_bytes.each do |byte|
        (candidates_by_byte[byte] ||= []) << token_def
      end
    end
  end

  # Create jump table (filled in after all handler labels are marked).
  jump_table_id = @jump_tables.size
  jump_table_entries = {}

  # Group bytes by candidate sets - bytes with same candidates share one handler.
  groups = group_bytes_by_candidates(candidates_by_byte)
  groups.each do |_candidates, bytes|
    common_label = new_label
    bytes.each { |byte| jump_table_entries[byte] = common_label }
  end

  # Default: error handling (also the fail target of every handler).
  default_label = new_label

  @jump_tables << nil # Placeholder; replaced once labels resolve below.

  # Emit SWITCH_BYTE
  emit(IR::Opcode::SWITCH_BYTE, jump_table_id)

  # Compile branches for each group (bytes with same candidates share handler).
  groups.each do |candidates, bytes|
    # All bytes in this group point to the same label.
    common_label = jump_table_entries[bytes.first]
    mark_label(common_label)

    # Defensive: groups are built from non-empty candidate lists, so this
    # branch should be unreachable.
    if candidates.empty?
      emit_jump(IR::Opcode::JUMP, default_label)
      next
    end

    # Fast path: exactly one candidate needs no tournament.
    if candidates.size == 1
      token_def = candidates.first
      if token_def.literal?
        const_id = @constant_pool.add(token_def.pattern)
        # MATCH_LITERAL_OR_JUMP has embedded fail_target
        emit_match_literal_or_jump(const_id, default_label)
      elsif token_def.regex?
        dfa_id = compile_regex_to_dfa(token_def.pattern)
        # DFA_RUN_IF_MATCH has embedded fail_target
        emit_dfa_run_if_match(dfa_id, default_label)
      else
        emit_jump(IR::Opcode::JUMP, default_label)
        next
      end

      compile_token_action(token_def, loop_start)
      next
    end

    # Multi-candidate handler: give each candidate its own action label.
    action_labels = []
    action_label_map = {}
    candidates.each do |token_def|
      action_label = new_label
      action_labels << [action_label, token_def]
      action_label_map[token_def] = action_label
    end

    literal_tokens = candidates.select(&:literal?)
    regex_tokens = candidates.select(&:regex?)

    if regex_tokens.empty?
      # Literal-only: use optimized LITERAL_TRIE_COMMIT
      trie_const_id = add_literal_trie(literal_tokens, order_map, action_label_map)
      emit_literal_trie_commit(trie_const_id, default_label)
    else
      # Mixed literals and regex: use CLEAR_BEST + ... + COMMIT_BEST
      emit(IR::Opcode::CLEAR_BEST)

      if literal_tokens.any?
        trie_const_id = add_literal_trie(literal_tokens, order_map, action_label_map)
        # LITERAL_TRIE_RUN updates best match and restores pos to mark
        emit(IR::Opcode::LITERAL_TRIE_RUN, trie_const_id)
      end

      regex_tokens.each do |token_def|
        dfa_id = compile_regex_to_dfa(token_def.pattern)
        emit(IR::Opcode::DFA_RUN, dfa_id)
        # SET_MATCH sets candidate order/action and updates best match
        emit_set_match(order_map[token_def], action_label_map[token_def])
      end

      # COMMIT_BEST's arg is the fail target (backpatched from the label).
      emit_label_arg(IR::Opcode::COMMIT_BEST, default_label)
    end

    # Emit each candidate's action body at its label.
    action_labels.each do |action_label, token_def|
      mark_label(action_label)
      compile_token_action(token_def, loop_start)
    end
  end

  # Default branch (error handling)
  mark_label(default_label)

  # Error: advance one byte and emit :INVALID
  # (0 << 8) | 255 — arg appears to pack an inclusive lo/hi byte range,
  # 0..255 matching any byte; confirm against the VM's MATCH_RANGE.
  emit(IR::Opcode::MATCH_RANGE, (0 << 8) | 255) # Match any byte
  emit(IR::Opcode::EMIT_ERROR, LexerKit::INVALID_TOKEN_ID)
  emit(IR::Opcode::JUMP, loop_start)

  # Now resolve the jump table: all handler labels are marked by this point.
  resolved_entries = {}
  jump_table_entries.each do |byte, label|
    resolved_entries[byte] = @labels[label]
  end
  @jump_tables[jump_table_id] = IR::JumpTable.new(
    entries: resolved_entries,
    default_offset: @labels[default_label]
  )
end
342
+
343
# Emits the post-match actions for a matched token: optional keyword
# re-mapping, emit/skip of the token, optional mode push/pop, and the
# jump back to the mode loop (fused into the emit where possible).
#
# loop_start is the instruction offset of the mode's scan loop head.
def compile_token_action(token_def, loop_start)
  token_id = @token_ids[token_def.name]

  # If any keyword could be produced by this token, re-map the lexeme first.
  if should_attach_keywords?(token_def)
    table_id = build_keyword_table(token_def)
    emit(IR::Opcode::KEYWORD_LOOKUP, table_id)
  end

  if token_def.skip
    if token_def.push || token_def.pop
      # When push/pop is involved, we need separate EMIT_SKIP
      emit(IR::Opcode::EMIT_SKIP)
    else
      # Optimize EMIT_SKIP + JUMP into EMIT_SKIP_AND_JUMP
      emit(IR::Opcode::EMIT_SKIP_AND_JUMP, loop_start)
      return
    end
  elsif token_def.push || token_def.pop
    # When push/pop is involved, we need separate EMIT
    emit(IR::Opcode::EMIT, token_id)
  else
    # Optimize EMIT + JUMP into EMIT_AND_JUMP
    emit_emit_and_jump(token_id, loop_start)
    return
  end

  # Only reached on the push/pop paths above.
  # NOTE(review): no explicit jump back to loop_start after PUSH_MODE /
  # POP_MODE — presumably the VM redirects control on a mode switch;
  # confirm against the VM implementation.
  if token_def.push
    mode_id = ensure_mode_id(token_def.push)
    emit(IR::Opcode::PUSH_MODE, mode_id)
  elsif token_def.pop
    emit(IR::Opcode::POP_MODE)
  end
end
376
+
377
# True when any declared keyword could be produced by this token's regex
# pattern, in which case a KEYWORD_LOOKUP must follow the match.
# NOTE(review): uses unanchored Regexp#match?, so a keyword matching only
# a substring still attaches the table — harmless over-inclusion at most;
# confirm intent.
def should_attach_keywords?(token_def)
  pattern = token_def.pattern
  return false unless pattern.is_a?(Regexp) && !@builder.keywords.empty?

  @builder.keywords.each_key.any? { |keyword| pattern.match?(keyword) }
end
385
+
386
# Returns the id of the keyword table for token_def, creating and caching
# one on first use (cache keyed by token_def to avoid duplicate tables).
# The table maps keyword string => keyword token id, falling back to the
# token's own id when the lexeme is not a keyword.
def build_keyword_table(token_def)
  cached = @keyword_cache[token_def]
  return cached if cached

  keywords_hash = @builder.keywords.each_with_object({}) do |(value, name), h|
    h[value] = @token_ids[name]
  end
  @keyword_tables << IR::KeywordTable.new(
    base_token_id: @token_ids[token_def.name],
    keywords: keywords_hash
  )
  @keyword_cache[token_def] = @keyword_tables.size - 1
end
402
+
403
# Compiles a regex pattern into a DFA table and returns its table id,
# caching by pattern source so identical patterns share one DFA.
def compile_regex_to_dfa(pattern)
  cache_key = pattern.is_a?(Regexp) ? pattern.source : pattern
  cached = @dfa_cache[cache_key]
  return cached if cached

  # Proper regex -> DFA compilation via the DFA module.
  ast = DFA::RegexAST::Regex.parse(pattern)
  @dfa_tables << DFA.compile_regex(ast, 0)
  @dfa_cache[cache_key] = @dfa_tables.size - 1
end
418
+
419
# Returns the numeric id for mode_name, allocating the next id on first use.
def ensure_mode_id(mode_name)
  @mode_ids.fetch(mode_name) do
    @mode_names << mode_name
    @mode_ids[mode_name] = @mode_names.size - 1
  end
end
427
+
428
# Offset at which the next instruction will be emitted.
def current_offset
  @instructions.length
end
431
+
432
# Appends one instruction; arg defaults to 0 for argless opcodes.
def emit(opcode, arg = 0)
  @instructions.push(IR::Instruction.new(opcode, arg))
end
435
+
436
# Emits opcode with a placeholder arg and queues it for label backpatching
# in resolve_jumps.
def emit_jump(opcode, label)
  @pending_jumps << [current_offset, label]
  emit(opcode, 0)
end
440
+
441
# Identical mechanics to emit_jump (placeholder arg, backpatched from a
# label); kept as a distinct name for call sites where the label is data
# (e.g. COMMIT_BEST's fail target) rather than a branch target.
def emit_label_arg(opcode, label)
  emit_jump(opcode, label)
end
445
+
446
# Emits DFA_RUN_IF_MATCH whose arg packs (dfa_id << 14) | fail_target.
# The low 14 bits (fail target) are backpatched in resolve_jumps.
def emit_dfa_run_if_match(dfa_id, fail_label)
  @pending_dfa_run_if_match << [current_offset, dfa_id, fail_label]
  emit(IR::Opcode::DFA_RUN_IF_MATCH, dfa_id << 14)
end
452
+
453
# Emits MATCH_LITERAL_OR_JUMP whose arg packs (const_id << 14) | fail_target.
# The low 14 bits (fail target) are backpatched in resolve_jumps.
def emit_match_literal_or_jump(const_id, fail_label)
  @pending_match_literal_or_jump << [current_offset, const_id, fail_label]
  emit(IR::Opcode::MATCH_LITERAL_OR_JUMP, const_id << 14)
end
459
+
460
# Emits EMIT_AND_JUMP whose arg packs (token_id << 14) | jump_target.
# jump_target here is a direct offset (the mode loop head), filled in
# during resolve_jumps.
def emit_emit_and_jump(token_id, jump_target)
  @pending_emit_and_jump << [current_offset, token_id, jump_target]
  emit(IR::Opcode::EMIT_AND_JUMP, token_id << 14)
end
466
+
467
# Emits SET_MATCH whose arg packs (order << 14) | action_ip. The action
# instruction pointer (low 14 bits) is backpatched in resolve_jumps.
def emit_set_match(order, action_label)
  @pending_set_match << [current_offset, order, action_label]
  emit(IR::Opcode::SET_MATCH, order << 14)
end
473
+
474
# Emits LITERAL_TRIE_COMMIT whose arg packs (const_id << 14) | fail_target.
# The low 14 bits (fail target) are backpatched in resolve_jumps.
def emit_literal_trie_commit(const_id, fail_label)
  @pending_literal_trie_commit << [current_offset, const_id, fail_label]
  emit(IR::Opcode::LITERAL_TRIE_COMMIT, const_id << 14)
end
480
+
481
# Builds a Trie over the literal tokens and reserves an (initially empty)
# constant-pool slot for its encoded form; the real bytes are written by
# finalize_literal_tries once label offsets exist. Returns the slot id.
def add_literal_trie(literals, order_map, action_label_map)
  entries = literals.map do |token_def|
    # [literal string, definition order, action label]
    [token_def.pattern, order_map[token_def], action_label_map[token_def]]
  end

  const_id = @constant_pool.add_uninterned("".b)
  @pending_trie_entries << [const_id, Trie.new(entries)]
  const_id
end
496
+
497
# Encodes every pending literal trie now that @labels is complete and
# writes the bytes into the reserved constant-pool slots.
def finalize_literal_tries
  return if @pending_trie_entries.empty?

  @pending_trie_entries.each do |const_id, trie|
    @constant_pool.replace(const_id, trie.encode(labels: @labels))
  end
end
505
+
506
# Partitions dispatch bytes into groups that share an identical candidate
# set, so those bytes can share one handler. Returns an array of
# [candidates, sorted_bytes] pairs. Identity (object_id) keys the
# grouping: two sets are "the same" only when they contain the very same
# token objects.
def group_bytes_by_candidates(candidates_by_byte)
  by_set = candidates_by_byte.keys.group_by do |byte|
    candidates_by_byte[byte].map(&:object_id).sort
  end

  by_set.values.map do |bytes|
    [candidates_by_byte[bytes.first], bytes.sort]
  end
end
526
+
527
# Returns a fresh, unique label symbol (:label_0, :label_1, ...).
def new_label
  @label_counter += 1
  :"label_#{@label_counter - 1}"
end
532
+
533
# Pins a label to the current emission offset for later backpatching.
def mark_label(label)
  @labels.store(label, current_offset)
end
536
+
537
# Backpatches every pending instruction now that all label offsets are
# known.
#
# Plain jumps receive the resolved offset as their whole arg. The fused
# opcodes (DFA_RUN_IF_MATCH, MATCH_LITERAL_OR_JUMP, EMIT_AND_JUMP,
# SET_MATCH, LITERAL_TRIE_COMMIT) pack (hi << 14) | target into one arg,
# so the target must fit in 14 bits.
#
# Raises LexerKit::CompileError on an unresolved label, or when a target
# offset overflows the 14-bit field (previously this silently corrupted
# the hi bits of the fused instruction).
def resolve_jumps
  @pending_jumps.each do |instr_idx, label|
    instr = @instructions[instr_idx]
    @instructions[instr_idx] = IR::Instruction.new(instr.opcode, resolve_label(label))
  end

  # Fused opcodes below: arg = (hi << 14) | target (low 14 bits).
  @pending_dfa_run_if_match.each do |instr_idx, dfa_id, fail_label|
    patch_packed(instr_idx, IR::Opcode::DFA_RUN_IF_MATCH, dfa_id, resolve_label(fail_label))
  end

  @pending_match_literal_or_jump.each do |instr_idx, const_id, fail_label|
    patch_packed(instr_idx, IR::Opcode::MATCH_LITERAL_OR_JUMP, const_id, resolve_label(fail_label))
  end

  # EMIT_AND_JUMP's target is already a direct offset (loop_start), not a label.
  @pending_emit_and_jump.each do |instr_idx, token_id, jump_target|
    patch_packed(instr_idx, IR::Opcode::EMIT_AND_JUMP, token_id, jump_target)
  end

  @pending_set_match.each do |instr_idx, order, action_label|
    patch_packed(instr_idx, IR::Opcode::SET_MATCH, order, resolve_label(action_label))
  end

  @pending_literal_trie_commit.each do |instr_idx, const_id, fail_label|
    patch_packed(instr_idx, IR::Opcode::LITERAL_TRIE_COMMIT, const_id, resolve_label(fail_label))
  end
end

# Looks up a label's offset, raising CompileError when it was never marked.
def resolve_label(label)
  @labels.fetch(label) do
    raise LexerKit::CompileError, "unresolved label #{label}"
  end
end

# Rewrites @instructions[instr_idx] with arg = (hi << 14) | target,
# validating that target fits the 14-bit field so it cannot bleed into hi.
def patch_packed(instr_idx, opcode, hi, target)
  if target.negative? || target > 0x3FFF
    raise LexerKit::CompileError,
          "jump target #{target} overflows 14-bit field for opcode #{opcode}"
  end

  @instructions[instr_idx] = IR::Instruction.new(opcode, (hi << 14) | target)
end
594
+ end
595
+ end
596
+ end