lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,596 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module LexerKit
|
|
4
|
+
class Builder
|
|
5
|
+
# Compiler transforms Builder definitions into IR::CompiledProgram.
|
|
6
|
+
class Compiler
|
|
7
|
+
# Set up a fresh compilation context for one Builder.
#
# @param builder [LexerKit::Builder] the definition source being compiled
def initialize(builder)
  @builder = builder
  # Output program state: linear instruction stream plus side tables.
  @instructions = []
  @constant_pool = IR::ConstantPool.new
  @jump_tables = []
  @dfa_tables = []
  # Token/mode name <-> id registries (ids are array indices).
  @token_names = []
  @token_ids = {}
  @mode_names = []
  @mode_ids = {}
  @keyword_tables = []
  # mode name => entry instruction offset.
  @modes = {}
  # Label bookkeeping for forward references, patched in resolve_jumps.
  @labels = {}
  @label_counter = 0
  @pending_jumps = []
  @pending_dfa_run_if_match = []
  @pending_match_literal_or_jump = []
  @pending_emit_and_jump = []
  @pending_set_match = []
  @pending_literal_trie_commit = []
  @pending_trie_entries = []
  @dfa_cache = {} # pattern => dfa_id (cache for regex DFAs)
  @keyword_cache = {} # token_def => table_id (cache for keyword tables)
end
|
|
31
|
+
|
|
32
|
+
# Compile the builder's definitions into an IR::CompiledProgram.
#
# Pipeline: assign token ids, emit code for every mode, terminate with
# HALT, back-patch all label references, then encode literal tries
# (which need resolved label offsets).
#
# @return [IR::CompiledProgram]
def compile
  # Assign token IDs
  assign_token_ids

  # Compile each mode (mode_defs maps name => ModeDef)
  @builder.mode_defs.each do |_name, mode_def|
    compile_mode(mode_def)
  end

  # Add HALT at end as a safety net past the last mode's code
  emit(IR::Opcode::HALT)

  # Resolve pending jumps; tries are encoded last since they embed
  # label offsets that only exist after resolution
  resolve_jumps
  finalize_literal_tries

  IR::CompiledProgram.new(
    instructions: @instructions,
    dfa_tables: @dfa_tables,
    jump_tables: @jump_tables,
    constant_pool: @constant_pool,
    modes: @modes,
    token_names: @token_names,
    mode_names: @mode_names,
    keyword_tables: @keyword_tables,
    token_meta: build_token_meta,
    version: @builder.version
  )
end
|
|
61
|
+
|
|
62
|
+
private
|
|
63
|
+
|
|
64
|
+
# Populate @token_names / @token_ids and stamp each token_def with its id.
#
# IDs 0-7 are reserved: 0 is an internal sentinel that is never emitted,
# 1 is :INVALID (the error token), and 2-7 are held back for future use.
# User-defined tokens are numbered from 8 onward in definition order;
# keywords that are not already named tokens get their own ids after that.
def assign_token_ids
  reserved = (0..7).map { |i| :"__RESERVED_#{i}__" }
  reserved[1] = :INVALID
  @token_names = reserved
  @token_ids = { INVALID: LexerKit::INVALID_TOKEN_ID }

  # User-defined tokens start from FIRST_USER_TOKEN_ID (8); a repeated
  # name keeps the id from its first definition.
  @builder.token_defs.each do |token_def|
    next if @token_ids.key?(token_def.name)

    id = @token_names.length
    @token_names.push(token_def.name)
    @token_ids[token_def.name] = id
    token_def.token_id = id
  end

  # Keywords become separate tokens unless the name is already taken.
  @builder.keywords.each_value do |name|
    next if @token_ids.key?(name)

    @token_ids[name] = @token_names.length
    @token_names.push(name)
  end
end
|
|
102
|
+
|
|
103
|
+
# Build token metadata hash { token_id => meta_hash }.
#
# Token defs without meta, or whose name never received an id, are
# skipped silently.
def build_token_meta
  @builder.token_defs.each_with_object({}) do |token_def, meta|
    next unless token_def.meta

    id = @token_ids[token_def.name]
    meta[id] = token_def.meta if id
  end
end
|
|
114
|
+
|
|
115
|
+
# Compile one mode: register its id, record its entry offset, then emit
# either the delimited-scanning form or the regular dispatch loop.
def compile_mode(mode_def)
  ensure_mode_id(mode_def.name)
  # Entry point must be recorded before any instruction is emitted.
  @modes[mode_def.name] = current_offset

  mode_def.delimited ? compile_delimited_mode(mode_def) : compile_regular_mode(mode_def)
end
|
|
125
|
+
|
|
126
|
+
# Emit code for a delimited mode (e.g. string/comment bodies): scan to
# the closing delimiter, emit (or skip) the content token, consume the
# delimiter, then loop / push / pop as configured.
def compile_delimited_mode(mode_def)
  delimited = mode_def.delimited
  token_id = @token_ids[delimited.name]

  # MARK current position (start of the content span)
  emit(IR::Opcode::MARK)

  # SCAN until the closing delimiter, honoring an escape sequence if set
  close_const_id = @constant_pool.add(delimited.delimiter)
  if delimited.escape
    config_id = add_escape_config(delimited)
    emit(IR::Opcode::SCAN_UNTIL_ESCAPE, config_id)
  else
    emit(IR::Opcode::SCAN_UNTIL, close_const_id)
  end

  # EMIT the content token, or discard it when the mode is skip-only
  if delimited.skip
    emit(IR::Opcode::EMIT_SKIP)
  else
    emit(IR::Opcode::EMIT, token_id)
  end

  # If input ended before the delimiter, bail out to HALT below
  eof_label = new_label
  emit_jump(IR::Opcode::JUMP_IF_EOF, eof_label)

  # Consume the closing delimiter itself
  emit(IR::Opcode::MATCH_LITERAL, close_const_id)

  # Post-delimiter control flow. NOTE(review): when inner_mode and pop
  # are both set, inner_mode wins and pop is ignored — confirm intended.
  if delimited.inner_mode
    inner_mode_id = ensure_mode_id(delimited.inner_mode)
    emit(IR::Opcode::PUSH_MODE, inner_mode_id)
    # NOTE(review): jumps back to this mode's entry after PUSH_MODE —
    # presumably PUSH_MODE itself redirects execution; verify VM semantics.
    emit(IR::Opcode::JUMP, @modes[mode_def.name])
  elsif delimited.pop
    # NOTE(review): no jump after POP_MODE; execution would fall through
    # to HALT unless POP_MODE transfers control — confirm against the VM.
    emit(IR::Opcode::POP_MODE)
  else
    # Jump back to start for the next delimited span
    emit(IR::Opcode::JUMP, @modes[mode_def.name])
  end

  # EOF label: unterminated input stops the machine here
  mark_label(eof_label)
  emit(IR::Opcode::HALT)
end
|
|
172
|
+
|
|
173
|
+
# Emit the main tokenization loop for a regular (non-delimited) mode:
# check EOF, mark the token start, then dispatch on the first byte.
def compile_regular_mode(mode_def)
  loop_start = current_offset

  # Check EOF before every token attempt
  eof_label = new_label
  emit_jump(IR::Opcode::JUMP_IF_EOF, eof_label)

  # MARK position (start of the candidate token)
  emit(IR::Opcode::MARK)

  # Group tokens by first byte for SWITCH_BYTE
  literals = mode_def.literal_tokens
  regexes = mode_def.regex_tokens

  if literals.any? || regexes.any?
    # Build switch table for first-byte dispatch; branches jump back to
    # loop_start after each emitted token
    compile_token_dispatch(mode_def, loop_start)
  end

  # EOF: HALT
  mark_label(eof_label)
  emit(IR::Opcode::HALT)
end
|
|
196
|
+
|
|
197
|
+
# Build the escape configuration blob for a delimited token with an
# escape sequence, and intern it in the constant pool.
#
# Format: [2 bytes: close_len][2 bytes: escape_len][close bytes][escape bytes]
# with both lengths big-endian unsigned 16-bit.
#
# @return [Integer] constant pool id of the encoded config
def add_escape_config(token_def)
  close = token_def.delimiter
  escape_bytes = token_def.escape.b

  config = [close.bytesize, escape_bytes.bytesize].pack("S>S>")
  config << close.b << escape_bytes

  @constant_pool.add(config)
end
|
|
210
|
+
|
|
211
|
+
# Emit first-byte dispatch for all literal/regex tokens of a mode.
#
# Builds a SWITCH_BYTE jump table keyed on the token's possible first
# bytes. Bytes whose candidate sets are identical share one handler.
# Single-candidate handlers use fused match-or-fail opcodes; handlers
# with several candidates run longest-match selection via
# CLEAR_BEST / LITERAL_TRIE_RUN / DFA_RUN+SET_MATCH / COMMIT_BEST
# (or a pure LITERAL_TRIE_COMMIT when only literals compete).
# The default branch consumes one byte and emits :INVALID.
#
# Fix over previous revision: the regex AST was parsed on every token
# even when first_bytes_cache already held the pattern's first-byte set;
# parsing now happens only on a cache miss.
#
# @param mode_def [ModeDef] mode whose tokens are dispatched
# @param loop_start [Integer] offset of the mode's main loop entry
def compile_token_dispatch(mode_def, loop_start)
  # Collect candidates by first byte, preserving definition order.
  candidates_by_byte = {}
  first_bytes_cache = {}
  order_map = {}

  mode_def.tokens.each_with_index do |token_def, idx|
    next unless token_def.literal? || token_def.regex?

    order_map[token_def] = idx

    if token_def.literal?
      byte = token_def.pattern.getbyte(0)
      (candidates_by_byte[byte] ||= []) << token_def
    else
      # Parse only on cache miss: tokens sharing a pattern reuse the
      # cached first-byte set without re-parsing the regex.
      first_bytes = first_bytes_cache[token_def.pattern] ||=
        DFA.first_byte_set(DFA::RegexAST::Regex.parse(token_def.pattern))
      first_bytes.each do |byte|
        (candidates_by_byte[byte] ||= []) << token_def
      end
    end
  end

  # Create jump table (filled in once branch offsets are known)
  jump_table_id = @jump_tables.size
  jump_table_entries = {}

  # Group bytes by candidate sets - bytes with same candidates share one handler
  groups = group_bytes_by_candidates(candidates_by_byte)
  groups.each do |_candidates, bytes|
    common_label = new_label
    bytes.each { |byte| jump_table_entries[byte] = common_label }
  end

  # Default: error handling
  default_label = new_label

  @jump_tables << nil # Placeholder

  # Emit SWITCH_BYTE
  emit(IR::Opcode::SWITCH_BYTE, jump_table_id)

  # Compile branches for each group (bytes with same candidates share handler)
  groups.each do |candidates, bytes|
    # All bytes in this group point to the same label
    common_label = jump_table_entries[bytes.first]
    mark_label(common_label)

    if candidates.empty?
      emit_jump(IR::Opcode::JUMP, default_label)
      next
    end

    if candidates.size == 1
      token_def = candidates.first
      if token_def.literal?
        const_id = @constant_pool.add(token_def.pattern)
        # MATCH_LITERAL_OR_JUMP has embedded fail_target
        emit_match_literal_or_jump(const_id, default_label)
      elsif token_def.regex?
        dfa_id = compile_regex_to_dfa(token_def.pattern)
        # DFA_RUN_IF_MATCH has embedded fail_target
        emit_dfa_run_if_match(dfa_id, default_label)
      else
        emit_jump(IR::Opcode::JUMP, default_label)
        next
      end

      compile_token_action(token_def, loop_start)
      next
    end

    # Several candidates: each gets an action label to continue at once
    # the best match is committed.
    action_labels = []
    action_label_map = {}
    candidates.each do |token_def|
      action_label = new_label
      action_labels << [action_label, token_def]
      action_label_map[token_def] = action_label
    end

    literal_tokens = candidates.select(&:literal?)
    regex_tokens = candidates.select(&:regex?)

    if regex_tokens.empty?
      # Literal-only: use optimized LITERAL_TRIE_COMMIT
      trie_const_id = add_literal_trie(literal_tokens, order_map, action_label_map)
      emit_literal_trie_commit(trie_const_id, default_label)
    else
      # Mixed literals and regex: use CLEAR_BEST + ... + COMMIT_BEST
      emit(IR::Opcode::CLEAR_BEST)

      if literal_tokens.any?
        trie_const_id = add_literal_trie(literal_tokens, order_map, action_label_map)
        # LITERAL_TRIE_RUN updates best match and restores pos to mark
        emit(IR::Opcode::LITERAL_TRIE_RUN, trie_const_id)
      end

      regex_tokens.each do |token_def|
        dfa_id = compile_regex_to_dfa(token_def.pattern)
        emit(IR::Opcode::DFA_RUN, dfa_id)
        # SET_MATCH sets candidate order/action and updates best match
        emit_set_match(order_map[token_def], action_label_map[token_def])
      end

      emit_label_arg(IR::Opcode::COMMIT_BEST, default_label)
    end

    action_labels.each do |action_label, token_def|
      mark_label(action_label)
      compile_token_action(token_def, loop_start)
    end
  end

  # Default branch (error handling): advance one byte and emit :INVALID
  mark_label(default_label)

  emit(IR::Opcode::MATCH_RANGE, (0 << 8) | 255) # Match any byte
  emit(IR::Opcode::EMIT_ERROR, LexerKit::INVALID_TOKEN_ID)
  emit(IR::Opcode::JUMP, loop_start)

  # Now resolve the jump table (all handler labels are marked above)
  resolved_entries = {}
  jump_table_entries.each do |byte, label|
    resolved_entries[byte] = @labels[label]
  end
  @jump_tables[jump_table_id] = IR::JumpTable.new(
    entries: resolved_entries,
    default_offset: @labels[default_label]
  )
end
|
|
342
|
+
|
|
343
|
+
# Emit the post-match action for a token: optional keyword lookup,
# emit/skip, and mode push/pop. Paths without push/pop fuse the emit
# with the jump back to loop_start and return early.
def compile_token_action(token_def, loop_start)
  token_id = @token_ids[token_def.name]

  if should_attach_keywords?(token_def)
    table_id = build_keyword_table(token_def)
    emit(IR::Opcode::KEYWORD_LOOKUP, table_id)
  end

  if token_def.skip
    if token_def.push || token_def.pop
      # When push/pop is involved, we need separate EMIT_SKIP
      emit(IR::Opcode::EMIT_SKIP)
    else
      # Optimize EMIT_SKIP + JUMP into EMIT_SKIP_AND_JUMP
      emit(IR::Opcode::EMIT_SKIP_AND_JUMP, loop_start)
      return
    end
  elsif token_def.push || token_def.pop
    # When push/pop is involved, we need separate EMIT
    emit(IR::Opcode::EMIT, token_id)
  else
    # Optimize EMIT + JUMP into EMIT_AND_JUMP
    emit_emit_and_jump(token_id, loop_start)
    return
  end

  # Only reached on the push/pop paths above.
  # NOTE(review): no jump back to loop_start is emitted after
  # PUSH_MODE/POP_MODE — presumably those opcodes transfer control to
  # the target mode's entry in the VM; confirm against vm.rs.
  if token_def.push
    mode_id = ensure_mode_id(token_def.push)
    emit(IR::Opcode::PUSH_MODE, mode_id)
  elsif token_def.pop
    emit(IR::Opcode::POP_MODE)
  end
end
|
|
376
|
+
|
|
377
|
+
# Whether a keyword lookup table should follow this token's match:
# only regex tokens whose pattern can match at least one keyword string.
#
# Note: Regexp#match? matches anywhere in the string, not the full
# string — a partial overlap is enough to attach the table.
def should_attach_keywords?(token_def)
  pattern = token_def.pattern
  return false unless pattern.is_a?(Regexp)
  return false if @builder.keywords.empty?

  @builder.keywords.each_key.any? { |keyword_str| pattern.match?(keyword_str) }
end
|
|
385
|
+
|
|
386
|
+
# Build (or reuse) the keyword table for a token definition.
#
# The table maps each keyword string to its token id, with the host
# token's id as base. One table is built per token_def and cached.
#
# @return [Integer] index into @keyword_tables
def build_keyword_table(token_def)
  return @keyword_cache[token_def] if @keyword_cache.key?(token_def)

  table_id = @keyword_tables.size
  @keyword_tables << IR::KeywordTable.new(
    base_token_id: @token_ids[token_def.name],
    keywords: @builder.keywords.transform_values { |name| @token_ids[name] }
  )
  @keyword_cache[token_def] = table_id
  table_id
end
|
|
402
|
+
|
|
403
|
+
# Compile a regex pattern into a DFA table, reusing a cached table when
# the same pattern source was compiled before.
#
# @param pattern [Regexp, String] regex (keyed by source) or pattern string
# @return [Integer] index into @dfa_tables
def compile_regex_to_dfa(pattern)
  cache_key = pattern.is_a?(Regexp) ? pattern.source : pattern
  cached = @dfa_cache[cache_key]
  return cached if cached

  # Use proper regex -> DFA compilation
  regex = DFA::RegexAST::Regex.parse(pattern)
  @dfa_tables << DFA.compile_regex(regex, 0)
  @dfa_cache[cache_key] = @dfa_tables.size - 1
end
|
|
418
|
+
|
|
419
|
+
# Return the id for a mode name, registering it on first sight.
# Ids are assigned densely in first-reference order.
def ensure_mode_id(mode_name)
  @mode_ids.fetch(mode_name) do
    @mode_names << mode_name
    @mode_ids[mode_name] = @mode_names.size - 1
  end
end
|
|
427
|
+
|
|
428
|
+
# Offset of the next instruction to be emitted.
def current_offset
  @instructions.length
end
|
|
431
|
+
|
|
432
|
+
# Append one instruction; arg defaults to 0 for argument-less opcodes.
def emit(opcode, arg = 0)
  @instructions.push(IR::Instruction.new(opcode, arg))
end
|
|
435
|
+
|
|
436
|
+
# Emit a jump-style instruction to a label: record the slot for later
# patching in resolve_jumps, then emit with a zero placeholder arg.
def emit_jump(opcode, label)
  @pending_jumps.push([@instructions.size, label])
  emit(opcode, 0)
end
|
|
440
|
+
|
|
441
|
+
# Emit an instruction whose entire arg is a label offset resolved later.
# Mechanically identical to emit_jump (previously a copy-pasted body);
# kept as a separate name so call sites read as "this arg is a label"
# rather than "this is a jump".
def emit_label_arg(opcode, label)
  emit_jump(opcode, label)
end
|
|
445
|
+
|
|
446
|
+
# Emit DFA_RUN_IF_MATCH with embedded fail_target.
# Encoding: (dfa_id << 14) | fail_target (10+14 bits).
def emit_dfa_run_if_match(dfa_id, fail_label)
  @pending_dfa_run_if_match.push([@instructions.size, dfa_id, fail_label])
  # Low 14 bits stay zero until resolve_jumps fills in the fail offset.
  emit(IR::Opcode::DFA_RUN_IF_MATCH, dfa_id << 14)
end
|
|
452
|
+
|
|
453
|
+
# Emit MATCH_LITERAL_OR_JUMP with embedded fail_target.
# Encoding: (const_id << 14) | fail_target (10+14 bits).
def emit_match_literal_or_jump(const_id, fail_label)
  @pending_match_literal_or_jump.push([@instructions.size, const_id, fail_label])
  # Low 14 bits stay zero until resolve_jumps fills in the fail offset.
  emit(IR::Opcode::MATCH_LITERAL_OR_JUMP, const_id << 14)
end
|
|
459
|
+
|
|
460
|
+
# Emit EMIT_AND_JUMP with embedded jump_target.
# Encoding: (token_id << 14) | jump_target (10+14 bits).
# jump_target is a direct offset (loop_start), not a label.
def emit_emit_and_jump(token_id, jump_target)
  @pending_emit_and_jump.push([@instructions.size, token_id, jump_target])
  # Low 14 bits stay zero until resolve_jumps packs in the target.
  emit(IR::Opcode::EMIT_AND_JUMP, token_id << 14)
end
|
|
466
|
+
|
|
467
|
+
# Emit SET_MATCH with embedded order and action label.
# Encoding: (order << 14) | action_ip (10+14 bits).
def emit_set_match(order, action_label)
  @pending_set_match.push([@instructions.size, order, action_label])
  # Low 14 bits stay zero until resolve_jumps fills in the action offset.
  emit(IR::Opcode::SET_MATCH, order << 14)
end
|
|
473
|
+
|
|
474
|
+
# Emit LITERAL_TRIE_COMMIT with embedded const_id and fail_target.
# Encoding: (const_id << 14) | fail_target (10+14 bits).
def emit_literal_trie_commit(const_id, fail_label)
  @pending_literal_trie_commit.push([@instructions.size, const_id, fail_label])
  # Low 14 bits stay zero until resolve_jumps fills in the fail offset.
  emit(IR::Opcode::LITERAL_TRIE_COMMIT, const_id << 14)
end
|
|
480
|
+
|
|
481
|
+
# Build a trie over literal token patterns and reserve a constant-pool
# slot for its encoding.
#
# The slot starts as an empty binary string; finalize_literal_tries
# patches in the real encoding once all labels have offsets.
#
# @return [Integer] constant pool id of the (pending) trie blob
def add_literal_trie(literals, order_map, action_label_map)
  # Each entry: [literal string, definition order, action label].
  entries = literals.map do |token_def|
    [token_def.pattern, order_map[token_def], action_label_map[token_def]]
  end

  const_id = @constant_pool.add_uninterned("".b)
  @pending_trie_entries.push([const_id, Trie.new(entries)])
  const_id
end
|
|
496
|
+
|
|
497
|
+
# Encode every pending literal trie into its reserved constant-pool
# slot. Must run after resolve_jumps, since trie encoding embeds
# resolved label offsets.
def finalize_literal_tries
  @pending_trie_entries.each do |const_id, trie|
    @constant_pool.replace(const_id, trie.encode(labels: @labels))
  end
end
|
|
505
|
+
|
|
506
|
+
# Group bytes by their candidate sets.
#
# Returns an array of [candidates, sorted_bytes] pairs where every byte
# in a pair has the exact same candidate set (identity-compared via
# object_id, so distinct-but-equal token defs stay separate).
def group_bytes_by_candidates(candidates_by_byte)
  buckets = candidates_by_byte.keys.group_by do |byte|
    candidates_by_byte[byte].map(&:object_id).sort
  end

  buckets.map do |_signature, bytes|
    [candidates_by_byte[bytes.first], bytes.sort]
  end
end
|
|
526
|
+
|
|
527
|
+
# Mint a fresh, unique label symbol (:label_0, :label_1, ...).
def new_label
  id = @label_counter
  @label_counter = id + 1
  :"label_#{id}"
end
|
|
532
|
+
|
|
533
|
+
# Record that +label+ refers to the next instruction to be emitted.
def mark_label(label)
  @labels[label] = current_offset
end
|
|
536
|
+
|
|
537
|
+
# Back-patch every instruction that was emitted with a placeholder
# argument, now that all label offsets are known.
#
# Previously this method repeated the "(hi << 14) | label_offset"
# patch loop four times; the copies are folded into patch_packed_arg.
#
# @raise [LexerKit::CompileError] if any referenced label was never marked
def resolve_jumps
  # Plain jump-style instructions: arg is the resolved offset itself.
  @pending_jumps.each do |instr_idx, label|
    opcode = @instructions[instr_idx].opcode
    @instructions[instr_idx] = IR::Instruction.new(opcode, resolve_label_offset(label))
  end

  # Fused instructions pack (hi << 14) | label_offset (10+14 bits).
  patch_packed_arg(@pending_dfa_run_if_match, IR::Opcode::DFA_RUN_IF_MATCH)
  patch_packed_arg(@pending_match_literal_or_jump, IR::Opcode::MATCH_LITERAL_OR_JUMP)

  # EMIT_AND_JUMP targets are direct offsets (loop_start), not labels,
  # so no label resolution is needed here.
  @pending_emit_and_jump.each do |instr_idx, token_id, jump_target|
    arg = (token_id << 14) | jump_target
    @instructions[instr_idx] = IR::Instruction.new(IR::Opcode::EMIT_AND_JUMP, arg)
  end

  patch_packed_arg(@pending_set_match, IR::Opcode::SET_MATCH)
  patch_packed_arg(@pending_literal_trie_commit, IR::Opcode::LITERAL_TRIE_COMMIT)
end

# Look up a label's instruction offset, raising on unresolved labels.
def resolve_label_offset(label)
  offset = @labels[label]
  raise LexerKit::CompileError, "unresolved label #{label}" unless offset

  offset
end

# Rewrite each pending [instr_idx, hi, label] entry as an instruction
# with arg = (hi << 14) | resolved label offset.
def patch_packed_arg(pending, opcode)
  pending.each do |instr_idx, hi, label|
    arg = (hi << 14) | resolve_label_offset(label)
    @instructions[instr_idx] = IR::Instruction.new(opcode, arg)
  end
end
|
|
593
|
+
end
|
|
594
|
+
end
|
|
595
|
+
end
|
|
596
|
+
end
|