lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # Serializer handles binary encoding/decoding of CompiledProgram.
    # Extracted from CompiledProgram to separate serialization concerns.
    #
    # Binary layout (all multi-byte integers are big-endian):
    #   magic "LKT1" (4 bytes)
    #   format version (u16)
    #   user version (u32)
    #   token names, mode names (u16 count + u16-length-prefixed strings)
    #   mode offsets (u16 count + [name, u32 offset] pairs)
    #   constant pool (u32 length + payload)
    #   DFA / jump / keyword tables (u16 count + u32-length-prefixed payloads)
    #   token metadata (u16 count + [u16 token_id, u16 json_len, JSON] entries)
    #   instructions (u32 count + 4 bytes each)
    class Serializer
      MAGIC = "LKT1"
      FORMAT_VERSION = 3 # v3: added token_meta

      # Encode a CompiledProgram to binary format.
      # @param program [CompiledProgram]
      # @return [String] binary data
      def self.to_binary(program)
        new.encode(program)
      end

      # Decode binary data to a CompiledProgram.
      # @param bytes [String] binary data
      # @return [CompiledProgram]
      # @raise [ArgumentError] on bad magic, unsupported format version,
      #   or truncated input
      def self.from_binary(bytes)
        new.decode(bytes)
      end

      # Encode CompiledProgram to binary.
      # @param program [CompiledProgram]
      # @return [String]
      def encode(program)
        parts = []

        # Magic (4 bytes)
        parts << MAGIC

        # Format version (u16)
        parts << [FORMAT_VERSION].pack("S>")

        # User version (u32)
        parts << [program.version].pack("L>")

        # Token names
        encode_string_array(parts, program.token_names)

        # Mode names
        encode_string_array(parts, program.mode_names)

        # Modes (name → offset mapping)
        modes = program.mode_offsets
        parts << [modes.size].pack("S>")
        modes.each do |name, offset|
          name_bytes = name.to_s.b
          parts << [name_bytes.bytesize].pack("S>")
          parts << name_bytes
          parts << [offset].pack("L>")
        end

        # Constant pool
        pool_binary = program.constant_pool.to_binary
        parts << [pool_binary.bytesize].pack("L>")
        parts << pool_binary

        # DFA tables
        encode_table_array(parts, program.dfa_tables)

        # Jump tables
        encode_table_array(parts, program.jump_tables)

        # Keyword tables
        encode_table_array(parts, program.keyword_tables)

        # Token metadata
        encode_token_meta(parts, program.token_meta)

        # Instructions (fixed 4 bytes each)
        instructions = program.instructions
        parts << [instructions.size].pack("L>")
        instructions.each do |instr|
          parts << instr.to_binary
        end

        parts.join
      end

      # Decode binary data to CompiledProgram.
      # @param bytes [String]
      # @return [CompiledProgram]
      # @raise [ArgumentError] on bad magic, unsupported version, or truncation
      def decode(bytes)
        @bytes = bytes
        @pos = 0

        # Magic
        magic = read_bytes(4)
        raise ArgumentError, "invalid magic: #{magic.inspect}" unless magic == MAGIC

        # Format version
        format_version = read_uint16
        raise ArgumentError, "unsupported format version: #{format_version}" unless format_version == FORMAT_VERSION

        # User version
        version = read_uint32

        # Token names
        token_names = decode_symbol_array

        # Mode names
        mode_names = decode_symbol_array

        # Modes
        mode_count = read_uint16
        modes = {}
        mode_count.times do
          name = read_length_prefixed_string.to_sym
          offset = read_uint32
          modes[name] = offset
        end

        # Constant pool
        pool_len = read_uint32
        constant_pool, = ConstantPool.from_binary(read_bytes(pool_len))

        # DFA tables
        dfa_tables = decode_table_array(DFATable)

        # Jump tables
        jump_tables = decode_table_array(JumpTable)

        # Keyword tables
        keyword_tables = decode_table_array(KeywordTable)

        # Token metadata. The guard is kept for future multi-version support,
        # although only FORMAT_VERSION (3) currently passes the check above.
        token_meta = format_version >= 3 ? decode_token_meta : {}

        # Instructions
        instr_count = read_uint32
        instructions = []
        instr_count.times do
          instructions << Instruction.from_binary(read_bytes(4))
        end

        CompiledProgram.new(
          instructions: instructions,
          dfa_tables: dfa_tables,
          jump_tables: jump_tables,
          constant_pool: constant_pool,
          modes: modes,
          token_names: token_names,
          mode_names: mode_names,
          keyword_tables: keyword_tables,
          token_meta: token_meta,
          version: version
        )
      end

      private

      # Encode an array of symbol names as length-prefixed strings.
      def encode_string_array(parts, names)
        parts << [names.size].pack("S>")
        names.each do |name|
          name_bytes = name.to_s.b
          parts << [name_bytes.bytesize].pack("S>")
          parts << name_bytes
        end
      end

      # Encode an array of tables (DFA, Jump, Keyword) as u32-length-prefixed
      # binary payloads.
      def encode_table_array(parts, tables)
        parts << [tables.size].pack("S>")
        tables.each do |table|
          table_binary = table.to_binary
          parts << [table_binary.bytesize].pack("L>")
          parts << table_binary
        end
      end

      # Decode an array of symbols.
      def decode_symbol_array
        count = read_uint16
        symbols = []
        count.times do
          symbols << read_length_prefixed_string.to_sym
        end
        symbols
      end

      # Decode an array of tables via the given class's .from_binary.
      def decode_table_array(table_class)
        count = read_uint16
        tables = []
        count.times do
          table_len = read_uint32
          table, = table_class.from_binary(read_bytes(table_len))
          tables << table
        end
        tables
      end

      # Read raw bytes, raising if fewer than +len+ bytes remain.
      # Without this guard, truncated/corrupted input surfaced later as a
      # confusing NoMethodError on nil during unpack1.
      def read_bytes(len)
        data = @bytes.byteslice(@pos, len)
        raise ArgumentError, "truncated data: need #{len} bytes at offset #{@pos}" if data.nil? || data.bytesize < len

        @pos += len
        data
      end

      # Read unsigned 16-bit big-endian.
      def read_uint16
        read_bytes(2).unpack1("S>")
      end

      # Read unsigned 32-bit big-endian.
      def read_uint32
        read_bytes(4).unpack1("L>")
      end

      # Read length-prefixed string (u16 length + bytes).
      def read_length_prefixed_string
        len = read_uint16
        read_bytes(len)
      end

      # Encode token metadata hash.
      # Format: [count: u16] [token_id: u16, json_len: u16, json: bytes]×n
      def encode_token_meta(parts, token_meta)
        require "json" # lazy: only needed when metadata is (de)serialized
        parts << [token_meta.size].pack("S>")
        token_meta.each do |token_id, meta|
          json = JSON.generate(meta)
          json_bytes = json.b
          parts << [token_id, json_bytes.bytesize].pack("S>S>")
          parts << json_bytes
        end
      end

      # Decode token metadata hash (token_id => Hash with Symbol keys).
      def decode_token_meta
        require "json" # lazy: only needed when metadata is (de)serialized
        count = read_uint16
        token_meta = {}
        count.times do
          token_id = read_uint16
          json_len = read_uint16
          json_bytes = read_bytes(json_len)
          meta = JSON.parse(json_bytes, symbolize_names: true)
          token_meta[token_id] = meta
        end
        token_meta
      end
    end
  end
end
|
data/lib/lexer_kit/ir.rb
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true

# Aggregator for the IR (intermediate representation) layer.
#
# NOTE(review): the requires appear ordered leaf-first (opcode/instruction
# and the table classes before serializer and compiled_program, which
# reference them). Ruby resolves constants at call time, so the ordering
# looks conventional rather than strictly required — confirm before
# reordering.
require_relative "ir/opcode"
require_relative "ir/instruction"
require_relative "ir/dfa_table"
require_relative "ir/jump_table"
require_relative "ir/constant_pool"
require_relative "ir/keyword_table"
require_relative "ir/serializer"
require_relative "ir/compiled_program"

module LexerKit
  # IR module contains intermediate representation types for the lexer VM.
  module IR
  end
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  # Runner coordinates program, stream, and source for token processing.
  # It provides a high-level interface for lexing with full Token support.
  #
  # Runner has the same interface as LexStream (duck-typing compatible)
  # plus additional methods for Token creation.
  #
  # Created via CompiledProgram#stream(input, filename:).
  class Runner
    attr_reader :program, :stream

    # @api private
    # Use CompiledProgram#stream instead
    # @param program [CompiledProgram] provides token_name / token_meta lookups
    # @param lex_stream [#eof?, #token_id, #start, #len, #input] low-level stream
    # @param filename [String, nil] attached to the lazily-built Core::Source
    def initialize(program, lex_stream, filename: nil)
      @program = program
      @stream = lex_stream
      @filename = filename
      @source = nil # built lazily by #source on first use
    end

    # Delegate basic stream operations
    def eof? = @stream.eof?
    def token_id = @stream.token_id
    def start = @stream.start
    def len = @stream.len
    def input = @stream.input

    # Advance to next token.
    # @return [Runner] self, for chaining
    def advance
      @stream.advance
      self
    end

    # Token name lookup (via program for correctness).
    # @param id [Integer, nil] defaults to the current token's id
    # @return [Symbol, nil] nil when id is missing or negative
    def token_name(id = nil)
      id ||= token_id
      return nil if id.nil? || id < 0

      @program.token_name(id)
    end

    # Extract the current token's text from the input.
    # @return [String, nil] nil at EOF or when position info is unavailable
    def text
      return nil if eof?

      s = start
      l = len
      return nil if s.nil? || l.nil?

      input.byteslice(s, l)
    end

    # Error detection (uses constant, no @program needed).
    # @param id [Integer, nil] defaults to the current token's id
    # @return [Boolean]
    def error?(id = nil)
      id ||= token_id
      return false if id.nil? || id < 0

      id == LexerKit::INVALID_TOKEN_ID
    end

    # Create a Core::Token for the current position (lazy Source creation).
    # @return [Core::Token, nil] nil at EOF or when position info is missing
    def make_token
      return nil if eof?

      tok_id = token_id
      s = start
      l = len
      return nil if tok_id.nil? || s.nil? || l.nil?

      Core::Token.new(
        id: tok_id,
        name: @program.token_name(tok_id),
        start: s,
        len: l,
        source: source,
        meta: @program.token_meta[tok_id]
      )
    end

    # Line/col for the current token's start position.
    # @return [Object, nil] whatever Core::Source#line_col returns; nil at EOF
    def line_col
      return nil if eof?

      s = start
      return nil if s.nil?

      source.line_col(s)
    end

    # Peek operations (delegate to stream)
    def peek_token_id(n = 1) = @stream.peek_token_id(n)
    def peek_start(n = 1) = @stream.peek_start(n)
    def peek_len(n = 1) = @stream.peek_len(n)

    # Name of the n-th lookahead token.
    # @return [Symbol, nil]
    def peek_token_name(n = 1)
      id = peek_token_id(n)
      return nil if id.nil? || id < 0

      @program.token_name(id)
    end

    # Text of the n-th lookahead token.
    # @return [String, nil]
    def peek_text(n = 1)
      s = peek_start(n)
      l = peek_len(n)
      return nil if s.nil? || l.nil?

      input.byteslice(s, l)
    end

    private

    # Lazily-constructed Core::Source shared by #make_token and #line_col.
    # Extracted to avoid duplicating the memoized construction in both call
    # sites.
    def source
      @source ||= Core::Source.new(input, filename: @filename)
    end
  end
end
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  # Builds and encodes trie data structures for literal matching.
  #
  # ## Value Constraints
  #
  # - order: 0..MAX_ORDER (ORDER_UNSET is reserved for unset values)
  # - action_ip: 0..0xFFFFFFFF (ACTION_UNSET used for non-terminal nodes)
  #
  # ## Label Resolution
  #
  # labels hash must have Symbol keys only (Integer keys are ambiguous with direct offsets).
  # Resolution order:
  # 1. If action_ref is in labels hash, use mapped value
  # 2. If action_ref is Symbol not in labels, raise ArgumentError
  # 3. If action_ref is Integer, use directly as offset
  #
  # ## Binary Layout (big-endian)
  #
  #   header:   node count (u32), edge count (u32)
  #   per node: edge_start (u32), edge_count (u16), order (u16), action (u32)
  #   per edge: byte (u8), child index (u32)
  class Trie
    ORDER_UNSET = 0xFFFF
    MAX_ORDER = 0xFFFE
    ACTION_UNSET = 0xFFFFFFFF
    MAX_EDGE_COUNT = 0xFFFF

    # @param entries [Array<Array(String, Integer, Integer|Symbol)>]
    #   [literal, order, action_ref] triples; lower order wins on duplicates
    def initialize(entries)
      @trie = build_trie(entries)
    end

    # Encode the built trie into its binary representation.
    # @param labels [Hash{Symbol=>Integer}, nil] label → offset mapping
    # @return [String] ASCII-8BIT binary
    # @raise [ArgumentError] on Integer label keys or unknown Symbol refs
    def encode(labels: nil)
      encode_trie(@trie, labels || {})
    end

    # Convenience: build and encode in one step. Each entry's position in
    # +entries+ becomes its order, so earlier entries win ties.
    # @param entries [Array<Array(String, Integer)>] [literal, action_ip] pairs
    # @return [String] binary
    def self.build(entries)
      trie_entries = entries.map.with_index do |(literal, action_ip), index|
        [literal, index, action_ip]
      end
      new(trie_entries).encode
    end

    # Mutable build-time node: byte → child-index edges plus optional
    # terminal (order, action_ref).
    Node = Struct.new(:edges, :order, :action_ref, keyword_init: true) do
      def initialize(edges: {}, order: nil, action_ref: nil)
        super
      end
    end

    # Flattened node as it appears in the encoded output.
    NodeEntry = Struct.new(:edge_start, :edge_count, :order, :action_ref, keyword_init: true)
    # Result of build_trie: parallel node and edge arrays.
    TrieData = Struct.new(:nodes, :edges, keyword_init: true)

    private

    # Build the flattened node/edge arrays from [literal, order, action_ref]
    # triples. Node 0 is the root.
    def build_trie(entries)
      nodes = [Node.new]

      entries.each do |literal, order, action_ref|
        validate_literal(literal)
        validate_order(order)

        bytes = literal.b
        node_idx = 0

        bytes.each_byte do |byte|
          node = nodes[node_idx]
          child = node.edges[byte]

          unless child
            child = nodes.size
            node.edges[byte] = child
            nodes << Node.new
          end

          node_idx = child
        end

        # Terminal node: on duplicate literals keep the lowest order
        # (highest priority) and its action.
        node = nodes[node_idx]
        if node.order.nil? || order < node.order
          node.order = order
          node.action_ref = action_ref
        end
      end

      # NOTE: O(E log E) per node due to Hash#sort, but acceptable for typical
      # literal counts. For very large trie (>10k literals), consider optimization.
      edge_list = []
      node_entries = nodes.map do |node|
        edge_start = edge_list.size
        edge_count = node.edges.size

        validate_edge_count(edge_count)

        # Edges are emitted sorted by byte value so lookups can binary-search.
        node.edges.sort.each { |byte, child| edge_list << [byte, child] }
        NodeEntry.new(
          edge_start: edge_start,
          edge_count: edge_count,
          order: node.order,
          action_ref: node.action_ref
        )
      end

      TrieData.new(nodes: node_entries, edges: edge_list)
    end

    # Serialize TrieData, resolving Symbol action refs through +labels+.
    def encode_trie(trie, labels)
      validate_labels(labels)

      nodes = trie.nodes
      edges = trie.edges

      parts = []
      parts << [nodes.size].pack("L>")
      parts << [edges.size].pack("L>")

      nodes.each do |node|
        order = node.order || ORDER_UNSET

        action = if node.action_ref.nil?
                   ACTION_UNSET
                 elsif node.action_ref.is_a?(Symbol)
                   labels.fetch(node.action_ref) do
                     raise ArgumentError, "Unknown label: #{node.action_ref}"
                   end
                 else
                   node.action_ref
                 end

        validate_action_ip(action)

        parts << [node.edge_start, node.edge_count, order, action].pack("L>S>S>L>")
      end

      edges.each do |byte, child_idx|
        parts << [byte, child_idx].pack("CL>")
      end

      parts.join.force_encoding(Encoding::ASCII_8BIT)
    end

    # Fixed: also reject non-String literals. Previously a Symbol (which
    # responds to #empty?) slipped past this check and failed later with
    # NoMethodError on #b instead of the documented ArgumentError.
    def validate_literal(literal)
      return if literal.is_a?(String) && !literal.empty?

      raise ArgumentError, "literal must be a non-empty string, got: #{literal.inspect}"
    end

    def validate_order(order)
      return if order.is_a?(Integer) && order >= 0 && order <= MAX_ORDER

      raise ArgumentError, "order must be in range 0..#{format('0x%X', MAX_ORDER)} (#{format('0x%X', ORDER_UNSET)} reserved), got: #{order.inspect}"
    end

    def validate_edge_count(count)
      return if count.between?(0, MAX_EDGE_COUNT)

      raise ArgumentError, "edge_count must fit in 16-bit unsigned integer, got: #{count}"
    end

    def validate_action_ip(action_ip)
      # ACTION_UNSET is allowed as the reserved "unset" value for non-terminal nodes
      return if action_ip.is_a?(Integer) && action_ip >= 0 && action_ip <= ACTION_UNSET

      raise ArgumentError, "action_ip must be in range 0..0x#{ACTION_UNSET.to_s(16).upcase} (32-bit unsigned), got: #{action_ip.inspect}"
    end

    def validate_labels(labels)
      return if labels.empty?

      # Check for Integer keys (ambiguous with direct offsets)
      integer_keys = labels.keys.select { |k| k.is_a?(Integer) }
      return if integer_keys.empty?

      raise ArgumentError, "labels must have Symbol keys only (Integer keys are ambiguous), found Integer keys: #{integer_keys.inspect}"
    end
  end
end
|
data/lib/lexer_kit.rb
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lexer_kit/version"
|
|
4
|
+
require_relative "lexer_kit/core" # Load first (errors depend on Core::Source, Core::Diagnostic)
|
|
5
|
+
require_relative "lexer_kit/errors" # Load after core
|
|
6
|
+
require_relative "lexer_kit/ir"
|
|
7
|
+
require_relative "lexer_kit/dfa"
|
|
8
|
+
require_relative "lexer_kit/builder"
|
|
9
|
+
require_relative "lexer_kit/format"
|
|
10
|
+
|
|
11
|
+
# Load Trie class (for testing and internal use)
|
|
12
|
+
require_relative "lexer_kit/trie"
|
|
13
|
+
|
|
14
|
+
# Try to load Rust extension
|
|
15
|
+
begin
|
|
16
|
+
require "lexer_kit_rust/lexer_kit_rust"
|
|
17
|
+
LEXER_KIT_NATIVE = true
|
|
18
|
+
|
|
19
|
+
# Runner wraps LexStream (defined in Rust) with high-level methods
|
|
20
|
+
require_relative "lexer_kit/runner"
|
|
21
|
+
rescue LoadError
|
|
22
|
+
LEXER_KIT_NATIVE = false
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
module LexerKit
  # Reserved token IDs
  # 0: Internal sentinel (never emitted)
  # 1: INVALID (error token)
  # 2-7: Reserved for future use
  # 8+: User-defined tokens
  #
  # Note: The VM only emits tokens with valid IDs:
  # - INVALID_TOKEN_ID (1) for error tokens
  # - User tokens (>= FIRST_USER_TOKEN_ID)
  # Tokens with sentinel/reserved IDs (0, 2-7) or zero length are filtered out.
  RESERVED_TOKEN_ID = 0
  INVALID_TOKEN_ID = 1
  FIRST_USER_TOKEN_ID = 8

  # Mixin for objects that can lower themselves to a regex AST.
  module RegexAstProvider
    # Implementors must return a regex AST node.
    def to_ast
      raise NotImplementedError
    end

    # Wrap the AST in a (case-sensitive) Regex container.
    def to_regex
      DFA::RegexAST::Regex.new(ast: to_ast, case_insensitive: false)
    end
  end

  # Create a UTF-8 range pattern for the LexerKit regex engine.
  #
  # Accepted inputs:
  # - "あ" (single character)
  # - "あ".."ん" (Range, inclusive)
  # - Integer codepoint ranges (e.g., 0x3041..0x3096)
  #
  # Notes:
  # - Exclusive ranges (e.g., "a"..."z") are not supported.
  # - Multi-character strings like "abc" are not supported.
  # - Range endpoints must be single characters or integers.
  def self.utf8_range(*ranges)
    require_relative "lexer_kit/dfa/utf8_range_pattern" # deliberate lazy load
    DFA::Utf8RangePattern.new(ranges.map { |r| parse_range_codepoints(r) })
  end

  # Normalize one accepted input into an inclusive [start_cp, end_cp] pair.
  def self.parse_range_codepoints(range)
    unless range.is_a?(Range)
      cp = single_char_ord(range.to_s)
      return [cp, cp]
    end

    raise ArgumentError, "utf8_range does not support exclusive ranges" if range.exclude_end?

    [range.begin, range.end].map do |endpoint|
      endpoint.is_a?(Integer) ? endpoint : single_char_ord(endpoint.to_s)
    end
  end

  # Codepoint of a single-character string; rejects anything longer/shorter.
  def self.single_char_ord(str)
    return str.ord if str.length == 1

    raise ArgumentError, "utf8_range expects a single character, got #{str.inspect}"
  end

  private_class_method :parse_range_codepoints
  private_class_method :single_char_ord

  # Check if native Rust extension is available
  # @return [Boolean]
  def self.native? = LEXER_KIT_NATIVE

  # Build a lexer from DSL
  # @yield [Builder] DSL block
  # @return [Builder] configured builder
  def self.build(&block)
    builder = Builder.new
    builder.instance_eval(&block) if block
    builder
  end

  # Load a compiled lexer from .lkt1 or .lkb1 file
  #
  # @param path [String] path to lexer file (relative or absolute)
  # @return [IR::CompiledProgram] compiled lexer program
  # @raise [ArgumentError] if file not found or invalid extension
  #
  # @example Load from relative path
  #   LexerKit.load_lexer("lexers/json.lkt1")
  #   LexerKit.load_lexer(File.expand_path("../data/json.lkt1", __dir__))
  #
  # @example Load from absolute path
  #   LexerKit.load_lexer("/path/to/json.lkt1")
  def self.load_lexer(path)
    # Resolve against the current working directory
    path = File.expand_path(path)
    raise ArgumentError, "Lexer not found: #{path}" unless File.exist?(path)

    return Format::LKT1.load(path).program if path.end_with?(".lkt1")
    return Format::LKB1.load(path).program if path.end_with?(".lkb1")

    raise ArgumentError, "Expected .lkt1 or .lkb1 file: #{path}"
  end

  # Load a builder from DSL source file
  #
  # @param path [String] path to DSL source file (relative or absolute)
  # @return [Builder] builder instance
  # @raise [ArgumentError] if file doesn't return Builder instance
  #
  # @example Load from relative path
  #   LexerKit.load_builder("examples/languages/json.rb")
  #
  # @example Load from absolute path
  #   LexerKit.load_builder("/path/to/json.rb")
  def self.load_builder(path)
    # Resolve against the current working directory
    path = File.expand_path(path)
    raise ArgumentError, "Builder source not found: #{path}" unless File.exist?(path)

    dsl_source = File.read(path)
    built = eval(dsl_source, TOPLEVEL_BINDING, path) # rubocop:disable Security/Eval
    raise ArgumentError, "DSL file must return LexerKit::Builder instance" unless built.is_a?(Builder)

    built
  end
end
|