lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "digest"

module LexerKit
  module Format
    class LKB1
      # Decoder handles binary deserialization of the LKB1 container format.
      #
      # Fixed 52-byte header layout (all integers big-endian):
      #   magic(4) | header_version(u16) | flags(u16) | header_len(u32) |
      #   payload_len(u32) | format_version(u16) | reserved(u16) | sha256(32)
      # Optional TLV-encoded metadata follows up to header_len, then the payload.
      #
      # All failures raise ArgumentError; callers (LKB1.decode) translate
      # that into LexerKit::IntegrityError.
      class Decoder
        # @param bytes [String] raw container bytes
        def initialize(bytes)
          @bytes = bytes
          @pos = 0
        end

        # Parse the container, verifying structure and (when flagged) payload integrity.
        # @return [Hash] { payload: String, meta: Hash, header: Hash }
        # @raise [ArgumentError] on any structural or integrity failure
        def decode_container
          raise ArgumentError, "too short" if @bytes.bytesize < FIXED_HEADER_LEN

          magic = read_bytes(4)
          raise ArgumentError, "invalid magic: #{magic.inspect}" unless magic == MAGIC

          header_version = read_u16
          raise ArgumentError, "unsupported header version: #{header_version}" unless header_version == HEADER_VERSION

          flags = read_u16

          # Reject flags whose handling is not implemented rather than
          # silently misinterpreting the payload.
          if (flags & FLAG_PAYLOAD_COMPRESSED) != 0
            raise ArgumentError, "compressed payload is not supported"
          end

          if (flags & FLAG_PAYLOAD_ENCRYPTED) != 0
            raise ArgumentError, "encrypted payload is not supported"
          end

          header_len = read_u32
          raise ArgumentError, "invalid header_len: #{header_len}" if header_len < FIXED_HEADER_LEN

          payload_len = read_u32
          format_version = read_u16

          @pos += 2 # reserved

          sha256 = read_bytes(32)

          meta = {}
          if header_len > @pos
            # Metadata occupies the gap between the fixed header and header_len.
            meta_bytes = @bytes.byteslice(@pos, header_len - @pos)
            meta = decode_tlv(meta_bytes)
            @pos = header_len
          end

          total_len = header_len + payload_len
          raise ArgumentError, "truncated payload" if @bytes.bytesize < total_len

          payload = @bytes.byteslice(header_len, payload_len)

          if (flags & FLAG_PAYLOAD_SHA256) != 0
            actual = Digest::SHA256.digest(payload)
            raise ArgumentError, "sha256 mismatch" unless actual == sha256
          end

          {
            payload: payload,
            meta: meta,
            header: {
              header_version: header_version,
              flags: flags,
              header_len: header_len,
              payload_len: payload_len,
              format_version: format_version,
              sha256: sha256
            }
          }
        end

        private

        # Read exactly +length+ raw bytes, advancing the cursor.
        # Raises ArgumentError instead of returning nil/short data when the
        # input is truncated (byteslice past the end yields nil, which would
        # otherwise surface as a NoMethodError from unpack1).
        def read_bytes(length)
          result = @bytes.byteslice(@pos, length)
          if result.nil? || result.bytesize < length
            raise ArgumentError, "unexpected end of data at byte #{@pos}"
          end

          @pos += length
          result
        end

        # Read a big-endian unsigned 16-bit integer.
        def read_u16
          read_bytes(2).unpack1("S>")
        end

        # Read a big-endian unsigned 32-bit integer.
        def read_u32
          read_bytes(4).unpack1("L>")
        end

        # Decode TLV (Type-Length-Value) metadata.
        # Unknown types are skipped for forward compatibility.
        def decode_tlv(bytes)
          pos = 0
          meta = {}
          while pos + 4 <= bytes.bytesize
            type = bytes.byteslice(pos, 2).unpack1("S>")
            pos += 2
            len = bytes.byteslice(pos, 2).unpack1("S>")
            pos += 2
            raise ArgumentError, "invalid tlv length" if pos + len > bytes.bytesize

            value = bytes.byteslice(pos, len)
            pos += len
            case type
            when TLV_BUILD_ID
              meta[:build_id] = value.force_encoding(Encoding::UTF_8)
            when TLV_SOURCE_VERSION
              meta[:source_version] = value.force_encoding(Encoding::UTF_8)
            when TLV_TOKEN_COUNT
              meta[:token_count] = value.unpack1("L>")
            when TLV_MODE_COUNT
              meta[:mode_count] = value.unpack1("S>")
            when TLV_INSTRUCTION_COUNT
              meta[:instruction_count] = value.unpack1("L>")
            when TLV_CREATED_AT
              meta[:created_at] = value.unpack1("Q>")
            when TLV_GENERATOR_VERSION
              meta[:generator_version] = value.force_encoding(Encoding::UTF_8)
            end
          end
          meta
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
require_relative "lkb1/decoder"
|
|
5
|
+
|
|
6
|
+
module LexerKit
  module Format
    # LKB1 (LexerKit Binary format version 1) is a binary container format
    # for compiled lexer programs. It provides efficient loading with optional
    # metadata and integrity checking via SHA256.
    class LKB1
      MAGIC = "LKB1"
      HEADER_VERSION = 4
      # magic(4) + header_version(2) + flags(2) + header_len(4) +
      # payload_len(4) + format_version(2) + reserved(2) + sha256(32)
      FIXED_HEADER_LEN = 52

      FLAG_PAYLOAD_COMPRESSED = 1 << 0
      FLAG_PAYLOAD_ENCRYPTED = 1 << 1
      FLAG_PAYLOAD_SHA256 = 1 << 2
      FLAG_META = 1 << 3

      TLV_BUILD_ID = 0x0001
      TLV_SOURCE_VERSION = 0x0002
      TLV_TOKEN_COUNT = 0x0003
      TLV_MODE_COUNT = 0x0004
      TLV_INSTRUCTION_COUNT = 0x0005
      TLV_CREATED_AT = 0x0006
      TLV_GENERATOR_VERSION = 0x0007

      # TLV length is u16, so string data cannot exceed this
      TLV_MAX_STRING_LENGTH = (2**16) - 1 # 65535 bytes

      attr_reader :program, :meta

      # Create a new LKB1 instance from a compiled program
      # @param program [IR::CompiledProgram] compiled lexer program
      # @param meta [Hash] optional metadata
      def initialize(program, meta: {})
        @program = program
        @meta = meta
      end

      # Load a compiled lexer program from a .lkb1 file
      # @param path [String] path to .lkb1 file
      # @return [LKB1] LKB1 instance
      def self.load(path)
        bytes = File.binread(path)
        decode(bytes)
      end

      # Save a compiled lexer program to a .lkb1 file (shortcut)
      # @param program [IR::CompiledProgram] compiled lexer program
      # @param path [String] output file path
      # @param meta [Hash] optional metadata for header
      def self.save(program, path:, meta: {})
        new(program, meta: meta).save(path)
      end

      # Decode a compiled lexer program from lkb1 binary data
      # @param bytes [String] binary data
      # @return [LKB1] LKB1 instance
      # @raise [LexerKit::IntegrityError] on any structural or integrity failure
      def self.decode(bytes)
        decoded = Decoder.new(bytes).decode_container
        header = decoded[:header]

        unless header[:format_version] == IR::Serializer::FORMAT_VERSION
          raise LexerKit::IntegrityError, "Unsupported format version: #{header[:format_version]}"
        end

        program = IR::CompiledProgram.from_binary(decoded[:payload])
        program.load_native! if LexerKit.native?
        new(program, meta: decoded[:meta])
      rescue ArgumentError => e
        # Decoder reports all parse/integrity problems as ArgumentError.
        raise LexerKit::IntegrityError, e.message
      end

      # Encode the program to lkb1 binary data
      # @return [String] binary data
      def encode
        payload = @program.to_binary

        # Auto-generate metadata from program; explicit @meta entries win.
        default_meta = {
          token_count: @program.token_names.size,
          mode_count: @program.mode_names.size,
          instruction_count: @program.instructions.size,
          created_at: Time.now.to_i,
          generator_version: LexerKit::VERSION
        }

        encode_container(
          payload,
          meta: default_meta.merge(@meta),
          format_version: IR::Serializer::FORMAT_VERSION
        )
      end

      # Save the program to a .lkb1 file
      # @param path [String] output file path
      def save(path)
        File.binwrite(path, encode)
      end

      private

      # Encode binary container with metadata
      # @param payload [String] binary payload
      # @param format_version [Integer] format version number
      # @param meta [Hash] metadata
      # @return [String] binary container
      def encode_container(payload, format_version:, meta: {})
        flags = FLAG_PAYLOAD_SHA256
        tlv = encode_tlv(meta)
        flags |= FLAG_META unless tlv.empty?

        sha256 = Digest::SHA256.digest(payload)
        header_len = FIXED_HEADER_LEN + tlv.bytesize

        header = []
        header << MAGIC
        header << [HEADER_VERSION].pack("S>")
        header << [flags].pack("S>")
        header << [header_len].pack("L>")
        header << [payload.bytesize].pack("L>")
        header << [format_version].pack("S>")
        header << [0].pack("S>") # reserved
        header << sha256
        header << tlv

        header.join + payload
      end

      # Encode metadata as TLV (Type-Length-Value) format
      # @param meta [Hash] metadata
      # @return [String] TLV-encoded binary data
      def encode_tlv(meta)
        parts = []
        add_tlv_string(parts, TLV_BUILD_ID, meta[:build_id])
        add_tlv_string(parts, TLV_SOURCE_VERSION, meta[:source_version])
        add_tlv_u32(parts, TLV_TOKEN_COUNT, meta[:token_count])
        add_tlv_u16(parts, TLV_MODE_COUNT, meta[:mode_count])
        add_tlv_u32(parts, TLV_INSTRUCTION_COUNT, meta[:instruction_count])
        add_tlv_u64(parts, TLV_CREATED_AT, meta[:created_at])
        add_tlv_string(parts, TLV_GENERATOR_VERSION, meta[:generator_version])
        parts.join
      end

      # Append a UTF-8 string TLV entry.
      # @raise [ArgumentError] if the bytes are not valid UTF-8 or too long
      def add_tlv_string(parts, type, value)
        return if value.nil?

        # Validate on a copy: force_encoding mutates its receiver, and the
        # original check skipped validation entirely for strings already
        # tagged UTF-8 (letting invalid byte sequences through).
        str = value.to_s.dup.force_encoding(Encoding::UTF_8)
        unless str.valid_encoding?
          raise ArgumentError, "TLV string must be valid UTF-8"
        end

        bytes = str.b
        if bytes.bytesize > TLV_MAX_STRING_LENGTH
          raise ArgumentError, "TLV string length exceeds maximum (#{TLV_MAX_STRING_LENGTH} bytes): #{bytes.bytesize}"
        end

        parts << [type, bytes.bytesize].pack("S>S>")
        parts << bytes
      end

      # Append an unsigned 16-bit TLV entry.
      def add_tlv_u16(parts, type, value)
        return if value.nil?

        unless value.is_a?(Integer) && value >= 0 && value <= (2**16) - 1
          raise ArgumentError, "TLV u16 value out of range (0..#{(2**16) - 1}): #{value}"
        end

        parts << [type, 2].pack("S>S>")
        parts << [value].pack("S>")
      end

      # Append an unsigned 32-bit TLV entry.
      def add_tlv_u32(parts, type, value)
        return if value.nil?

        unless value.is_a?(Integer) && value >= 0 && value <= (2**32) - 1
          raise ArgumentError, "TLV u32 value out of range (0..#{(2**32) - 1}): #{value}"
        end

        parts << [type, 4].pack("S>S>")
        parts << [value].pack("L>")
      end

      # Append an unsigned 64-bit TLV entry.
      def add_tlv_u64(parts, type, value)
        return if value.nil?

        unless value.is_a?(Integer) && value >= 0 && value <= (2**64) - 1
          raise ArgumentError, "TLV u64 value out of range (0..#{(2**64) - 1}): #{value}"
        end

        parts << [type, 8].pack("S>S>")
        parts << [value].pack("Q>")
      end
    end
  end
end
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "json"
require "zlib"
require "base64"
require "digest"

module LexerKit
  module Format
    # LKT1 (LexerKit Text format version 1) is a JSON-based container format
    # for compiled lexer programs. It supports compression and includes
    # integrity checking via SHA256.
    class LKT1
      attr_reader :program

      # Create a new LKT1 instance from a compiled program
      # @param program [IR::CompiledProgram] compiled lexer program
      def initialize(program)
        @program = program
      end

      # Load a compiled lexer program from a .lkt1 file
      # @param path [String] path to .lkt1 file
      # @return [LKT1] LKT1 instance
      def self.load(path)
        decode(File.read(path))
      end

      # Save a compiled lexer program to a .lkt1 file (shortcut)
      # @param program [IR::CompiledProgram] compiled lexer program
      # @param path [String] output file path
      def self.save(program, path:)
        new(program).save(path)
      end

      # Decode a compiled lexer program from lkt1 JSON string
      # @param json_string [String] lkt1 JSON string
      # @return [LKT1] LKT1 instance
      # @raise [LexerKit::IntegrityError] on malformed input or integrity failure
      def self.decode(json_string)
        envelope = JSON.parse(json_string, symbolize_names: true)

        # Structural validation before touching the data field.
        raise LexerKit::IntegrityError, "Unknown format: #{envelope[:format]}" unless envelope[:format] == "lkt1"
        raise LexerKit::IntegrityError, "Unknown kind: #{envelope[:kind]}" unless envelope[:kind] == "program"
        raise LexerKit::IntegrityError, "Unknown codec: #{envelope[:codec]}" unless envelope[:codec] == "deflate+base64"

        raw = Zlib::Inflate.inflate(Base64.strict_decode64(envelope[:data]))

        # Integrity: sha256 is mandatory, uncompressed_len is optional.
        raise LexerKit::IntegrityError, "Missing required field: sha256" unless envelope[:sha256]

        digest = Digest::SHA256.hexdigest(raw)
        unless digest == envelope[:sha256]
          raise LexerKit::IntegrityError, "SHA256 mismatch: expected #{envelope[:sha256]}, got #{digest}"
        end

        declared_len = envelope[:uncompressed_len]
        if declared_len && raw.bytesize != declared_len
          raise LexerKit::IntegrityError, "Length mismatch: expected #{declared_len}, got #{raw.bytesize}"
        end

        program = IR::CompiledProgram.from_binary(raw)
        program.load_native! if LexerKit.native?
        new(program)
      rescue JSON::ParserError => e
        raise LexerKit::IntegrityError, "Invalid JSON: #{e.message}"
      rescue ArgumentError => e
        # Base64.strict_decode64 raises ArgumentError on malformed input.
        raise LexerKit::IntegrityError, "Decoding error: #{e.message}"
      rescue Zlib::Error => e
        raise LexerKit::IntegrityError, "Decompression error: #{e.message}"
      end

      # Encode the program to lkt1 JSON string
      # @return [String] JSON string
      def encode
        raw = @program.to_binary
        envelope = {
          format: "lkt1",
          codec: "deflate+base64",
          kind: "program",
          table_version: 2,
          uncompressed_len: raw.bytesize,
          sha256: Digest::SHA256.hexdigest(raw),
          data: Base64.strict_encode64(Zlib::Deflate.deflate(raw))
        }
        JSON.generate(envelope)
      end

      # Save the program to a .lkt1 file
      # @param path [String] output file path
      def save(path)
        File.write(path, encode)
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "format/lkt1"
|
|
4
|
+
require_relative "format/lkb1"
|
|
5
|
+
|
|
6
|
+
module LexerKit
  # Format provides file format implementations for compiled lexer programs.
  # Two containers are supported: LKT1 (JSON-based text) and LKB1 (binary).
  #
  # @example Loading a lexer
  #   program = LexerKit::Format::LKT1.load("lexer.lkt1")
  #   program = LexerKit::Format::LKB1.load("lexer.lkb1")
  #
  # @example Saving a lexer
  #   LexerKit::Format::LKT1.save(program, path: "lexer.lkt1")
  #   LexerKit::Format::LKB1.save(program, path: "lexer.lkb1")
  module Format
  end
end
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # CompiledProgram is the complete compiled lexer ready for execution.
    # It contains instructions, DFA tables, constants, and metadata.
    # Binary serialization is handled by Serializer class.
    #
    # Note: Native methods are included by Rust extension when loaded.
    class CompiledProgram
      attr_reader :instructions, :dfa_tables, :jump_tables, :constant_pool, :token_names, :mode_names, :keyword_tables, :version, :mode_offsets, :token_meta

      # @param instructions [Array<Instruction>] instruction list
      # @param dfa_tables [Array<DFATable>] DFA tables
      # @param jump_tables [Array<JumpTable>] jump tables
      # @param constant_pool [ConstantPool] string constants
      # @param modes [Hash<Symbol, Integer>] mode name → start instruction offset
      # @param token_names [Array<Symbol>] token ID → name mapping
      # @param mode_names [Array<Symbol>] mode ID → name mapping
      # @param keyword_tables [Array<KeywordTable>] keyword tables
      # @param token_meta [Hash<Integer, Hash>] token ID → metadata hash
      # @param version [Integer] user-defined version number
      #
      # NOTE: the collections passed in are frozen in place.
      def initialize(
        instructions:,
        dfa_tables: [],
        jump_tables: [],
        constant_pool: nil,
        modes: {},
        token_names: [],
        mode_names: [],
        keyword_tables: [],
        token_meta: {},
        version: 1
      )
        @instructions = instructions.freeze
        @dfa_tables = dfa_tables.freeze
        @jump_tables = jump_tables.freeze
        @constant_pool = constant_pool || ConstantPool.new
        @mode_offsets = modes.freeze
        @token_names = token_names.freeze
        @mode_names = mode_names.freeze
        @keyword_tables = keyword_tables.freeze
        @token_meta = token_meta.freeze
        @version = version
      end

      # Kind of compiled program
      # @return [Symbol]
      def kind
        :program
      end

      # Get token ID by name
      # @param name [Symbol]
      # @return [Integer, nil]
      def token_id(name)
        token_index[name]
      end

      # Get token name by ID
      # @param id [Integer]
      # @return [Symbol, nil]
      def token_name(id)
        @token_names[id]
      end

      # Get all token names (excludes reserved placeholder tokens)
      # @return [Array<Symbol>]
      def tokens
        @token_names.reject { |name| name.to_s.start_with?("__RESERVED_") }
      end

      # Get all mode names
      # @return [Array<Symbol>]
      def modes
        @mode_names.dup
      end

      # Get mode ID by name
      # @param name [Symbol]
      # @return [Integer, nil]
      def mode_id(name)
        mode_index[name]
      end

      # Get mode start offset
      # @param name [Symbol]
      # @return [Integer, nil]
      def mode_offset(name)
        @mode_offsets[name]
      end

      # Low-level lexing with callback (for performance-critical code)
      # @param bytes [String] input bytes
      # @yield [Integer, Integer, Integer] token_id, start, length
      # @raise [NativeExtensionError] if Rust extension is not loaded
      def lowlevel_each(bytes, &)
        bytes = bytes.b
        raise LexerKit::NativeExtensionError, "Rust extension not loaded" unless LexerKit.native?

        ensure_rust_native!
        lex_rust_native(bytes, &)
      end

      # Check if a token ID is an error token
      # Use this for fast error detection in the lex loop
      # @param tok [Integer] token ID
      # @return [Boolean]
      def error_token?(tok)
        tok == LexerKit::INVALID_TOKEN_ID
      end

      # Get metadata for a token ID
      # @param tok [Integer] token ID
      # @return [Hash] metadata hash (empty hash if no metadata)
      def token_meta_for(tok)
        @token_meta[tok] || {}
      end

      # Create a Token object on demand
      # Use this to get rich token info only when needed (e.g., for errors)
      # Source is created internally, so there's zero overhead if not called
      # @param tok [Integer] token ID
      # @param start [Integer] start byte offset
      # @param len [Integer] length in bytes
      # @param input [String] original input string
      # @param filename [String, nil] optional filename for diagnostics
      # @return [Core::Token]
      def make_token(tok, start, len, input:, filename: nil)
        source = Core::Source.new(input, filename: filename)
        Core::Token.new(
          id: tok,
          name: token_name(tok),
          start: start,
          len: len,
          source: source,
          meta: @token_meta[tok]
        )
      end

      # Create a stream-based lexer with lookahead support
      # Returns a Runner that wraps the underlying LexStream.
      # @param input [String] input string
      # @param filename [String, nil] optional filename for diagnostics
      # @return [Runner]
      # @raise [NativeExtensionError] if Rust extension is not loaded
      def stream(input, filename: nil)
        raise LexerKit::NativeExtensionError, "Rust extension not loaded" unless LexerKit.native?

        ensure_rust_native!
        lex_stream = create_rust_stream(input)
        Runner.new(self, lex_stream, filename: filename)
      end

      # Tokenize input and return array of Token objects
      # Source is shared across all tokens for efficient line/col lookup
      # @param input [String] input string
      # @param filename [String, nil] optional filename for diagnostics
      # @return [Array<Core::Token>]
      def tokenize(input, filename: nil)
        bytes = input.b
        source = Core::Source.new(bytes, filename: filename)

        tokens = []
        lowlevel_each(bytes) do |tok_id, start, len|
          tokens << Core::Token.new(
            id: tok_id,
            name: token_name(tok_id),
            start: start,
            len: len,
            source: source,
            meta: @token_meta[tok_id]
          )
        end
        tokens
      end

      # Load native representation for fast lexing
      # @return [self]
      def load_native!
        return self unless LexerKit.native?

        load_rust_native(to_native_data)
        self
      end

      # Convert to data format for Rust native loading
      # @return [Hash] data for Rust extension
      def to_native_data
        {
          instructions: @instructions.map(&:to_binary).join,
          dfa_tables: @dfa_tables.map(&:to_native_format),
          jump_tables: @jump_tables.map(&:to_native_format),
          keyword_tables: @keyword_tables.map(&:to_native_format),
          constant_pool: @constant_pool.entries,
          modes: @mode_offsets.map { |name, offset| [name.to_s, offset] }
        }
      end

      # Encode to binary
      # @return [String]
      def to_binary
        Serializer.to_binary(self)
      end

      # Decode from binary
      # @param bytes [String]
      # @return [CompiledProgram]
      def self.from_binary(bytes)
        Serializer.from_binary(bytes)
      end

      def inspect
        "#<CompiledProgram v#{@version} instructions=#{@instructions.size} tokens=#{@token_names.size} native=#{LexerKit.native?}>"
      end

      private

      # Lazily-built name → ID lookup for tokens. First occurrence wins,
      # matching Array#index semantics; avoids an O(n) scan per call.
      def token_index
        @token_index ||= begin
          idx = {}
          @token_names.each_with_index { |n, i| idx[n] ||= i }
          idx
        end
      end

      # Lazily-built name → ID lookup for modes (same semantics as token_index).
      def mode_index
        @mode_index ||= begin
          idx = {}
          @mode_names.each_with_index { |n, i| idx[n] ||= i }
          idx
        end
      end

      # Ensure Rust native is loaded
      # @api private
      def ensure_rust_native!
        return if respond_to?(:rust_native_loaded?) && rust_native_loaded?

        load_rust_native(to_native_data)
      end
    end
  end
end
|