lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
# frozen_string_literal: true

require "digest"

module LexerKit
  module Format
    class LKB1
      # Decoder handles binary deserialization of the LKB1 container format.
      #
      # Container layout (all integers big-endian):
      #   magic(4) | header_version(u16) | flags(u16) | header_len(u32) |
      #   payload_len(u32) | format_version(u16) | reserved(u16) | sha256(32) |
      #   [TLV metadata up to header_len] | payload(payload_len)
      class Decoder
        # @param bytes [String] raw container bytes (treated as binary)
        def initialize(bytes)
          @bytes = bytes
          @pos = 0
        end

        # Parse the container and return payload, metadata and header fields.
        # @return [Hash] { payload: String, meta: Hash, header: Hash }
        # @raise [ArgumentError] on malformed, unsupported or truncated input
        def decode_container
          raise ArgumentError, "too short" if @bytes.bytesize < FIXED_HEADER_LEN

          magic = read_bytes(4)
          raise ArgumentError, "invalid magic: #{magic.inspect}" unless magic == MAGIC

          header_version = read_u16
          raise ArgumentError, "unsupported header version: #{header_version}" unless header_version == HEADER_VERSION

          flags = read_u16

          # Reject capabilities this decoder does not implement rather than
          # silently misinterpreting the payload bytes.
          if (flags & FLAG_PAYLOAD_COMPRESSED) != 0
            raise ArgumentError, "compressed payload is not supported"
          end

          if (flags & FLAG_PAYLOAD_ENCRYPTED) != 0
            raise ArgumentError, "encrypted payload is not supported"
          end

          header_len = read_u32
          raise ArgumentError, "invalid header_len: #{header_len}" if header_len < FIXED_HEADER_LEN
          # Fix: the declared header must fit inside the input. Previously a
          # header_len past EOF produced a silently short metadata slice before
          # the payload-length check finally failed.
          raise ArgumentError, "truncated header" if header_len > @bytes.bytesize

          payload_len = read_u32
          format_version = read_u16

          @pos += 2 # reserved field, skipped

          sha256 = read_bytes(32)

          meta = {}
          if header_len > @pos
            meta_bytes = @bytes.byteslice(@pos, header_len - @pos)
            meta = decode_tlv(meta_bytes)
            @pos = header_len
          end

          total_len = header_len + payload_len
          raise ArgumentError, "truncated payload" if @bytes.bytesize < total_len

          payload = @bytes.byteslice(header_len, payload_len)

          # Integrity verification is opt-in via the flags word.
          if (flags & FLAG_PAYLOAD_SHA256) != 0
            actual = Digest::SHA256.digest(payload)
            raise ArgumentError, "sha256 mismatch" unless actual == sha256
          end

          {
            payload: payload,
            meta: meta,
            header: {
              header_version: header_version,
              flags: flags,
              header_len: header_len,
              payload_len: payload_len,
              format_version: format_version,
              sha256: sha256
            }
          }
        end

        private

        # Read +length+ raw bytes at the cursor and advance it.
        # Fix: raises instead of returning nil / a short slice when fewer than
        # +length+ bytes remain, so corrupt input surfaces as ArgumentError
        # rather than NoMethodError inside the unpack helpers.
        def read_bytes(length)
          result = @bytes.byteslice(@pos, length)
          if result.nil? || result.bytesize < length
            raise ArgumentError, "unexpected end of input at byte #{@pos}"
          end

          @pos += length
          result
        end

        # Read a big-endian unsigned 16-bit integer, advancing the cursor.
        def read_u16
          read_bytes(2).unpack1("S>")
        end

        # Read a big-endian unsigned 32-bit integer, advancing the cursor.
        def read_u32
          read_bytes(4).unpack1("L>")
        end

        # Decode the TLV (type u16, length u16, value) metadata region.
        # Unknown types are skipped for forward compatibility; trailing bytes
        # shorter than a 4-byte TLV header are ignored.
        def decode_tlv(bytes)
          pos = 0
          meta = {}
          while pos + 4 <= bytes.bytesize
            type = bytes.byteslice(pos, 2).unpack1("S>")
            pos += 2
            len = bytes.byteslice(pos, 2).unpack1("S>")
            pos += 2
            raise ArgumentError, "invalid tlv length" if pos + len > bytes.bytesize

            value = bytes.byteslice(pos, len)
            pos += len
            case type
            when TLV_BUILD_ID
              meta[:build_id] = value.force_encoding(Encoding::UTF_8)
            when TLV_SOURCE_VERSION
              meta[:source_version] = value.force_encoding(Encoding::UTF_8)
            when TLV_TOKEN_COUNT
              meta[:token_count] = value.unpack1("L>")
            when TLV_MODE_COUNT
              meta[:mode_count] = value.unpack1("S>")
            when TLV_INSTRUCTION_COUNT
              meta[:instruction_count] = value.unpack1("L>")
            when TLV_CREATED_AT
              meta[:created_at] = value.unpack1("Q>")
            when TLV_GENERATOR_VERSION
              meta[:generator_version] = value.force_encoding(Encoding::UTF_8)
            end
          end
          meta
        end
      end
    end
  end
end
@@ -0,0 +1,199 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require_relative "lkb1/decoder"
5
+
6
+ module LexerKit
7
+ module Format
8
+ # LKB1 (LexerKit Binary format version 1) is a binary container format
9
+ # for compiled lexer programs. It provides efficient loading with optional
10
+ # metadata and integrity checking via SHA256.
11
+ class LKB1
12
+ MAGIC = "LKB1"
13
+ HEADER_VERSION = 4
14
+ FIXED_HEADER_LEN = 52
15
+
16
+ FLAG_PAYLOAD_COMPRESSED = 1 << 0
17
+ FLAG_PAYLOAD_ENCRYPTED = 1 << 1
18
+ FLAG_PAYLOAD_SHA256 = 1 << 2
19
+ FLAG_META = 1 << 3
20
+
21
+ TLV_BUILD_ID = 0x0001
22
+ TLV_SOURCE_VERSION = 0x0002
23
+ TLV_TOKEN_COUNT = 0x0003
24
+ TLV_MODE_COUNT = 0x0004
25
+ TLV_INSTRUCTION_COUNT = 0x0005
26
+ TLV_CREATED_AT = 0x0006
27
+ TLV_GENERATOR_VERSION = 0x0007
28
+
29
+ # TLV length is u16, so string data cannot exceed this
30
+ TLV_MAX_STRING_LENGTH = (2**16) - 1 # 65535 bytes
31
+
32
+ attr_reader :program, :meta
33
+
34
+ # Create a new LKB1 instance from a compiled program
35
+ # @param program [IR::CompiledProgram] compiled lexer program
36
+ # @param meta [Hash] optional metadata
37
+ def initialize(program, meta: {})
38
+ @program = program
39
+ @meta = meta
40
+ end
41
+
42
+ # Load a compiled lexer program from a .lkb1 file
43
+ # @param path [String] path to .lkb1 file
44
+ # @return [LKB1] LKB1 instance
45
+ def self.load(path)
46
+ bytes = File.binread(path)
47
+ decode(bytes)
48
+ end
49
+
50
+ # Save a compiled lexer program to a .lkb1 file (shortcut)
51
+ # @param program [IR::CompiledProgram] compiled lexer program
52
+ # @param path [String] output file path
53
+ # @param meta [Hash] optional metadata for header
54
+ def self.save(program, path:, meta: {})
55
+ new(program, meta: meta).save(path)
56
+ end
57
+
58
+ # Decode a compiled lexer program from lkb1 binary data
59
+ # @param bytes [String] binary data
60
+ # @return [LKB1] LKB1 instance
61
+ def self.decode(bytes)
62
+ decoded = Decoder.new(bytes).decode_container
63
+ header = decoded[:header]
64
+
65
+ unless header[:format_version] == IR::Serializer::FORMAT_VERSION
66
+ raise LexerKit::IntegrityError, "Unsupported format version: #{header[:format_version]}"
67
+ end
68
+
69
+ program = IR::CompiledProgram.from_binary(decoded[:payload])
70
+ program.load_native! if LexerKit.native?
71
+ new(program, meta: decoded[:meta])
72
+ rescue ArgumentError => e
73
+ raise LexerKit::IntegrityError, e.message
74
+ end
75
+
76
+ # Encode the program to lkb1 binary data
77
+ # @return [String] binary data
78
+ def encode
79
+ payload = @program.to_binary
80
+
81
+ # Auto-generate metadata from program
82
+ default_meta = {
83
+ token_count: @program.token_names.size,
84
+ mode_count: @program.mode_names.size,
85
+ instruction_count: @program.instructions.size,
86
+ created_at: Time.now.to_i,
87
+ generator_version: LexerKit::VERSION
88
+ }
89
+
90
+ encode_container(
91
+ payload,
92
+ meta: default_meta.merge(@meta),
93
+ format_version: IR::Serializer::FORMAT_VERSION
94
+ )
95
+ end
96
+
97
+ # Save the program to a .lkb1 file
98
+ # @param path [String] output file path
99
+ def save(path)
100
+ File.binwrite(path, encode)
101
+ end
102
+
103
+ private
104
+
105
+ # Encode binary container with metadata
106
+ # @param payload [String] binary payload
107
+ # @param format_version [Integer] format version number
108
+ # @param meta [Hash] metadata
109
+ # @return [String] binary container
110
+ def encode_container(payload, format_version:, meta: {})
111
+ flags = FLAG_PAYLOAD_SHA256
112
+ tlv = encode_tlv(meta)
113
+ flags |= FLAG_META unless tlv.empty?
114
+
115
+ sha256 = Digest::SHA256.digest(payload)
116
+ header_len = FIXED_HEADER_LEN + tlv.bytesize
117
+
118
+ header = []
119
+ header << MAGIC
120
+ header << [HEADER_VERSION].pack("S>")
121
+ header << [flags].pack("S>")
122
+ header << [header_len].pack("L>")
123
+ header << [payload.bytesize].pack("L>")
124
+ header << [format_version].pack("S>")
125
+ header << [0].pack("S>")
126
+ header << sha256
127
+ header << tlv
128
+
129
+ header.join + payload
130
+ end
131
+
132
+ # Encode metadata as TLV (Type-Length-Value) format
133
+ # @param meta [Hash] metadata
134
+ # @return [String] TLV-encoded binary data
135
+ def encode_tlv(meta)
136
+ parts = []
137
+ add_tlv_string(parts, TLV_BUILD_ID, meta[:build_id])
138
+ add_tlv_string(parts, TLV_SOURCE_VERSION, meta[:source_version])
139
+ add_tlv_u32(parts, TLV_TOKEN_COUNT, meta[:token_count])
140
+ add_tlv_u16(parts, TLV_MODE_COUNT, meta[:mode_count])
141
+ add_tlv_u32(parts, TLV_INSTRUCTION_COUNT, meta[:instruction_count])
142
+ add_tlv_u64(parts, TLV_CREATED_AT, meta[:created_at])
143
+ add_tlv_string(parts, TLV_GENERATOR_VERSION, meta[:generator_version])
144
+ parts.join
145
+ end
146
+
147
+ def add_tlv_string(parts, type, value)
148
+ return if value.nil?
149
+
150
+ # Ensure UTF-8 encoding and validate
151
+ str = value.to_s
152
+ unless str.encoding == Encoding::UTF_8 || str.force_encoding(Encoding::UTF_8).valid_encoding?
153
+ raise ArgumentError, "TLV string must be valid UTF-8"
154
+ end
155
+
156
+ bytes = str.b
157
+ if bytes.bytesize > TLV_MAX_STRING_LENGTH
158
+ raise ArgumentError, "TLV string length exceeds maximum (#{TLV_MAX_STRING_LENGTH} bytes): #{bytes.bytesize}"
159
+ end
160
+
161
+ parts << [type, bytes.bytesize].pack("S>S>")
162
+ parts << bytes
163
+ end
164
+
165
+ def add_tlv_u16(parts, type, value)
166
+ return if value.nil?
167
+
168
+ unless value.is_a?(Integer) && value >= 0 && value <= (2**16) - 1
169
+ raise ArgumentError, "TLV u16 value out of range (0..#{(2**16) - 1}): #{value}"
170
+ end
171
+
172
+ parts << [type, 2].pack("S>S>")
173
+ parts << [value].pack("S>")
174
+ end
175
+
176
+ def add_tlv_u32(parts, type, value)
177
+ return if value.nil?
178
+
179
+ unless value.is_a?(Integer) && value >= 0 && value <= (2**32) - 1
180
+ raise ArgumentError, "TLV u32 value out of range (0..#{(2**32) - 1}): #{value}"
181
+ end
182
+
183
+ parts << [type, 4].pack("S>S>")
184
+ parts << [value].pack("L>")
185
+ end
186
+
187
+ def add_tlv_u64(parts, type, value)
188
+ return if value.nil?
189
+
190
+ unless value.is_a?(Integer) && value >= 0 && value <= (2**64) - 1
191
+ raise ArgumentError, "TLV u64 value out of range (0..#{(2**64) - 1}): #{value}"
192
+ end
193
+
194
+ parts << [type, 8].pack("S>S>")
195
+ parts << [value].pack("Q>")
196
+ end
197
+ end
198
+ end
199
+ end
# frozen_string_literal: true

require "json"
require "zlib"
require "base64"
require "digest"

module LexerKit
  module Format
    # LKT1 (LexerKit Text format version 1) is a JSON-based container format
    # for compiled lexer programs. It supports compression and includes
    # integrity checking via SHA256.
    class LKT1
      attr_reader :program

      # Wrap a compiled program for LKT1 serialization.
      # @param program [IR::CompiledProgram] compiled lexer program
      def initialize(program)
        @program = program
      end

      # Read and decode a .lkt1 file.
      # @param path [String] path to .lkt1 file
      # @return [LKT1] LKT1 instance
      def self.load(path)
        decode(File.read(path))
      end

      # Convenience wrapper: encode +program+ and write it to +path+.
      # @param program [IR::CompiledProgram] compiled lexer program
      # @param path [String] output file path
      def self.save(program, path:)
        new(program).save(path)
      end

      # Decode a compiled lexer program from an lkt1 JSON string.
      # @param json_string [String] lkt1 JSON string
      # @return [LKT1] LKT1 instance
      # @raise [LexerKit::IntegrityError] on malformed or corrupt input
      def self.decode(json_string)
        doc = JSON.parse(json_string, symbolize_names: true)

        # All identifying fields must match before touching the data blob.
        { format: "lkt1", kind: "program", codec: "deflate+base64" }.each do |field, expected|
          next if doc[field] == expected

          raise LexerKit::IntegrityError, "Unknown #{field}: #{doc[field]}"
        end

        binary = Zlib::Inflate.inflate(Base64.strict_decode64(doc[:data]))

        # The digest is mandatory; refuse containers without one.
        raise LexerKit::IntegrityError, "Missing required field: sha256" unless doc[:sha256]

        digest = Digest::SHA256.hexdigest(binary)
        unless digest == doc[:sha256]
          raise LexerKit::IntegrityError, "SHA256 mismatch: expected #{doc[:sha256]}, got #{digest}"
        end

        declared_len = doc[:uncompressed_len]
        if declared_len && binary.bytesize != declared_len
          raise LexerKit::IntegrityError, "Length mismatch: expected #{declared_len}, got #{binary.bytesize}"
        end

        program = IR::CompiledProgram.from_binary(binary)
        program.load_native! if LexerKit.native?
        new(program)
      rescue JSON::ParserError => e
        raise LexerKit::IntegrityError, "Invalid JSON: #{e.message}"
      rescue ArgumentError => e
        # Base64.strict_decode64 raises ArgumentError on malformed input.
        raise LexerKit::IntegrityError, "Decoding error: #{e.message}"
      rescue Zlib::Error => e
        raise LexerKit::IntegrityError, "Decompression error: #{e.message}"
      end

      # Encode the wrapped program as an lkt1 JSON string.
      # @return [String] JSON string
      def encode
        raw = @program.to_binary
        packed = Base64.strict_encode64(Zlib::Deflate.deflate(raw))

        JSON.generate({
          format: "lkt1",
          codec: "deflate+base64",
          kind: "program",
          table_version: 2,
          uncompressed_len: raw.bytesize,
          sha256: Digest::SHA256.hexdigest(raw),
          data: packed
        })
      end

      # Write the encoded program to +path+.
      # @param path [String] output file path
      def save(path)
        File.write(path, encode)
      end
    end
  end
end
# frozen_string_literal: true

require_relative "format/lkt1"
require_relative "format/lkb1"

module LexerKit
  # Format provides file format implementations for lexer programs.
  # Supports LKT1 (JSON-based) and LKB1 (binary) formats.
  #
  # Both +save+ class methods take the output path as the +path:+ keyword
  # argument; LKB1.save additionally accepts optional +meta:+ metadata.
  #
  # @example Loading a lexer
  #   program = LexerKit::Format::LKT1.load("lexer.lkt1")
  #   program = LexerKit::Format::LKB1.load("lexer.lkb1")
  #
  # @example Saving a lexer
  #   LexerKit::Format::LKT1.save(program, path: "lexer.lkt1")
  #   LexerKit::Format::LKB1.save(program, path: "lexer.lkb1")
  module Format
  end
end
# frozen_string_literal: true

module LexerKit
  module IR
    # CompiledProgram is the complete compiled lexer, ready for execution.
    # It bundles instructions, DFA tables, string constants and metadata;
    # binary serialization is delegated to the Serializer class.
    #
    # Note: native lexing methods are mixed in by the Rust extension when loaded.
    class CompiledProgram
      attr_reader :instructions, :dfa_tables, :jump_tables, :constant_pool, :token_names, :mode_names, :keyword_tables, :version, :mode_offsets, :token_meta

      # @param instructions [Array<Instruction>] instruction list
      # @param dfa_tables [Array<DFATable>] DFA tables
      # @param jump_tables [Array<JumpTable>] jump tables
      # @param constant_pool [ConstantPool] string constants
      # @param modes [Hash<Symbol, Integer>] mode name → start instruction offset
      # @param token_names [Array<Symbol>] token ID → name mapping
      # @param mode_names [Array<Symbol>] mode ID → name mapping
      # @param keyword_tables [Array<KeywordTable>] keyword tables
      # @param token_meta [Hash<Integer, Hash>] token ID → metadata hash
      # @param version [Integer] user-defined version number
      def initialize(
        instructions:,
        dfa_tables: [],
        jump_tables: [],
        constant_pool: nil,
        modes: {},
        token_names: [],
        mode_names: [],
        keyword_tables: [],
        token_meta: {},
        version: 1
      )
        # All collections are frozen: a compiled program is immutable.
        @instructions = instructions.freeze
        @dfa_tables = dfa_tables.freeze
        @jump_tables = jump_tables.freeze
        @constant_pool = constant_pool || ConstantPool.new
        @mode_offsets = modes.freeze
        @token_names = token_names.freeze
        @mode_names = mode_names.freeze
        @keyword_tables = keyword_tables.freeze
        @token_meta = token_meta.freeze
        @version = version
      end

      # Kind of compiled artifact.
      # @return [Symbol]
      def kind = :program

      # Look up a token ID by name.
      # @param name [Symbol]
      # @return [Integer, nil]
      def token_id(name) = @token_names.index(name)

      # Look up a token name by ID.
      # @param id [Integer]
      # @return [Symbol, nil]
      def token_name(id) = @token_names[id]

      # All user-visible token names (reserved placeholder slots are hidden).
      # @return [Array<Symbol>]
      def tokens
        @token_names.reject { |sym| sym.to_s.start_with?("__RESERVED_") }
      end

      # All mode names (defensive copy).
      # @return [Array<Symbol>]
      def modes
        @mode_names.dup
      end

      # Look up a mode ID by name.
      # @param name [Symbol]
      # @return [Integer, nil]
      def mode_id(name) = @mode_names.index(name)

      # Start instruction offset for a mode.
      # @param name [Symbol]
      # @return [Integer, nil]
      def mode_offset(name) = @mode_offsets[name]

      # Low-level lexing with callback (for performance-critical code).
      # @param bytes [String] input bytes
      # @yield [Integer, Integer, Integer] token_id, start, length
      # @raise [NativeExtensionError] if the Rust extension is not loaded
      def lowlevel_each(bytes, &block)
        raw = bytes.b
        raise LexerKit::NativeExtensionError, "Rust extension not loaded" unless LexerKit.native?

        ensure_rust_native!
        lex_rust_native(raw, &block)
      end

      # Fast error-token check for tight lex loops.
      # @param tok [Integer] token ID
      # @return [Boolean]
      def error_token?(tok)
        tok == LexerKit::INVALID_TOKEN_ID
      end

      # Metadata attached to a token ID.
      # @param tok [Integer] token ID
      # @return [Hash] metadata hash (empty hash if no metadata)
      def token_meta_for(tok)
        @token_meta[tok] || {}
      end

      # Build a rich Token object on demand (e.g. for error reporting).
      # A Source is constructed here, so there is zero overhead unless called.
      # @param tok [Integer] token ID
      # @param start [Integer] start byte offset
      # @param len [Integer] length in bytes
      # @param input [String] original input string
      # @param filename [String, nil] optional filename for diagnostics
      # @return [Core::Token]
      def make_token(tok, start, len, input:, filename: nil)
        src = Core::Source.new(input, filename: filename)
        Core::Token.new(
          id: tok,
          name: token_name(tok),
          start: start,
          len: len,
          source: src,
          meta: @token_meta[tok]
        )
      end

      # Create a stream-based lexer with lookahead support, wrapped in a Runner.
      # @param input [String] input string
      # @param filename [String, nil] optional filename for diagnostics
      # @return [Runner]
      # @raise [NativeExtensionError] if the Rust extension is not loaded
      def stream(input, filename: nil)
        raise LexerKit::NativeExtensionError, "Rust extension not loaded" unless LexerKit.native?

        ensure_rust_native!
        Runner.new(self, create_rust_stream(input), filename: filename)
      end

      # Tokenize the whole input into an array of Token objects.
      # One Source instance is shared by every token for cheap line/col lookup.
      # @param input [String] input string
      # @param filename [String, nil] optional filename for diagnostics
      # @return [Array<Core::Token>]
      def tokenize(input, filename: nil)
        raw = input.b
        src = Core::Source.new(raw, filename: filename)

        result = []
        lowlevel_each(raw) do |tok_id, tok_start, tok_len|
          result << Core::Token.new(
            id: tok_id,
            name: token_name(tok_id),
            start: tok_start,
            len: tok_len,
            source: src,
            meta: @token_meta[tok_id]
          )
        end
        result
      end

      # Push this program into the native (Rust) runtime for fast lexing.
      # No-op when the extension is unavailable.
      # @return [self]
      def load_native!
        return self unless LexerKit.native?

        load_rust_native(to_native_data)
        self
      end

      # Flatten the program into the structure the Rust extension consumes.
      # @return [Hash] data for the Rust extension
      def to_native_data
        {
          instructions: @instructions.map(&:to_binary).join,
          dfa_tables: @dfa_tables.map(&:to_native_format),
          jump_tables: @jump_tables.map(&:to_native_format),
          keyword_tables: @keyword_tables.map(&:to_native_format),
          constant_pool: @constant_pool.entries,
          modes: @mode_offsets.map { |mode, offset| [mode.to_s, offset] }
        }
      end

      # Serialize to binary via Serializer.
      # @return [String]
      def to_binary
        Serializer.to_binary(self)
      end

      # Deserialize from binary via Serializer.
      # @param bytes [String]
      # @return [CompiledProgram]
      def self.from_binary(bytes)
        Serializer.from_binary(bytes)
      end

      def inspect
        "#<CompiledProgram v#{@version} instructions=#{@instructions.size} tokens=#{@token_names.size} native=#{LexerKit.native?}>"
      end

      private

      # Lazily push the program into the Rust runtime if not yet done.
      # @api private
      def ensure_rust_native!
        return if respond_to?(:rust_native_loaded?) && rust_native_loaded?

        load_rust_native(to_native_data)
      end
    end
  end
end