lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,249 @@
1
# frozen_string_literal: true

module LexerKit
  module IR
    # Serializer handles binary encoding/decoding of CompiledProgram.
    # Extracted from CompiledProgram to separate serialization concerns.
    #
    # Layout (all integers big-endian):
    #   magic "LKT1" (4 bytes) | format version (u16) | user version (u32) |
    #   token names | mode names | mode name→offset map | constant pool |
    #   DFA tables | jump tables | keyword tables | token metadata |
    #   instruction count (u32) followed by fixed 4-byte instructions.
    class Serializer
      MAGIC = "LKT1"
      FORMAT_VERSION = 3 # v3: added token_meta

      # Encode a CompiledProgram to binary format.
      # @param program [CompiledProgram]
      # @return [String] binary data
      def self.to_binary(program)
        new.encode(program)
      end

      # Decode binary data to a CompiledProgram.
      # @param bytes [String] binary data
      # @return [CompiledProgram]
      # @raise [ArgumentError] on bad magic, unsupported format version,
      #   or truncated input
      def self.from_binary(bytes)
        new.decode(bytes)
      end

      # Encode CompiledProgram to binary.
      # @param program [CompiledProgram]
      # @return [String]
      def encode(program)
        parts = []

        # Magic (4 bytes)
        parts << MAGIC

        # Format version (u16)
        parts << [FORMAT_VERSION].pack("S>")

        # User version (u32)
        parts << [program.version].pack("L>")

        # Token names
        encode_string_array(parts, program.token_names)

        # Mode names
        encode_string_array(parts, program.mode_names)

        # Modes (name → offset mapping)
        modes = program.mode_offsets
        parts << [modes.size].pack("S>")
        modes.each do |name, offset|
          name_bytes = name.to_s.b
          parts << [name_bytes.bytesize].pack("S>")
          parts << name_bytes
          parts << [offset].pack("L>")
        end

        # Constant pool (length-prefixed opaque blob)
        pool_binary = program.constant_pool.to_binary
        parts << [pool_binary.bytesize].pack("L>")
        parts << pool_binary

        # DFA tables
        encode_table_array(parts, program.dfa_tables)

        # Jump tables
        encode_table_array(parts, program.jump_tables)

        # Keyword tables
        encode_table_array(parts, program.keyword_tables)

        # Token metadata
        encode_token_meta(parts, program.token_meta)

        # Instructions (each serializes to a fixed 4-byte word)
        instructions = program.instructions
        parts << [instructions.size].pack("L>")
        instructions.each do |instr|
          parts << instr.to_binary
        end

        parts.join
      end

      # Decode binary data to CompiledProgram.
      # @param bytes [String]
      # @return [CompiledProgram]
      # @raise [ArgumentError] on bad magic, unsupported format version,
      #   or truncated input
      def decode(bytes)
        @bytes = bytes
        @pos = 0

        # Magic
        magic = read_bytes(4)
        raise ArgumentError, "invalid magic: #{magic.inspect}" unless magic == MAGIC

        # Format version
        format_version = read_uint16
        raise ArgumentError, "unsupported format version: #{format_version}" unless format_version == FORMAT_VERSION

        # User version
        version = read_uint32

        # Token names
        token_names = decode_symbol_array

        # Mode names
        mode_names = decode_symbol_array

        # Modes
        mode_count = read_uint16
        modes = {}
        mode_count.times do
          name = read_length_prefixed_string.to_sym
          offset = read_uint32
          modes[name] = offset
        end

        # Constant pool
        pool_len = read_uint32
        constant_pool, = ConstantPool.from_binary(read_bytes(pool_len))

        # DFA tables
        dfa_tables = decode_table_array(DFATable)

        # Jump tables
        jump_tables = decode_table_array(JumpTable)

        # Keyword tables
        keyword_tables = decode_table_array(KeywordTable)

        # Token metadata.
        # NOTE: format_version is always FORMAT_VERSION (3) here because of
        # the check above; the guard is kept so pre-v3 formats can be
        # re-enabled later without touching this line.
        token_meta = format_version >= 3 ? decode_token_meta : {}

        # Instructions
        instr_count = read_uint32
        instructions = []
        instr_count.times do
          instructions << Instruction.from_binary(read_bytes(4))
        end

        CompiledProgram.new(
          instructions: instructions,
          dfa_tables: dfa_tables,
          jump_tables: jump_tables,
          constant_pool: constant_pool,
          modes: modes,
          token_names: token_names,
          mode_names: mode_names,
          keyword_tables: keyword_tables,
          token_meta: token_meta,
          version: version
        )
      end

      private

      # Encode an array of symbol names as length-prefixed strings.
      def encode_string_array(parts, names)
        parts << [names.size].pack("S>")
        names.each do |name|
          name_bytes = name.to_s.b
          parts << [name_bytes.bytesize].pack("S>")
          parts << name_bytes
        end
      end

      # Encode an array of tables (DFA, Jump, Keyword) as length-prefixed blobs.
      def encode_table_array(parts, tables)
        parts << [tables.size].pack("S>")
        tables.each do |table|
          table_binary = table.to_binary
          parts << [table_binary.bytesize].pack("L>")
          parts << table_binary
        end
      end

      # Decode an array of symbols.
      def decode_symbol_array
        count = read_uint16
        symbols = []
        count.times do
          symbols << read_length_prefixed_string.to_sym
        end
        symbols
      end

      # Decode an array of tables.
      def decode_table_array(table_class)
        count = read_uint16
        tables = []
        count.times do
          table_len = read_uint32
          table, = table_class.from_binary(read_bytes(table_len))
          tables << table
        end
        tables
      end

      # Read raw bytes, validating that enough input remains.
      # Previously a truncated buffer produced nil/short slices that crashed
      # later with NoMethodError inside unpack1; now it fails fast.
      # @raise [ArgumentError] if fewer than +len+ bytes remain
      def read_bytes(len)
        data = @bytes.byteslice(@pos, len)
        if data.nil? || data.bytesize < len
          raise ArgumentError, "truncated data: expected #{len} bytes at offset #{@pos}"
        end

        @pos += len
        data
      end

      # Read unsigned 16-bit big-endian.
      def read_uint16
        read_bytes(2).unpack1("S>")
      end

      # Read unsigned 32-bit big-endian.
      def read_uint32
        read_bytes(4).unpack1("L>")
      end

      # Read length-prefixed string (u16 length + bytes).
      def read_length_prefixed_string
        len = read_uint16
        read_bytes(len)
      end

      # Encode token metadata hash.
      # Format: [count: u16] [token_id: u16, json_len: u16, json: bytes]×n
      def encode_token_meta(parts, token_meta)
        require "json"
        parts << [token_meta.size].pack("S>")
        token_meta.each do |token_id, meta|
          json = JSON.generate(meta)
          json_bytes = json.b
          parts << [token_id, json_bytes.bytesize].pack("S>S>")
          parts << json_bytes
        end
      end

      # Decode token metadata hash.
      def decode_token_meta
        require "json"
        count = read_uint16
        token_meta = {}
        count.times do
          token_id = read_uint16
          json_len = read_uint16
          json_bytes = read_bytes(json_len)
          meta = JSON.parse(json_bytes, symbolize_names: true)
          token_meta[token_id] = meta
        end
        token_meta
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
# frozen_string_literal: true

# IR components: opcodes, instructions, the table types, serialization,
# and the compiled program that is assembled from them.
require_relative "ir/opcode"
require_relative "ir/instruction"
require_relative "ir/dfa_table"
require_relative "ir/jump_table"
require_relative "ir/constant_pool"
require_relative "ir/keyword_table"
require_relative "ir/serializer"
require_relative "ir/compiled_program"

module LexerKit
  # IR module contains intermediate representation types for the lexer VM.
  module IR
  end
end
@@ -0,0 +1,114 @@
1
# frozen_string_literal: true

module LexerKit
  # Runner coordinates program, stream, and source for token processing.
  # It exposes the same surface as LexStream (duck-typing compatible) and
  # adds higher-level helpers that build full Core::Token objects.
  #
  # Created via CompiledProgram#stream(input, filename:).
  class Runner
    attr_reader :program, :stream

    # @api private
    # Use CompiledProgram#stream instead
    def initialize(program, lex_stream, filename: nil)
      @program = program
      @stream = lex_stream
      @filename = filename
      @source = nil # built lazily by #source_object
    end

    # --- plain delegation to the underlying stream --------------------

    def eof?
      @stream.eof?
    end

    def token_id
      @stream.token_id
    end

    def start
      @stream.start
    end

    def len
      @stream.len
    end

    def input
      @stream.input
    end

    # Advance to the next token; returns self so calls can be chained.
    def advance
      @stream.advance
      self
    end

    # Resolve a token id (defaults to the current token) to its name,
    # looked up through the program for correctness.
    def token_name(id = nil)
      resolved = id || token_id
      return nil if resolved.nil? || resolved < 0

      @program.token_name(resolved)
    end

    # Raw text of the current token, or nil at EOF / when position data
    # is unavailable.
    def text
      return nil if eof?

      from = start
      size = len
      return nil if from.nil? || size.nil?

      input.byteslice(from, size)
    end

    # True when the given (or current) token is the INVALID error token.
    # Uses the module constant, so no @program lookup is needed.
    def error?(id = nil)
      resolved = id || token_id
      return false if resolved.nil? || resolved < 0

      resolved == LexerKit::INVALID_TOKEN_ID
    end

    # Build a Core::Token for the current position (Source created lazily).
    def make_token
      return nil if eof?

      tid = token_id
      from = start
      size = len
      return nil if tid.nil? || from.nil? || size.nil?

      Core::Token.new(
        id: tid,
        name: @program.token_name(tid),
        start: from,
        len: size,
        source: source_object,
        meta: @program.token_meta[tid]
      )
    end

    # [line, column] for the current token's start position.
    def line_col
      return nil if eof?

      from = start
      return nil if from.nil?

      source_object.line_col(from)
    end

    # --- peek operations (delegate to stream) -------------------------

    def peek_token_id(n = 1)
      @stream.peek_token_id(n)
    end

    def peek_start(n = 1)
      @stream.peek_start(n)
    end

    def peek_len(n = 1)
      @stream.peek_len(n)
    end

    # Name of the n-th upcoming token, or nil when unavailable.
    def peek_token_name(n = 1)
      id = peek_token_id(n)
      return nil if id.nil? || id < 0

      @program.token_name(id)
    end

    # Text of the n-th upcoming token, or nil when unavailable.
    def peek_text(n = 1)
      from = peek_start(n)
      size = peek_len(n)
      return nil if from.nil? || size.nil?

      input.byteslice(from, size)
    end

    private

    # Lazily build (and cache) the Core::Source for the current input.
    def source_object
      @source ||= Core::Source.new(input, filename: @filename)
    end
  end
end
@@ -0,0 +1,170 @@
1
# frozen_string_literal: true

module LexerKit
  # Builds and encodes trie data structures for literal matching.
  #
  # ## Value Constraints
  #
  # - order: 0..MAX_ORDER (ORDER_UNSET is reserved for unset values)
  # - action_ip: 0..0xFFFFFFFF (ACTION_UNSET used for non-terminal nodes)
  #
  # ## Label Resolution
  #
  # labels hash must have Symbol keys only (Integer keys are ambiguous with direct offsets).
  # Resolution order:
  # 1. If action_ref is in labels hash, use mapped value
  # 2. If action_ref is Symbol not in labels, raise ArgumentError
  # 3. If action_ref is Integer, use directly as offset
  #
  # ## Binary layout (big-endian)
  #
  #   header: node count (u32), edge count (u32)
  #   nodes:  edge_start (u32), edge_count (u16), order (u16), action (u32)
  #   edges:  byte (u8), child node index (u32)
  class Trie
    ORDER_UNSET = 0xFFFF
    MAX_ORDER = 0xFFFE
    ACTION_UNSET = 0xFFFFFFFF
    MAX_EDGE_COUNT = 0xFFFF

    # @param entries [Array<Array(String, Integer, Object)>]
    #   (literal, order, action_ref) triples
    # @raise [ArgumentError] for invalid literals or orders
    def initialize(entries)
      @trie = build_trie(entries)
    end

    # Encode the trie to its binary representation.
    # @param labels [Hash{Symbol=>Integer}, nil] resolution table for
    #   symbolic action_refs
    # @return [String] ASCII-8BIT binary blob
    def encode(labels: nil)
      encode_trie(@trie, labels || {})
    end

    # Convenience constructor: the entry index doubles as priority order.
    # @param entries [Array<Array(String, Integer)>] (literal, action_ip) pairs
    # @return [String] encoded trie
    def self.build(entries)
      trie_entries = entries.map.with_index do |(literal, action_ip), index|
        [literal, index, action_ip]
      end
      new(trie_entries).encode
    end

    Node = Struct.new(:edges, :order, :action_ref, keyword_init: true) do
      def initialize(edges: {}, order: nil, action_ref: nil)
        super
      end
    end

    NodeEntry = Struct.new(:edge_start, :edge_count, :order, :action_ref, keyword_init: true)
    TrieData = Struct.new(:nodes, :edges, keyword_init: true)

    private

    # Insert every literal byte-by-byte; for duplicate literals the lowest
    # order (highest priority) wins. Returns flattened node/edge arrays.
    def build_trie(entries)
      nodes = [Node.new]

      entries.each do |literal, order, action_ref|
        validate_literal(literal)
        validate_order(order)

        bytes = literal.b
        node_idx = 0

        bytes.each_byte do |byte|
          node = nodes[node_idx]
          child = node.edges[byte]

          unless child
            child = nodes.size
            node.edges[byte] = child
            nodes << Node.new
          end

          node_idx = child
        end

        node = nodes[node_idx]
        if node.order.nil? || order < node.order
          node.order = order
          node.action_ref = action_ref
        end
      end

      # NOTE: O(E log E) per node due to Hash#sort, but acceptable for typical
      # literal counts. For very large trie (>10k literals), consider optimization.
      edge_list = []
      node_entries = nodes.map do |node|
        edge_start = edge_list.size
        edge_count = node.edges.size

        validate_edge_count(edge_count)

        node.edges.sort.each { |byte, child| edge_list << [byte, child] }
        NodeEntry.new(
          edge_start: edge_start,
          edge_count: edge_count,
          order: node.order,
          action_ref: node.action_ref
        )
      end

      TrieData.new(nodes: node_entries, edges: edge_list)
    end

    # Serialize nodes then edges, resolving symbolic action_refs via labels.
    def encode_trie(trie, labels)
      validate_labels(labels)

      nodes = trie.nodes
      edges = trie.edges

      parts = []
      parts << [nodes.size].pack("L>")
      parts << [edges.size].pack("L>")

      nodes.each do |node|
        order = node.order || ORDER_UNSET

        action = if node.action_ref.nil?
                   ACTION_UNSET
                 elsif node.action_ref.is_a?(Symbol)
                   labels.fetch(node.action_ref) do
                     raise ArgumentError, "Unknown label: #{node.action_ref}"
                   end
                 else
                   node.action_ref
                 end

        validate_action_ip(action)

        parts << [node.edge_start, node.edge_count, order, action].pack("L>S>S>L>")
      end

      edges.each do |byte, child_idx|
        parts << [byte, child_idx].pack("CL>")
      end

      parts.join.force_encoding(Encoding::ASCII_8BIT)
    end

    # Literal must be a non-empty String. Previously a non-String literal
    # (e.g. an Integer) raised NoMethodError from #empty?; now it is rejected
    # with the advertised ArgumentError.
    def validate_literal(literal)
      return if literal.is_a?(String) && !literal.empty?

      raise ArgumentError, "literal must be a non-empty string, got: #{literal.inspect}"
    end

    def validate_order(order)
      return if order.is_a?(Integer) && order >= 0 && order <= MAX_ORDER

      raise ArgumentError, "order must be in range 0..#{format('0x%X', MAX_ORDER)} (#{format('0x%X', ORDER_UNSET)} reserved), got: #{order.inspect}"
    end

    def validate_edge_count(count)
      return if count.between?(0, MAX_EDGE_COUNT)

      raise ArgumentError, "edge_count must fit in 16-bit unsigned integer, got: #{count}"
    end

    def validate_action_ip(action_ip)
      # ACTION_UNSET is allowed as the reserved "unset" value for non-terminal nodes
      return if action_ip.is_a?(Integer) && action_ip >= 0 && action_ip <= ACTION_UNSET

      raise ArgumentError, "action_ip must be in range 0..0x#{ACTION_UNSET.to_s(16).upcase} (32-bit unsigned), got: #{action_ip.inspect}"
    end

    def validate_labels(labels)
      return if labels.empty?

      # Check for Integer keys (ambiguous with direct offsets)
      integer_keys = labels.keys.select { |k| k.is_a?(Integer) }
      return if integer_keys.empty?

      raise ArgumentError, "labels must have Symbol keys only (Integer keys are ambiguous), found Integer keys: #{integer_keys.inspect}"
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

module LexerKit
  # Gem version string (Semantic Versioning).
  VERSION = "0.5.0"
end
data/lib/lexer_kit.rb ADDED
@@ -0,0 +1,155 @@
1
# frozen_string_literal: true

require_relative "lexer_kit/version"
require_relative "lexer_kit/core" # Load first (errors depend on Core::Source, Core::Diagnostic)
require_relative "lexer_kit/errors" # Load after core
require_relative "lexer_kit/ir"
require_relative "lexer_kit/dfa"
require_relative "lexer_kit/builder"
require_relative "lexer_kit/format"

# Load Trie class (for testing and internal use)
require_relative "lexer_kit/trie"

# Try to load Rust extension
begin
  require "lexer_kit_rust/lexer_kit_rust"
  LEXER_KIT_NATIVE = true

  # Runner wraps LexStream (defined in Rust) with high-level methods
  require_relative "lexer_kit/runner"
rescue LoadError
  # Native extension unavailable; LexerKit.native? reports false.
  LEXER_KIT_NATIVE = false
end

module LexerKit
  # Reserved token IDs
  # 0: Internal sentinel (never emitted)
  # 1: INVALID (error token)
  # 2-7: Reserved for future use
  # 8+: User-defined tokens
  #
  # Note: The VM only emits tokens with valid IDs:
  # - INVALID_TOKEN_ID (1) for error tokens
  # - User tokens (>= FIRST_USER_TOKEN_ID)
  # Tokens with sentinel/reserved IDs (0, 2-7) or zero length are filtered out.
  RESERVED_TOKEN_ID = 0
  INVALID_TOKEN_ID = 1
  FIRST_USER_TOKEN_ID = 8

  # Mixin for objects that can be lowered to a regex AST.
  module RegexAstProvider
    # @return [Object] regex AST node; implementers must override.
    def to_ast
      raise NotImplementedError
    end

    # Wrap the AST in a (case-sensitive) Regex value object.
    def to_regex
      DFA::RegexAST::Regex.new(ast: to_ast, case_insensitive: false)
    end
  end

  # Create a UTF-8 range pattern for the LexerKit regex engine.
  #
  # Accepted inputs:
  # - "あ" (single character)
  # - "あ".."ん" (Range, inclusive)
  # - Integer codepoint ranges (e.g., 0x3041..0x3096)
  #
  # Notes:
  # - Exclusive ranges (e.g., "a"..."z") are not supported.
  # - Multi-character strings like "abc" are not supported.
  # - Range endpoints must be single characters or integers.
  def self.utf8_range(*ranges)
    require_relative "lexer_kit/dfa/utf8_range_pattern" # deliberate lazy load
    parsed = ranges.map { |range| parse_range_codepoints(range) }
    DFA::Utf8RangePattern.new(parsed)
  end

  # Normalize one range argument to an inclusive [start, end] codepoint pair.
  # @param range [Range, String, #to_s]
  # @return [Array(Integer, Integer)]
  # @raise [ArgumentError] for exclusive ranges or multi-character strings
  def self.parse_range_codepoints(range)
    if range.is_a?(Range)
      raise ArgumentError, "utf8_range does not support exclusive ranges" if range.exclude_end?

      start_cp = range.begin.is_a?(Integer) ? range.begin : single_char_ord(range.begin.to_s)
      end_cp = range.end.is_a?(Integer) ? range.end : single_char_ord(range.end.to_s)
      return [start_cp, end_cp]
    end

    str = range.to_s

    cp = single_char_ord(str)
    [cp, cp]
  end

  # Codepoint of a single-character string.
  # @raise [ArgumentError] unless str is exactly one character
  def self.single_char_ord(str)
    raise ArgumentError, "utf8_range expects a single character, got #{str.inspect}" unless str.length == 1

    str.ord
  end

  private_class_method :parse_range_codepoints
  private_class_method :single_char_ord

  # Check if native Rust extension is available
  # @return [Boolean]
  def self.native?
    LEXER_KIT_NATIVE
  end

  # Build a lexer from DSL
  # @yield [Builder] DSL block
  # @return [Builder] configured builder
  def self.build(&block)
    Builder.new.tap { |b| b.instance_eval(&block) if block }
  end

  # Load a compiled lexer from .lkt1 or .lkb1 file
  #
  # @param path [String] path to lexer file (relative or absolute)
  # @return [IR::CompiledProgram] compiled lexer program
  # @raise [ArgumentError] if file not found or invalid extension
  #
  # @example Load from relative path
  #   LexerKit.load_lexer("lexers/json.lkt1")
  #   LexerKit.load_lexer(File.expand_path("../data/json.lkt1", __dir__))
  #
  # @example Load from absolute path
  #   LexerKit.load_lexer("/path/to/json.lkt1")
  def self.load_lexer(path)
    # Expand relative/absolute paths from current directory
    path = File.expand_path(path)

    raise ArgumentError, "Lexer not found: #{path}" unless File.exist?(path)

    if path.end_with?(".lkt1")
      Format::LKT1.load(path).program
    elsif path.end_with?(".lkb1")
      Format::LKB1.load(path).program
    else
      raise ArgumentError, "Expected .lkt1 or .lkb1 file: #{path}"
    end
  end

  # Load a builder from DSL source file
  #
  # @param path [String] path to DSL source file (relative or absolute)
  # @return [Builder] builder instance
  # @raise [ArgumentError] if file doesn't return Builder instance
  #
  # @example Load from relative path
  #   LexerKit.load_builder("examples/languages/json.rb")
  #
  # @example Load from absolute path
  #   LexerKit.load_builder("/path/to/json.rb")
  def self.load_builder(path)
    # Expand relative/absolute paths from current directory
    path = File.expand_path(path)

    raise ArgumentError, "Builder source not found: #{path}" unless File.exist?(path)

    content = File.read(path)
    # NOTE(review): eval executes arbitrary Ruby read from `path` —
    # only load trusted DSL files.
    result = eval(content, TOPLEVEL_BINDING, path) # rubocop:disable Security/Eval

    return result if result.is_a?(Builder)

    raise ArgumentError, "DSL file must return LexerKit::Builder instance"
  end
end