lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,107 @@
1
# frozen_string_literal: true

module LexerKit
  module IR
    # ConstantPool stores string constants (delimiters, keywords, etc.)
    #
    # Entries are held as frozen binary (ASCII-8BIT) strings. `add` interns
    # values so identical strings share one ID, while `add_uninterned`
    # always appends — so a pool may legitimately contain duplicate entries.
    class ConstantPool
      # Largest value representable by the u16 count/length fields of the
      # binary format produced by #to_binary.
      MAX_U16 = 0xFFFF

      attr_reader :entries

      def initialize
        @entries = []
        @index = {}
      end

      # Add a constant and return its ID (interned: duplicate values reuse
      # the first occurrence's ID).
      # @param value [String] constant value
      # @return [Integer] constant ID
      def add(value)
        value = value.b.freeze
        return @index[value] if @index.key?(value)

        id = @entries.size
        @entries << value
        @index[value] = id
        id
      end

      # Add a constant without interning (always appends a new entry,
      # even if an equal value already exists).
      # @param value [String] constant value
      # @return [Integer] constant ID
      def add_uninterned(value)
        value = value.b.freeze
        id = @entries.size
        @entries << value
        id
      end

      # Replace an existing constant by ID
      # @param id [Integer] constant ID
      # @param value [String] new value
      def replace(id, value)
        value = value.b.freeze
        old = @entries[id]
        @entries[id] = value
        # Drop the stale index entry only if it still points at this slot;
        # another slot may legitimately own the mapping for `old`.
        @index.delete(old) if old && @index[old] == id
        @index[value] = id unless @index.key?(value)
      end

      # Get constant by ID
      # @param id [Integer]
      # @return [String]
      def get(id)
        @entries[id]
      end

      # Number of constants
      # @return [Integer]
      def size
        @entries.size
      end

      # Encode to binary.
      # Format: count (u16), then for each entry: length (u16) + raw bytes.
      # @return [String]
      # @raise [ArgumentError] if the entry count or any entry length
      #   exceeds the u16 limit of the format (would silently corrupt
      #   the encoding otherwise; mirrors KeywordTable#to_binary)
      def to_binary
        if @entries.size > MAX_U16
          raise ArgumentError, "Too many constants (#{@entries.size}): maximum is 65535"
        end

        parts = []

        # Count (u16)
        parts << [@entries.size].pack("S>")

        # Entries: [length (u16), bytes...]
        @entries.each do |entry|
          if entry.bytesize > MAX_U16
            raise ArgumentError, "Constant too long (#{entry.bytesize} bytes): maximum is 65535 bytes"
          end

          parts << [entry.bytesize].pack("S>")
          parts << entry
        end

        parts.join
      end

      # Append a decoded entry, preserving its position/ID exactly.
      # Unlike #add this never collapses duplicate values (which would
      # shift the IDs of every subsequent entry); the first occurrence of
      # each value is still registered in the intern index so later #add
      # calls on a decoded pool dedupe correctly.
      # @api private
      # @param value [String]
      # @return [Integer] constant ID
      def append_decoded(value)
        value = value.b.freeze
        id = @entries.size
        @entries << value
        @index[value] = id unless @index.key?(value)
        id
      end

      # Decode from binary
      # @param bytes [String]
      # @return [Array(ConstantPool, Integer)] [pool, bytes_consumed]
      def self.from_binary(bytes)
        pos = 0

        count = bytes.byteslice(pos, 2).unpack1("S>")
        pos += 2

        pool = new
        count.times do
          len = bytes.byteslice(pos, 2).unpack1("S>")
          pos += 2

          value = bytes.byteslice(pos, len)
          # NOTE: must not intern here — interning would merge duplicate
          # entries and renumber every constant that follows them.
          pool.append_decoded(value)
          pos += len
        end

        [pool, pos]
      end

      def inspect
        "#<ConstantPool size=#{@entries.size}>"
      end
    end
  end
end
@@ -0,0 +1,125 @@
1
# frozen_string_literal: true

module LexerKit
  module IR
    # DFATable represents a compiled DFA for pattern matching.
    #
    # Byte-class compression keeps the table compact: every input byte is
    # first mapped to an equivalence class, and transitions are stored per
    # (state, class) pair instead of per (state, byte).
    class DFATable
      # State 0 indicates no valid transition (must match C DFA_DEAD_STATE)
      DEAD_STATE = 0
      # Token ID indicating non-accepting state (must match C DFA_NO_ACCEPT)
      NO_ACCEPT = 0xFFFF

      attr_reader :state_count, :class_count, :byte_class, :transitions, :accept_states

      # @param state_count [Integer] number of states
      # @param byte_class [Array<Integer>] 256-element array mapping bytes to classes
      # @param transitions [Array<Integer>] state × class → next_state
      # @param accept_states [Hash<Integer, Integer>] state → token_id for accepting states
      def initialize(state_count:, byte_class:, transitions:, accept_states:)
        @state_count = state_count
        @class_count = byte_class.max + 1
        @byte_class = byte_class.freeze
        @transitions = transitions.freeze
        @accept_states = accept_states.freeze
      end

      # Get next state for current state and input byte
      # @param state [Integer] current state
      # @param byte [Integer] input byte
      # @return [Integer] next state (0 = dead state)
      def transition(state, byte)
        row = state * @class_count
        @transitions[row + @byte_class[byte]]
      end

      # Check if state is accepting
      # @param state [Integer]
      # @return [Integer, nil] token ID if accepting, nil otherwise
      def accept(state)
        @accept_states.fetch(state, nil)
      end

      # Encode to binary
      # @return [String]
      def to_binary
        # Header: state_count (u16), class_count (u16)
        buffer = [@state_count, @class_count].pack("S>S>")

        # Byte class table (256 bytes)
        buffer << @byte_class.pack("C256")

        # Transitions (state_count × class_count × u16)
        buffer << @transitions.pack("S>*")

        # Accept states: count (u16), then [state (u16), token_id (u16)] pairs
        buffer << [@accept_states.size].pack("S>")
        @accept_states.each_pair do |st, tok|
          buffer << [st, tok].pack("S>S>")
        end

        buffer
      end

      # Decode from binary
      # @param bytes [String]
      # @return [Array(DFATable, Integer)] [table, bytes_consumed]
      def self.from_binary(bytes)
        states, classes = bytes.unpack("S>S>")
        cursor = 4

        class_map = bytes.byteslice(cursor, 256).unpack("C256")
        cursor += 256

        cell_count = states * classes
        trans = bytes.byteslice(cursor, cell_count * 2).unpack("S>*")
        cursor += cell_count * 2

        n_accept = bytes.byteslice(cursor, 2).unpack1("S>")
        cursor += 2

        accepts = {}
        n_accept.times do
          st, tok = bytes.byteslice(cursor, 4).unpack("S>S>")
          cursor += 4
          accepts[st] = tok
        end

        decoded = new(
          state_count: states,
          byte_class: class_map,
          transitions: trans,
          accept_states: accepts
        )
        [decoded, cursor]
      end

      def inspect
        "#<DFATable states=#{@state_count} classes=#{@class_count} accepts=#{@accept_states.size}>"
      end

      # Convert to format suitable for C native loading.
      # Builds a dense accept_tokens array for O(1) lookup in C.
      # @return [Hash] data for C extension
      def to_native_format
        # dense[state] = token_id, NO_ACCEPT for non-accepting states
        dense = Array.new(@state_count) { |st| @accept_states.fetch(st, NO_ACCEPT) }
        {
          state_count: @state_count,
          class_count: @class_count,
          byte_class: @byte_class.pack("C*"),
          transitions: @transitions.pack("S>*"),
          accept_tokens: dense.pack("S>*")
        }
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
# frozen_string_literal: true

module LexerKit
  module IR
    # Instruction represents a single VM instruction.
    # Each instruction pairs an opcode with an optional 24-bit argument.
    class Instruction
      attr_reader :opcode, :arg

      # @param opcode [Integer] opcode from Opcode module
      # @param arg [Integer] argument (interpretation depends on opcode)
      def initialize(opcode, arg = 0)
        @opcode = opcode
        @arg = arg
      end

      # Encode to binary (4 bytes: 1 opcode + 3 arg, big-endian)
      # @return [String] binary representation
      def to_binary
        fields = [@opcode, (@arg >> 16) & 0xFF, (@arg >> 8) & 0xFF, @arg & 0xFF]
        fields.pack("C4")
      end

      # Decode from binary
      # @param bytes [String] 4 bytes
      # @return [Instruction]
      def self.from_binary(bytes)
        op, hi, mid, lo = bytes.unpack("C4")
        new(op, (hi << 16) | (mid << 8) | lo)
      end

      def to_s
        "#{Opcode.name(@opcode)} #{@arg}"
      end

      def inspect
        "#<Instruction #{self}>"
      end

      def ==(other)
        return false unless other.is_a?(Instruction)

        @opcode == other.opcode && @arg == other.arg
      end
    end
  end
end
@@ -0,0 +1,94 @@
1
# frozen_string_literal: true

module LexerKit
  module IR
    # JumpTable for SWITCH_BYTE instruction.
    # Sparse mapping from input bytes to instruction offsets, with a
    # default offset for any byte not present in the mapping.
    class JumpTable
      attr_reader :entries, :default_offset

      # @param entries [Hash<Integer, Integer>] byte → offset mapping
      # @param default_offset [Integer] offset for unmatched bytes
      def initialize(entries:, default_offset:)
        @entries = entries.freeze
        @default_offset = default_offset
      end

      # Get offset for a byte
      # @param byte [Integer]
      # @return [Integer] instruction offset
      def lookup(byte)
        @entries.fetch(byte, @default_offset)
      end

      # Encode to binary
      # @return [String]
      def to_binary
        # Default offset (u24, big-endian)
        buffer = [
          (@default_offset >> 16) & 0xFF,
          (@default_offset >> 8) & 0xFF,
          @default_offset & 0xFF
        ].pack("C3")

        # Entry count (u16)
        buffer << [@entries.size].pack("S>")

        # Entries: [byte (u8), offset (u24)] pairs
        @entries.each_pair do |byte, target|
          buffer << [
            byte,
            (target >> 16) & 0xFF,
            (target >> 8) & 0xFF,
            target & 0xFF
          ].pack("C4")
        end

        buffer
      end

      # Decode from binary
      # @param bytes [String]
      # @return [Array(JumpTable, Integer)] [table, bytes_consumed]
      def self.from_binary(bytes)
        hi, mid, lo = bytes.byteslice(0, 3).unpack("C3")
        fallback = (hi << 16) | (mid << 8) | lo
        cursor = 3

        n_entries = bytes.byteslice(cursor, 2).unpack1("S>")
        cursor += 2

        mapping = {}
        n_entries.times do
          byte, o_hi, o_mid, o_lo = bytes.byteslice(cursor, 4).unpack("C4")
          mapping[byte] = (o_hi << 16) | (o_mid << 8) | o_lo
          cursor += 4
        end

        [new(entries: mapping, default_offset: fallback), cursor]
      end

      def inspect
        "#<JumpTable entries=#{@entries.size} default=#{@default_offset}>"
      end

      # Convert to format suitable for C native loading.
      # Builds a dense 256-entry lookup table for O(1) access.
      # @return [Hash] data for C extension
      def to_native_format
        # Dense lookup: every byte 0..255 → offset (default when unmapped)
        table = Array.new(256) { |b| @entries.fetch(b, @default_offset) }
        {
          lookup: table.pack("L>*"), # 256 * 4 = 1024 bytes
          default_offset: @default_offset # For EOF case
        }
      end
    end
  end
end
@@ -0,0 +1,168 @@
1
# frozen_string_literal: true

module LexerKit
  module IR
    # KeywordTable maps keyword strings to token IDs.
    # Used for efficient O(1) keyword lookup after identifier match.
    #
    # Keywords must be UTF-8 encoded strings, consistent with LexerKit's
    # token specification which requires UTF-8 encoding.
    class KeywordTable
      attr_reader :base_token_id, :keywords

      # @param base_token_id [Integer] identifier token ID (used when not a keyword)
      # @param keywords [Hash{String => Integer}] UTF-8 keyword string → token ID
      def initialize(base_token_id:, keywords:)
        validate_base_token_id(base_token_id)
        validate_keywords(keywords)

        @base_token_id = base_token_id
        @keywords = keywords.freeze
      end

      # Encode to binary format
      # Format:
      #   base_token_id: u16
      #   keyword_count: u16
      #   for each keyword:
      #     key_len: u16
      #     key: bytes
      #     token_id: u16
      # @return [String]
      # @raise [ArgumentError] if keyword count or any keyword length exceeds u16 limit
      def to_binary
        # Keyword count must fit in u16
        if @keywords.size > 0xFFFF
          raise ArgumentError, "Too many keywords (#{@keywords.size}): maximum is 65535"
        end

        chunks = [
          [@base_token_id].pack("S>"), # base token ID (u16)
          [@keywords.size].pack("S>")  # keyword count (u16)
        ]

        @keywords.each_pair do |keyword, token_id|
          raw = keyword.b

          # Each keyword length must fit in u16
          if raw.bytesize > 0xFFFF
            raise ArgumentError, "Keyword too long (#{raw.bytesize} bytes): maximum is 65535 bytes"
          end

          chunks << [raw.bytesize].pack("S>") << raw << [token_id].pack("S>")
        end

        chunks.join
      end

      # Convert to format suitable for C native loading
      # @return [Hash] data for C extension
      def to_native_format
        pairs = @keywords.map { |keyword, token_id| [keyword.b, token_id] }
        {
          base_token_id: @base_token_id,
          keywords: pairs
        }
      end

      # Decode from binary
      # @param bytes [String]
      # @return [Array(KeywordTable, Integer)] table and consumed bytes
      # @raise [LexerKit::InvalidBinaryError] if binary data is invalid
      def self.from_binary(bytes)
        if bytes.bytesize < 4
          raise LexerKit::InvalidBinaryError, "Binary data too short (expected at least 4 bytes, got #{bytes.bytesize})"
        end

        cursor = 0

        # Base token ID
        base_token_id = bytes.byteslice(cursor, 2)&.unpack1("S>")
        raise LexerKit::InvalidBinaryError, "Invalid header data" if base_token_id.nil?

        cursor += 2

        # Keyword count
        keyword_count = bytes.byteslice(cursor, 2)&.unpack1("S>")
        raise LexerKit::InvalidBinaryError, "Invalid header data" if keyword_count.nil?

        cursor += 2

        # Keyword entries
        table = {}
        keyword_count.times do
          if cursor + 2 > bytes.bytesize
            raise LexerKit::InvalidBinaryError, "Unexpected end of data while reading keyword entry"
          end

          key_len = bytes.byteslice(cursor, 2)&.unpack1("S>")
          raise LexerKit::InvalidBinaryError, "Invalid key length" if key_len.nil?

          cursor += 2

          if cursor + key_len + 2 > bytes.bytesize
            raise LexerKit::InvalidBinaryError, "Unexpected end of data while reading keyword"
          end

          key = bytes.byteslice(cursor, key_len).force_encoding(Encoding::UTF_8)
          cursor += key_len

          token_id = bytes.byteslice(cursor, 2)&.unpack1("S>")
          raise LexerKit::InvalidBinaryError, "Invalid token ID" if token_id.nil?

          cursor += 2

          table[key] = token_id
        end

        [new(base_token_id: base_token_id, keywords: table), cursor]
      rescue LexerKit::InvalidBinaryError
        raise
      rescue StandardError => e
        # Constructor validation failures etc. surface as format errors
        raise LexerKit::InvalidBinaryError, "Failed to parse binary data: #{e.message}"
      end

      private

      def validate_base_token_id(id)
        return if id.is_a?(Integer) && id >= 0 && id <= 0xFFFF

        raise ArgumentError, "base_token_id must be a 16-bit unsigned integer (0..65535), got: #{id.inspect}"
      end

      def validate_keywords(keywords)
        raise ArgumentError, "keywords must be a Hash, got: #{keywords.class}" unless keywords.is_a?(Hash)

        keywords.each_pair do |keyword, token_id|
          validate_keyword(keyword)
          validate_token_id(token_id)
        end
      end

      def validate_keyword(keyword)
        raise ArgumentError, "keyword must be a String, got: #{keyword.inspect}" unless keyword.is_a?(String)
        raise ArgumentError, "keyword must not be empty" if keyword.empty?

        acceptable = keyword.encoding == Encoding::UTF_8 || keyword.encoding == Encoding::US_ASCII
        raise ArgumentError, "keyword must be UTF-8 encoded, got: #{keyword.encoding}" unless acceptable
        raise ArgumentError, "keyword contains invalid UTF-8 byte sequence" unless keyword.valid_encoding?
      end

      def validate_token_id(token_id)
        return if token_id.is_a?(Integer) && token_id >= 0 && token_id <= 0xFFFF

        raise ArgumentError, "token_id must be a 16-bit unsigned integer (0..65535), got: #{token_id.inspect}"
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
# frozen_string_literal: true

module LexerKit
  module IR
    # Opcode definitions for the LexerKit VM.
    # Values are part of the binary format and must stay in sync with
    # the native (C/Rust) VM implementation.
    module Opcode
      # --- DFA operations ---
      DFA_RUN = 0x01 # Run DFA table, arg = table_id
      # Single-candidate regex fast path.
      # arg = (dfa_id << 14) | fail_target (10 + 14 bits).
      # Success: consume input, fall through; failure: jump to fail_target.
      DFA_RUN_IF_MATCH = 0x02

      # --- Delimiter/literal operations ---
      SCAN_UNTIL = 0x10 # Scan until delimiter, arg = const_id
      MATCH_LITERAL = 0x12 # Match fixed string, arg = const_id
      SCAN_UNTIL_ESCAPE = 0x13 # Scan until delimiter with escape, arg = config_id
      MATCH_RANGE = 0x14 # Match byte range, arg = (min << 8) | max
      # Literal match fused with its failure branch.
      # arg = (const_id << 14) | fail_target (10 + 14 bits).
      # Success: consume input, fall through; failure: jump to fail_target.
      MATCH_LITERAL_OR_JUMP = 0x16

      # --- Branch/control operations ---
      SWITCH_BYTE = 0x20 # Branch on next byte, arg = jump_table_id
      JUMP = 0x21 # Unconditional jump, arg = offset
      JUMP_IF_EOF = 0x24 # Jump if at EOF, arg = offset

      # --- Mode operations ---
      PUSH_MODE = 0x30 # Push mode to stack, arg = mode_id
      POP_MODE = 0x31 # Pop mode from stack

      # --- Token operations ---
      EMIT = 0x40 # Emit token, arg = token_id
      EMIT_SKIP = 0x41 # Skip (advance without emit)
      EMIT_ERROR = 0x42 # Emit error token, arg = error_id
      MARK = 0x43 # Mark current position
      # EMIT fused with JUMP for the common emit-then-loop pattern.
      # arg = (token_id << 14) | jump_target (10 + 14 bits).
      EMIT_AND_JUMP = 0x44
      KEYWORD_LOOKUP = 0x45 # Keyword lookup, arg = keyword_table_id
      LITERAL_TRIE_RUN = 0x46 # Run literal trie, arg = const_id
      CLEAR_BEST = 0x49 # Clear best match tracking
      COMMIT_BEST = 0x4C # Commit best match, arg = default offset
      # Regex-candidate match with embedded action target.
      # arg = (order << 14) | action_ip (10 + 14 bits).
      SET_MATCH = 0x4D
      # CLEAR_BEST + LITERAL_TRIE_RUN + COMMIT_BEST fused for
      # literal-only matching.
      # arg = (const_id << 14) | fail_target (10 + 14 bits).
      # Match: jump to best match action IP; no match: jump to fail_target.
      LITERAL_TRIE_COMMIT = 0x4E
      # EMIT_SKIP fused with JUMP for skipped tokens.
      # arg = jump_target (24 bits).
      EMIT_SKIP_AND_JUMP = 0x4F

      # --- Special ---
      HALT = 0xFF # Stop execution

      # Opcode → mnemonic, used by the disassembler and Instruction#to_s.
      NAMES = {
        DFA_RUN => "DFA_RUN",
        DFA_RUN_IF_MATCH => "DFA_RUN_IF_MATCH",
        SCAN_UNTIL => "SCAN_UNTIL",
        MATCH_LITERAL => "MATCH_LITERAL",
        SCAN_UNTIL_ESCAPE => "SCAN_UNTIL_ESCAPE",
        MATCH_RANGE => "MATCH_RANGE",
        MATCH_LITERAL_OR_JUMP => "MATCH_LITERAL_OR_JUMP",
        SWITCH_BYTE => "SWITCH_BYTE",
        JUMP => "JUMP",
        JUMP_IF_EOF => "JUMP_IF_EOF",
        PUSH_MODE => "PUSH_MODE",
        POP_MODE => "POP_MODE",
        EMIT => "EMIT",
        EMIT_SKIP => "EMIT_SKIP",
        EMIT_ERROR => "EMIT_ERROR",
        MARK => "MARK",
        EMIT_AND_JUMP => "EMIT_AND_JUMP",
        KEYWORD_LOOKUP => "KEYWORD_LOOKUP",
        LITERAL_TRIE_RUN => "LITERAL_TRIE_RUN",
        CLEAR_BEST => "CLEAR_BEST",
        COMMIT_BEST => "COMMIT_BEST",
        SET_MATCH => "SET_MATCH",
        LITERAL_TRIE_COMMIT => "LITERAL_TRIE_COMMIT",
        EMIT_SKIP_AND_JUMP => "EMIT_SKIP_AND_JUMP",
        HALT => "HALT"
      }.freeze

      # Mnemonic for an opcode; falls back to a hex placeholder for
      # opcodes that have no registered name.
      # @param opcode [Integer]
      # @return [String]
      def self.name(opcode)
        NAMES.fetch(opcode) { "UNKNOWN(0x#{opcode.to_s(16)})" }
      end
    end
  end
end