lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # ConstantPool stores string constants (delimiters, keywords, etc.)
    # used by the compiled lexer program. Values are held as frozen
    # binary (ASCII-8BIT) strings and addressed by integer ID.
    class ConstantPool
      # Maximum count / entry byte length representable in the u16 binary format.
      MAX_U16 = 0xFFFF

      attr_reader :entries

      def initialize
        @entries = [] # id -> frozen binary string
        @index = {}   # value -> first id (interning table)
      end

      # Add a constant, interning duplicates, and return its ID.
      # @param value [String] constant value
      # @return [Integer] constant ID
      def add(value)
        value = value.b.freeze
        return @index[value] if @index.key?(value)

        id = @entries.size
        @entries << value
        @index[value] = id
        id
      end

      # Add a constant without deduplication: a fresh slot is always
      # appended, even when an equal value already exists. The first
      # occurrence is still registered in the interning index so that
      # later #add calls resolve to it instead of creating a third copy.
      # @param value [String] constant value
      # @return [Integer] constant ID
      def add_uninterned(value)
        value = value.b.freeze
        id = @entries.size
        @entries << value
        @index[value] = id unless @index.key?(value)
        id
      end

      # Replace an existing constant by ID.
      # @param id [Integer] constant ID
      # @param value [String] new value
      def replace(id, value)
        value = value.b.freeze
        old = @entries[id]
        @entries[id] = value
        # Drop the stale index entry only if it pointed at this slot.
        if old && @index[old] == id
          @index.delete(old)
        end
        @index[value] = id unless @index.key?(value)
      end

      # Get constant by ID.
      # @param id [Integer]
      # @return [String, nil]
      def get(id)
        @entries[id]
      end

      # Number of constants.
      # @return [Integer]
      def size
        @entries.size
      end

      # Encode to binary: count (u16), then [length (u16), bytes] per entry.
      # @return [String]
      # @raise [ArgumentError] if the pool or any entry exceeds u16 limits
      #   (previously these were silently truncated by pack("S>"))
      def to_binary
        if @entries.size > MAX_U16
          raise ArgumentError, "Too many constants (#{@entries.size}): maximum is 65535"
        end

        parts = [[@entries.size].pack("S>")]

        @entries.each do |entry|
          if entry.bytesize > MAX_U16
            raise ArgumentError, "Constant too long (#{entry.bytesize} bytes): maximum is 65535 bytes"
          end

          parts << [entry.bytesize].pack("S>")
          parts << entry
        end

        parts.join
      end

      # Decode from binary. Entries are appended without interning so that
      # duplicate values (created via #add_uninterned) keep their original
      # IDs across an encode/decode round-trip. (Decoding via #add would
      # collapse duplicates and shift every subsequent ID.)
      # @param bytes [String]
      # @return [Array(ConstantPool, Integer)] [pool, bytes_consumed]
      def self.from_binary(bytes)
        pos = 0

        count = bytes.byteslice(pos, 2).unpack1("S>")
        pos += 2

        pool = new
        count.times do
          len = bytes.byteslice(pos, 2).unpack1("S>")
          pos += 2

          pool.add_uninterned(bytes.byteslice(pos, len))
          pos += len
        end

        [pool, pos]
      end

      def inspect
        "#<ConstantPool size=#{@entries.size}>"
      end
    end
  end
end
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # DFATable is a compiled DFA for pattern matching, stored with byte
    # class compression: each of the 256 input bytes is mapped to a small
    # equivalence class, and the transition table is indexed by class.
    class DFATable
      # State 0 indicates no valid transition (must match C DFA_DEAD_STATE)
      DEAD_STATE = 0
      # Token ID indicating non-accepting state (must match C DFA_NO_ACCEPT)
      NO_ACCEPT = 0xFFFF

      attr_reader :state_count, :class_count, :byte_class, :transitions, :accept_states

      # @param state_count [Integer] number of states
      # @param byte_class [Array<Integer>] 256-element array mapping bytes to classes
      # @param transitions [Array<Integer>] row-major state × class → next_state
      # @param accept_states [Hash<Integer, Integer>] state → token_id for accepting states
      def initialize(state_count:, byte_class:, transitions:, accept_states:)
        @state_count = state_count
        @byte_class = byte_class.freeze
        # Classes are numbered densely from 0, so max + 1 is the class count.
        @class_count = byte_class.max + 1
        @transitions = transitions.freeze
        @accept_states = accept_states.freeze
      end

      # Next state for the given state and raw input byte.
      # @param state [Integer] current state
      # @param byte [Integer] input byte (0..255)
      # @return [Integer] next state (DEAD_STATE when there is no transition)
      def transition(state, byte)
        row = state * @class_count
        @transitions[row + @byte_class[byte]]
      end

      # Token ID if the state is accepting, nil otherwise.
      # @param state [Integer]
      # @return [Integer, nil]
      def accept(state)
        @accept_states[state]
      end

      # Encode to binary:
      # header (state_count u16, class_count u16), 256-byte class map,
      # transition cells (u16 each), accept count (u16), then
      # [state u16, token_id u16] pairs.
      # @return [String]
      def to_binary
        out = [@state_count, @class_count].pack("S>S>")
        out << @byte_class.pack("C256")
        out << @transitions.pack("S>*")
        out << [@accept_states.size].pack("S>")
        @accept_states.each { |state, token_id| out << [state, token_id].pack("S>S>") }
        out
      end

      # Decode from binary.
      # @param bytes [String]
      # @return [Array(DFATable, Integer)] [table, bytes_consumed]
      def self.from_binary(bytes)
        n_states, n_classes = bytes.byteslice(0, 4).unpack("S>S>")
        cursor = 4

        classes = bytes.byteslice(cursor, 256).unpack("C256")
        cursor += 256

        cell_count = n_states * n_classes
        trans = bytes.byteslice(cursor, cell_count * 2).unpack("S>*")
        cursor += cell_count * 2

        n_accepts = bytes.byteslice(cursor, 2).unpack1("S>")
        cursor += 2

        accepts = {}
        n_accepts.times do
          state, token_id = bytes.byteslice(cursor, 4).unpack("S>S>")
          accepts[state] = token_id
          cursor += 4
        end

        table = new(
          state_count: n_states,
          byte_class: classes,
          transitions: trans,
          accept_states: accepts
        )

        [table, cursor]
      end

      def inspect
        "#<DFATable states=#{@state_count} classes=#{@class_count} accepts=#{@accept_states.size}>"
      end

      # Convert to a format suitable for C native loading: the sparse
      # accept hash becomes a dense per-state array for O(1) lookup.
      # @return [Hash] data for C extension
      def to_native_format
        dense = Array.new(@state_count, NO_ACCEPT)
        @accept_states.each { |state, token_id| dense[state] = token_id }
        {
          state_count: @state_count,
          class_count: @class_count,
          byte_class: @byte_class.pack("C*"),
          transitions: @transitions.pack("S>*"),
          accept_tokens: dense.pack("S>*")
        }
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # Instruction represents a single VM instruction.
    # Each instruction has an opcode and an optional 24-bit argument.
    class Instruction
      attr_reader :opcode, :arg

      # @param opcode [Integer] opcode from Opcode module
      # @param arg [Integer] argument (interpretation depends on opcode)
      def initialize(opcode, arg = 0)
        @opcode = opcode
        @arg = arg
      end

      # Encode to binary (4 bytes: 1 opcode + 3 arg, big-endian).
      # NOTE: arg is masked to 24 bits; larger values are truncated.
      # @return [String] binary representation
      def to_binary
        [
          @opcode,
          (@arg >> 16) & 0xFF,
          (@arg >> 8) & 0xFF,
          @arg & 0xFF
        ].pack("C4")
      end

      # Decode from binary.
      # @param bytes [String] 4 bytes
      # @return [Instruction]
      def self.from_binary(bytes)
        opcode, a1, a2, a3 = bytes.unpack("C4")
        arg = (a1 << 16) | (a2 << 8) | a3
        new(opcode, arg)
      end

      def to_s
        "#{Opcode.name(@opcode)} #{@arg}"
      end

      def inspect
        "#<Instruction #{self}>"
      end

      def ==(other)
        other.is_a?(Instruction) && @opcode == other.opcode && @arg == other.arg
      end

      # == without matching eql?/hash breaks Hash keys and Array#uniq,
      # which use eql?/hash rather than ==. Keep all three consistent.
      alias eql? ==

      def hash
        [Instruction, @opcode, @arg].hash
      end
    end
  end
end
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # JumpTable backs the SWITCH_BYTE instruction: it maps individual
    # input bytes to instruction offsets, with a default offset for any
    # byte that has no explicit entry.
    class JumpTable
      attr_reader :entries, :default_offset

      # @param entries [Hash<Integer, Integer>] byte → offset mapping
      # @param default_offset [Integer] offset for unmatched bytes
      def initialize(entries:, default_offset:)
        @entries = entries.freeze
        @default_offset = default_offset
      end

      # Offset to jump to for a byte (default when no entry exists).
      # @param byte [Integer]
      # @return [Integer] instruction offset
      def lookup(byte)
        @entries.fetch(byte, @default_offset)
      end

      # Encode to binary:
      # default offset (u24), entry count (u16), then
      # [byte u8, offset u24] per entry.
      # @return [String]
      def to_binary
        buf = u24(@default_offset)
        buf << [@entries.size].pack("S>")
        @entries.each do |byte, offset|
          buf << [byte].pack("C") << u24(offset)
        end
        buf
      end

      # Decode from binary.
      # @param bytes [String]
      # @return [Array(JumpTable, Integer)] [table, bytes_consumed]
      def self.from_binary(bytes)
        hi, mid, lo = bytes.byteslice(0, 3).unpack("C3")
        default_offset = (hi << 16) | (mid << 8) | lo
        cursor = 3

        count = bytes.byteslice(cursor, 2).unpack1("S>")
        cursor += 2

        mapping = {}
        count.times do
          b, o1, o2, o3 = bytes.byteslice(cursor, 4).unpack("C4")
          mapping[b] = (o1 << 16) | (o2 << 8) | o3
          cursor += 4
        end

        [new(entries: mapping, default_offset: default_offset), cursor]
      end

      def inspect
        "#<JumpTable entries=#{@entries.size} default=#{@default_offset}>"
      end

      # Convert to a format suitable for C native loading: a dense
      # 256-entry table of u32 offsets for O(1) access.
      # @return [Hash] data for C extension
      def to_native_format
        dense = Array.new(256, @default_offset)
        @entries.each { |byte, offset| dense[byte] = offset }
        {
          lookup: dense.pack("L>*"), # 256 * 4 = 1024 bytes
          default_offset: @default_offset # For EOF case
        }
      end

      private

      # Pack a value as a 24-bit big-endian unsigned integer (3 bytes).
      def u24(value)
        [(value >> 16) & 0xFF, (value >> 8) & 0xFF, value & 0xFF].pack("C3")
      end
    end
  end
end
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # KeywordTable maps keyword strings to token IDs, enabling O(1)
    # keyword lookup after an identifier match.
    #
    # Keywords must be UTF-8 encoded strings, consistent with LexerKit's
    # token specification which requires UTF-8 encoding.
    class KeywordTable
      attr_reader :base_token_id, :keywords

      # @param base_token_id [Integer] identifier token ID (used when not a keyword)
      # @param keywords [Hash{String => Integer}] UTF-8 keyword string → token ID
      def initialize(base_token_id:, keywords:)
        validate_base_token_id(base_token_id)
        validate_keywords(keywords)

        @base_token_id = base_token_id
        @keywords = keywords.freeze
      end

      # Encode to binary format:
      #   base_token_id: u16
      #   keyword_count: u16
      #   per keyword: key_len (u16), key bytes, token_id (u16)
      # @return [String]
      # @raise [ArgumentError] if keyword count or any keyword length exceeds u16 limit
      def to_binary
        # Keyword count must fit in u16.
        if @keywords.size > 0xFFFF
          raise ArgumentError, "Too many keywords (#{@keywords.size}): maximum is 65535"
        end

        buf = [@base_token_id].pack("S>")
        buf << [@keywords.size].pack("S>")

        @keywords.each do |key, token_id|
          raw = key.b

          # Each keyword length must fit in u16.
          if raw.bytesize > 0xFFFF
            raise ArgumentError, "Keyword too long (#{raw.bytesize} bytes): maximum is 65535 bytes"
          end

          buf << [raw.bytesize].pack("S>") << raw << [token_id].pack("S>")
        end

        buf
      end

      # Convert to a format suitable for C native loading.
      # @return [Hash] data for C extension
      def to_native_format
        pairs = @keywords.map { |key, token_id| [key.b, token_id] }
        { base_token_id: @base_token_id, keywords: pairs }
      end

      # Decode from binary.
      # @param bytes [String]
      # @return [Array(KeywordTable, Integer)] table and consumed bytes
      # @raise [LexerKit::InvalidBinaryError] if binary data is invalid
      def self.from_binary(bytes)
        if bytes.bytesize < 4
          raise LexerKit::InvalidBinaryError, "Binary data too short (expected at least 4 bytes, got #{bytes.bytesize})"
        end

        cursor = 0

        # Header: base token ID, then keyword count.
        base_token_id = bytes.byteslice(cursor, 2)&.unpack1("S>")
        raise LexerKit::InvalidBinaryError, "Invalid header data" if base_token_id.nil?

        cursor += 2

        keyword_count = bytes.byteslice(cursor, 2)&.unpack1("S>")
        raise LexerKit::InvalidBinaryError, "Invalid header data" if keyword_count.nil?

        cursor += 2

        table = {}
        keyword_count.times do
          if cursor + 2 > bytes.bytesize
            raise LexerKit::InvalidBinaryError, "Unexpected end of data while reading keyword entry"
          end

          key_len = bytes.byteslice(cursor, 2)&.unpack1("S>")
          raise LexerKit::InvalidBinaryError, "Invalid key length" if key_len.nil?

          cursor += 2

          # Ensure both the key bytes and trailing token ID are present.
          if cursor + key_len + 2 > bytes.bytesize
            raise LexerKit::InvalidBinaryError, "Unexpected end of data while reading keyword"
          end

          key = bytes.byteslice(cursor, key_len).force_encoding(Encoding::UTF_8)
          cursor += key_len

          token_id = bytes.byteslice(cursor, 2)&.unpack1("S>")
          raise LexerKit::InvalidBinaryError, "Invalid token ID" if token_id.nil?

          cursor += 2

          table[key] = token_id
        end

        [new(base_token_id: base_token_id, keywords: table), cursor]
      rescue LexerKit::InvalidBinaryError
        raise
      rescue StandardError => e
        raise LexerKit::InvalidBinaryError, "Failed to parse binary data: #{e.message}"
      end

      private

      def validate_base_token_id(id)
        return if id.is_a?(Integer) && id.between?(0, 0xFFFF)

        raise ArgumentError, "base_token_id must be a 16-bit unsigned integer (0..65535), got: #{id.inspect}"
      end

      def validate_keywords(keywords)
        raise ArgumentError, "keywords must be a Hash, got: #{keywords.class}" unless keywords.is_a?(Hash)

        keywords.each do |keyword, token_id|
          validate_keyword(keyword)
          validate_token_id(token_id)
        end
      end

      def validate_keyword(keyword)
        raise ArgumentError, "keyword must be a String, got: #{keyword.inspect}" unless keyword.is_a?(String)
        raise ArgumentError, "keyword must not be empty" if keyword.empty?

        acceptable = keyword.encoding == Encoding::UTF_8 || keyword.encoding == Encoding::US_ASCII
        raise ArgumentError, "keyword must be UTF-8 encoded, got: #{keyword.encoding}" unless acceptable
        raise ArgumentError, "keyword contains invalid UTF-8 byte sequence" unless keyword.valid_encoding?
      end

      def validate_token_id(token_id)
        return if token_id.is_a?(Integer) && token_id.between?(0, 0xFFFF)

        raise ArgumentError, "token_id must be a 16-bit unsigned integer (0..65535), got: #{token_id.inspect}"
      end
    end
  end
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module IR
    # Opcode definitions for the LexerKit VM.
    module Opcode
      # --- DFA operations ---
      DFA_RUN = 0x01 # Run DFA table, arg = table_id
      # Optimized DFA run for single-candidate regex tokens.
      # Arg encoding: (dfa_id << 14) | fail_target (10+14 bits).
      # Success: advance position, fall through; failure: jump to fail_target.
      DFA_RUN_IF_MATCH = 0x02

      # --- Delimiter/literal operations ---
      SCAN_UNTIL = 0x10 # Scan until delimiter, arg = const_id
      MATCH_LITERAL = 0x12 # Match fixed string, arg = const_id
      SCAN_UNTIL_ESCAPE = 0x13 # Scan until delimiter with escape, arg = config_id
      MATCH_RANGE = 0x14 # Match byte range, arg = (min << 8) | max
      # Optimized literal match with embedded failure jump target.
      # Arg encoding: (const_id << 14) | fail_target (10+14 bits).
      # Success: advance position, fall through; failure: jump to fail_target.
      MATCH_LITERAL_OR_JUMP = 0x16

      # --- Branch/control operations ---
      SWITCH_BYTE = 0x20 # Branch on next byte, arg = jump_table_id
      JUMP = 0x21 # Unconditional jump, arg = offset
      JUMP_IF_EOF = 0x24 # Jump if at EOF, arg = offset

      # --- Mode operations ---
      PUSH_MODE = 0x30 # Push mode to stack, arg = mode_id
      POP_MODE = 0x31 # Pop mode from stack

      # --- Token operations ---
      EMIT = 0x40 # Emit token, arg = token_id
      EMIT_SKIP = 0x41 # Skip (advance without emit)
      EMIT_ERROR = 0x42 # Emit error token, arg = error_id
      MARK = 0x43 # Mark current position
      # Fused EMIT + JUMP for the common token emission pattern.
      # Arg encoding: (token_id << 14) | jump_target (10+14 bits).
      EMIT_AND_JUMP = 0x44
      KEYWORD_LOOKUP = 0x45 # Keyword lookup, arg = keyword_table_id
      LITERAL_TRIE_RUN = 0x46 # Run literal trie, arg = const_id
      CLEAR_BEST = 0x49 # Clear best match tracking
      COMMIT_BEST = 0x4C # Commit best match, arg = default offset
      # Optimized regex candidate matching with embedded action target.
      # Arg encoding: (order << 14) | action_ip (10+14 bits).
      SET_MATCH = 0x4D
      # Fused CLEAR_BEST + LITERAL_TRIE_RUN + COMMIT_BEST for
      # literal-only matching.
      # Arg encoding: (const_id << 14) | fail_target (10+14 bits).
      # Match: jump to best match action IP; no match: jump to fail_target.
      LITERAL_TRIE_COMMIT = 0x4E
      # Fused EMIT_SKIP + JUMP for the skipped-token pattern.
      # Arg encoding: jump_target (24 bits).
      EMIT_SKIP_AND_JUMP = 0x4F

      # --- Special ---
      HALT = 0xFF # Stop execution

      # Opcode → mnemonic, for disassembly and Instruction#to_s.
      NAMES = {
        DFA_RUN => "DFA_RUN",
        DFA_RUN_IF_MATCH => "DFA_RUN_IF_MATCH",
        SCAN_UNTIL => "SCAN_UNTIL",
        MATCH_LITERAL => "MATCH_LITERAL",
        SCAN_UNTIL_ESCAPE => "SCAN_UNTIL_ESCAPE",
        MATCH_RANGE => "MATCH_RANGE",
        MATCH_LITERAL_OR_JUMP => "MATCH_LITERAL_OR_JUMP",
        SWITCH_BYTE => "SWITCH_BYTE",
        JUMP => "JUMP",
        JUMP_IF_EOF => "JUMP_IF_EOF",
        PUSH_MODE => "PUSH_MODE",
        POP_MODE => "POP_MODE",
        EMIT => "EMIT",
        EMIT_SKIP => "EMIT_SKIP",
        EMIT_ERROR => "EMIT_ERROR",
        MARK => "MARK",
        EMIT_AND_JUMP => "EMIT_AND_JUMP",
        KEYWORD_LOOKUP => "KEYWORD_LOOKUP",
        LITERAL_TRIE_RUN => "LITERAL_TRIE_RUN",
        CLEAR_BEST => "CLEAR_BEST",
        COMMIT_BEST => "COMMIT_BEST",
        SET_MATCH => "SET_MATCH",
        LITERAL_TRIE_COMMIT => "LITERAL_TRIE_COMMIT",
        EMIT_SKIP_AND_JUMP => "EMIT_SKIP_AND_JUMP",
        HALT => "HALT"
      }.freeze

      # Mnemonic for an opcode value.
      # NOTE: intentionally shadows Module#name — called with an argument
      # by Instruction#to_s and the disassembler.
      # @param opcode [Integer]
      # @return [String] mnemonic, or "UNKNOWN(0x..)" for unrecognized opcodes
      def self.name(opcode)
        NAMES.fetch(opcode) { "UNKNOWN(0x#{opcode.to_s(16)})" }
      end
    end
  end
end
|