lexer_kit 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +157 -0
- data/exe/lexer_kit +7 -0
- data/ext/lexer_kit_rust/Cargo.toml +17 -0
- data/ext/lexer_kit_rust/extconf.rb +6 -0
- data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
- data/ext/lexer_kit_rust/src/dfa.rs +217 -0
- data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
- data/ext/lexer_kit_rust/src/lib.rs +248 -0
- data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
- data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
- data/ext/lexer_kit_rust/src/trie.rs +206 -0
- data/ext/lexer_kit_rust/src/types.rs +319 -0
- data/ext/lexer_kit_rust/src/vm.rs +258 -0
- data/lib/lexer_kit/builder/compiler.rb +596 -0
- data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
- data/lib/lexer_kit/builder/mode_def.rb +36 -0
- data/lib/lexer_kit/builder/token_def.rb +65 -0
- data/lib/lexer_kit/builder/validator.rb +84 -0
- data/lib/lexer_kit/builder.rb +230 -0
- data/lib/lexer_kit/cli/commands.rb +389 -0
- data/lib/lexer_kit/cli.rb +88 -0
- data/lib/lexer_kit/core/diagnostic.rb +103 -0
- data/lib/lexer_kit/core/source.rb +154 -0
- data/lib/lexer_kit/core/span.rb +80 -0
- data/lib/lexer_kit/core/token.rb +120 -0
- data/lib/lexer_kit/core.rb +13 -0
- data/lib/lexer_kit/debug/disassembler.rb +143 -0
- data/lib/lexer_kit/debug/visualizer.rb +203 -0
- data/lib/lexer_kit/debug.rb +11 -0
- data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
- data/lib/lexer_kit/dfa/case_folding.rb +45 -0
- data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
- data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
- data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
- data/lib/lexer_kit/dfa/nfa.rb +304 -0
- data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
- data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
- data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
- data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
- data/lib/lexer_kit/dfa.rb +37 -0
- data/lib/lexer_kit/errors.rb +76 -0
- data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
- data/lib/lexer_kit/format/lkb1.rb +199 -0
- data/lib/lexer_kit/format/lkt1.rb +111 -0
- data/lib/lexer_kit/format.rb +19 -0
- data/lib/lexer_kit/ir/compiled_program.rb +228 -0
- data/lib/lexer_kit/ir/constant_pool.rb +107 -0
- data/lib/lexer_kit/ir/dfa_table.rb +125 -0
- data/lib/lexer_kit/ir/instruction.rb +50 -0
- data/lib/lexer_kit/ir/jump_table.rb +94 -0
- data/lib/lexer_kit/ir/keyword_table.rb +168 -0
- data/lib/lexer_kit/ir/opcode.rb +96 -0
- data/lib/lexer_kit/ir/serializer.rb +249 -0
- data/lib/lexer_kit/ir.rb +16 -0
- data/lib/lexer_kit/runner.rb +114 -0
- data/lib/lexer_kit/trie.rb +170 -0
- data/lib/lexer_kit/version.rb +5 -0
- data/lib/lexer_kit.rb +155 -0
- metadata +119 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "stringio"

module LexerKit
  module Debug
    # Visualizer renders DFA tables, jump tables and keyword tables as
    # human-readable text, and DFA tables as Graphviz DOT graphs.
    module Visualizer
      # Render a DFA table as readable text: header, byte classes,
      # per-state transitions and accept states.
      # @param dfa [IR::DFATable] the DFA table
      # @param program [IR::CompiledProgram, nil] optional program for token names
      # @return [String]
      def self.format_dfa(dfa, program: nil)
        out = StringIO.new

        out.puts "DFA Table"
        out.puts "  States: #{dfa.state_count}"
        out.puts "  Classes: #{dfa.class_count}"
        out.puts

        # Byte class mapping; class 0 (the default class) is omitted.
        out.puts "  Byte Classes:"
        grouped = dfa.byte_class.each_with_index.each_with_object(Hash.new { |h, k| h[k] = [] }) do |(cls, byte), acc|
          acc[cls] << byte if cls > 0
        end
        grouped.sort.each do |cls, members|
          out.puts "    Class #{cls}: #{compress_ranges(members)}"
        end
        out.puts

        # One line per state that has at least one live (non-dead) transition.
        out.puts "  Transitions:"
        (1...dfa.state_count).each do |state|
          row = (0...dfa.class_count).filter_map do |cls|
            # Dense row-major layout: transitions[state * class_count + cls]
            dest = dfa.transitions[(state * dfa.class_count) + cls]
            "c#{cls}->s#{dest}" if dest && dest != 0
          end
          next if row.empty?

          token = dfa.accept(state)
          suffix = token ? " [accept: #{format_token(token, program)}]" : ""
          out.puts "    State #{state}: #{row.join(', ')}#{suffix}"
        end
        out.puts

        out.puts "  Accept States:"
        (1...dfa.state_count).each do |state|
          token = dfa.accept(state)
          out.puts "    State #{state}: #{format_token(token, program)}" if token
        end

        out.string
      end

      # Render a DFA table in Graphviz DOT format; accept states are drawn
      # as double circles labelled with their token.
      # @param dfa [IR::DFATable] the DFA table
      # @param program [IR::CompiledProgram, nil] optional program for token names
      # @return [String]
      def self.dfa_to_dot(dfa, program: nil)
        out = StringIO.new

        out.puts "digraph DFA {"
        out.puts "  rankdir=LR;"
        out.puts "  node [shape=circle];"
        out.puts

        (1...dfa.state_count).each do |state|
          token = dfa.accept(state)
          next unless token

          out.puts "  s#{state} [shape=doublecircle, label=\"#{state}\\n#{format_token(token, program)}\"];"
        end
        out.puts

        # Invert the byte -> class map so edges can be labelled with bytes.
        members = Hash.new { |h, k| h[k] = [] }
        dfa.byte_class.each_with_index { |cls, byte| members[cls] << byte }

        (1...dfa.state_count).each do |state|
          (0...dfa.class_count).each do |cls|
            dest = dfa.transitions[(state * dfa.class_count) + cls]
            next if dest.nil? || dest == 0 # Skip dead state

            label = compress_ranges(members[cls])
            out.puts "  s#{state} -> s#{dest} [label=\"#{escape_dot(label)}\"];"
          end
        end

        out.puts "}"
        out.string
      end

      # Render a JumpTable as readable text, grouping entry bytes that
      # share the same target offset.
      # @param table [IR::JumpTable] the jump table
      # @return [String]
      def self.format_jump_table(table)
        out = StringIO.new

        out.puts "Jump Table"
        out.puts "  Default: -> #{format('%04d', table.default_offset)}"
        out.puts "  Entries:"

        grouped = table.entries.each_with_object(Hash.new { |h, k| h[k] = [] }) do |(byte, offset), acc|
          acc[offset] << byte
        end

        grouped.sort.each do |offset, bytes|
          out.puts "    #{compress_ranges(bytes)} -> #{format('%04d', offset)}"
        end

        out.string
      end

      # Render a KeywordTable as readable text.
      # @param table [IR::KeywordTable] the keyword table
      # @param program [IR::CompiledProgram, nil] optional program for token names
      # @return [String]
      def self.format_keyword_table(table, program: nil)
        out = StringIO.new

        out.puts "Keyword Table"
        out.puts "  Base Token: #{format_token(table.base_token_id, program)}"
        out.puts "  Keywords:"

        table.keywords.each do |word, tid|
          out.puts "    #{word.inspect} -> #{format_token(tid, program)}"
        end

        out.string
      end

      # "NAME (id)" when the program can resolve the token, else just the id.
      private_class_method def self.format_token(token_id, program)
        resolved = program&.token_name(token_id)
        resolved ? "#{resolved} (#{token_id})" : token_id.to_s
      end

      # Collapse a byte list into comma-separated runs, e.g. "'a'-'z', 0x0A".
      private_class_method def self.compress_ranges(bytes)
        return "" if bytes.empty?

        runs = bytes.sort.uniq.slice_when { |prev, cur| cur != prev + 1 }
        runs.map { |run| format_range(run.first, run.last) }.join(", ")
      end

      # Single byte prints alone; a run prints as "start-end".
      private_class_method def self.format_range(start, last)
        start == last ? format_byte(start) : "#{format_byte(start)}-#{format_byte(last)}"
      end

      # Printable ASCII is quoted; quote and backslash get dedicated forms;
      # everything else is shown as hex.
      private_class_method def self.format_byte(byte)
        return "'\"'" if byte == 0x22
        return "'\\'" if byte == 0x5C
        return "'#{byte.chr}'" if byte >= 0x20 && byte < 0x7F

        format("0x%02X", byte)
      end

      # Escape backslash and double quote for use inside DOT string literals.
      private_class_method def self.escape_dot(str)
        str.gsub("\\", "\\\\\\\\").gsub("\"", "\\\"")
      end
    end
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module DFA
    # ByteClassBuilder compresses DFA transitions by grouping bytes
    # with identical behavior into equivalence classes.
    class ByteClassBuilder
      # @param dfa_states [Hash] NFA state set → DFA state ID mapping
      # @param dfa_transitions [Hash] [state, byte] → target state mapping
      # @param state_count [Integer] number of DFA states
      def initialize(dfa_states:, dfa_transitions:, state_count:)
        @dfa_states = dfa_states
        @dfa_transitions = dfa_transitions
        @state_count = state_count
      end

      # Build byte classes and the compressed transition table.
      # @return [Hash] { byte_class:, class_count:, transitions: }
      def build
        byte_class, class_count = build_byte_classes

        {
          byte_class: byte_class,
          class_count: class_count,
          transitions: build_transition_table(byte_class, class_count)
        }
      end

      private

      # Assign each byte 0..255 an equivalence class: two bytes share a
      # class exactly when every DFA state transitions to the same target
      # on both. Classes are numbered in order of first appearance, so the
      # class of byte 0x00 is always 0.
      def build_byte_classes
        # The default block allocates the next class number on first sight
        # of a new transition signature.
        class_for_signature = Hash.new { |h, sig| h[sig] = h.size }
        byte_class = Array.new(256)

        (0..255).each do |byte|
          # Signature: the target state of every DFA state on this byte
          # (0, the dead state, when no transition exists).
          signature = @dfa_states.each_value.map { |state| @dfa_transitions.fetch([state, byte], 0) }
          byte_class[byte] = class_for_signature[signature]
        end

        [byte_class, class_for_signature.size]
      end

      # Flatten transitions into a dense row-major array indexed by
      # (state * class_count) + class; untouched slots stay 0 (dead state).
      def build_transition_table(byte_class, class_count)
        table = Array.new(@state_count * class_count, 0)
        @dfa_transitions.each do |(state, byte), target|
          table[(state * class_count) + byte_class[byte]] = target
        end
        table
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module DFA
    # Case folding helpers for ASCII letters.
    # Include this module and set @case_insensitive to use.
    module CaseFolding
      private

      # True for bytes in A-Z (0x41..0x5A) or a-z (0x61..0x7A).
      def ascii_letter?(byte)
        (0x41..0x5A).cover?(byte) || (0x61..0x7A).cover?(byte)
      end

      # Expand one byte into the list of bytes it should match.
      # ASCII letters yield both cases (uppercase first) when
      # @case_insensitive is set; everything else passes through unchanged.
      def case_fold_byte(byte)
        return [byte] unless @case_insensitive && ascii_letter?(byte)

        upper = byte <= 0x5A ? byte : byte - 32
        [upper, upper + 32]
      end

      # Expand an inclusive byte range into equivalent case-folded ranges.
      # The overlap with A-Z gains a +32 (lowercase) copy, the overlap with
      # a-z gains a -32 (uppercase) copy; the result is de-duplicated.
      def case_fold_range(from, to)
        return [[from, to]] unless @case_insensitive

        folded = [[from, to]]
        folded << [[from, 0x41].max + 32, [to, 0x5A].min + 32] if from <= 0x5A && to >= 0x41
        folded << [[from, 0x61].max - 32, [to, 0x7A].min - 32] if from <= 0x7A && to >= 0x61
        folded.uniq
      end
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module DFA
    # CharClassCollector collects character class items and builds appropriate AST.
    # Separates byte/codepoint handling from parsing control flow.
    # Case folding is handled at the NFA layer, not here.
    class CharClassCollector
      include RegexAST

      def initialize
        @byte_ranges = []
        @codepoint_ranges = []
      end

      # Record a single byte or codepoint as a degenerate one-element range.
      # @param item [Hash] { type: :byte | other, value: Integer }
      def add_item(item)
        bucket = item[:type] == :byte ? @byte_ranges : @codepoint_ranges
        bucket << [item[:value], item[:value]]
      end

      # Record an inclusive range; both endpoints must share one type.
      # @raise [ArgumentError] on mixed endpoint types or an inverted
      #   multibyte range
      def add_range(start_item, end_item)
        raise ArgumentError, "mixed byte and multibyte range in char class" if start_item[:type] != end_item[:type]

        lo = start_item[:value]
        hi = end_item[:value]
        if start_item[:type] == :byte
          @byte_ranges << [lo, hi]
        else
          raise ArgumentError, "invalid multibyte range" if lo > hi

          @codepoint_ranges << [lo, hi]
        end
      end

      # Build the final AST from everything collected so far.
      # @param negated [Boolean] whether the class was written negated
      # @param meta [Object] metadata attached to produced nodes
      def to_ast(negated:, meta:)
        validate_negated_multibyte!(negated)
        combine_asts(build_ascii_ast(negated, meta), build_utf8_ast(meta), negated, meta)
      end

      private

      # Negation over multibyte codepoints is unsupported; fail fast.
      def validate_negated_multibyte!(negated)
        return unless negated && @codepoint_ranges.any?

        raise ArgumentError, "negated char class with multibyte is not supported"
      end

      # ASCII side: a CharClass node, or nil when there is nothing to say.
      # A negated-but-empty class still needs a node of its own.
      def build_ascii_ast(negated, meta)
        return nil if @byte_ranges.empty? && !negated

        CharClass.new(ranges: @byte_ranges, negated: negated, meta: meta)
      end

      # Multibyte side: expand codepoint ranges into a UTF-8 byte-level AST.
      def build_utf8_ast(meta)
        return nil if @codepoint_ranges.empty?

        ast = Utf8Range.ast_for_ranges(@codepoint_ranges)
        ast.respond_to?(:with) ? ast.with(meta: meta) : ast
      end

      # Merge the two sides; with neither present, degrade to an empty
      # CharClass so callers always receive a node.
      def combine_asts(ascii_ast, utf8_ast, negated, meta)
        return ascii_ast || CharClass.new(ranges: [], negated: negated, meta: meta) unless utf8_ast
        return utf8_ast unless ascii_ast

        Alternation.new(children: [ascii_ast, utf8_ast], meta: meta)
      end
    end
  end
end
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module DFA
    # DFABuilder converts NFA to DFA using subset construction,
    # then compresses using byte classes.
    class DFABuilder
      # @param nfa [Object] source NFA; must respond to #start_state,
      #   #accept_state, #token_id, #epsilon_closure and #move
      def initialize(nfa)
        @nfa = nfa
        @dfa_states = {}       # NFA state set -> DFA state ID
        @dfa_transitions = {}  # [DFA state, byte] -> target DFA state
        @dfa_accept = {}       # DFA state ID -> token_id
        @state_count = 1       # state 0 is reserved as the dead state (see DFAMinimizer)
      end

      # Build DFA from NFA
      # @return [IR::DFATable]
      def build
        initial_set = @nfa.epsilon_closure(Set[@nfa.start_state])
        run_subset_construction(initial_set)
        create_dfa_table
      end

      private

      # Classic subset construction: explore reachable NFA state sets via a
      # FIFO worklist, creating one DFA state per distinct set.
      # NOTE(review): DFA state IDs depend on discovery order, so output is
      # deterministic for a given NFA but sensitive to this traversal order.
      def run_subset_construction(initial_set)
        get_or_create_state(initial_set)

        worklist = [initial_set]
        processed = Set.new

        while (current_set = worklist.shift)
          # A set can be enqueued more than once; process it only the first time.
          next if processed.include?(current_set)

          processed << current_set
          current_state = @dfa_states[current_set]

          (0..255).each do |byte|
            moved = @nfa.move(current_set, byte)
            next if moved.empty?

            target_set = @nfa.epsilon_closure(moved)
            target_state = get_or_create_state(target_set)
            @dfa_transitions[[current_state, byte]] = target_state

            worklist << target_set unless processed.include?(target_set)
          end
        end
      end

      # Look up (or allocate) the DFA state ID for an NFA state set, and
      # record its accept token when the set contains the NFA accept state.
      def get_or_create_state(nfa_state_set)
        return @dfa_states[nfa_state_set] if @dfa_states.key?(nfa_state_set)

        state_id = @state_count
        @state_count += 1
        @dfa_states[nfa_state_set] = state_id

        if nfa_state_set.include?(@nfa.accept_state)
          @dfa_accept[state_id] = @nfa.token_id
        end

        state_id
      end

      # Compress byte-level transitions into equivalence classes, minimize
      # the resulting DFA, and package everything as an IR::DFATable.
      # Note the byte_class map is computed before minimization and reused
      # unchanged, since minimization only merges states, not byte classes.
      def create_dfa_table
        # Build byte classes and transition table
        byte_class_result = ByteClassBuilder.new(
          dfa_states: @dfa_states,
          dfa_transitions: @dfa_transitions,
          state_count: @state_count
        ).build

        byte_class = byte_class_result[:byte_class]
        class_count = byte_class_result[:class_count]
        transitions = byte_class_result[:transitions]

        # Minimize DFA using Hopcroft's algorithm
        minimizer = DFAMinimizer.new(
          state_count: @state_count,
          transitions: transitions,
          accept_states: @dfa_accept,
          class_count: class_count
        )
        minimized = minimizer.minimize

        IR::DFATable.new(
          state_count: minimized[:state_count],
          byte_class: byte_class,
          transitions: minimized[:transitions],
          accept_states: minimized[:accept_states]
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# frozen_string_literal: true

module LexerKit
  module DFA
    # DFAMinimizer reduces DFA state count by iterative partition
    # refinement, merging equivalent states while preserving the
    # recognized language.
    class DFAMinimizer
      # @param state_count [Integer] number of states (including dead state 0)
      # @param transitions [Array<Integer>] state × class_count → next_state
      # @param accept_states [Hash<Integer, Integer>] state → token_id
      # @param class_count [Integer] number of byte classes
      def initialize(state_count:, transitions:, accept_states:, class_count:)
        @state_count = state_count
        @transitions = transitions
        @accept_states = accept_states
        @class_count = class_count
      end

      # Minimize the DFA.
      # @return [Hash] { state_count:, transitions:, accept_states: }
      def minimize
        # With at most one real state besides the dead state there is
        # nothing to merge.
        return trivial_result if @state_count <= 2

        build_minimized_dfa(refine_partitions(build_initial_partitions))
      end

      private

      # Pass the DFA through unchanged.
      def trivial_result
        {
          state_count: @state_count,
          transitions: @transitions,
          accept_states: @accept_states
        }
      end

      # Seed partition: one block of non-accepting states, plus one block
      # per distinct accept token — states carrying different tokens must
      # never be merged. Dead state 0 is excluded throughout.
      def build_initial_partitions
        plain = (1...@state_count).reject { |s| @accept_states.key?(s) }

        by_token = @accept_states.group_by { |_, tid| tid }
                                 .transform_values { |pairs| pairs.map(&:first) }

        blocks = []
        blocks << plain unless plain.empty?
        by_token.each_value { |states| blocks << states }

        blocks
      end

      # Repeatedly split blocks whose members disagree on which block each
      # byte class leads to, until a fixed point is reached.
      def refine_partitions(blocks)
        membership = build_state_mapping(blocks)

        loop do
          refined = blocks.flat_map { |block| split_partition(block, membership) }
          # Splits only add blocks, so an unchanged count means stability.
          return blocks if refined.size == blocks.size

          blocks = refined
          membership = build_state_mapping(blocks)
        end
      end

      # Map each state to the index of the block that contains it.
      def build_state_mapping(blocks)
        blocks.each_with_index.each_with_object({}) do |(states, idx), map|
          states.each { |s| map[s] = idx }
        end
      end

      # Split one block by transition signature; singletons are stable.
      def split_partition(block, membership)
        return [block] if block.size <= 1

        block.group_by { |state| compute_signature(state, membership) }.values
      end

      # A state's signature: for every byte class, the index of the block
      # reached (-1 for the dead state or an unmapped target).
      def compute_signature(state, membership)
        base = state * @class_count
        Array.new(@class_count) do |cls|
          dest = @transitions[base + cls]
          dest.zero? ? -1 : (membership[dest] || -1)
        end
      end

      # Renumber blocks as states 1..N (0 stays the dead state). The block
      # holding the original start state (state 1) is moved to the front so
      # it becomes the new state 1; transitions and accept tokens are then
      # rebuilt from one representative per block.
      def build_minimized_dfa(blocks)
        membership = build_state_mapping(blocks)

        start_block = membership[1]
        if start_block && start_block != 0
          blocks[0], blocks[start_block] = blocks[start_block], blocks[0]
          membership = build_state_mapping(blocks)
        end

        new_count = blocks.size + 1 # +1 for dead state 0
        table = Array.new(new_count * @class_count, 0)
        accepts = {}

        blocks.each_with_index do |states, idx|
          new_state = idx + 1
          # All members of a block are equivalent; any one can stand in.
          rep = states.first
          row = new_state * @class_count
          old_row = rep * @class_count

          (0...@class_count).each do |cls|
            dest = @transitions[old_row + cls]
            # Block idx maps to new state idx + 1.
            table[row + cls] = dest.zero? ? 0 : membership[dest] + 1
          end

          accepts[new_state] = @accept_states[rep] if @accept_states.key?(rep)
        end

        {
          state_count: new_count,
          transitions: table,
          accept_states: accepts
        }
      end
    end
  end
end
|