lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,203 @@
1
# frozen_string_literal: true

require "stringio"

module LexerKit
  module Debug
    # Visualizer renders compiled lexer data structures (DFA tables, jump
    # tables, keyword tables) as human-readable text or Graphviz DOT.
    # Every formatter accumulates into a StringIO and returns the string.
    module Visualizer
      # Format a DFA table as a readable string
      # @param dfa [IR::DFATable] the DFA table
      # @param program [IR::CompiledProgram, nil] optional program for token names
      # @return [String]
      def self.format_dfa(dfa, program: nil)
        output = StringIO.new

        output.puts "DFA Table"
        output.puts " States: #{dfa.state_count}"
        output.puts " Classes: #{dfa.class_count}"
        output.puts

        # Byte class mapping (classes > 0 only; class 0 is treated as the
        # default class here — NOTE(review): confirm against
        # ByteClassBuilder, which numbers classes by first occurrence)
        output.puts " Byte Classes:"
        classes_by_value = Hash.new { |h, k| h[k] = [] }
        dfa.byte_class.each_with_index do |cls, byte|
          classes_by_value[cls] << byte if cls > 0
        end
        classes_by_value.sort.each do |cls, bytes|
          byte_ranges = compress_ranges(bytes)
          output.puts " Class #{cls}: #{byte_ranges}"
        end
        output.puts

        # Transitions (state 0 is the dead state and is skipped)
        output.puts " Transitions:"
        (1...dfa.state_count).each do |state|
          transitions = []
          (0...dfa.class_count).each do |cls|
            # Flat row-major table: transitions[state * class_count + cls]
            next_state = dfa.transitions[(state * dfa.class_count) + cls]
            if next_state && next_state != 0 # Not dead state
              transitions << "c#{cls}->s#{next_state}"
            end
          end
          next unless transitions.any?

          accept = dfa.accept(state)
          accept_str = accept ? " [accept: #{format_token(accept, program)}]" : ""
          output.puts " State #{state}: #{transitions.join(', ')}#{accept_str}"
        end
        output.puts

        # Accept states
        output.puts " Accept States:"
        (1...dfa.state_count).each do |state|
          token = dfa.accept(state)
          if token
            output.puts " State #{state}: #{format_token(token, program)}"
          end
        end

        output.string
      end

      # Format a DFA table as Graphviz DOT format
      # @param dfa [IR::DFATable] the DFA table
      # @param program [IR::CompiledProgram, nil] optional program for token names
      # @return [String]
      def self.dfa_to_dot(dfa, program: nil)
        output = StringIO.new

        output.puts "digraph DFA {"
        output.puts " rankdir=LR;"
        output.puts " node [shape=circle];"
        output.puts

        # Mark accept states with double circle.
        (1...dfa.state_count).each do |state|
          token = dfa.accept(state)
          if token
            # FIX: escape the token label like the edge labels below —
            # a token name containing a quote or backslash previously
            # produced invalid DOT output.
            label = escape_dot(format_token(token, program))
            output.puts " s#{state} [shape=doublecircle, label=\"#{state}\\n#{label}\"];"
          end
        end
        output.puts

        # Build class to byte mapping for edge labels
        bytes_by_class = Hash.new { |h, k| h[k] = [] }
        dfa.byte_class.each_with_index do |cls, byte|
          bytes_by_class[cls] << byte
        end

        # Transitions
        (1...dfa.state_count).each do |state|
          (0...dfa.class_count).each do |cls|
            next_state = dfa.transitions[(state * dfa.class_count) + cls]
            next if next_state.nil? || next_state == 0 # Skip dead state

            bytes = bytes_by_class[cls]
            label = compress_ranges(bytes)
            output.puts " s#{state} -> s#{next_state} [label=\"#{escape_dot(label)}\"];"
          end
        end

        output.puts "}"
        output.string
      end

      # Format a JumpTable as a readable string
      # @param table [IR::JumpTable] the jump table
      # @return [String]
      def self.format_jump_table(table)
        output = StringIO.new

        output.puts "Jump Table"
        output.puts " Default: -> #{format('%04d', table.default_offset)}"
        output.puts " Entries:"

        # Group entries by target offset so bytes sharing a target print
        # as one compressed range line.
        by_offset = Hash.new { |h, k| h[k] = [] }
        table.entries.each do |byte, offset|
          by_offset[offset] << byte
        end

        by_offset.sort.each do |offset, bytes|
          byte_str = compress_ranges(bytes)
          output.puts " #{byte_str} -> #{format('%04d', offset)}"
        end

        output.string
      end

      # Format a KeywordTable as a readable string
      # @param table [IR::KeywordTable] the keyword table
      # @param program [IR::CompiledProgram, nil] optional program for token names
      # @return [String]
      def self.format_keyword_table(table, program: nil)
        output = StringIO.new

        output.puts "Keyword Table"
        output.puts " Base Token: #{format_token(table.base_token_id, program)}"
        output.puts " Keywords:"

        table.keywords.each do |keyword, token_id|
          output.puts " #{keyword.inspect} -> #{format_token(token_id, program)}"
        end

        output.string
      end

      # Render a token id, with its symbolic name when a program is given.
      private_class_method def self.format_token(token_id, program)
        if program
          name = program.token_name(token_id)
          name ? "#{name} (#{token_id})" : token_id.to_s
        else
          token_id.to_s
        end
      end

      # Collapse a byte list into "'a'-'z', 0x00" style range text.
      private_class_method def self.compress_ranges(bytes)
        return "" if bytes.empty?

        bytes = bytes.sort.uniq
        ranges = []
        start = bytes.first
        last = start

        bytes[1..].each do |byte|
          if byte == last + 1
            last = byte
          else
            ranges << format_range(start, last)
            start = byte
            last = byte
          end
        end
        ranges << format_range(start, last)

        ranges.join(", ")
      end

      # One range as "'a'" (degenerate) or "'a'-'z'".
      private_class_method def self.format_range(start, last)
        if start == last
          format_byte(start)
        else
          "#{format_byte(start)}-#{format_byte(last)}"
        end
      end

      # Printable ASCII as a quoted char, everything else as hex.
      private_class_method def self.format_byte(byte)
        case byte
        when 0x22 then "'\"'"
        when 0x5C then "'\\'"
        when 0x20...0x7F then "'#{byte.chr}'" # 0x7F (DEL) deliberately excluded
        else format("0x%02X", byte)
        end
      end

      # Escape backslashes and quotes for DOT quoted strings.
      private_class_method def self.escape_dot(str)
        str.gsub("\\", "\\\\\\\\").gsub("\"", "\\\"")
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
# frozen_string_literal: true

require_relative "debug/disassembler"
require_relative "debug/visualizer"

module LexerKit
  # Debug namespace for LexerKit.
  # Aggregates the debugging utilities loaded above: Disassembler and
  # Visualizer. The module itself carries no behavior.
  module Debug
  end
end
@@ -0,0 +1,69 @@
1
# frozen_string_literal: true

module LexerKit
  module DFA
    # ByteClassBuilder compresses DFA transitions by grouping bytes
    # with identical behavior into equivalence classes.
    class ByteClassBuilder
      # @param dfa_states [Hash] NFA state set => DFA state ID mapping
      # @param dfa_transitions [Hash] [state, byte] => target state mapping
      # @param state_count [Integer] number of DFA states
      def initialize(dfa_states:, dfa_transitions:, state_count:)
        @dfa_states = dfa_states
        @dfa_transitions = dfa_transitions
        @state_count = state_count
      end

      # Build byte classes and the compressed transition table.
      # @return [Hash] { byte_class:, class_count:, transitions: }
      def build
        classes, count = build_byte_classes
        {
          byte_class: classes,
          class_count: count,
          transitions: build_transition_table(classes, count)
        }
      end

      private

      # Two bytes belong to the same class when every DFA state reacts to
      # them identically. Classes are numbered by first occurrence in
      # ascending byte order.
      def build_byte_classes
        # Per-byte profile: the vector of targets across all DFA states.
        profiles = (0..255).map do |byte|
          @dfa_states.each_value.map { |state| @dfa_transitions[[state, byte]] || 0 }
        end

        seen = {}
        classes = profiles.map { |profile| seen[profile] ||= seen.size }
        [classes, seen.size]
      end

      # Flat row-major table: state * class_count + class => target state.
      def build_transition_table(classes, count)
        table = Array.new(@state_count * count, 0)
        @dfa_transitions.each do |(state, byte), target|
          table[(state * count) + classes[byte]] = target
        end
        table
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
# frozen_string_literal: true

module LexerKit
  module DFA
    # Case folding helpers for ASCII letters.
    # Mix this in and set @case_insensitive to enable folding; all
    # helpers are private to the including class.
    module CaseFolding
      private

      # True when +byte+ is an ASCII letter (A-Z or a-z).
      def ascii_letter?(byte)
        (0x41..0x5A).cover?(byte) || (0x61..0x7A).cover?(byte)
      end

      # Expand one byte into itself plus its opposite-case twin when
      # case-insensitive matching is on; the uppercase byte comes first.
      def case_fold_byte(byte)
        return [byte] unless @case_insensitive && ascii_letter?(byte)

        upper = byte <= 0x5A ? byte : byte - 32
        [upper, upper + 32]
      end

      # Expand a byte range with the opposite-case images of any portion
      # overlapping A-Z / a-z. Returns an array of [from, to] pairs.
      def case_fold_range(from, to)
        ranges = [[from, to]]
        return ranges unless @case_insensitive

        # Intersect [from, to] with [lo, hi]; nil when disjoint.
        clip = lambda do |lo, hi|
          [[from, lo].max, [to, hi].min] if from <= hi && to >= lo
        end

        if (upper = clip.call(0x41, 0x5A))
          ranges << [upper[0] + 32, upper[1] + 32]
        end
        if (lower = clip.call(0x61, 0x7A))
          ranges << [lower[0] - 32, lower[1] - 32]
        end

        ranges.uniq
      end
    end
  end
end
@@ -0,0 +1,81 @@
1
# frozen_string_literal: true

module LexerKit
  module DFA
    # CharClassCollector collects character class members ([a-z0-9] items)
    # and builds the appropriate regex AST. It separates byte/codepoint
    # bookkeeping from parsing control flow; case folding is handled at
    # the NFA layer, not here.
    class CharClassCollector
      include RegexAST

      def initialize
        @byte_ranges = []       # [[lo, hi], ...] of single bytes (0-255)
        @codepoint_ranges = []  # [[lo, hi], ...] of Unicode codepoints
      end

      # Add a single item (byte or codepoint).
      # @param item [Hash] { type: :byte or codepoint kind, value: Integer }
      def add_item(item)
        if item[:type] == :byte
          @byte_ranges << [item[:value], item[:value]]
        else
          @codepoint_ranges << [item[:value], item[:value]]
        end
      end

      # Add a range of items. Endpoints must be the same kind and ordered.
      # @raise [ArgumentError] on mixed kinds or reversed ranges
      def add_range(start_item, end_item)
        raise ArgumentError, "mixed byte and multibyte range in char class" if start_item[:type] != end_item[:type]

        from = start_item[:value]
        to = end_item[:value]

        if start_item[:type] == :byte
          # Consistency fix: the multibyte branch already rejected reversed
          # ranges; reject them here too instead of silently storing an
          # invalid [hi, lo] pair.
          raise ArgumentError, "invalid byte range" if from > to

          @byte_ranges << [from, to]
        else
          raise ArgumentError, "invalid multibyte range" if from > to

          @codepoint_ranges << [from, to]
        end
      end

      # Build the final AST for everything collected so far.
      # @param negated [Boolean] whether the class was written [^...]
      # @param meta [Object] source metadata attached to created nodes
      # @return [Object] a RegexAST node
      def to_ast(negated:, meta:)
        validate_negated_multibyte!(negated)
        ascii_ast = build_ascii_ast(negated, meta)
        utf8_ast = build_utf8_ast(meta)
        combine_asts(ascii_ast, utf8_ast, negated, meta)
      end

      private

      # Negation over multibyte members is not implemented.
      def validate_negated_multibyte!(negated)
        return unless negated && @codepoint_ranges.any?

        raise ArgumentError, "negated char class with multibyte is not supported"
      end

      # CharClass node for the single-byte members. Built even with no
      # byte members when negated, so [^...] still negates correctly.
      def build_ascii_ast(negated, meta)
        return nil unless @byte_ranges.any? || negated

        CharClass.new(ranges: @byte_ranges, negated: negated, meta: meta)
      end

      # UTF-8 expansion of the codepoint members, if any.
      def build_utf8_ast(meta)
        return nil unless @codepoint_ranges.any?

        utf8_ast = Utf8Range.ast_for_ranges(@codepoint_ranges)
        utf8_ast = utf8_ast.with(meta: meta) if utf8_ast.respond_to?(:with)
        utf8_ast
      end

      # Join the ASCII and UTF-8 halves: either alone, an Alternation of
      # both, or an empty CharClass when nothing was collected at all.
      def combine_asts(ascii_ast, utf8_ast, negated, meta)
        if utf8_ast
          return utf8_ast unless ascii_ast

          return Alternation.new(children: [ascii_ast, utf8_ast], meta: meta)
        end

        ascii_ast || CharClass.new(ranges: [], negated: negated, meta: meta)
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
# frozen_string_literal: true

module LexerKit
  module DFA
    # DFABuilder converts an NFA into a DFA via subset construction,
    # then delegates to ByteClassBuilder and DFAMinimizer for compression.
    class DFABuilder
      # @param nfa [Object] source automaton; must respond to
      #   epsilon_closure, move, start_state, accept_state and token_id
      def initialize(nfa)
        @nfa = nfa
        @dfa_states = {}      # NFA state set => DFA state id
        @dfa_transitions = {} # [state, byte] => target state id
        @dfa_accept = {}      # state id => token id
        @state_count = 1      # state 0 is reserved as the dead state
      end

      # Build the DFA table for the NFA.
      # @return [IR::DFATable]
      def build
        start_set = @nfa.epsilon_closure(Set[@nfa.start_state])
        run_subset_construction(start_set)
        create_dfa_table
      end

      private

      # Classic worklist-driven subset construction over all 256 byte values.
      def run_subset_construction(start_set)
        get_or_create_state(start_set)

        pending = [start_set]
        done = Set.new

        until pending.empty?
          current = pending.shift
          next unless done.add?(current) # add? is nil when already processed

          from = @dfa_states[current]
          (0..255).each do |byte|
            reachable = @nfa.move(current, byte)
            next if reachable.empty?

            closure = @nfa.epsilon_closure(reachable)
            @dfa_transitions[[from, byte]] = get_or_create_state(closure)
            pending << closure unless done.include?(closure)
          end
        end
      end

      # Look up the DFA state for an NFA state set, allocating a fresh id
      # (and recording acceptance) on first sight.
      def get_or_create_state(nfa_state_set)
        @dfa_states.fetch(nfa_state_set) do
          id = @state_count
          @state_count += 1
          @dfa_states[nfa_state_set] = id
          @dfa_accept[id] = @nfa.token_id if nfa_state_set.include?(@nfa.accept_state)
          id
        end
      end

      # Compress with byte classes, minimize, and wrap in an IR::DFATable.
      def create_dfa_table
        compressed = ByteClassBuilder.new(
          dfa_states: @dfa_states,
          dfa_transitions: @dfa_transitions,
          state_count: @state_count
        ).build

        minimized = DFAMinimizer.new(
          state_count: @state_count,
          transitions: compressed[:transitions],
          accept_states: @dfa_accept,
          class_count: compressed[:class_count]
        ).minimize

        IR::DFATable.new(
          state_count: minimized[:state_count],
          byte_class: compressed[:byte_class],
          transitions: minimized[:transitions],
          accept_states: minimized[:accept_states]
        )
      end
    end
  end
end
@@ -0,0 +1,158 @@
1
# frozen_string_literal: true

module LexerKit
  module DFA
    # DFAMinimizer reduces DFA state count by partition refinement
    # (Moore-style refine-until-stable; not Hopcroft's worklist variant).
    # Equivalent states are merged while preserving the recognized language.
    class DFAMinimizer
      # @param state_count [Integer] number of states (dead state 0 included)
      # @param transitions [Array<Integer>] flat table, state * class_count + cls
      # @param accept_states [Hash{Integer=>Integer}] accepting state => token id
      # @param class_count [Integer] number of byte equivalence classes
      def initialize(state_count:, transitions:, accept_states:, class_count:)
        @state_count = state_count
        @transitions = transitions
        @accept_states = accept_states
        @class_count = class_count
      end

      # Run minimization.
      # @return [Hash] { state_count:, transitions:, accept_states: }
      def minimize
        # Only the dead state plus at most one live state: nothing to merge.
        return trivial_result if @state_count <= 2

        groups = refine_partitions(build_initial_partitions)
        build_minimized_dfa(groups)
      end

      private

      # Pass-through result for DFAs that are already minimal.
      def trivial_result
        {
          state_count: @state_count,
          transitions: @transitions,
          accept_states: @accept_states
        }
      end

      # Seed partition: one group of non-accepting states (dead state 0
      # excluded), plus one group per distinct token id.
      def build_initial_partitions
        rejecting = (1...@state_count).reject { |s| @accept_states.key?(s) }

        by_token = {}
        @accept_states.each { |state, tid| (by_token[tid] ||= []) << state }

        seeds = rejecting.empty? ? [] : [rejecting]
        seeds + by_token.values
      end

      # Split groups repeatedly until every group is transition-consistent.
      def refine_partitions(partitions)
        index = build_state_mapping(partitions)

        loop do
          refined = partitions.flat_map { |group| split_partition(group, index) }
          # Splits only ever grow the count; equal size means a fixed point.
          return partitions if refined.size == partitions.size

          partitions = refined
          index = build_state_mapping(partitions)
        end
      end

      # state => index of the group currently containing it.
      def build_state_mapping(partitions)
        partitions.each_with_index.with_object({}) do |(group, idx), map|
          group.each { |state| map[state] = idx }
        end
      end

      # Break one group apart by transition profile; singletons stay whole.
      def split_partition(partition, index)
        return [partition] if partition.size < 2

        partition.group_by { |state| transition_profile(state, index) }.values
      end

      # For each byte class: the group index reached from +state+, -1 for
      # the dead state (or an unmapped target).
      def transition_profile(state, index)
        base = state * @class_count
        Array.new(@class_count) do |cls|
          target = @transitions[base + cls]
          target.zero? ? -1 : (index[target] || -1)
        end
      end

      # Renumber groups into a fresh DFA: group i becomes state i + 1,
      # state 0 remains the dead state.
      def build_minimized_dfa(partitions)
        index = build_state_mapping(partitions)

        # The group holding the old start state (1) must become new state 1.
        start_group = index[1]
        if start_group && start_group != 0
          partitions[start_group], partitions[0] = partitions[0], partitions[start_group]
          index = build_state_mapping(partitions)
        end

        total = partitions.size + 1
        table = Array.new(total * @class_count, 0)
        accepts = {}

        partitions.each_with_index do |group, idx|
          new_id = idx + 1
          rep = group.first # all members are equivalent; any representative works

          (0...@class_count).each do |cls|
            old_target = @transitions[(rep * @class_count) + cls]
            next if old_target.zero? # dead transitions stay 0

            mapped = index[old_target]
            table[(new_id * @class_count) + cls] = mapped && mapped + 1
          end

          accepts[new_id] = @accept_states[rep] if @accept_states.key?(rep)
        end

        {
          state_count: total,
          transitions: table,
          accept_states: accepts
        }
      end
    end
  end
end