kleene 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/kleene/dsl.rb ADDED
@@ -0,0 +1,263 @@
1
# Most of the machines constructed here are based on section 2.5 of the Ragel User Guide (http://www.colm.net/files/ragel/ragel-guide-6.10.pdf)

module Kleene
  module DSL
    extend self

    ############### The following methods create FSAs given a stream of input tokens #################

    # Builds an NFA that matches exactly the string `token_stream`.
    # given a string with N characters in it:
    # N+1 states: start state and N other states
    # structure: start state -> transition for first character in the string -> state for having observed first character in the string ->
    #            transition for second character in the string -> state for having observed second character in the string ->
    #            ...
    #            transition for last character in the string -> state for having observed last character in the string (marked final)
    def literal(token_stream, alphabet = DEFAULT_ALPHABET)
      start = current_state = State.new
      nfa = NFA.new(start, alphabet)
      token_stream.each_char do |token|
        next_state = State.new
        nfa.add_transition(token, current_state, next_state)
        current_state = next_state
      end
      current_state.final = true
      nfa.update_final_states
      nfa.set_regex_pattern(token_stream)
      nfa
    end

    # Builds an NFA that matches any single token in `token_collection`.
    # two states: start state and final state
    # structure: start state -> transitions for each token in the token collection -> final state
    def any(token_collection, alphabet = DEFAULT_ALPHABET)
      start = State.new
      nfa = NFA.new(start, alphabet)
      final = State.new(true)
      token_collection.each { |token| nfa.add_transition(token, start, final) }
      nfa.update_final_states
      # NOTE: `token_collection` may be an Array or a Set (see `dot`); Ruby's Set
      # does not respond to `join`, so convert to an Array first.
      nfa.set_regex_pattern("[#{token_collection.to_a.join}]")
      nfa
    end

    # Builds an NFA that matches any single token in `alphabet`.
    # two states: start state and final state
    # structure: start state -> transitions for every token in the alphabet -> final state
    def dot(alphabet = DEFAULT_ALPHABET)
      any(alphabet, alphabet).set_regex_pattern(".")
    end

    # This implements a character class, and is specifically for use with matching strings
    def range(c_begin, c_end, alphabet = DEFAULT_ALPHABET)
      any((c_begin..c_end).to_a, alphabet).set_regex_pattern("[#{c_begin}-#{c_end}]")
    end

    ############### The following methods create FSAs given other FSAs #################

    # always clones the given nfa and returns a new nfa with a non-final error state
    def with_err(nfa, alphabet = nfa.alphabet)
      with_err!(nfa.deep_clone, alphabet)
    end

    # adds an error state to the NFA, creating error transitions from all non-error states to the error state on any unhandled token.
    # the error state transitions to itself on any token.
    # Returns `nfa` unchanged if it already contains an error state.
    def with_err!(nfa, alphabet = nfa.alphabet)
      error_state = nfa.states.find(&:error?)
      return nfa if error_state

      error_state = State.new_error_state
      nfa.add_state(error_state)

      nfa.states.each do |state|
        tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token)
        missing_tokens = alphabet - tokens_on_outbound_transitions
        missing_tokens.each do |token|
          nfa.add_transition(token, state, error_state)
        end
      end

      # remove the error state if it has no inbound or outbound transitions
      nfa.remove_state(error_state) if nfa.all_transitions.none? { |transition| transition.from == error_state || transition.to == error_state }

      nfa.set_regex_pattern("/#{nfa.regex_pattern}/E")
    end

    # always clones the given nfa and returns a new nfa with a non-final error state
    def with_err_dead_end(nfa, alphabet = nfa.alphabet)
      with_err_dead_end!(nfa.deep_clone, alphabet)
    end

    # adds an error state to the NFA, creating error transitions from all non-error states to the error state on any unhandled token.
    # the error state doesn't have any outbound transitions.
    # Returns `nfa` unchanged if it already contains an error state.
    def with_err_dead_end!(nfa, alphabet = nfa.alphabet)
      error_state = nfa.states.find(&:error?)
      return nfa if error_state

      error_state = State.new_error_state
      nfa.add_state(error_state)

      nfa.states.each do |state|
        unless state.error?
          tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token).to_set
          only_outbound_transition_is_epsilon_transition = tokens_on_outbound_transitions.size == 1 && tokens_on_outbound_transitions.first == NFATransition::Epsilon
          unless only_outbound_transition_is_epsilon_transition
            missing_tokens = (alphabet - Set[NFATransition::Epsilon]) - tokens_on_outbound_transitions
            missing_tokens.each do |token|
              nfa.add_transition(token, state, error_state)
            end
          end
        end
      end

      # remove the error state if it has no inbound or outbound transitions
      nfa.remove_state(error_state) if nfa.all_transitions.none? { |transition| transition.from == error_state || transition.to == error_state }

      nfa.set_regex_pattern("/#{nfa.regex_pattern}/DE")
    end

    # Append b onto a
    # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
    # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
    def append(a, b)
      a = a.deep_clone
      b = b.deep_clone
      append!(a, b)
    end

    # Destructively append b onto a
    # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
    # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
    def append!(a, b)
      a.alphabet = a.alphabet | b.alphabet

      # add an epsilon transition from each final state of machine a to the start state of machine b.
      a.final_states.each do |final_state|
        a.add_transition(NFATransition::Epsilon, final_state, b.start_state)
      end

      # add all of machine b's transitions to machine a
      b.all_transitions.each { |transition| a.add_transition(transition.token, transition.from, transition.to) }
      a.update_final_states

      a
    end

    # Left-fold of `seq2` over the given machines.
    def seq(*nfas)
      nfas.flatten.reduce { |memo_nfa, nfa| seq2(memo_nfa, nfa) }
    end

    # Implements concatenation, as defined in the Ragel manual in section 2.5.5 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # Seq produces a machine that matches all the strings in machine `a` followed by all the strings in machine `b`.
    # Seq draws epsilon transitions from the final states of the first machine to the start state of the second machine.
    # The final states of the first machine lose their final state status, unless the start state of the second machine is final as well.
    def seq2(a, b)
      a = a.deep_clone
      b = b.deep_clone

      a = append!(a, b)

      # make sure that b's final states are the only final states in a after we have appended b onto a
      # (`include?`, not Crystal's `includes?` — this is the Ruby port)
      a.states.each { |state| state.final = b.final_states.include?(state) }
      a.update_final_states

      a.set_regex_pattern("#{a.regex_pattern}#{b.regex_pattern}")
    end

    # Build a new machine consisting of a new start state with epsilon transitions to the start state of all the given NFAs in `nfas`.
    # The resulting machine's final states are the set of final states from all the NFAs in `nfas`.
    #
    # Implements Union, as defined in the Ragel manual in section 2.5.1 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # The union operation produces a machine that matches any string in machine one or machine two.
    # The operation first creates a new start state.
    # Epsilon transitions are drawn from the new start state to the start states of both input machines.
    # The resulting machine has a final state set equivalent to the union of the final state sets of both input machines.
    def union(*nfas)
      union!(nfas.flatten.map(&:deep_clone))
    end

    # same as union, but doesn't deep clone the constituent nfas
    def union!(nfas)
      start = State.new
      composite_alphabet = nfas.map(&:alphabet).reduce { |memo, alphabet| memo | alphabet }
      new_nfa = NFA.new(start, composite_alphabet)

      # add epsilon transitions from the start state of the new machine to the start state of each constituent machine
      nfas.each do |nfa|
        new_nfa.add_states(nfa.states)
        new_nfa.add_transition(NFATransition::Epsilon, start, nfa.start_state)
        nfa.all_transitions.each { |t| new_nfa.add_transition(t.token, t.from, t.to) }
      end

      new_nfa.update_final_states

      new_nfa.set_regex_pattern(nfas.map(&:regex_pattern).join("|"))
    end

    # Implements Kleene Star, as defined in the Ragel manual in section 2.5.6 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # The machine resulting from the Kleene Star operator will match zero or more repetitions of the machine it is applied to.
    # It creates a new start state and an additional final state.
    # Epsilon transitions are drawn between the new start state and the old start state,
    # between the new start state and the new final state, and between the final states of the machine and the new start state.
    def kleene(machine)
      machine = machine.deep_clone
      start = State.new
      final = State.new(true)

      nfa = NFA.new(start, machine.alphabet)
      nfa.add_states(machine.states)
      nfa.add_transition(NFATransition::Epsilon, start, final)
      nfa.add_transition(NFATransition::Epsilon, start, machine.start_state)
      machine.final_states.each do |final_state|
        nfa.add_transition(NFATransition::Epsilon, final_state, start)
        final_state.final = false
      end

      # add all of machine's transitions to the new machine
      machine.all_transitions.each { |t| nfa.add_transition(t.token, t.from, t.to) }
      nfa.update_final_states

      nfa.set_regex_pattern("#{machine.regex_pattern}*")
    end

    # One or more repetitions: machine followed by (machine)*.
    def plus(machine)
      seq(machine, kleene(machine)).set_regex_pattern("#{machine.regex_pattern}+")
    end

    # Zero or one occurrence: union of machine with the empty machine.
    def optional(machine)
      empty = NFA.new(State.new(true), machine.alphabet).set_regex_pattern("")
      union(machine, empty).set_regex_pattern("#{machine.regex_pattern}?")
    end

    # def repeat(machine, min, max = nil)
    #   max ||= min
    #   m = NFA.new(State.new(true), machine.alphabet)
    #   min.times { m = seq(m, machine) }
    #   (max - min).times { m = append(m, machine) }
    #   if min != max
    #     m.set_regex_pattern("#{machine.regex_pattern}{#{min},#{max}}")
    #   else
    #     m.set_regex_pattern("#{machine.regex_pattern}{#{min}}")
    #   end
    # end

    # def negate(machine)
    #   machine = machine.to_dfa

    #   # invert the final flag of every state
    #   machine.states.each {|state| state.final = !state.final? }
    #   machine.update_final_states

    #   machine.to_nfa.set_regex_pattern("(!#{machine.regex_pattern})")
    # end

    # # a - b == a && !b
    # def difference(a, b)
    #   intersection(a, negate(b))
    # end

    # # By De Morgan's Law: !(!a || !b) = a && b
    # def intersection(a, b)
    #   negate(union(negate(a), negate(b)))
    # end
  end
end
@@ -0,0 +1,88 @@
1
+ # this is a port and extension of https://github.com/davidkellis/kleene/
2
+
3
+ require_relative "./dsl"
4
+ require_relative "./nfa"
5
+ require_relative "./dfa"
6
+
7
module Kleene
  # The default alphabet consists of the printable ASCII characters (' '..'~')
  # plus newline and tab:
  # Set{' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
  #     '0'..'9', ':', ';', '<', '=', '>', '?', '@', 'A'..'Z', '[', '\\', ']', '^', '_', '`',
  #     'a'..'z', '{', '|', '}', '~', "\n", "\t"}
  DEFAULT_ALPHABET = ((' '..'~').to_a + "\n\t".chars).to_set.freeze

  class State
    # Class-instance variable (not a @@class variable, which would be shared
    # across the whole inheritance tree) used to hand out unique state ids.
    @next_id = 0

    # Returns a fresh, monotonically increasing id for a new State.
    def self.next_id
      @next_id += 1
    end

    # Builds a state flagged as an error state (non-final by default).
    def self.new_error_state(final = false)
      State.new(final, true)
    end

    attr_reader :id # : Int32
    attr_accessor :final # : Bool
    attr_accessor :error # : Bool

    def initialize(final = false, error = false, id = nil)
      @final = final
      @error = error
      @id = id || State.next_id
    end

    # is this an error state?
    def error?
      @error
    end

    # is this a final state?
    def final?
      @final
    end

    # Copies the final/error flags but deliberately takes a fresh id,
    # so a duplicate is never identity-equal to its source.
    def dup
      State.new(@final, @error, nil)
    end

    def to_s
      "State{id: #{id}, final: #{final}, error: #{error}}"
    end
  end

  # A reference to a matched substring: the original string plus the index
  # range of the match within it.
  class MatchRef
    attr_accessor :string # : String
    attr_accessor :range # : Range(Int32, Int32)

    def initialize(original_string, match_range)
      @string = original_string
      @range = match_range
    end

    # The matched text itself.
    def text
      @string[@range]
    end

    def to_s
      text
    end

    def ==(other)
      other.is_a?(MatchRef) &&
        @string == other.string &&
        @range == other.range
    end

    def eql?(other)
      self == other
    end

    # Defined alongside ==/eql? so MatchRef works correctly as a Hash key
    # and Set member (equal refs must have equal hashes).
    def hash
      [@string, @range].hash
    end
  end
end
@@ -0,0 +1,308 @@
1
+ require_relative "./kleene"
2
+
3
+ module Kleene
4
+ class MachineTuple
5
+ attr_accessor :nfa # : NFA
6
+ attr_accessor :nfa_with_dead_err # : NFA
7
+ attr_accessor :dfa # : DFA
8
+
9
+ def initialize(nfa, nfa_with_dead_err, dfa)
10
+ @nfa, @nfa_with_dead_err, @dfa = nfa, nfa_with_dead_err, dfa
11
+ end
12
+ end
13
+
14
+ class MultiMatchDFA
15
+ include DSL
16
+
17
+ # @original_nfas : Array(NFA)
18
+ attr_reader :nfas_with_err_state # : Array(NFA)
19
+ attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
20
+ attr_accessor :composite_nfa # : NFA
21
+ attr_accessor :composite_dfa # : DFA
22
+
23
+ attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
24
+ attr_accessor :nfa_to_index # : Hash(NFA, Int32)
25
+ attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
26
+ attr_accessor :dfa_to_index # : Hash(DFA, Int32)
27
+
28
+ def initialize(nfas)
29
+ composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }
30
+
31
+ @original_nfas = nfas
32
+ @nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
33
+ dfas = @original_nfas.map(&:to_dfa)
34
+
35
+ @nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
36
+ @nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
37
+ @dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
38
+ @machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h
39
+
40
+ # build a mapping of (state -> nfa) pairs that capture which nfa owns each state
41
+ @dead_end_nfa_state_to_dead_end_nfa = Hash.new
42
+ @nfas_with_err_state.each do |nfa_with_dead_err|
43
+ nfa_with_dead_err.states.each do |state|
44
+ @dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
45
+ end
46
+ end
47
+
48
+ # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
49
+ @composite_nfa = create_composite_nfa(@nfas_with_err_state)
50
+ @composite_dfa = @composite_nfa.to_dfa
51
+ end
52
+
53
+ def machines_from_nfa(nfa) # : MachineTuple
54
+ machines_by_index[nfa_to_index[nfa]]
55
+ end
56
+
57
+ def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
58
+ machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
59
+ end
60
+
61
+ def machines_from_dfa(dfa) # : MachineTuple
62
+ machines_by_index[dfa_to_index[dfa]]
63
+ end
64
+
65
+ # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
66
+ def create_composite_nfa(nfas)
67
+ nfa = union!(nfas)
68
+
69
+ # add epsilon transitions from all the states except the start state back to the start state
70
+ nfa.states.each do |state|
71
+ if state != nfa.start_state
72
+ nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
73
+ end
74
+ end
75
+
76
+ nfa.update_final_states
77
+
78
+ nfa
79
+ end
80
+
81
+ def match_tracker(input) # : MatchTracker
82
+ dfa = @composite_dfa.deep_clone
83
+ match_tracker = setup_callbacks(dfa)
84
+
85
+ input.each_char.with_index do |char, index|
86
+ dfa.handle_token!(char, index)
87
+ end
88
+
89
+ match_tracker
90
+ end
91
+
92
+ def matches(input) # : Hash(NFA, Array(MatchRef))
93
+ mt = match_tracker(input)
94
+
95
+ start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions
96
+
97
+ mt.empty_matches.each do |nfa_with_dead_err, indices|
98
+ original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
99
+ indices.each do |index|
100
+ mt.add_match(original_nfa, MatchRef.new(input, index...index))
101
+ end
102
+ end
103
+
104
+ active_dfas = Array.new # the Int32 represents the start of match
105
+
106
+ input.each_char.with_index do |char, index|
107
+ active_dfas.reject! do |active_dfa_tuple|
108
+ dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple
109
+
110
+ dfa_clone.handle_token!(char, index)
111
+ mt.add_match(original_nfa, MatchRef.new(input, start_of_match_index..index)) if dfa_clone.accept?
112
+
113
+ dfa_clone.error?
114
+ end
115
+
116
+ if nfas_with_dead_err = start_index_to_nfas_that_may_match[index]
117
+ nfas_with_dead_err.each do |nfa_with_dead_err|
118
+ machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
119
+ original_nfa = machines.nfa
120
+ dfa = machines.dfa
121
+ dfa_clone = dfa.shallow_clone
122
+
123
+ dfa_clone.handle_token!(char, index)
124
+ mt.add_match(original_nfa, MatchRef.new(input, index..index)) if dfa_clone.accept?
125
+
126
+ active_dfas << [dfa_clone, original_nfa, index] unless dfa_clone.error?
127
+ end
128
+ end
129
+ end
130
+
131
+ mt.matches
132
+ end
133
+
134
+ def setup_callbacks(dfa)
135
+ match_tracker = MatchTracker.new
136
+
137
+ # 1. identify DFA states that correspond to successful match of first character of the NFAs
138
+ epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
139
+ nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
140
+ reject {|transition| transition.epsilon? || transition.to.error? }.
141
+ map(&:to).to_set
142
+ dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
143
+ compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
144
+ reduce(Set.new) {|memo, state_set| memo | state_set }
145
+ dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
146
+ dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
147
+ dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
148
+ select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
149
+ compact_map do |nfa_state|
150
+ dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
151
+ end.to_set
152
+ end
153
+
154
+ # 2. identify DFA states that correspond to final states in the NFAs
155
+ nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
156
+ dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
157
+ reduce(Set.new) {|memo, state_set| memo | state_set }
158
+ dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
159
+ dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
160
+ dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
161
+ select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
162
+ compact_map do |nfa_state|
163
+ dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
164
+ end.to_set
165
+ end
166
+
167
+ # 3. Identify DFA states that correspond to successful match without even having seen any characters.
168
+ # These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
169
+ nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
170
+ dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
171
+ compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
172
+ reduce(Set.new) {|memo, state_set| memo | state_set }
173
+ dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
174
+ dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
175
+ dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
176
+ select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
177
+ compact_map do |nfa_state|
178
+ dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
179
+ end.to_set
180
+ end
181
+
182
+ # set up call transition call backs, since the callbacks may only be defined once per state and transition
183
+ # For (1):
184
+ # Set up transition callbacks to push the index position of the start of a match of each NFA that has begun
185
+ # to be matched on the transition to one of the states in (1)
186
+ # For (2):
187
+ # set up transition callbacks to push the index position of the end of a successful match onto the list
188
+ # of successful matches for the NFA that matched
189
+ # For (3):
190
+ # set up transision callbacks to capture successful empty matches
191
+ destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
192
+ destination_dfa_states_for_callbacks.each do |dfa_state|
193
+ dfa.on_transition_to(dfa_state) do |transition, token, token_index|
194
+ destination_dfa_state = transition.to
195
+
196
+ should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
197
+ should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
198
+ should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)
199
+
200
+ if should_track_empty_match
201
+ dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
202
+ match_tracker.add_empty_match(nfa_with_dead_end, token_index)
203
+ end
204
+ end
205
+
206
+ if should_track_start_of_candidate_match
207
+ nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
208
+ nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
209
+ dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
210
+ dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
211
+ match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
212
+ end
213
+ end
214
+
215
+ if should_track_end_of_match
216
+ dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
217
+ match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
218
+ end
219
+ end
220
+ end
221
+ end
222
+
223
+ match_tracker
224
+ end
225
+
226
+ end
227
+
228
+ class MatchTracker
229
+ # The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
230
+ # They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
231
+ # are the internal state of a MultiMatchDFA
232
+ attr_accessor :candidate_match_start_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfStartOfMatch)
233
+ # The end positions are indices at which, after handling the character, the DFA was observed to be in a match/accept state;
234
+ # however, the interpretation is ambiguous, because the accepting state may be as a result of (1) transitioning to an error state that is also marked final/accepting,
235
+ # OR it may be as a result of transitioning to (2) a non-error final state.
236
+ # In the case of (1), the match may be an empty match, where after transitioning to an error state, the DFA is in a state that
237
+ # is equivalent to the error state and start state and final state (e.g. as in an optional or kleene star DFA),
238
+ # while in the case of (2), the match may be a "normal" match.
239
+ # The ambiguity is problematic because it isn't clear whether the index position of the match is end inclusive end of a match
240
+ # or the beginning of an empty match.
241
+ # This ambiguity is all due to the construction of the composite DFA in the MultiMatchDFA - the dead end error states are epsilon-transitioned
242
+ # to the composite DFA's start state.
243
+ attr_accessor :match_end_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEndOfMatch)
244
+ attr_accessor :empty_matches # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEmptyMatch)
245
+
246
+ # The NFA keys in the following structure are the original NFAs supplied to the MultiMatchDFA.
247
+ # This is in contrast to the augmented NFAs that are used as keys in the candidate_match_start_positions and
248
+ # match_end_positions structures, documented above ^^^.
249
+ attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
250
+
251
+ def initialize
252
+ @candidate_match_start_positions = Hash.new
253
+ @match_end_positions = Hash.new
254
+ @empty_matches = Hash.new
255
+ @matches = Hash.new
256
+ end
257
+
258
+ def start_positions(nfa)
259
+ candidate_match_start_positions[nfa] ||= Array.new
260
+ end
261
+
262
+ def end_positions(nfa)
263
+ match_end_positions[nfa] ||= Array.new
264
+ end
265
+
266
+ def empty_match_positions(nfa)
267
+ empty_matches[nfa] ||= Array.new
268
+ end
269
+
270
+ def matches_for(nfa)
271
+ matches[nfa] ||= Array.new
272
+ end
273
+
274
+ def add_start_of_candidate_match(nfa_with_dead_end, token_index)
275
+ # puts "add_start_of_candidate_match(#{nfa.object_id}, #{token_index})"
276
+ positions = start_positions(nfa_with_dead_end)
277
+ positions << token_index
278
+ end
279
+
280
+ # the end positions are inclusive of the index of the last character matched, so empty matches are not accounted for in the match_end_positions array
281
+ def add_end_of_match(nfa_with_dead_end, token_index)
282
+ # puts "add_end_of_match(#{nfa.object_id}, #{token_index})"
283
+ positions = end_positions(nfa_with_dead_end)
284
+ positions << token_index
285
+ end
286
+
287
+ def add_empty_match(nfa_with_dead_end, token_index)
288
+ positions = empty_match_positions(nfa_with_dead_end)
289
+ positions << token_index
290
+ end
291
+
292
+ def invert_candidate_match_start_positions # : Hash(Int32, Array(NFA))
293
+ index_to_nfas = Hash.new
294
+ candidate_match_start_positions.each do |nfa_with_dead_end, indices|
295
+ indices.each do |index|
296
+ nfas = index_to_nfas[index] ||= Array.new
297
+ nfas << nfa_with_dead_end
298
+ end
299
+ end
300
+ index_to_nfas
301
+ end
302
+
303
+ def add_match(nfa, match)
304
+ matches = matches_for(nfa)
305
+ matches << match
306
+ end
307
+ end
308
+ end