kleene 0.4.0

data/lib/kleene/dsl.rb ADDED
@@ -0,0 +1,263 @@
+ # Most of the machines constructed here are based on section 2.5 of the Ragel User Guide (http://www.colm.net/files/ragel/ragel-guide-6.10.pdf)
+
+ module Kleene
+   module DSL
+     extend self
+
+     ############### The following methods create FSAs given a stream of input tokens #################
+
+     # given a string with N characters in it:
+     # N+1 states: start state and N other states
+     # structure: start state -> transition for first character in the string -> state for having observed first character in the string ->
+     #            transition for second character in the string -> state for having observed second character in the string ->
+     #            ...
+     #            transition for last character in the string -> state for having observed last character in the string (marked final)
+     def literal(token_stream, alphabet = DEFAULT_ALPHABET)
+       start = current_state = State.new
+       nfa = NFA.new(start, alphabet)
+       token_stream.each_char do |token|
+         next_state = State.new
+         nfa.add_transition(token, current_state, next_state)
+         current_state = next_state
+       end
+       current_state.final = true
+       nfa.update_final_states
+       nfa.set_regex_pattern(token_stream)
+       nfa
+     end
+
+     # two states: start state and final state
+     # structure: start state -> transitions for each token in the token collection -> final state
+     def any(token_collection, alphabet = DEFAULT_ALPHABET)
+       start = State.new
+       nfa = NFA.new(start, alphabet)
+       final = State.new(true)
+       token_collection.each {|token| nfa.add_transition(token, start, final) }
+       nfa.update_final_states
+       regex_pattern = "[#{token_collection.join("")}]"
+       nfa.set_regex_pattern(regex_pattern)
+       nfa
+     end
+
+     # two states: start state and final state
+     # structure: start state -> transitions for every token in the alphabet -> final state
+     def dot(alphabet = DEFAULT_ALPHABET)
+       any(alphabet, alphabet).set_regex_pattern(".")
+     end
+
+     # This implements a character class, and is specifically for use with matching strings
+     def range(c_begin, c_end, alphabet = DEFAULT_ALPHABET)
+       any((c_begin..c_end).to_a, alphabet).set_regex_pattern("[#{c_begin}-#{c_end}]")
+     end
+
+     ############### The following methods create FSAs given other FSAs #################
+
+     # always clones the given nfa and returns a new nfa with a non-final error state
+     def with_err(nfa, alphabet = nfa.alphabet)
+       with_err!(nfa.deep_clone, alphabet)
+     end
+
+     # adds an error state to the NFA, creating error transitions from all non-error states to the error state on any unhandled token.
+     # the error state transitions to itself on any token.
+     def with_err!(nfa, alphabet = nfa.alphabet)
+       error_state = nfa.states.find(&:error?)
+       return nfa if error_state
+
+       error_state = State.new_error_state
+       nfa.add_state(error_state)
+
+       nfa.states.each do |state|
+         tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token)
+         missing_tokens = alphabet - tokens_on_outbound_transitions
+         missing_tokens.each do |token|
+           nfa.add_transition(token, state, error_state)
+         end
+       end
+
+       nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }
+
+       nfa.set_regex_pattern("/#{nfa.regex_pattern}/E")
+     end
+
+     # always clones the given nfa and returns a new nfa with a non-final error state
+     def with_err_dead_end(nfa, alphabet = nfa.alphabet)
+       with_err_dead_end!(nfa.deep_clone, alphabet)
+     end
+
+     # adds an error state to the NFA, creating error transitions from all non-error states to the error state on any unhandled token.
+     # the error state doesn't have any outbound transitions.
+     def with_err_dead_end!(nfa, alphabet = nfa.alphabet)
+       error_state = nfa.states.find(&:error?)
+       return nfa if error_state
+
+       error_state = State.new_error_state
+       nfa.add_state(error_state)
+
+       nfa.states.each do |state|
+         unless state.error?
+           tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token).to_set
+           only_outbound_transition_is_epsilon_transition = tokens_on_outbound_transitions.size == 1 && tokens_on_outbound_transitions.first == NFATransition::Epsilon
+           unless only_outbound_transition_is_epsilon_transition
+             missing_tokens = (alphabet - Set[NFATransition::Epsilon]) - tokens_on_outbound_transitions
+             missing_tokens.each do |token|
+               nfa.add_transition(token, state, error_state)
+             end
+           end
+         end
+       end
+
+       # remove the error state if it has no inbound or outbound transitions
+       nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }
+
+       nfa.set_regex_pattern("/#{nfa.regex_pattern}/DE")
+     end
+
+     # Append b onto a
+     # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
+     # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
+     def append(a, b)
+       a = a.deep_clone
+       b = b.deep_clone
+       append!(a, b)
+     end
+
+     # Destructively append b onto a
+     # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
+     # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
+     def append!(a, b)
+       a.alphabet = a.alphabet | b.alphabet
+
+       # add an epsilon transition from each final state of machine a to the start state of machine b.
+       a.final_states.each do |final_state|
+         a.add_transition(NFATransition::Epsilon, final_state, b.start_state)
+       end
+
+       # add all of machine b's transitions to machine a
+       b.all_transitions.each {|transition| a.add_transition(transition.token, transition.from, transition.to) }
+       # a.final_states = a.final_states | b.final_states
+       a.update_final_states
+
+       a
+     end
+
+     def seq(*nfas)
+       nfas.flatten.reduce {|memo_nfa, nfa| seq2(memo_nfa, nfa) }
+     end
+
+     # Implements concatenation, as defined in the Ragel manual in section 2.5.5 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
+     # Seq produces a machine that matches all the strings in machine `a` followed by all the strings in machine `b`.
+     # Seq draws epsilon transitions from the final states of the first machine to the start state of the second machine.
+     # The final states of the first machine lose their final state status, unless the start state of the second machine is final as well.
+     def seq2(a, b)
+       a = a.deep_clone
+       b = b.deep_clone
+
+       a = append!(a, b)
+
+       # make sure that b's final states are the only final states in a after we have appended b onto a
+       a.states.each {|state| state.final = b.final_states.includes?(state) }
+       a.update_final_states
+
+       a.set_regex_pattern("#{a.regex_pattern}#{b.regex_pattern}")
+     end
+
+     # Build a new machine consisting of a new start state with epsilon transitions to the start state of all the given NFAs in `nfas`.
+     # The resulting machine's final states are the set of final states from all the NFAs in `nfas`.
+     #
+     # Implements Union, as defined in the Ragel manual in section 2.5.1 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
+     # The union operation produces a machine that matches any string in machine one or machine two.
+     # The operation first creates a new start state.
+     # Epsilon transitions are drawn from the new start state to the start states of both input machines.
+     # The resulting machine has a final state set equivalent to the union of the final state sets of both input machines.
+     def union(*nfas)
+       nfas.flatten!
+       nfas = nfas.map(&:deep_clone)
+       union!(nfas)
+     end
+
+     # same as union, but doesn't deep clone the constituent nfas
+     def union!(nfas)
+       start = State.new
+       composite_alphabet = nfas.map(&:alphabet).reduce {|memo, alphabet| memo | alphabet }
+       new_nfa = NFA.new(start, composite_alphabet)
+
+       # add epsilon transitions from the start state of the new machine to the start state of each constituent machine
+       nfas.each do |nfa|
+         new_nfa.add_states(nfa.states)
+         new_nfa.add_transition(NFATransition::Epsilon, start, nfa.start_state)
+         nfa.all_transitions.each {|t| new_nfa.add_transition(t.token, t.from, t.to) }
+       end
+
+       new_nfa.update_final_states
+
+       new_nfa.set_regex_pattern("#{nfas.map(&:regex_pattern).join("|")}")
+     end
+
+     # Implements Kleene Star, as defined in the Ragel manual in section 2.5.6 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
+     # The machine resulting from the Kleene Star operator will match zero or more repetitions of the machine it is applied to.
+     # It creates a new start state and an additional final state.
+     # Epsilon transitions are drawn between the new start state and the old start state,
+     # between the new start state and the new final state, and between the final states of the machine and the new start state.
+     def kleene(machine)
+       machine = machine.deep_clone
+       start = State.new
+       final = State.new(true)
+
+       nfa = NFA.new(start, machine.alphabet)
+       nfa.add_states(machine.states)
+       nfa.add_transition(NFATransition::Epsilon, start, final)
+       nfa.add_transition(NFATransition::Epsilon, start, machine.start_state)
+       machine.final_states.each do |final_state|
+         nfa.add_transition(NFATransition::Epsilon, final_state, start)
+         final_state.final = false
+       end
+
+       # add all of machine's transitions to the new machine
+       (machine.all_transitions).each {|t| nfa.add_transition(t.token, t.from, t.to) }
+       nfa.update_final_states
+
+       nfa.set_regex_pattern("#{machine.regex_pattern}*")
+     end
+
+     def plus(machine)
+       seq(machine, kleene(machine)).set_regex_pattern("#{machine.regex_pattern}+")
+     end
+
+     def optional(machine)
+       empty = NFA.new(State.new(true), machine.alphabet).set_regex_pattern("")
+       union(machine, empty).set_regex_pattern("#{machine.regex_pattern}?")
+     end
+
+     # def repeat(machine, min, max = nil)
+     #   max ||= min
+     #   m = NFA.new(State.new(true), machine.alphabet)
+     #   min.times { m = seq(m, machine) }
+     #   (max - min).times { m = append(m, machine) }
+     #   if min != max
+     #     m.set_regex_pattern("#{machine.regex_pattern}{#{min},#{max}}")
+     #   else
+     #     m.set_regex_pattern("#{machine.regex_pattern}{#{min}}")
+     #   end
+     # end
+
+     # def negate(machine)
+     #   machine = machine.to_dfa
+
+     #   # invert the final flag of every state
+     #   machine.states.each {|state| state.final = !state.final? }
+     #   machine.update_final_states
+
+     #   machine.to_nfa.set_regex_pattern("(!#{machine.regex_pattern})")
+     # end
+
+     # # a - b == a && !b
+     # def difference(a, b)
+     #   intersection(a, negate(b))
+     # end
+
+     # # By De Morgan's Law: !(!a || !b) = a && b
+     # def intersection(a, b)
+     #   negate(union(negate(a), negate(b)))
+     # end
+   end
+ end
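
The DSL methods above compose by value: each combinator returns a new NFA, and regex_pattern is only a human-readable label for the machine. A minimal usage sketch follows (hypothetical, not taken from the gem's documentation; it assumes the gem's entry point is `require "kleene"` and otherwise uses only methods defined in this diff):

    require "kleene"
    include Kleene::DSL

    ab_plus = plus(literal("ab"))     # one or more repetitions of the literal "ab"
    digits  = plus(range("0", "9"))   # one or more ASCII digit characters
    either  = union(ab_plus, digits)  # matches whatever either machine matches
    puts either.regex_pattern         # display-only label, e.g. "ab+|[0-9]+"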
@@ -0,0 +1,88 @@
+ # this is a port and extension of https://github.com/davidkellis/kleene/
+
+ require_relative "./dsl"
+ require_relative "./nfa"
+ require_relative "./dfa"
+
+ module Kleene
+   # The default alphabet consists of the following:
+   # Set{' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
+   #     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+   #     ':', ';', '<', '=', '>', '?', '@',
+   #     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
+   #     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
+   #     '[', '\\', ']', '^', '_', '`',
+   #     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
+   #     'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+   #     '{', '|', '}', '~', "\n", "\t"}
+   DEFAULT_ALPHABET = ((' '..'~').to_a + "\n\t".chars).to_set
+
+   class State
+     @@next_id = 0
+
+     def self.next_id
+       @@next_id += 1
+     end
+
+     def self.new_error_state(final = false)
+       State.new(final, true)
+     end
+
+
+     attr_reader :id # : Int32
+     attr_accessor :final # : Bool
+     attr_accessor :error # : Bool
+
+     def initialize(final = false, error = false, id = nil)
+       @final = final
+       @error = error
+       @id = id || State.next_id
+     end
+
+     # is this an error state?
+     def error?
+       @error
+     end
+
+     # is this a final state?
+     def final?
+       @final
+     end
+
+     def dup
+       State.new(@final, @error, nil)
+     end
+
+     def to_s
+       "State{id: #{id}, final: #{final}, error: #{error}}"
+     end
+   end
+
+   class MatchRef
+     attr_accessor :string # : String
+     attr_accessor :range # : Range(Int32, Int32)
+
+     def initialize(original_string, match_range)
+       @string = original_string
+       @range = match_range
+     end
+
+     def text
+       @string[@range]
+     end
+
+     def to_s
+       text
+     end
+
+     def ==(other)
+       @string == other.string &&
+         @range == other.range
+     end
+
+     def eql?(other)
+       self == other
+     end
+   end
+
+ end
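
A short sketch of the two value classes above (hypothetical usage, not from the gem's documentation):

    final_state = Kleene::State.new(true)        # final, non-error state with an auto-assigned id
    error_state = Kleene::State.new_error_state  # non-final error state
    final_state.final?                           # => true
    error_state.error?                           # => true

    m = Kleene::MatchRef.new("foobar", 3..5)
    m.text                                       # => "bar"
    m == Kleene::MatchRef.new("foobar", 3..5)    # => true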
@@ -0,0 +1,308 @@
+ require_relative "./kleene"
+
+ module Kleene
+   class MachineTuple
+     attr_accessor :nfa # : NFA
+     attr_accessor :nfa_with_dead_err # : NFA
+     attr_accessor :dfa # : DFA
+
+     def initialize(nfa, nfa_with_dead_err, dfa)
+       @nfa, @nfa_with_dead_err, @dfa = nfa, nfa_with_dead_err, dfa
+     end
+   end
+
+   class MultiMatchDFA
+     include DSL
+
+     # @original_nfas : Array(NFA)
+     attr_reader :nfas_with_err_state # : Array(NFA)
+     attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
+     attr_accessor :composite_nfa # : NFA
+     attr_accessor :composite_dfa # : DFA
+
+     attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
+     attr_accessor :nfa_to_index # : Hash(NFA, Int32)
+     attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
+     attr_accessor :dfa_to_index # : Hash(DFA, Int32)
+
+     def initialize(nfas)
+       composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }
+
+       @original_nfas = nfas
+       @nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
+       dfas = @original_nfas.map(&:to_dfa)
+
+       @nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
+       @nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
+       @dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
+       @machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h
+
+       # build a mapping of (state -> nfa) pairs that capture which nfa owns each state
+       @dead_end_nfa_state_to_dead_end_nfa = Hash.new
+       @nfas_with_err_state.each do |nfa_with_dead_err|
+         nfa_with_dead_err.states.each do |state|
+           @dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
+         end
+       end
+
+       # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
+       @composite_nfa = create_composite_nfa(@nfas_with_err_state)
+       @composite_dfa = @composite_nfa.to_dfa
+     end
+
+     def machines_from_nfa(nfa) # : MachineTuple
+       machines_by_index[nfa_to_index[nfa]]
+     end
+
+     def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
+       machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
+     end
+
+     def machines_from_dfa(dfa) # : MachineTuple
+       machines_by_index[dfa_to_index[dfa]]
+     end
+
+     # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
+     def create_composite_nfa(nfas)
+       nfa = union!(nfas)
+
+       # add epsilon transitions from all the states except the start state back to the start state
+       nfa.states.each do |state|
+         if state != nfa.start_state
+           nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
+         end
+       end
+
+       nfa.update_final_states
+
+       nfa
+     end
+
+     def match_tracker(input) # : MatchTracker
+       dfa = @composite_dfa.deep_clone
+       match_tracker = setup_callbacks(dfa)
+
+       input.each_char.with_index do |char, index|
+         dfa.handle_token!(char, index)
+       end
+
+       match_tracker
+     end
+
+     def matches(input) # : Hash(NFA, Array(MatchRef))
+       mt = match_tracker(input)
+
+       start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions
+
+       mt.empty_matches.each do |nfa_with_dead_err, indices|
+         original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
+         indices.each do |index|
+           mt.add_match(original_nfa, MatchRef.new(input, index...index))
+         end
+       end
+
+       active_dfas = Array.new # array of [dfa_clone, original_nfa, start_of_match_index] tuples; the index marks the start of the candidate match
+
+       input.each_char.with_index do |char, index|
+         active_dfas.reject! do |active_dfa_tuple|
+           dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple
+
+           dfa_clone.handle_token!(char, index)
+           mt.add_match(original_nfa, MatchRef.new(input, start_of_match_index..index)) if dfa_clone.accept?
+
+           dfa_clone.error?
+         end
+
+         if nfas_with_dead_err = start_index_to_nfas_that_may_match[index]
+           nfas_with_dead_err.each do |nfa_with_dead_err|
+             machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
+             original_nfa = machines.nfa
+             dfa = machines.dfa
+             dfa_clone = dfa.shallow_clone
+
+             dfa_clone.handle_token!(char, index)
+             mt.add_match(original_nfa, MatchRef.new(input, index..index)) if dfa_clone.accept?
+
+             active_dfas << [dfa_clone, original_nfa, index] unless dfa_clone.error?
+           end
+         end
+       end
+
+       mt.matches
+     end
+
+     def setup_callbacks(dfa)
+       match_tracker = MatchTracker.new
+
+       # 1. identify DFA states that correspond to successful match of first character of the NFAs
+       epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
+       nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
+         reject {|transition| transition.epsilon? || transition.to.error? }.
+         map(&:to).to_set
+       dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
+         compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
+         reduce(Set.new) {|memo, state_set| memo | state_set }
+       dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
+       dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
+         dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
+           select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
+           compact_map do |nfa_state|
+             dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
+           end.to_set
+       end
+
+       # 2. identify DFA states that correspond to final states in the NFAs
+       nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
+       dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
+         reduce(Set.new) {|memo, state_set| memo | state_set }
+       dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
+       dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
+         dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
+           select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
+           compact_map do |nfa_state|
+             dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
+           end.to_set
+       end
+
+       # 3. Identify DFA states that correspond to successful match without even having seen any characters.
+       # These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
+       nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
+       dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
+         compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
+         reduce(Set.new) {|memo, state_set| memo | state_set }
+       dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
+       dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
+         dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
+           select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
+           compact_map do |nfa_state|
+             dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
+           end.to_set
+       end
+
+       # set up transition callbacks, since the callbacks may only be defined once per state and transition
+       # For (1):
+       #   Set up transition callbacks to push the index position of the start of a match of each NFA that has begun
+       #   to be matched on the transition to one of the states in (1)
+       # For (2):
+       #   set up transition callbacks to push the index position of the end of a successful match onto the list
+       #   of successful matches for the NFA that matched
+       # For (3):
+       #   set up transition callbacks to capture successful empty matches
+       destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
+       destination_dfa_states_for_callbacks.each do |dfa_state|
+         dfa.on_transition_to(dfa_state) do |transition, token, token_index|
+           destination_dfa_state = transition.to
+
+           should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
+           should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
+           should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)
+
+           if should_track_empty_match
+             dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
+               match_tracker.add_empty_match(nfa_with_dead_end, token_index)
+             end
+           end
+
+           if should_track_start_of_candidate_match
+             nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
+             nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
+             dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
+             dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
+               match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
+             end
+           end
+
+           if should_track_end_of_match
+             dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
+               match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
+             end
+           end
+         end
+       end
+
+       match_tracker
+     end
+
+   end
+
+   class MatchTracker
+     # The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
+     # They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
+     # are the internal state of a MultiMatchDFA
+     attr_accessor :candidate_match_start_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfStartOfMatch)
+     # The end positions are indices at which, after handling the character, the DFA was observed to be in a match/accept state;
+     # however, the interpretation is ambiguous, because the accepting state may be as a result of (1) transitioning to an error state that is also marked final/accepting,
+     # OR it may be as a result of transitioning to (2) a non-error final state.
+     # In the case of (1), the match may be an empty match, where after transitioning to an error state, the DFA is in a state that
+     # is equivalent to the error state and start state and final state (e.g. as in an optional or kleene star DFA),
+     # while in the case of (2), the match may be a "normal" match.
+     # The ambiguity is problematic because it isn't clear whether the index position marks the (inclusive) end of a match
+     # or the beginning of an empty match.
+     # This ambiguity is all due to the construction of the composite machine in the MultiMatchDFA - the dead end error states are epsilon-transitioned
+     # back to the composite NFA's start state, and that structure carries over into the composite DFA.
+     attr_accessor :match_end_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEndOfMatch)
+     attr_accessor :empty_matches # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEmptyMatch)
+
+     # The NFA keys in the following structure are the original NFAs supplied to the MultiMatchDFA.
+     # This is in contrast to the augmented NFAs that are used as keys in the candidate_match_start_positions and
+     # match_end_positions structures, documented above ^^^.
+     attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
+
+     def initialize
+       @candidate_match_start_positions = Hash.new
+       @match_end_positions = Hash.new
+       @empty_matches = Hash.new
+       @matches = Hash.new
+     end
+
+     def start_positions(nfa)
+       candidate_match_start_positions[nfa] ||= Array.new
+     end
+
+     def end_positions(nfa)
+       match_end_positions[nfa] ||= Array.new
+     end
+
+     def empty_match_positions(nfa)
+       empty_matches[nfa] ||= Array.new
+     end
+
+     def matches_for(nfa)
+       matches[nfa] ||= Array.new
+     end
+
+     def add_start_of_candidate_match(nfa_with_dead_end, token_index)
+       # puts "add_start_of_candidate_match(#{nfa.object_id}, #{token_index})"
+       positions = start_positions(nfa_with_dead_end)
+       positions << token_index
+     end
+
+     # the end positions are inclusive of the index of the last character matched, so empty matches are not accounted for in the match_end_positions array
+     def add_end_of_match(nfa_with_dead_end, token_index)
+       # puts "add_end_of_match(#{nfa.object_id}, #{token_index})"
+       positions = end_positions(nfa_with_dead_end)
+       positions << token_index
+     end
+
+     def add_empty_match(nfa_with_dead_end, token_index)
+       positions = empty_match_positions(nfa_with_dead_end)
+       positions << token_index
+     end
+
+     def invert_candidate_match_start_positions # : Hash(Int32, Array(NFA))
+       index_to_nfas = Hash.new
+       candidate_match_start_positions.each do |nfa_with_dead_end, indices|
+         indices.each do |index|
+           nfas = index_to_nfas[index] ||= Array.new
+           nfas << nfa_with_dead_end
+         end
+       end
+       index_to_nfas
+     end
+
+     def add_match(nfa, match)
+       matches = matches_for(nfa)
+       matches << match
+     end
+   end
+ end
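
An end-to-end sketch of how the pieces above are intended to fit together (hypothetical; it assumes the gem's entry point is `require "kleene"`, and it relies on the NFA#to_dfa and DFA#handle_token! machinery referenced above but defined in files not shown in this diff):

    require "kleene"
    include Kleene::DSL

    foo    = literal("foo")
    digits = plus(range("0", "9"))

    mmdfa   = Kleene::MultiMatchDFA.new([foo, digits])
    matches = mmdfa.matches("foo 42 food")  # Hash mapping each original NFA to its Array of MatchRefs
    matches.each do |nfa, match_refs|
      puts "#{nfa.regex_pattern}: #{match_refs.map(&:text).inspect}"
    end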