kleene 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +117 -0
- data/LICENSE +21 -0
- data/README.md +21 -0
- data/Rakefile +8 -0
- data/build.ops +63 -0
- data/kleene.gemspec +39 -0
- data/lib/kleene/dfa.rb +258 -0
- data/lib/kleene/dsl.rb +263 -0
- data/lib/kleene/kleene.rb +88 -0
- data/lib/kleene/multi_match_dfa.rb +308 -0
- data/lib/kleene/nfa.rb +304 -0
- data/lib/kleene/patches.rb +23 -0
- data/lib/kleene/version.rb +3 -0
- data/lib/kleene.rb +17 -0
- metadata +76 -0
data/lib/kleene/dsl.rb
ADDED
@@ -0,0 +1,263 @@
|
|
1
|
+
# Most of the machines constructed here are based on section 2.5 of the Ragel User Guide (http://www.colm.net/files/ragel/ragel-guide-6.10.pdf)

module Kleene
  # Combinator DSL for constructing NFAs.
  # Convention: the bang variants (`with_err!`, `append!`, `union!`) mutate their argument
  # NFAs in place; the non-bang variants deep-clone their arguments first.
  # NOTE(review): `includes?` (used in seq2) is Crystal-style, not core Ruby — presumably
  # monkey-patched onto Set/Array in data/lib/kleene/patches.rb; confirm against that file.
  module DSL
    extend self

    ############### The following methods create FSAs given a stream of input tokens #################

    # given a string with N characters in it:
    # N+1 states: start state and N other states
    # structure: start state -> transition for first character in the string -> state for having observed first character in the string ->
    #            transition for second character in the string -> state for having observed second character in the string ->
    #            ...
    #            transition for last character in the string -> state for having observed last character in the string (marked final)
    #
    # Returns an NFA that matches exactly the string `token_stream`.
    def literal(token_stream, alphabet = DEFAULT_ALPHABET)
      start = current_state = State.new
      nfa = NFA.new(start, alphabet)
      token_stream.each_char do |token|
        next_state = State.new
        nfa.add_transition(token, current_state, next_state)
        current_state = next_state
      end
      # the state reached after the last character is the sole final state
      current_state.final = true
      nfa.update_final_states
      nfa.set_regex_pattern(token_stream)
      nfa
    end

    # two states: start state and final state
    # structure: start state -> transitions for each token in the token collection -> final state
    #
    # Returns an NFA that matches any single token drawn from `token_collection`
    # (i.e. a character class).
    def any(token_collection, alphabet = DEFAULT_ALPHABET)
      start = State.new
      nfa = NFA.new(start, alphabet)
      final = State.new(true)
      token_collection.each {|token| nfa.add_transition(token, start, final) }
      nfa.update_final_states
      regex_pattern = "[#{token_collection.join("")}]"
      nfa.set_regex_pattern(regex_pattern)
      nfa
    end

    # two states: start state and final state
    # structure: start state -> transitions for every token in the alphabet -> final state
    def dot(alphabet = DEFAULT_ALPHABET)
      any(alphabet, alphabet).set_regex_pattern(".")
    end

    # This implements a character class, and is specifically for use with matching strings
    def range(c_begin, c_end, alphabet = DEFAULT_ALPHABET)
      any((c_begin..c_end).to_a, alphabet).set_regex_pattern("[#{c_begin}-#{c_end}]")
    end

    ############### The following methods create FSAs given other FSAs #################

    # always clones the given nfa and returns a new nfa with a non-final error state
    def with_err(nfa, alphabet = nfa.alphabet)
      with_err!(nfa.deep_clone, alphabet)
    end

    # adds an error state to the NFA, creates error transitions from all non-error states to the error state on any unhandled token.
    # the error state transitions to itself on any token.
    # No-op if the NFA already contains an error state.
    def with_err!(nfa, alphabet = nfa.alphabet)
      error_state = nfa.states.find(&:error?)
      return nfa if error_state

      error_state = State.new_error_state
      nfa.add_state(error_state)

      # NOTE: error_state is already in nfa.states here, so this loop also gives the
      # error state transitions to itself on every token in the alphabet.
      nfa.states.each do |state|
        tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token)
        missing_tokens = alphabet - tokens_on_outbound_transitions
        missing_tokens.each do |token|
          nfa.add_transition(token, state, error_state)
        end
      end

      # remove the error state if it ended up with no inbound or outbound transitions
      nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }

      nfa.set_regex_pattern("/#{nfa.regex_pattern}/E")
    end

    # always clones the given nfa and returns a new nfa with a non-final error state
    def with_err_dead_end(nfa, alphabet = nfa.alphabet)
      with_err_dead_end!(nfa.deep_clone, alphabet)
    end

    # adds an error state to the NFA, creates error transitions from all non-error states to the error state on any unhandled token.
    # the error state doesn't have any outbound transitions (a "dead end").
    # No-op if the NFA already contains an error state.
    def with_err_dead_end!(nfa, alphabet = nfa.alphabet)
      error_state = nfa.states.find(&:error?)
      return nfa if error_state

      error_state = State.new_error_state
      nfa.add_state(error_state)

      nfa.states.each do |state|
        unless state.error?
          tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token).to_set
          # states whose only outbound edge is an epsilon transition are skipped:
          # they defer to their epsilon-successors for token handling
          only_outbound_transition_is_epsilon_transition = tokens_on_outbound_transitions.size == 1 && tokens_on_outbound_transitions.first == NFATransition::Epsilon
          unless only_outbound_transition_is_epsilon_transition
            missing_tokens = (alphabet - Set[NFATransition::Epsilon]) - tokens_on_outbound_transitions
            missing_tokens.each do |token|
              nfa.add_transition(token, state, error_state)
            end
          end
        end
      end

      # remove the error state if it has no inbound or outbound transitions
      nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }

      nfa.set_regex_pattern("/#{nfa.regex_pattern}/DE")
    end

    # Append b onto a
    # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
    # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
    def append(a, b)
      a = a.deep_clone
      b = b.deep_clone
      append!(a, b)
    end

    # Destructively append b onto a
    # Appending produces a machine that matches all the strings in machine a followed by all the strings in machine b.
    # This differs from `seq` in that the composite machine's final states are the union of machine a's final states and machine b's final states.
    def append!(a, b)
      a.alphabet = a.alphabet | b.alphabet

      # add an epsilon transition from each final state of machine a to the start state of machine b.
      a.final_states.each do |final_state|
        a.add_transition(NFATransition::Epsilon, final_state, b.start_state)
      end

      # add all of machine b's transitions to machine a
      b.all_transitions.each {|transition| a.add_transition(transition.token, transition.from, transition.to) }
      # a.final_states = a.final_states | b.final_states
      a.update_final_states

      a
    end

    # Concatenate any number of NFAs left-to-right: seq(a, b, c) == seq2(seq2(a, b), c).
    def seq(*nfas)
      nfas.flatten.reduce {|memo_nfa, nfa| seq2(memo_nfa, nfa) }
    end

    # Implements concatenation, as defined in the Ragel manual in section 2.5.5 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # Seq produces a machine that matches all the strings in machine `a` followed by all the strings in machine `b`.
    # Seq draws epsilon transitions from the final states of the first machine to the start state of the second machine.
    # The final states of the first machine lose their final state status, unless the start state of the second machine is final as well.
    def seq2(a, b)
      a = a.deep_clone
      b = b.deep_clone

      a = append!(a, b)

      # make sure that b's final states are the only final states in a after we have appended b onto a
      # NOTE(review): `includes?` is presumably defined in patches.rb — confirm
      a.states.each {|state| state.final = b.final_states.includes?(state) }
      a.update_final_states

      a.set_regex_pattern("#{a.regex_pattern}#{b.regex_pattern}")
    end

    # Build a new machine consisting of a new start state with epsilon transitions to the start state of all the given NFAs in `nfas`.
    # The resulting machine's final states are the set of final states from all the NFAs in `nfas`.
    #
    # Implements Union, as defined in the Ragel manual in section 2.5.1 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # The union operation produces a machine that matches any string in machine one or machine two.
    # The operation first creates a new start state.
    # Epsilon transitions are drawn from the new start state to the start states of both input machines.
    # The resulting machine has a final state set equivalent to the union of the final state sets of both input machines.
    def union(*nfas)
      nfas.flatten!
      nfas = nfas.map(&:deep_clone)
      union!(nfas)
    end

    # same as union, but doesn't deep clone the constituent nfas
    def union!(nfas)
      start = State.new
      composite_alphabet = nfas.map(&:alphabet).reduce {|memo, alphabet| memo | alphabet }
      new_nfa = NFA.new(start, composite_alphabet)

      # add epsilon transitions from the start state of the new machine to the start state of each constituent machine
      nfas.each do |nfa|
        new_nfa.add_states(nfa.states)
        new_nfa.add_transition(NFATransition::Epsilon, start, nfa.start_state)
        nfa.all_transitions.each {|t| new_nfa.add_transition(t.token, t.from, t.to) }
      end

      new_nfa.update_final_states

      new_nfa.set_regex_pattern("#{nfas.map(&:regex_pattern).join("|")}")
    end

    # Implements Kleene Star, as defined in the Ragel manual in section 2.5.6 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
    # The machine resulting from the Kleene Star operator will match zero or more repetitions of the machine it is applied to.
    # It creates a new start state and an additional final state.
    # Epsilon transitions are drawn between the new start state and the old start state,
    # between the new start state and the new final state, and between the final states of the machine and the new start state.
    def kleene(machine)
      machine = machine.deep_clone
      start = State.new
      final = State.new(true)

      nfa = NFA.new(start, machine.alphabet)
      nfa.add_states(machine.states)
      nfa.add_transition(NFATransition::Epsilon, start, final)
      nfa.add_transition(NFATransition::Epsilon, start, machine.start_state)
      machine.final_states.each do |final_state|
        nfa.add_transition(NFATransition::Epsilon, final_state, start)
        # the old final states lose their final status; only the new `final` state remains final
        final_state.final = false
      end

      # add all of machine's transitions to the new machine
      (machine.all_transitions).each {|t| nfa.add_transition(t.token, t.from, t.to) }
      nfa.update_final_states

      nfa.set_regex_pattern("#{machine.regex_pattern}*")
    end

    # One-or-more repetitions: machine followed by kleene(machine).
    def plus(machine)
      seq(machine, kleene(machine)).set_regex_pattern("#{machine.regex_pattern}+")
    end

    # Zero-or-one occurrence: union of machine with an empty (immediately-accepting) machine.
    def optional(machine)
      empty = NFA.new(State.new(true), machine.alphabet).set_regex_pattern("")
      union(machine, empty).set_regex_pattern("#{machine.regex_pattern}?")
    end

    # def repeat(machine, min, max = nil)
    #   max ||= min
    #   m = NFA.new(State.new(true), machine.alphabet)
    #   min.times { m = seq(m, machine) }
    #   (max - min).times { m = append(m, machine) }
    #   if min != max
    #     m.set_regex_pattern("#{machine.regex_pattern}{#{min},#{max}}")
    #   else
    #     m.set_regex_pattern("#{machine.regex_pattern}{#{min}}")
    #   end
    # end

    # def negate(machine)
    #   machine = machine.to_dfa
    #
    #   # invert the final flag of every state
    #   machine.states.each {|state| state.final = !state.final? }
    #   machine.update_final_states
    #
    #   machine.to_nfa.set_regex_pattern("(!#{machine.regex_pattern})")
    # end

    # # a - b == a && !b
    # def difference(a, b)
    #   intersection(a, negate(b))
    # end

    # # By De Morgan's Law: !(!a || !b) = a && b
    # def intersection(a, b)
    #   negate(union(negate(a), negate(b)))
    # end
  end
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# this is a port and extension of https://github.com/davidkellis/kleene/
|
2
|
+
|
3
|
+
require_relative "./dsl"
|
4
|
+
require_relative "./nfa"
|
5
|
+
require_relative "./dfa"
|
6
|
+
|
7
|
+
module Kleene
  # The default alphabet consists of the following:
  # Set{' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
  #     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
  #     ':', ';', '<', '=', '>', '?', '@',
  #     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
  #     'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
  #     '[', '\\', ']', '^', '_', '`',
  #     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  #     'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  #     '{', '|', '}', '~', "\n", "\t"}
  # i.e. all printable ASCII characters plus newline and tab.
  DEFAULT_ALPHABET = ((' '..'~').to_a + "\n\t".chars).to_set

  # A single state in an NFA/DFA. Each state carries a process-wide unique,
  # monotonically increasing integer id, plus `final` and `error` flags.
  class State
    # Class-level id counter. A class instance variable is used instead of a
    # @@class variable so the counter is not shared across any future subclasses.
    @next_id = 0

    class << self
      # Returns the next unique state id (1, 2, 3, ...).
      def next_id
        @next_id += 1
      end
    end

    # Convenience constructor for an error state (optionally also final).
    def self.new_error_state(final = false)
      State.new(final, true)
    end

    attr_reader :id # : Int32
    attr_accessor :final # : Bool
    attr_accessor :error # : Bool

    # @param final [Boolean] whether this state is an accepting state
    # @param error [Boolean] whether this state is an error (sink) state
    # @param id [Integer, nil] explicit id; a fresh unique id is assigned when nil
    def initialize(final = false, error = false, id = nil)
      @final = final
      @error = error
      @id = id || State.next_id
    end

    # is this an error state?
    def error?
      @error
    end

    # is this a final state?
    def final?
      @final
    end

    # Copy with the same flags but a fresh unique id (so the copy is a distinct state).
    def dup
      State.new(@final, @error, nil)
    end

    def to_s
      "State{id: #{id}, final: #{final}, error: #{error}}"
    end
  end

  # A reference to a matched substring of an input string: the original string
  # plus the index range of the match. Value-equality is defined over both fields.
  class MatchRef
    attr_accessor :string # : String
    attr_accessor :range # : Range(Int32, Int32)

    def initialize(original_string, match_range)
      @string = original_string
      @range = match_range
    end

    # The matched text, i.e. the slice of the original string covered by the range.
    def text
      @string[@range]
    end

    def to_s
      text
    end

    def ==(other)
      @string == other.string &&
        @range == other.range
    end

    def eql?(other)
      self == other
    end

    # Paired with #eql? so that equal MatchRefs collide as Hash keys / Set members.
    # (Defining eql? without hash breaks Hash semantics.)
    def hash
      [@string, @range].hash
    end
  end
end
|
@@ -0,0 +1,308 @@
|
|
1
|
+
require_relative "./kleene"

module Kleene
  # Groups the three machine variants MultiMatchDFA keeps per input NFA:
  # the original NFA, its dead-end-error-state augmented copy, and its DFA conversion.
  class MachineTuple
    attr_accessor :nfa # : NFA
    attr_accessor :nfa_with_dead_err # : NFA
    attr_accessor :dfa # : DFA

    def initialize(nfa, nfa_with_dead_err, dfa)
      @nfa, @nfa_with_dead_err, @dfa = nfa, nfa_with_dead_err, dfa
    end
  end

  # Runs many NFAs against an input string in one pass by building a single
  # composite DFA over all of them, then spawning per-candidate DFA clones to
  # confirm matches.
  # NOTE(review): `compact_map` and `includes?` are not core Ruby — presumably
  # monkey-patched in patches.rb; confirm against that file.
  class MultiMatchDFA
    include DSL

    # @original_nfas : Array(NFA)
    attr_reader :nfas_with_err_state # : Array(NFA)
    attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
    attr_accessor :composite_nfa # : NFA
    attr_accessor :composite_dfa # : DFA

    attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
    attr_accessor :nfa_to_index # : Hash(NFA, Int32)
    attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
    attr_accessor :dfa_to_index # : Hash(DFA, Int32)

    def initialize(nfas)
      # union of every constituent NFA's alphabet
      composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }

      @original_nfas = nfas
      @nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
      dfas = @original_nfas.map(&:to_dfa)

      # index lookup tables so any of the three machine variants can be mapped back
      # to its MachineTuple (see machines_from_* below)
      @nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
      @nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
      @dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
      @machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h

      # build a mapping of (state -> nfa) pairs that capture which nfa owns each state
      @dead_end_nfa_state_to_dead_end_nfa = Hash.new
      @nfas_with_err_state.each do |nfa_with_dead_err|
        nfa_with_dead_err.states.each do |state|
          @dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
        end
      end

      # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
      @composite_nfa = create_composite_nfa(@nfas_with_err_state)
      @composite_dfa = @composite_nfa.to_dfa
    end

    # Look up the MachineTuple by the original NFA.
    def machines_from_nfa(nfa) # : MachineTuple
      machines_by_index[nfa_to_index[nfa]]
    end

    # Look up the MachineTuple by the dead-end-augmented NFA.
    def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
      machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
    end

    # Look up the MachineTuple by the converted DFA.
    def machines_from_dfa(dfa) # : MachineTuple
      machines_by_index[dfa_to_index[dfa]]
    end

    # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
    def create_composite_nfa(nfas)
      nfa = union!(nfas)

      # add epsilon transitions from all the states except the start state back to the start state
      nfa.states.each do |state|
        if state != nfa.start_state
          nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
        end
      end

      nfa.update_final_states

      nfa
    end

    # First pass: run the composite DFA over `input` once; the callbacks installed by
    # setup_callbacks record candidate match starts, match ends, and empty matches.
    def match_tracker(input) # : MatchTracker
      dfa = @composite_dfa.deep_clone
      match_tracker = setup_callbacks(dfa)

      input.each_char.with_index do |char, index|
        dfa.handle_token!(char, index)
      end

      match_tracker
    end

    # Second pass: for every candidate start position found in the first pass, run a
    # per-NFA DFA clone forward to confirm and record actual matches.
    # Returns a Hash mapping each original NFA to its MatchRefs.
    def matches(input) # : Hash(NFA, Array(MatchRef))
      mt = match_tracker(input)

      start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions

      # record zero-width matches up front (index...index is an empty range)
      mt.empty_matches.each do |nfa_with_dead_err, indices|
        original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
        indices.each do |index|
          mt.add_match(original_nfa, MatchRef.new(input, index...index))
        end
      end

      active_dfas = Array.new # the Int32 represents the start of match

      input.each_char.with_index do |char, index|
        # advance every in-flight DFA clone; record a match on accept, drop it on error
        active_dfas.reject! do |active_dfa_tuple|
          dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple

          dfa_clone.handle_token!(char, index)
          mt.add_match(original_nfa, MatchRef.new(input, start_of_match_index..index)) if dfa_clone.accept?

          # reject! drops the tuple when the clone has errored out
          dfa_clone.error?
        end

        # start new DFA clones for every NFA whose candidate match begins at this index
        if nfas_with_dead_err = start_index_to_nfas_that_may_match[index]
          nfas_with_dead_err.each do |nfa_with_dead_err|
            machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
            original_nfa = machines.nfa
            dfa = machines.dfa
            dfa_clone = dfa.shallow_clone

            dfa_clone.handle_token!(char, index)
            mt.add_match(original_nfa, MatchRef.new(input, index..index)) if dfa_clone.accept?

            active_dfas << [dfa_clone, original_nfa, index] unless dfa_clone.error?
          end
        end
      end

      mt.matches
    end

    # Installs transition callbacks on `dfa` (a clone of the composite DFA) that feed a
    # fresh MatchTracker. Returns the MatchTracker.
    def setup_callbacks(dfa)
      match_tracker = MatchTracker.new

      # 1. identify DFA states that correspond to successful match of first character of the NFAs
      epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
      nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
                                                                                             reject {|transition| transition.epsilon? || transition.to.error? }.
                                                                                             map(&:to).to_set
      dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
                                                                                             compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
                                                                                             reduce(Set.new) {|memo, state_set| memo | state_set }
      dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
      dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
        dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
          select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
          compact_map do |nfa_state|
            dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
          end.to_set
      end

      # 2. identify DFA states that correspond to final states in the NFAs
      nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
      dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
                                                         reduce(Set.new) {|memo, state_set| memo | state_set }
      dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
      dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
        dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
          select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
          compact_map do |nfa_state|
            dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
          end.to_set
      end

      # 3. Identify DFA states that correspond to successful match without even having seen any characters.
      # These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
      nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
      dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
                                                                       compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
                                                                       reduce(Set.new) {|memo, state_set| memo | state_set }
      dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
      dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
        dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
          select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
          compact_map do |nfa_state|
            dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
          end.to_set
      end

      # set up transition callbacks, since the callbacks may only be defined once per state and transition
      # For (1):
      # Set up transition callbacks to push the index position of the start of a match of each NFA that has begun
      # to be matched on the transition to one of the states in (1)
      # For (2):
      # set up transition callbacks to push the index position of the end of a successful match onto the list
      # of successful matches for the NFA that matched
      # For (3):
      # set up transition callbacks to capture successful empty matches
      destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
      destination_dfa_states_for_callbacks.each do |dfa_state|
        dfa.on_transition_to(dfa_state) do |transition, token, token_index|
          destination_dfa_state = transition.to

          should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
          should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
          should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)

          if should_track_empty_match
            dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
              match_tracker.add_empty_match(nfa_with_dead_end, token_index)
            end
          end

          if should_track_start_of_candidate_match
            nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
            nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
            dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
            dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
              match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
            end
          end

          if should_track_end_of_match
            dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
              match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
            end
          end
        end
      end

      match_tracker
    end

  end

  # Accumulates the observations made while scanning an input string:
  # candidate match starts, match ends, empty matches, and confirmed matches.
  class MatchTracker
    # The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
    # They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
    # are the internal state of a MultiMatchDFA
    attr_accessor :candidate_match_start_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfStartOfMatch)
    # The end positions are indices at which, after handling the character, the DFA was observed to be in a match/accept state;
    # however, the interpretation is ambiguous, because the accepting state may be as a result of (1) transitioning to an error state that is also marked final/accepting,
    # OR it may be as a result of transitioning to (2) a non-error final state.
    # In the case of (1), the match may be an empty match, where after transitioning to an error state, the DFA is in a state that
    # is equivalent to the error state and start state and final state (e.g. as in an optional or kleene star DFA),
    # while in the case of (2), the match may be a "normal" match.
    # The ambiguity is problematic because it isn't clear whether the index position of the match is end inclusive end of a match
    # or the beginning of an empty match.
    # This ambiguity is all due to the construction of the composite DFA in the MultiMatchDFA - the dead end error states are epsilon-transitioned
    # to the composite DFA's start state.
    attr_accessor :match_end_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEndOfMatch)
    attr_accessor :empty_matches # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEmptyMatch)

    # The NFA keys in the following structure are the original NFAs supplied to the MultiMatchDFA.
    # This is in contrast to the augmented NFAs that are used as keys in the candidate_match_start_positions and
    # match_end_positions structures, documented above ^^^.
    attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)

    def initialize
      @candidate_match_start_positions = Hash.new
      @match_end_positions = Hash.new
      @empty_matches = Hash.new
      @matches = Hash.new
    end

    # Lazily-initialized per-NFA accumulator arrays:

    def start_positions(nfa)
      candidate_match_start_positions[nfa] ||= Array.new
    end

    def end_positions(nfa)
      match_end_positions[nfa] ||= Array.new
    end

    def empty_match_positions(nfa)
      empty_matches[nfa] ||= Array.new
    end

    def matches_for(nfa)
      matches[nfa] ||= Array.new
    end

    def add_start_of_candidate_match(nfa_with_dead_end, token_index)
      # puts "add_start_of_candidate_match(#{nfa.object_id}, #{token_index})"
      positions = start_positions(nfa_with_dead_end)
      positions << token_index
    end

    # the end positions are inclusive of the index of the last character matched, so empty matches are not accounted for in the match_end_positions array
    def add_end_of_match(nfa_with_dead_end, token_index)
      # puts "add_end_of_match(#{nfa.object_id}, #{token_index})"
      positions = end_positions(nfa_with_dead_end)
      positions << token_index
    end

    def add_empty_match(nfa_with_dead_end, token_index)
      positions = empty_match_positions(nfa_with_dead_end)
      positions << token_index
    end

    # Invert candidate_match_start_positions: index -> list of NFAs whose candidate match starts there.
    def invert_candidate_match_start_positions # : Hash(Int32, Array(NFA))
      index_to_nfas = Hash.new
      candidate_match_start_positions.each do |nfa_with_dead_end, indices|
        indices.each do |index|
          nfas = index_to_nfas[index] ||= Array.new
          nfas << nfa_with_dead_end
        end
      end
      index_to_nfas
    end

    # Record a confirmed match for an ORIGINAL (un-augmented) NFA.
    def add_match(nfa, match)
      matches = matches_for(nfa)
      matches << match
    end
  end
end
|