kleene 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +117 -0
- data/LICENSE +21 -0
- data/README.md +21 -0
- data/Rakefile +8 -0
- data/build.ops +63 -0
- data/kleene.gemspec +39 -0
- data/lib/kleene/dfa.rb +258 -0
- data/lib/kleene/dsl.rb +263 -0
- data/lib/kleene/kleene.rb +88 -0
- data/lib/kleene/multi_match_dfa.rb +308 -0
- data/lib/kleene/nfa.rb +304 -0
- data/lib/kleene/patches.rb +23 -0
- data/lib/kleene/version.rb +3 -0
- data/lib/kleene.rb +17 -0
- metadata +76 -0
data/lib/kleene/dsl.rb
ADDED
@@ -0,0 +1,263 @@
|
|
1
|
+
# Most of the machines constructed here are based on section 2.5 of the Ragel User Guide (http://www.colm.net/files/ragel/ragel-guide-6.10.pdf)
|
2
|
+
|
3
|
+
module Kleene
|
4
|
+
module DSL
|
5
|
+
extend self
|
6
|
+
|
7
|
+
############### The following methods create FSAs given a stream of input tokens #################
|
8
|
+
|
9
|
+
# Builds an NFA that matches exactly the string `token_stream`.
# For a string of N characters the machine has N+1 states: the start state plus
# one state per observed character, chained by one transition per character;
# the state reached after the final character is marked final.
def literal(token_stream, alphabet = DEFAULT_ALPHABET)
  start_state = State.new
  machine = NFA.new(start_state, alphabet)
  last_state = token_stream.each_char.reduce(start_state) do |state, char|
    successor = State.new
    machine.add_transition(char, state, successor)
    successor
  end
  last_state.final = true
  machine.update_final_states
  machine.set_regex_pattern(token_stream)
  machine
end
|
28
|
+
|
29
|
+
# Builds an NFA that matches any single token drawn from `token_collection`.
# Two states: a start state and a final state, joined by one transition per token.
def any(token_collection, alphabet = DEFAULT_ALPHABET)
  start_state = State.new
  machine = NFA.new(start_state, alphabet)
  accept_state = State.new(true)
  token_collection.each do |token|
    machine.add_transition(token, start_state, accept_state)
  end
  machine.update_final_states
  machine.set_regex_pattern("[#{token_collection.join("")}]")
  machine
end
|
41
|
+
|
42
|
+
# Builds an NFA that matches any single token in `alphabet` (regex `.`).
# Two states: start and final, with one transition per alphabet token.
def dot(alphabet = DEFAULT_ALPHABET)
  any(alphabet, alphabet).set_regex_pattern(".")
end
|
47
|
+
|
48
|
+
# Character-class machine: matches any single character in `c_begin..c_end`.
# Intended specifically for string matching.
def range(c_begin, c_end, alphabet = DEFAULT_ALPHABET)
  tokens = (c_begin..c_end).to_a
  any(tokens, alphabet).set_regex_pattern("[#{c_begin}-#{c_end}]")
end
|
52
|
+
|
53
|
+
############### The following methods create FSAs given other FSAs #################
|
54
|
+
|
55
|
+
# Non-destructive variant of `with_err!`: deep-clones `nfa`, then augments the
# clone with a non-final error state.
def with_err(nfa, alphabet = nfa.alphabet)
  copy = nfa.deep_clone
  with_err!(copy, alphabet)
end
|
59
|
+
|
60
|
+
# Destructively adds an error state to the NFA, creating error transitions from all
# non-error states to the error state on any unhandled token.
# The error state transitions to itself on any token.
# Returns the mutated NFA (via `set_regex_pattern`), with its pattern wrapped as /.../E.
def with_err!(nfa, alphabet = nfa.alphabet)
  # idempotent: if an error state already exists, the machine was already augmented
  error_state = nfa.states.find(&:error?)
  return nfa if error_state

  error_state = State.new_error_state
  nfa.add_state(error_state)

  # for every state (including the new error state, which gains self-loops on the
  # whole alphabet), route every token that has no outbound transition to the error state
  nfa.states.each do |state|
    tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token)
    missing_tokens = alphabet - tokens_on_outbound_transitions
    missing_tokens.each do |token|
      nfa.add_transition(token, state, error_state)
    end
  end

  # NOTE(review): since the loop above also gives the error state self-transitions,
  # this removal guard appears unreachable for any non-empty alphabet — verify intent.
  nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }

  nfa.set_regex_pattern("/#{nfa.regex_pattern}/E")
end
|
81
|
+
|
82
|
+
# Non-destructive variant of `with_err_dead_end!`: deep-clones `nfa`, then augments
# the clone with a dead-end (no outbound transitions) error state.
def with_err_dead_end(nfa, alphabet = nfa.alphabet)
  copy = nfa.deep_clone
  with_err_dead_end!(copy, alphabet)
end
|
86
|
+
|
87
|
+
# Destructively adds an error state to the NFA, creating error transitions from all
# non-error states to the error state on any unhandled token.
# Unlike `with_err!`, the error state is a dead end: it has no outbound transitions.
# Returns the mutated NFA with its pattern wrapped as /.../DE.
def with_err_dead_end!(nfa, alphabet = nfa.alphabet)
  # idempotent: if an error state already exists, the machine was already augmented
  error_state = nfa.states.find(&:error?)
  return nfa if error_state

  error_state = State.new_error_state
  nfa.add_state(error_state)

  nfa.states.each do |state|
    unless state.error?
      tokens_on_outbound_transitions = nfa.transitions_from(state).map(&:token).to_set
      # states whose only exit is an epsilon transition are pass-through states;
      # they are left alone so the epsilon path still decides where the machine goes
      only_outbound_transition_is_epsilon_transition = tokens_on_outbound_transitions.size == 1 && tokens_on_outbound_transitions.first == NFATransition::Epsilon
      unless only_outbound_transition_is_epsilon_transition
        # epsilon is excluded: only real (token-consuming) gaps route to the error state
        missing_tokens = (alphabet - Set[NFATransition::Epsilon]) - tokens_on_outbound_transitions
        missing_tokens.each do |token|
          nfa.add_transition(token, state, error_state)
        end
      end
    end
  end

  # remove the error state if it has no inbound or outbound transitions
  nfa.remove_state(error_state) if nfa.all_transitions.none? {|transition| transition.from == error_state || transition.to == error_state }

  nfa.set_regex_pattern("/#{nfa.regex_pattern}/DE")
end
|
114
|
+
|
115
|
+
# Non-destructive append of machine `b` onto machine `a`.
# The result matches all strings of `a` followed by all strings of `b`.
# Differs from `seq` in that the composite machine's final states are the union of
# `a`'s final states and `b`'s final states.
def append(a, b)
  append!(a.deep_clone, b.deep_clone)
end
|
123
|
+
|
124
|
+
# Destructively appends machine `b` onto machine `a`.
# The result matches all strings of `a` followed by all strings of `b`.
# Differs from `seq` in that the composite machine's final states are the union of
# `a`'s final states and `b`'s final states. Returns the mutated `a`.
def append!(a, b)
  a.alphabet = a.alphabet | b.alphabet

  # epsilon-connect each of a's final states to b's start state
  a.final_states.each do |final_state|
    a.add_transition(NFATransition::Epsilon, final_state, b.start_state)
  end

  # graft all of b's transitions (and thereby b's states) into a
  b.all_transitions.each do |transition|
    a.add_transition(transition.token, transition.from, transition.to)
  end
  a.update_final_states

  a
end
|
142
|
+
|
143
|
+
# Concatenates any number of NFAs (given variadically or as arrays) left to right.
def seq(*nfas)
  machines = nfas.flatten
  machines.reduce {|acc, machine| seq2(acc, machine) }
end
|
146
|
+
|
147
|
+
# Implements concatenation, as defined in the Ragel manual in section 2.5.5 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
# Seq produces a machine that matches all the strings in machine `a` followed by all the strings in machine `b`.
# Seq draws epsilon transitions from the final states of the first machine to the start state of the second machine.
# The final states of the first machine lose their final state status, unless the start state of the second machine is final as well.
# Non-destructive: both inputs are deep-cloned.
def seq2(a, b)
  a = a.deep_clone
  b = b.deep_clone

  a = append!(a, b)

  # make sure that b's final states are the only final states in a after we have appended b onto a
  # NOTE(review): `includes?` is Crystal-style; presumably aliased onto Ruby collections
  # by patches.rb — verify against that file
  a.states.each {|state| state.final = b.final_states.includes?(state) }
  a.update_final_states

  a.set_regex_pattern("#{a.regex_pattern}#{b.regex_pattern}")
end
|
163
|
+
|
164
|
+
# Build a new machine consisting of a new start state with epsilon transitions to the
# start state of every given NFA; the result's final states are the union of the
# inputs' final state sets (Ragel manual, section 2.5.1 of
# http://www.colm.net/files/ragel/ragel-guide-6.10.pdf). The resulting machine matches
# any string matched by any of the input machines.
# Non-destructive: every constituent NFA is deep-cloned first.
def union(*nfas)
  machines = nfas.flatten
  clones = machines.map(&:deep_clone)
  union!(clones)
end
|
177
|
+
|
178
|
+
# Same as `union`, but operates on the given NFAs directly (no deep cloning).
def union!(nfas)
  new_start = State.new
  merged_alphabet = nfas.map(&:alphabet).reduce {|acc, alphabet| acc | alphabet }
  combined = NFA.new(new_start, merged_alphabet)

  nfas.each do |machine|
    combined.add_states(machine.states)
    # epsilon-connect the fresh start state to each constituent machine's start state
    combined.add_transition(NFATransition::Epsilon, new_start, machine.start_state)
    machine.all_transitions.each do |transition|
      combined.add_transition(transition.token, transition.from, transition.to)
    end
  end

  combined.update_final_states

  combined.set_regex_pattern(nfas.map(&:regex_pattern).join("|"))
end
|
195
|
+
|
196
|
+
# Implements Kleene Star, as defined in the Ragel manual in section 2.5.6 of http://www.colm.net/files/ragel/ragel-guide-6.10.pdf:
# The machine resulting from the Kleene Star operator will match zero or more repetitions of the machine it is applied to.
# It creates a new start state and an additional final state.
# Epsilon transitions are drawn between the new start state and the old start state,
# between the new start state and the new final state, and between the final states of the machine and the new start state.
# Non-destructive: `machine` is deep-cloned.
def kleene(machine)
  machine = machine.deep_clone
  start = State.new
  final = State.new(true)

  nfa = NFA.new(start, machine.alphabet)
  nfa.add_states(machine.states)
  # zero-repetition path: new start straight to new final
  nfa.add_transition(NFATransition::Epsilon, start, final)
  # enter the cloned machine
  nfa.add_transition(NFATransition::Epsilon, start, machine.start_state)
  # loop back after each repetition; old final states lose their final status
  machine.final_states.each do |final_state|
    nfa.add_transition(NFATransition::Epsilon, final_state, start)
    final_state.final = false
  end

  # add all of machine's transitions to the new machine
  (machine.all_transitions).each {|t| nfa.add_transition(t.token, t.from, t.to) }
  nfa.update_final_states

  nfa.set_regex_pattern("#{machine.regex_pattern}*")
end
|
221
|
+
|
222
|
+
# One-or-more repetitions: machine followed by its Kleene closure (regex `+`).
def plus(machine)
  one_then_many = seq(machine, kleene(machine))
  one_then_many.set_regex_pattern("#{machine.regex_pattern}+")
end
|
225
|
+
|
226
|
+
# Zero-or-one occurrence: the union of `machine` with a machine that accepts
# only the empty string (regex `?`).
def optional(machine)
  empty_machine = NFA.new(State.new(true), machine.alphabet).set_regex_pattern("")
  union(machine, empty_machine).set_regex_pattern("#{machine.regex_pattern}?")
end
|
230
|
+
|
231
|
+
# def repeat(machine, min, max = nil)
|
232
|
+
# max ||= min
|
233
|
+
# m = NFA.new(State.new(true), machine.alphabet)
|
234
|
+
# min.times { m = seq(m, machine) }
|
235
|
+
# (max - min).times { m = append(m, machine) }
|
236
|
+
# if min != max
|
237
|
+
# m.set_regex_pattern("#{machine.regex_pattern}{#{min},#{max}}")
|
238
|
+
# else
|
239
|
+
# m.set_regex_pattern("#{machine.regex_pattern}{#{min}}")
|
240
|
+
# end
|
241
|
+
# end
|
242
|
+
|
243
|
+
# def negate(machine)
|
244
|
+
# machine = machine.to_dfa
|
245
|
+
|
246
|
+
# # invert the final flag of every state
|
247
|
+
# machine.states.each {|state| state.final = !state.final? }
|
248
|
+
# machine.update_final_states
|
249
|
+
|
250
|
+
# machine.to_nfa.set_regex_pattern("(!#{machine.regex_pattern})")
|
251
|
+
# end
|
252
|
+
|
253
|
+
# # a - b == a && !b
|
254
|
+
# def difference(a, b)
|
255
|
+
# intersection(a, negate(b))
|
256
|
+
# end
|
257
|
+
|
258
|
+
# # By De Morgan's Law: !(!a || !b) = a && b
|
259
|
+
# def intersection(a, b)
|
260
|
+
# negate(union(negate(a), negate(b)))
|
261
|
+
# end
|
262
|
+
end
|
263
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
# this is a port and extension of https://github.com/davidkellis/kleene/
|
2
|
+
|
3
|
+
require_relative "./dsl"
|
4
|
+
require_relative "./nfa"
|
5
|
+
require_relative "./dfa"
|
6
|
+
|
7
|
+
module Kleene
|
8
|
+
# The default alphabet consists of the following:
|
9
|
+
# Set{' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
|
10
|
+
# '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
|
11
|
+
# ':', ';', '<', '=', '>', '?', '@',
|
12
|
+
# 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
|
13
|
+
# 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
|
14
|
+
# '[', '\\', ']', '^', '_', '`',
|
15
|
+
# 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
|
16
|
+
# 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
|
17
|
+
# '{', '|', '}', '~', "\n", "\t"}
|
18
|
+
DEFAULT_ALPHABET = ((' '..'~').to_a + "\n\t".chars).to_set
|
19
|
+
|
20
|
+
# Automaton state: carries a process-unique numeric id plus final/error flags.
class State
  @@next_id = 0

  # Hands out the next unused state id (ids start at 1).
  def self.next_id
    @@next_id += 1
  end

  # Convenience constructor for an error (dead-end) state; `final` may also be set.
  def self.new_error_state(final = false)
    State.new(final, true)
  end

  attr_reader :id
  attr_accessor :final, :error

  def initialize(final = false, error = false, id = nil)
    @final = final
    @error = error
    @id = id || State.next_id
  end

  # true when this state is an error state
  def error?
    @error
  end

  # true when this state is an accepting (final) state
  def final?
    @final
  end

  # Copies the flags but intentionally mints a brand-new id.
  def dup
    State.new(@final, @error, nil)
  end

  def to_s
    "State{id: #{id}, final: #{final}, error: #{error}}"
  end
end
|
60
|
+
|
61
|
+
# A reference to a matched substring: the full searched string plus the index
# range that the match covers.
class MatchRef
  attr_accessor :string # the full string that was searched
  attr_accessor :range  # index range (into `string`) covered by the match

  def initialize(original_string, match_range)
    @string = original_string
    @range = match_range
  end

  # The matched text itself (slice of `string` at `range`).
  def text
    @string[@range]
  end

  def to_s
    text
  end

  # Two MatchRefs are equal when they reference the same range of an equal string.
  def ==(other)
    @string == other.string &&
      @range == other.range
  end

  def eql?(other)
    self == other
  end

  # Fix: `eql?` was defined without a matching `hash`, so equal MatchRefs hashed
  # to different buckets and misbehaved as Hash/Set keys. Pair them up.
  def hash
    [@string, @range].hash
  end
end
|
87
|
+
|
88
|
+
end
|
@@ -0,0 +1,308 @@
|
|
1
|
+
require_relative "./kleene"
|
2
|
+
|
3
|
+
module Kleene
|
4
|
+
# Groups the three machine representations derived from a single pattern:
# the original NFA, the same NFA augmented with a dead-end error state,
# and the DFA compiled from the original NFA.
class MachineTuple
  attr_accessor :nfa
  attr_accessor :nfa_with_dead_err
  attr_accessor :dfa

  def initialize(nfa, nfa_with_dead_err, dfa)
    @nfa = nfa
    @nfa_with_dead_err = nfa_with_dead_err
    @dfa = dfa
  end
end
|
13
|
+
|
14
|
+
# Runs many NFAs against one input in a single pass: the constituent NFAs (each
# augmented with a dead-end error state) are unioned into one composite NFA, which
# is compiled to a composite DFA used to detect where each constituent pattern may
# start and end matching.
# NOTE(review): relies on `compact_map` / `includes?` (Crystal-style collection
# methods) — presumably provided by patches.rb; verify against that file.
class MultiMatchDFA
  include DSL

  # @original_nfas : Array(NFA)
  attr_reader :nfas_with_err_state # : Array(NFA)
  attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
  attr_accessor :composite_nfa # : NFA
  attr_accessor :composite_dfa # : DFA

  attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
  attr_accessor :nfa_to_index # : Hash(NFA, Int32)
  attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
  attr_accessor :dfa_to_index # : Hash(DFA, Int32)

  def initialize(nfas)
    # the composite alphabet is the union of all constituent NFA alphabets
    composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }

    @original_nfas = nfas
    @nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
    dfas = @original_nfas.map(&:to_dfa)

    # index every representation of each machine so any one can be mapped back to its MachineTuple
    @nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
    @nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
    @dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
    @machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h

    # build a mapping of (state -> nfa) pairs that capture which nfa owns each state
    @dead_end_nfa_state_to_dead_end_nfa = Hash.new
    @nfas_with_err_state.each do |nfa_with_dead_err|
      nfa_with_dead_err.states.each do |state|
        @dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
      end
    end

    # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
    @composite_nfa = create_composite_nfa(@nfas_with_err_state)
    @composite_dfa = @composite_nfa.to_dfa
  end

  # Look up the MachineTuple by its original NFA.
  def machines_from_nfa(nfa) # : MachineTuple
    machines_by_index[nfa_to_index[nfa]]
  end

  # Look up the MachineTuple by its dead-end-augmented NFA.
  def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
    machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
  end

  # Look up the MachineTuple by its compiled DFA.
  def machines_from_dfa(dfa) # : MachineTuple
    machines_by_index[dfa_to_index[dfa]]
  end

  # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
  def create_composite_nfa(nfas)
    nfa = union!(nfas)

    # add epsilon transitions from all the states except the start state back to the start state
    # (this is what lets a new candidate match begin at any input position)
    nfa.states.each do |state|
      if state != nfa.start_state
        nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
      end
    end

    nfa.update_final_states

    nfa
  end

  # Feeds `input` through a fresh clone of the composite DFA and returns the
  # MatchTracker populated by the transition callbacks.
  def match_tracker(input) # : MatchTracker
    dfa = @composite_dfa.deep_clone
    match_tracker = setup_callbacks(dfa)

    input.each_char.with_index do |char, index|
      dfa.handle_token!(char, index)
    end

    match_tracker
  end

  # Full matching pass: returns Hash(original NFA -> Array(MatchRef)).
  # Pass 1 (match_tracker) finds candidate start positions; pass 2 re-runs each
  # candidate machine's own DFA from those positions to confirm matches.
  def matches(input) # : Hash(NFA, Array(MatchRef))
    mt = match_tracker(input)

    start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions

    # empty matches are recorded directly as zero-width ranges
    mt.empty_matches.each do |nfa_with_dead_err, indices|
      original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
      indices.each do |index|
        mt.add_match(original_nfa, MatchRef.new(input, index...index))
      end
    end

    active_dfas = Array.new # the Int32 represents the start of match

    input.each_char.with_index do |char, index|
      # advance every in-flight candidate; record matches; drop candidates that errored
      active_dfas.reject! do |active_dfa_tuple|
        dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple

        dfa_clone.handle_token!(char, index)
        mt.add_match(original_nfa, MatchRef.new(input, start_of_match_index..index)) if dfa_clone.accept?

        dfa_clone.error?
      end

      # spin up a fresh candidate DFA for every NFA that may start matching here
      if nfas_with_dead_err = start_index_to_nfas_that_may_match[index]
        nfas_with_dead_err.each do |nfa_with_dead_err|
          machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
          original_nfa = machines.nfa
          dfa = machines.dfa
          dfa_clone = dfa.shallow_clone

          dfa_clone.handle_token!(char, index)
          mt.add_match(original_nfa, MatchRef.new(input, index..index)) if dfa_clone.accept?

          active_dfas << [dfa_clone, original_nfa, index] unless dfa_clone.error?
        end
      end
    end

    mt.matches
  end

  # Registers transition callbacks on `dfa` that feed a new MatchTracker, and
  # returns that tracker. Three DFA-state sets are computed first (see numbered
  # comments below); the callbacks then consult them per transition.
  def setup_callbacks(dfa)
    match_tracker = MatchTracker.new

    # 1. identify DFA states that correspond to successful match of first character of the NFAs
    epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
    nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
      reject {|transition| transition.epsilon? || transition.to.error? }.
      map(&:to).to_set
    dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
      compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
      reduce(Set.new) {|memo, state_set| memo | state_set }
    dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
    dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
      dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
        select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
        compact_map do |nfa_state|
          dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
        end.to_set
    end

    # 2. identify DFA states that correspond to final states in the NFAs
    nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
    dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
      reduce(Set.new) {|memo, state_set| memo | state_set }
    dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
    dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
      dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
        select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
        compact_map do |nfa_state|
          dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
        end.to_set
    end

    # 3. Identify DFA states that correspond to successful match without even having seen any characters.
    # These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
    nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
    dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
      compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
      reduce(Set.new) {|memo, state_set| memo | state_set }
    dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
    dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
      dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
        select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
        compact_map do |nfa_state|
          dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
        end.to_set
    end

    # set up call transition call backs, since the callbacks may only be defined once per state and transition
    # For (1):
    # Set up transition callbacks to push the index position of the start of a match of each NFA that has begun
    # to be matched on the transition to one of the states in (1)
    # For (2):
    # set up transition callbacks to push the index position of the end of a successful match onto the list
    # of successful matches for the NFA that matched
    # For (3):
    # set up transition callbacks to capture successful empty matches
    destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
    destination_dfa_states_for_callbacks.each do |dfa_state|
      dfa.on_transition_to(dfa_state) do |transition, token, token_index|
        destination_dfa_state = transition.to

        should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
        should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
        should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)

        if should_track_empty_match
          dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
            match_tracker.add_empty_match(nfa_with_dead_end, token_index)
          end
        end

        if should_track_start_of_candidate_match
          nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
          nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
          dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
          dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
            match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
          end
        end

        if should_track_end_of_match
          dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
            match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
          end
        end
      end
    end

    match_tracker
  end

end
|
227
|
+
|
228
|
+
# Bookkeeping accumulated while a MultiMatchDFA scans an input string.
class MatchTracker
  # The NFA keys in the following two structures are not the original NFAs supplied
  # to the MultiMatchDFA; they are those NFAs augmented with a dead-end error state,
  # i.e. objects internal to the MultiMatchDFA.
  attr_accessor :candidate_match_start_positions # Hash: NFA -> Array of start-of-match index positions

  # End positions are indices at which, after handling a character, the DFA was seen
  # in an accept state. Interpretation is ambiguous: the accept may come from
  # (1) transitioning to an error state that is also final (possibly an empty match,
  # e.g. in an optional/kleene-star DFA), or (2) a normal transition to a non-error
  # final state — so an index may mark either the inclusive end of a match or the
  # start of an empty match. The ambiguity stems from the composite DFA's
  # construction: dead-end error states are epsilon-connected back to its start state.
  attr_accessor :match_end_positions # Hash: NFA -> Array of end-of-match index positions
  attr_accessor :empty_matches # Hash: NFA -> Array of empty-match index positions

  # Keyed by the ORIGINAL NFAs supplied to the MultiMatchDFA (unlike the
  # augmented-NFA keys used by the structures above).
  attr_accessor :matches # Hash: NFA -> Array(MatchRef)

  def initialize
    @candidate_match_start_positions = {}
    @match_end_positions = {}
    @empty_matches = {}
    @matches = {}
  end

  # Start-position list for `nfa`, created on demand.
  def start_positions(nfa)
    candidate_match_start_positions[nfa] ||= []
  end

  # End-position list for `nfa`, created on demand.
  def end_positions(nfa)
    match_end_positions[nfa] ||= []
  end

  # Empty-match-position list for `nfa`, created on demand.
  def empty_match_positions(nfa)
    empty_matches[nfa] ||= []
  end

  # Confirmed-match list for `nfa`, created on demand.
  def matches_for(nfa)
    matches[nfa] ||= []
  end

  def add_start_of_candidate_match(nfa_with_dead_end, token_index)
    start_positions(nfa_with_dead_end) << token_index
  end

  # End positions are inclusive of the last matched character's index, so empty
  # matches are not represented in match_end_positions.
  def add_end_of_match(nfa_with_dead_end, token_index)
    end_positions(nfa_with_dead_end) << token_index
  end

  def add_empty_match(nfa_with_dead_end, token_index)
    empty_match_positions(nfa_with_dead_end) << token_index
  end

  # Inverts candidate_match_start_positions into Hash: index -> Array of NFAs that
  # may begin matching at that index.
  def invert_candidate_match_start_positions
    candidate_match_start_positions.each_with_object({}) do |(nfa_with_dead_end, indices), index_to_nfas|
      indices.each do |index|
        (index_to_nfas[index] ||= []) << nfa_with_dead_end
      end
    end
  end

  def add_match(nfa, match)
    matches_for(nfa) << match
  end
end
|
308
|
+
end
|