kleene 0.6.0 → 0.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +3 -2
- data/kleene.gemspec +1 -0
- data/lib/kleene/multi_match_dfa.rb +20 -14
- data/lib/kleene/naive_online_regex.rb +63 -0
- data/lib/kleene/online_dfa.rb +323 -0
- data/lib/kleene/parser.rb +9 -0
- data/lib/kleene/patches.rb +6 -0
- data/lib/kleene/version.rb +1 -1
- data/lib/kleene.rb +4 -1
- metadata +19 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 674bbda22ddfbc6c4ec1624de621b96c24576cbe8aa656d228697e1d98549cdb
|
4
|
+
data.tar.gz: ddca6b95201b21359dd23c5d6b4d9591561e9045764ce90b470b6add1c4518b8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9392d0b56aa48b8cef4f0337be625d6160d49c3fb0214b034f379a505ee3a142347e8bb5e62a82ad0cb982bb3ca00ad6f9b12f951e00cf48b0447a1b6fb78320
|
7
|
+
data.tar.gz: 1521365696f470bc249dac8aa77038bc9121ff60499dc9368aaf86224e64af14a06e092aef191bc9571d62d2dd63567a7051f4b07490e4f377eac8d24de83025
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
kleene (0.
|
4
|
+
kleene (0.6.0)
|
5
5
|
activesupport (~> 7.1)
|
6
|
+
regexp_parser (~> 2.8)
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -45,7 +46,7 @@ GEM
|
|
45
46
|
parser (3.2.2.4)
|
46
47
|
ast (~> 2.4.1)
|
47
48
|
racc
|
48
|
-
racc (1.7.
|
49
|
+
racc (1.7.3)
|
49
50
|
rainbow (3.1.1)
|
50
51
|
rake (13.1.0)
|
51
52
|
rbs (2.8.4)
|
data/kleene.gemspec
CHANGED
@@ -33,6 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
|
34
34
|
# Uncomment to register a new dependency of your gem
|
35
35
|
spec.add_dependency "activesupport", "~> 7.1"
|
36
|
+
spec.add_dependency "regexp_parser", "~> 2.8"
|
36
37
|
|
37
38
|
# For more information and examples about making a new gem, check out our
|
38
39
|
# guide at: https://bundler.io/guides/creating_gem.html
|
@@ -77,18 +77,10 @@ module Kleene
|
|
77
77
|
|
78
78
|
nfa
|
79
79
|
end
|
80
|
+
end
|
80
81
|
|
81
|
-
|
82
|
-
|
83
|
-
match_tracker = setup_callbacks(dfa)
|
84
|
-
|
85
|
-
input.each_char.with_index do |char, index|
|
86
|
-
dfa.handle_token!(char, index)
|
87
|
-
end
|
88
|
-
|
89
|
-
match_tracker
|
90
|
-
end
|
91
|
-
|
82
|
+
class BatchMultiMatchDFA < MultiMatchDFA
|
83
|
+
# #matches(input) is the batch-style matching interface
|
92
84
|
def matches(input) # : Hash(NFA, Array(MatchRef))
|
93
85
|
mt = match_tracker(input)
|
94
86
|
|
@@ -131,8 +123,19 @@ module Kleene
|
|
131
123
|
mt.matches
|
132
124
|
end
|
133
125
|
|
126
|
+
def match_tracker(input) # : BatchMatchTracker
|
127
|
+
dfa = @composite_dfa.deep_clone
|
128
|
+
match_tracker = setup_callbacks(dfa)
|
129
|
+
|
130
|
+
input.each_char.with_index do |char, index|
|
131
|
+
dfa.handle_token!(char, index)
|
132
|
+
end
|
133
|
+
|
134
|
+
match_tracker
|
135
|
+
end
|
136
|
+
|
134
137
|
def setup_callbacks(dfa)
|
135
|
-
match_tracker =
|
138
|
+
match_tracker = BatchMatchTracker.new
|
136
139
|
|
137
140
|
# 1. identify DFA states that correspond to successful match of first character of the NFAs
|
138
141
|
epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
|
@@ -222,10 +225,9 @@ module Kleene
|
|
222
225
|
|
223
226
|
match_tracker
|
224
227
|
end
|
225
|
-
|
226
228
|
end
|
227
229
|
|
228
|
-
class
|
230
|
+
class BatchMatchTracker
|
229
231
|
# The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
|
230
232
|
# They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
|
231
233
|
# are the internal state of a MultiMatchDFA
|
@@ -249,6 +251,10 @@ module Kleene
|
|
249
251
|
attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
|
250
252
|
|
251
253
|
def initialize
|
254
|
+
reset
|
255
|
+
end
|
256
|
+
|
257
|
+
def reset
|
252
258
|
@candidate_match_start_positions = Hash.new
|
253
259
|
@match_end_positions = Hash.new
|
254
260
|
@empty_matches = Hash.new
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require "set"
|
2
|
+
require "stringio"
|
3
|
+
require_relative "./kleene"
|
4
|
+
|
5
|
+
module Kleene
|
6
|
+
class NaiveOnlineRegex
|
7
|
+
def initialize(regexen, window_size = 100)
|
8
|
+
@regexen = regexen
|
9
|
+
@window_size = window_size
|
10
|
+
|
11
|
+
reset
|
12
|
+
end
|
13
|
+
|
14
|
+
def reset
|
15
|
+
@buffer = ""
|
16
|
+
@matches_per_regex = Hash.new # Hash(Regexp, Set(MatchData))
|
17
|
+
end
|
18
|
+
|
19
|
+
# #ingest(input) is the online-style matching interface
|
20
|
+
def ingest(input, debug = false) # : Set(OnlineMatch)
|
21
|
+
@buffer << input
|
22
|
+
new_online_matches = Set.new
|
23
|
+
@regexen.each do |regex|
|
24
|
+
existing_matches_for_regex = (@matches_per_regex[regex] ||= Set.new)
|
25
|
+
scan_matches = @buffer.scan_matches(regex).to_set
|
26
|
+
new_matches = scan_matches - existing_matches_for_regex # new_matches : Set(MatchData)
|
27
|
+
existing_matches_for_regex.merge(new_matches)
|
28
|
+
new_online_matches.merge(new_matches.map {|match_data| OnlineMatch.new(regex, match_data) })
|
29
|
+
end
|
30
|
+
resize_buffer!
|
31
|
+
new_online_matches
|
32
|
+
end
|
33
|
+
|
34
|
+
def matches # Hash(Regexp, Set(MatchData))
|
35
|
+
@matches_per_regex
|
36
|
+
end
|
37
|
+
|
38
|
+
def matches_for(regex) # Set(MatchData) | Nil
|
39
|
+
@matches_per_regex[regex]
|
40
|
+
end
|
41
|
+
|
42
|
+
def resize_buffer!
|
43
|
+
if @buffer.size > @window_size
|
44
|
+
@buffer = @buffer[-@window_size..-1]
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# A {Regexp, MatchData} pair
|
50
|
+
class OnlineMatch
|
51
|
+
attr_reader :regex # Regexp
|
52
|
+
attr_reader :match # MatchData
|
53
|
+
def initialize(regex, match)
|
54
|
+
@regex, @match = regex, match
|
55
|
+
end
|
56
|
+
def to_a
|
57
|
+
@match.to_a
|
58
|
+
end
|
59
|
+
def to_h
|
60
|
+
{@regex => to_a}
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,323 @@
|
|
1
|
+
require "stringio"
|
2
|
+
require_relative "./kleene"
|
3
|
+
|
4
|
+
module Kleene
|
5
|
+
class MachineTuple
|
6
|
+
attr_accessor :nfa # : NFA
|
7
|
+
attr_accessor :nfa_with_dead_err # : NFA
|
8
|
+
attr_accessor :dfa # : DFA
|
9
|
+
|
10
|
+
def initialize(nfa, nfa_with_dead_err, dfa)
|
11
|
+
@nfa, @nfa_with_dead_err, @dfa = nfa, nfa_with_dead_err, dfa
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class OnlineDFA
|
16
|
+
include DSL
|
17
|
+
|
18
|
+
# @original_nfas : Array(NFA)
|
19
|
+
attr_reader :nfas_with_err_state # : Array(NFA)
|
20
|
+
attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
|
21
|
+
attr_accessor :composite_nfa # : NFA
|
22
|
+
attr_accessor :composite_dfa # : DFA
|
23
|
+
|
24
|
+
attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
|
25
|
+
attr_accessor :nfa_to_index # : Hash(NFA, Int32)
|
26
|
+
attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
|
27
|
+
attr_accessor :dfa_to_index # : Hash(DFA, Int32)
|
28
|
+
|
29
|
+
def initialize(nfas)
|
30
|
+
composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }
|
31
|
+
|
32
|
+
@original_nfas = nfas
|
33
|
+
@nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
|
34
|
+
dfas = @original_nfas.map(&:to_dfa)
|
35
|
+
|
36
|
+
@nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
|
37
|
+
@nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
|
38
|
+
@dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
|
39
|
+
@machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h
|
40
|
+
|
41
|
+
# build a mapping of (state -> nfa) pairs that capture which nfa owns each state
|
42
|
+
@dead_end_nfa_state_to_dead_end_nfa = Hash.new
|
43
|
+
@nfas_with_err_state.each do |nfa_with_dead_err|
|
44
|
+
nfa_with_dead_err.states.each do |state|
|
45
|
+
@dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
|
50
|
+
@composite_nfa = create_composite_nfa(@nfas_with_err_state)
|
51
|
+
@composite_dfa = @composite_nfa.to_dfa
|
52
|
+
|
53
|
+
reset
|
54
|
+
end
|
55
|
+
|
56
|
+
def machines_from_nfa(nfa) # : MachineTuple
|
57
|
+
machines_by_index[nfa_to_index[nfa]]
|
58
|
+
end
|
59
|
+
|
60
|
+
def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
|
61
|
+
machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
|
62
|
+
end
|
63
|
+
|
64
|
+
def machines_from_dfa(dfa) # : MachineTuple
|
65
|
+
machines_by_index[dfa_to_index[dfa]]
|
66
|
+
end
|
67
|
+
|
68
|
+
# create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
|
69
|
+
def create_composite_nfa(nfas)
|
70
|
+
nfa = union!(nfas)
|
71
|
+
|
72
|
+
# add epsilon transitions from all the states except the start state back to the start state
|
73
|
+
nfa.states.each do |state|
|
74
|
+
if state != nfa.start_state
|
75
|
+
nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
nfa.update_final_states
|
80
|
+
|
81
|
+
nfa
|
82
|
+
end
|
83
|
+
|
84
|
+
def reset # : OnlineMatchTracker
|
85
|
+
@active_composite_dfa = @composite_dfa.deep_clone
|
86
|
+
@active_candidate_dfas = []
|
87
|
+
@match_tracker = setup_callbacks(@active_composite_dfa)
|
88
|
+
@buffer = ""
|
89
|
+
end
|
90
|
+
|
91
|
+
# #ingest(input) is the online-style matching interface
|
92
|
+
def ingest(input, debug = false) # : Hash(NFA, Array(MatchRef))
|
93
|
+
mt = @match_tracker
|
94
|
+
|
95
|
+
start_index_of_input_fragment_in_buffer = @buffer.length
|
96
|
+
|
97
|
+
input.each_char.with_index do |char, index|
|
98
|
+
@active_composite_dfa.handle_token!(char, start_index_of_input_fragment_in_buffer + index)
|
99
|
+
end
|
100
|
+
|
101
|
+
@buffer << input
|
102
|
+
|
103
|
+
start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions
|
104
|
+
|
105
|
+
mt.empty_matches.each do |nfa_with_dead_err, indices|
|
106
|
+
original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
|
107
|
+
indices.select {|index| index >= start_index_of_input_fragment_in_buffer }.each do |index|
|
108
|
+
mt.add_match(original_nfa, MatchRef.new(@buffer, index...index))
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
input.each_char.with_index do |char, index|
|
113
|
+
index_in_buffer = start_index_of_input_fragment_in_buffer + index
|
114
|
+
|
115
|
+
@active_candidate_dfas.reject! do |active_dfa_tuple|
|
116
|
+
dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple
|
117
|
+
|
118
|
+
dfa_clone.handle_token!(char, index_in_buffer)
|
119
|
+
mt.add_match(original_nfa, MatchRef.new(@buffer, start_of_match_index..index_in_buffer)) if dfa_clone.accept?
|
120
|
+
|
121
|
+
dfa_clone.error?
|
122
|
+
end
|
123
|
+
|
124
|
+
if nfas_with_dead_err = start_index_to_nfas_that_may_match[index_in_buffer]
|
125
|
+
nfas_with_dead_err.each do |nfa_with_dead_err|
|
126
|
+
machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
|
127
|
+
original_nfa = machines.nfa
|
128
|
+
dfa = machines.dfa
|
129
|
+
dfa_clone = dfa.shallow_clone
|
130
|
+
|
131
|
+
dfa_clone.handle_token!(char, index_in_buffer)
|
132
|
+
mt.add_match(original_nfa, MatchRef.new(@buffer, index_in_buffer..index_in_buffer)) if dfa_clone.accept?
|
133
|
+
|
134
|
+
@active_candidate_dfas << [dfa_clone, original_nfa, index_in_buffer] unless dfa_clone.error?
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
matches
|
140
|
+
end
|
141
|
+
|
142
|
+
def matches
|
143
|
+
@match_tracker.matches
|
144
|
+
end
|
145
|
+
|
146
|
+
def setup_callbacks(dfa)
|
147
|
+
match_tracker = OnlineMatchTracker.new
|
148
|
+
|
149
|
+
# 1. identify DFA states that correspond to successful match of first character of the NFAs
|
150
|
+
epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
|
151
|
+
nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
|
152
|
+
reject {|transition| transition.epsilon? || transition.to.error? }.
|
153
|
+
map(&:to).to_set
|
154
|
+
dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
|
155
|
+
compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
|
156
|
+
reduce(Set.new) {|memo, state_set| memo | state_set }
|
157
|
+
dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
|
158
|
+
dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
|
159
|
+
dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
|
160
|
+
select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
|
161
|
+
compact_map do |nfa_state|
|
162
|
+
dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
|
163
|
+
end.to_set
|
164
|
+
end
|
165
|
+
|
166
|
+
# 2. identify DFA states that correspond to final states in the NFAs
|
167
|
+
nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
|
168
|
+
dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
|
169
|
+
reduce(Set.new) {|memo, state_set| memo | state_set }
|
170
|
+
dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
|
171
|
+
dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
|
172
|
+
dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
|
173
|
+
select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
|
174
|
+
compact_map do |nfa_state|
|
175
|
+
dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
|
176
|
+
end.to_set
|
177
|
+
end
|
178
|
+
|
179
|
+
# 3. Identify DFA states that correspond to successful match without even having seen any characters.
|
180
|
+
# These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
|
181
|
+
nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
|
182
|
+
dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
|
183
|
+
compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
|
184
|
+
reduce(Set.new) {|memo, state_set| memo | state_set }
|
185
|
+
dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
|
186
|
+
dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
|
187
|
+
dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
|
188
|
+
select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
|
189
|
+
compact_map do |nfa_state|
|
190
|
+
dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
|
191
|
+
end.to_set
|
192
|
+
end
|
193
|
+
|
194
|
+
# set up call transition call backs, since the callbacks may only be defined once per state and transition
|
195
|
+
# For (1):
|
196
|
+
# Set up transition callbacks to push the index position of the start of a match of each NFA that has begun
|
197
|
+
# to be matched on the transition to one of the states in (1)
|
198
|
+
# For (2):
|
199
|
+
# set up transition callbacks to push the index position of the end of a successful match onto the list
|
200
|
+
# of successful matches for the NFA that matched
|
201
|
+
# For (3):
|
202
|
+
# set up transision callbacks to capture successful empty matches
|
203
|
+
destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
|
204
|
+
destination_dfa_states_for_callbacks.each do |dfa_state|
|
205
|
+
dfa.on_transition_to(dfa_state) do |transition, token, token_index|
|
206
|
+
destination_dfa_state = transition.to
|
207
|
+
|
208
|
+
should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
|
209
|
+
should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
|
210
|
+
should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)
|
211
|
+
|
212
|
+
if should_track_empty_match
|
213
|
+
dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
|
214
|
+
match_tracker.add_empty_match(nfa_with_dead_end, token_index)
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
if should_track_start_of_candidate_match
|
219
|
+
nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
|
220
|
+
nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
|
221
|
+
dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
|
222
|
+
dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
|
223
|
+
match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
if should_track_end_of_match
|
228
|
+
dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
|
229
|
+
match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
match_tracker
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
class OnlineMatchTracker
|
240
|
+
# The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
|
241
|
+
# They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
|
242
|
+
# are the internal state of a MultiMatchDFA
|
243
|
+
attr_accessor :candidate_match_start_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfStartOfMatch)
|
244
|
+
# The end positions are indices at which, after handling the character, the DFA was observed to be in a match/accept state;
|
245
|
+
# however, the interpretation is ambiguous, because the accepting state may be as a result of (1) transitioning to an error state that is also marked final/accepting,
|
246
|
+
# OR it may be as a result of transitioning to (2) a non-error final state.
|
247
|
+
# In the case of (1), the match may be an empty match, where after transitioning to an error state, the DFA is in a state that
|
248
|
+
# is equivalent to the error state and start state and final state (e.g. as in an optional or kleene star DFA),
|
249
|
+
# while in the case of (2), the match may be a "normal" match.
|
250
|
+
# The ambiguity is problematic because it isn't clear whether the index position of the match is end inclusive end of a match
|
251
|
+
# or the beginning of an empty match.
|
252
|
+
# This ambiguity is all due to the construction of the composite DFA in the MultiMatchDFA - the dead end error states are epsilon-transitioned
|
253
|
+
# to the composite DFA's start state.
|
254
|
+
attr_accessor :match_end_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEndOfMatch)
|
255
|
+
attr_accessor :empty_matches # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEmptyMatch)
|
256
|
+
|
257
|
+
# The NFA keys in the following structure are the original NFAs supplied to the MultiMatchDFA.
|
258
|
+
# This is in contrast to the augmented NFAs that are used as keys in the candidate_match_start_positions and
|
259
|
+
# match_end_positions structures, documented above ^^^.
|
260
|
+
attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
|
261
|
+
|
262
|
+
def initialize
|
263
|
+
reset
|
264
|
+
end
|
265
|
+
|
266
|
+
def reset
|
267
|
+
@candidate_match_start_positions = Hash.new
|
268
|
+
@match_end_positions = Hash.new
|
269
|
+
@empty_matches = Hash.new
|
270
|
+
@matches = Hash.new
|
271
|
+
end
|
272
|
+
|
273
|
+
def start_positions(nfa)
|
274
|
+
candidate_match_start_positions[nfa] ||= Array.new
|
275
|
+
end
|
276
|
+
|
277
|
+
def end_positions(nfa)
|
278
|
+
match_end_positions[nfa] ||= Array.new
|
279
|
+
end
|
280
|
+
|
281
|
+
def empty_match_positions(nfa)
|
282
|
+
empty_matches[nfa] ||= Array.new
|
283
|
+
end
|
284
|
+
|
285
|
+
def matches_for(nfa)
|
286
|
+
matches[nfa] ||= Array.new
|
287
|
+
end
|
288
|
+
|
289
|
+
def add_start_of_candidate_match(nfa_with_dead_end, token_index)
|
290
|
+
# puts "add_start_of_candidate_match(#{nfa.object_id}, #{token_index})"
|
291
|
+
positions = start_positions(nfa_with_dead_end)
|
292
|
+
positions << token_index
|
293
|
+
end
|
294
|
+
|
295
|
+
# the end positions are inclusive of the index of the last character matched, so empty matches are not accounted for in the match_end_positions array
|
296
|
+
def add_end_of_match(nfa_with_dead_end, token_index)
|
297
|
+
# puts "add_end_of_match(#{nfa.object_id}, #{token_index})"
|
298
|
+
positions = end_positions(nfa_with_dead_end)
|
299
|
+
positions << token_index
|
300
|
+
end
|
301
|
+
|
302
|
+
def add_empty_match(nfa_with_dead_end, token_index)
|
303
|
+
positions = empty_match_positions(nfa_with_dead_end)
|
304
|
+
positions << token_index
|
305
|
+
end
|
306
|
+
|
307
|
+
def invert_candidate_match_start_positions # : Hash(Int32, Array(NFA))
|
308
|
+
index_to_nfas = Hash.new
|
309
|
+
candidate_match_start_positions.each do |nfa_with_dead_end, indices|
|
310
|
+
indices.each do |index|
|
311
|
+
nfas = index_to_nfas[index] ||= Array.new
|
312
|
+
nfas << nfa_with_dead_end
|
313
|
+
end
|
314
|
+
end
|
315
|
+
index_to_nfas
|
316
|
+
end
|
317
|
+
|
318
|
+
def add_match(nfa, match)
|
319
|
+
matches = matches_for(nfa)
|
320
|
+
matches << match
|
321
|
+
end
|
322
|
+
end
|
323
|
+
end
|
data/lib/kleene/patches.rb
CHANGED
data/lib/kleene/version.rb
CHANGED
data/lib/kleene.rb
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require "active_support"
|
4
4
|
require "active_support/core_ext"
|
5
|
+
require "regexp_parser"
|
5
6
|
require_relative "kleene/version"
|
6
7
|
require_relative "kleene/patches"
|
7
8
|
require_relative "kleene/kleene"
|
@@ -9,7 +10,9 @@ require_relative "kleene/dsl"
|
|
9
10
|
require_relative "kleene/nfa"
|
10
11
|
require_relative "kleene/dfa"
|
11
12
|
require_relative "kleene/multi_match_dfa"
|
12
|
-
|
13
|
+
require_relative "kleene/online_dfa"
|
14
|
+
require_relative "kleene/naive_online_regex"
|
15
|
+
require_relative "kleene/parser"
|
13
16
|
|
14
17
|
module Kleene
|
15
18
|
class Error < StandardError; end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kleene
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Ellis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '7.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: regexp_parser
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.8'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.8'
|
27
41
|
description: kleene is a library for building regular expression recognition automata
|
28
42
|
- nfas, dfas, and some specialty structures.
|
29
43
|
email:
|
@@ -45,7 +59,10 @@ files:
|
|
45
59
|
- lib/kleene/dsl.rb
|
46
60
|
- lib/kleene/kleene.rb
|
47
61
|
- lib/kleene/multi_match_dfa.rb
|
62
|
+
- lib/kleene/naive_online_regex.rb
|
48
63
|
- lib/kleene/nfa.rb
|
64
|
+
- lib/kleene/online_dfa.rb
|
65
|
+
- lib/kleene/parser.rb
|
49
66
|
- lib/kleene/patches.rb
|
50
67
|
- lib/kleene/version.rb
|
51
68
|
homepage: https://github.com/davidkellis/kleene-rb
|