kleene 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +14 -0
- data/Gemfile.lock +3 -2
- data/kleene.gemspec +1 -0
- data/lib/kleene/kleene.rb +4 -7
- data/lib/kleene/multi_match_dfa.rb +20 -14
- data/lib/kleene/naive_online_regex.rb +102 -0
- data/lib/kleene/online_dfa.rb +323 -0
- data/lib/kleene/parser.rb +9 -0
- data/lib/kleene/patches.rb +8 -4
- data/lib/kleene/version.rb +1 -1
- data/lib/kleene.rb +13 -10
- metadata +20 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03ad3b5293809eb768a2bb47eb3f1a9ccee680d7d71873d75c7511eb9fc711be
|
4
|
+
data.tar.gz: cb0e8f6600e878153bf2454cea8e3dd4dc67d1872660bd1a77b0648a5f91efa5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f7043fb3024741baf02ed48de44d14c616d4bb83315ad23481d0908c962fbd41a56a9ccc8d0444577526baa94933621b43203c29b2cba14192d98d8f5a7246ef
|
7
|
+
data.tar.gz: a78f7aee47db290800efa79cb6096d604f30dc249907db8759bb319b29f01d378801395557a9a8760fc3b6d06e2e447335398de053e1594233da62128a3aafb9
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
AllCops:
|
2
|
+
StyleGuideBaseURL: https://rubystyle.guide
|
3
|
+
|
4
|
+
Layout/SpaceInsideBlockBraces:
|
5
|
+
SpaceBeforeBlockParameters: false
|
6
|
+
|
7
|
+
Layout/LineLength:
|
8
|
+
Max: 160
|
9
|
+
|
10
|
+
Style/AccessorGrouping:
|
11
|
+
EnforcedStyle: separated
|
12
|
+
|
13
|
+
Style/Encoding:
|
14
|
+
Enabled: true
|
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
kleene (0.
|
4
|
+
kleene (0.7.0)
|
5
5
|
activesupport (~> 7.1)
|
6
|
+
regexp_parser (~> 2.8)
|
6
7
|
|
7
8
|
GEM
|
8
9
|
remote: https://rubygems.org/
|
@@ -45,7 +46,7 @@ GEM
|
|
45
46
|
parser (3.2.2.4)
|
46
47
|
ast (~> 2.4.1)
|
47
48
|
racc
|
48
|
-
racc (1.7.
|
49
|
+
racc (1.7.3)
|
49
50
|
rainbow (3.1.1)
|
50
51
|
rake (13.1.0)
|
51
52
|
rbs (2.8.4)
|
data/kleene.gemspec
CHANGED
@@ -33,6 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
|
34
34
|
# Uncomment to register a new dependency of your gem
|
35
35
|
spec.add_dependency "activesupport", "~> 7.1"
|
36
|
+
spec.add_dependency "regexp_parser", "~> 2.8"
|
36
37
|
|
37
38
|
# For more information and examples about making a new gem, check out our
|
38
39
|
# guide at: https://bundler.io/guides/creating_gem.html
|
data/lib/kleene/kleene.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# this is a port and extension of https://github.com/davidkellis/kleene/
|
2
2
|
|
3
|
-
require_relative
|
4
|
-
require_relative
|
5
|
-
require_relative
|
3
|
+
require_relative './dsl'
|
4
|
+
require_relative './nfa'
|
5
|
+
require_relative './dfa'
|
6
6
|
|
7
7
|
module Kleene
|
8
8
|
# The default alphabet consists of the following:
|
@@ -28,7 +28,6 @@ module Kleene
|
|
28
28
|
State.new(final, true)
|
29
29
|
end
|
30
30
|
|
31
|
-
|
32
31
|
attr_reader :id # : Int32
|
33
32
|
attr_accessor :final # : Bool
|
34
33
|
attr_accessor :error # : Bool
|
@@ -76,13 +75,11 @@ module Kleene
|
|
76
75
|
end
|
77
76
|
|
78
77
|
def ==(other)
|
79
|
-
@string == other.string &&
|
80
|
-
@range == other.range
|
78
|
+
@string == other.string && @range == other.range
|
81
79
|
end
|
82
80
|
|
83
81
|
def eql?(other)
|
84
82
|
self == other
|
85
83
|
end
|
86
84
|
end
|
87
|
-
|
88
85
|
end
|
@@ -77,18 +77,10 @@ module Kleene
|
|
77
77
|
|
78
78
|
nfa
|
79
79
|
end
|
80
|
+
end
|
80
81
|
|
81
|
-
|
82
|
-
|
83
|
-
match_tracker = setup_callbacks(dfa)
|
84
|
-
|
85
|
-
input.each_char.with_index do |char, index|
|
86
|
-
dfa.handle_token!(char, index)
|
87
|
-
end
|
88
|
-
|
89
|
-
match_tracker
|
90
|
-
end
|
91
|
-
|
82
|
+
class BatchMultiMatchDFA < MultiMatchDFA
|
83
|
+
# #matches(input) is the batch-style matching interface
|
92
84
|
def matches(input) # : Hash(NFA, Array(MatchRef))
|
93
85
|
mt = match_tracker(input)
|
94
86
|
|
@@ -131,8 +123,19 @@ module Kleene
|
|
131
123
|
mt.matches
|
132
124
|
end
|
133
125
|
|
126
|
+
def match_tracker(input) # : BatchMatchTracker
|
127
|
+
dfa = @composite_dfa.deep_clone
|
128
|
+
match_tracker = setup_callbacks(dfa)
|
129
|
+
|
130
|
+
input.each_char.with_index do |char, index|
|
131
|
+
dfa.handle_token!(char, index)
|
132
|
+
end
|
133
|
+
|
134
|
+
match_tracker
|
135
|
+
end
|
136
|
+
|
134
137
|
def setup_callbacks(dfa)
|
135
|
-
match_tracker =
|
138
|
+
match_tracker = BatchMatchTracker.new
|
136
139
|
|
137
140
|
# 1. identify DFA states that correspond to successful match of first character of the NFAs
|
138
141
|
epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
|
@@ -222,10 +225,9 @@ module Kleene
|
|
222
225
|
|
223
226
|
match_tracker
|
224
227
|
end
|
225
|
-
|
226
228
|
end
|
227
229
|
|
228
|
-
class
|
230
|
+
class BatchMatchTracker
|
229
231
|
# The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
|
230
232
|
# They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
|
231
233
|
# are the internal state of a MultiMatchDFA
|
@@ -249,6 +251,10 @@ module Kleene
|
|
249
251
|
attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
|
250
252
|
|
251
253
|
def initialize
|
254
|
+
reset
|
255
|
+
end
|
256
|
+
|
257
|
+
def reset
|
252
258
|
@candidate_match_start_positions = Hash.new
|
253
259
|
@match_end_positions = Hash.new
|
254
260
|
@empty_matches = Hash.new
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'set'
|
2
|
+
require_relative './kleene'
|
3
|
+
|
4
|
+
module Kleene
|
5
|
+
class NaiveOnlineRegex
|
6
|
+
def initialize(regexen, window_size = 100)
|
7
|
+
@regexen = regexen
|
8
|
+
@window_size = window_size
|
9
|
+
|
10
|
+
reset
|
11
|
+
end
|
12
|
+
|
13
|
+
def reset
|
14
|
+
@buffer = ''
|
15
|
+
@matches_per_regex = {} # Hash(Regexp, Set(OnlineMatch))
|
16
|
+
end
|
17
|
+
|
18
|
+
# #ingest(input) is the online-style matching interface
|
19
|
+
def ingest(input, _debug = false) # : Set(OnlineMatch)
|
20
|
+
@buffer << input
|
21
|
+
new_online_matches = Set.new
|
22
|
+
@regexen.each do |regex|
|
23
|
+
existing_matches_for_regex = (@matches_per_regex[regex] ||= Set.new)
|
24
|
+
scan_matches = @buffer.scan_matches(regex)
|
25
|
+
scan_online_matches = scan_matches.map {|match_data| OnlineMatch.new(regex, match_data) }.to_set
|
26
|
+
new_matches = scan_online_matches - existing_matches_for_regex # new_matches : Set(OnlineMatch)
|
27
|
+
existing_matches_for_regex.merge(new_matches)
|
28
|
+
new_online_matches.merge(new_matches)
|
29
|
+
end
|
30
|
+
resize_buffer!
|
31
|
+
new_online_matches
|
32
|
+
end
|
33
|
+
|
34
|
+
def matches # Hash(Regexp, Set(OnlineMatch))
|
35
|
+
@matches_per_regex
|
36
|
+
end
|
37
|
+
|
38
|
+
def matches_for(regex) # Set(OnlineMatch) | Nil
|
39
|
+
@matches_per_regex[regex]
|
40
|
+
end
|
41
|
+
|
42
|
+
def resize_buffer!
|
43
|
+
return unless @buffer.size > @window_size
|
44
|
+
|
45
|
+
number_of_chars_at_front_of_buffer_that_should_roll_off = @buffer.size - @window_size
|
46
|
+
|
47
|
+
@buffer = @buffer[-@window_size..-1]
|
48
|
+
drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_should_roll_off)
|
49
|
+
end
|
50
|
+
|
51
|
+
def drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_rolled_off)
|
52
|
+
@matches_per_regex.each do |regex, match_set|
|
53
|
+
match_set.reject! {|online_match| online_match.offsets.first < number_of_chars_at_front_of_buffer_that_rolled_off }
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# A {Regexp, MatchData} pair
|
59
|
+
class OnlineMatch
|
60
|
+
# Regexp # MatchData # Array(Int) -> [start, end] # excludes the end offset
|
61
|
+
attr_reader :regex
|
62
|
+
attr_reader :match
|
63
|
+
attr_reader :offsets # Regexp # MatchData # Array(Int) -> [start, end] # excludes the end offset
|
64
|
+
|
65
|
+
def initialize(regex, match)
|
66
|
+
@regex, @match, @offsets = regex, match
|
67
|
+
@offsets = match.offset(0)
|
68
|
+
end
|
69
|
+
|
70
|
+
def identity
|
71
|
+
[@regex, @offsets, to_a]
|
72
|
+
end
|
73
|
+
|
74
|
+
def ==(other)
|
75
|
+
identity == other.identity
|
76
|
+
end
|
77
|
+
|
78
|
+
def eql?(other)
|
79
|
+
self == other
|
80
|
+
end
|
81
|
+
|
82
|
+
def hash
|
83
|
+
identity.hash
|
84
|
+
end
|
85
|
+
|
86
|
+
def to_a
|
87
|
+
@match.to_a
|
88
|
+
end
|
89
|
+
|
90
|
+
def to_h
|
91
|
+
{ @regex => to_a, :offsets => @offsets }
|
92
|
+
end
|
93
|
+
|
94
|
+
def captures
|
95
|
+
@match.captures
|
96
|
+
end
|
97
|
+
|
98
|
+
def [](*args)
|
99
|
+
@match.method(:[]).call(*args)
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
@@ -0,0 +1,323 @@
|
|
1
|
+
require "stringio"
|
2
|
+
require_relative "./kleene"
|
3
|
+
|
4
|
+
module Kleene
|
5
|
+
class MachineTuple
|
6
|
+
attr_accessor :nfa # : NFA
|
7
|
+
attr_accessor :nfa_with_dead_err # : NFA
|
8
|
+
attr_accessor :dfa # : DFA
|
9
|
+
|
10
|
+
def initialize(nfa, nfa_with_dead_err, dfa)
|
11
|
+
@nfa, @nfa_with_dead_err, @dfa = nfa, nfa_with_dead_err, dfa
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
class OnlineDFA
|
16
|
+
include DSL
|
17
|
+
|
18
|
+
# @original_nfas : Array(NFA)
|
19
|
+
attr_reader :nfas_with_err_state # : Array(NFA)
|
20
|
+
attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
|
21
|
+
attr_accessor :composite_nfa # : NFA
|
22
|
+
attr_accessor :composite_dfa # : DFA
|
23
|
+
|
24
|
+
attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
|
25
|
+
attr_accessor :nfa_to_index # : Hash(NFA, Int32)
|
26
|
+
attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
|
27
|
+
attr_accessor :dfa_to_index # : Hash(DFA, Int32)
|
28
|
+
|
29
|
+
def initialize(nfas)
|
30
|
+
composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }
|
31
|
+
|
32
|
+
@original_nfas = nfas
|
33
|
+
@nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
|
34
|
+
dfas = @original_nfas.map(&:to_dfa)
|
35
|
+
|
36
|
+
@nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
|
37
|
+
@nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
|
38
|
+
@dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
|
39
|
+
@machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h
|
40
|
+
|
41
|
+
# build a mapping of (state -> nfa) pairs that capture which nfa owns each state
|
42
|
+
@dead_end_nfa_state_to_dead_end_nfa = Hash.new
|
43
|
+
@nfas_with_err_state.each do |nfa_with_dead_err|
|
44
|
+
nfa_with_dead_err.states.each do |state|
|
45
|
+
@dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
|
50
|
+
@composite_nfa = create_composite_nfa(@nfas_with_err_state)
|
51
|
+
@composite_dfa = @composite_nfa.to_dfa
|
52
|
+
|
53
|
+
reset
|
54
|
+
end
|
55
|
+
|
56
|
+
def machines_from_nfa(nfa) # : MachineTuple
|
57
|
+
machines_by_index[nfa_to_index[nfa]]
|
58
|
+
end
|
59
|
+
|
60
|
+
def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
|
61
|
+
machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
|
62
|
+
end
|
63
|
+
|
64
|
+
def machines_from_dfa(dfa) # : MachineTuple
|
65
|
+
machines_by_index[dfa_to_index[dfa]]
|
66
|
+
end
|
67
|
+
|
68
|
+
# create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
|
69
|
+
def create_composite_nfa(nfas)
|
70
|
+
nfa = union!(nfas)
|
71
|
+
|
72
|
+
# add epsilon transitions from all the states except the start state back to the start state
|
73
|
+
nfa.states.each do |state|
|
74
|
+
if state != nfa.start_state
|
75
|
+
nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
nfa.update_final_states
|
80
|
+
|
81
|
+
nfa
|
82
|
+
end
|
83
|
+
|
84
|
+
def reset # : OnlineMatchTracker
|
85
|
+
@active_composite_dfa = @composite_dfa.deep_clone
|
86
|
+
@active_candidate_dfas = []
|
87
|
+
@match_tracker = setup_callbacks(@active_composite_dfa)
|
88
|
+
@buffer = ""
|
89
|
+
end
|
90
|
+
|
91
|
+
# #ingest(input) is the online-style matching interface
|
92
|
+
def ingest(input, debug = false) # : Hash(NFA, Array(MatchRef))
|
93
|
+
mt = @match_tracker
|
94
|
+
|
95
|
+
start_index_of_input_fragment_in_buffer = @buffer.length
|
96
|
+
|
97
|
+
input.each_char.with_index do |char, index|
|
98
|
+
@active_composite_dfa.handle_token!(char, start_index_of_input_fragment_in_buffer + index)
|
99
|
+
end
|
100
|
+
|
101
|
+
@buffer << input
|
102
|
+
|
103
|
+
start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions
|
104
|
+
|
105
|
+
mt.empty_matches.each do |nfa_with_dead_err, indices|
|
106
|
+
original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
|
107
|
+
indices.select {|index| index >= start_index_of_input_fragment_in_buffer }.each do |index|
|
108
|
+
mt.add_match(original_nfa, MatchRef.new(@buffer, index...index))
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
input.each_char.with_index do |char, index|
|
113
|
+
index_in_buffer = start_index_of_input_fragment_in_buffer + index
|
114
|
+
|
115
|
+
@active_candidate_dfas.reject! do |active_dfa_tuple|
|
116
|
+
dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple
|
117
|
+
|
118
|
+
dfa_clone.handle_token!(char, index_in_buffer)
|
119
|
+
mt.add_match(original_nfa, MatchRef.new(@buffer, start_of_match_index..index_in_buffer)) if dfa_clone.accept?
|
120
|
+
|
121
|
+
dfa_clone.error?
|
122
|
+
end
|
123
|
+
|
124
|
+
if nfas_with_dead_err = start_index_to_nfas_that_may_match[index_in_buffer]
|
125
|
+
nfas_with_dead_err.each do |nfa_with_dead_err|
|
126
|
+
machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
|
127
|
+
original_nfa = machines.nfa
|
128
|
+
dfa = machines.dfa
|
129
|
+
dfa_clone = dfa.shallow_clone
|
130
|
+
|
131
|
+
dfa_clone.handle_token!(char, index_in_buffer)
|
132
|
+
mt.add_match(original_nfa, MatchRef.new(@buffer, index_in_buffer..index_in_buffer)) if dfa_clone.accept?
|
133
|
+
|
134
|
+
@active_candidate_dfas << [dfa_clone, original_nfa, index_in_buffer] unless dfa_clone.error?
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
matches
|
140
|
+
end
|
141
|
+
|
142
|
+
def matches
|
143
|
+
@match_tracker.matches
|
144
|
+
end
|
145
|
+
|
146
|
+
def setup_callbacks(dfa)
|
147
|
+
match_tracker = OnlineMatchTracker.new
|
148
|
+
|
149
|
+
# 1. identify DFA states that correspond to successful match of first character of the NFAs
|
150
|
+
epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
|
151
|
+
nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
|
152
|
+
reject {|transition| transition.epsilon? || transition.to.error? }.
|
153
|
+
map(&:to).to_set
|
154
|
+
dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
|
155
|
+
compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
|
156
|
+
reduce(Set.new) {|memo, state_set| memo | state_set }
|
157
|
+
dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
|
158
|
+
dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
|
159
|
+
dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
|
160
|
+
select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
|
161
|
+
compact_map do |nfa_state|
|
162
|
+
dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
|
163
|
+
end.to_set
|
164
|
+
end
|
165
|
+
|
166
|
+
# 2. identify DFA states that correspond to final states in the NFAs
|
167
|
+
nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
|
168
|
+
dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
|
169
|
+
reduce(Set.new) {|memo, state_set| memo | state_set }
|
170
|
+
dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
|
171
|
+
dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
|
172
|
+
dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
|
173
|
+
select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
|
174
|
+
compact_map do |nfa_state|
|
175
|
+
dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
|
176
|
+
end.to_set
|
177
|
+
end
|
178
|
+
|
179
|
+
# 3. Identify DFA states that correspond to successful match without even having seen any characters.
|
180
|
+
# These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
|
181
|
+
nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
|
182
|
+
dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
|
183
|
+
compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
|
184
|
+
reduce(Set.new) {|memo, state_set| memo | state_set }
|
185
|
+
dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
|
186
|
+
dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
|
187
|
+
dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
|
188
|
+
select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
|
189
|
+
compact_map do |nfa_state|
|
190
|
+
dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
|
191
|
+
end.to_set
|
192
|
+
end
|
193
|
+
|
194
|
+
# set up call transition call backs, since the callbacks may only be defined once per state and transition
|
195
|
+
# For (1):
|
196
|
+
# Set up transition callbacks to push the index position of the start of a match of each NFA that has begun
|
197
|
+
# to be matched on the transition to one of the states in (1)
|
198
|
+
# For (2):
|
199
|
+
# set up transition callbacks to push the index position of the end of a successful match onto the list
|
200
|
+
# of successful matches for the NFA that matched
|
201
|
+
# For (3):
|
202
|
+
# set up transision callbacks to capture successful empty matches
|
203
|
+
destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
|
204
|
+
destination_dfa_states_for_callbacks.each do |dfa_state|
|
205
|
+
dfa.on_transition_to(dfa_state) do |transition, token, token_index|
|
206
|
+
destination_dfa_state = transition.to
|
207
|
+
|
208
|
+
should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
|
209
|
+
should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
|
210
|
+
should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)
|
211
|
+
|
212
|
+
if should_track_empty_match
|
213
|
+
dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
|
214
|
+
match_tracker.add_empty_match(nfa_with_dead_end, token_index)
|
215
|
+
end
|
216
|
+
end
|
217
|
+
|
218
|
+
if should_track_start_of_candidate_match
|
219
|
+
nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
|
220
|
+
nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
|
221
|
+
dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
|
222
|
+
dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
|
223
|
+
match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
if should_track_end_of_match
|
228
|
+
dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
|
229
|
+
match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
match_tracker
|
236
|
+
end
|
237
|
+
end
|
238
|
+
|
239
|
+
class OnlineMatchTracker
|
240
|
+
# The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
|
241
|
+
# They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
|
242
|
+
# are the internal state of a MultiMatchDFA
|
243
|
+
attr_accessor :candidate_match_start_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfStartOfMatch)
|
244
|
+
# The end positions are indices at which, after handling the character, the DFA was observed to be in a match/accept state;
|
245
|
+
# however, the interpretation is ambiguous, because the accepting state may be as a result of (1) transitioning to an error state that is also marked final/accepting,
|
246
|
+
# OR it may be as a result of transitioning to (2) a non-error final state.
|
247
|
+
# In the case of (1), the match may be an empty match, where after transitioning to an error state, the DFA is in a state that
|
248
|
+
# is equivalent to the error state and start state and final state (e.g. as in an optional or kleene star DFA),
|
249
|
+
# while in the case of (2), the match may be a "normal" match.
|
250
|
+
# The ambiguity is problematic because it isn't clear whether the index position of the match is end inclusive end of a match
|
251
|
+
# or the beginning of an empty match.
|
252
|
+
# This ambiguity is all due to the construction of the composite DFA in the MultiMatchDFA - the dead end error states are epsilon-transitioned
|
253
|
+
# to the composite DFA's start state.
|
254
|
+
attr_accessor :match_end_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEndOfMatch)
|
255
|
+
attr_accessor :empty_matches # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEmptyMatch)
|
256
|
+
|
257
|
+
# The NFA keys in the following structure are the original NFAs supplied to the MultiMatchDFA.
|
258
|
+
# This is in contrast to the augmented NFAs that are used as keys in the candidate_match_start_positions and
|
259
|
+
# match_end_positions structures, documented above ^^^.
|
260
|
+
attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
|
261
|
+
|
262
|
+
def initialize
|
263
|
+
reset
|
264
|
+
end
|
265
|
+
|
266
|
+
def reset
|
267
|
+
@candidate_match_start_positions = Hash.new
|
268
|
+
@match_end_positions = Hash.new
|
269
|
+
@empty_matches = Hash.new
|
270
|
+
@matches = Hash.new
|
271
|
+
end
|
272
|
+
|
273
|
+
def start_positions(nfa)
|
274
|
+
candidate_match_start_positions[nfa] ||= Array.new
|
275
|
+
end
|
276
|
+
|
277
|
+
def end_positions(nfa)
|
278
|
+
match_end_positions[nfa] ||= Array.new
|
279
|
+
end
|
280
|
+
|
281
|
+
def empty_match_positions(nfa)
|
282
|
+
empty_matches[nfa] ||= Array.new
|
283
|
+
end
|
284
|
+
|
285
|
+
def matches_for(nfa)
|
286
|
+
matches[nfa] ||= Array.new
|
287
|
+
end
|
288
|
+
|
289
|
+
def add_start_of_candidate_match(nfa_with_dead_end, token_index)
|
290
|
+
# puts "add_start_of_candidate_match(#{nfa.object_id}, #{token_index})"
|
291
|
+
positions = start_positions(nfa_with_dead_end)
|
292
|
+
positions << token_index
|
293
|
+
end
|
294
|
+
|
295
|
+
# the end positions are inclusive of the index of the last character matched, so empty matches are not accounted for in the match_end_positions array
|
296
|
+
def add_end_of_match(nfa_with_dead_end, token_index)
|
297
|
+
# puts "add_end_of_match(#{nfa.object_id}, #{token_index})"
|
298
|
+
positions = end_positions(nfa_with_dead_end)
|
299
|
+
positions << token_index
|
300
|
+
end
|
301
|
+
|
302
|
+
def add_empty_match(nfa_with_dead_end, token_index)
|
303
|
+
positions = empty_match_positions(nfa_with_dead_end)
|
304
|
+
positions << token_index
|
305
|
+
end
|
306
|
+
|
307
|
+
def invert_candidate_match_start_positions # : Hash(Int32, Array(NFA))
|
308
|
+
index_to_nfas = Hash.new
|
309
|
+
candidate_match_start_positions.each do |nfa_with_dead_end, indices|
|
310
|
+
indices.each do |index|
|
311
|
+
nfas = index_to_nfas[index] ||= Array.new
|
312
|
+
nfas << nfa_with_dead_end
|
313
|
+
end
|
314
|
+
end
|
315
|
+
index_to_nfas
|
316
|
+
end
|
317
|
+
|
318
|
+
def add_match(nfa, match)
|
319
|
+
matches = matches_for(nfa)
|
320
|
+
matches << match
|
321
|
+
end
|
322
|
+
end
|
323
|
+
end
|
data/lib/kleene/patches.rb
CHANGED
@@ -12,12 +12,16 @@ module Enumerable
|
|
12
12
|
ary = []
|
13
13
|
each do |e|
|
14
14
|
v = block.call(e)
|
15
|
-
unless v.nil?
|
16
|
-
ary << v
|
17
|
-
end
|
15
|
+
ary << v unless v.nil?
|
18
16
|
end
|
19
17
|
ary
|
20
18
|
end
|
21
19
|
|
22
|
-
|
20
|
+
alias includes? include?
|
21
|
+
end
|
22
|
+
|
23
|
+
class String
|
24
|
+
def scan_matches(pattern) # : Array(MatchData)
|
25
|
+
to_enum(:scan, pattern).map { Regexp.last_match }
|
26
|
+
end
|
23
27
|
end
|
data/lib/kleene/version.rb
CHANGED
data/lib/kleene.rb
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
5
|
-
|
6
|
-
require_relative
|
7
|
-
require_relative
|
8
|
-
require_relative
|
9
|
-
require_relative
|
10
|
-
require_relative
|
11
|
-
require_relative
|
12
|
-
|
3
|
+
require 'active_support'
|
4
|
+
require 'active_support/core_ext'
|
5
|
+
require 'regexp_parser'
|
6
|
+
require_relative 'kleene/version'
|
7
|
+
require_relative 'kleene/patches'
|
8
|
+
require_relative 'kleene/kleene'
|
9
|
+
require_relative 'kleene/dsl'
|
10
|
+
require_relative 'kleene/nfa'
|
11
|
+
require_relative 'kleene/dfa'
|
12
|
+
require_relative 'kleene/multi_match_dfa'
|
13
|
+
require_relative 'kleene/online_dfa'
|
14
|
+
require_relative 'kleene/naive_online_regex'
|
15
|
+
require_relative 'kleene/parser'
|
13
16
|
|
14
17
|
module Kleene
|
15
18
|
class Error < StandardError; end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: kleene
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Ellis
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-
|
11
|
+
date: 2023-11-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '7.1'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: regexp_parser
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '2.8'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '2.8'
|
27
41
|
description: kleene is a library for building regular expression recognition automata
|
28
42
|
- nfas, dfas, and some specialty structures.
|
29
43
|
email:
|
@@ -33,6 +47,7 @@ extensions: []
|
|
33
47
|
extra_rdoc_files: []
|
34
48
|
files:
|
35
49
|
- ".rspec"
|
50
|
+
- ".rubocop.yml"
|
36
51
|
- Gemfile
|
37
52
|
- Gemfile.lock
|
38
53
|
- LICENSE
|
@@ -45,7 +60,10 @@ files:
|
|
45
60
|
- lib/kleene/dsl.rb
|
46
61
|
- lib/kleene/kleene.rb
|
47
62
|
- lib/kleene/multi_match_dfa.rb
|
63
|
+
- lib/kleene/naive_online_regex.rb
|
48
64
|
- lib/kleene/nfa.rb
|
65
|
+
- lib/kleene/online_dfa.rb
|
66
|
+
- lib/kleene/parser.rb
|
49
67
|
- lib/kleene/patches.rb
|
50
68
|
- lib/kleene/version.rb
|
51
69
|
homepage: https://github.com/davidkellis/kleene-rb
|