kleene 0.6.0 → 0.8.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: adae43aaa27339c7c8affc091b968c782c6494095fbb513c5f95ca423f854b56
-  data.tar.gz: 80b52c420273ba1f8c16a5b1e9aebbb18782477fe121ae71ec7603736f2b15ac
+  metadata.gz: 03ad3b5293809eb768a2bb47eb3f1a9ccee680d7d71873d75c7511eb9fc711be
+  data.tar.gz: cb0e8f6600e878153bf2454cea8e3dd4dc67d1872660bd1a77b0648a5f91efa5
 SHA512:
-  metadata.gz: 3572f4e64ce3941fe071bfefdc2e3e3c87c82a14ebf98049d323114397a367491ff76c25afbc7caad12b91cb9eddc17e598efcacc23eb59fe8d4996bb0602cd0
-  data.tar.gz: 0053e335ec9a68d34237ba83f281abf33c1b94f76a31437705a63e68583cc75af4f5b61b0e303ba149a0ce8062ecb4bd7bc4ad63f1293c4e1c905b390ba87df1
+  metadata.gz: f7043fb3024741baf02ed48de44d14c616d4bb83315ad23481d0908c962fbd41a56a9ccc8d0444577526baa94933621b43203c29b2cba14192d98d8f5a7246ef
+  data.tar.gz: a78f7aee47db290800efa79cb6096d604f30dc249907db8759bb319b29f01d378801395557a9a8760fc3b6d06e2e447335398de053e1594233da62128a3aafb9
data/.rubocop.yml ADDED
@@ -0,0 +1,14 @@
+AllCops:
+  StyleGuideBaseURL: https://rubystyle.guide
+
+Layout/SpaceInsideBlockBraces:
+  SpaceBeforeBlockParameters: false
+
+Layout/LineLength:
+  Max: 160
+
+Style/AccessorGrouping:
+  EnforcedStyle: separated
+
+Style/Encoding:
+  Enabled: true
data/Gemfile.lock CHANGED
@@ -1,8 +1,9 @@
 PATH
   remote: .
   specs:
-    kleene (0.1.0)
+    kleene (0.7.0)
       activesupport (~> 7.1)
+      regexp_parser (~> 2.8)
 
 GEM
   remote: https://rubygems.org/
@@ -45,7 +46,7 @@ GEM
     parser (3.2.2.4)
       ast (~> 2.4.1)
       racc
-    racc (1.7.2)
+    racc (1.7.3)
     rainbow (3.1.1)
     rake (13.1.0)
     rbs (2.8.4)
data/kleene.gemspec CHANGED
@@ -33,6 +33,7 @@ Gem::Specification.new do |spec|
 
   # Uncomment to register a new dependency of your gem
   spec.add_dependency "activesupport", "~> 7.1"
+  spec.add_dependency "regexp_parser", "~> 2.8"
 
   # For more information and examples about making a new gem, check out our
   # guide at: https://bundler.io/guides/creating_gem.html
data/lib/kleene/kleene.rb CHANGED
@@ -1,8 +1,8 @@
 # this is a port and extension of https://github.com/davidkellis/kleene/
 
-require_relative "./dsl"
-require_relative "./nfa"
-require_relative "./dfa"
+require_relative './dsl'
+require_relative './nfa'
+require_relative './dfa'
 
 module Kleene
   # The default alphabet consists of the following:
@@ -28,7 +28,6 @@ module Kleene
       State.new(final, true)
     end
 
-
     attr_reader :id # : Int32
     attr_accessor :final # : Bool
     attr_accessor :error # : Bool
@@ -76,13 +75,11 @@ module Kleene
     end
 
     def ==(other)
-      @string == other.string &&
-        @range == other.range
+      @string == other.string && @range == other.range
     end
 
     def eql?(other)
       self == other
     end
   end
-
 end
data/lib/kleene/multi_match_dfa.rb CHANGED
@@ -77,18 +77,10 @@ module Kleene
 
       nfa
     end
+  end
 
-    def match_tracker(input) # : MatchTracker
-      dfa = @composite_dfa.deep_clone
-      match_tracker = setup_callbacks(dfa)
-
-      input.each_char.with_index do |char, index|
-        dfa.handle_token!(char, index)
-      end
-
-      match_tracker
-    end
-
+  class BatchMultiMatchDFA < MultiMatchDFA
+    # #matches(input) is the batch-style matching interface
     def matches(input) # : Hash(NFA, Array(MatchRef))
       mt = match_tracker(input)
 
@@ -131,8 +123,19 @@
       mt.matches
     end
 
+    def match_tracker(input) # : BatchMatchTracker
+      dfa = @composite_dfa.deep_clone
+      match_tracker = setup_callbacks(dfa)
+
+      input.each_char.with_index do |char, index|
+        dfa.handle_token!(char, index)
+      end
+
+      match_tracker
+    end
+
     def setup_callbacks(dfa)
-      match_tracker = MatchTracker.new
+      match_tracker = BatchMatchTracker.new
 
       # 1. identify DFA states that correspond to successful match of first character of the NFAs
       epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
@@ -222,10 +225,9 @@
 
       match_tracker
     end
-
   end
 
-  class MatchTracker
+  class BatchMatchTracker
     # The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
     # They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
     # are the internal state of a MultiMatchDFA
@@ -249,6 +251,10 @@
     attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
 
     def initialize
+      reset
+    end
+
+    def reset
      @candidate_match_start_positions = Hash.new
      @match_end_positions = Hash.new
      @empty_matches = Hash.new
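The hunk above splits batch matching into a BatchMultiMatchDFA subclass and renames MatchTracker to BatchMatchTracker. A minimal usage sketch follows; the DSL `literal` helper and the constructor signature are assumptions not shown in this diff:

  # build one NFA per pattern, then match a whole input string in one batch pass
  include Kleene::DSL
  nfas  = [literal("ab"), literal("b")]            # `literal` assumed to build an NFA for a string literal
  batch = Kleene::BatchMultiMatchDFA.new(nfas)     # constructor signature assumed, not shown in this hunk
  batch.matches("abab").each do |nfa, match_refs|  # Hash(NFA, Array(MatchRef)), per the comment in the diff
    match_refs.each {|match_ref| puts match_ref.inspect }
  end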
data/lib/kleene/naive_online_regex.rb ADDED
@@ -0,0 +1,102 @@
+require 'set'
+require_relative './kleene'
+
+module Kleene
+  class NaiveOnlineRegex
+    def initialize(regexen, window_size = 100)
+      @regexen = regexen
+      @window_size = window_size
+
+      reset
+    end
+
+    def reset
+      @buffer = ''
+      @matches_per_regex = {} # Hash(Regexp, Set(OnlineMatch))
+    end
+
+    # #ingest(input) is the online-style matching interface
+    def ingest(input, _debug = false) # : Set(OnlineMatch)
+      @buffer << input
+      new_online_matches = Set.new
+      @regexen.each do |regex|
+        existing_matches_for_regex = (@matches_per_regex[regex] ||= Set.new)
+        scan_matches = @buffer.scan_matches(regex)
+        scan_online_matches = scan_matches.map {|match_data| OnlineMatch.new(regex, match_data) }.to_set
+        new_matches = scan_online_matches - existing_matches_for_regex # new_matches : Set(OnlineMatch)
+        existing_matches_for_regex.merge(new_matches)
+        new_online_matches.merge(new_matches)
+      end
+      resize_buffer!
+      new_online_matches
+    end
+
+    def matches # Hash(Regexp, Set(OnlineMatch))
+      @matches_per_regex
+    end
+
+    def matches_for(regex) # Set(OnlineMatch) | Nil
+      @matches_per_regex[regex]
+    end
+
+    def resize_buffer!
+      return unless @buffer.size > @window_size
+
+      number_of_chars_at_front_of_buffer_that_should_roll_off = @buffer.size - @window_size
+
+      @buffer = @buffer[-@window_size..-1]
+      drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_should_roll_off)
+    end
+
+    def drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_rolled_off)
+      @matches_per_regex.each do |regex, match_set|
+        match_set.reject! {|online_match| online_match.offsets.first < number_of_chars_at_front_of_buffer_that_rolled_off }
+      end
+    end
+  end
+
+  # A {Regexp, MatchData} pair
+  class OnlineMatch
+    # Regexp # MatchData # Array(Int) -> [start, end] # excludes the end offset
+    attr_reader :regex
+    attr_reader :match
+    attr_reader :offsets # Array(Int) -> [start, end] # excludes the end offset
+
+    def initialize(regex, match)
+      @regex, @match = regex, match
+      @offsets = match.offset(0)
+    end
+
+    def identity
+      [@regex, @offsets, to_a]
+    end
+
+    def ==(other)
+      identity == other.identity
+    end
+
+    def eql?(other)
+      self == other
+    end
+
+    def hash
+      identity.hash
+    end
+
+    def to_a
+      @match.to_a
+    end
+
+    def to_h
+      { @regex => to_a, :offsets => @offsets }
+    end
+
+    def captures
+      @match.captures
+    end
+
+    def [](*args)
+      @match.method(:[]).call(*args)
+    end
+  end
+end
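A short usage sketch of the NaiveOnlineRegex class added above: input is fed incrementally and each #ingest call returns only OnlineMatch values not reported before. The concrete patterns and expected results below are illustrative:

  online = Kleene::NaiveOnlineRegex.new([/ab+/, /\d+/], 32)  # keep a 32-character sliding window
  online.ingest("xxab")                                      # => Set containing the OnlineMatch for "ab"
  new_matches = online.ingest("bb12")                        # => matches for "abbb" and "12" only
  new_matches.each {|m| puts "#{m.regex.inspect} matched #{m.to_a.first.inspect} at #{m.offsets.inspect}" }
  online.matches_for(/ab+/)                                  # accumulated matches; those that roll off the window are dropped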
data/lib/kleene/online_dfa.rb ADDED
@@ -0,0 +1,323 @@
+require "stringio"
+require_relative "./kleene"
+
+module Kleene
+  class MachineTuple
+    attr_accessor :nfa # : NFA
+    attr_accessor :nfa_with_dead_err # : NFA
+    attr_accessor :dfa # : DFA
+
+    def initialize(nfa, nfa_with_dead_err, dfa)
+      @nfa, @nfa_with_dead_err, @dfa = nfa, nfa_with_dead_err, dfa
+    end
+  end
+
+  class OnlineDFA
+    include DSL
+
+    # @original_nfas : Array(NFA)
+    attr_reader :nfas_with_err_state # : Array(NFA)
+    attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
+    attr_accessor :composite_nfa # : NFA
+    attr_accessor :composite_dfa # : DFA
+
+    attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
+    attr_accessor :nfa_to_index # : Hash(NFA, Int32)
+    attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
+    attr_accessor :dfa_to_index # : Hash(DFA, Int32)
+
+    def initialize(nfas)
+      composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }
+
+      @original_nfas = nfas
+      @nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
+      dfas = @original_nfas.map(&:to_dfa)
+
+      @nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
+      @nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
+      @dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
+      @machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h
+
+      # build a mapping of (state -> nfa) pairs that capture which nfa owns each state
+      @dead_end_nfa_state_to_dead_end_nfa = Hash.new
+      @nfas_with_err_state.each do |nfa_with_dead_err|
+        nfa_with_dead_err.states.each do |state|
+          @dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
+        end
+      end
+
+      # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
+      @composite_nfa = create_composite_nfa(@nfas_with_err_state)
+      @composite_dfa = @composite_nfa.to_dfa
+
+      reset
+    end
+
+    def machines_from_nfa(nfa) # : MachineTuple
+      machines_by_index[nfa_to_index[nfa]]
+    end
+
+    def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
+      machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
+    end
+
+    def machines_from_dfa(dfa) # : MachineTuple
+      machines_by_index[dfa_to_index[dfa]]
+    end
+
+    # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
+    def create_composite_nfa(nfas)
+      nfa = union!(nfas)
+
+      # add epsilon transitions from all the states except the start state back to the start state
+      nfa.states.each do |state|
+        if state != nfa.start_state
+          nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
+        end
+      end
+
+      nfa.update_final_states
+
+      nfa
+    end
+
+    def reset # : OnlineMatchTracker
+      @active_composite_dfa = @composite_dfa.deep_clone
+      @active_candidate_dfas = []
+      @match_tracker = setup_callbacks(@active_composite_dfa)
+      @buffer = ""
+    end
+
+    # #ingest(input) is the online-style matching interface
+    def ingest(input, debug = false) # : Hash(NFA, Array(MatchRef))
+      mt = @match_tracker
+
+      start_index_of_input_fragment_in_buffer = @buffer.length
+
+      input.each_char.with_index do |char, index|
+        @active_composite_dfa.handle_token!(char, start_index_of_input_fragment_in_buffer + index)
+      end
+
+      @buffer << input
+
+      start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions
+
+      mt.empty_matches.each do |nfa_with_dead_err, indices|
+        original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
+        indices.select {|index| index >= start_index_of_input_fragment_in_buffer }.each do |index|
+          mt.add_match(original_nfa, MatchRef.new(@buffer, index...index))
+        end
+      end
+
+      input.each_char.with_index do |char, index|
+        index_in_buffer = start_index_of_input_fragment_in_buffer + index
+
+        @active_candidate_dfas.reject! do |active_dfa_tuple|
+          dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple
+
+          dfa_clone.handle_token!(char, index_in_buffer)
+          mt.add_match(original_nfa, MatchRef.new(@buffer, start_of_match_index..index_in_buffer)) if dfa_clone.accept?
+
+          dfa_clone.error?
+        end
+
+        if nfas_with_dead_err = start_index_to_nfas_that_may_match[index_in_buffer]
+          nfas_with_dead_err.each do |nfa_with_dead_err|
+            machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
+            original_nfa = machines.nfa
+            dfa = machines.dfa
+            dfa_clone = dfa.shallow_clone
+
+            dfa_clone.handle_token!(char, index_in_buffer)
+            mt.add_match(original_nfa, MatchRef.new(@buffer, index_in_buffer..index_in_buffer)) if dfa_clone.accept?
+
+            @active_candidate_dfas << [dfa_clone, original_nfa, index_in_buffer] unless dfa_clone.error?
+          end
+        end
+      end
+
+      matches
+    end
+
+    def matches
+      @match_tracker.matches
+    end
+
+    def setup_callbacks(dfa)
+      match_tracker = OnlineMatchTracker.new
+
+      # 1. identify DFA states that correspond to successful match of first character of the NFAs
+      epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
+      nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
+        reject {|transition| transition.epsilon? || transition.to.error? }.
+        map(&:to).to_set
+      dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
+        compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
+        reduce(Set.new) {|memo, state_set| memo | state_set }
+      dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
+      dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
+        dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
+          select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
+          compact_map do |nfa_state|
+            dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
+          end.to_set
+      end
+
+      # 2. identify DFA states that correspond to final states in the NFAs
+      nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
+      dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
+        reduce(Set.new) {|memo, state_set| memo | state_set }
+      dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
+      dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
+        dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
+          select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
+          compact_map do |nfa_state|
+            dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
+          end.to_set
+      end
+
+      # 3. Identify DFA states that correspond to successful match without even having seen any characters.
+      # These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
+      nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
+      dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
+        compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
+        reduce(Set.new) {|memo, state_set| memo | state_set }
+      dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
+      dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
+        dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
+          select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
+          compact_map do |nfa_state|
+            dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
+          end.to_set
+      end
+
+      # set up transition callbacks, since the callbacks may only be defined once per state and transition
+      # For (1):
+      #   set up transition callbacks to push the index position of the start of a match of each NFA that has begun
+      #   to be matched on the transition to one of the states in (1)
+      # For (2):
+      #   set up transition callbacks to push the index position of the end of a successful match onto the list
+      #   of successful matches for the NFA that matched
+      # For (3):
+      #   set up transition callbacks to capture successful empty matches
+      destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
+      destination_dfa_states_for_callbacks.each do |dfa_state|
+        dfa.on_transition_to(dfa_state) do |transition, token, token_index|
+          destination_dfa_state = transition.to
+
+          should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
+          should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
+          should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)
+
+          if should_track_empty_match
+            dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
+              match_tracker.add_empty_match(nfa_with_dead_end, token_index)
+            end
+          end
+
+          if should_track_start_of_candidate_match
+            nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
+            nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
+            dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
+            dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
+              match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
+            end
+          end
+
+          if should_track_end_of_match
+            dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
+              match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
+            end
+          end
+        end
+      end
+
+      match_tracker
+    end
+  end
+
+  class OnlineMatchTracker
+    # The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
+    # They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
+    # are the internal state of a MultiMatchDFA
+    attr_accessor :candidate_match_start_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfStartOfMatch)
+    # The end positions are indices at which, after handling the character, the DFA was observed to be in a match/accept state;
+    # however, the interpretation is ambiguous, because the accepting state may be as a result of (1) transitioning to an error state that is also marked final/accepting,
+    # OR it may be as a result of transitioning to (2) a non-error final state.
+    # In the case of (1), the match may be an empty match, where after transitioning to an error state, the DFA is in a state that
+    # is equivalent to the error state and start state and final state (e.g. as in an optional or kleene star DFA),
+    # while in the case of (2), the match may be a "normal" match.
+    # The ambiguity is problematic because it isn't clear whether the index position of the match is the inclusive end of a match
+    # or the beginning of an empty match.
+    # This ambiguity is all due to the construction of the composite DFA in the MultiMatchDFA - the dead end error states are epsilon-transitioned
+    # to the composite DFA's start state.
+    attr_accessor :match_end_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEndOfMatch)
+    attr_accessor :empty_matches # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEmptyMatch)
+
+    # The NFA keys in the following structure are the original NFAs supplied to the MultiMatchDFA.
+    # This is in contrast to the augmented NFAs that are used as keys in the candidate_match_start_positions and
+    # match_end_positions structures, documented above ^^^.
+    attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
+
+    def initialize
+      reset
+    end
+
+    def reset
+      @candidate_match_start_positions = Hash.new
+      @match_end_positions = Hash.new
+      @empty_matches = Hash.new
+      @matches = Hash.new
+    end
+
+    def start_positions(nfa)
+      candidate_match_start_positions[nfa] ||= Array.new
+    end
+
+    def end_positions(nfa)
+      match_end_positions[nfa] ||= Array.new
+    end
+
+    def empty_match_positions(nfa)
+      empty_matches[nfa] ||= Array.new
+    end
+
+    def matches_for(nfa)
+      matches[nfa] ||= Array.new
+    end
+
+    def add_start_of_candidate_match(nfa_with_dead_end, token_index)
+      # puts "add_start_of_candidate_match(#{nfa.object_id}, #{token_index})"
+      positions = start_positions(nfa_with_dead_end)
+      positions << token_index
+    end
+
+    # the end positions are inclusive of the index of the last character matched, so empty matches are not accounted for in the match_end_positions array
+    def add_end_of_match(nfa_with_dead_end, token_index)
+      # puts "add_end_of_match(#{nfa.object_id}, #{token_index})"
+      positions = end_positions(nfa_with_dead_end)
+      positions << token_index
+    end
+
+    def add_empty_match(nfa_with_dead_end, token_index)
+      positions = empty_match_positions(nfa_with_dead_end)
+      positions << token_index
+    end
+
+    def invert_candidate_match_start_positions # : Hash(Int32, Array(NFA))
+      index_to_nfas = Hash.new
+      candidate_match_start_positions.each do |nfa_with_dead_end, indices|
+        indices.each do |index|
+          nfas = index_to_nfas[index] ||= Array.new
+          nfas << nfa_with_dead_end
+        end
+      end
+      index_to_nfas
+    end
+
+    def add_match(nfa, match)
+      matches = matches_for(nfa)
+      matches << match
+    end
+  end
+end
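A usage sketch of the OnlineDFA class added above: feed the input in fragments and collect completed matches as they appear. OnlineDFA.new, #ingest, and #reset come from the code in this diff; the DSL `literal` helper is an assumption:

  include Kleene::DSL
  nfas   = [literal("ab"), literal("bc")]  # `literal` assumed to build an NFA per pattern
  online = Kleene::OnlineDFA.new(nfas)
  online.ingest("a")                       # no complete match yet
  matches = online.ingest("bc")            # => Hash(NFA, Array(MatchRef)) covering "ab" and "bc"
  matches.each {|nfa, refs| refs.each {|ref| puts ref.inspect } }
  online.reset                             # discard the buffer and all tracked matches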
data/lib/kleene/parser.rb ADDED
@@ -0,0 +1,9 @@
+
+module Kleene
+  class Parser
+    def parse(pattern)
+      ast = Regexp::Parser.parse(pattern)
+      ast
+    end
+  end
+end
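Kleene::Parser is a thin wrapper over the regexp_parser gem; #parse returns that gem's expression tree. A quick sketch, where the root class name and #expressions walk are regexp_parser behavior rather than anything defined in this diff:

  ast = Kleene::Parser.new.parse(/a(b|c)*/)
  ast.class                    # => Regexp::Expression::Root
  ast.to_s                     # => "a(b|c)*"
  ast.expressions.each {|node| puts "#{node.class}: #{node.to_s.inspect}" }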
data/lib/kleene/patches.rb CHANGED
@@ -12,12 +12,16 @@ module Enumerable
     ary = []
     each do |e|
       v = block.call(e)
-      unless v.nil?
-        ary << v
-      end
+      ary << v unless v.nil?
     end
     ary
   end
 
-  alias_method :includes?, :include?
+  alias includes? include?
+end
+
+class String
+  def scan_matches(pattern) # : Array(MatchData)
+    to_enum(:scan, pattern).map { Regexp.last_match }
+  end
 end
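The String#scan_matches monkeypatch above returns full MatchData objects rather than the strings that String#scan yields. A quick sketch, with the expected results shown as comments:

  matches = "a1b22c333".scan_matches(/\d+/)   # => Array of MatchData, one per match
  matches.map {|md| [md[0], md.offset(0)] }   # => [["1", [1, 2]], ["22", [3, 5]], ["333", [6, 9]]]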
data/lib/kleene/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Kleene
-  VERSION = "0.6.0"
+  VERSION = "0.8.0"
 end
data/lib/kleene.rb CHANGED
@@ -1,15 +1,18 @@
 # frozen_string_literal: true
 
-require "active_support"
-require "active_support/core_ext"
-require_relative "kleene/version"
-require_relative "kleene/patches"
-require_relative "kleene/kleene"
-require_relative "kleene/dsl"
-require_relative "kleene/nfa"
-require_relative "kleene/dfa"
-require_relative "kleene/multi_match_dfa"
-
+require 'active_support'
+require 'active_support/core_ext'
+require 'regexp_parser'
+require_relative 'kleene/version'
+require_relative 'kleene/patches'
+require_relative 'kleene/kleene'
+require_relative 'kleene/dsl'
+require_relative 'kleene/nfa'
+require_relative 'kleene/dfa'
+require_relative 'kleene/multi_match_dfa'
+require_relative 'kleene/online_dfa'
+require_relative 'kleene/naive_online_regex'
+require_relative 'kleene/parser'
 
 module Kleene
   class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: kleene
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 0.8.0
 platform: ruby
 authors:
 - David Ellis
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-11-04 00:00:00.000000000 Z
+date: 2023-11-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -24,6 +24,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '7.1'
+- !ruby/object:Gem::Dependency
+  name: regexp_parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.8'
 description: kleene is a library for building regular expression recognition automata
   - nfas, dfas, and some specialty structures.
 email:
@@ -33,6 +47,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".rspec"
+- ".rubocop.yml"
 - Gemfile
 - Gemfile.lock
 - LICENSE
@@ -45,7 +60,10 @@ files:
 - lib/kleene/dsl.rb
 - lib/kleene/kleene.rb
 - lib/kleene/multi_match_dfa.rb
+- lib/kleene/naive_online_regex.rb
 - lib/kleene/nfa.rb
+- lib/kleene/online_dfa.rb
+- lib/kleene/parser.rb
 - lib/kleene/patches.rb
 - lib/kleene/version.rb
 homepage: https://github.com/davidkellis/kleene-rb