kleene 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: adae43aaa27339c7c8affc091b968c782c6494095fbb513c5f95ca423f854b56
4
- data.tar.gz: 80b52c420273ba1f8c16a5b1e9aebbb18782477fe121ae71ec7603736f2b15ac
3
+ metadata.gz: 03ad3b5293809eb768a2bb47eb3f1a9ccee680d7d71873d75c7511eb9fc711be
4
+ data.tar.gz: cb0e8f6600e878153bf2454cea8e3dd4dc67d1872660bd1a77b0648a5f91efa5
5
5
  SHA512:
6
- metadata.gz: 3572f4e64ce3941fe071bfefdc2e3e3c87c82a14ebf98049d323114397a367491ff76c25afbc7caad12b91cb9eddc17e598efcacc23eb59fe8d4996bb0602cd0
7
- data.tar.gz: 0053e335ec9a68d34237ba83f281abf33c1b94f76a31437705a63e68583cc75af4f5b61b0e303ba149a0ce8062ecb4bd7bc4ad63f1293c4e1c905b390ba87df1
6
+ metadata.gz: f7043fb3024741baf02ed48de44d14c616d4bb83315ad23481d0908c962fbd41a56a9ccc8d0444577526baa94933621b43203c29b2cba14192d98d8f5a7246ef
7
+ data.tar.gz: a78f7aee47db290800efa79cb6096d604f30dc249907db8759bb319b29f01d378801395557a9a8760fc3b6d06e2e447335398de053e1594233da62128a3aafb9
data/.rubocop.yml ADDED
@@ -0,0 +1,14 @@
1
+ AllCops:
2
+ StyleGuideBaseURL: https://rubystyle.guide
3
+
4
+ Layout/SpaceInsideBlockBraces:
5
+ SpaceBeforeBlockParameters: false
6
+
7
+ Layout/LineLength:
8
+ Max: 160
9
+
10
+ Style/AccessorGrouping:
11
+ EnforcedStyle: separated
12
+
13
+ Style/Encoding:
14
+ Enabled: true
data/Gemfile.lock CHANGED
@@ -1,8 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kleene (0.1.0)
4
+ kleene (0.7.0)
5
5
  activesupport (~> 7.1)
6
+ regexp_parser (~> 2.8)
6
7
 
7
8
  GEM
8
9
  remote: https://rubygems.org/
@@ -45,7 +46,7 @@ GEM
45
46
  parser (3.2.2.4)
46
47
  ast (~> 2.4.1)
47
48
  racc
48
- racc (1.7.2)
49
+ racc (1.7.3)
49
50
  rainbow (3.1.1)
50
51
  rake (13.1.0)
51
52
  rbs (2.8.4)
data/kleene.gemspec CHANGED
@@ -33,6 +33,7 @@ Gem::Specification.new do |spec|
33
33
 
34
34
  # Uncomment to register a new dependency of your gem
35
35
  spec.add_dependency "activesupport", "~> 7.1"
36
+ spec.add_dependency "regexp_parser", "~> 2.8"
36
37
 
37
38
  # For more information and examples about making a new gem, check out our
38
39
  # guide at: https://bundler.io/guides/creating_gem.html
data/lib/kleene/kleene.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  # this is a port and extension of https://github.com/davidkellis/kleene/
2
2
 
3
- require_relative "./dsl"
4
- require_relative "./nfa"
5
- require_relative "./dfa"
3
+ require_relative './dsl'
4
+ require_relative './nfa'
5
+ require_relative './dfa'
6
6
 
7
7
  module Kleene
8
8
  # The default alphabet consists of the following:
@@ -28,7 +28,6 @@ module Kleene
28
28
  State.new(final, true)
29
29
  end
30
30
 
31
-
32
31
  attr_reader :id # : Int32
33
32
  attr_accessor :final # : Bool
34
33
  attr_accessor :error # : Bool
@@ -76,13 +75,11 @@ module Kleene
76
75
  end
77
76
 
78
77
  def ==(other)
79
- @string == other.string &&
80
- @range == other.range
78
+ @string == other.string && @range == other.range
81
79
  end
82
80
 
83
81
  def eql?(other)
84
82
  self == other
85
83
  end
86
84
  end
87
-
88
85
  end
@@ -77,18 +77,10 @@ module Kleene
77
77
 
78
78
  nfa
79
79
  end
80
+ end
80
81
 
81
- def match_tracker(input) # : MatchTracker
82
- dfa = @composite_dfa.deep_clone
83
- match_tracker = setup_callbacks(dfa)
84
-
85
- input.each_char.with_index do |char, index|
86
- dfa.handle_token!(char, index)
87
- end
88
-
89
- match_tracker
90
- end
91
-
82
+ class BatchMultiMatchDFA < MultiMatchDFA
83
+ # #matches(input) is the batch-style matching interface
92
84
  def matches(input) # : Hash(NFA, Array(MatchRef))
93
85
  mt = match_tracker(input)
94
86
 
@@ -131,8 +123,19 @@ module Kleene
131
123
  mt.matches
132
124
  end
133
125
 
126
+ def match_tracker(input) # : BatchMatchTracker
127
+ dfa = @composite_dfa.deep_clone
128
+ match_tracker = setup_callbacks(dfa)
129
+
130
+ input.each_char.with_index do |char, index|
131
+ dfa.handle_token!(char, index)
132
+ end
133
+
134
+ match_tracker
135
+ end
136
+
134
137
  def setup_callbacks(dfa)
135
- match_tracker = MatchTracker.new
138
+ match_tracker = BatchMatchTracker.new
136
139
 
137
140
  # 1. identify DFA states that correspond to successful match of first character of the NFAs
138
141
  epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
@@ -222,10 +225,9 @@ module Kleene
222
225
 
223
226
  match_tracker
224
227
  end
225
-
226
228
  end
227
229
 
228
- class MatchTracker
230
+ class BatchMatchTracker
229
231
  # The NFA keys in the following two structures are not the original NFAs supplied to the MultiMatchDFA.
230
232
  # They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
231
233
  # are the internal state of a MultiMatchDFA
@@ -249,6 +251,10 @@ module Kleene
249
251
  attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
250
252
 
251
253
  def initialize
254
+ reset
255
+ end
256
+
257
+ def reset
252
258
  @candidate_match_start_positions = Hash.new
253
259
  @match_end_positions = Hash.new
254
260
  @empty_matches = Hash.new
@@ -0,0 +1,102 @@
1
+ require 'set'
2
+ require_relative './kleene'
3
+
4
+ module Kleene
5
+ class NaiveOnlineRegex
6
+ def initialize(regexen, window_size = 100)
7
+ @regexen = regexen
8
+ @window_size = window_size
9
+
10
+ reset
11
+ end
12
+
13
+ def reset
14
+ @buffer = ''
15
+ @matches_per_regex = {} # Hash(Regexp, Set(OnlineMatch))
16
+ end
17
+
18
+ # #ingest(input) is the online-style matching interface
19
+ def ingest(input, _debug = false) # : Set(OnlineMatch)
20
+ @buffer << input
21
+ new_online_matches = Set.new
22
+ @regexen.each do |regex|
23
+ existing_matches_for_regex = (@matches_per_regex[regex] ||= Set.new)
24
+ scan_matches = @buffer.scan_matches(regex)
25
+ scan_online_matches = scan_matches.map {|match_data| OnlineMatch.new(regex, match_data) }.to_set
26
+ new_matches = scan_online_matches - existing_matches_for_regex # new_matches : Set(OnlineMatch)
27
+ existing_matches_for_regex.merge(new_matches)
28
+ new_online_matches.merge(new_matches)
29
+ end
30
+ resize_buffer!
31
+ new_online_matches
32
+ end
33
+
34
+ def matches # Hash(Regexp, Set(OnlineMatch))
35
+ @matches_per_regex
36
+ end
37
+
38
+ def matches_for(regex) # Set(OnlineMatch) | Nil
39
+ @matches_per_regex[regex]
40
+ end
41
+
42
+ def resize_buffer!
43
+ return unless @buffer.size > @window_size
44
+
45
+ number_of_chars_at_front_of_buffer_that_should_roll_off = @buffer.size - @window_size
46
+
47
+ @buffer = @buffer[-@window_size..-1]
48
+ drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_should_roll_off)
49
+ end
50
+
51
+ def drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_rolled_off)
52
+ @matches_per_regex.each do |regex, match_set|
53
+ match_set.reject! {|online_match| online_match.offsets.first < number_of_chars_at_front_of_buffer_that_rolled_off }
54
+ end
55
+ end
56
+ end
57
+
58
+ # A {Regexp, MatchData} pair
59
+ class OnlineMatch
60
+ # regex : Regexp; match : MatchData; offsets : Array(Int) -> [start, end], where the end offset is exclusive
61
+ attr_reader :regex
62
+ attr_reader :match
63
+ attr_reader :offsets # Regexp # MatchData # Array(Int) -> [start, end] # excludes the end offset
64
+
65
+ def initialize(regex, match)
66
+ @regex, @match, @offsets = regex, match
67
+ @offsets = match.offset(0)
68
+ end
69
+
70
+ def identity
71
+ [@regex, @offsets, to_a]
72
+ end
73
+
74
+ def ==(other)
75
+ identity == other.identity
76
+ end
77
+
78
+ def eql?(other)
79
+ self == other
80
+ end
81
+
82
+ def hash
83
+ identity.hash
84
+ end
85
+
86
+ def to_a
87
+ @match.to_a
88
+ end
89
+
90
+ def to_h
91
+ { @regex => to_a, :offsets => @offsets }
92
+ end
93
+
94
+ def captures
95
+ @match.captures
96
+ end
97
+
98
+ def [](*args)
99
+ @match.method(:[]).call(*args)
100
+ end
101
+ end
102
+ end
@@ -0,0 +1,323 @@
1
+ require "stringio"
2
+ require_relative "./kleene"
3
+
4
+ module Kleene
5
+ class MachineTuple
6
+ attr_accessor :nfa # : NFA
7
+ attr_accessor :nfa_with_dead_err # : NFA
8
+ attr_accessor :dfa # : DFA
9
+
10
+ def initialize(nfa, nfa_with_dead_err, dfa)
11
+ @nfa, @nfa_with_dead_err, @dfa = nfa, nfa_with_dead_err, dfa
12
+ end
13
+ end
14
+
15
+ class OnlineDFA
16
+ include DSL
17
+
18
+ # @original_nfas : Array(NFA)
19
+ attr_reader :nfas_with_err_state # : Array(NFA)
20
+ attr_accessor :dead_end_nfa_state_to_dead_end_nfa # : Hash(State, NFA)
21
+ attr_accessor :composite_nfa # : NFA
22
+ attr_accessor :composite_dfa # : DFA
23
+
24
+ attr_accessor :machines_by_index # : Hash(Int32, MachineTuple)
25
+ attr_accessor :nfa_to_index # : Hash(NFA, Int32)
26
+ attr_accessor :nfa_with_dead_err_to_index # : Hash(NFA, Int32)
27
+ attr_accessor :dfa_to_index # : Hash(DFA, Int32)
28
+
29
+ def initialize(nfas)
30
+ composite_alphabet = nfas.reduce(Set.new) {|memo, nfa| memo | nfa.alphabet }
31
+
32
+ @original_nfas = nfas
33
+ @nfas_with_err_state = nfas.map {|nfa| with_err_dead_end(nfa, composite_alphabet) } # copy NFAs and add dead-end error states to each of them
34
+ dfas = @original_nfas.map(&:to_dfa)
35
+
36
+ @nfa_to_index = @original_nfas.map.with_index {|nfa, index| [nfa, index] }.to_h
37
+ @nfa_with_dead_err_to_index = @nfas_with_err_state.map.with_index {|nfa, index| [nfa, index] }.to_h
38
+ @dfa_to_index = dfas.map.with_index {|dfa, index| [dfa, index] }.to_h
39
+ @machines_by_index = @original_nfas.zip(nfas_with_err_state, dfas).map.with_index {|tuple, index| nfa, nfa_with_dead_err, dfa = tuple; [index, MachineTuple.new(nfa, nfa_with_dead_err, dfa)] }.to_h
40
+
41
+ # build a mapping of (state -> nfa) pairs that capture which nfa owns each state
42
+ @dead_end_nfa_state_to_dead_end_nfa = Hash.new
43
+ @nfas_with_err_state.each do |nfa_with_dead_err|
44
+ nfa_with_dead_err.states.each do |state|
45
+ @dead_end_nfa_state_to_dead_end_nfa[state] = nfa_with_dead_err
46
+ end
47
+ end
48
+
49
+ # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
50
+ @composite_nfa = create_composite_nfa(@nfas_with_err_state)
51
+ @composite_dfa = @composite_nfa.to_dfa
52
+
53
+ reset
54
+ end
55
+
56
+ def machines_from_nfa(nfa) # : MachineTuple
57
+ machines_by_index[nfa_to_index[nfa]]
58
+ end
59
+
60
+ def machines_from_nfa_with_dead_err(nfa_with_dead_err) # : MachineTuple
61
+ machines_by_index[nfa_with_dead_err_to_index[nfa_with_dead_err]]
62
+ end
63
+
64
+ def machines_from_dfa(dfa) # : MachineTuple
65
+ machines_by_index[dfa_to_index[dfa]]
66
+ end
67
+
68
+ # create a composite NFA as the union of all the NFAs with epsilon transitions from every NFA state back to the union NFA's start state
69
+ def create_composite_nfa(nfas)
70
+ nfa = union!(nfas)
71
+
72
+ # add epsilon transitions from all the states except the start state back to the start state
73
+ nfa.states.each do |state|
74
+ if state != nfa.start_state
75
+ nfa.add_transition(NFATransition::Epsilon, state, nfa.start_state)
76
+ end
77
+ end
78
+
79
+ nfa.update_final_states
80
+
81
+ nfa
82
+ end
83
+
84
+ def reset # : OnlineMatchTracker
85
+ @active_composite_dfa = @composite_dfa.deep_clone
86
+ @active_candidate_dfas = []
87
+ @match_tracker = setup_callbacks(@active_composite_dfa)
88
+ @buffer = ""
89
+ end
90
+
91
+ # #ingest(input) is the online-style matching interface
92
+ def ingest(input, debug = false) # : Hash(NFA, Array(MatchRef))
93
+ mt = @match_tracker
94
+
95
+ start_index_of_input_fragment_in_buffer = @buffer.length
96
+
97
+ input.each_char.with_index do |char, index|
98
+ @active_composite_dfa.handle_token!(char, start_index_of_input_fragment_in_buffer + index)
99
+ end
100
+
101
+ @buffer << input
102
+
103
+ start_index_to_nfas_that_may_match = mt.invert_candidate_match_start_positions
104
+
105
+ mt.empty_matches.each do |nfa_with_dead_err, indices|
106
+ original_nfa = machines_from_nfa_with_dead_err(nfa_with_dead_err).nfa
107
+ indices.select {|index| index >= start_index_of_input_fragment_in_buffer }.each do |index|
108
+ mt.add_match(original_nfa, MatchRef.new(@buffer, index...index))
109
+ end
110
+ end
111
+
112
+ input.each_char.with_index do |char, index|
113
+ index_in_buffer = start_index_of_input_fragment_in_buffer + index
114
+
115
+ @active_candidate_dfas.reject! do |active_dfa_tuple|
116
+ dfa_clone, original_nfa, start_of_match_index = active_dfa_tuple
117
+
118
+ dfa_clone.handle_token!(char, index_in_buffer)
119
+ mt.add_match(original_nfa, MatchRef.new(@buffer, start_of_match_index..index_in_buffer)) if dfa_clone.accept?
120
+
121
+ dfa_clone.error?
122
+ end
123
+
124
+ if nfas_with_dead_err = start_index_to_nfas_that_may_match[index_in_buffer]
125
+ nfas_with_dead_err.each do |nfa_with_dead_err|
126
+ machines = machines_from_nfa_with_dead_err(nfa_with_dead_err)
127
+ original_nfa = machines.nfa
128
+ dfa = machines.dfa
129
+ dfa_clone = dfa.shallow_clone
130
+
131
+ dfa_clone.handle_token!(char, index_in_buffer)
132
+ mt.add_match(original_nfa, MatchRef.new(@buffer, index_in_buffer..index_in_buffer)) if dfa_clone.accept?
133
+
134
+ @active_candidate_dfas << [dfa_clone, original_nfa, index_in_buffer] unless dfa_clone.error?
135
+ end
136
+ end
137
+ end
138
+
139
+ matches
140
+ end
141
+
142
+ def matches
143
+ @match_tracker.matches
144
+ end
145
+
146
+ def setup_callbacks(dfa)
147
+ match_tracker = OnlineMatchTracker.new
148
+
149
+ # 1. identify DFA states that correspond to successful match of first character of the NFAs
150
+ epsilon_closure_of_nfa_start_state = composite_nfa.epsilon_closure(composite_nfa.start_state)
151
+ nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = composite_nfa.transitions_from(epsilon_closure_of_nfa_start_state).
152
+ reject {|transition| transition.epsilon? || transition.to.error? }.
153
+ map(&:to).to_set
154
+ dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa = nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.
155
+ compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
156
+ reduce(Set.new) {|memo, state_set| memo | state_set }
157
+ dfa_state_to_dead_end_nfas_that_have_matched_their_first_character = Hash.new
158
+ dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.each do |dfa_state|
159
+ dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
160
+ select {|nfa_state| nfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(nfa_state) }.
161
+ compact_map do |nfa_state|
162
+ dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
163
+ end.to_set
164
+ end
165
+
166
+ # 2. identify DFA states that correspond to final states in the NFAs
167
+ nfa_final_states = @nfas_with_err_state.map(&:final_states).reduce(Set.new) {|memo, state_set| memo | state_set }
168
+ dfa_states_that_correspond_to_nfa_final_states = nfa_final_states.compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
169
+ reduce(Set.new) {|memo, state_set| memo | state_set }
170
+ dead_end_nfas_that_have_transitioned_to_final_state = Hash.new
171
+ dfa_states_that_correspond_to_nfa_final_states.each do |dfa_state|
172
+ dead_end_nfas_that_have_transitioned_to_final_state[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
173
+ select {|nfa_state| nfa_final_states.includes?(nfa_state) }.
174
+ compact_map do |nfa_state|
175
+ dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
176
+ end.to_set
177
+ end
178
+
179
+ # 3. Identify DFA states that correspond to successful match without even having seen any characters.
180
+ # These are cases where the NFA's start state is a final state or can reach a final state by following only epsilon transitions.
181
+ nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state = epsilon_closure_of_nfa_start_state.select(&:final?).to_set
182
+ dfa_states_that_represent_both_start_states_and_final_states = nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.
183
+ compact_map {|nfa_state| dfa.nfa_state_to_dfa_state_sets[nfa_state] }.
184
+ reduce(Set.new) {|memo, state_set| memo | state_set }
185
+ dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters = Hash.new
186
+ dfa_states_that_represent_both_start_states_and_final_states.each do |dfa_state|
187
+ dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[dfa_state] = dfa.dfa_state_to_nfa_state_sets[dfa_state].
188
+ select {|nfa_state| nfa_final_states_that_are_epsilon_reachable_from_nfa_start_state.includes?(nfa_state) }.
189
+ compact_map do |nfa_state|
190
+ dead_end_nfa_state_to_dead_end_nfa[nfa_state] unless nfa_state == composite_nfa.start_state # composite_nfa.start_state is not referenced in the dead_end_nfa_state_to_dead_end_nfa map
191
+ end.to_set
192
+ end
193
+
194
+ # set up transition callbacks, since the callbacks may only be defined once per state and transition
195
+ # For (1):
196
+ # Set up transition callbacks to push the index position of the start of a match of each NFA that has begun
197
+ # to be matched on the transition to one of the states in (1)
198
+ # For (2):
199
+ # set up transition callbacks to push the index position of the end of a successful match onto the list
200
+ # of successful matches for the NFA that matched
201
+ # For (3):
202
+ # set up transition callbacks to capture successful empty matches
203
+ destination_dfa_states_for_callbacks = dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa | dfa_states_that_correspond_to_nfa_final_states
204
+ destination_dfa_states_for_callbacks.each do |dfa_state|
205
+ dfa.on_transition_to(dfa_state) do |transition, token, token_index|
206
+ destination_dfa_state = transition.to
207
+
208
+ should_track_empty_match = dfa_states_that_represent_both_start_states_and_final_states.includes?(destination_dfa_state)
209
+ should_track_start_of_candidate_match = should_track_empty_match || dfa_states_that_correspond_to_successful_match_of_first_character_of_component_nfa.includes?(destination_dfa_state)
210
+ should_track_end_of_match = dfa_states_that_correspond_to_nfa_final_states.includes?(destination_dfa_state)
211
+
212
+ if should_track_empty_match
213
+ dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state].each do |nfa_with_dead_end|
214
+ match_tracker.add_empty_match(nfa_with_dead_end, token_index)
215
+ end
216
+ end
217
+
218
+ if should_track_start_of_candidate_match
219
+ nfas_that_matched_first_character = dfa_state_to_dead_end_nfas_that_have_matched_their_first_character[destination_dfa_state] || Set.new
220
+ nfas_that_matched_empty_match = dfa_state_to_dead_end_nfas_that_have_matched_before_handling_any_characters[destination_dfa_state] || Set.new
221
+ dead_end_nfas_that_are_starting_to_match = nfas_that_matched_first_character | nfas_that_matched_empty_match
222
+ dead_end_nfas_that_are_starting_to_match.each do |nfa_with_dead_end|
223
+ match_tracker.add_start_of_candidate_match(nfa_with_dead_end, token_index)
224
+ end
225
+ end
226
+
227
+ if should_track_end_of_match
228
+ dead_end_nfas_that_have_transitioned_to_final_state[destination_dfa_state].each do |nfa_with_dead_end|
229
+ match_tracker.add_end_of_match(nfa_with_dead_end, token_index)
230
+ end
231
+ end
232
+ end
233
+ end
234
+
235
+ match_tracker
236
+ end
237
+ end
238
+
239
+ class OnlineMatchTracker
240
+ # The NFA keys in the following structures are not the original NFAs supplied to the OnlineDFA.
241
+ # They are the original NFAs that have been augmented with a dead end error state, so the keys are objects that
242
+ # are the internal state of an OnlineDFA
243
+ attr_accessor :candidate_match_start_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfStartOfMatch)
244
+ # The end positions are indices at which, after handling the character, the DFA was observed to be in a match/accept state;
245
+ # however, the interpretation is ambiguous, because the accepting state may be as a result of (1) transitioning to an error state that is also marked final/accepting,
246
+ # OR it may be as a result of transitioning to (2) a non-error final state.
247
+ # In the case of (1), the match may be an empty match, where after transitioning to an error state, the DFA is in a state that
248
+ # is equivalent to the error state and start state and final state (e.g. as in an optional or kleene star DFA),
249
+ # while in the case of (2), the match may be a "normal" match.
250
+ # The ambiguity is problematic because it isn't clear whether the index position of the match is the inclusive end of a match
251
+ # or the beginning of an empty match.
252
+ # This ambiguity is all due to the construction of the composite DFA in the MultiMatchDFA - the dead end error states are epsilon-transitioned
253
+ # to the composite DFA's start state.
254
+ attr_accessor :match_end_positions # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEndOfMatch)
255
+ attr_accessor :empty_matches # : Hash(NFA, Array(Int32)) # NFA -> Array(IndexPositionOfEmptyMatch)
256
+
257
+ # The NFA keys in the following structure are the original NFAs supplied to the OnlineDFA.
258
+ # This is in contrast to the augmented NFAs that are used as keys in the candidate_match_start_positions and
259
+ # match_end_positions structures, documented above ^^^.
260
+ attr_accessor :matches # : Hash(NFA, Array(MatchRef)) # NFA -> Array(MatchRef)
261
+
262
+ def initialize
263
+ reset
264
+ end
265
+
266
+ def reset
267
+ @candidate_match_start_positions = Hash.new
268
+ @match_end_positions = Hash.new
269
+ @empty_matches = Hash.new
270
+ @matches = Hash.new
271
+ end
272
+
273
+ def start_positions(nfa)
274
+ candidate_match_start_positions[nfa] ||= Array.new
275
+ end
276
+
277
+ def end_positions(nfa)
278
+ match_end_positions[nfa] ||= Array.new
279
+ end
280
+
281
+ def empty_match_positions(nfa)
282
+ empty_matches[nfa] ||= Array.new
283
+ end
284
+
285
+ def matches_for(nfa)
286
+ matches[nfa] ||= Array.new
287
+ end
288
+
289
+ def add_start_of_candidate_match(nfa_with_dead_end, token_index)
290
+ # puts "add_start_of_candidate_match(#{nfa.object_id}, #{token_index})"
291
+ positions = start_positions(nfa_with_dead_end)
292
+ positions << token_index
293
+ end
294
+
295
+ # the end positions are inclusive of the index of the last character matched, so empty matches are not accounted for in the match_end_positions array
296
+ def add_end_of_match(nfa_with_dead_end, token_index)
297
+ # puts "add_end_of_match(#{nfa.object_id}, #{token_index})"
298
+ positions = end_positions(nfa_with_dead_end)
299
+ positions << token_index
300
+ end
301
+
302
+ def add_empty_match(nfa_with_dead_end, token_index)
303
+ positions = empty_match_positions(nfa_with_dead_end)
304
+ positions << token_index
305
+ end
306
+
307
+ def invert_candidate_match_start_positions # : Hash(Int32, Array(NFA))
308
+ index_to_nfas = Hash.new
309
+ candidate_match_start_positions.each do |nfa_with_dead_end, indices|
310
+ indices.each do |index|
311
+ nfas = index_to_nfas[index] ||= Array.new
312
+ nfas << nfa_with_dead_end
313
+ end
314
+ end
315
+ index_to_nfas
316
+ end
317
+
318
+ def add_match(nfa, match)
319
+ matches = matches_for(nfa)
320
+ matches << match
321
+ end
322
+ end
323
+ end
@@ -0,0 +1,9 @@
1
+
2
+ module Kleene
3
+ class Parser
4
+ def parse(pattern)
5
+ ast = Regexp::Parser.parse(pattern)
6
+ ast
7
+ end
8
+ end
9
+ end
@@ -12,12 +12,16 @@ module Enumerable
12
12
  ary = []
13
13
  each do |e|
14
14
  v = block.call(e)
15
- unless v.nil?
16
- ary << v
17
- end
15
+ ary << v unless v.nil?
18
16
  end
19
17
  ary
20
18
  end
21
19
 
22
- alias_method :includes?, :include?
20
+ alias includes? include?
21
+ end
22
+
23
+ class String
24
+ def scan_matches(pattern) # : Array(MatchData)
25
+ to_enum(:scan, pattern).map { Regexp.last_match }
26
+ end
23
27
  end
@@ -1,3 +1,3 @@
1
1
  module Kleene
2
- VERSION = "0.6.0"
2
+ VERSION = "0.8.0"
3
3
  end
data/lib/kleene.rb CHANGED
@@ -1,15 +1,18 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "active_support"
4
- require "active_support/core_ext"
5
- require_relative "kleene/version"
6
- require_relative "kleene/patches"
7
- require_relative "kleene/kleene"
8
- require_relative "kleene/dsl"
9
- require_relative "kleene/nfa"
10
- require_relative "kleene/dfa"
11
- require_relative "kleene/multi_match_dfa"
12
-
3
+ require 'active_support'
4
+ require 'active_support/core_ext'
5
+ require 'regexp_parser'
6
+ require_relative 'kleene/version'
7
+ require_relative 'kleene/patches'
8
+ require_relative 'kleene/kleene'
9
+ require_relative 'kleene/dsl'
10
+ require_relative 'kleene/nfa'
11
+ require_relative 'kleene/dfa'
12
+ require_relative 'kleene/multi_match_dfa'
13
+ require_relative 'kleene/online_dfa'
14
+ require_relative 'kleene/naive_online_regex'
15
+ require_relative 'kleene/parser'
13
16
 
14
17
  module Kleene
15
18
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kleene
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Ellis
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-11-04 00:00:00.000000000 Z
11
+ date: 2023-11-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '7.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: regexp_parser
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '2.8'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '2.8'
27
41
  description: kleene is a library for building regular expression recognition automata
28
42
  - nfas, dfas, and some specialty structures.
29
43
  email:
@@ -33,6 +47,7 @@ extensions: []
33
47
  extra_rdoc_files: []
34
48
  files:
35
49
  - ".rspec"
50
+ - ".rubocop.yml"
36
51
  - Gemfile
37
52
  - Gemfile.lock
38
53
  - LICENSE
@@ -45,7 +60,10 @@ files:
45
60
  - lib/kleene/dsl.rb
46
61
  - lib/kleene/kleene.rb
47
62
  - lib/kleene/multi_match_dfa.rb
63
+ - lib/kleene/naive_online_regex.rb
48
64
  - lib/kleene/nfa.rb
65
+ - lib/kleene/online_dfa.rb
66
+ - lib/kleene/parser.rb
49
67
  - lib/kleene/patches.rb
50
68
  - lib/kleene/version.rb
51
69
  homepage: https://github.com/davidkellis/kleene-rb