kleene 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/kleene/nfa.rb ADDED
@@ -0,0 +1,304 @@
1
+ module Kleene
2
+ class NFATransition
3
+ Epsilon = "\u0000" # todo/hack: we use the null character as a sentinel character indicating epsilon transition
4
+
5
+ attr_accessor :token # : Char
6
+ attr_accessor :from # : State
7
+ attr_accessor :to # : State
8
+
9
+ def initialize(token, from_state, to_state)
10
+ @token = token
11
+ @from = from_state
12
+ @to = to_state
13
+ end
14
+
15
+ def accept?(input)
16
+ @token == input
17
+ end
18
+
19
+ def epsilon?
20
+ token == Epsilon
21
+ end
22
+ end
23
+
24
+ class NFA
25
+ attr_accessor :alphabet # : Set(Char)
26
+ attr_accessor :states # : Set(State)
27
+ attr_accessor :start_state # : State
28
+ attr_accessor :transitions # : Hash(State, Hash(Char, Set(NFATransition)))
29
+ attr_accessor :current_states # : Set(State)
30
+ attr_accessor :final_states # : Set(State)
31
+ # @regex_pattern
32
+
33
+ def initialize(start_state, alphabet = DEFAULT_ALPHABET, transitions = Hash.new, initial_states = nil)
34
+ @start_state = start_state
35
+ @transitions = transitions
36
+
37
+ @alphabet = alphabet + all_transitions.map(&:token)
38
+
39
+ @states = initial_states || reachable_states(start_state)
40
+ @current_states = Set.new
41
+ @final_states = Set.new
42
+
43
+ update_final_states
44
+ reset_current_states
45
+ end
46
+
47
+ def all_transitions() # : Array(NFATransition)
48
+ transitions.flat_map {|state, char_transition_map| char_transition_map.values.flat_map(&:to_a) }
49
+ end
50
+
51
+ # def transitions_from(state) # : Set(NFATransition)
52
+ # @transitions[state]&.values&.reduce {|memo, set_of_transisions| memo | set_of_transisions} || Set.new
53
+ # end
54
+ def transitions_from(state_set) # : Set(NFATransition)
55
+ case state_set
56
+ when State
57
+ @transitions[state_set]&.values&.reduce {|memo, set_of_transisions| memo | set_of_transisions} || Set.new
58
+ when Set
59
+ state_set.map {|state| transitions_from(state) }.reduce {|memo, state_set| memo | state_set }
60
+ else
61
+ raise "boom"
62
+ end
63
+
64
+ end
65
+
66
+ def deep_clone
67
+ old_states = @states.to_a
68
+ new_states = old_states.map(&:dup)
69
+ state_mapping = old_states.zip(new_states).to_h
70
+ new_transitions = transitions.map {|state, char_transition_map|
71
+ [
72
+ state_mapping[state],
73
+ char_transition_map.map {|char, set_of_transisions|
74
+ [
75
+ char,
76
+ set_of_transisions.map {|transition| NFATransition.new(transition.token, state_mapping[transition.from], state_mapping[transition.to])}.to_set
77
+ ]
78
+ }.to_h
79
+ ]
80
+ }.to_h
81
+
82
+ NFA.new(state_mapping[@start_state], @alphabet.clone, new_transitions, new_states.to_set).set_regex_pattern(regex_pattern)
83
+ end
84
+
85
+ def update_final_states
86
+ @final_states = @states.select { |s| s.final? }.to_set
87
+ end
88
+
89
+ def reset_current_states
90
+ @current_states = epsilon_closure(@start_state)
91
+ end
92
+
93
+ def error_states
94
+ @states.select(&:error?).to_set
95
+ end
96
+
97
+ def add_state(new_state)
98
+ @states << new_state
99
+ end
100
+
101
+ def add_states(states)
102
+ @states.merge(states)
103
+ end
104
+
105
+ def remove_state(state)
106
+ raise "Unable to remove state from NFA: at least one transition leads to or from the state." if all_transitions.any? {|transition| transition.from == state || transition.to == state }
107
+ @states.delete(state)
108
+ end
109
+
110
+ def add_transition(token, from_state, to_state)
111
+ # # make sure states EITHER have a single outbound epsilon transition OR non-epsilon outbound transitions; they can't have both
112
+ # if token == NFATransition::Epsilon
113
+ # # make sure from_state doesn't have any outbound non-epsilon transitions
114
+ # raise "Error: Non-epsilon transitions are already present on #{from_state.to_s}! States may EITHER have a single outbound epsilon transition OR have outbound non-epsilon transitions, but not both." if transitions_from(from_state).any? {|t| !t.epsilon? }
115
+ # else
116
+ # # make sure from_state doesn't have any outbound epsilon transition
117
+ # raise "Error: Epsilon transitions are already present on #{from_state.to_s}! States may EITHER have a single outbound epsilon transition OR have outbound non-epsilon transitions, but not both." if transitions_from(from_state).any?(&:epsilon?)
118
+ # end
119
+
120
+ @alphabet << token # alphabet is a set, so there will be no duplications
121
+ @states << from_state
122
+ @states << to_state
123
+ new_transition = NFATransition.new(token, from_state, to_state)
124
+
125
+ char_transition_map = @transitions[from_state] ||= Hash.new
126
+ set_of_transisions = char_transition_map[token] ||= Set.new
127
+ set_of_transisions << new_transition
128
+
129
+ new_transition
130
+ end
131
+
132
+ # Returns an array of matches found in the input string, each of which begins at the offset input_start_offset
133
+ def matches_at_offset(input, input_start_offset)
134
+ reset_current_states
135
+
136
+ matches = []
137
+ (input_start_offset...input.size).each do |offset|
138
+ token = input[offset]
139
+ handle_token!(token)
140
+ if accept?
141
+ matches << MatchRef.new(input, input_start_offset..offset)
142
+ end
143
+ end
144
+ matches
145
+ end
146
+
147
+ # Returns an array of matches found anywhere in the input string
148
+ def matches(input)
149
+ (0...input.size).reduce([]) do |memo, offset|
150
+ memo + matches_at_offset(input, offset)
151
+ end
152
+ end
153
+
154
+ def match?(input) # : MatchRef?
155
+ # puts "match?(\"#{input}\")"
156
+ # puts self.to_s
157
+ reset_current_states
158
+
159
+ # puts @current_states.map(&:id)
160
+ input.each_char.with_index do |char, index|
161
+ # puts char
162
+ handle_token!(char)
163
+ # puts @current_states.map(&:id)
164
+ end
165
+
166
+ if accept?
167
+ MatchRef.new(input, 0...input.size)
168
+ end
169
+ end
170
+
171
+ # process another input token
172
+ def handle_token!(input_token)
173
+ @current_states = next_states(@current_states, input_token)
174
+ end
175
+
176
+ def accept?
177
+ @current_states.any?(&:final?)
178
+ end
179
+
180
+ def next_states(state_set, input_token)
181
+ # Retrieve a list of states in the epsilon closure of the given state set
182
+ epsilon_reachable_states = epsilon_closure(state_set)
183
+ # puts "epsilon_reachable_states = #{epsilon_reachable_states.map(&:id)}"
184
+
185
+ # Build an array of outbound transitions from each state in the epsilon-closure
186
+ # Filter the outbound transitions, selecting only those that accept the input we are given.
187
+ outbound_transitions = epsilon_reachable_states.compact_map {|state| @transitions.dig(state, input_token) }.flat_map(&:to_a)
188
+ # puts "outbound_transitions = #{outbound_transitions.inspect}"
189
+
190
+ # Build an array of epsilon-closures of each transition's destination state.
191
+ destination_state_epsilon_closures = outbound_transitions.map {|transition| epsilon_closure(transition.to) }
192
+
193
+ # Union each of the epsilon-closures (each is a set) together to form a flat array of states in the epsilon-closure of all of our current states.
194
+ next_states = destination_state_epsilon_closures.reduce {|combined_state_set, individual_state_set| combined_state_set.merge(individual_state_set) }
195
+
196
+ next_states || Set.new
197
+ end
198
+
199
+ # Determine the epsilon closure of the given state set
200
+ # That is, determine what states are reachable on an epsilon transition from the current state set (@current_states).
201
+ # Returns a Set of State objects.
202
+ def epsilon_closure(state_set) # : Set(State)
203
+ state_set = state_set.is_a?(State) ? Set[state_set] : state_set
204
+ visited_states = Set.new()
205
+ unvisited_states = state_set
206
+ while !unvisited_states.empty?
207
+ epsilon_transitions = unvisited_states.compact_map {|state| @transitions.dig(state, NFATransition::Epsilon) }.flat_map(&:to_a)
208
+ destination_states = epsilon_transitions.map(&:to).to_set
209
+ visited_states.merge(unvisited_states) # add the unvisited states to the visited_states
210
+ unvisited_states = destination_states - visited_states
211
+ end
212
+ visited_states
213
+ end
214
+
215
+ # Returns a set of State objects which are reachable through any transition path from the NFA's start_state.
216
+ def reachable_states(start_state)
217
+ visited_states = Set.new()
218
+ unvisited_states = Set[start_state]
219
+ while !unvisited_states.empty?
220
+ outbound_transitions = unvisited_states.flat_map {|state| @transitions[state]&.values&.flat_map(&:to_a) || Array.new }
221
+ destination_states = outbound_transitions.map(&:to).to_set
222
+ visited_states.merge(unvisited_states) # add the unvisited states to the visited_states
223
+ unvisited_states = destination_states - visited_states
224
+ end
225
+ visited_states
226
+ end
227
+
228
+ # This implements the subset construction algorithm presented on page 118 of the first edition of the dragon book.
229
+ # I found a similar explanation at: http://web.cecs.pdx.edu/~harry/compilers/slides/LexicalPart3.pdf
230
+ def to_dfa
231
+ state_map = Hash.new # this map contains (nfa_state_set => dfa_state) pairs
232
+ dfa_transitions = Hash.new
233
+ dfa_alphabet = @alphabet - Set[NFATransition::Epsilon]
234
+ visited_state_sets = Set.new()
235
+ nfa_start_state_set = epsilon_closure(@start_state)
236
+ unvisited_state_sets = Set[nfa_start_state_set]
237
+
238
+ dfa_start_state = State.new(nfa_start_state_set.any?(&:final?), nfa_start_state_set.any?(&:error?))
239
+ state_map[nfa_start_state_set] = dfa_start_state
240
+ until unvisited_state_sets.empty?
241
+ # take one of the unvisited state sets
242
+ state_set = unvisited_state_sets.first
243
+
244
+ current_dfa_state = state_map[state_set]
245
+
246
+ # Figure out the set of next-states for each token in the alphabet
247
+ # Add each set of next-states to unvisited_state_sets
248
+ dfa_alphabet.each do |token|
249
+ next_nfa_state_set = next_states(state_set, token)
250
+ unvisited_state_sets << next_nfa_state_set
251
+
252
+ # this new DFA state, next_dfa_state, represents the next nfa state set, next_nfa_state_set
253
+ next_dfa_state = state_map[next_nfa_state_set] ||= State.new(next_nfa_state_set.any?(&:final?), next_nfa_state_set.any?(&:error?))
254
+
255
+ char_transition_map = dfa_transitions[current_dfa_state] ||= Hash.new
256
+ char_transition_map[token] = DFATransition.new(token, current_dfa_state, next_dfa_state)
257
+ end
258
+
259
+ visited_state_sets << state_set
260
+ unvisited_state_sets = unvisited_state_sets - visited_state_sets
261
+ end
262
+
263
+ # `state_map.invert` is sufficient to convert from a (nfa_state_set => dfa_state) mapping to a (dfa_state => nfa_state_set) mapping, because the mappings are strictly one-to-one.
264
+ DFA.new(state_map[nfa_start_state_set], dfa_alphabet, dfa_transitions, state_map.invert, origin_nfa: self).set_regex_pattern(regex_pattern)
265
+ end
266
+
267
+ def graphviz
268
+ retval = "digraph G { "
269
+ all_transitions.each do |t|
270
+ transition_label = t.epsilon? ? "ε" : t.token
271
+ retval += "#{t.from.id} -> #{t.to.id} [label=\"#{transition_label}\"];"
272
+ end
273
+ @final_states.each do |s|
274
+ retval += "#{s.id} [color=lightblue2, style=filled, shape=doublecircle];"
275
+ end
276
+ retval += " }"
277
+ retval
278
+ end
279
+
280
+ def to_s(verbose = false)
281
+ if verbose
282
+ retval = states.map(&:to_s).join("\n")
283
+ retval += "\n"
284
+ all_transitions.each do |t|
285
+ transition_label = t.epsilon? ? "epsilon" : t.token
286
+ retval += "#{t.from.id} -> #{transition_label} -> #{t.to.id}\n"
287
+ end
288
+ retval
289
+ else
290
+ regex_pattern
291
+ end
292
+ end
293
+
294
+ def set_regex_pattern(pattern)
295
+ @regex_pattern = pattern
296
+ self
297
+ end
298
+
299
+ def regex_pattern
300
+ @regex_pattern || "<<empty>>"
301
+ end
302
+ end
303
+
304
+ end
@@ -0,0 +1,23 @@
1
+ module Enumerable
2
+ # calls the block with successive elements; returns the first truthy object returned by the block
3
+ def find_map(&block)
4
+ each do |element|
5
+ mapped_value = block.call(element)
6
+ return mapped_value if mapped_value
7
+ end
8
+ nil
9
+ end
10
+
11
+ def compact_map(&block)
12
+ ary = []
13
+ each do |e|
14
+ v = block.call(e)
15
+ unless v.nil?
16
+ ary << v
17
+ end
18
+ end
19
+ ary
20
+ end
21
+
22
+ alias_method :includes?, :include?
23
+ end
@@ -0,0 +1,3 @@
1
+ module Kleene
2
+ VERSION = "0.4.0"
3
+ end
data/lib/kleene.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support"
4
+ require "active_support/core_ext"
5
+ require_relative "kleene/version"
6
+ require_relative "kleene/patches"
7
+ require_relative "kleene/kleene"
8
+ require_relative "kleene/dsl"
9
+ require_relative "kleene/nfa"
10
+ require_relative "kleene/dfa"
11
+ require_relative "kleene/multi_match_dfa"
12
+
13
+
14
+ module Kleene
15
+ class Error < StandardError; end
16
+ # Your code goes here...
17
+ end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kleene
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - David Ellis
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-11-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '7.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '7.1'
27
+ description: kleene is a library for building regular expression recognition automata
28
+ - nfas, dfas, and some specialty structures.
29
+ email:
30
+ - david@conquerthelawn.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - ".rspec"
36
+ - Gemfile
37
+ - Gemfile.lock
38
+ - LICENSE
39
+ - README.md
40
+ - Rakefile
41
+ - build.ops
42
+ - kleene.gemspec
43
+ - lib/kleene.rb
44
+ - lib/kleene/dfa.rb
45
+ - lib/kleene/dsl.rb
46
+ - lib/kleene/kleene.rb
47
+ - lib/kleene/multi_match_dfa.rb
48
+ - lib/kleene/nfa.rb
49
+ - lib/kleene/patches.rb
50
+ - lib/kleene/version.rb
51
+ homepage: https://github.com/davidkellis/kleene-rb
52
+ licenses:
53
+ - MIT
54
+ metadata:
55
+ homepage_uri: https://github.com/davidkellis/kleene-rb
56
+ source_code_uri: https://github.com/davidkellis/kleene-rb
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: 3.0.0
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ requirements: []
72
+ rubygems_version: 3.4.10
73
+ signing_key:
74
+ specification_version: 4
75
+ summary: kleene is a library for building regular expression recognition automata
76
+ test_files: []