kleene 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/kleene/nfa.rb ADDED
@@ -0,0 +1,304 @@
1
+ module Kleene
2
# A single labelled edge of an NFA: consuming +token+ moves the automaton
# from the +from+ state to the +to+ state.
class NFATransition
  # TODO/hack: the null character is used as a sentinel token marking an
  # epsilon transition. Frozen so the shared sentinel can never be mutated.
  Epsilon = "\u0000".freeze

  attr_accessor :token    # : Char
  attr_accessor :from     # : State
  attr_accessor :to       # : State

  # token      - the character this transition consumes (or Epsilon)
  # from_state - the state the transition leaves
  # to_state   - the state the transition enters
  def initialize(token, from_state, to_state)
    @token = token
    @from = from_state
    @to = to_state
  end

  # Returns true when +input+ equals this transition's token.
  def accept?(input)
    @token == input
  end

  # Returns true when this transition is labelled with the Epsilon sentinel.
  def epsilon?
    token == Epsilon
  end
end
23
+
24
# A nondeterministic finite automaton with epsilon transitions.
#
# Transitions are stored as a nested hash:
#   from_state => { token => Set(NFATransition) }
# Matching is simulated by tracking the set of states the automaton may
# currently be in (+current_states+).
class NFA
  attr_accessor :alphabet         # : Set(Char)
  attr_accessor :states           # : Set(State)
  attr_accessor :start_state      # : State
  attr_accessor :transitions      # : Hash(State, Hash(Char, Set(NFATransition)))
  attr_accessor :current_states   # : Set(State)
  attr_accessor :final_states     # : Set(State)
  # @regex_pattern

  # start_state    - the state simulation starts from
  # alphabet       - tokens known up front; tokens of the given transitions are added to it
  # transitions    - nested transition hash (see class comment)
  # initial_states - explicit state set; when nil, the states reachable from
  #                  start_state are used instead
  def initialize(start_state, alphabet = DEFAULT_ALPHABET, transitions = Hash.new, initial_states = nil)
    @start_state = start_state
    @transitions = transitions

    # must come after @transitions is assigned: all_transitions reads it
    @alphabet = alphabet + all_transitions.map(&:token)

    @states = initial_states || reachable_states(start_state)

    update_final_states
    reset_current_states
  end

  # Returns every transition in the automaton as a flat array.
  def all_transitions # : Array(NFATransition)
    transitions.flat_map { |_state, char_transition_map| char_transition_map.values.flat_map(&:to_a) }
  end

  # Returns the set of transitions leaving a State, or leaving any State in a
  # Set of States. Always returns a fresh Set (empty when nothing leaves),
  # never nil and never one of the internal transition sets.
  #
  # Raises RuntimeError when given anything other than a State or a Set.
  def transitions_from(state_set) # : Set(NFATransition)
    case state_set
    when State
      outbound_sets = @transitions[state_set]&.values || []
      outbound_sets.reduce(Set.new) { |memo, set_of_transitions| memo | set_of_transitions }
    when Set
      state_set.reduce(Set.new) { |memo, state| memo | transitions_from(state) }
    else
      raise "transitions_from expects a State or a Set of States, got #{state_set.class}"
    end
  end

  # Builds an independent copy of this NFA: every state is dup'ed and every
  # transition is rebuilt against the copied states.
  def deep_clone
    old_states = @states.to_a
    new_states = old_states.map(&:dup)
    state_mapping = old_states.zip(new_states).to_h

    new_transitions = transitions.to_h do |state, char_transition_map|
      cloned_char_map = char_transition_map.to_h do |char, set_of_transitions|
        cloned_set = set_of_transitions.map do |transition|
          NFATransition.new(transition.token, state_mapping[transition.from], state_mapping[transition.to])
        end.to_set
        [char, cloned_set]
      end
      [state_mapping[state], cloned_char_map]
    end

    NFA.new(state_mapping[@start_state], @alphabet.clone, new_transitions, new_states.to_set).set_regex_pattern(regex_pattern)
  end

  # Recomputes @final_states from the current @states.
  def update_final_states
    @final_states = @states.select(&:final?).to_set
  end

  # Rewinds the simulation to the epsilon closure of the start state.
  def reset_current_states
    @current_states = epsilon_closure(@start_state)
  end

  # Returns the set of states that report error?.
  def error_states
    @states.select(&:error?).to_set
  end

  def add_state(new_state)
    @states << new_state
  end

  def add_states(states)
    @states.merge(states)
  end

  # Removes +state+ from the automaton.
  # Raises RuntimeError if any transition still leads to or from it.
  def remove_state(state)
    in_use = all_transitions.any? { |transition| transition.from == state || transition.to == state }
    raise "Unable to remove state from NFA: at least one transition leads to or from the state." if in_use

    @states.delete(state)
  end

  # Adds a transition on +token+ from +from_state+ to +to_state+, registering
  # the token in the alphabet and both states in the state set.
  # Returns the new NFATransition.
  #
  # Disabled check (kept for reference): states could be required to have
  # EITHER a single outbound epsilon transition OR only non-epsilon outbound
  # transitions, but not both. That invariant is currently not enforced.
  def add_transition(token, from_state, to_state)
    @alphabet << token # alphabet is a set, so there will be no duplications
    @states << from_state
    @states << to_state
    new_transition = NFATransition.new(token, from_state, to_state)

    char_transition_map = (@transitions[from_state] ||= {})
    set_of_transitions = (char_transition_map[token] ||= Set.new)
    set_of_transitions << new_transition

    new_transition
  end

  # Returns an array of matches found in the input string, each of which begins at the offset input_start_offset
  def matches_at_offset(input, input_start_offset)
    reset_current_states

    (input_start_offset...input.size).each_with_object([]) do |offset, found|
      handle_token!(input[offset])
      found << MatchRef.new(input, input_start_offset..offset) if accept?
    end
  end

  # Returns an array of matches found anywhere in the input string
  def matches(input)
    (0...input.size).flat_map { |offset| matches_at_offset(input, offset) }
  end

  # Runs the whole input through the automaton from the start.
  # Returns a MatchRef spanning the entire input when it is accepted, else nil.
  def match?(input) # : MatchRef?
    reset_current_states
    input.each_char { |char| handle_token!(char) }
    MatchRef.new(input, 0...input.size) if accept?
  end

  # process another input token
  def handle_token!(input_token)
    @current_states = next_states(@current_states, input_token)
  end

  # True when at least one of the current states is final.
  def accept?
    @current_states.any?(&:final?)
  end

  # Returns the set of states reachable from +state_set+ by consuming
  # +input_token+ (an empty Set when nothing accepts the token).
  def next_states(state_set, input_token)
    # states in the epsilon closure of the given state set
    epsilon_reachable_states = epsilon_closure(state_set)

    # outbound transitions from the closure that accept the given token
    outbound_transitions = epsilon_reachable_states
      .filter_map { |state| @transitions.dig(state, input_token) }
      .flat_map(&:to_a)

    # union of the epsilon closures of every destination state
    outbound_transitions.reduce(Set.new) do |combined_state_set, transition|
      combined_state_set.merge(epsilon_closure(transition.to))
    end
  end

  # Determine the epsilon closure of the given state set (or single State).
  # That is, the given states plus every state reachable from them through
  # epsilon transitions only.
  # Returns a Set of State objects.
  def epsilon_closure(state_set) # : Set(State)
    unvisited_states = state_set.is_a?(State) ? Set[state_set] : state_set
    visited_states = Set.new
    until unvisited_states.empty?
      destination_states = unvisited_states
        .filter_map { |state| @transitions.dig(state, NFATransition::Epsilon) }
        .flat_map(&:to_a)
        .map(&:to)
        .to_set
      visited_states.merge(unvisited_states) # add the unvisited states to the visited_states
      unvisited_states = destination_states - visited_states
    end
    visited_states
  end

  # Returns a set of State objects which are reachable through any transition path from the given start_state.
  def reachable_states(start_state)
    visited_states = Set.new
    unvisited_states = Set[start_state]
    until unvisited_states.empty?
      outbound_transitions = unvisited_states.flat_map do |state|
        @transitions[state]&.values&.flat_map(&:to_a) || []
      end
      destination_states = outbound_transitions.map(&:to).to_set
      visited_states.merge(unvisited_states) # add the unvisited states to the visited_states
      unvisited_states = destination_states - visited_states
    end
    visited_states
  end

  # This implements the subset construction algorithm presented on page 118 of the first edition of the dragon book.
  # I found a similar explanation at: http://web.cecs.pdx.edu/~harry/compilers/slides/LexicalPart3.pdf
  def to_dfa
    state_map = {} # this map contains (nfa_state_set => dfa_state) pairs
    dfa_transitions = {}
    dfa_alphabet = @alphabet - Set[NFATransition::Epsilon]
    visited_state_sets = Set.new
    nfa_start_state_set = epsilon_closure(@start_state)
    unvisited_state_sets = Set[nfa_start_state_set]

    state_map[nfa_start_state_set] = State.new(nfa_start_state_set.any?(&:final?), nfa_start_state_set.any?(&:error?))
    until unvisited_state_sets.empty?
      # take one of the unvisited state sets
      state_set = unvisited_state_sets.first
      current_dfa_state = state_map[state_set]

      # Figure out the set of next-states for each token in the alphabet
      # Add each set of next-states to unvisited_state_sets
      dfa_alphabet.each do |token|
        next_nfa_state_set = next_states(state_set, token)
        unvisited_state_sets << next_nfa_state_set

        # this DFA state represents the whole next NFA state set
        next_dfa_state = (state_map[next_nfa_state_set] ||=
          State.new(next_nfa_state_set.any?(&:final?), next_nfa_state_set.any?(&:error?)))

        char_transition_map = (dfa_transitions[current_dfa_state] ||= {})
        char_transition_map[token] = DFATransition.new(token, current_dfa_state, next_dfa_state)
      end

      visited_state_sets << state_set
      unvisited_state_sets -= visited_state_sets
    end

    # `state_map.invert` is sufficient to convert from a (nfa_state_set => dfa_state) mapping to a (dfa_state => nfa_state_set) mapping, because the mappings are strictly one-to-one.
    DFA.new(state_map[nfa_start_state_set], dfa_alphabet, dfa_transitions, state_map.invert, origin_nfa: self).set_regex_pattern(regex_pattern)
  end

  # Renders the automaton as a Graphviz "digraph" string; final states are
  # drawn as filled double circles.
  def graphviz
    edges = all_transitions.map do |t|
      transition_label = t.epsilon? ? "ε" : t.token
      "#{t.from.id} -> #{t.to.id} [label=\"#{transition_label}\"];"
    end
    final_nodes = @final_states.map do |s|
      "#{s.id} [color=lightblue2, style=filled, shape=doublecircle];"
    end
    "digraph G { #{edges.join}#{final_nodes.join} }"
  end

  # Non-verbose: returns the regex pattern string.
  # Verbose: returns one line per state followed by one line per transition.
  def to_s(verbose = false)
    return regex_pattern unless verbose

    transition_lines = all_transitions.map do |t|
      transition_label = t.epsilon? ? "epsilon" : t.token
      "#{t.from.id} -> #{transition_label} -> #{t.to.id}\n"
    end
    "#{states.map(&:to_s).join("\n")}\n#{transition_lines.join}"
  end

  # Stores the pattern this automaton was built from; returns self for chaining.
  def set_regex_pattern(pattern)
    @regex_pattern = pattern
    self
  end

  # Returns the stored pattern, or "<<empty>>" when none was set.
  def regex_pattern
    @regex_pattern || "<<empty>>"
  end
end
303
+
304
+ end
@@ -0,0 +1,23 @@
1
module Enumerable
  # Yields each element in turn and returns the first truthy value the block
  # produces; returns nil when the block never produces one.
  def find_map(&block)
    each do |item|
      outcome = block.call(item)
      next unless outcome

      return outcome
    end
    nil
  end

  # Like map, but leaves out block results that are nil (false is kept).
  def compact_map(&block)
    collected = []
    each do |item|
      mapped = block.call(item)
      collected << mapped unless mapped.nil?
    end
    collected
  end

  # Alternate spelling of include?.
  alias_method :includes?, :include?
end
@@ -0,0 +1,3 @@
1
module Kleene
  # Gem version string. Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.4.0".freeze
end
data/lib/kleene.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support"
4
+ require "active_support/core_ext"
5
+ require_relative "kleene/version"
6
+ require_relative "kleene/patches"
7
+ require_relative "kleene/kleene"
8
+ require_relative "kleene/dsl"
9
+ require_relative "kleene/nfa"
10
+ require_relative "kleene/dfa"
11
+ require_relative "kleene/multi_match_dfa"
12
+
13
+
14
module Kleene
  # Library-specific error class, a direct subclass of StandardError.
  class Error < StandardError
  end
end
metadata ADDED
@@ -0,0 +1,76 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: kleene
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.0
5
+ platform: ruby
6
+ authors:
7
+ - David Ellis
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-11-04 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '7.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '7.1'
27
+ description: kleene is a library for building regular expression recognition automata
28
+ - nfas, dfas, and some specialty structures.
29
+ email:
30
+ - david@conquerthelawn.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - ".rspec"
36
+ - Gemfile
37
+ - Gemfile.lock
38
+ - LICENSE
39
+ - README.md
40
+ - Rakefile
41
+ - build.ops
42
+ - kleene.gemspec
43
+ - lib/kleene.rb
44
+ - lib/kleene/dfa.rb
45
+ - lib/kleene/dsl.rb
46
+ - lib/kleene/kleene.rb
47
+ - lib/kleene/multi_match_dfa.rb
48
+ - lib/kleene/nfa.rb
49
+ - lib/kleene/patches.rb
50
+ - lib/kleene/version.rb
51
+ homepage: https://github.com/davidkellis/kleene-rb
52
+ licenses:
53
+ - MIT
54
+ metadata:
55
+ homepage_uri: https://github.com/davidkellis/kleene-rb
56
+ source_code_uri: https://github.com/davidkellis/kleene-rb
57
+ post_install_message:
58
+ rdoc_options: []
59
+ require_paths:
60
+ - lib
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
63
+ - - ">="
64
+ - !ruby/object:Gem::Version
65
+ version: 3.0.0
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
71
+ requirements: []
72
+ rubygems_version: 3.4.10
73
+ signing_key:
74
+ specification_version: 4
75
+ summary: kleene is a library for building regular expression recognition automata
76
+ test_files: []