kleene 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +117 -0
- data/LICENSE +21 -0
- data/README.md +21 -0
- data/Rakefile +8 -0
- data/build.ops +63 -0
- data/kleene.gemspec +39 -0
- data/lib/kleene/dfa.rb +258 -0
- data/lib/kleene/dsl.rb +263 -0
- data/lib/kleene/kleene.rb +88 -0
- data/lib/kleene/multi_match_dfa.rb +308 -0
- data/lib/kleene/nfa.rb +304 -0
- data/lib/kleene/patches.rb +23 -0
- data/lib/kleene/version.rb +3 -0
- data/lib/kleene.rb +17 -0
- metadata +76 -0
data/lib/kleene/nfa.rb
ADDED
@@ -0,0 +1,304 @@
|
|
1
|
+
module Kleene
  # A single transition of an NFA: consume `token` to move from state `from` to state `to`.
  class NFATransition
    # Sentinel token marking an epsilon (no-input) transition.
    # todo/hack: we use the null character as the sentinel, which means "\u0000"
    # can never be a real member of the recognized alphabet.
    Epsilon = "\u0000"

    attr_accessor :token # : Char
    attr_accessor :from  # : State
    attr_accessor :to    # : State

    def initialize(token, from_state, to_state)
      @token = token
      @from = from_state
      @to = to_state
    end

    # True when this transition consumes the given input token.
    def accept?(input)
      token == input
    end

    # True when this is an epsilon transition (consumes no input).
    def epsilon?
      token == Epsilon
    end
  end
end
|
23
|
+
|
24
|
+
# Nondeterministic finite automaton. Tracks a *set* of current states; epsilon
# transitions (NFATransition::Epsilon) are followed eagerly via epsilon_closure.
# Relies on State, MatchRef, DFA, DFATransition, DEFAULT_ALPHABET, and the
# Enumerable#compact_map patch, all defined elsewhere in this gem.
class NFA
  attr_accessor :alphabet # : Set(Char)
  attr_accessor :states # : Set(State)
  attr_accessor :start_state # : State
  attr_accessor :transitions # : Hash(State, Hash(Char, Set(NFATransition)))
  attr_accessor :current_states # : Set(State)
  attr_accessor :final_states # : Set(State)
  # @regex_pattern — optional human-readable pattern this NFA was built from (see set_regex_pattern)

  def initialize(start_state, alphabet = DEFAULT_ALPHABET, transitions = Hash.new, initial_states = nil)
    @start_state = start_state
    @transitions = transitions

    # Widen the alphabet with any token already present on the supplied transitions.
    @alphabet = alphabet + all_transitions.map(&:token)

    # When initial_states is not supplied, the state set is everything reachable from start_state.
    @states = initial_states || reachable_states(start_state)
    @current_states = Set.new
    @final_states = Set.new

    update_final_states
    reset_current_states
  end

  # Flat array of every NFATransition in the transition table.
  def all_transitions() # : Array(NFATransition)
    transitions.flat_map {|state, char_transition_map| char_transition_map.values.flat_map(&:to_a) }
  end

  # def transitions_from(state) # : Set(NFATransition)
  #   @transitions[state]&.values&.reduce {|memo, set_of_transisions| memo | set_of_transisions} || Set.new
  # end
  # All outbound transitions from a single State or from every state in a Set.
  def transitions_from(state_set) # : Set(NFATransition)
    case state_set
    when State
      @transitions[state_set]&.values&.reduce {|memo, set_of_transisions| memo | set_of_transisions} || Set.new
    when Set
      state_set.map {|state| transitions_from(state) }.reduce {|memo, state_set| memo | state_set }
    else
      raise "boom"
    end

  end

  # Structure-preserving copy: duplicates every state and rebuilds every
  # transition against the duplicated states, so mutating the clone cannot
  # affect the original automaton.
  def deep_clone
    old_states = @states.to_a
    new_states = old_states.map(&:dup)
    state_mapping = old_states.zip(new_states).to_h
    new_transitions = transitions.map {|state, char_transition_map|
      [
        state_mapping[state],
        char_transition_map.map {|char, set_of_transisions|
          [
            char,
            set_of_transisions.map {|transition| NFATransition.new(transition.token, state_mapping[transition.from], state_mapping[transition.to])}.to_set
          ]
        }.to_h
      ]
    }.to_h

    NFA.new(state_mapping[@start_state], @alphabet.clone, new_transitions, new_states.to_set).set_regex_pattern(regex_pattern)
  end

  # Recompute @final_states from the final? flag on each state.
  def update_final_states
    @final_states = @states.select { |s| s.final? }.to_set
  end

  # Rewind the machine: the current state set becomes the epsilon-closure of the start state.
  def reset_current_states
    @current_states = epsilon_closure(@start_state)
  end

  def error_states
    @states.select(&:error?).to_set
  end

  def add_state(new_state)
    @states << new_state
  end

  def add_states(states)
    @states.merge(states)
  end

  # Remove a state only if nothing references it; raises otherwise.
  def remove_state(state)
    raise "Unable to remove state from NFA: at least one transition leads to or from the state." if all_transitions.any? {|transition| transition.from == state || transition.to == state }
    @states.delete(state)
  end

  # Register a transition, growing the alphabet and state set as needed.
  # Returns the newly created NFATransition.
  def add_transition(token, from_state, to_state)
    # # make sure states EITHER have a single outbound epsilon transition OR non-epsilon outbound transitions; they can't have both
    # if token == NFATransition::Epsilon
    #   # make sure from_state doesn't have any outbound non-epsilon transitions
    #   raise "Error: Non-epsilon transitions are already present on #{from_state.to_s}! States may EITHER have a single outbound epsilon transision OR have outbound non-epsilon transitions, but not both." if transitions_from(from_state).any? {|t| !t.epsilon? }
    # else
    #   # make sure from_state doesn't have any outbound epsilon transition
    #   raise "Error: Epsilon transitions are already present on #{from_state.to_s}! States may EITHER have a single outbound epsilon transision OR have outbound non-epsilon transitions, but not both." if transitions_from(from_state).any?(&:epsilon?)
    # end

    @alphabet << token # alphabet is a set, so there will be no duplications
    @states << from_state
    @states << to_state
    new_transition = NFATransition.new(token, from_state, to_state)

    char_transition_map = @transitions[from_state] ||= Hash.new
    set_of_transisions = char_transition_map[token] ||= Set.new
    set_of_transisions << new_transition

    new_transition
  end

  # Returns an array of matches found in the input string, each of which begins at the offset input_start_offset
  # NOTE: mutates @current_states (calls reset_current_states, then feeds tokens).
  def matches_at_offset(input, input_start_offset)
    reset_current_states

    matches = []
    (input_start_offset...input.size).each do |offset|
      token = input[offset]
      handle_token!(token)
      if accept?
        matches << MatchRef.new(input, input_start_offset..offset)
      end
    end
    matches
  end

  # Returns an array of matches found anywhere in the input string
  # (tries every start offset; O(n^2) scans of the input by construction).
  def matches(input)
    (0...input.size).reduce([]) do |memo, offset|
      memo + matches_at_offset(input, offset)
    end
  end

  # Whole-string match: returns a MatchRef covering the entire input when the
  # machine ends in an accepting configuration, nil otherwise.
  def match?(input) # : MatchRef?
    # puts "match?(\"#{input}\")"
    # puts self.to_s
    reset_current_states

    # puts @current_states.map(&:id)
    input.each_char.with_index do |char, index| # NOTE(review): index is unused here
      # puts char
      handle_token!(char)
      # puts @current_states.map(&:id)
    end

    if accept?
      MatchRef.new(input, 0...input.size)
    end
  end

  # process another input token
  def handle_token!(input_token)
    @current_states = next_states(@current_states, input_token)
  end

  # Accepting iff any current state is final.
  def accept?
    @current_states.any?(&:final?)
  end

  # Standard NFA move: epsilon-closure, consume input_token, epsilon-closure again.
  def next_states(state_set, input_token)
    # Retrieve a list of states in the epsilon closure of the given state set
    epsilon_reachable_states = epsilon_closure(state_set)
    # puts "epsilon_reachable_states = #{epsilon_reachable_states.map(&:id)}"

    # Build an array of outbound transitions from each state in the epsilon-closure
    # Filter the outbound transitions, selecting only those that accept the input we are given.
    outbound_transitions = epsilon_reachable_states.compact_map {|state| @transitions.dig(state, input_token) }.flat_map(&:to_a)
    # puts "outbound_transitions = #{outbound_transitions.inspect}"

    # Build an array of epsilon-closures of each transition's destination state.
    destination_state_epsilon_closures = outbound_transitions.map {|transition| epsilon_closure(transition.to) }

    # Union each of the epsilon-closures (each is a set) together to form a flat array of states in the epsilon-closure of all of our current states.
    next_states = destination_state_epsilon_closures.reduce {|combined_state_set, individual_state_set| combined_state_set.merge(individual_state_set) }

    next_states || Set.new
  end

  # Determine the epsilon closure of the given state set
  # That is, determine what states are reachable on an epsilon transition from the current state set (@current_states).
  # Returns a Set of State objects. (Breadth-first worklist traversal; terminates
  # because visited_states only grows.)
  def epsilon_closure(state_set) # : Set(State)
    state_set = state_set.is_a?(State) ? Set[state_set] : state_set
    visited_states = Set.new()
    unvisited_states = state_set
    while !unvisited_states.empty?
      epsilon_transitions = unvisited_states.compact_map {|state| @transitions.dig(state, NFATransition::Epsilon) }.flat_map(&:to_a)
      destination_states = epsilon_transitions.map(&:to).to_set
      visited_states.merge(unvisited_states) # add the unvisited states to the visited_states
      unvisited_states = destination_states - visited_states
    end
    visited_states
  end

  # Returns a set of State objects which are reachable through any transition path from the NFA's start_state.
  def reachable_states(start_state)
    visited_states = Set.new()
    unvisited_states = Set[start_state]
    while !unvisited_states.empty?
      outbound_transitions = unvisited_states.flat_map {|state| @transitions[state]&.values&.flat_map(&:to_a) || Array.new }
      destination_states = outbound_transitions.map(&:to).to_set
      visited_states.merge(unvisited_states) # add the unvisited states to the visited_states
      unvisited_states = destination_states - visited_states
    end
    visited_states
  end

  # This implements the subset construction algorithm presented on page 118 of the first edition of the dragon book.
  # I found a similar explanation at: http://web.cecs.pdx.edu/~harry/compilers/slides/LexicalPart3.pdf
  # Each DFA state corresponds to one set of NFA states; a DFA state is
  # final/error iff any NFA state in its set is final/error.
  def to_dfa
    state_map = Hash.new # this map contains (nfa_state_set => dfa_state) pairs
    dfa_transitions = Hash.new
    dfa_alphabet = @alphabet - Set[NFATransition::Epsilon]
    visited_state_sets = Set.new()
    nfa_start_state_set = epsilon_closure(@start_state)
    unvisited_state_sets = Set[nfa_start_state_set]

    dfa_start_state = State.new(nfa_start_state_set.any?(&:final?), nfa_start_state_set.any?(&:error?))
    state_map[nfa_start_state_set] = dfa_start_state
    until unvisited_state_sets.empty?
      # take one of the unvisited state sets
      state_set = unvisited_state_sets.first

      current_dfa_state = state_map[state_set]

      # Figure out the set of next-states for each token in the alphabet
      # Add each set of next-states to unvisited_state_sets
      dfa_alphabet.each do |token|
        next_nfa_state_set = next_states(state_set, token)
        unvisited_state_sets << next_nfa_state_set

        # this new DFA state, next_dfa_state, represents the next nfa state set, next_nfa_state_set
        next_dfa_state = state_map[next_nfa_state_set] ||= State.new(next_nfa_state_set.any?(&:final?), next_nfa_state_set.any?(&:error?))

        char_transition_map = dfa_transitions[current_dfa_state] ||= Hash.new
        char_transition_map[token] = DFATransition.new(token, current_dfa_state, next_dfa_state)
      end

      visited_state_sets << state_set
      unvisited_state_sets = unvisited_state_sets - visited_state_sets
    end

    # `state_map.invert` is sufficient to convert from a (nfa_state_set => dfa_state) mapping to a (dfa_state => nfa_state_set) mapping, because the mappings are strictly one-to-one.
    DFA.new(state_map[nfa_start_state_set], dfa_alphabet, dfa_transitions, state_map.invert, origin_nfa: self).set_regex_pattern(regex_pattern)
  end

  # Render the automaton as a Graphviz digraph string (final states drawn as double circles).
  def graphviz
    retval = "digraph G { "
    all_transitions.each do |t|
      transition_label = t.epsilon? ? "ε" : t.token
      retval += "#{t.from.id} -> #{t.to.id} [label=\"#{transition_label}\"];"
    end
    @final_states.each do |s|
      retval += "#{s.id} [color=lightblue2, style=filled, shape=doublecircle];"
    end
    retval += " }"
    retval
  end

  # verbose: dump every state and transition; otherwise just the regex pattern.
  def to_s(verbose = false)
    if verbose
      retval = states.map(&:to_s).join("\n")
      retval += "\n"
      all_transitions.each do |t|
        transition_label = t.epsilon? ? "epsilon" : t.token
        retval += "#{t.from.id} -> #{transition_label} -> #{t.to.id}\n"
      end
      retval
    else
      regex_pattern
    end
  end

  # Attach the source pattern for debugging/printing; returns self for chaining.
  def set_regex_pattern(pattern)
    @regex_pattern = pattern
    self
  end

  def regex_pattern
    @regex_pattern || "<<empty>>"
  end
end
|
303
|
+
|
304
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Enumerable
  # Applies the block to successive elements and returns the first truthy
  # result produced by the block, or nil if the block never returns a truthy
  # value.
  def find_map(&block)
    each do |item|
      candidate = block.call(item)
      return candidate if candidate
    end
    nil
  end

  # Maps every element through the block and keeps only the non-nil results.
  # Unlike Enumerable#filter_map, a `false` result IS retained — only nil is
  # dropped.
  def compact_map(&block)
    each_with_object([]) do |item, accumulated|
      mapped = block.call(item)
      accumulated << mapped unless mapped.nil?
    end
  end

  # Crystal-style spelling of #include?.
  alias_method :includes?, :include?
end
|
data/lib/kleene.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true

# Gem entry point: load runtime dependencies and every component of the gem.
require "active_support"
require "active_support/core_ext"
# Order matters: patches (Enumerable#compact_map etc.) and the core kleene
# definitions must be loaded before the automata files that use them.
require_relative "kleene/version"
require_relative "kleene/patches"
require_relative "kleene/kleene"
require_relative "kleene/dsl"
require_relative "kleene/nfa"
require_relative "kleene/dfa"
require_relative "kleene/multi_match_dfa"


module Kleene
  # Base error class for this gem.
  class Error < StandardError; end
  # Your code goes here...
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kleene
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David Ellis
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-11-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '7.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '7.1'
|
27
|
+
description: kleene is a library for building regular expression recognition automata
|
28
|
+
- nfas, dfas, and some specialty structures.
|
29
|
+
email:
|
30
|
+
- david@conquerthelawn.com
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- ".rspec"
|
36
|
+
- Gemfile
|
37
|
+
- Gemfile.lock
|
38
|
+
- LICENSE
|
39
|
+
- README.md
|
40
|
+
- Rakefile
|
41
|
+
- build.ops
|
42
|
+
- kleene.gemspec
|
43
|
+
- lib/kleene.rb
|
44
|
+
- lib/kleene/dfa.rb
|
45
|
+
- lib/kleene/dsl.rb
|
46
|
+
- lib/kleene/kleene.rb
|
47
|
+
- lib/kleene/multi_match_dfa.rb
|
48
|
+
- lib/kleene/nfa.rb
|
49
|
+
- lib/kleene/patches.rb
|
50
|
+
- lib/kleene/version.rb
|
51
|
+
homepage: https://github.com/davidkellis/kleene-rb
|
52
|
+
licenses:
|
53
|
+
- MIT
|
54
|
+
metadata:
|
55
|
+
homepage_uri: https://github.com/davidkellis/kleene-rb
|
56
|
+
source_code_uri: https://github.com/davidkellis/kleene-rb
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 3.0.0
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
requirements: []
|
72
|
+
rubygems_version: 3.4.10
|
73
|
+
signing_key:
|
74
|
+
specification_version: 4
|
75
|
+
summary: kleene is a library for building regular expression recognition automata
|
76
|
+
test_files: []
|