kleene 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +117 -0
- data/LICENSE +21 -0
- data/README.md +21 -0
- data/Rakefile +8 -0
- data/build.ops +63 -0
- data/kleene.gemspec +39 -0
- data/lib/kleene/dfa.rb +258 -0
- data/lib/kleene/dsl.rb +263 -0
- data/lib/kleene/kleene.rb +88 -0
- data/lib/kleene/multi_match_dfa.rb +308 -0
- data/lib/kleene/nfa.rb +304 -0
- data/lib/kleene/patches.rb +23 -0
- data/lib/kleene/version.rb +3 -0
- data/lib/kleene.rb +17 -0
- metadata +76 -0
data/lib/kleene/nfa.rb
ADDED
@@ -0,0 +1,304 @@
|
|
1
|
+
module Kleene
|
2
|
+
# A single labelled edge of an NFA: on +token+, the automaton may move from
# the +from+ state to the +to+ state.
class NFATransition
  # TODO/hack: the null character is used as the sentinel token that marks an
  # epsilon transition. It is frozen so the shared sentinel can never be
  # mutated in place by code that gets hold of it.
  Epsilon = "\u0000".freeze

  attr_accessor :token # : Char
  attr_accessor :from  # : State
  attr_accessor :to    # : State

  # token      - the character labelling this edge (Epsilon for an epsilon edge)
  # from_state - the state the edge leaves
  # to_state   - the state the edge enters
  def initialize(token, from_state, to_state)
    @token = token
    @from = from_state
    @to = to_state
  end

  # Returns true when this transition is labelled with +input+.
  def accept?(input)
    token == input
  end

  # Returns true when this transition is labelled with the Epsilon sentinel.
  def epsilon?
    token == Epsilon
  end
end
|
23
|
+
|
24
|
+
class NFA
|
25
|
+
attr_accessor :alphabet # : Set(Char)
|
26
|
+
attr_accessor :states # : Set(State)
|
27
|
+
attr_accessor :start_state # : State
|
28
|
+
attr_accessor :transitions # : Hash(State, Hash(Char, Set(NFATransition)))
|
29
|
+
attr_accessor :current_states # : Set(State)
|
30
|
+
attr_accessor :final_states # : Set(State)
|
31
|
+
# @regex_pattern
|
32
|
+
|
33
|
+
# Builds an NFA rooted at +start_state+.
#
# start_state    - the State the automaton starts in
# alphabet       - base set of tokens; tokens found on the supplied
#                  transitions are unioned into it
# transitions    - Hash(State, Hash(Char, Set(NFATransition)))
# initial_states - explicit set of states; when nil, the states reachable
#                  from +start_state+ are used instead
def initialize(start_state, alphabet = DEFAULT_ALPHABET, transitions = Hash.new, initial_states = nil)
  @start_state = start_state
  @transitions = transitions

  # all_transitions reads @transitions, so it must be assigned before this line.
  tokens_in_use = all_transitions.map(&:token)
  @alphabet = alphabet + tokens_in_use

  @states = initial_states || reachable_states(start_state)

  # Placeholders; both are recomputed by the two calls below.
  @current_states = Set.new
  @final_states = Set.new
  update_final_states
  reset_current_states
end
|
46
|
+
|
47
|
+
# Flattens the nested transition table into one array holding every
# transition of every state.
def all_transitions # : Array(NFATransition)
  transitions.each_value.flat_map do |char_transition_map|
    char_transition_map.each_value.flat_map(&:to_a)
  end
end
|
50
|
+
|
51
|
+
# def transitions_from(state) # : Set(NFATransition)
|
52
|
+
# @transitions[state]&.values&.reduce {|memo, set_of_transisions| memo | set_of_transisions} || Set.new
|
53
|
+
# end
|
54
|
+
# Returns the set of transitions leaving the given state, or leaving any
# state of the given set of states.
#
# state_set - a State or a Set of States
#
# Always returns a freshly built Set (empty when there are no outbound
# transitions, including for an empty input set), so callers never receive
# nil and never receive a reference to a set stored in the transition table.
#
# Raises RuntimeError when given anything other than a State or a Set.
def transitions_from(state_set) # : Set(NFATransition)
  case state_set
  when State
    per_token_sets = @transitions[state_set]&.values || []
    per_token_sets.reduce(Set.new) { |combined, set_of_transitions| combined | set_of_transitions }
  when Set
    state_set.reduce(Set.new) { |combined, state| combined | transitions_from(state) }
  else
    raise "transitions_from expects a State or a Set of States, got #{state_set.class}"
  end
end
|
65
|
+
|
66
|
+
# Returns a new NFA built from dup'ed copies of this NFA's states, with every
# transition rebuilt to point at the copies, a clone of the alphabet, and the
# same regex pattern.
def deep_clone
  originals = @states.to_a
  copies = originals.map(&:dup)
  copy_of = originals.zip(copies).to_h # original state => its copy

  cloned_transitions = transitions.to_h do |state, char_transition_map|
    remapped = char_transition_map.transform_values do |set_of_transitions|
      set_of_transitions.map { |t| NFATransition.new(t.token, copy_of[t.from], copy_of[t.to]) }.to_set
    end
    [copy_of[state], remapped]
  end

  clone = NFA.new(copy_of[@start_state], @alphabet.clone, cloned_transitions, copies.to_set)
  clone.set_regex_pattern(regex_pattern)
end
|
84
|
+
|
85
|
+
# Recomputes @final_states as the subset of @states that report final?.
def update_final_states
  @final_states = @states.select(&:final?).to_set
end
|
88
|
+
|
89
|
+
# Rewinds the automaton: the current states become the epsilon closure of
# the start state.
def reset_current_states
  start_closure = epsilon_closure(@start_state)
  @current_states = start_closure
end
|
92
|
+
|
93
|
+
# Returns the subset of @states that report error?, as a Set.
def error_states
  @states.select { |state| state.error? }.to_set
end
|
96
|
+
|
97
|
+
# Adds +new_state+ to the NFA's state set and returns that set.
# @final_states is not refreshed here.
def add_state(new_state)
  @states.add(new_state)
end
|
100
|
+
|
101
|
+
# Merges every element of +states+ into the NFA's state set and returns that
# set. @final_states is not refreshed here.
def add_states(states)
  known_states = @states
  known_states.merge(states)
end
|
104
|
+
|
105
|
+
# Removes +state+ from the NFA's state set.
#
# Raises RuntimeError when any transition still starts at or leads to the
# state; in that case the state set is left untouched.
def remove_state(state)
  still_referenced = all_transitions.any? { |t| t.from == state || t.to == state }
  if still_referenced
    raise "Unable to remove state from NFA: at least one transition leads to or from the state."
  end

  @states.delete(state)
end
|
109
|
+
|
110
|
+
# Registers a transition labelled +token+ from +from_state+ to +to_state+ and
# returns the new NFATransition.
#
# As side effects the token is added to the alphabet and both endpoint states
# are added to the state set (both are sets, so nothing is duplicated).
#
# Note: an earlier check that a state has EITHER a single outbound epsilon
# transition OR non-epsilon outbound transitions (never both) is disabled;
# that invariant is not enforced here.
def add_transition(token, from_state, to_state)
  @alphabet << token
  @states << from_state
  @states << to_state

  transition = NFATransition.new(token, from_state, to_state)

  # Create the per-state and per-token buckets on first use.
  transitions_by_token = (@transitions[from_state] ||= {})
  bucket = (transitions_by_token[token] ||= Set.new)
  bucket << transition

  transition
end
|
131
|
+
|
132
|
+
# Returns an array of matches found in the input string, each of which begins at the offset input_start_offset
|
133
|
+
# Collects every match that begins exactly at +input_start_offset+.
#
# input              - the String to scan
# input_start_offset - index of the first character to consume
#
# Returns an Array of MatchRef, one per end offset at which the automaton is
# in an accepting state. The automaton is reset first, so its current states
# are overwritten by this call.
def matches_at_offset(input, input_start_offset)
  reset_current_states

  matches = []
  (input_start_offset...input.size).each do |offset|
    handle_token!(input[offset])

    # Once no state is live, every later step also yields the empty set, so
    # nothing further can be accepted; stop instead of scanning the rest.
    break if @current_states.empty?

    matches << MatchRef.new(input, input_start_offset..offset) if accept?
  end
  matches
end
|
146
|
+
|
147
|
+
# Returns an array of matches found anywhere in the input string
|
148
|
+
# Collects the matches found anywhere in +input+ by trying every start
# offset in order.
#
# Returns a flat Array of the results of matches_at_offset, ordered by start
# offset. flat_map is used so the accumulated array is not copied again for
# every offset.
def matches(input)
  (0...input.size).flat_map { |offset| matches_at_offset(input, offset) }
end
|
153
|
+
|
154
|
+
# Runs the automaton over the whole of +input+ from a fresh start.
#
# Returns a MatchRef covering the entire input when the automaton ends in an
# accepting state, nil otherwise. The automaton's current states are
# overwritten by this call.
def match?(input) # : MatchRef?
  reset_current_states
  input.each_char { |char| handle_token!(char) }

  MatchRef.new(input, 0...input.size) if accept?
end
|
170
|
+
|
171
|
+
# process another input token
|
172
|
+
# Consumes one input token: replaces the current states with the states
# reachable from them on +input_token+, and returns the new set.
def handle_token!(input_token)
  advanced = next_states(@current_states, input_token)
  @current_states = advanced
end
|
175
|
+
|
176
|
+
# Returns true when at least one of the current states reports final?.
def accept?
  @current_states.any? { |state| state.final? }
end
|
179
|
+
|
180
|
+
# Computes the set of states the NFA can be in after consuming +input_token+
# from any state of +state_set+.
#
# state_set   - the states to start from (anything epsilon_closure accepts)
# input_token - the token being consumed
#
# Returns a Set of states; empty when no transition accepts the token.
def next_states(state_set, input_token)
  # States reachable from state_set without consuming any input.
  epsilon_reachable_states = epsilon_closure(state_set)

  # Destinations of every outbound transition labelled with input_token.
  # Enumerable#filter_map (stdlib) skips states that have no such transition.
  destination_states = epsilon_reachable_states
    .filter_map { |state| @transitions.dig(state, input_token) }
    .flat_map { |set_of_transitions| set_of_transitions.map(&:to) }
    .to_set

  # One closure over the union of destinations equals the union of the
  # per-destination closures, and yields an empty Set for no destinations.
  epsilon_closure(destination_states)
end
|
198
|
+
|
199
|
+
# Determine the epsilon closure of the given state set
|
200
|
+
# That is, determine what states are reachable on an epsilon transition from the current state set (@current_states).
|
201
|
+
# Returns a Set of State objects.
|
202
|
+
# Determines the epsilon closure of a state or set of states: the given
# states plus every state reachable from them through epsilon transitions
# alone.
#
# state_set - a State, or a collection of States
#
# Returns a new Set of State objects; the argument is not modified.
def epsilon_closure(state_set) # : Set(State)
  frontier = state_set.is_a?(State) ? Set[state_set] : state_set
  closure = Set.new

  until frontier.empty?
    closure.merge(frontier)

    # Follow one epsilon hop from every frontier state. Enumerable#filter_map
    # (stdlib) skips states without epsilon transitions. Dropping states that
    # are already in the closure guarantees termination on cycles.
    frontier = frontier
      .filter_map { |state| @transitions.dig(state, NFATransition::Epsilon) }
      .flat_map { |set_of_transitions| set_of_transitions.map(&:to) }
      .to_set - closure
  end

  closure
end
|
214
|
+
|
215
|
+
# Returns a set of State objects which are reachable through any transition path from the NFA's start_state.
|
216
|
+
# Returns the Set of states reachable from +start_state+ through any chain
# of transitions (any token), including +start_state+ itself.
def reachable_states(start_state)
  seen = Set.new
  frontier = Set[start_state]

  until frontier.empty?
    transitions_out = frontier.flat_map do |state|
      char_transition_map = @transitions[state]
      char_transition_map ? char_transition_map.values.flat_map(&:to_a) : []
    end
    targets = transitions_out.map(&:to).to_set

    seen.merge(frontier)
    frontier = targets - seen # only states not explored yet
  end

  seen
end
|
227
|
+
|
228
|
+
# This implements the subset construction algorithm presented on page 118 of the first edition of the dragon book.
|
229
|
+
# I found a similar explanation at: http://web.cecs.pdx.edu/~harry/compilers/slides/LexicalPart3.pdf
|
230
|
+
# Converts this NFA into a DFA: each DFA state stands for a set of NFA states,
# and sets are explored until no unexplored set remains.
#
# A DFA state is created as final (resp. error) when any NFA state in the set
# it represents is final (resp. error). The resulting DFA receives the
# dfa_state => nfa_state_set mapping, this NFA as origin_nfa, and the same
# regex pattern.
def to_dfa
  dfa_state_for = {} # nfa_state_set => dfa_state
  dfa_transitions = {}
  dfa_alphabet = @alphabet - Set[NFATransition::Epsilon]
  build_dfa_state = ->(nfa_states) { State.new(nfa_states.any?(&:final?), nfa_states.any?(&:error?)) }

  start_set = epsilon_closure(@start_state)
  explored = Set.new
  pending = Set[start_set]
  dfa_state_for[start_set] = build_dfa_state.call(start_set)

  until pending.empty?
    nfa_states = pending.first
    source = dfa_state_for[nfa_states]

    # For every token, find the successor set, queue it for exploration and
    # record the DFA edge to the DFA state that represents it.
    dfa_alphabet.each do |token|
      successor_set = next_states(nfa_states, token)
      pending << successor_set

      target = dfa_state_for[successor_set] ||= build_dfa_state.call(successor_set)

      edges_from_source = (dfa_transitions[source] ||= {})
      edges_from_source[token] = DFATransition.new(token, source, target)
    end

    explored << nfa_states
    pending -= explored
  end

  # The (nfa_state_set => dfa_state) mapping is strictly one-to-one, so
  # inverting it gives the (dfa_state => nfa_state_set) mapping.
  dfa = DFA.new(dfa_state_for[start_set], dfa_alphabet, dfa_transitions, dfa_state_for.invert, origin_nfa: self)
  dfa.set_regex_pattern(regex_pattern)
end
|
266
|
+
|
267
|
+
# Renders the NFA as a Graphviz DOT digraph on a single line.
#
# Every transition becomes an edge labelled with its token ("ε" for epsilon
# transitions); every state in @final_states is drawn as a filled double
# circle. Double quotes and backslashes in a token are backslash-escaped so
# that such tokens cannot terminate or corrupt the quoted label.
#
# Returns the DOT source as a String.
def graphviz
  edges = all_transitions.map do |t|
    label = t.epsilon? ? "ε" : t.token.to_s.gsub(/["\\]/) { |char| "\\#{char}" }
    "#{t.from.id} -> #{t.to.id} [label=\"#{label}\"];"
  end

  final_nodes = @final_states.map do |s|
    "#{s.id} [color=lightblue2, style=filled, shape=doublecircle];"
  end

  "digraph G { #{edges.join}#{final_nodes.join} }"
end
|
279
|
+
|
280
|
+
# String form of the NFA.
#
# verbose - when false (default) the regex pattern is returned; when true,
#           one line per state followed by one "from -> token -> to" line
#           per transition ("epsilon" stands in for the epsilon token)
def to_s(verbose = false)
  return regex_pattern unless verbose

  state_lines = states.map(&:to_s).join("\n")
  transition_lines = all_transitions.map do |t|
    transition_label = t.epsilon? ? "epsilon" : t.token
    "#{t.from.id} -> #{transition_label} -> #{t.to.id}\n"
  end

  "#{state_lines}\n#{transition_lines.join}"
end
|
293
|
+
|
294
|
+
# Stores +pattern+ as this NFA's regex pattern.
# Returns self so the call can be chained onto a constructor.
def set_regex_pattern(pattern)
  tap { @regex_pattern = pattern }
end
|
298
|
+
|
299
|
+
# Returns the stored regex pattern, or the placeholder "<<empty>>" when no
# pattern has been set.
def regex_pattern
  return @regex_pattern if @regex_pattern

  "<<empty>>"
end
|
302
|
+
end
|
303
|
+
|
304
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Core extensions: these methods are added to every Enumerable.
module Enumerable
  # Calls the block with successive elements and returns the first truthy
  # value the block produces, or nil when it never produces one.
  def find_map(&block)
    each do |item|
      result = block.call(item)
      return result if result
    end
    nil
  end

  # Maps each element through the block and returns an Array of the results
  # with nil results left out (false results are kept).
  def compact_map(&block)
    collected = []
    each do |item|
      result = block.call(item)
      next if result.nil?

      collected << result
    end
    collected
  end

  # includes? is a synonym for include?
  alias includes? include?
end
|
data/lib/kleene.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support"
|
4
|
+
require "active_support/core_ext"
|
5
|
+
require_relative "kleene/version"
|
6
|
+
require_relative "kleene/patches"
|
7
|
+
require_relative "kleene/kleene"
|
8
|
+
require_relative "kleene/dsl"
|
9
|
+
require_relative "kleene/nfa"
|
10
|
+
require_relative "kleene/dfa"
|
11
|
+
require_relative "kleene/multi_match_dfa"
|
12
|
+
|
13
|
+
|
14
|
+
module Kleene
  # Library-specific error class; a direct subclass of StandardError.
  class Error < StandardError
  end
end
|
metadata
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: kleene
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- David Ellis
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-11-04 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: activesupport
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '7.1'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '7.1'
|
27
|
+
description: kleene is a library for building regular expression recognition automata
|
28
|
+
- nfas, dfas, and some specialty structures.
|
29
|
+
email:
|
30
|
+
- david@conquerthelawn.com
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- ".rspec"
|
36
|
+
- Gemfile
|
37
|
+
- Gemfile.lock
|
38
|
+
- LICENSE
|
39
|
+
- README.md
|
40
|
+
- Rakefile
|
41
|
+
- build.ops
|
42
|
+
- kleene.gemspec
|
43
|
+
- lib/kleene.rb
|
44
|
+
- lib/kleene/dfa.rb
|
45
|
+
- lib/kleene/dsl.rb
|
46
|
+
- lib/kleene/kleene.rb
|
47
|
+
- lib/kleene/multi_match_dfa.rb
|
48
|
+
- lib/kleene/nfa.rb
|
49
|
+
- lib/kleene/patches.rb
|
50
|
+
- lib/kleene/version.rb
|
51
|
+
homepage: https://github.com/davidkellis/kleene-rb
|
52
|
+
licenses:
|
53
|
+
- MIT
|
54
|
+
metadata:
|
55
|
+
homepage_uri: https://github.com/davidkellis/kleene-rb
|
56
|
+
source_code_uri: https://github.com/davidkellis/kleene-rb
|
57
|
+
post_install_message:
|
58
|
+
rdoc_options: []
|
59
|
+
require_paths:
|
60
|
+
- lib
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 3.0.0
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
71
|
+
requirements: []
|
72
|
+
rubygems_version: 3.4.10
|
73
|
+
signing_key:
|
74
|
+
specification_version: 4
|
75
|
+
summary: kleene is a library for building regular expression recognition automata
|
76
|
+
test_files: []
|