sequitur 0.1.02 → 0.1.03
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/CHANGELOG.md +4 -0
- data/lib/sequitur/constants.rb +1 -1
- data/lib/sequitur/digram.rb +5 -0
- data/lib/sequitur/production.rb +1 -1
- data/lib/sequitur/sequitur_grammar.rb +106 -165
- data/spec/sequitur/digram_spec.rb +9 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Mzg2M2MwZDY5M2M1Zjc5MGZlOWI5MTMwM2ExNTc3YWZlMWI4MjliZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OTY4NzRlYjJmMGU1Nzc4ZDdiZmNkYWFiNzRmMWU3OThiMmExNDNkYg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
Y2M3Mjg1ZTFkYzEzNWI0YWYwMmIxMGIzZGE5ZjZmZmE4NmRjMDQ3NGE3Nzhl
|
10
|
+
YTZiMjM1ZWM1OTRiODAwMmE0NDg0NWI1MzYyZGQ1Y2RiMTAzNWMwMzFjNzE1
|
11
|
+
OGVkYmNlYzBmMjlhYmI3NDZkZTZjYWNjY2VkZTk2MzQxNWQ4OGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MTNiNjhlYTNhNGU1NjY4ZTUwNzkzNWRiYzllMDA5MjczYmFhMzE3MDJiZDI3
|
14
|
+
NWUzYzIyNTJlMDYzZTUxOWFmOTM0MzE3MDU3YTJiMTgzMTVkN2QyMGU5ZDFi
|
15
|
+
MmU0ZTMxZjYwMzNkZjFmNGVhMGU4M2E1ZTQ5YTliMDg4NmNlZGQ=
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
### 0.1.03 / 2014-09-21
|
2
|
+
* [CHANGE] Class `Sequitur::SequiturGrammar` Code refactoring: cleaner and simpler implementation of the algorithm.
|
3
|
+
* [CHANGE] Class `Sequitur::Digram`. Added new method `repeating?` that tells whether digram members are the same.
|
4
|
+
|
1
5
|
### 0.1.02 / 2014-09-18
|
2
6
|
* [CHANGE] File `README.md`: expanded introductory text.
|
3
7
|
* [CHANGE] File `sequitur.gemspec` : expanded gem description in the specification.
|
data/lib/sequitur/constants.rb
CHANGED
data/lib/sequitur/digram.rb
CHANGED
data/lib/sequitur/production.rb
CHANGED
@@ -1,13 +1,9 @@
|
|
1
1
|
require_relative 'dynamic_grammar'
|
2
2
|
|
3
|
+
|
3
4
|
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
5
|
|
5
6
|
class SequiturGrammar < DynamicGrammar
|
6
|
-
# A hash with pairs of the form: digram key => digram
|
7
|
-
attr_reader(:digrams)
|
8
|
-
|
9
|
-
# The input
|
10
|
-
attr_reader(:parsed)
|
11
7
|
|
12
8
|
# Constructor. Build the grammar from an enumerator of tokens
|
13
9
|
def initialize(anEnum)
|
@@ -15,191 +11,136 @@ class SequiturGrammar < DynamicGrammar
|
|
15
11
|
# Make start production compliant with utility rule
|
16
12
|
2.times { root.incr_refcount }
|
17
13
|
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
# Read the input sequence and apply the Sequitur algorithm
|
15
|
+
anEnum.each do |a_token|
|
16
|
+
add_token(a_token)
|
17
|
+
enforce_rules
|
18
|
+
end
|
21
19
|
end
|
22
20
|
|
23
21
|
public
|
24
22
|
|
25
|
-
# Add the given token to the grammar.
|
26
|
-
def add_token(aToken)
|
27
|
-
parsed << aToken
|
28
|
-
super
|
29
|
-
end
|
30
|
-
|
31
|
-
|
32
|
-
private
|
33
|
-
|
34
|
-
# Assumption: last digram of production isn't yet registered.
|
35
|
-
def add_production(aProduction)
|
36
|
-
super # Call original method from superclass...
|
37
|
-
|
38
|
-
# ... then add this behaviour
|
39
|
-
last_digram = aProduction.last_digram
|
40
|
-
digrams[last_digram.key] = last_digram
|
41
|
-
end
|
42
|
-
|
43
|
-
# Remove a production from the grammar
|
44
|
-
def remove_production(anIndex)
|
45
|
-
prod = productions[anIndex]
|
46
|
-
|
47
|
-
# Retrieve in the Hash all registered digrams from the removed production
|
48
|
-
digrams_subset = digrams.select do |_, digr|
|
49
|
-
digr.production == prod
|
50
|
-
end
|
51
23
|
|
52
|
-
|
53
|
-
|
54
|
-
|
24
|
+
CollisionDiagnosis = Struct.new(:collision_found, :digram, :productions)
|
25
|
+
|
26
|
+
|
27
|
+
# Assuming that a new input token was added to the start production,
|
28
|
+
# enforce the digram unicity and rule utility rules
|
29
|
+
# begin
|
30
|
+
# if a digram D occurs twice in the grammar then
|
31
|
+
# add a production P : D (if not already there)
|
32
|
+
# replace both Ds with P (reduction step).
|
33
|
+
# end
|
34
|
+
# if a production P : RHS is referenced only once then
|
35
|
+
# replace P by its RHS (derivation step)
|
36
|
+
# remove P from grammar
|
37
|
+
# end
|
38
|
+
# end until digram unicity and rule utility are met
|
39
|
+
def enforce_rules()
|
40
|
+
begin
|
41
|
+
unicity_diagnosis = detect_collision if unicity_diagnosis.nil?
|
42
|
+
restore_unicity(unicity_diagnosis) if unicity_diagnosis.collision_found
|
43
|
+
|
44
|
+
useless_prod = detect_useless_production
|
45
|
+
restore_utility(useless_prod) if useless_prod
|
46
|
+
|
47
|
+
unicity_diagnosis = detect_collision
|
48
|
+
useless_prod = detect_useless_production
|
49
|
+
|
50
|
+
end while unicity_diagnosis.collision_found || useless_prod
|
55
51
|
end
|
56
52
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
53
|
+
# Check whether a digram is used twice in the grammar.
|
54
|
+
# Return a CollisionDiagnosis with collision_found set to false if each digram appears once.
|
55
|
+
# Otherwise return a CollisionDiagnosis with collision_found set to true, the duplicated digram, and productions [Pi, Pk]
|
56
|
+
# Where Pi, Pk are two productions where the digram occurs.
|
57
|
+
def detect_collision()
|
58
|
+
diagnosis = CollisionDiagnosis.new(false)
|
59
|
+
found_so_far = {}
|
60
|
+
productions.each do |a_prod|
|
61
|
+
prod_digrams = a_prod.digrams
|
62
|
+
prod_digrams.each do |a_digr|
|
63
|
+
its_key = a_digr.key
|
64
|
+
if found_so_far.include? its_key
|
65
|
+
orig_digr = found_so_far[its_key]
|
66
|
+
# Disregard sequence like a a a
|
67
|
+
if ((orig_digr.production == a_prod) && a_digr.repeating? &&
|
68
|
+
(orig_digr == a_digr))
|
69
|
+
next
|
70
|
+
end
|
71
|
+
|
72
|
+
diagnosis.digram = orig_digr
|
73
|
+
diagnosis.productions = [orig_digr.production, a_prod]
|
74
|
+
diagnosis.collision_found = true
|
75
|
+
break
|
76
|
+
else
|
77
|
+
found_so_far[its_key] = a_digr
|
78
|
+
end
|
72
79
|
end
|
80
|
+
break if diagnosis.collision_found
|
73
81
|
end
|
82
|
+
|
83
|
+
return diagnosis
|
74
84
|
end
|
75
85
|
|
76
|
-
#
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
# the matching digram
|
89
|
-
productions.each do |prod|
|
90
|
-
its_digrams = prod.digrams
|
91
|
-
its_keys = its_digrams.map(&:key)
|
92
|
-
next if prod == last_digram.production
|
93
|
-
next unless its_keys.include? last_digram.key
|
94
|
-
msg = "Digram #{last_digram.symbols} occurs three times!"
|
95
|
-
msg << "\nTwice in production #{aProduction.object_id}"
|
96
|
-
msg << "\nThird in production #{prod.object_id}"
|
97
|
-
msg << "\n#{to_string}"
|
98
|
-
fail StandardError, msg
|
86
|
+
# When a collision diagnosis indicates that a given
|
87
|
+
# digram d occurs twice in the grammar
|
88
|
+
# Then create a new production that will have
|
89
|
+
# the symbols of d as its rhs members.
|
90
|
+
def restore_unicity(aDiagnosis)
|
91
|
+
return if aDiagnosis.nil?
|
92
|
+
|
93
|
+
digr = aDiagnosis.digram
|
94
|
+
prods = aDiagnosis.productions
|
95
|
+
if prods.any?(&:single_digram?)
|
96
|
+
(simple, compound) = prods.partition do |a_prod|
|
97
|
+
a_prod.single_digram?
|
99
98
|
end
|
100
|
-
|
101
|
-
|
102
|
-
#
|
99
|
+
compound[0].replace_digram(simple[0])
|
100
|
+
else
|
101
|
+
# Create a new production with the digram's symbols as its
|
102
|
+
# sole rhs members.
|
103
103
|
new_prod = Production.new
|
104
|
-
new_prod.append_symbol(
|
105
|
-
new_prod.append_symbol(last_digram.symbols[1])
|
106
|
-
|
107
|
-
# ... replace duplicate digram by reference to new production
|
108
|
-
aProduction.replace_digram(new_prod)
|
104
|
+
digr.symbols.each { |sym| new_prod.append_symbol(sym) }
|
109
105
|
add_production(new_prod)
|
110
|
-
|
111
|
-
|
112
|
-
# Duplicate digram used in distinct production
|
113
|
-
# Two cases: other production is a single digram one or a multi-digram
|
114
|
-
other_prod = matching_digram.production
|
115
|
-
if other_prod.single_digram?
|
116
|
-
# ... replace duplicate digram by reference to other production
|
117
|
-
aProduction.replace_digram(other_prod)
|
118
|
-
update_digrams_from(aProduction)
|
119
|
-
|
120
|
-
# Special case a: replacement causes another digram duplication
|
121
|
-
# in the given production
|
122
|
-
# Special case b: replacement causes another digram duplication
|
123
|
-
# with other production
|
124
|
-
if aProduction.repeated_digram? ||
|
125
|
-
(digrams[aProduction.last_digram.key].production != aProduction)
|
126
|
-
preserve_unicity(aProduction)
|
127
|
-
end
|
128
|
-
|
106
|
+
if prods[0] == prods[1]
|
107
|
+
prods[0].replace_digram(new_prod)
|
129
108
|
else
|
130
|
-
|
131
|
-
# Then create a new production with the digram as its rhs
|
132
|
-
new_prod = Production.new
|
133
|
-
new_prod.append_symbol(last_digram.symbols[0])
|
134
|
-
new_prod.append_symbol(last_digram.symbols[1])
|
135
|
-
|
136
|
-
# ... replace duplicate digram by reference to new production
|
137
|
-
aProduction.replace_digram(new_prod)
|
138
|
-
other_prod.replace_digram(new_prod)
|
139
|
-
add_production(new_prod)
|
140
|
-
update_digrams_from(aProduction)
|
141
|
-
|
142
|
-
# TODO: Check when aProduction and other_prod have same preceding symbol
|
143
|
-
update_digrams_from(other_prod)
|
109
|
+
prods.each { |a_prod| a_prod.replace_digram(new_prod) }
|
144
110
|
end
|
145
111
|
end
|
146
112
|
end
|
147
113
|
|
148
|
-
#
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
# - Detect occurrence of any production in the rhs
|
153
|
-
# - Identify the occurring production
|
154
|
-
# - In the occurrence hash push the production id of the lhs
|
155
|
-
# Select each production that occurs once (singleton rule):
|
156
|
-
# Replace the occurrence in the rhs by the rhs of the singleton rule
|
157
|
-
# Delete the singleton rule
|
158
|
-
# Update digrams
|
159
|
-
def enforce_rule_utility()
|
160
|
-
return if productions.size < 2
|
161
|
-
|
162
|
-
loop do
|
163
|
-
all_refcount_ok = true
|
164
|
-
(1...productions.size).to_a.reverse.each do |index|
|
165
|
-
curr_production = productions[index]
|
166
|
-
next unless curr_production.refcount == 1
|
167
|
-
|
168
|
-
all_refcount_ok = false
|
169
|
-
dependent = productions.find do |a_prod|
|
170
|
-
!a_prod.references_of(curr_production).empty?
|
171
|
-
end
|
172
|
-
dependent.replace_production(productions[index])
|
173
|
-
remove_production(index)
|
174
|
-
update_digrams_from(dependent)
|
175
|
-
end
|
176
|
-
|
177
|
-
break if all_refcount_ok
|
178
|
-
end
|
114
|
+
# Return a production that is used less than twice in the grammar.
|
115
|
+
def detect_useless_production()
|
116
|
+
useless = productions.find { |prod| prod.refcount < 2 }
|
117
|
+
return (useless == productions[0]) ? nil : useless
|
179
118
|
end
|
180
119
|
|
181
|
-
|
182
|
-
#
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
120
|
+
# Given the passed production P is referenced only once.
|
121
|
+
# Then replace P by its RHS where it is referenced.
|
122
|
+
# And delete P
|
123
|
+
def restore_utility(useless_prod)
|
124
|
+
# Retrieve index of useless_prod
|
125
|
+
index = productions.index(useless_prod)
|
126
|
+
|
127
|
+
# Retrieve production referencing useless one
|
128
|
+
referencing = nil
|
129
|
+
productions.each do |a_prod|
|
130
|
+
# Next line assumes non-recursive productions
|
131
|
+
next if a_prod == useless_prod
|
132
|
+
|
133
|
+
refs = a_prod.references_of(useless_prod)
|
134
|
+
next if refs.empty?
|
135
|
+
referencing = a_prod
|
136
|
+
break
|
189
137
|
end
|
190
138
|
|
191
|
-
|
192
|
-
|
193
|
-
digr.production == aProduction
|
194
|
-
end
|
195
|
-
|
196
|
-
# Remove obsolete digrams
|
197
|
-
current_keys = current_digrams.map(&:key)
|
198
|
-
digrams_subset.keys.each do |a_key|
|
199
|
-
digrams.delete(a_key) unless current_keys.include? a_key
|
200
|
-
end
|
139
|
+
referencing.replace_production(useless_prod)
|
140
|
+
remove_production(index)
|
201
141
|
end
|
202
142
|
|
143
|
+
|
203
144
|
end # class
|
204
145
|
|
205
146
|
end # module
|
@@ -22,10 +22,18 @@ describe Digram do
|
|
22
22
|
instance = Digram.new(:b, :c, production)
|
23
23
|
expect(instance.production).to eq(production)
|
24
24
|
end
|
25
|
+
|
26
|
+
it 'should whether its symbols are the same' do
|
27
|
+
instance1 = Digram.new(:a, :a, production)
|
28
|
+
expect(instance1).to be_repeating
|
29
|
+
|
30
|
+
instance1 = Digram.new(:a, :b, production)
|
31
|
+
expect(instance1).not_to be_repeating
|
32
|
+
end
|
25
33
|
|
26
34
|
end # context
|
27
35
|
|
28
|
-
context '
|
36
|
+
context 'Provided services:' do
|
29
37
|
|
30
38
|
it 'should compare itself to another digram' do
|
31
39
|
instance1 = Digram.new(:a, :b, production)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sequitur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.03
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitri Geshef
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|