sequitur 0.1.02 → 0.1.03
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/CHANGELOG.md +4 -0
- data/lib/sequitur/constants.rb +1 -1
- data/lib/sequitur/digram.rb +5 -0
- data/lib/sequitur/production.rb +1 -1
- data/lib/sequitur/sequitur_grammar.rb +106 -165
- data/spec/sequitur/digram_spec.rb +9 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Mzg2M2MwZDY5M2M1Zjc5MGZlOWI5MTMwM2ExNTc3YWZlMWI4MjliZA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OTY4NzRlYjJmMGU1Nzc4ZDdiZmNkYWFiNzRmMWU3OThiMmExNDNkYg==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
Y2M3Mjg1ZTFkYzEzNWI0YWYwMmIxMGIzZGE5ZjZmZmE4NmRjMDQ3NGE3Nzhl
|
10
|
+
YTZiMjM1ZWM1OTRiODAwMmE0NDg0NWI1MzYyZGQ1Y2RiMTAzNWMwMzFjNzE1
|
11
|
+
OGVkYmNlYzBmMjlhYmI3NDZkZTZjYWNjY2VkZTk2MzQxNWQ4OGY=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MTNiNjhlYTNhNGU1NjY4ZTUwNzkzNWRiYzllMDA5MjczYmFhMzE3MDJiZDI3
|
14
|
+
NWUzYzIyNTJlMDYzZTUxOWFmOTM0MzE3MDU3YTJiMTgzMTVkN2QyMGU5ZDFi
|
15
|
+
MmU0ZTMxZjYwMzNkZjFmNGVhMGU4M2E1ZTQ5YTliMDg4NmNlZGQ=
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
### 0.1.03 / 2014-09-21
|
2
|
+
* [CHANGE] Class `Sequitur::SequiturGrammar` Code refactoring: cleaner and simpler implementation of the algorithm.
|
3
|
+
* [CHANGE] Class `Sequitur::Digram`. Added new method `repeating?` that tells whether digram members are the same.
|
4
|
+
|
1
5
|
### 0.1.02 / 2014-09-18
|
2
6
|
* [CHANGE] File `README.md`: expanded introductory text.
|
3
7
|
* [CHANGE] File `sequitur.gemspec` : expanded gem description in the specification.
|
data/lib/sequitur/constants.rb
CHANGED
data/lib/sequitur/digram.rb
CHANGED
data/lib/sequitur/production.rb
CHANGED
@@ -1,13 +1,9 @@
|
|
1
1
|
require_relative 'dynamic_grammar'
|
2
2
|
|
3
|
+
|
3
4
|
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
5
|
|
5
6
|
class SequiturGrammar < DynamicGrammar
|
6
|
-
# A hash with pairs of the form: digram key => digram
|
7
|
-
attr_reader(:digrams)
|
8
|
-
|
9
|
-
# The input
|
10
|
-
attr_reader(:parsed)
|
11
7
|
|
12
8
|
# Constructor. Build the grammar from an enumerator of tokens
|
13
9
|
def initialize(anEnum)
|
@@ -15,191 +11,136 @@ class SequiturGrammar < DynamicGrammar
|
|
15
11
|
# Make start production compliant with utility rule
|
16
12
|
2.times { root.incr_refcount }
|
17
13
|
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
# Read the input sequence and apply the Sequitur algorithm
|
15
|
+
anEnum.each do |a_token|
|
16
|
+
add_token(a_token)
|
17
|
+
enforce_rules
|
18
|
+
end
|
21
19
|
end
|
22
20
|
|
23
21
|
public
|
24
22
|
|
25
|
-
# Add the given token to the grammar.
|
26
|
-
def add_token(aToken)
|
27
|
-
parsed << aToken
|
28
|
-
super
|
29
|
-
end
|
30
|
-
|
31
|
-
|
32
|
-
private
|
33
|
-
|
34
|
-
# Assumption: last digram of production isn't yet registered.
|
35
|
-
def add_production(aProduction)
|
36
|
-
super # Call original method from superclass...
|
37
|
-
|
38
|
-
# ... then add this behaviour
|
39
|
-
last_digram = aProduction.last_digram
|
40
|
-
digrams[last_digram.key] = last_digram
|
41
|
-
end
|
42
|
-
|
43
|
-
# Remove a production from the grammar
|
44
|
-
def remove_production(anIndex)
|
45
|
-
prod = productions[anIndex]
|
46
|
-
|
47
|
-
# Retrieve in the Hash all registered digrams from the removed production
|
48
|
-
digrams_subset = digrams.select do |_, digr|
|
49
|
-
digr.production == prod
|
50
|
-
end
|
51
23
|
|
52
|
-
|
53
|
-
|
54
|
-
|
24
|
+
CollisionDiagnosis = Struct.new(:collision_found, :digram, :productions)
|
25
|
+
|
26
|
+
|
27
|
+
# Assuming that a new input token was added to the start production,
|
28
|
+
# enforce the digram unicity and rule utility rules
|
29
|
+
# begin
|
30
|
+
# if a digram D occurs twice in the grammar then
|
31
|
+
# add a production P : D (if not already there)
|
32
|
+
# replace both Ds with P (reduction step).
|
33
|
+
# end
|
34
|
+
# if a production P : RHS is referenced only once then
|
35
|
+
# replace P by its RHS (derivation step)
|
36
|
+
# remove P from grammar
|
37
|
+
# end
|
38
|
+
# end until digram unicity and rule utility are met
|
39
|
+
def enforce_rules()
|
40
|
+
begin
|
41
|
+
unicity_diagnosis = detect_collision if unicity_diagnosis.nil?
|
42
|
+
restore_unicity(unicity_diagnosis) if unicity_diagnosis.collision_found
|
43
|
+
|
44
|
+
useless_prod = detect_useless_production
|
45
|
+
restore_utility(useless_prod) if useless_prod
|
46
|
+
|
47
|
+
unicity_diagnosis = detect_collision
|
48
|
+
useless_prod = detect_useless_production
|
49
|
+
|
50
|
+
end while unicity_diagnosis.collision_found || useless_prod
|
55
51
|
end
|
56
52
|
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
53
|
+
# Check whether a digram is used twice in the grammar.
|
54
|
+
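# Return a CollisionDiagnosis with collision_found set to false if each digram appears once.
|
55
|
+
# Otherwise return a CollisionDiagnosis with collision_found set to true, digram set to the duplicated digram and productions set to [Pi, Pk]
|
56
|
+
# Where Pi, Pk are the two productions where the digram occurs (they may be the same production).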
|
57
|
+
def detect_collision()
|
58
|
+
diagnosis = CollisionDiagnosis.new(false)
|
59
|
+
found_so_far = {}
|
60
|
+
productions.each do |a_prod|
|
61
|
+
prod_digrams = a_prod.digrams
|
62
|
+
prod_digrams.each do |a_digr|
|
63
|
+
its_key = a_digr.key
|
64
|
+
if found_so_far.include? its_key
|
65
|
+
orig_digr = found_so_far[its_key]
|
66
|
+
# Disregard sequence like a a a
|
67
|
+
if ((orig_digr.production == a_prod) && a_digr.repeating? &&
|
68
|
+
(orig_digr == a_digr))
|
69
|
+
next
|
70
|
+
end
|
71
|
+
|
72
|
+
diagnosis.digram = orig_digr
|
73
|
+
diagnosis.productions = [orig_digr.production, a_prod]
|
74
|
+
diagnosis.collision_found = true
|
75
|
+
break
|
76
|
+
else
|
77
|
+
found_so_far[its_key] = a_digr
|
78
|
+
end
|
72
79
|
end
|
80
|
+
break if diagnosis.collision_found
|
73
81
|
end
|
82
|
+
|
83
|
+
return diagnosis
|
74
84
|
end
|
75
85
|
|
76
|
-
#
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
# the matching digram
|
89
|
-
productions.each do |prod|
|
90
|
-
its_digrams = prod.digrams
|
91
|
-
its_keys = its_digrams.map(&:key)
|
92
|
-
next if prod == last_digram.production
|
93
|
-
next unless its_keys.include? last_digram.key
|
94
|
-
msg = "Digram #{last_digram.symbols} occurs three times!"
|
95
|
-
msg << "\nTwice in production #{aProduction.object_id}"
|
96
|
-
msg << "\nThird in production #{prod.object_id}"
|
97
|
-
msg << "\n#{to_string}"
|
98
|
-
fail StandardError, msg
|
86
|
+
# When a collision diagnosis indicates that a given
|
87
|
+
# digram d occurs twice in the grammar
|
88
|
+
# Then create a new production that will have
|
89
|
+
# the symbols of d as its rhs members.
|
90
|
+
def restore_unicity(aDiagnosis)
|
91
|
+
return if aDiagnosis.nil?
|
92
|
+
|
93
|
+
digr = aDiagnosis.digram
|
94
|
+
prods = aDiagnosis.productions
|
95
|
+
if prods.any?(&:single_digram?)
|
96
|
+
(simple, compound) = prods.partition do |a_prod|
|
97
|
+
a_prod.single_digram?
|
99
98
|
end
|
100
|
-
|
101
|
-
|
102
|
-
#
|
99
|
+
compound[0].replace_digram(simple[0])
|
100
|
+
else
|
101
|
+
# Create a new production with the digram's symbols as its
|
102
|
+
# sole rhs members.
|
103
103
|
new_prod = Production.new
|
104
|
-
new_prod.append_symbol(
|
105
|
-
new_prod.append_symbol(last_digram.symbols[1])
|
106
|
-
|
107
|
-
# ... replace duplicate digram by reference to new production
|
108
|
-
aProduction.replace_digram(new_prod)
|
104
|
+
digr.symbols.each { |sym| new_prod.append_symbol(sym) }
|
109
105
|
add_production(new_prod)
|
110
|
-
|
111
|
-
|
112
|
-
# Duplicate digram used in distinct production
|
113
|
-
# Two cases: other production is a single digram one or a multi-digram
|
114
|
-
other_prod = matching_digram.production
|
115
|
-
if other_prod.single_digram?
|
116
|
-
# ... replace duplicate digram by reference to other production
|
117
|
-
aProduction.replace_digram(other_prod)
|
118
|
-
update_digrams_from(aProduction)
|
119
|
-
|
120
|
-
# Special case a: replacement causes another digram duplication
|
121
|
-
# in the given production
|
122
|
-
# Special case b: replacement causes another digram duplication
|
123
|
-
# with other production
|
124
|
-
if aProduction.repeated_digram? ||
|
125
|
-
(digrams[aProduction.last_digram.key].production != aProduction)
|
126
|
-
preserve_unicity(aProduction)
|
127
|
-
end
|
128
|
-
|
106
|
+
if prods[0] == prods[1]
|
107
|
+
prods[0].replace_digram(new_prod)
|
129
108
|
else
|
130
|
-
|
131
|
-
# Then create a new production with the digram as its rhs
|
132
|
-
new_prod = Production.new
|
133
|
-
new_prod.append_symbol(last_digram.symbols[0])
|
134
|
-
new_prod.append_symbol(last_digram.symbols[1])
|
135
|
-
|
136
|
-
# ... replace duplicate digram by reference to new production
|
137
|
-
aProduction.replace_digram(new_prod)
|
138
|
-
other_prod.replace_digram(new_prod)
|
139
|
-
add_production(new_prod)
|
140
|
-
update_digrams_from(aProduction)
|
141
|
-
|
142
|
-
# TODO: Check when aProduction and other_prod have same preceding symbol
|
143
|
-
update_digrams_from(other_prod)
|
109
|
+
prods.each { |a_prod| a_prod.replace_digram(new_prod) }
|
144
110
|
end
|
145
111
|
end
|
146
112
|
end
|
147
113
|
|
148
|
-
#
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
# - Detect occurrence of any production in the rhs
|
153
|
-
# - Identify the occurring production
|
154
|
-
# - In the occurrence hash push the production id of the lhs
|
155
|
-
# Select each production that occurs once (singleton rule):
|
156
|
-
# Replace the occurrence in the rhs by the rhs of the singleton rule
|
157
|
-
# Delete the singleton rule
|
158
|
-
# Update digrams
|
159
|
-
def enforce_rule_utility()
|
160
|
-
return if productions.size < 2
|
161
|
-
|
162
|
-
loop do
|
163
|
-
all_refcount_ok = true
|
164
|
-
(1...productions.size).to_a.reverse.each do |index|
|
165
|
-
curr_production = productions[index]
|
166
|
-
next unless curr_production.refcount == 1
|
167
|
-
|
168
|
-
all_refcount_ok = false
|
169
|
-
dependent = productions.find do |a_prod|
|
170
|
-
!a_prod.references_of(curr_production).empty?
|
171
|
-
end
|
172
|
-
dependent.replace_production(productions[index])
|
173
|
-
remove_production(index)
|
174
|
-
update_digrams_from(dependent)
|
175
|
-
end
|
176
|
-
|
177
|
-
break if all_refcount_ok
|
178
|
-
end
|
114
|
+
# Return a production that is used less than twice in the grammar.
|
115
|
+
def detect_useless_production()
|
116
|
+
useless = productions.find { |prod| prod.refcount < 2 }
|
117
|
+
return (useless == productions[0]) ? nil : useless
|
179
118
|
end
|
180
119
|
|
181
|
-
|
182
|
-
#
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
120
|
+
# Given the passed production P is referenced only once.
|
121
|
+
# Then replace P by its RHS where it is referenced.
|
122
|
+
# And delete P
|
123
|
+
def restore_utility(useless_prod)
|
124
|
+
# Retrieve index of useless_prod
|
125
|
+
index = productions.index(useless_prod)
|
126
|
+
|
127
|
+
# Retrieve production referencing useless one
|
128
|
+
referencing = nil
|
129
|
+
productions.each do |a_prod|
|
130
|
+
# Next line assumes non-recursive productions
|
131
|
+
next if a_prod == useless_prod
|
132
|
+
|
133
|
+
refs = a_prod.references_of(useless_prod)
|
134
|
+
next if refs.empty?
|
135
|
+
referencing = a_prod
|
136
|
+
break
|
189
137
|
end
|
190
138
|
|
191
|
-
|
192
|
-
|
193
|
-
digr.production == aProduction
|
194
|
-
end
|
195
|
-
|
196
|
-
# Remove obsolete digrams
|
197
|
-
current_keys = current_digrams.map(&:key)
|
198
|
-
digrams_subset.keys.each do |a_key|
|
199
|
-
digrams.delete(a_key) unless current_keys.include? a_key
|
200
|
-
end
|
139
|
+
referencing.replace_production(useless_prod)
|
140
|
+
remove_production(index)
|
201
141
|
end
|
202
142
|
|
143
|
+
|
203
144
|
end # class
|
204
145
|
|
205
146
|
end # module
|
@@ -22,10 +22,18 @@ describe Digram do
|
|
22
22
|
instance = Digram.new(:b, :c, production)
|
23
23
|
expect(instance.production).to eq(production)
|
24
24
|
end
|
25
|
+
|
26
|
+
it 'should whether its symbols are the same' do
|
27
|
+
instance1 = Digram.new(:a, :a, production)
|
28
|
+
expect(instance1).to be_repeating
|
29
|
+
|
30
|
+
instance1 = Digram.new(:a, :b, production)
|
31
|
+
expect(instance1).not_to be_repeating
|
32
|
+
end
|
25
33
|
|
26
34
|
end # context
|
27
35
|
|
28
|
-
context '
|
36
|
+
context 'Provided services:' do
|
29
37
|
|
30
38
|
it 'should compare itself to another digram' do
|
31
39
|
instance1 = Digram.new(:a, :b, production)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sequitur
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.03
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitri Geshef
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-09-
|
11
|
+
date: 2014-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|