sequitur 0.0.04
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.rspec +1 -0
- data/.rubocop.yml +74 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.simplecov +7 -0
- data/.travis.yml +15 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +9 -0
- data/LICENSE.txt +19 -0
- data/README.md +10 -0
- data/Rakefile +31 -0
- data/lib/sequitur.rb +8 -0
- data/lib/sequitur/constants.rb +26 -0
- data/lib/sequitur/digram.rb +29 -0
- data/lib/sequitur/dynamic-grammar.rb +126 -0
- data/lib/sequitur/production.rb +202 -0
- data/lib/sequitur/sequitur-grammar.rb +332 -0
- data/spec/sequitur/digram_spec.rb +33 -0
- data/spec/sequitur/dynamic-grammar_spec.rb +123 -0
- data/spec/sequitur/production_spec.rb +271 -0
- data/spec/sequitur/sequitur-grammar_spec.rb +213 -0
- data/spec/spec_helper.rb +21 -0
- metadata +134 -0
@@ -0,0 +1,332 @@
|
|
1
|
+
require_relative 'dynamic-grammar'
|
2
|
+
|
3
|
+
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
|
+
|
5
|
+
class SequiturGrammar < DynamicGrammar
|
6
|
+
# A hash with pairs of the form: digram key => digram
|
7
|
+
attr_reader(:digrams)
|
8
|
+
|
9
|
+
# The input
|
10
|
+
attr_reader(:parsed)
|
11
|
+
|
12
|
+
# Constructor. Feed every token delivered by the given enumerator
# into a freshly created grammar.
# @param anEnum [#each] source of the input tokens
def initialize(anEnum)
  super()

  # The start production must comply with the rule-utility constraint:
  # register it twice as a back-reference of itself.
  start_rule = root
  start_rule.add_backref(start_rule)
  start_rule.add_backref(start_rule)

  @digrams = {}
  @parsed = []
  anEnum.each do |token|
    add_token(token)
  end
end
|
22
|
+
|
23
|
+
public
|
24
|
+
|
25
|
+
# Record the token in the list of parsed input, then let the
# superclass add it to the grammar.
# @param aToken [Object] the next input token
def add_token(aToken)
  parsed.push(aToken)
  super(aToken)
end
|
30
|
+
|
31
|
+
# Invariant check:
# a digram found in any rhs may occur at most once in the whole grammar.
# @raise [StandardError] when a digram key is met a second time
def check_unicity
  seen = {}
  productions.each do |a_prod|
    a_prod.digrams.each do |a_digram|
      key = a_digram.key
      unless seen.key?(key)
        # First sighting of that digram: remember it
        seen[key] = a_digram
        next
      end

      msg = "Digram #{a_digram.symbols} occurs twice!" \
            "\nOnce in production #{seen[key].production_id}" \
            "\nSecond in production #{a_prod.object_id}" \
            "\n#{to_string}"
      raise StandardError, msg
    end
  end
end
|
51
|
+
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
# Add the production through the superclass, then register its last
# digram in the digrams Hash.
# Assumption: the last digram of the production isn't registered yet.
# @param aProduction [Production] the production to add
def add_production(aProduction)
  super(aProduction)

  tail = aProduction.last_digram
  digrams.store(tail.key, tail)
end
|
63
|
+
|
64
|
+
# Remove the production found at the given index from the grammar.
# Every registered digram that belongs to that production is
# unregistered first, then the superclass performs the removal.
# @param anIndex [Integer] position of the production in productions
def delete_production(anIndex)
  doomed_id = productions[anIndex].object_id

  # Drop all digrams owned by the production about to disappear
  digrams.delete_if { |_key, digr| digr.production_id == doomed_id }
  super(anIndex)
end
|
77
|
+
|
78
|
+
# Append the symbol to the production (via the superclass), then look at
# the last digram that the append produces: register it when it is new,
# otherwise repair the grammar constraints.
# @param aProduction [Production] production receiving the symbol
# @param aSymbol [Object] the symbol to append
def append_symbol_to(aProduction, aSymbol)
  # Digrams are computed before the symbol is actually appended
  new_digrams = aProduction.calc_append_symbol(aSymbol)
  check_digrams # TODO: remove this
  check_backrefs # TODO: remove this
  super(aProduction, aSymbol)
  return if new_digrams.empty?

  tail = new_digrams.last
  tail_key = tail.key
  if digrams[tail_key].nil?
    # ... No registered occurrence of the digram, then register it
    digrams[tail_key] = tail
  else
    # Digram is already registered...
    # the digram unicity rule is broken: fix this
    preserve_unicity(aProduction)
    enforce_rule_utility
  end
end
|
97
|
+
|
98
|
+
# The given production breaks the digram unicity rule.
# Fix this either by creating a new production having the duplicate
# digram as its rhs or by referencing such a production,
# then by replacing all occurrences of the digram by a reference to
# the fixing production.
# Pre-condition: the given production has a repeated digram
# or its last digram is used elsewhere.
# @param aProduction [Production] the offending production
# @raise [StandardError] when the digram is found a third time
def preserve_unicity(aProduction)
  tail = aProduction.last_digram
  registered = digrams[tail.key]

  if tail.production_id == registered.production_id
    # Rule: no other production distinct from aProduction should have
    # the matching digram
    productions.each do |candidate|
      candidate_keys = candidate.digrams.map(&:key)

      # TODO: check that digram really occurs twice in the production.
      # occurrences = candidate_keys.select { |a_key| a_key == tail.key }
      # if occurrences.size != 2
      #   msg = "Digram #{tail.symbols} should occur twice"
      #   msg << "\nin production #{aProduction.object_id}"
      #   msg << "\nBut occurs #{occurrences.size}"
      #   msg << "\n#{self.to_string}"
      #   fail StandardError, msg
      # end
      next if candidate.object_id == tail.production_id
      next unless candidate_keys.include?(tail.key)

      msg = "Digram #{tail.symbols} occurs three times!"
      msg << "\nTwice in production #{aProduction.object_id}"
      msg << "\nThird in production #{candidate.object_id}"
      msg << "\n#{to_string}"
      raise StandardError, msg
    end

    # Digram appears twice in given production...
    # Then create a new production with the digram as its rhs
    fresh_prod = Production.new
    [0, 1].each { |pos| fresh_prod.append_symbol(tail.symbols[pos]) }

    # ... replace duplicate digram by reference to new production
    aProduction.replace_digram(fresh_prod)
    add_production(fresh_prod)
    update_digrams_from(aProduction)
    check_digrams # TODO: remove
    check_unicity
  else
    # Duplicate digram used in distinct production
    # Two cases: other production is a single digram one or a multi-digram
    other_prod = ObjectSpace._id2ref(registered.production_id)
    if other_prod.single_digram?
      # ... replace duplicate digram by reference to other production
      aProduction.replace_digram(other_prod)
      update_digrams_from(aProduction)

      # Special case a: replacement causes another digram duplication
      # in the given production
      # Special case b: replacement causes another digram duplication
      # with other production
      needs_refix = aProduction.repeated_digram? ||
                    digrams[aProduction.last_digram.key]
      preserve_unicity(aProduction) if needs_refix

      check_references # TODO: remove this
    else
      # aProduction, other_prod use both the same digram
      # Then create a new production with the digram as its rhs
      fresh_prod = Production.new
      [0, 1].each { |pos| fresh_prod.append_symbol(tail.symbols[pos]) }

      # ... replace duplicate digram by reference to new production
      aProduction.replace_digram(fresh_prod)
      other_prod.replace_digram(fresh_prod)
      add_production(fresh_prod)
      update_digrams_from(aProduction)

      # TODO: Check when aProduction and other_prod have same preceding symbol
      update_digrams_from(other_prod)
      check_backrefs # TODO: remove this
    end
    check_unicity
  end

  check_unicity
  check_registered
end
|
189
|
+
|
190
|
+
# Rule utility: except for the root production, every production must
# occur multiple times in all the rhs.
# The productions (root excluded) are scanned from last to first.
# Each production referenced exactly once (singleton rule) is handled so:
#   - The production holding the single reference is retrieved
#   - That production replaces the reference by the singleton's rhs
#   - The singleton rule is deleted
#   - The digrams are updated
# Scans are repeated until one pass meets no singleton rule.
def enforce_rule_utility
  return if productions.size < 2

  check_references

  stable = false
  until stable
    stable = true
    # Walk backwards so that a deletion never shifts a pending index
    (productions.size - 1).downto(1) do |index|
      singleton = productions[index]
      next unless singleton.refcount == 1

      stable = false
      referrer_id = singleton.backrefs.keys.first
      dependent = ObjectSpace._id2ref(referrer_id)
      dependent.replace_production(singleton)
      delete_production(index)
      update_digrams_from(dependent)
      check_references
      check_backrefs
    end
  end
end
|
223
|
+
|
224
|
+
|
225
|
+
# Synchronize the digrams Hash with the digrams currently found
# in the given production.
# @param aProduction [Production] production whose digrams are synced
def update_digrams_from(aProduction)
  fresh_digrams = aProduction.digrams

  # Register digrams that aren't known yet
  fresh_digrams.each do |digr|
    next if digrams.key?(digr.key)

    digrams[digr.key] = digr
  end

  # Keys of all registered digrams owned by the production
  owned_keys = digrams.select do |_key, digr|
    digr.production_id == aProduction.object_id
  end.keys

  # Unregister the ones that the production doesn't contain anymore
  live_keys = fresh_digrams.map(&:key)
  owned_keys.each do |a_key|
    digrams.delete(a_key) unless live_keys.include?(a_key)
  end
end
|
245
|
+
|
246
|
+
# Invariant check:
# a production referenced in any rhs must belong to the grammar.
# @raise [StandardError] when a rhs references an unknown production
def check_references
  productions.each do |a_prod|
    a_prod.references.each do |referenced_prod|
      unless productions.include?(referenced_prod)
        msg = "Production #{a_prod.object_id} references the " \
              "unknown production #{referenced_prod.object_id}" \
              "\nOrphan production: #{referenced_prod.to_string}" \
              "\n#{to_string}"
        raise StandardError, msg
      end
    end
  end
end
|
263
|
+
|
264
|
+
# Invariant check:
# the production id held by any registered digram must be the id
# of one of the grammar's productions.
# @raise [StandardError] when a digram points to an unknown production
def check_registered
  digrams.each_value do |digr|
    known = productions.any? do |a_prod|
      digr.production_id == a_prod.object_id
    end
    next if known

    msg = "Digram #{digr.symbols} references the unknown " \
          "production (#{digr.production_id})." \
          "\n#{to_string}"
    raise StandardError, msg
  end
end
|
279
|
+
|
280
|
+
# Cross-check the digrams Hash against the digrams that are really
# present in the productions of the grammar.
# @raise [StandardError] when both views disagree
def check_digrams
  # Step 1: a registered digram must refer to a production
  # that really contains that digram
  digrams.each do |key, digr|
    owner_digrams = ObjectSpace._id2ref(digr.production_id).digrams
    next if owner_digrams.map(&:key).include?(key)

    msg = "Production #{digr.production_id} doesn't have " \
          "the digram #{digr.symbols}" \
          "\n#{owner_digrams.map(&:symbols)}" \
          "\n#{to_string}"
    raise StandardError, msg
  end

  # Step 2: collect the digrams of all productions; a key met twice
  # triggers the unicity check
  all_digrams = productions.each_with_object({}) do |a_prod, collected|
    a_prod.digrams.each do |digr|
      check_unicity if collected[digr.key]
      collected[digr.key] = digr
    end
  end

  # Step 3: a digram from a production must be registered,
  # and registered for that very production
  all_digrams.each do |key, digr|
    registered = digrams[key]
    unless registered
      its_prod = ObjectSpace._id2ref(digr.production_id)
      msg = "Production #{its_prod.object_id} (#{its_prod.rhs}) " \
            "has the digram #{digr.symbols} that isn't registered." \
            "\n#{to_string}"
      raise StandardError, msg
    end
    next if registered.production_id == digr.production_id

    msg = "Production #{digr.production_id} has " \
          "the digram #{digr.symbols} that collides" \
          "\n with same digram from #{registered.production_id}" \
          "\n#{to_string}"
    raise StandardError, msg
  end
end
|
326
|
+
|
327
|
+
|
328
|
+
end # class
|
329
|
+
|
330
|
+
end # module
|
331
|
+
|
332
|
+
# End of file
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
# Load the class under test
|
4
|
+
require_relative '../../lib/sequitur/digram'
|
5
|
+
|
6
|
+
module Sequitur # Re-open the module to get rid of qualified names

# Specification of Digram: construction from two symbols and a production.
describe Digram do
  # The pair of symbols every sample digram is built from
  let(:two_symbols) { [:b, :c] }

  # Stand-in for the production that owns the digram
  let(:production) { double('sample-production') }

  # The digram under test
  subject { Digram.new(:b, :c, production) }

  context 'Standard creation & initialization:' do
    it 'should be created with 3 arguments' do
      expect(subject.symbols).to eq(two_symbols)
      expect(subject.production_id).to eq(production.object_id)
    end

    it 'should return the production that it refers to' do
      expect(subject.production).to eq(production)
    end
  end # context
end # describe

end # module
|
32
|
+
|
33
|
+
# End of file
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
# Load the class under test
|
4
|
+
require_relative '../../lib/sequitur/dynamic-grammar'
|
5
|
+
|
6
|
+
module Sequitur # Re-open the module to get rid of qualified names

# Specification of DynamicGrammar: creation, adding and removing
# productions, and text rendering.
describe DynamicGrammar do
  # Factory method. Build a production with the given sequence
  # of symbols as its rhs.
  def build_production(*symbols)
    prod = Production.new
    symbols.each { |symb| prod.append_symbol(symb) }
    prod
  end

  # Three single-symbol productions...
  let(:p_a) { build_production(:a) }
  let(:p_b) { build_production(:b) }
  let(:p_c) { build_production(:c) }

  # ... and one production whose rhs references p_b and p_c
  let(:p_bc) { build_production(p_b, p_c) }

  context 'Creation & initialization:' do
    it 'should be created without parameter' do
      expect { DynamicGrammar.new }.not_to raise_error
    end

    it 'should have an empty root/start production' do
      expect(subject.root).to be_empty
      expect(subject.productions.size).to eq(1)
      expect(subject.productions.first).to be_empty
    end
  end # context

  context 'Adding productions to the grammar:' do
    it 'should add a simple production' do
      subject.add_production(p_a)
      expect(subject.productions.size).to eq(2)
      expect(subject.productions.last).to eq(p_a)

      # Error: p_b, p_c not in grammar
      # The call must go through subject: a bare add_production would be
      # sent to the example group instead of the grammar, and the resulting
      # NoMethodError would satisfy raise_error(StandardError) vacuously.
      expect { subject.add_production(p_bc) }.to raise_error(StandardError)

      subject.add_production(p_b)
      expect(subject.productions.size).to eq(3)
      expect(subject.productions.last).to eq(p_b)

      # Error: p_c not in grammar
      expect { subject.add_production(p_bc) }.to raise_error(StandardError)

      subject.add_production(p_c)
      expect(subject.productions.size).to eq(4)
      expect(subject.productions.last).to eq(p_c)

      subject.add_production(p_bc)
      expect(subject.productions.size).to eq(5)
      expect(subject.productions.last).to eq(p_bc)
    end

    it 'should complain when rhs refers to unknown production' do
      subject.add_production(p_a)
      subject.add_production(p_b)
      # Test fails because of Production#references
      msg = "Production #{p_bc.object_id} refers to production #{p_c.object_id}"
      msg << ' that is not part of the grammar.'
      expect { subject.add_production(p_bc) }.to raise_error(StandardError, msg)
    end
  end # context

  context 'Removing a production from the grammar:' do
    it 'should remove an existing production' do
      subject.add_production(p_a) # index = 1
      subject.add_production(p_b) # index = 2
      subject.add_production(p_c) # index = 3
      subject.add_production(p_bc) # index = 4
      expect(subject.productions.size).to eq(5)

      # p_bc references p_b and p_c once each; nothing references p_a
      expect(p_a.refcount).to eq(0)
      expect(p_b.refcount).to eq(1)
      expect(p_c.refcount).to eq(1)

      subject.delete_production(1) # 1 => p_a
      expect(subject.productions.size).to eq(4)
      expect(p_b.refcount).to eq(1)
      expect(p_c.refcount).to eq(1)
      expect(subject.productions).not_to include(p_a)

      # Indices shifted by one after the first deletion
      subject.delete_production(3) # 3 => p_bc

      expect(subject.productions.size).to eq(3)
      expect(p_b.refcount).to eq(0)
      expect(p_c.refcount).to eq(0)
      expect(subject.productions).not_to include(p_bc)
    end
  end # context

  context 'Generating a text representation of itself:' do
    it 'should generate a text representation when empty' do
      expectation = "#{subject.root.object_id} : ."
      expect(subject.to_string).to eq(expectation)
    end

    # it 'should generate a text representation of a simple production' do
    #   instance = SequiturGrammar.new([:a].to_enum)
    #   expectation = "#{instance.root.object_id} : a."
    #   expect(instance.to_string).to eq(expectation)
    # end
  end # context
end # describe

end # module
|
122
|
+
|
123
|
+
# End of file
|