sequitur 0.0.04
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.rspec +1 -0
- data/.rubocop.yml +74 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.simplecov +7 -0
- data/.travis.yml +15 -0
- data/.yardopts +6 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +9 -0
- data/LICENSE.txt +19 -0
- data/README.md +10 -0
- data/Rakefile +31 -0
- data/lib/sequitur.rb +8 -0
- data/lib/sequitur/constants.rb +26 -0
- data/lib/sequitur/digram.rb +29 -0
- data/lib/sequitur/dynamic-grammar.rb +126 -0
- data/lib/sequitur/production.rb +202 -0
- data/lib/sequitur/sequitur-grammar.rb +332 -0
- data/spec/sequitur/digram_spec.rb +33 -0
- data/spec/sequitur/dynamic-grammar_spec.rb +123 -0
- data/spec/sequitur/production_spec.rb +271 -0
- data/spec/sequitur/sequitur-grammar_spec.rb +213 -0
- data/spec/spec_helper.rb +21 -0
- metadata +134 -0
@@ -0,0 +1,332 @@
|
|
1
|
+
require_relative 'dynamic-grammar'
|
2
|
+
|
3
|
+
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
|
+
|
5
|
+
class SequiturGrammar < DynamicGrammar
|
6
|
+
# A hash with pairs of the form: digram key => digram
|
7
|
+
attr_reader(:digrams)
|
8
|
+
|
9
|
+
# The input tokens consumed so far, in the order they were added
|
10
|
+
attr_reader(:parsed)
|
11
|
+
|
12
|
+
# Constructor. Builds the grammar by feeding it, one by one, every token
# yielded by the given enumerator.
# @param anEnum [#each] the source of input tokens.
def initialize(anEnum)
  super()

  # Make the start production compliant with the rule utility constraint
  # by registering two back-references from the root to itself.
  root.add_backref(root)
  root.add_backref(root)

  @digrams = {}
  @parsed = []
  anEnum.each do |token|
    add_token(token)
  end
end
|
22
|
+
|
23
|
+
public
|
24
|
+
|
25
|
+
# Add the given token to the grammar.
# The token is first recorded in the list of parsed tokens, then the
# superclass implementation takes care of the actual insertion.
# @param aToken [Object] the next input token.
def add_token(aToken)
  parsed.push(aToken)
  super(aToken)
end
|
30
|
+
|
31
|
+
# Check the digram unicity invariant:
# every digram appearing in a rhs must occur at most once in the grammar.
# Walks the digrams of all productions and remembers each digram key
# the first time it is met.
# @raise [StandardError] when a digram key is met a second time. The
#   message names both occurrences and includes a dump of the grammar.
def check_unicity
  seen = {}
  productions.each do |prod|
    prod.digrams.each do |digram|
      unless seen.include?(digram.key)
        # First sighting: remember it and move on.
        seen[digram.key] = digram
        next
      end

      report = "Digram #{digram.symbols} occurs twice!"
      first_sighting = seen[digram.key]
      report << "\nOnce in production #{first_sighting.production_id}"
      report << "\nSecond in production #{prod.object_id}"
      report << "\n#{to_string}"
      raise StandardError, report
    end
  end
end
|
51
|
+
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
# Add the given production to the grammar (superclass behaviour), then
# register its last digram in the digrams Hash.
# Assumption: the last digram of the production isn't yet registered;
# an existing entry with the same key is overwritten.
# @param aProduction [Production] the production to add.
def add_production(aProduction)
  super(aProduction)

  tail = aProduction.last_digram
  digrams.store(tail.key, tail)
end
|
63
|
+
|
64
|
+
# Remove a production from the grammar.
# Every digram registered on behalf of that production is dropped from
# the digrams Hash before the superclass performs the removal itself.
# @param anIndex [Integer] position of the production in `productions`.
def delete_production(anIndex)
  doomed_id = productions[anIndex].object_id

  # Forget all registered digrams that belong to the removed production
  digrams.delete_if { |_key, digram| digram.production_id == doomed_id }

  super(anIndex)
end
|
77
|
+
|
78
|
+
# Append the given symbol to the rhs of the given production and keep
# the digrams Hash consistent with the result.
# When the newly formed last digram is unknown, it is simply registered.
# When it is already registered, the digram unicity rule is broken:
# the grammar is repaired, then the rule utility is enforced.
# @param aProduction [Production] the production being extended.
# @param aSymbol [Object] the symbol to append.
def append_symbol_to(aProduction, aSymbol)
  # presumably the digrams the production will hold once the symbol is
  # appended -- confirm in Production#calc_append_symbol
  upcoming = aProduction.calc_append_symbol(aSymbol)
  check_digrams # TODO: remove this
  check_backrefs # TODO: remove this
  super(aProduction, aSymbol)
  return if upcoming.empty?

  newest = upcoming.last
  registered = digrams[newest.key]
  if registered.nil?
    # ... No registered occurrence of the digram, then register it
    digrams[newest.key] = newest
  else
    # Digram is already registered...
    # the digram unicity rule is broken: fix this
    preserve_unicity(aProduction)
    enforce_rule_utility
  end
end
|
97
|
+
|
98
|
+
# The given production breaks the digram unicity rule.
# Fix this either by creating a new production having the duplicate
# digram as its rhs or by referencing such a production,
# then by replacing all occurrences of the digram by a reference to
# the fixing production.
#
# Three situations are handled:
# - the digram is repeated inside the given production: a new production
#   is created for it and the given production references it;
# - the digram is registered for another production that consists of that
#   single digram: the given production references that other production;
# - the digram is registered for another, longer production: a new
#   production is created and both productions reference it.
#
# Pre-condition: the given production has a repeated digram
# or its last digram is used elsewhere (i.e. it is registered in `digrams`).
# @param aProduction [Production] the production breaking the rule.
# @raise [StandardError] when one of the invariant checks fails.
def preserve_unicity(aProduction)
  last_digram = aProduction.last_digram
  matching_digram = digrams[last_digram.key]

  if last_digram.production_id == matching_digram.production_id
    # Digram appears twice in given production...
    # Rule: no other production distinct from aProduction should have
    # the matching digram
    # TODO: check that digram really occurs twice in the production.
    reject_third_occurrence(aProduction, last_digram)

    # Then create a new production with the digram as its rhs
    new_prod = production_from_digram(last_digram)

    # ... replace duplicate digram by reference to new production
    aProduction.replace_digram(new_prod)
    add_production(new_prod)
    update_digrams_from(aProduction)
    check_digrams # TODO: remove
  else
    # Duplicate digram used in distinct production
    # Two cases: other production is a single digram one or a multi-digram
    other_prod = ObjectSpace._id2ref(matching_digram.production_id)
    if other_prod.single_digram?
      # ... replace duplicate digram by reference to other production
      aProduction.replace_digram(other_prod)
      update_digrams_from(aProduction)

      # Special case a: replacement causes another digram duplication
      # in the given production
      # Special case b: replacement causes another digram duplication
      # with other production
      # NOTE(review): update_digrams_from has just registered the digrams
      # of aProduction, so the Hash lookup below looks like it is always
      # truthy -- confirm whether the recursion is meant to be this eager.
      if aProduction.repeated_digram? || digrams[aProduction.last_digram.key]
        preserve_unicity(aProduction)
      end

      check_references # TODO: remove this
    else
      # aProduction, other_prod use both the same digram
      # Then create a new production with the digram as its rhs
      new_prod = production_from_digram(last_digram)

      # ... replace duplicate digram by reference to new production
      aProduction.replace_digram(new_prod)
      other_prod.replace_digram(new_prod)
      add_production(new_prod)
      update_digrams_from(aProduction)

      # TODO: Check when aProduction and other_prod have same preceding symbol
      update_digrams_from(other_prod)
      check_backrefs # TODO: remove this
    end
  end

  # One unicity check per invocation is enough: nothing modifies the
  # grammar between the end of the branches above and this point.
  check_unicity
  check_registered
end

# Fail when a production other than the owner of the given digram
# also holds a digram with the same key.
# @param aProduction [Production] the production holding the digram twice.
# @param aDigram [Digram] the repeated digram.
# @raise [StandardError] when a third occurrence is found elsewhere.
def reject_third_occurrence(aProduction, aDigram)
  productions.each do |prod|
    next if prod.object_id == aDigram.production_id
    next unless prod.digrams.map(&:key).include?(aDigram.key)

    msg = "Digram #{aDigram.symbols} occurs three times!"
    msg << "\nTwice in production #{aProduction.object_id}"
    msg << "\nThird in production #{prod.object_id}"
    msg << "\n#{to_string}"
    raise StandardError, msg
  end
end

# Build a new production whose rhs consists of the two symbols
# of the given digram. The production isn't added to the grammar.
# @param aDigram [Digram] the digram to turn into a production.
# @return [Production] the freshly created production.
def production_from_digram(aDigram)
  new_prod = Production.new
  new_prod.append_symbol(aDigram.symbols[0])
  new_prod.append_symbol(aDigram.symbols[1])
  new_prod
end
|
189
|
+
|
190
|
+
# Rule utility: except for the root production, every production must occur
# multiple times in all the rhs.
# The productions are swept from the last one down to index 1 (index 0 is
# never examined). For each production that is referenced exactly once:
#   - retrieve the single production that refers to it,
#   - replace that reference by the rhs of the singleton production,
#   - delete the singleton production,
#   - update the registered digrams of the referring production.
# Sweeps are repeated until one of them finds no singleton production.
def enforce_rule_utility
  return if productions.size < 2

  check_references

  loop do
    inlined_any = false
    (productions.size - 1).downto(1) do |position|
      candidate = productions[position]
      next unless candidate.refcount == 1

      inlined_any = true
      # backrefs is keyed by the object id of the referring production
      user = ObjectSpace._id2ref(candidate.backrefs.keys.first)
      user.replace_production(candidate)
      delete_production(position)
      update_digrams_from(user)
      check_references
      check_backrefs
    end

    break unless inlined_any
  end
end
|
223
|
+
|
224
|
+
|
225
|
+
# Synchronize the digrams Hash with the digrams of the given production:
# - digrams of the production whose key is unknown get registered
#   (an already registered key is left untouched),
# - digrams registered for the production that it no longer holds
#   are removed.
# @param aProduction [Production] the production that was modified.
def update_digrams_from(aProduction)
  fresh = aProduction.digrams

  # Add new digrams
  fresh.each do |digram|
    next if digrams.include?(digram.key)

    digrams[digram.key] = digram
  end

  # Keys of all digrams currently registered for the production
  owned_keys = digrams.keys.select do |key|
    digrams[key].production_id == aProduction.object_id
  end

  # Remove obsolete digrams
  live_keys = fresh.map(&:key)
  owned_keys.each do |key|
    digrams.delete(key) unless live_keys.include?(key)
  end
end
|
245
|
+
|
246
|
+
# Check the invariant:
# every production referenced in a rhs must be a production of the grammar.
# @raise [StandardError] for the first referenced production that isn't
#   found in `productions`. The message includes a dump of the grammar.
def check_references
  productions.each do |prod|
    orphans = prod.references.reject { |target| productions.include?(target) }
    orphans.each do |orphan|
      report = "Production #{prod.object_id} references the "
      report << "unknown production #{orphan.object_id}"
      report << "\nOrphan production: #{orphan.to_string}"
      report << "\n#{to_string}"
      raise StandardError, report
    end
  end
end
|
263
|
+
|
264
|
+
# Check the invariant:
# every registered digram must reference a production from the grammar.
# @raise [StandardError] for the first registered digram whose production
#   id matches no production of the grammar.
def check_registered
  digrams.each do |_key, digram|
    owner_known = productions.any? do |prod|
      digram.production_id == prod.object_id
    end
    next if owner_known

    report = "Digram #{digram.symbols} references the unknown "
    report << "production (#{digram.production_id})."
    report << "\n#{to_string}"
    raise StandardError, report
  end
end
|
279
|
+
|
280
|
+
# Compare the contents of the digrams Hash with
# all digrams from all productions. Three controls are performed:
# 1. every registered digram refers to a production that really has it,
# 2. no digram key occurs twice among the productions
#    (delegated to check_unicity as soon as a repetition is seen),
# 3. every digram of every production is registered, and registered
#    for that very production.
# @raise [StandardError] as soon as one of the controls fails.
def check_digrams
  # Control that every registered digram refers
  # to a production that really has that digram
  digrams.each do |key, registered|
    owner_digrams = ObjectSpace._id2ref(registered.production_id).digrams
    next if owner_digrams.map(&:key).include?(key)

    report = "Production #{registered.production_id} doesn't have "
    report << "the digram #{registered.symbols}"
    report << "\n#{owner_digrams.map(&:symbols)}"
    report << "\n#{to_string}"
    raise StandardError, report
  end

  # Collect the digrams of all productions, indexed by key
  actual_digrams = {}
  productions.each do |prod|
    prod.digrams.each do |digram|
      check_unicity if actual_digrams[digram.key]
      actual_digrams[digram.key] = digram
    end
  end

  # Control that each collected digram is registered for its production
  actual_digrams.each do |key, actual|
    registered = digrams[key]
    unless registered
      owner = ObjectSpace._id2ref(actual.production_id)
      report = "Production #{owner.object_id} (#{owner.rhs}) "
      report << "has the digram #{actual.symbols} that isn't registered."
      report << "\n#{to_string}"
      raise StandardError, report
    end
    next if registered.production_id == actual.production_id

    report = "Production #{actual.production_id} has "
    report << "the digram #{actual.symbols} that collides"
    report << "\n with same digram from #{registered.production_id}"
    report << "\n#{to_string}"
    raise StandardError, report
  end
end
|
326
|
+
|
327
|
+
|
328
|
+
end # class
|
329
|
+
|
330
|
+
end # module
|
331
|
+
|
332
|
+
# End of file
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
# Load the class under test
|
4
|
+
require_relative '../../lib/sequitur/digram'
|
5
|
+
|
6
|
+
module Sequitur # Re-open the module to get rid of qualified names
|
7
|
+
|
8
|
+
describe Digram do
  # The pair of symbols every sample digram is built from
  let(:two_symbols) { [:b, :c] }

  # Stand-in for the production owning the digram
  let(:production) { double('sample-production') }

  # Default instantiation rule: two symbols and the owning production
  subject { Digram.new(:b, :c, production) }

  context 'Standard creation & initialization:' do
    it 'should be created with 3 arguments' do
      expect(subject.symbols).to eq(two_symbols)
      expect(subject.production_id).to eq(production.object_id)
    end

    it 'should return the production that it refers to' do
      expect(subject.production).to eq(production)
    end
  end # context
end # describe
|
30
|
+
|
31
|
+
end # module
|
32
|
+
|
33
|
+
# End of file
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
# Load the class under test
|
4
|
+
require_relative '../../lib/sequitur/dynamic-grammar'
|
5
|
+
|
6
|
+
module Sequitur # Re-open the module to get rid of qualified names
|
7
|
+
|
8
|
+
describe DynamicGrammar do
  # Factory method. Build a production with the given sequence
  # of symbols as its rhs.
  def build_production(*symbols)
    prod = Production.new
    symbols.each { |symb| prod.append_symbol(symb) }
    prod
  end

  # Sample productions: three single-symbol ones and
  # one that references p_b and p_c.
  let(:p_a) { build_production(:a) }
  let(:p_b) { build_production(:b) }
  let(:p_c) { build_production(:c) }
  let(:p_bc) { build_production(p_b, p_c) }

  context 'Creation & initialization:' do
    it 'should be created without parameter' do
      expect { DynamicGrammar.new }.not_to raise_error
    end

    it 'should have an empty root/start production' do
      expect(subject.root).to be_empty
      expect(subject.productions.size).to eq(1)
      expect(subject.productions.first).to be_empty
    end
  end # context

  context 'Adding productions to the grammar:' do
    it 'should add a simple production' do
      subject.add_production(p_a)
      expect(subject.productions.size).to eq(2)
      expect(subject.productions.last).to eq(p_a)

      # Error: p_b, p_c not in grammar
      # The receiver must be explicit: a bare `add_production` call isn't
      # defined in the example and would raise a NoMethodError (itself a
      # StandardError), making the expectation pass without testing anything.
      expect { subject.add_production(p_bc) }.to raise_error(StandardError)

      subject.add_production(p_b)
      expect(subject.productions.size).to eq(3)
      expect(subject.productions.last).to eq(p_b)

      # Error: p_c not in grammar
      expect { subject.add_production(p_bc) }.to raise_error(StandardError)

      subject.add_production(p_c)
      expect(subject.productions.size).to eq(4)
      expect(subject.productions.last).to eq(p_c)

      subject.add_production(p_bc)
      expect(subject.productions.size).to eq(5)
      expect(subject.productions.last).to eq(p_bc)
    end

    it 'should complain when rhs refers to unknown production' do
      subject.add_production(p_a)
      subject.add_production(p_b)
      # Test fails because of Production#references
      msg = "Production #{p_bc.object_id} refers to production #{p_c.object_id}"
      msg << ' that is not part of the grammar.'
      expect { subject.add_production(p_bc) }.to raise_error(StandardError, msg)
    end
  end # context

  context 'Removing a production from the grammar:' do
    it 'should remove an existing production' do
      subject.add_production(p_a) # index = 1
      subject.add_production(p_b) # index = 2
      subject.add_production(p_c) # index = 3
      subject.add_production(p_bc) # index = 4
      expect(subject.productions.size).to eq(5)

      expect(p_a.refcount).to eq(0)
      expect(p_b.refcount).to eq(1)
      expect(p_c.refcount).to eq(1)

      subject.delete_production(1) # 1 => p_a
      expect(subject.productions.size).to eq(4)
      expect(p_b.refcount).to eq(1)
      expect(p_c.refcount).to eq(1)
      expect(subject.productions).not_to include(p_a)

      # Indices shifted after the previous removal
      subject.delete_production(3) # 3 => p_bc

      expect(subject.productions.size).to eq(3)
      expect(p_b.refcount).to eq(0)
      expect(p_c.refcount).to eq(0)
      expect(subject.productions).not_to include(p_bc)
    end
  end # context

  context 'Generating a text representation of itself:' do
    it 'should generate a text representation when empty' do
      expectation = "#{subject.root.object_id} : ."
      expect(subject.to_string).to eq(expectation)
    end

    # it 'should generate a text representation of a simple production' do
    #   instance = SequiturGrammar.new([:a].to_enum)
    #   expectation = "#{instance.root.object_id} : a."
    #   expect(instance.to_string).to eq(expectation)
    # end
  end # context
end # describe
|
120
|
+
|
121
|
+
end # module
|
122
|
+
|
123
|
+
# End of file
|