sequitur 0.0.04

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,332 @@
1
+ require_relative 'dynamic-grammar'
2
+
3
+ module Sequitur # Module for classes implementing the Sequitur algorithm
4
+
5
+ class SequiturGrammar < DynamicGrammar
6
+ # A hash with pairs of the form: digram key => digram
7
+ attr_reader(:digrams)
8
+
9
+ # The input
10
+ attr_reader(:parsed)
11
+
12
# Constructor. Build the grammar by feeding it, one after the other,
# every token delivered by the given enumerator.
# anEnum: any object responding to #each and yielding the input tokens.
def initialize(anEnum)
  super()

  # Make start production compliant with utility rule:
  # the root is recorded twice as a back-reference of itself.
  root.add_backref(root)
  root.add_backref(root)

  @digrams = {}
  @parsed = []
  anEnum.each do |token|
    add_token(token)
  end
end
22
+
23
+ public
24
+
25
# Add the given token to the grammar.
# The token is first recorded in the list of parsed tokens, then it is
# handed over to the implementation inherited from the superclass.
# aToken: the token to add.
# Returns whatever the superclass implementation returns.
def add_token(aToken)
  parsed.push(aToken)
  super(aToken)
end
30
+
31
# Check the invariant:
# Every digram appearing in a rhs must occur at most once in the grammar.
# The digrams of all productions are visited in order; the first digram
# whose key was already met triggers the failure.
# Raises a StandardError naming both occurrences when a key is met twice.
# Returns the list of productions when the invariant holds.
def check_unicity()
  seen = {}
  productions.each do |prod|
    prod.digrams.each do |digram|
      digram_key = digram.key
      unless seen.include?(digram_key)
        # First time this key shows up: remember where it came from.
        seen[digram_key] = digram
        next
      end

      earlier = seen[digram_key]
      report = "Digram #{digram.symbols} occurs twice!" \
               "\nOnce in production #{earlier.production_id}" \
               "\nSecond in production #{prod.object_id}" \
               "\n#{to_string}"
      fail StandardError, report
    end
  end
end
51
+
52
+
53
+ private
54
+
55
# Add a production to the grammar and register its last digram.
# Assumption: last digram of production isn't yet registered.
# aProduction: the production to add.
# Returns the digram that was registered.
def add_production(aProduction)
  super(aProduction) # Call original method from superclass...

  # ... then record the last digram of the production under its key
  aProduction.last_digram.tap do |tail_digram|
    digrams[tail_digram.key] = tail_digram
  end
end
63
+
64
# Remove a production from the grammar.
# Every registered digram whose production id is the one of the removed
# production is dropped from the digrams Hash before the removal itself
# is delegated to the superclass.
# anIndex: position of the production in the productions list.
# Returns whatever the superclass implementation returns.
def delete_production(anIndex)
  doomed_id = productions[anIndex].object_id
  digrams.delete_if { |_key, digram| digram.production_id == doomed_id }
  super(anIndex)
end
77
+
78
# Append a symbol to the given production, then keep the digrams Hash
# consistent with the result.
# aProduction: the production that receives the symbol.
# aSymbol: the symbol to append.
# The list returned by aProduction.calc_append_symbol is obtained before
# the actual append; presumably it holds the digrams the production will
# have afterwards (confirm against Production#calc_append_symbol).
# When the key of the last of these digrams is unknown, the digram is
# registered. Otherwise the unicity rule is restored and the rule
# utility is enforced.
def append_symbol_to(aProduction, aSymbol)
  upcoming_digrams = aProduction.calc_append_symbol(aSymbol)
  check_digrams # TODO: remove this
  check_backrefs # TODO: remove this
  super(aProduction, aSymbol)
  return if upcoming_digrams.empty?

  tail_digram = upcoming_digrams.last
  if digrams[tail_digram.key].nil?
    # ... No registered occurrence of the digram, then register it
    digrams[tail_digram.key] = tail_digram
  else
    # Digram is already registered...
    # the digram unicity rule is broken: fix this
    preserve_unicity(aProduction)
    enforce_rule_utility
  end
end
97
+
98
# The given production breaks the digram unicity rule.
# Fix this either by creating a new production having the duplicate
# digram as its rhs or by referencing such a production,
# then by replacing all occurrences of the digram by a reference to
# the fixing production.
# Pre-condition: the given production has a repeated digram
# or its last digram is used elsewhere.
# aProduction: the production whose last digram has an already
# registered key.
# Raises a StandardError when the digram is found in a third place or
# when one of the invariant checks run at the end fails.
def preserve_unicity(aProduction)
  last_digram = aProduction.last_digram
  matching_digram = digrams[last_digram.key]

  if last_digram.production_id == matching_digram.production_id
    # The registered occurrence belongs to the given production itself.
    # Rule: no other production distinct from aProduction should have
    # the matching digram.
    # TODO: check that digram really occurs twice in the production.
    productions.each do |prod|
      next if prod.object_id == last_digram.production_id
      next unless prod.digrams.map(&:key).include?(last_digram.key)

      msg = "Digram #{last_digram.symbols} occurs three times!" \
            "\nTwice in production #{aProduction.object_id}" \
            "\nThird in production #{prod.object_id}" \
            "\n#{to_string}"
      fail StandardError, msg
    end

    # Digram appears twice in given production...
    # Then create a new production with the digram as its rhs
    new_prod = production_for_digram(last_digram)

    # ... replace duplicate digram by reference to new production
    aProduction.replace_digram(new_prod)
    add_production(new_prod)
    update_digrams_from(aProduction)
    check_digrams # TODO: remove
  else
    # Duplicate digram used in distinct production.
    # The owner is looked up among the productions of the grammar (same
    # approach as check_registered) instead of the global object space.
    owner_id = matching_digram.production_id
    other_prod = productions.find { |prod| prod.object_id == owner_id }

    # Two cases: other production is a single digram one or a multi-digram
    if other_prod.single_digram?
      # ... replace duplicate digram by reference to other production
      aProduction.replace_digram(other_prod)
      update_digrams_from(aProduction)

      # Special case a: replacement causes another digram duplication
      # in the given production
      needs_fix = aProduction.repeated_digram?
      unless needs_fix
        # Special case b: replacement causes another digram duplication
        # with other production. After update_digrams_from the key of
        # the last digram is always registered, so the mere presence of
        # an entry proves nothing: the duplication exists only when the
        # registered digram belongs to a different production.
        tail = aProduction.last_digram
        registered = digrams[tail.key]
        needs_fix = registered &&
                    registered.production_id != tail.production_id
      end
      preserve_unicity(aProduction) if needs_fix

      check_references # TODO: remove this
    else
      # aProduction, other_prod use both the same digram
      # Then create a new production with the digram as its rhs
      new_prod = production_for_digram(last_digram)

      # ... replace duplicate digram by reference to new production
      aProduction.replace_digram(new_prod)
      other_prod.replace_digram(new_prod)
      add_production(new_prod)
      update_digrams_from(aProduction)

      # TODO: Check when aProduction and other_prod have same preceding symbol
      update_digrams_from(other_prod)
      check_backrefs # TODO: remove this
    end
  end

  # Nothing changes between the end of each branch and this point, so a
  # single unicity check covers all of them.
  check_unicity
  check_registered
end

# Build a new production whose rhs consists of the two symbols of the
# given digram.
def production_for_digram(aDigram)
  new_prod = Production.new
  new_prod.append_symbol(aDigram.symbols[0])
  new_prod.append_symbol(aDigram.symbols[1])
  new_prod
end
189
+
190
# Rule utility: except for the root production, every production must occur
# multiple times in all the rhs.
# The productions after the root are scanned from the last one down to
# index 1. Each production with a reference count of 1 (singleton rule)
# is handled as follows:
#   - The production referencing it is identified through the first key
#     of the back-references of the singleton rule
#   - The occurrence in the rhs is replaced by the rhs of the singleton rule
#   - The singleton rule is deleted
#   - The digrams of the referencing production are updated
# The scan is repeated until a complete pass finds no singleton rule.
# Raises a StandardError when the referencing production isn't part of
# the grammar or when one of the invariant checks fails.
def enforce_rule_utility()
  return if productions.size < 2

  check_references

  loop do
    all_refcount_ok = true

    # Going downwards keeps the lower indices valid after a deletion.
    (1...productions.size).reverse_each do |index|
      singleton = productions[index]
      next unless singleton.refcount == 1

      all_refcount_ok = false
      other_id = singleton.backrefs.keys.first

      # Resolve the id among the productions of the grammar rather than
      # in the global object space (ObjectSpace._id2ref).
      dependent = productions.find { |prod| prod.object_id == other_id }
      if dependent.nil?
        msg = "Production #{singleton.object_id} is referenced by the " \
              "unknown production #{other_id}" \
              "\n#{to_string}"
        fail StandardError, msg
      end

      dependent.replace_production(singleton)
      delete_production(index)
      update_digrams_from(dependent)
      check_references
      check_backrefs
    end

    break if all_refcount_ok
  end
end
223
+
224
+
225
# Update the digrams Hash with the digrams from the given production.
# Digrams of the production with a key not yet registered are added.
# Then every registered digram carrying the id of the production
# but with a key the production no longer has, is removed.
# aProduction: the production whose digrams must be synchronized.
# Returns the keys that were registered for the production right after
# the additions (before the removals).
def update_digrams_from(aProduction)
  fresh_digrams = aProduction.digrams

  # Add new digrams
  fresh_digrams.each do |digram|
    next if digrams.include?(digram.key)

    digrams[digram.key] = digram
  end

  # Retrieve the keys of all registered digrams from the production
  owner_id = aProduction.object_id
  owned_keys = digrams.select { |_key, digram| digram.production_id == owner_id }.keys

  # Remove obsolete digrams
  live_keys = fresh_digrams.map(&:key)
  owned_keys.each do |a_key|
    digrams.delete(a_key) unless live_keys.include?(a_key)
  end
end
245
+
246
# Check the invariant:
# Every production reference in a rhs must point
# to a production of the grammar.
# Raises a StandardError describing the first referenced production
# that isn't in the productions list.
# Returns the list of productions when the invariant holds.
def check_references()
  productions.each do |prod|
    targets = prod.references
    orphan_pos = targets.index { |target| !productions.include?(target) }
    next if orphan_pos.nil?

    orphan = targets[orphan_pos]
    report = "Production #{prod.object_id} references the " \
             "unknown production #{orphan.object_id}" \
             "\nOrphan production: #{orphan.to_string}" \
             "\n#{to_string}"
    fail StandardError, report
  end
end
263
+
264
# Check the invariant:
# Every registered digram must reference a production from the grammar.
# Raises a StandardError for the first registered digram whose production
# id matches the object id of no production in the grammar.
# Returns the digrams Hash when the invariant holds.
def check_registered()
  digrams.each_value do |digram|
    owner_id = digram.production_id
    next if productions.any? { |prod| prod.object_id == owner_id }

    report = "Digram #{digram.symbols} references the unknown " \
             "production (#{digram.production_id})." \
             "\n#{to_string}"
    fail StandardError, report
  end
end
279
+
280
# Compare the contents of digrams Hash with
# all digrams from all productions.
# Three controls are performed in sequence:
#   1. every registered digram refers to a production of the grammar
#      that really has that digram;
#   2. no digram key occurs twice among the productions (check_unicity
#      is invoked as soon as a key is met again);
#   3. every digram of every production is registered, under the same
#      production id.
# Productions are resolved by searching the productions list, the way
# check_registered does, not through ObjectSpace._id2ref.
# Raises a StandardError on the first discrepancy.
# Returns the Hash (digram key => digram) built from the productions.
def check_digrams()
  # Control that every registered digram refers
  # to a production that really has that digram
  digrams.each do |key, digr|
    its_prod = productions.find { |prod| prod.object_id == digr.production_id }
    if its_prod.nil?
      msg = "Digram #{digr.symbols} references the unknown " \
            "production (#{digr.production_id})." \
            "\n#{to_string}"
      fail StandardError, msg
    end

    prod_digrams = its_prod.digrams
    next if prod_digrams.map(&:key).include?(key)

    msg = "Production #{digr.production_id} doesn't have " \
          "the digram #{digr.symbols}" \
          "\n#{prod_digrams.map(&:symbols)}" \
          "\n#{to_string}"
    fail StandardError, msg
  end

  all_digrams = {}
  owners = {} # digram key => production that supplied the digram
  productions.each do |a_prod|
    a_prod.digrams.each do |digr|
      check_unicity if all_digrams[digr.key]
      all_digrams[digr.key] = digr
      owners[digr.key] = a_prod
    end
  end

  all_digrams.each do |key, digr|
    registered = digrams[key]
    if registered
      next if registered.production_id == digr.production_id

      msg = "Production #{digr.production_id} has " \
            "the digram #{digr.symbols} that collides" \
            "\n with same digram from #{registered.production_id}" \
            "\n#{to_string}"
      fail StandardError, msg
    else
      its_prod = owners[key]
      msg = "Production #{its_prod.object_id} (#{its_prod.rhs}) " \
            "has the digram #{digr.symbols} that isn't registered." \
            "\n#{to_string}"
      fail StandardError, msg
    end
  end
end
326
+
327
+
328
+ end # class
329
+
330
+ end # module
331
+
332
+ # End of file
@@ -0,0 +1,33 @@
1
+ require_relative '../spec_helper'
2
+
3
+ # Load the class under test
4
+ require_relative '../../lib/sequitur/digram'
5
+
6
module Sequitur # Re-open the module to get rid of qualified names
  # Examples for Digram: creation from two symbols and a production,
  # and retrieval of that production afterwards.
  describe Digram do
    let(:two_symbols) { [:b, :c] }

    # Stand-in for the production that owns the digram
    let(:production) { double('sample-production') }

    # The digram under test, built from :b, :c and the stand-in production
    let(:instance) { Digram.new(:b, :c, production) }

    context 'Standard creation & initialization:' do
      it 'should be created with 3 arguments' do
        expect(instance.symbols).to eq(two_symbols)
        expect(instance.production_id).to eq(production.object_id)
      end

      it 'should return the production that it refers to' do
        expect(instance.production).to eq(production)
      end
    end # context
  end # describe
end # module
32
+
33
+ # End of file
@@ -0,0 +1,123 @@
1
+ require_relative '../spec_helper'
2
+
3
+ # Load the class under test
4
+ require_relative '../../lib/sequitur/dynamic-grammar'
5
+
6
module Sequitur # Re-open the module to get rid of qualified names
  # Examples for DynamicGrammar: creation, addition and removal of
  # productions, text representation.
  describe DynamicGrammar do
    # Factory method. Build a production with the given sequence
    # of symbols as its rhs.
    def build_production(*symbols)
      prod = Production.new
      symbols.each { |symb| prod.append_symbol(symb) }
      return prod
    end

    let(:p_a) { build_production(:a) }
    let(:p_b) { build_production(:b) }
    let(:p_c) { build_production(:c) }
    let(:p_bc) { build_production(p_b, p_c) } # rhs refers to p_b and p_c

    context 'Creation & initialization:' do
      it 'should be created without parameter' do
        expect { DynamicGrammar.new }.not_to raise_error
      end

      it 'should have an empty root/start production' do
        expect(subject.root).to be_empty
        expect(subject.productions.size).to eq(1)
        expect(subject.productions.first).to be_empty
      end
    end # context

    context 'Adding productions to the grammar:' do
      it 'should add a simple production' do
        subject.add_production(p_a)
        expect(subject.productions.size).to eq(2)
        expect(subject.productions.last).to eq(p_a)

        # Error: p_b, p_c not in grammar
        # The call must go through the grammar: a bare add_production is
        # unknown in the example scope and would raise a NoMethodError,
        # which satisfies raise_error(StandardError) whatever the grammar does.
        expect { subject.add_production(p_bc) }.to raise_error(StandardError)

        subject.add_production(p_b)
        expect(subject.productions.size).to eq(3)
        expect(subject.productions.last).to eq(p_b)

        # Error: p_c not in grammar
        expect { subject.add_production(p_bc) }.to raise_error(StandardError)

        subject.add_production(p_c)
        expect(subject.productions.size).to eq(4)
        expect(subject.productions.last).to eq(p_c)

        subject.add_production(p_bc)
        expect(subject.productions.size).to eq(5)
        expect(subject.productions.last).to eq(p_bc)
      end

      it 'should complain when rhs refers to unknown production' do
        subject.add_production(p_a)
        subject.add_production(p_b)
        # Test fails because of Production#references
        msg = "Production #{p_bc.object_id} refers to production #{p_c.object_id}"
        msg << ' that is not part of the grammar.'
        expect { subject.add_production(p_bc) }.to raise_error(StandardError, msg)
      end
    end # context

    context 'Removing a production from the grammar:' do
      it 'should remove an existing production' do
        subject.add_production(p_a) # index = 1
        subject.add_production(p_b) # index = 2
        subject.add_production(p_c) # index = 3
        subject.add_production(p_bc) # index = 4
        expect(subject.productions.size).to eq(5)

        expect(p_a.refcount).to eq(0)
        expect(p_b.refcount).to eq(1)
        expect(p_c.refcount).to eq(1)

        subject.delete_production(1) # 1 => p_a
        expect(subject.productions.size).to eq(4)
        expect(p_b.refcount).to eq(1)
        expect(p_c.refcount).to eq(1)
        expect(subject.productions).not_to include(p_a)

        # The removal of p_a shifted the following productions by one
        subject.delete_production(3) # 3 => p_bc

        expect(subject.productions.size).to eq(3)
        expect(p_b.refcount).to eq(0)
        expect(p_c.refcount).to eq(0)
        expect(subject.productions).not_to include(p_bc)
      end
    end # context

    context 'Generating a text representation of itself:' do
      it 'should generate a text representation when empty' do
        expectation = "#{subject.root.object_id} : ."
        expect(subject.to_string).to eq(expectation)
      end

      # it 'should generate a text representation of a simple production' do
      #   instance = SequiturGrammar.new([:a].to_enum)
      #   expectation = "#{instance.root.object_id} : a."
      #   expect(instance.to_string).to eq(expectation)
      # end
    end # context
  end # describe
end # module
122
+
123
+ # End of file