sequitur 0.0.04

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,332 @@
1
+ require_relative 'dynamic-grammar'
2
+
3
+ module Sequitur # Module for classes implementing the Sequitur algorithm
4
+
5
+ class SequiturGrammar < DynamicGrammar
6
+ # A hash with pairs of the form: digram key => digram
7
+ attr_reader(:digrams)
8
+
9
+ # The input
10
+ attr_reader(:parsed)
11
+
12
+ # Constructor. Build the grammar from an enumerator of tokens
13
# @param anEnum [#each] source of tokens; each yielded token is passed to
#   add_token in iteration order.
def initialize(anEnum)
  super()

  # The start production gets two back-references to itself so that it
  # complies with the utility rule.
  root.add_backref(root)
  root.add_backref(root)

  @digrams = {}
  @parsed = []
  anEnum.each do |a_token|
    add_token(a_token)
  end
end
22
+
23
+ public
24
+
25
+ # Add the given token to the grammar.
26
# @param aToken [Object] the next input token. It is recorded in the
#   parsed list first, then handed over to the superclass implementation.
def add_token(aToken)
  parsed.push(aToken)
  super
end
30
+
31
+ # Check the invariant:
32
+ # Every digram appearing in a rhs must occur at most once in the grammar.
33
# Scans the digrams of every production and remembers each key the first
# time it is met; meeting the same key again is an invariant violation.
# @raise [StandardError] when a digram key is found a second time. The
#   message names both productions and includes the grammar dump.
def check_unicity
  seen = {}
  productions.each do |a_prod|
    a_prod.digrams.each do |a_digram|
      # First sighting of this key: remember the digram and move on.
      unless seen.key?(a_digram.key)
        seen[a_digram.key] = a_digram
        next
      end

      msg = "Digram #{a_digram.symbols} occurs twice!"
      first_seen = seen[a_digram.key]
      msg << "\nOnce in production #{first_seen.production_id}"
      msg << "\nSecond in production #{a_prod.object_id}"
      msg << "\n#{to_string}"
      fail StandardError, msg
    end
  end
end
51
+
52
+
53
+ private
54
+
55
+ # Assumption: last digram of production isn't yet registered.
56
# @param aProduction [Production] production to add. The superclass does
#   the actual insertion; afterwards the production's last digram is
#   registered in the digrams Hash under its key.
def add_production(aProduction)
  super # Superclass behaviour comes first...

  # ... then the registration of the trailing digram
  trailing = aProduction.last_digram
  digrams.store(trailing.key, trailing)
end
63
+
64
+ # Remove a production from the grammar
65
# @param anIndex [Integer] position, in productions, of the production
#   to remove. Its registered digrams are dropped from the digrams Hash
#   before the superclass performs the removal.
def delete_production(anIndex)
  doomed_id = productions[anIndex].object_id

  # Unregister every digram owned by the production being removed
  digrams.delete_if { |_key, digr| digr.production_id == doomed_id }
  super
end
77
+
78
# Appends a symbol to a production and keeps the digram registry in sync.
# @param aProduction [Production] production receiving the symbol
# @param aSymbol [Object] symbol appended to the rhs
def append_symbol_to(aProduction, aSymbol)
  # Ask the production which digrams it would have once the symbol is
  # appended (this happens before the actual append done by super).
  resulting_digrams = aProduction.calc_append_symbol(aSymbol)
  check_digrams # TODO: remove this
  check_backrefs # TODO: remove this
  super
  return if resulting_digrams.empty?

  trailing = resulting_digrams.last
  if digrams[trailing.key].nil?
    # No registered occurrence of the digram yet: register it
    digrams[trailing.key] = trailing
  else
    # Already registered, so the digram unicity rule is broken: repair it
    preserve_unicity(aProduction)
    enforce_rule_utility
  end
end
97
+
98
+ # The given production breaks the digram unicity rule.
99
+ # Fix this either by creating a new production having the duplicate
100
+ # digram as its rhs or by referencing such a production.
101
+ # then by replacing all occurrences of the digram by reference to
102
+ # the fixing production.
103
+ # Pre-condition: the given production has a repeated digram
104
+ # or its last digram is used elsewhere
105
# Two situations are handled:
# - the registered digram belongs to aProduction itself: a new production
#   with the digram as rhs is created and substituted in aProduction;
# - the registered digram belongs to another production: that production
#   is referenced directly when it is a single-digram one, otherwise a new
#   production is created and substituted in both.
# The production owning the registered digram is looked up in the
# grammar's own productions list (not through ObjectSpace._id2ref), so a
# stale id yields an explicit error instead of an unrelated object.
# @param aProduction [Production] production whose last digram is already
#   registered in the digrams Hash
# @raise [StandardError] when the digram occurs a third time, when the
#   registered digram refers to a production absent from the grammar, or
#   when one of the consistency checks fails
def preserve_unicity(aProduction)
  last_digram = aProduction.last_digram
  matching_digram = digrams[last_digram.key]
  if last_digram.production_id == matching_digram.production_id
    # Rule: no other production distinct from aProduction should have
    # the matching digram
    productions.each do |prod|
      # TODO: for the owning production itself, check that the digram
      # really occurs twice in it.
      next if prod.object_id == last_digram.production_id
      next unless prod.digrams.map(&:key).include?(last_digram.key)

      msg = "Digram #{last_digram.symbols} occurs three times!"
      msg << "\nTwice in production #{aProduction.object_id}"
      msg << "\nThird in production #{prod.object_id}"
      msg << "\n#{to_string}"
      fail StandardError, msg
    end

    # Digram appears twice in given production...
    # Then create a new production with the digram as its rhs
    new_prod = Production.new
    new_prod.append_symbol(last_digram.symbols[0])
    new_prod.append_symbol(last_digram.symbols[1])

    # ... replace duplicate digram by reference to new production
    aProduction.replace_digram(new_prod)
    add_production(new_prod)
    update_digrams_from(aProduction)
    check_digrams # TODO: remove
    check_unicity
  else
    # Duplicate digram used in distinct production
    # Two cases: other production is a single digram one or a multi-digram
    other_prod = productions.find do |prod|
      prod.object_id == matching_digram.production_id
    end
    if other_prod.nil?
      msg = "Digram #{matching_digram.symbols} references the unknown "
      msg << "production (#{matching_digram.production_id})."
      msg << "\n#{to_string}"
      fail StandardError, msg
    end

    if other_prod.single_digram?
      # ... replace duplicate digram by reference to other production
      aProduction.replace_digram(other_prod)
      update_digrams_from(aProduction)

      # Special case a: replacement causes another digram duplication
      # in the given production
      # Special case b: replacement causes another digram duplication
      # with other production
      if aProduction.repeated_digram? || digrams[aProduction.last_digram.key]
        preserve_unicity(aProduction)
      end

      check_references # TODO: remove this
    else
      # Both aProduction and other_prod use the same digram
      # Then create a new production with the digram as its rhs
      new_prod = Production.new
      new_prod.append_symbol(last_digram.symbols[0])
      new_prod.append_symbol(last_digram.symbols[1])

      # ... replace duplicate digram by reference to new production
      aProduction.replace_digram(new_prod)
      other_prod.replace_digram(new_prod)
      add_production(new_prod)
      update_digrams_from(aProduction)

      # TODO: Check when aProduction and other_prod have same preceding symbol
      update_digrams_from(other_prod)
      check_backrefs # TODO: remove this
    end
    check_unicity
  end

  check_unicity
  check_registered
end
189
+
190
+ # Rule utility: except for the root production, every production must occur
191
+ # multiple times in all the rhs.
192
+ # Initialize occurrence hash with pairs: production id => []
193
+ # For each production:
194
+ # - Detect occurrence of any production in the rhs
195
+ # - Identify the occurring production
196
+ # - In the occurrence hash push the production id of the lhs
197
+ # Select each production that occurs once (singleton rule):
198
+ # Replace the occurrence in the rhs by the rhs of the singleton rule
199
+ # Delete the singleton rule
200
+ # Update digrams
201
# Repeatedly inlines every production (other than the one at index 0)
# whose refcount is exactly 1 into the single production referencing it,
# then deletes it, until a full pass finds no such production.
# The referencing production is looked up in the grammar's own
# productions list (not through ObjectSpace._id2ref), so a stale
# back-reference yields an explicit error.
# @raise [StandardError] when a back-reference points to a production
#   that is not part of the grammar, or when a consistency check fails
def enforce_rule_utility
  return if productions.size < 2

  check_references

  loop do
    all_refcount_ok = true
    # Walk the indices downwards so that deleting the production at the
    # current index never shifts the positions still to be visited.
    # Index 0 is skipped (presumably the root/start production).
    (productions.size - 1).downto(1) do |index|
      singleton = productions[index]
      next unless singleton.refcount == 1

      all_refcount_ok = false
      other_id = singleton.backrefs.keys.first
      dependent = productions.find { |prod| prod.object_id == other_id }
      if dependent.nil?
        msg = "Production #{singleton.object_id} is referenced by the "
        msg << "unknown production (#{other_id})."
        msg << "\n#{to_string}"
        fail StandardError, msg
      end

      dependent.replace_production(singleton)
      delete_production(index)
      update_digrams_from(dependent)
      check_references
      check_backrefs
    end

    break if all_refcount_ok
  end
end
223
+
224
+
225
+ # Update the digrams Hash with the digrams from the given production.
226
# Synchronises the digrams Hash with the digrams currently found in the
# given production: unregistered digrams are added, and digrams registered
# for this production that it no longer has are dropped.
# @param aProduction [Production] production whose rhs may have changed
def update_digrams_from(aProduction)
  present = aProduction.digrams

  # Register digrams whose key is not in the Hash yet
  present.each do |digr|
    next if digrams.include?(digr.key)

    digrams[digr.key] = digr
  end

  # Keys of all digrams currently registered for this production
  owner_id = aProduction.object_id
  owned_keys = digrams.select { |_, digr| digr.production_id == owner_id }.keys

  # Drop the registered digrams the production doesn't have anymore
  present_keys = present.map(&:key)
  owned_keys.each do |a_key|
    digrams.delete(a_key) unless present_keys.include?(a_key)
  end
end
245
+
246
+ # Check the invariant:
247
+ # Every production reference in a rhs must point
248
+ # to a production of the grammar.
249
# Looks, production by production, for the first referenced production
# that is not a member of productions.
# @raise [StandardError] when such an orphan reference is found. The
#   message names both productions and includes the grammar dump.
def check_references
  productions.each do |a_prod|
    orphan = a_prod.references.find do |referenced_prod|
      !productions.include?(referenced_prod)
    end
    next unless orphan

    msg = "Production #{a_prod.object_id} references the "
    msg << "unknown production #{orphan.object_id}"
    msg << "\nOrphan production: #{orphan.to_string}"
    msg << "\n#{to_string}"
    fail StandardError, msg
  end
end
263
+
264
+ # Check the invariant:
265
+ # Every registered digram must reference a production from the grammar
266
# Verifies that the production id stored in each registered digram
# matches the object_id of some production of the grammar.
# @raise [StandardError] for the first digram whose production is missing
def check_registered
  digrams.each_value do |digr|
    owner_known = productions.any? do |a_prod|
      a_prod.object_id == digr.production_id
    end
    next if owner_known

    msg = "Digram #{digr.symbols} references the unknown "
    msg << "production (#{digr.production_id})."
    msg << "\n#{to_string}"
    fail StandardError, msg
  end
end
279
+
280
+ # Compare the contents of digrams Hash with
281
+ # All digrams from all productions
282
# Three controls are performed, in this order:
# 1. every registered digram refers to a production of the grammar that
#    really has that digram;
# 2. no digram key occurs twice over all productions (delegated to
#    check_unicity as soon as a repeated key is met);
# 3. every digram found in a production is registered, and registered
#    for that same production.
# Production ids are resolved through an index built from the grammar's
# own productions (not through ObjectSpace._id2ref), so a stale id is
# reported explicitly.
# @raise [StandardError] on the first inconsistency found
def check_digrams
  prod_by_id = productions.each_with_object({}) do |a_prod, index|
    index[a_prod.object_id] = a_prod
  end

  # Control that every registered digram refers
  # to a production that really has that digram
  digrams.each do |key, digr|
    its_prod = prod_by_id[digr.production_id]
    if its_prod.nil?
      msg = "Digram #{digr.symbols} references the unknown "
      msg << "production (#{digr.production_id})."
      msg << "\n#{to_string}"
      fail StandardError, msg
    end

    prod_digrams = its_prod.digrams
    next if prod_digrams.map(&:key).include?(key)

    msg = "Production #{digr.production_id} doesn't have "
    msg << "the digram #{digr.symbols}"
    msg << "\n#{prod_digrams.map(&:symbols)}"
    msg << "\n#{to_string}"
    fail StandardError, msg
  end

  # Collect the digrams of all productions, keyed like the registry
  all_digrams = {}
  productions.each do |a_prod|
    a_prod.digrams.each do |digr|
      # A repeated key breaks unicity: let check_unicity report it
      check_unicity if all_digrams[digr.key]
      all_digrams[digr.key] = digr
    end
  end

  all_digrams.each do |key, digr|
    registered = digrams[key]
    if registered
      next if registered.production_id == digr.production_id

      msg = "Production #{digr.production_id} has "
      msg << "the digram #{digr.symbols} that collides"
      msg << "\n with same digram from #{registered.production_id}"
      msg << "\n#{to_string}"
      fail StandardError, msg
    else
      its_prod = prod_by_id[digr.production_id]
      rhs_text = its_prod ? " (#{its_prod.rhs})" : ''
      msg = "Production #{digr.production_id}#{rhs_text} "
      msg << "has the digram #{digr.symbols} that isn't registered."
      msg << "\n#{to_string}"
      fail StandardError, msg
    end
  end
end
326
+
327
+
328
+ end # class
329
+
330
+ end # module
331
+
332
+ # End of file
@@ -0,0 +1,33 @@
1
+ require_relative '../spec_helper'
2
+
3
+ # Load the class under test
4
+ require_relative '../../lib/sequitur/digram'
5
+
6
+ module Sequitur # Re-open the module to get rid of qualified names
7
+
8
describe Digram do
  # The pair of symbols every example builds its digram from
  let(:two_symbols) { [:b, :c] }

  # Test double standing in for the production that owns the digram
  let(:production) { double('sample-production') }

  # Digram under test, built from the two symbols and the double
  let(:instance) { Digram.new(:b, :c, production) }

  context 'Standard creation & initialization:' do
    it 'should be created with 3 arguments' do
      expect(instance.symbols).to eq(two_symbols)
      expect(instance.production_id).to eq(production.object_id)
    end

    it 'should return the production that it refers to' do
      expect(instance.production).to eq(production)
    end
  end # context
end # describe
30
+
31
+ end # module
32
+
33
+ # End of file
@@ -0,0 +1,123 @@
1
+ require_relative '../spec_helper'
2
+
3
+ # Load the class under test
4
+ require_relative '../../lib/sequitur/dynamic-grammar'
5
+
6
+ module Sequitur # Re-open the module to get rid of qualified names
7
+
8
describe DynamicGrammar do
  # Factory method. Build a production with the given sequence
  # of symbols as its rhs.
  def build_production(*symbols)
    symbols.each_with_object(Production.new) do |symb, prod|
      prod.append_symbol(symb)
    end
  end

  # Single-symbol productions, plus one production (p_bc) whose rhs
  # refers to p_b and p_c.
  let(:p_a) { build_production(:a) }
  let(:p_b) { build_production(:b) }
  let(:p_c) { build_production(:c) }
  let(:p_bc) { build_production(p_b, p_c) }

  context 'Creation & initialization:' do
    it 'should be created without parameter' do
      expect { DynamicGrammar.new }.not_to raise_error
    end

    it 'should have an empty root/start production' do
      expect(subject.root).to be_empty
      expect(subject.productions.size).to eq(1)
      expect(subject.productions.first).to be_empty
    end
  end # context

  context 'Adding productions to the grammar:' do
    it 'should add a simple production' do
      subject.add_production(p_a)
      expect(subject.productions.size).to eq(2)
      expect(subject.productions.last).to eq(p_a)

      # Error: p_b, p_c not in grammar
      # The call must go through the grammar (subject): without an explicit
      # receiver the block raises NoMethodError on the example itself, which
      # satisfies raise_error(StandardError) for the wrong reason.
      expect { subject.add_production(p_bc) }.to raise_error(StandardError)

      subject.add_production(p_b)
      expect(subject.productions.size).to eq(3)
      expect(subject.productions.last).to eq(p_b)

      # Error: p_c not in grammar
      expect { subject.add_production(p_bc) }.to raise_error(StandardError)

      subject.add_production(p_c)
      expect(subject.productions.size).to eq(4)
      expect(subject.productions.last).to eq(p_c)

      subject.add_production(p_bc)
      expect(subject.productions.size).to eq(5)
      expect(subject.productions.last).to eq(p_bc)
    end

    it 'should complain when rhs refers to unknown production' do
      subject.add_production(p_a)
      subject.add_production(p_b)
      # Test fails because of Production#references
      msg = "Production #{p_bc.object_id} refers to production #{p_c.object_id}"
      msg << ' that is not part of the grammar.'
      expect { subject.add_production(p_bc) }.to raise_error(StandardError, msg)
    end
  end # context

  context 'Removing a production from the grammar:' do
    it 'should remove an existing production' do
      subject.add_production(p_a) # index = 1
      subject.add_production(p_b) # index = 2
      subject.add_production(p_c) # index = 3
      subject.add_production(p_bc) # index = 4
      expect(subject.productions.size).to eq(5)

      expect(p_a.refcount).to eq(0)
      expect(p_b.refcount).to eq(1)
      expect(p_c.refcount).to eq(1)

      subject.delete_production(1) # 1 => p_a
      expect(subject.productions.size).to eq(4)
      expect(p_b.refcount).to eq(1)
      expect(p_c.refcount).to eq(1)
      expect(subject.productions).not_to include(p_a)

      subject.delete_production(3) # 3 => p_bc

      expect(subject.productions.size).to eq(3)
      expect(p_b.refcount).to eq(0)
      expect(p_c.refcount).to eq(0)
      expect(subject.productions).not_to include(p_bc)
    end
  end # context

  context 'Generating a text representation of itself:' do
    it 'should generate a text representation when empty' do
      expectation = "#{subject.root.object_id} : ."
      expect(subject.to_string).to eq(expectation)
    end

    # it 'should generate a text representation of a simple production' do
    #   instance = SequiturGrammar.new([:a].to_enum)
    #   expectation = "#{instance.root.object_id} : a."
    #   expect(instance.to_string).to eq(expectation)
    # end
  end # context
end # describe
120
+
121
+ end # module
122
+
123
+ # End of file