sequitur 0.1.02 → 0.1.03

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YTdkNzZiNTc1NjBkM2M0MDlhZDI1M2MyNTFhODJhZGI1MjFlYWI2MQ==
4
+ Mzg2M2MwZDY5M2M1Zjc5MGZlOWI5MTMwM2ExNTc3YWZlMWI4MjliZA==
5
5
  data.tar.gz: !binary |-
6
- MmEzZGRlNTI2M2U3ZmQwYmY3MTA1MmE0MDkzMGQ0ZjBmZDJlYTRmMQ==
6
+ OTY4NzRlYjJmMGU1Nzc4ZDdiZmNkYWFiNzRmMWU3OThiMmExNDNkYg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- MjUzMmU0YTQ4MzQ2NmVmMWU2YWQzMTkwZDNiZjM3MjgyOTFlMmRmZDJmMmJi
10
- NmM5YjMxMjA0YzM5OGFiOGRiYjBmYTc2M2YyN2NiNjJiMGRlYjJkMmMxMThk
11
- ZGU1MDlhYTBkZDc3YTEwMDAwNmQ0YTZlOTQyZGM5YmFmNTRjNmM=
9
+ Y2M3Mjg1ZTFkYzEzNWI0YWYwMmIxMGIzZGE5ZjZmZmE4NmRjMDQ3NGE3Nzhl
10
+ YTZiMjM1ZWM1OTRiODAwMmE0NDg0NWI1MzYyZGQ1Y2RiMTAzNWMwMzFjNzE1
11
+ OGVkYmNlYzBmMjlhYmI3NDZkZTZjYWNjY2VkZTk2MzQxNWQ4OGY=
12
12
  data.tar.gz: !binary |-
13
- NGU2NjY2Yzc2ZmQ4NDFlN2E4MGVlYTUwMDg4NjgwYzBiYjk0ZjM5NGY4MTg4
14
- NTI5NTExOTQzMWY1YzhiNWM4ZjM1OWQ5YjM1MjViZWVlYWRlMWU5NjcyNDNk
15
- MzAwMzZhM2NlZGE1M2MzYTYyOGZmODkyMWE4YjA0NTE3MTk4NjA=
13
+ MTNiNjhlYTNhNGU1NjY4ZTUwNzkzNWRiYzllMDA5MjczYmFhMzE3MDJiZDI3
14
+ NWUzYzIyNTJlMDYzZTUxOWFmOTM0MzE3MDU3YTJiMTgzMTVkN2QyMGU5ZDFi
15
+ MmU0ZTMxZjYwMzNkZjFmNGVhMGU4M2E1ZTQ5YTliMDg4NmNlZGQ=
@@ -1,3 +1,7 @@
1
+ ### 0.1.03 / 2014-09-21
2
+ * [CHANGE] Class `Sequitur::SequiturGrammar` Code refactoring: cleaner and simpler implementation of the algorithm.
3
+ * [CHANGE] Class `Sequitur::Digram`. Added new method `repeating?` that tells whether digram members are the same.
4
+
1
5
  ### 0.1.02 / 2014-09-18
2
6
  * [CHANGE] File `README.md`: expanded introductory text.
3
7
  * [CHANGE] File `sequitur.gemspec` : expanded gem description in the specification.
@@ -3,7 +3,7 @@
3
3
 
4
4
  module Sequitur # Module used as a namespace
5
5
  # The version number of the gem.
6
- Version = '0.1.02'
6
+ Version = '0.1.03'
7
7
 
8
8
  # Brief description of the gem.
9
9
  Description = 'Ruby implementation of the Sequitur algorithm'
@@ -33,6 +33,11 @@ class Digram
33
33
  return key == other.key
34
34
  end
35
35
 
36
+ # Return true when both symbols of the digram are the same
37
+ def repeating?()
38
+ return symbols[0] == symbols[1]
39
+ end
40
+
36
41
  end # class
37
42
 
38
43
  end # module
@@ -54,7 +54,7 @@ class Production
54
54
  end
55
55
 
56
56
  def decr_refcount()
57
- fail StandardError if @refcount == 0
57
+ fail StandardError, 'Internal error' if @refcount == 0
58
58
  @refcount -= 1
59
59
  end
60
60
 
@@ -1,13 +1,9 @@
1
1
  require_relative 'dynamic_grammar'
2
2
 
3
+
3
4
  module Sequitur # Module for classes implementing the Sequitur algorithm
4
5
 
5
6
  class SequiturGrammar < DynamicGrammar
6
- # A hash with pairs of the form: digram key => digram
7
- attr_reader(:digrams)
8
-
9
- # The input
10
- attr_reader(:parsed)
11
7
 
12
8
  # Constructor. Build the grammar from an enumerator of tokens
13
9
  def initialize(anEnum)
@@ -15,191 +11,136 @@ class SequiturGrammar < DynamicGrammar
15
11
  # Make start production compliant with utility rule
16
12
  2.times { root.incr_refcount }
17
13
 
18
- @digrams = {}
19
- @parsed = []
20
- anEnum.each { |a_token| add_token(a_token) }
14
+ # Read the input sequence and apply the Sequitur algorithm
15
+ anEnum.each do |a_token|
16
+ add_token(a_token)
17
+ enforce_rules
18
+ end
21
19
  end
22
20
 
23
21
  public
24
22
 
25
- # Add the given token to the grammar.
26
- def add_token(aToken)
27
- parsed << aToken
28
- super
29
- end
30
-
31
-
32
- private
33
-
34
- # Assumption: last digram of production isn't yet registered.
35
- def add_production(aProduction)
36
- super # Call original method from superclass...
37
-
38
- # ... then add this behaviour
39
- last_digram = aProduction.last_digram
40
- digrams[last_digram.key] = last_digram
41
- end
42
-
43
- # Remove a production from the grammar
44
- def remove_production(anIndex)
45
- prod = productions[anIndex]
46
-
47
- # Retrieve in the Hash all registered digrams from the removed production
48
- digrams_subset = digrams.select do |_, digr|
49
- digr.production == prod
50
- end
51
23
 
52
- # Remove them...
53
- digrams_subset.each_key { |a_key| digrams.delete(a_key) }
54
- super
24
+ CollisionDiagnosis = Struct.new(:collision_found, :digram, :productions)
25
+
26
+
27
+ # Assuming that a new input token was added to the start production,
28
+ # enforce the digram unicity and rule utility rules
29
+ # begin
30
+ # if a digram D occurs twice in the grammar then
31
+ # add a production P : D (if not already there)
32
+ # replace both Ds with P (reduction step).
33
+ # end
34
+ # if a production P : RHS is referenced only once then
35
+ # replace P by its RHS (derivation step)
36
+ # remove P from grammar
37
+ # end
38
+ # end until digram unicity and rule utility are met
39
+ def enforce_rules()
40
+ begin
41
+ unicity_diagnosis = detect_collision if unicity_diagnosis.nil?
42
+ restore_unicity(unicity_diagnosis) if unicity_diagnosis.collision_found
43
+
44
+ useless_prod = detect_useless_production
45
+ restore_utility(useless_prod) if useless_prod
46
+
47
+ unicity_diagnosis = detect_collision
48
+ useless_prod = detect_useless_production
49
+
50
+ end while unicity_diagnosis.collision_found || useless_prod
55
51
  end
56
52
 
57
- def append_symbol_to(aProduction, aSymbol)
58
- super
59
-
60
- prod_digrams = aProduction.digrams
61
- unless prod_digrams.empty?
62
- last_digram = prod_digrams.last
63
- matching_digram = digrams[last_digram.key]
64
- if matching_digram.nil?
65
- # ... No registered occurrence of the digram, then register it
66
- digrams[last_digram.key] = last_digram
67
- else
68
- # Digram is already registered...
69
- # the digram unicity rule is broken: fix this
70
- preserve_unicity(aProduction)
71
- enforce_rule_utility
53
+ # Check whether a digram is used twice in the grammar.
54
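+ # Return a CollisionDiagnosis with collision_found set to false if no duplicate digram is detected.
55
+ # Otherwise return a CollisionDiagnosis with collision_found set to true, digram set to the
56
+ # first registered occurrence, and productions set to [Pi, Pk], the productions where the digram occurs.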
57
+ def detect_collision()
58
+ diagnosis = CollisionDiagnosis.new(false)
59
+ found_so_far = {}
60
+ productions.each do |a_prod|
61
+ prod_digrams = a_prod.digrams
62
+ prod_digrams.each do |a_digr|
63
+ its_key = a_digr.key
64
+ if found_so_far.include? its_key
65
+ orig_digr = found_so_far[its_key]
66
+ # Disregard sequence like a a a
67
+ if ((orig_digr.production == a_prod) && a_digr.repeating? &&
68
+ (orig_digr == a_digr))
69
+ next
70
+ end
71
+
72
+ diagnosis.digram = orig_digr
73
+ diagnosis.productions = [orig_digr.production, a_prod]
74
+ diagnosis.collision_found = true
75
+ break
76
+ else
77
+ found_so_far[its_key] = a_digr
78
+ end
72
79
  end
80
+ break if diagnosis.collision_found
73
81
  end
82
+
83
+ return diagnosis
74
84
  end
75
85
 
76
- # The given production breaks the digram unicity rule.
77
- # Fix this either by a creating a new production having the duplicate
78
- # digram as its rhs or by referencing such a production.
79
- # then by replacing all occurrences of the digram by reference to
80
- # the fixing production.
81
- # Pre-condition: the given production has a repeated digram
82
- # or its last digram is used elsewhere
83
- def preserve_unicity(aProduction)
84
- last_digram = aProduction.last_digram
85
- matching_digram = digrams[last_digram.key]
86
- if aProduction == matching_digram.production
87
- # Rule: no other production distinct from aProduction should have
88
- # the matching digram
89
- productions.each do |prod|
90
- its_digrams = prod.digrams
91
- its_keys = its_digrams.map(&:key)
92
- next if prod == last_digram.production
93
- next unless its_keys.include? last_digram.key
94
- msg = "Digram #{last_digram.symbols} occurs three times!"
95
- msg << "\nTwice in production #{aProduction.object_id}"
96
- msg << "\nThird in production #{prod.object_id}"
97
- msg << "\n#{to_string}"
98
- fail StandardError, msg
86
+ # When a collision diagnosis indicates that a given
87
+ # digram d occurs twice in the grammar
88
+ # Then create a new production that will have
89
+ # the symbols of d as its rhs members.
90
+ def restore_unicity(aDiagnosis)
91
+ return if aDiagnosis.nil?
92
+
93
+ digr = aDiagnosis.digram
94
+ prods = aDiagnosis.productions
95
+ if prods.any?(&:single_digram?)
96
+ (simple, compound) = prods.partition do |a_prod|
97
+ a_prod.single_digram?
99
98
  end
100
-
101
- # Digram appears twice in given production...
102
- # Then create a new production with the digram as its rhs
99
+ compound[0].replace_digram(simple[0])
100
+ else
101
+ # Create a new production with the digram's symbols as its
102
+ # sole rhs members.
103
103
  new_prod = Production.new
104
- new_prod.append_symbol(last_digram.symbols[0])
105
- new_prod.append_symbol(last_digram.symbols[1])
106
-
107
- # ... replace duplicate digram by reference to new production
108
- aProduction.replace_digram(new_prod)
104
+ digr.symbols.each { |sym| new_prod.append_symbol(sym) }
109
105
  add_production(new_prod)
110
- update_digrams_from(aProduction)
111
- else
112
- # Duplicate digram used in distinct production
113
- # Two cases: other production is a single digram one or a multi-digram
114
- other_prod = matching_digram.production
115
- if other_prod.single_digram?
116
- # ... replace duplicate digram by reference to other production
117
- aProduction.replace_digram(other_prod)
118
- update_digrams_from(aProduction)
119
-
120
- # Special case a: replacement causes another digram duplication
121
- # in the given production
122
- # Special case b: replacement causes another digram duplication
123
- # with other production
124
- if aProduction.repeated_digram? ||
125
- (digrams[aProduction.last_digram.key].production != aProduction)
126
- preserve_unicity(aProduction)
127
- end
128
-
106
+ if prods[0] == prods[1]
107
+ prods[0].replace_digram(new_prod)
129
108
  else
130
- # aProduction, other_prod use both the same digram
131
- # Then create a new production with the digram as its rhs
132
- new_prod = Production.new
133
- new_prod.append_symbol(last_digram.symbols[0])
134
- new_prod.append_symbol(last_digram.symbols[1])
135
-
136
- # ... replace duplicate digram by reference to new production
137
- aProduction.replace_digram(new_prod)
138
- other_prod.replace_digram(new_prod)
139
- add_production(new_prod)
140
- update_digrams_from(aProduction)
141
-
142
- # TODO: Check when aProduction and other_prod have same preceding symbol
143
- update_digrams_from(other_prod)
109
+ prods.each { |a_prod| a_prod.replace_digram(new_prod) }
144
110
  end
145
111
  end
146
112
  end
147
113
 
148
- # Rule utility: except for the root production, every production must occur
149
- # multiple times in all the rhs.
150
- # Initialize occurrence hash with pairs: production id => []
151
- # For each production:
152
- # - Detect occurrence of any production in the rhs
153
- # - Identify the occurring production
154
- # - In the occurrence hash push the production id of the lhs
155
- # Select each production that occurs once (singleton rule):
156
- # Replace the occurrence in the rhs by the rhs of the singleton rule
157
- # Delete the singleton rule
158
- # Update digrams
159
- def enforce_rule_utility()
160
- return if productions.size < 2
161
-
162
- loop do
163
- all_refcount_ok = true
164
- (1...productions.size).to_a.reverse.each do |index|
165
- curr_production = productions[index]
166
- next unless curr_production.refcount == 1
167
-
168
- all_refcount_ok = false
169
- dependent = productions.find do |a_prod|
170
- !a_prod.references_of(curr_production).empty?
171
- end
172
- dependent.replace_production(productions[index])
173
- remove_production(index)
174
- update_digrams_from(dependent)
175
- end
176
-
177
- break if all_refcount_ok
178
- end
114
+ # Return a production that is used less than twice in the grammar.
115
+ def detect_useless_production()
116
+ useless = productions.find { |prod| prod.refcount < 2 }
117
+ return (useless == productions[0]) ? nil : useless
179
118
  end
180
119
 
181
-
182
- # Update the digrams Hash with the digrams from the given production.
183
- def update_digrams_from(aProduction)
184
- current_digrams = aProduction.digrams
185
-
186
- # Add new digrams only if they don't collide
187
- current_digrams.each do |digr|
188
- digrams[digr.key] = digr unless digrams.include? digr.key
120
+ # Given the passed production P is referenced only once.
121
+ # Then replace P by its RHS where it is referenced.
122
+ # And delete P
123
+ def restore_utility(useless_prod)
124
+ # Retrieve index of useless_prod
125
+ index = productions.index(useless_prod)
126
+
127
+ # Retrieve production referencing useless one
128
+ referencing = nil
129
+ productions.each do |a_prod|
130
+ # Next line assumes non-recursive productions
131
+ next if a_prod == useless_prod
132
+
133
+ refs = a_prod.references_of(useless_prod)
134
+ next if refs.empty?
135
+ referencing = a_prod
136
+ break
189
137
  end
190
138
 
191
- # Retrieve all registered digrams from the production
192
- digrams_subset = digrams.select do |_, digr|
193
- digr.production == aProduction
194
- end
195
-
196
- # Remove obsolete digrams
197
- current_keys = current_digrams.map(&:key)
198
- digrams_subset.keys.each do |a_key|
199
- digrams.delete(a_key) unless current_keys.include? a_key
200
- end
139
+ referencing.replace_production(useless_prod)
140
+ remove_production(index)
201
141
  end
202
142
 
143
+
203
144
  end # class
204
145
 
205
146
  end # module
@@ -22,10 +22,18 @@ describe Digram do
22
22
  instance = Digram.new(:b, :c, production)
23
23
  expect(instance.production).to eq(production)
24
24
  end
25
+
26
+ it 'should whether its symbols are the same' do
27
+ instance1 = Digram.new(:a, :a, production)
28
+ expect(instance1).to be_repeating
29
+
30
+ instance1 = Digram.new(:a, :b, production)
31
+ expect(instance1).not_to be_repeating
32
+ end
25
33
 
26
34
  end # context
27
35
 
28
- context 'Standard creation & initialization:' do
36
+ context 'Provided services:' do
29
37
 
30
38
  it 'should compare itself to another digram' do
31
39
  instance1 = Digram.new(:a, :b, production)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sequitur
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.02
4
+ version: 0.1.03
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dimitri Geshef
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-18 00:00:00.000000000 Z
11
+ date: 2014-09-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake