sequitur 0.1.02 → 0.1.03

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- YTdkNzZiNTc1NjBkM2M0MDlhZDI1M2MyNTFhODJhZGI1MjFlYWI2MQ==
4
+ Mzg2M2MwZDY5M2M1Zjc5MGZlOWI5MTMwM2ExNTc3YWZlMWI4MjliZA==
5
5
  data.tar.gz: !binary |-
6
- MmEzZGRlNTI2M2U3ZmQwYmY3MTA1MmE0MDkzMGQ0ZjBmZDJlYTRmMQ==
6
+ OTY4NzRlYjJmMGU1Nzc4ZDdiZmNkYWFiNzRmMWU3OThiMmExNDNkYg==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- MjUzMmU0YTQ4MzQ2NmVmMWU2YWQzMTkwZDNiZjM3MjgyOTFlMmRmZDJmMmJi
10
- NmM5YjMxMjA0YzM5OGFiOGRiYjBmYTc2M2YyN2NiNjJiMGRlYjJkMmMxMThk
11
- ZGU1MDlhYTBkZDc3YTEwMDAwNmQ0YTZlOTQyZGM5YmFmNTRjNmM=
9
+ Y2M3Mjg1ZTFkYzEzNWI0YWYwMmIxMGIzZGE5ZjZmZmE4NmRjMDQ3NGE3Nzhl
10
+ YTZiMjM1ZWM1OTRiODAwMmE0NDg0NWI1MzYyZGQ1Y2RiMTAzNWMwMzFjNzE1
11
+ OGVkYmNlYzBmMjlhYmI3NDZkZTZjYWNjY2VkZTk2MzQxNWQ4OGY=
12
12
  data.tar.gz: !binary |-
13
- NGU2NjY2Yzc2ZmQ4NDFlN2E4MGVlYTUwMDg4NjgwYzBiYjk0ZjM5NGY4MTg4
14
- NTI5NTExOTQzMWY1YzhiNWM4ZjM1OWQ5YjM1MjViZWVlYWRlMWU5NjcyNDNk
15
- MzAwMzZhM2NlZGE1M2MzYTYyOGZmODkyMWE4YjA0NTE3MTk4NjA=
13
+ MTNiNjhlYTNhNGU1NjY4ZTUwNzkzNWRiYzllMDA5MjczYmFhMzE3MDJiZDI3
14
+ NWUzYzIyNTJlMDYzZTUxOWFmOTM0MzE3MDU3YTJiMTgzMTVkN2QyMGU5ZDFi
15
+ MmU0ZTMxZjYwMzNkZjFmNGVhMGU4M2E1ZTQ5YTliMDg4NmNlZGQ=
@@ -1,3 +1,7 @@
1
+ ### 0.1.03 / 2014-09-21
2
+ * [CHANGE] Class `Sequitur::SequiturGrammar` Code refactoring: cleaner and simpler implementation of the algorithm.
3
+ * [CHANGE] Class `Sequitur::Digram`. Added new method `repeating?` that tells whether digram members are the same.
4
+
1
5
  ### 0.1.02 / 2014-09-18
2
6
  * [CHANGE] File `README.md`: expanded introductory text.
3
7
  * [CHANGE] File `sequitur.gemspec` : expanded gem description in the specification.
@@ -3,7 +3,7 @@
3
3
 
4
4
  module Sequitur # Module used as a namespace
5
5
  # The version number of the gem.
6
- Version = '0.1.02'
6
+ Version = '0.1.03'
7
7
 
8
8
  # Brief description of the gem.
9
9
  Description = 'Ruby implementation of the Sequitur algorithm'
@@ -33,6 +33,11 @@ class Digram
33
33
  return key == other.key
34
34
  end
35
35
 
36
+ # Return true when both symbols of the digram are the same
37
+ def repeating?()
38
+ return symbols[0] == symbols[1]
39
+ end
40
+
36
41
  end # class
37
42
 
38
43
  end # module
@@ -54,7 +54,7 @@ class Production
54
54
  end
55
55
 
56
56
  def decr_refcount()
57
- fail StandardError if @refcount == 0
57
+ fail StandardError, 'Internal error' if @refcount == 0
58
58
  @refcount -= 1
59
59
  end
60
60
 
@@ -1,13 +1,9 @@
1
1
  require_relative 'dynamic_grammar'
2
2
 
3
+
3
4
  module Sequitur # Module for classes implementing the Sequitur algorithm
4
5
 
5
6
  class SequiturGrammar < DynamicGrammar
6
- # A hash with pairs of the form: digram key => digram
7
- attr_reader(:digrams)
8
-
9
- # The input
10
- attr_reader(:parsed)
11
7
 
12
8
  # Constructor. Build the grammar from an enumerator of tokens
13
9
  def initialize(anEnum)
@@ -15,191 +11,136 @@ class SequiturGrammar < DynamicGrammar
15
11
  # Make start production compliant with utility rule
16
12
  2.times { root.incr_refcount }
17
13
 
18
- @digrams = {}
19
- @parsed = []
20
- anEnum.each { |a_token| add_token(a_token) }
14
+ # Read the input sequence and apply the Sequitur algorithm
15
+ anEnum.each do |a_token|
16
+ add_token(a_token)
17
+ enforce_rules
18
+ end
21
19
  end
22
20
 
23
21
  public
24
22
 
25
- # Add the given token to the grammar.
26
- def add_token(aToken)
27
- parsed << aToken
28
- super
29
- end
30
-
31
-
32
- private
33
-
34
- # Assumption: last digram of production isn't yet registered.
35
- def add_production(aProduction)
36
- super # Call original method from superclass...
37
-
38
- # ... then add this behaviour
39
- last_digram = aProduction.last_digram
40
- digrams[last_digram.key] = last_digram
41
- end
42
-
43
- # Remove a production from the grammar
44
- def remove_production(anIndex)
45
- prod = productions[anIndex]
46
-
47
- # Retrieve in the Hash all registered digrams from the removed production
48
- digrams_subset = digrams.select do |_, digr|
49
- digr.production == prod
50
- end
51
23
 
52
- # Remove them...
53
- digrams_subset.each_key { |a_key| digrams.delete(a_key) }
54
- super
24
+ CollisionDiagnosis = Struct.new(:collision_found, :digram, :productions)
25
+
26
+
27
+ # Assuming that a new input token was added to the start production,
28
+ # enforce the digram unicity and rule utility rules
29
+ # begin
30
+ # if a digram D occurs twice in the grammar then
31
+ # add a production P : D (if not already there)
32
+ # replace both Ds with P (reduction step).
33
+ # end
34
+ # if a production P : RHS is referenced only once then
35
+ # replace P by its RHS (derivation step)
36
+ # remove P from grammar
37
+ # end
38
+ # end until digram unicity and rule utility are met
39
+ def enforce_rules()
40
+ begin
41
+ unicity_diagnosis = detect_collision if unicity_diagnosis.nil?
42
+ restore_unicity(unicity_diagnosis) if unicity_diagnosis.collision_found
43
+
44
+ useless_prod = detect_useless_production
45
+ restore_utility(useless_prod) if useless_prod
46
+
47
+ unicity_diagnosis = detect_collision
48
+ useless_prod = detect_useless_production
49
+
50
+ end while unicity_diagnosis.collision_found || useless_prod
55
51
  end
56
52
 
57
- def append_symbol_to(aProduction, aSymbol)
58
- super
59
-
60
- prod_digrams = aProduction.digrams
61
- unless prod_digrams.empty?
62
- last_digram = prod_digrams.last
63
- matching_digram = digrams[last_digram.key]
64
- if matching_digram.nil?
65
- # ... No registered occurrence of the digram, then register it
66
- digrams[last_digram.key] = last_digram
67
- else
68
- # Digram is already registered...
69
- # the digram unicity rule is broken: fix this
70
- preserve_unicity(aProduction)
71
- enforce_rule_utility
53
+ # Check whether a digram is used twice in the grammar.
54
+ # Return a CollisionDiagnosis with collision_found set to false if each digram appears once.
55
+ # Otherwise collision_found is true, digram holds the first occurrence found and
56
+ # productions holds [Pi, Pk], the two productions where the digram occurs.
57
+ def detect_collision()
58
+ diagnosis = CollisionDiagnosis.new(false)
59
+ found_so_far = {}
60
+ productions.each do |a_prod|
61
+ prod_digrams = a_prod.digrams
62
+ prod_digrams.each do |a_digr|
63
+ its_key = a_digr.key
64
+ if found_so_far.include? its_key
65
+ orig_digr = found_so_far[its_key]
66
+ # Disregard sequence like a a a
67
+ if ((orig_digr.production == a_prod) && a_digr.repeating? &&
68
+ (orig_digr == a_digr))
69
+ next
70
+ end
71
+
72
+ diagnosis.digram = orig_digr
73
+ diagnosis.productions = [orig_digr.production, a_prod]
74
+ diagnosis.collision_found = true
75
+ break
76
+ else
77
+ found_so_far[its_key] = a_digr
78
+ end
72
79
  end
80
+ break if diagnosis.collision_found
73
81
  end
82
+
83
+ return diagnosis
74
84
  end
75
85
 
76
- # The given production breaks the digram unicity rule.
77
- # Fix this either by a creating a new production having the duplicate
78
- # digram as its rhs or by referencing such a production.
79
- # then by replacing all occurrences of the digram by reference to
80
- # the fixing production.
81
- # Pre-condition: the given production has a repeated digram
82
- # or its last digram is used elsewhere
83
- def preserve_unicity(aProduction)
84
- last_digram = aProduction.last_digram
85
- matching_digram = digrams[last_digram.key]
86
- if aProduction == matching_digram.production
87
- # Rule: no other production distinct from aProduction should have
88
- # the matching digram
89
- productions.each do |prod|
90
- its_digrams = prod.digrams
91
- its_keys = its_digrams.map(&:key)
92
- next if prod == last_digram.production
93
- next unless its_keys.include? last_digram.key
94
- msg = "Digram #{last_digram.symbols} occurs three times!"
95
- msg << "\nTwice in production #{aProduction.object_id}"
96
- msg << "\nThird in production #{prod.object_id}"
97
- msg << "\n#{to_string}"
98
- fail StandardError, msg
86
+ # When a collision diagnosis indicates that a given
87
+ # digram d occurs twice in the grammar
88
+ # Then create a new production that will have
89
+ # the symbols of d as its rhs members.
90
+ def restore_unicity(aDiagnosis)
91
+ return if aDiagnosis.nil?
92
+
93
+ digr = aDiagnosis.digram
94
+ prods = aDiagnosis.productions
95
+ if prods.any?(&:single_digram?)
96
+ (simple, compound) = prods.partition do |a_prod|
97
+ a_prod.single_digram?
99
98
  end
100
-
101
- # Digram appears twice in given production...
102
- # Then create a new production with the digram as its rhs
99
+ compound[0].replace_digram(simple[0])
100
+ else
101
+ # Create a new production with the digram's symbols as its
102
+ # sole rhs members.
103
103
  new_prod = Production.new
104
- new_prod.append_symbol(last_digram.symbols[0])
105
- new_prod.append_symbol(last_digram.symbols[1])
106
-
107
- # ... replace duplicate digram by reference to new production
108
- aProduction.replace_digram(new_prod)
104
+ digr.symbols.each { |sym| new_prod.append_symbol(sym) }
109
105
  add_production(new_prod)
110
- update_digrams_from(aProduction)
111
- else
112
- # Duplicate digram used in distinct production
113
- # Two cases: other production is a single digram one or a multi-digram
114
- other_prod = matching_digram.production
115
- if other_prod.single_digram?
116
- # ... replace duplicate digram by reference to other production
117
- aProduction.replace_digram(other_prod)
118
- update_digrams_from(aProduction)
119
-
120
- # Special case a: replacement causes another digram duplication
121
- # in the given production
122
- # Special case b: replacement causes another digram duplication
123
- # with other production
124
- if aProduction.repeated_digram? ||
125
- (digrams[aProduction.last_digram.key].production != aProduction)
126
- preserve_unicity(aProduction)
127
- end
128
-
106
+ if prods[0] == prods[1]
107
+ prods[0].replace_digram(new_prod)
129
108
  else
130
- # aProduction, other_prod use both the same digram
131
- # Then create a new production with the digram as its rhs
132
- new_prod = Production.new
133
- new_prod.append_symbol(last_digram.symbols[0])
134
- new_prod.append_symbol(last_digram.symbols[1])
135
-
136
- # ... replace duplicate digram by reference to new production
137
- aProduction.replace_digram(new_prod)
138
- other_prod.replace_digram(new_prod)
139
- add_production(new_prod)
140
- update_digrams_from(aProduction)
141
-
142
- # TODO: Check when aProduction and other_prod have same preceding symbol
143
- update_digrams_from(other_prod)
109
+ prods.each { |a_prod| a_prod.replace_digram(new_prod) }
144
110
  end
145
111
  end
146
112
  end
147
113
 
148
- # Rule utility: except for the root production, every production must occur
149
- # multiple times in all the rhs.
150
- # Initialize occurrence hash with pairs: production id => []
151
- # For each production:
152
- # - Detect occurrence of any production in the rhs
153
- # - Identify the occurring production
154
- # - In the occurrence hash push the production id of the lhs
155
- # Select each production that occurs once (singleton rule):
156
- # Replace the occurrence in the rhs by the rhs of the singleton rule
157
- # Delete the singleton rule
158
- # Update digrams
159
- def enforce_rule_utility()
160
- return if productions.size < 2
161
-
162
- loop do
163
- all_refcount_ok = true
164
- (1...productions.size).to_a.reverse.each do |index|
165
- curr_production = productions[index]
166
- next unless curr_production.refcount == 1
167
-
168
- all_refcount_ok = false
169
- dependent = productions.find do |a_prod|
170
- !a_prod.references_of(curr_production).empty?
171
- end
172
- dependent.replace_production(productions[index])
173
- remove_production(index)
174
- update_digrams_from(dependent)
175
- end
176
-
177
- break if all_refcount_ok
178
- end
114
+ # Return a production that is used less than twice in the grammar.
115
+ def detect_useless_production()
116
+ useless = productions.find { |prod| prod.refcount < 2 }
117
+ return (useless == productions[0]) ? nil : useless
179
118
  end
180
119
 
181
-
182
- # Update the digrams Hash with the digrams from the given production.
183
- def update_digrams_from(aProduction)
184
- current_digrams = aProduction.digrams
185
-
186
- # Add new digrams only if they don't collide
187
- current_digrams.each do |digr|
188
- digrams[digr.key] = digr unless digrams.include? digr.key
120
+ # Given the passed production P is referenced only once.
121
+ # Then replace P by its RHS where it is referenced.
122
+ # And delete P
123
+ def restore_utility(useless_prod)
124
+ # Retrieve index of useless_prod
125
+ index = productions.index(useless_prod)
126
+
127
+ # Retrieve production referencing useless one
128
+ referencing = nil
129
+ productions.each do |a_prod|
130
+ # Next line assumes non-recursive productions
131
+ next if a_prod == useless_prod
132
+
133
+ refs = a_prod.references_of(useless_prod)
134
+ next if refs.empty?
135
+ referencing = a_prod
136
+ break
189
137
  end
190
138
 
191
- # Retrieve all registered digrams from the production
192
- digrams_subset = digrams.select do |_, digr|
193
- digr.production == aProduction
194
- end
195
-
196
- # Remove obsolete digrams
197
- current_keys = current_digrams.map(&:key)
198
- digrams_subset.keys.each do |a_key|
199
- digrams.delete(a_key) unless current_keys.include? a_key
200
- end
139
+ referencing.replace_production(useless_prod)
140
+ remove_production(index)
201
141
  end
202
142
 
143
+
203
144
  end # class
204
145
 
205
146
  end # module
@@ -22,10 +22,18 @@ describe Digram do
22
22
  instance = Digram.new(:b, :c, production)
23
23
  expect(instance.production).to eq(production)
24
24
  end
25
+
26
+ it 'should whether its symbols are the same' do
27
+ instance1 = Digram.new(:a, :a, production)
28
+ expect(instance1).to be_repeating
29
+
30
+ instance1 = Digram.new(:a, :b, production)
31
+ expect(instance1).not_to be_repeating
32
+ end
25
33
 
26
34
  end # context
27
35
 
28
- context 'Standard creation & initialization:' do
36
+ context 'Provided services:' do
29
37
 
30
38
  it 'should compare itself to another digram' do
31
39
  instance1 = Digram.new(:a, :b, production)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sequitur
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.02
4
+ version: 0.1.03
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dimitri Geshef
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-09-18 00:00:00.000000000 Z
11
+ date: 2014-09-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake