sequitur 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/CHANGELOG.md +4 -0
- data/README.md +58 -1
- data/Rakefile +31 -31
- data/examples/integer_sample.rb +33 -0
- data/examples/porridge.rb +41 -0
- data/examples/simple_case.rb +27 -0
- data/examples/symbol_sample.rb +28 -0
- data/examples/word_sample.rb +30 -0
- data/lib/sequitur.rb +1 -1
- data/lib/sequitur/constants.rb +1 -1
- data/lib/sequitur/digram.rb +52 -52
- data/lib/sequitur/dynamic_grammar.rb +106 -106
- data/lib/sequitur/formatter/base_formatter.rb +39 -39
- data/lib/sequitur/formatter/base_text.rb +95 -95
- data/lib/sequitur/formatter/debug.rb +131 -131
- data/lib/sequitur/grammar_visitor.rb +110 -110
- data/lib/sequitur/production.rb +243 -243
- data/lib/sequitur/production_ref.rb +119 -119
- data/lib/sequitur/sequitur_grammar.rb +158 -158
- data/lib/sequitur/symbol_sequence.rb +182 -182
- data/spec/sequitur/sequitur_grammar_spec.rb +75 -3
- metadata +7 -2
data/lib/sequitur/production.rb
CHANGED
@@ -1,243 +1,243 @@
|
|
1
|
-
require_relative 'digram'
|
2
|
-
require_relative 'symbol_sequence'
|
3
|
-
require_relative 'production_ref'
|
4
|
-
|
5
|
-
module Sequitur # Module for classes implementing the Sequitur algorithm
|
6
|
-
|
7
|
-
|
8
|
-
# In a context-free grammar, a production is a rule in which
|
9
|
-
# its left-hand side (LHS) consists solely of a non-terminal symbol
|
10
|
-
# and the right-hand side (RHS) consists of a sequence of symbols.
|
11
|
-
# The symbols in RHS can be either terminal or non-terminal symbols.
|
12
|
-
# The rule stipulates that the LHS is equivalent to the RHS,
|
13
|
-
# in other words every occurrence of the LHS can be substituted to
|
14
|
-
# corresponding RHS.
|
15
|
-
# Implementation note: the object id of the production is taken as its LHS.
|
16
|
-
class Production
|
17
|
-
# The right-hand side (rhs) consists of a sequence of grammar symbols
|
18
|
-
attr_reader(:rhs)
|
19
|
-
|
20
|
-
# The reference count (= how times other productions reference this one)
|
21
|
-
attr_reader(:refcount)
|
22
|
-
|
23
|
-
# The sequence of digrams appearing in the RHS
|
24
|
-
attr_reader(:digrams)
|
25
|
-
|
26
|
-
# Constructor.
|
27
|
-
# Build a production with an empty RHS.
|
28
|
-
def initialize()
|
29
|
-
@rhs = SymbolSequence.new
|
30
|
-
@refcount = 0
|
31
|
-
@digrams = []
|
32
|
-
end
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
public
|
37
|
-
|
38
|
-
# Identity testing.
|
39
|
-
# @param other [] another production or production reference.
|
40
|
-
# @return true when the receiver and other are the same.
|
41
|
-
def ==(other)
|
42
|
-
return true if object_id == other.object_id
|
43
|
-
|
44
|
-
if other.is_a?(ProductionRef)
|
45
|
-
result = (other == self)
|
46
|
-
else
|
47
|
-
result = false
|
48
|
-
end
|
49
|
-
|
50
|
-
return result
|
51
|
-
end
|
52
|
-
|
53
|
-
|
54
|
-
# Is the rhs empty?
|
55
|
-
# @ return true if the rhs has no members.
|
56
|
-
def empty?
|
57
|
-
return rhs.empty?
|
58
|
-
end
|
59
|
-
|
60
|
-
# Increment the reference count by one.
|
61
|
-
def incr_refcount()
|
62
|
-
@refcount += 1
|
63
|
-
end
|
64
|
-
|
65
|
-
# Decrement the reference count by one.
|
66
|
-
def decr_refcount()
|
67
|
-
fail StandardError, 'Internal error' if @refcount == 0
|
68
|
-
@refcount -= 1
|
69
|
-
end
|
70
|
-
|
71
|
-
|
72
|
-
# Select the references to production appearing in the rhs.
|
73
|
-
# @return [Array of ProductionRef]
|
74
|
-
def references()
|
75
|
-
return rhs.references
|
76
|
-
end
|
77
|
-
|
78
|
-
# Look in the rhs all the references to a production passed a argument.
|
79
|
-
# aProduction [aProduction or ProductionRef] The production to search for.
|
80
|
-
# @return [Array] the array of ProductionRef to the passed production
|
81
|
-
def references_of(a_prod)
|
82
|
-
real_prod = a_prod.is_a?(ProductionRef) ? a_prod.production : a_prod
|
83
|
-
return rhs.references_of(real_prod)
|
84
|
-
end
|
85
|
-
|
86
|
-
|
87
|
-
# Enumerate the digrams appearing in the right-hand side (rhs)
|
88
|
-
# @return [Array] the list of digrams found in rhs of this production.
|
89
|
-
def recalc_digrams()
|
90
|
-
return [] if rhs.size < 2
|
91
|
-
|
92
|
-
result = []
|
93
|
-
rhs.symbols.each_cons(2) { |couple| result << Digram.new(*couple, self) }
|
94
|
-
@digrams = result
|
95
|
-
end
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
# Does the rhs have exactly one digram only (= 2 symbols)?
|
100
|
-
# @return [true/false] true when the rhs contains exactly two symbols.
|
101
|
-
def single_digram?
|
102
|
-
return rhs.size == 2
|
103
|
-
end
|
104
|
-
|
105
|
-
|
106
|
-
# Detect whether the last digram occurs twice
|
107
|
-
# Assumption: when a digram occurs twice in a production then it must occur
|
108
|
-
# at the end of the rhs
|
109
|
-
# @return [true/false] true when the digram occurs twice in rhs.
|
110
|
-
def repeated_digram?()
|
111
|
-
return false if rhs.size < 3
|
112
|
-
|
113
|
-
my_digrams = digrams
|
114
|
-
all_keys = my_digrams.map(&:key)
|
115
|
-
last_key = all_keys.pop
|
116
|
-
same_key_found = all_keys.index(last_key)
|
117
|
-
return !same_key_found.nil?
|
118
|
-
end
|
119
|
-
|
120
|
-
# Retrieve the last digram appearing in the RHS (if any).
|
121
|
-
# @return [Digram] last digram in the rhs otherwise nil.
|
122
|
-
def last_digram()
|
123
|
-
result = digrams.empty? ? nil : digrams.last
|
124
|
-
return result
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
# Emit a text representation of the production rule.
|
130
|
-
# Text is of the form:
|
131
|
-
# object id of production : rhs as space-separated sequence of symbols.
|
132
|
-
# @return [String]
|
133
|
-
def to_string()
|
134
|
-
return "#{object_id} : #{rhs.to_string}."
|
135
|
-
end
|
136
|
-
|
137
|
-
# Add a (grammar) symbol at the end of the RHS.
|
138
|
-
# @param aSymbol [Object] A (grammar) symbol to add.
|
139
|
-
def append_symbol(aSymbol)
|
140
|
-
case aSymbol
|
141
|
-
when Production
|
142
|
-
new_symb = ProductionRef.new(aSymbol)
|
143
|
-
when ProductionRef
|
144
|
-
if aSymbol.unbound?
|
145
|
-
msg = 'Fail to append reference to nil production in '
|
146
|
-
msg << to_string
|
147
|
-
fail StandardError, msg
|
148
|
-
end
|
149
|
-
new_symb = aSymbol.dup
|
150
|
-
else
|
151
|
-
new_symb = aSymbol
|
152
|
-
end
|
153
|
-
|
154
|
-
rhs << new_symb
|
155
|
-
digrams << Digram.new(rhs[-2], rhs[-1], self) if rhs.size >= 2
|
156
|
-
end
|
157
|
-
|
158
|
-
# Clear the right-hand side.
|
159
|
-
# Any referenced production has its reference counter decremented.
|
160
|
-
def clear_rhs()
|
161
|
-
rhs.clear
|
162
|
-
end
|
163
|
-
|
164
|
-
# Find all the positions where the digram occurs in the rhs
|
165
|
-
# @param symb1 [Object] first symbol of the digram
|
166
|
-
# @param symb2 [Object] second symbol of the digram
|
167
|
-
# @return [Array] the list of indices where the digram occurs in rhs.
|
168
|
-
# @example
|
169
|
-
# # Given the production p : a b c a b a b d
|
170
|
-
# #Then ...
|
171
|
-
# p.positions_of(a, b) # => [0, 3, 5]
|
172
|
-
# # Caution: "overlapping" digrams shouldn't be counted
|
173
|
-
# # Given the production p : a a b a a a c d
|
174
|
-
# # Then ...
|
175
|
-
# p.positions_of(a, a) # => [0, 3]
|
176
|
-
def positions_of(symb1, symb2)
|
177
|
-
|
178
|
-
# Find the positions where the digram occur in rhs
|
179
|
-
indices = [ -2 ] # Dummy index!
|
180
|
-
(0...rhs.size).each do |i|
|
181
|
-
next if i == indices.last + 1
|
182
|
-
indices << i if (rhs[i] == symb1) && (rhs[i + 1] == symb2)
|
183
|
-
end
|
184
|
-
|
185
|
-
indices.shift
|
186
|
-
|
187
|
-
return indices
|
188
|
-
end
|
189
|
-
|
190
|
-
|
191
|
-
# Given that the production P passed as argument has exactly 2 symbols
|
192
|
-
# in its rhs s1 s2, substitute in the rhs of self all occurrences of
|
193
|
-
# s1 s2 by a reference to P.
|
194
|
-
# @param another [Production or ProductionRef] a production that
|
195
|
-
# consists exactly of one digram (= 2 symbols).
|
196
|
-
def reduce_step(another)
|
197
|
-
(symb1, symb2) = another.rhs.symbols
|
198
|
-
pos = positions_of(symb1, symb2).reverse
|
199
|
-
|
200
|
-
# Replace the two symbol sequence by the production
|
201
|
-
pos.each { |index| rhs.reduce_step(index, another) }
|
202
|
-
|
203
|
-
recalc_digrams
|
204
|
-
end
|
205
|
-
|
206
|
-
# Replace every occurrence of 'another' production in self.rhs by
|
207
|
-
# the symbols in the rhs of 'another'.
|
208
|
-
# @param another [Production or ProductionRef] a production that
|
209
|
-
# consists exactly of one digram (= 2 symbols).
|
210
|
-
# @example Synopsis
|
211
|
-
# # Given the production p_A : a p_B b p_B c
|
212
|
-
# # And the production p_B : x y
|
213
|
-
# # Then...
|
214
|
-
# p_A.derive_step(p_B)
|
215
|
-
# #Modifies p_A as into: p_A -> a x y b x y c
|
216
|
-
def derive_step(another)
|
217
|
-
(0...rhs.size).to_a.reverse.each do |index|
|
218
|
-
next unless rhs[index] == another
|
219
|
-
|
220
|
-
rhs.insert_at(index + 1, another.rhs)
|
221
|
-
another.decr_refcount
|
222
|
-
rhs.delete_at(index)
|
223
|
-
end
|
224
|
-
|
225
|
-
recalc_digrams
|
226
|
-
end
|
227
|
-
|
228
|
-
|
229
|
-
# Part of the 'visitee' role in Visitor design pattern.
|
230
|
-
# @param aVisitor[GrammarVisitor]
|
231
|
-
def accept(aVisitor)
|
232
|
-
aVisitor.start_visit_production(self)
|
233
|
-
|
234
|
-
rhs.accept(aVisitor)
|
235
|
-
|
236
|
-
aVisitor.end_visit_production(self)
|
237
|
-
end
|
238
|
-
|
239
|
-
end # class
|
240
|
-
|
241
|
-
end # module
|
242
|
-
|
243
|
-
# End of file
|
1
|
+
require_relative 'digram'
|
2
|
+
require_relative 'symbol_sequence'
|
3
|
+
require_relative 'production_ref'
|
4
|
+
|
5
|
+
module Sequitur # Module for classes implementing the Sequitur algorithm
|
6
|
+
|
7
|
+
|
8
|
+
# In a context-free grammar, a production is a rule in which
|
9
|
+
# its left-hand side (LHS) consists solely of a non-terminal symbol
|
10
|
+
# and the right-hand side (RHS) consists of a sequence of symbols.
|
11
|
+
# The symbols in RHS can be either terminal or non-terminal symbols.
|
12
|
+
# The rule stipulates that the LHS is equivalent to the RHS,
|
13
|
+
# in other words, every occurrence of the LHS can be replaced by the
|
14
|
+
# corresponding RHS.
|
15
|
+
# Implementation note: the object id of the production is taken as its LHS.
|
16
|
+
class Production
|
17
|
+
# The right-hand side (rhs) consists of a sequence of grammar symbols
|
18
|
+
attr_reader(:rhs)
|
19
|
+
|
20
|
+
# The reference count (= how many times other productions reference this one)
|
21
|
+
attr_reader(:refcount)
|
22
|
+
|
23
|
+
# The sequence of digrams appearing in the RHS
|
24
|
+
attr_reader(:digrams)
|
25
|
+
|
26
|
+
# Constructor.
|
27
|
+
# Build a production with an empty RHS.
|
28
|
+
def initialize()
|
29
|
+
@rhs = SymbolSequence.new
|
30
|
+
@refcount = 0
|
31
|
+
@digrams = []
|
32
|
+
end
|
33
|
+
|
34
|
+
|
35
|
+
|
36
|
+
public
|
37
|
+
|
38
|
+
# Identity testing.
|
39
|
+
# @param other [Production or ProductionRef] another production or production reference.
|
40
|
+
# @return true when the receiver and other are the same.
|
41
|
+
def ==(other)
|
42
|
+
return true if object_id == other.object_id
|
43
|
+
|
44
|
+
if other.is_a?(ProductionRef)
|
45
|
+
result = (other == self)
|
46
|
+
else
|
47
|
+
result = false
|
48
|
+
end
|
49
|
+
|
50
|
+
return result
|
51
|
+
end
|
52
|
+
|
53
|
+
|
54
|
+
# Is the rhs empty?
|
55
|
+
# @return true if the rhs has no members.
|
56
|
+
def empty?
|
57
|
+
return rhs.empty?
|
58
|
+
end
|
59
|
+
|
60
|
+
# Increment the reference count by one.
|
61
|
+
def incr_refcount()
|
62
|
+
@refcount += 1
|
63
|
+
end
|
64
|
+
|
65
|
+
# Decrement the reference count by one.
|
66
|
+
def decr_refcount()
|
67
|
+
fail StandardError, 'Internal error' if @refcount == 0
|
68
|
+
@refcount -= 1
|
69
|
+
end
|
70
|
+
|
71
|
+
|
72
|
+
# Select the references to production appearing in the rhs.
|
73
|
+
# @return [Array of ProductionRef]
|
74
|
+
def references()
|
75
|
+
return rhs.references
|
76
|
+
end
|
77
|
+
|
78
|
+
# Look in the rhs for all the references to the production passed as argument.
|
79
|
+
# @param a_prod [Production or ProductionRef] The production to search for.
|
80
|
+
# @return [Array] the array of ProductionRef to the passed production
|
81
|
+
def references_of(a_prod)
|
82
|
+
real_prod = a_prod.is_a?(ProductionRef) ? a_prod.production : a_prod
|
83
|
+
return rhs.references_of(real_prod)
|
84
|
+
end
|
85
|
+
|
86
|
+
|
87
|
+
# Enumerate the digrams appearing in the right-hand side (rhs)
|
88
|
+
# @return [Array] the list of digrams found in rhs of this production.
|
89
|
+
def recalc_digrams()
|
90
|
+
return [] if rhs.size < 2
|
91
|
+
|
92
|
+
result = []
|
93
|
+
rhs.symbols.each_cons(2) { |couple| result << Digram.new(*couple, self) }
|
94
|
+
@digrams = result
|
95
|
+
end
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
# Does the rhs have exactly one digram only (= 2 symbols)?
|
100
|
+
# @return [true/false] true when the rhs contains exactly two symbols.
|
101
|
+
def single_digram?
|
102
|
+
return rhs.size == 2
|
103
|
+
end
|
104
|
+
|
105
|
+
|
106
|
+
# Detect whether the last digram occurs twice
|
107
|
+
# Assumption: when a digram occurs twice in a production then it must occur
|
108
|
+
# at the end of the rhs
|
109
|
+
# @return [true/false] true when the digram occurs twice in rhs.
|
110
|
+
def repeated_digram?()
|
111
|
+
return false if rhs.size < 3
|
112
|
+
|
113
|
+
my_digrams = digrams
|
114
|
+
all_keys = my_digrams.map(&:key)
|
115
|
+
last_key = all_keys.pop
|
116
|
+
same_key_found = all_keys.index(last_key)
|
117
|
+
return !same_key_found.nil?
|
118
|
+
end
|
119
|
+
|
120
|
+
# Retrieve the last digram appearing in the RHS (if any).
|
121
|
+
# @return [Digram] last digram in the rhs otherwise nil.
|
122
|
+
def last_digram()
|
123
|
+
result = digrams.empty? ? nil : digrams.last
|
124
|
+
return result
|
125
|
+
end
|
126
|
+
|
127
|
+
|
128
|
+
|
129
|
+
# Emit a text representation of the production rule.
|
130
|
+
# Text is of the form:
|
131
|
+
# object id of production : rhs as space-separated sequence of symbols.
|
132
|
+
# @return [String]
|
133
|
+
def to_string()
|
134
|
+
return "#{object_id} : #{rhs.to_string}."
|
135
|
+
end
|
136
|
+
|
137
|
+
# Add a (grammar) symbol at the end of the RHS.
|
138
|
+
# @param aSymbol [Object] A (grammar) symbol to add.
|
139
|
+
def append_symbol(aSymbol)
|
140
|
+
case aSymbol
|
141
|
+
when Production
|
142
|
+
new_symb = ProductionRef.new(aSymbol)
|
143
|
+
when ProductionRef
|
144
|
+
if aSymbol.unbound?
|
145
|
+
msg = 'Fail to append reference to nil production in '
|
146
|
+
msg << to_string
|
147
|
+
fail StandardError, msg
|
148
|
+
end
|
149
|
+
new_symb = aSymbol.dup
|
150
|
+
else
|
151
|
+
new_symb = aSymbol
|
152
|
+
end
|
153
|
+
|
154
|
+
rhs << new_symb
|
155
|
+
digrams << Digram.new(rhs[-2], rhs[-1], self) if rhs.size >= 2
|
156
|
+
end
|
157
|
+
|
158
|
+
# Clear the right-hand side.
|
159
|
+
# Any referenced production has its reference counter decremented.
|
160
|
+
def clear_rhs()
|
161
|
+
rhs.clear
|
162
|
+
end
|
163
|
+
|
164
|
+
# Find all the positions where the digram occurs in the rhs
|
165
|
+
# @param symb1 [Object] first symbol of the digram
|
166
|
+
# @param symb2 [Object] second symbol of the digram
|
167
|
+
# @return [Array] the list of indices where the digram occurs in rhs.
|
168
|
+
# @example
|
169
|
+
# # Given the production p : a b c a b a b d
|
170
|
+
# #Then ...
|
171
|
+
# p.positions_of(a, b) # => [0, 3, 5]
|
172
|
+
# # Caution: "overlapping" digrams shouldn't be counted
|
173
|
+
# # Given the production p : a a b a a a c d
|
174
|
+
# # Then ...
|
175
|
+
# p.positions_of(a, a) # => [0, 3]
|
176
|
+
def positions_of(symb1, symb2)
|
177
|
+
|
178
|
+
# Find the positions where the digram occurs in rhs
|
179
|
+
indices = [ -2 ] # Dummy index!
|
180
|
+
(0...rhs.size).each do |i|
|
181
|
+
next if i == indices.last + 1
|
182
|
+
indices << i if (rhs[i] == symb1) && (rhs[i + 1] == symb2)
|
183
|
+
end
|
184
|
+
|
185
|
+
indices.shift
|
186
|
+
|
187
|
+
return indices
|
188
|
+
end
|
189
|
+
|
190
|
+
|
191
|
+
# Given that the production P passed as argument has exactly 2 symbols
|
192
|
+
# in its rhs s1 s2, substitute in the rhs of self all occurrences of
|
193
|
+
# s1 s2 by a reference to P.
|
194
|
+
# @param another [Production or ProductionRef] a production that
|
195
|
+
# consists exactly of one digram (= 2 symbols).
|
196
|
+
def reduce_step(another)
|
197
|
+
(symb1, symb2) = another.rhs.symbols
|
198
|
+
pos = positions_of(symb1, symb2).reverse
|
199
|
+
|
200
|
+
# Replace the two symbol sequence by the production
|
201
|
+
pos.each { |index| rhs.reduce_step(index, another) }
|
202
|
+
|
203
|
+
recalc_digrams
|
204
|
+
end
|
205
|
+
|
206
|
+
# Replace every occurrence of 'another' production in self.rhs by
|
207
|
+
# the symbols in the rhs of 'another'.
|
208
|
+
# @param another [Production or ProductionRef] a production that
|
209
|
+
# is expanded in place wherever it is referenced in the rhs of self.
|
210
|
+
# @example Synopsis
|
211
|
+
# # Given the production p_A : a p_B b p_B c
|
212
|
+
# # And the production p_B : x y
|
213
|
+
# # Then...
|
214
|
+
# p_A.derive_step(p_B)
|
215
|
+
# # Modifies p_A into: p_A -> a x y b x y c
|
216
|
+
def derive_step(another)
|
217
|
+
(0...rhs.size).to_a.reverse.each do |index|
|
218
|
+
next unless rhs[index] == another
|
219
|
+
|
220
|
+
rhs.insert_at(index + 1, another.rhs)
|
221
|
+
another.decr_refcount
|
222
|
+
rhs.delete_at(index)
|
223
|
+
end
|
224
|
+
|
225
|
+
recalc_digrams
|
226
|
+
end
|
227
|
+
|
228
|
+
|
229
|
+
# Part of the 'visitee' role in Visitor design pattern.
|
230
|
+
# @param aVisitor [GrammarVisitor] the visitor notified at the start and end of the visit.
|
231
|
+
def accept(aVisitor)
|
232
|
+
aVisitor.start_visit_production(self)
|
233
|
+
|
234
|
+
rhs.accept(aVisitor)
|
235
|
+
|
236
|
+
aVisitor.end_visit_production(self)
|
237
|
+
end
|
238
|
+
|
239
|
+
end # class
|
240
|
+
|
241
|
+
end # module
|
242
|
+
|
243
|
+
# End of file
|