sequitur 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/CHANGELOG.md +4 -0
- data/README.md +58 -1
- data/Rakefile +31 -31
- data/examples/integer_sample.rb +33 -0
- data/examples/porridge.rb +41 -0
- data/examples/simple_case.rb +27 -0
- data/examples/symbol_sample.rb +28 -0
- data/examples/word_sample.rb +30 -0
- data/lib/sequitur.rb +1 -1
- data/lib/sequitur/constants.rb +1 -1
- data/lib/sequitur/digram.rb +52 -52
- data/lib/sequitur/dynamic_grammar.rb +106 -106
- data/lib/sequitur/formatter/base_formatter.rb +39 -39
- data/lib/sequitur/formatter/base_text.rb +95 -95
- data/lib/sequitur/formatter/debug.rb +131 -131
- data/lib/sequitur/grammar_visitor.rb +110 -110
- data/lib/sequitur/production.rb +243 -243
- data/lib/sequitur/production_ref.rb +119 -119
- data/lib/sequitur/sequitur_grammar.rb +158 -158
- data/lib/sequitur/symbol_sequence.rb +182 -182
- data/spec/sequitur/sequitur_grammar_spec.rb +75 -3
- metadata +7 -2
@@ -1,182 +1,182 @@
|
|
1
|
-
module Sequitur # Module for classes implementing the Sequitur algorithm
|
2
|
-
# Represents a sequence (concatenation) of grammar symbols
|
3
|
-
# as they appear in rhs of productions
|
4
|
-
class SymbolSequence
|
5
|
-
# The sequence of symbols itself
|
6
|
-
attr_reader(:symbols)
|
7
|
-
|
8
|
-
# Create an empty sequence
|
9
|
-
def initialize()
|
10
|
-
@symbols = []
|
11
|
-
end
|
12
|
-
|
13
|
-
# Copy constructor invoked by dup or clone methods.
|
14
|
-
# @param orig [SymbolSequence]
|
15
|
-
def initialize_copy(orig)
|
16
|
-
# Deep copy to avoid the aliasing of production reference
|
17
|
-
@symbols = orig.symbols.map do |sym|
|
18
|
-
sym.is_a?(
|
19
|
-
end
|
20
|
-
invalidate_refs
|
21
|
-
end
|
22
|
-
|
23
|
-
public
|
24
|
-
|
25
|
-
# Clear the symbol sequence.
|
26
|
-
def clear()
|
27
|
-
refs = references
|
28
|
-
refs.each(&:unbind)
|
29
|
-
@symbols = []
|
30
|
-
invalidate_refs
|
31
|
-
end
|
32
|
-
|
33
|
-
# Tell whether the sequence is empty.
|
34
|
-
# @return [true / false] true only if the sequence has no symbol in it.
|
35
|
-
def empty?()
|
36
|
-
return symbols.empty?
|
37
|
-
end
|
38
|
-
|
39
|
-
# Count the number of elements in the sequence.
|
40
|
-
# @return [Fixnum] the number of elements
|
41
|
-
def size()
|
42
|
-
return symbols.size
|
43
|
-
end
|
44
|
-
|
45
|
-
# Append a grammar symbol at the end of the sequence.
|
46
|
-
# @param aSymbol [Object] The symbol to append.
|
47
|
-
def <<(aSymbol)
|
48
|
-
symbols << aSymbol
|
49
|
-
if aSymbol.is_a?(ProductionRef)
|
50
|
-
@memo_references ||= []
|
51
|
-
@memo_references << aSymbol
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
# Retrieve the element from the sequence at given position.
|
56
|
-
# @param anIndex [Fixnum] A zero-based index of the element to access.
|
57
|
-
def [](anIndex)
|
58
|
-
return symbols[anIndex]
|
59
|
-
end
|
60
|
-
|
61
|
-
# Equality testing.
|
62
|
-
# @param other [SymbolSequence or Array] the other sequence
|
63
|
-
# to compare to.
|
64
|
-
# @return true when an item from self equals the corresponding
|
65
|
-
# item from 'other'
|
66
|
-
def ==(other)
|
67
|
-
return true if object_id == other.object_id
|
68
|
-
|
69
|
-
case other
|
70
|
-
when SymbolSequence
|
71
|
-
same = symbols == other.symbols
|
72
|
-
when Array
|
73
|
-
same = symbols == other
|
74
|
-
else
|
75
|
-
same = false
|
76
|
-
end
|
77
|
-
|
78
|
-
return same
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
# Select the references to production appearing in the rhs.
|
83
|
-
# @return [Array of ProductionRef]
|
84
|
-
def references()
|
85
|
-
@memo_references ||= symbols.select { |symb| symb.is_a?(ProductionRef) }
|
86
|
-
return @memo_references
|
87
|
-
end
|
88
|
-
|
89
|
-
|
90
|
-
# Select the references of the given production appearing in the rhs.
|
91
|
-
# @param aProduction [Production]
|
92
|
-
# @return [Array of ProductionRef]
|
93
|
-
def references_of(aProduction)
|
94
|
-
return [] if references.empty?
|
95
|
-
result = references.select { |a_ref| a_ref == aProduction }
|
96
|
-
return result
|
97
|
-
end
|
98
|
-
|
99
|
-
|
100
|
-
# Emit a text representation of the symbol sequence.
|
101
|
-
# Text is of the form: space-separated sequence of symbols.
|
102
|
-
# @return [String]
|
103
|
-
def to_string()
|
104
|
-
rhs_text = symbols.map do |elem|
|
105
|
-
case elem
|
106
|
-
when String then "'#{elem}'"
|
107
|
-
else elem.to_s
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
return rhs_text.join(' ')
|
112
|
-
end
|
113
|
-
|
114
|
-
# Insert at position the elements from another sequence.
|
115
|
-
# @param position [Fixnum] A zero-based index at which the symbols are inserted.
|
116
|
-
# @param another [SymbolSequence] A production with a two-elements rhs
|
117
|
-
# (a single digram).
|
118
|
-
def insert_at(position, another)
|
119
|
-
klone = another.dup
|
120
|
-
symbols.insert(position, *klone.symbols)
|
121
|
-
invalidate_refs
|
122
|
-
end
|
123
|
-
|
124
|
-
# Given that the production P passed as argument has exactly 2 symbols
|
125
|
-
# in its rhs s1 s2, substitute in the rhs of self all occurrences of
|
126
|
-
# s1 s2 by a reference to P.
|
127
|
-
# @param index [Fixnum] the position of a two symbol sequence to be replaced
|
128
|
-
# by the production
|
129
|
-
# @param aProduction [Production or ProductionRef] a production that
|
130
|
-
# consists exactly of one digram (= 2 symbols).
|
131
|
-
def reduce_step(index, aProduction)
|
132
|
-
if symbols[index].is_a?(ProductionRef)
|
133
|
-
symbols[index].bind_to(aProduction)
|
134
|
-
else
|
135
|
-
new_ref = ProductionRef.new(aProduction)
|
136
|
-
symbols[index] = new_ref
|
137
|
-
@memo_references ||= []
|
138
|
-
@memo_references << new_ref
|
139
|
-
end
|
140
|
-
index1 = index + 1
|
141
|
-
if symbols[index1].is_a?(ProductionRef)
|
142
|
-
symbols[index1].unbind
|
143
|
-
invalidate_refs
|
144
|
-
end
|
145
|
-
delete_at(index1)
|
146
|
-
end
|
147
|
-
|
148
|
-
# Remove the element at given position
|
149
|
-
# @param position [Fixnum] a zero-based index.
|
150
|
-
def delete_at(position)
|
151
|
-
invalidate_refs if symbols[position].is_a?(ProductionRef)
|
152
|
-
symbols.delete_at(position)
|
153
|
-
end
|
154
|
-
|
155
|
-
|
156
|
-
# Part of the 'visitee' role in Visitor design pattern.
|
157
|
-
# @param aVisitor [GrammarVisitor]
|
158
|
-
def accept(aVisitor)
|
159
|
-
aVisitor.start_visit_rhs(self)
|
160
|
-
|
161
|
-
# Let's proceed with the visit of productions
|
162
|
-
symbols.each do |a_symb|
|
163
|
-
if a_symb.is_a?(ProductionRef)
|
164
|
-
a_symb.accept(aVisitor)
|
165
|
-
else
|
166
|
-
aVisitor.visit_terminal(a_symb)
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
aVisitor.end_visit_rhs(self)
|
171
|
-
end
|
172
|
-
|
173
|
-
private
|
174
|
-
|
175
|
-
def invalidate_refs()
|
176
|
-
@memo_references = nil
|
177
|
-
@lookup_references = nil
|
178
|
-
end
|
179
|
-
|
180
|
-
end # class
|
181
|
-
|
182
|
-
end # module
|
1
|
+
module Sequitur # Module for classes implementing the Sequitur algorithm
|
2
|
+
# Represents a sequence (concatenation) of grammar symbols
|
3
|
+
# as they appear in rhs of productions
|
4
|
+
class SymbolSequence
|
5
|
+
# The sequence of symbols itself
|
6
|
+
attr_reader(:symbols)
|
7
|
+
|
8
|
+
# Create an empty sequence
|
9
|
+
def initialize()
|
10
|
+
@symbols = []
|
11
|
+
end
|
12
|
+
|
13
|
+
# Copy constructor invoked by dup or clone methods.
|
14
|
+
# @param orig [SymbolSequence]
|
15
|
+
def initialize_copy(orig)
|
16
|
+
# Deep copy to avoid the aliasing of production reference
|
17
|
+
@symbols = orig.symbols.map do |sym|
|
18
|
+
sym.is_a?(ProductionRef) ? sym.dup : sym
|
19
|
+
end
|
20
|
+
invalidate_refs
|
21
|
+
end
|
22
|
+
|
23
|
+
public
|
24
|
+
|
25
|
+
# Clear the symbol sequence.
|
26
|
+
def clear()
|
27
|
+
refs = references
|
28
|
+
refs.each(&:unbind)
|
29
|
+
@symbols = []
|
30
|
+
invalidate_refs
|
31
|
+
end
|
32
|
+
|
33
|
+
# Tell whether the sequence is empty.
|
34
|
+
# @return [true / false] true only if the sequence has no symbol in it.
|
35
|
+
def empty?()
|
36
|
+
return symbols.empty?
|
37
|
+
end
|
38
|
+
|
39
|
+
# Count the number of elements in the sequence.
|
40
|
+
# @return [Fixnum] the number of elements
|
41
|
+
def size()
|
42
|
+
return symbols.size
|
43
|
+
end
|
44
|
+
|
45
|
+
# Append a grammar symbol at the end of the sequence.
|
46
|
+
# @param aSymbol [Object] The symbol to append.
|
47
|
+
def <<(aSymbol)
|
48
|
+
symbols << aSymbol
|
49
|
+
if aSymbol.is_a?(ProductionRef)
|
50
|
+
@memo_references ||= []
|
51
|
+
@memo_references << aSymbol
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
# Retrieve the element from the sequence at given position.
|
56
|
+
# @param anIndex [Fixnum] A zero-based index of the element to access.
|
57
|
+
def [](anIndex)
|
58
|
+
return symbols[anIndex]
|
59
|
+
end
|
60
|
+
|
61
|
+
# Equality testing.
|
62
|
+
# @param other [SymbolSequence or Array] the other sequence
|
63
|
+
# to compare to.
|
64
|
+
# @return true when an item from self equals the corresponding
|
65
|
+
# item from 'other'
|
66
|
+
def ==(other)
|
67
|
+
return true if object_id == other.object_id
|
68
|
+
|
69
|
+
case other
|
70
|
+
when SymbolSequence
|
71
|
+
same = symbols == other.symbols
|
72
|
+
when Array
|
73
|
+
same = symbols == other
|
74
|
+
else
|
75
|
+
same = false
|
76
|
+
end
|
77
|
+
|
78
|
+
return same
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
# Select the references to production appearing in the rhs.
|
83
|
+
# @return [Array of ProductionRef]
|
84
|
+
def references()
|
85
|
+
@memo_references ||= symbols.select { |symb| symb.is_a?(ProductionRef) }
|
86
|
+
return @memo_references
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
# Select the references of the given production appearing in the rhs.
|
91
|
+
# @param aProduction [Production]
|
92
|
+
# @return [Array of ProductionRef]
|
93
|
+
def references_of(aProduction)
|
94
|
+
return [] if references.empty?
|
95
|
+
result = references.select { |a_ref| a_ref == aProduction }
|
96
|
+
return result
|
97
|
+
end
|
98
|
+
|
99
|
+
|
100
|
+
# Emit a text representation of the symbol sequence.
|
101
|
+
# Text is of the form: space-separated sequence of symbols.
|
102
|
+
# @return [String]
|
103
|
+
def to_string()
|
104
|
+
rhs_text = symbols.map do |elem|
|
105
|
+
case elem
|
106
|
+
when String then "'#{elem}'"
|
107
|
+
else elem.to_s
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
return rhs_text.join(' ')
|
112
|
+
end
|
113
|
+
|
114
|
+
# Insert at position the elements from another sequence.
|
115
|
+
# @param position [Fixnum] A zero-based index at which the symbols are inserted.
|
116
|
+
# @param another [SymbolSequence] A production with a two-elements rhs
|
117
|
+
# (a single digram).
|
118
|
+
def insert_at(position, another)
|
119
|
+
klone = another.dup
|
120
|
+
symbols.insert(position, *klone.symbols)
|
121
|
+
invalidate_refs
|
122
|
+
end
|
123
|
+
|
124
|
+
# Given that the production P passed as argument has exactly 2 symbols
|
125
|
+
# in its rhs s1 s2, substitute in the rhs of self all occurrences of
|
126
|
+
# s1 s2 by a reference to P.
|
127
|
+
# @param index [Fixnum] the position of a two symbol sequence to be replaced
|
128
|
+
# by the production
|
129
|
+
# @param aProduction [Production or ProductionRef] a production that
|
130
|
+
# consists exactly of one digram (= 2 symbols).
|
131
|
+
def reduce_step(index, aProduction)
|
132
|
+
if symbols[index].is_a?(ProductionRef)
|
133
|
+
symbols[index].bind_to(aProduction)
|
134
|
+
else
|
135
|
+
new_ref = ProductionRef.new(aProduction)
|
136
|
+
symbols[index] = new_ref
|
137
|
+
@memo_references ||= []
|
138
|
+
@memo_references << new_ref
|
139
|
+
end
|
140
|
+
index1 = index + 1
|
141
|
+
if symbols[index1].is_a?(ProductionRef)
|
142
|
+
symbols[index1].unbind
|
143
|
+
invalidate_refs
|
144
|
+
end
|
145
|
+
delete_at(index1)
|
146
|
+
end
|
147
|
+
|
148
|
+
# Remove the element at given position
|
149
|
+
# @param position [Fixnum] a zero-based index.
|
150
|
+
def delete_at(position)
|
151
|
+
invalidate_refs if symbols[position].is_a?(ProductionRef)
|
152
|
+
symbols.delete_at(position)
|
153
|
+
end
|
154
|
+
|
155
|
+
|
156
|
+
# Part of the 'visitee' role in Visitor design pattern.
|
157
|
+
# @param aVisitor [GrammarVisitor]
|
158
|
+
def accept(aVisitor)
|
159
|
+
aVisitor.start_visit_rhs(self)
|
160
|
+
|
161
|
+
# Let's proceed with the visit of productions
|
162
|
+
symbols.each do |a_symb|
|
163
|
+
if a_symb.is_a?(ProductionRef)
|
164
|
+
a_symb.accept(aVisitor)
|
165
|
+
else
|
166
|
+
aVisitor.visit_terminal(a_symb)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
aVisitor.end_visit_rhs(self)
|
171
|
+
end
|
172
|
+
|
173
|
+
private
|
174
|
+
|
175
|
+
def invalidate_refs()
|
176
|
+
@memo_references = nil
|
177
|
+
@lookup_references = nil
|
178
|
+
end
|
179
|
+
|
180
|
+
end # class
|
181
|
+
|
182
|
+
end # module
|
@@ -74,7 +74,7 @@ describe SequiturGrammar do
|
|
74
74
|
expect(p_a.rhs).to eq([:a, :b, :c])
|
75
75
|
expect(instance.start.rhs).to eq([p_a, p_a])
|
76
76
|
end
|
77
|
-
|
77
|
+
|
78
78
|
it 'should cope with a pattern that caused an exception' do
|
79
79
|
input = 'aaac' # This sequence raised an exception
|
80
80
|
|
@@ -102,6 +102,78 @@ describe SequiturGrammar do
|
|
102
102
|
expect(p3.rhs).to eq(['b', p2, 'e'])
|
103
103
|
end
|
104
104
|
|
105
|
+
it 'should work with strings instead of single char input tokens' do
|
106
|
+
# Raw input is a sequence of chars
|
107
|
+
raw_input = 'bbebeebebebbebee'
|
108
|
+
|
109
|
+
# Convert them into multichar strings
|
110
|
+
input = raw_input.chars.map do |ch|
|
111
|
+
'letter_' + ch
|
112
|
+
end
|
113
|
+
|
114
|
+
# Creation
|
115
|
+
instance = SequiturGrammar.new(input.to_enum)
|
116
|
+
|
117
|
+
# Expectations:
|
118
|
+
# S: P3 P2 P3
|
119
|
+
# P1: b e
|
120
|
+
# P2: P1 P1
|
121
|
+
# P3: b P2 e
|
122
|
+
expect(instance.productions.size).to eq(4)
|
123
|
+
(p1, p2, p3) = instance.productions[1..3]
|
124
|
+
expect(instance.start.rhs).to eq([p3, p2, p3])
|
125
|
+
expect(p1.rhs).to eq(%w(letter_b letter_e))
|
126
|
+
expect(p2.rhs).to eq([p1, p1])
|
127
|
+
expect(p3.rhs).to eq(['letter_b',p2, 'letter_e'])
|
128
|
+
end
|
129
|
+
|
130
|
+
it 'should work with Symbol instead of single char input tokens' do
|
131
|
+
# Raw input is a sequence of single characters
|
132
|
+
raw_input = 'bbebeebebebbebee'
|
133
|
+
|
134
|
+
# Convert them into symbols
|
135
|
+
input = raw_input.chars.map(&:to_sym)
|
136
|
+
|
137
|
+
# Creation
|
138
|
+
instance = SequiturGrammar.new(input.to_enum)
|
139
|
+
|
140
|
+
# Expectations:
|
141
|
+
# S: P3 P2 P3
|
142
|
+
# P1: b e
|
143
|
+
# P2: P1 P1
|
144
|
+
# P3: b P2 e
|
145
|
+
expect(instance.productions.size).to eq(4)
|
146
|
+
(p1, p2, p3) = instance.productions[1..3]
|
147
|
+
expect(instance.start.rhs).to eq([p3, p2, p3])
|
148
|
+
expect(p1.rhs).to eq([:b, :e])
|
149
|
+
expect(p2.rhs).to eq([p1, p1])
|
150
|
+
expect(p3.rhs).to eq([:b, p2, :e])
|
151
|
+
end
|
152
|
+
|
153
|
+
|
154
|
+
it 'should work with integer values as input tokens' do
|
155
|
+
# Raw input is a sequence of hex digits
|
156
|
+
raw_input = 'bbebeebebebbebee'
|
157
|
+
|
158
|
+
# Convert them into Fixnums
|
159
|
+
input = raw_input.chars.map { |ch| ch.to_i(16) }
|
160
|
+
|
161
|
+
# Creation
|
162
|
+
instance = SequiturGrammar.new(input.to_enum)
|
163
|
+
|
164
|
+
# Expectations:
|
165
|
+
# S: P3 P2 P3
|
166
|
+
# P1: b e
|
167
|
+
# P2: P1 P1
|
168
|
+
# P3: b P2 e
|
169
|
+
expect(instance.productions.size).to eq(4)
|
170
|
+
(p1, p2, p3) = instance.productions[1..3]
|
171
|
+
expect(instance.start.rhs).to eq([p3, p2, p3])
|
172
|
+
expect(p1.rhs).to eq([0xb, 0xe])
|
173
|
+
expect(p2.rhs).to eq([p1, p1])
|
174
|
+
expect(p3.rhs).to eq([0xb, p2, 0xe])
|
175
|
+
end
|
176
|
+
|
105
177
|
it 'should cope with the example from sequitur.info website' do
|
106
178
|
input = 'abcabdabcabd'
|
107
179
|
instance = SequiturGrammar.new(input.chars)
|
@@ -153,7 +225,7 @@ SNIPPET
|
|
153
225
|
# 2 → h o t hot
|
154
226
|
# 3 → 10 1 ,↵pease_porridge_
|
155
227
|
# 4 → c 11 cold
|
156
|
-
# 5 → 12 _ t h 8 t 10 n 12 9 d a y s _ 11 . ↵
|
228
|
+
# 5 → 12 _ t h 8 t 10 n 12 9 d a y s _ 11 . ↵
|
157
229
|
# in_the_pot,↵nine_days_old.↵
|
158
230
|
# 6 → s o m 9 l i k 9 i t _ some_like_it_
|
159
231
|
# 7 → 10 6 ,↵some_like_it_
|
@@ -196,7 +268,7 @@ SNIPPET
|
|
196
268
|
].flatten
|
197
269
|
expect(p12.rhs).to eq(p12_expectation) # Rule 5 above
|
198
270
|
end
|
199
|
-
|
271
|
+
|
200
272
|
it 'should work with a sequence of Ruby Symbols' do
|
201
273
|
input = 'abcabdabcabd'.chars.map(&:to_sym)
|
202
274
|
instance = SequiturGrammar.new(input.to_enum)
|