sequitur 0.1.10 → 0.1.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/CHANGELOG.md +4 -0
- data/README.md +58 -1
- data/Rakefile +31 -31
- data/examples/integer_sample.rb +33 -0
- data/examples/porridge.rb +41 -0
- data/examples/simple_case.rb +27 -0
- data/examples/symbol_sample.rb +28 -0
- data/examples/word_sample.rb +30 -0
- data/lib/sequitur.rb +1 -1
- data/lib/sequitur/constants.rb +1 -1
- data/lib/sequitur/digram.rb +52 -52
- data/lib/sequitur/dynamic_grammar.rb +106 -106
- data/lib/sequitur/formatter/base_formatter.rb +39 -39
- data/lib/sequitur/formatter/base_text.rb +95 -95
- data/lib/sequitur/formatter/debug.rb +131 -131
- data/lib/sequitur/grammar_visitor.rb +110 -110
- data/lib/sequitur/production.rb +243 -243
- data/lib/sequitur/production_ref.rb +119 -119
- data/lib/sequitur/sequitur_grammar.rb +158 -158
- data/lib/sequitur/symbol_sequence.rb +182 -182
- data/spec/sequitur/sequitur_grammar_spec.rb +75 -3
- metadata +7 -2
@@ -1,182 +1,182 @@
|
|
1
|
-
module Sequitur # Module for classes implementing the Sequitur algorithm
|
2
|
-
# Represents a sequence (concatenation) of grammar symbols
|
3
|
-
# as they appear in rhs of productions
|
4
|
-
class SymbolSequence
|
5
|
-
# The sequence of symbols itself
|
6
|
-
attr_reader(:symbols)
|
7
|
-
|
8
|
-
# Create an empty sequence
|
9
|
-
def initialize()
|
10
|
-
@symbols = []
|
11
|
-
end
|
12
|
-
|
13
|
-
# Copy constructor invoked by dup or clone methods.
|
14
|
-
# @param orig [SymbolSequence]
|
15
|
-
def initialize_copy(orig)
|
16
|
-
# Deep copy to avoid the aliasing of production reference
|
17
|
-
@symbols = orig.symbols.map do |sym|
|
18
|
-
sym.is_a?(
|
19
|
-
end
|
20
|
-
invalidate_refs
|
21
|
-
end
|
22
|
-
|
23
|
-
public
|
24
|
-
|
25
|
-
# Clear the symbol sequence.
|
26
|
-
def clear()
|
27
|
-
refs = references
|
28
|
-
refs.each(&:unbind)
|
29
|
-
@symbols = []
|
30
|
-
invalidate_refs
|
31
|
-
end
|
32
|
-
|
33
|
-
# Tell whether the sequence is empty.
|
34
|
-
# @return [true / false] true only if the sequence has no symbol in it.
|
35
|
-
def empty?()
|
36
|
-
return symbols.empty?
|
37
|
-
end
|
38
|
-
|
39
|
-
# Count the number of elements in the sequence.
|
40
|
-
# @return [Fixnum] the number of elements
|
41
|
-
def size()
|
42
|
-
return symbols.size
|
43
|
-
end
|
44
|
-
|
45
|
-
# Append a grammar symbol at the end of the sequence.
|
46
|
-
# @param aSymbol [Object] The symbol to append.
|
47
|
-
def <<(aSymbol)
|
48
|
-
symbols << aSymbol
|
49
|
-
if aSymbol.is_a?(ProductionRef)
|
50
|
-
@memo_references ||= []
|
51
|
-
@memo_references << aSymbol
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
# Retrieve the element from the sequence at given position.
|
56
|
-
# @param anIndex [Fixnum] A zero-based index of the element to access.
|
57
|
-
def [](anIndex)
|
58
|
-
return symbols[anIndex]
|
59
|
-
end
|
60
|
-
|
61
|
-
# Equality testing.
|
62
|
-
# @param other [SymbolSequence or Array] the other sequence
|
63
|
-
# to compare to.
|
64
|
-
# @return true when each item from self equals the corresponding
|
65
|
-
# item from 'other'
|
66
|
-
def ==(other)
|
67
|
-
return true if object_id == other.object_id
|
68
|
-
|
69
|
-
case other
|
70
|
-
when SymbolSequence
|
71
|
-
same = symbols == other.symbols
|
72
|
-
when Array
|
73
|
-
same = symbols == other
|
74
|
-
else
|
75
|
-
same = false
|
76
|
-
end
|
77
|
-
|
78
|
-
return same
|
79
|
-
end
|
80
|
-
|
81
|
-
|
82
|
-
# Select the references to production appearing in the rhs.
|
83
|
-
# @return [Array of ProductionRef]
|
84
|
-
def references()
|
85
|
-
@memo_references ||= symbols.select { |symb| symb.is_a?(ProductionRef) }
|
86
|
-
return @memo_references
|
87
|
-
end
|
88
|
-
|
89
|
-
|
90
|
-
# Select the references of the given production appearing in the rhs.
|
91
|
-
# @param aProduction [Production]
|
92
|
-
# @return [Array of ProductionRef]
|
93
|
-
def references_of(aProduction)
|
94
|
-
return [] if references.empty?
|
95
|
-
result = references.select { |a_ref| a_ref == aProduction }
|
96
|
-
return result
|
97
|
-
end
|
98
|
-
|
99
|
-
|
100
|
-
# Emit a text representation of the symbol sequence.
|
101
|
-
# Text is of the form: space-separated sequence of symbols.
|
102
|
-
# @return [String]
|
103
|
-
def to_string()
|
104
|
-
rhs_text = symbols.map do |elem|
|
105
|
-
case elem
|
106
|
-
when String then "'#{elem}'"
|
107
|
-
else elem.to_s
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
return rhs_text.join(' ')
|
112
|
-
end
|
113
|
-
|
114
|
-
# Insert at position the elements from another sequence.
|
115
|
-
# @param position [Fixnum] A zero-based index at which the symbols are inserted.
|
116
|
-
# @param another [SymbolSequence] A production with a two-elements rhs
|
117
|
-
# (a single digram).
|
118
|
-
def insert_at(position, another)
|
119
|
-
klone = another.dup
|
120
|
-
symbols.insert(position, *klone.symbols)
|
121
|
-
invalidate_refs
|
122
|
-
end
|
123
|
-
|
124
|
-
# Given that the production P passed as argument has exactly 2 symbols
|
125
|
-
# in its rhs s1 s2, substitute in the rhs of self all occurrences of
|
126
|
-
# s1 s2 by a reference to P.
|
127
|
-
# @param index [Fixnum] the position of a two symbol sequence to be replaced
|
128
|
-
# by the production
|
129
|
-
# @param aProduction [Production or ProductionRef] a production that
|
130
|
-
# consists exactly of one digram (= 2 symbols).
|
131
|
-
def reduce_step(index, aProduction)
|
132
|
-
if symbols[index].is_a?(ProductionRef)
|
133
|
-
symbols[index].bind_to(aProduction)
|
134
|
-
else
|
135
|
-
new_ref = ProductionRef.new(aProduction)
|
136
|
-
symbols[index] = new_ref
|
137
|
-
@memo_references ||= []
|
138
|
-
@memo_references << new_ref
|
139
|
-
end
|
140
|
-
index1 = index + 1
|
141
|
-
if symbols[index1].is_a?(ProductionRef)
|
142
|
-
symbols[index1].unbind
|
143
|
-
invalidate_refs
|
144
|
-
end
|
145
|
-
delete_at(index1)
|
146
|
-
end
|
147
|
-
|
148
|
-
# Remove the element at given position
|
149
|
-
# @param position [Fixnum] a zero-based index.
|
150
|
-
def delete_at(position)
|
151
|
-
invalidate_refs if symbols[position].is_a?(ProductionRef)
|
152
|
-
symbols.delete_at(position)
|
153
|
-
end
|
154
|
-
|
155
|
-
|
156
|
-
# Part of the 'visitee' role in Visitor design pattern.
|
157
|
-
# @param aVisitor [GrammarVisitor]
|
158
|
-
def accept(aVisitor)
|
159
|
-
aVisitor.start_visit_rhs(self)
|
160
|
-
|
161
|
-
# Let's proceed with the visit of productions
|
162
|
-
symbols.each do |a_symb|
|
163
|
-
if a_symb.is_a?(ProductionRef)
|
164
|
-
a_symb.accept(aVisitor)
|
165
|
-
else
|
166
|
-
aVisitor.visit_terminal(a_symb)
|
167
|
-
end
|
168
|
-
end
|
169
|
-
|
170
|
-
aVisitor.end_visit_rhs(self)
|
171
|
-
end
|
172
|
-
|
173
|
-
private
|
174
|
-
|
175
|
-
def invalidate_refs()
|
176
|
-
@memo_references = nil
|
177
|
-
@lookup_references = nil
|
178
|
-
end
|
179
|
-
|
180
|
-
end # class
|
181
|
-
|
182
|
-
end # module
|
1
|
+
module Sequitur # Module for classes implementing the Sequitur algorithm
|
2
|
+
# Represents a sequence (concatenation) of grammar symbols
|
3
|
+
# as they appear in rhs of productions
|
4
|
+
class SymbolSequence
|
5
|
+
# The sequence of symbols itself
|
6
|
+
attr_reader(:symbols)
|
7
|
+
|
8
|
+
# Create an empty sequence
|
9
|
+
def initialize()
|
10
|
+
@symbols = []
|
11
|
+
end
|
12
|
+
|
13
|
+
# Copy constructor invoked by dup or clone methods.
|
14
|
+
# @param orig [SymbolSequence]
|
15
|
+
def initialize_copy(orig)
|
16
|
+
# Deep copy to avoid the aliasing of production reference
|
17
|
+
@symbols = orig.symbols.map do |sym|
|
18
|
+
sym.is_a?(ProductionRef) ? sym.dup : sym
|
19
|
+
end
|
20
|
+
invalidate_refs
|
21
|
+
end
|
22
|
+
|
23
|
+
public
|
24
|
+
|
25
|
+
# Clear the symbol sequence.
|
26
|
+
def clear()
|
27
|
+
refs = references
|
28
|
+
refs.each(&:unbind)
|
29
|
+
@symbols = []
|
30
|
+
invalidate_refs
|
31
|
+
end
|
32
|
+
|
33
|
+
# Tell whether the sequence is empty.
|
34
|
+
# @return [true / false] true only if the sequence has no symbol in it.
|
35
|
+
def empty?()
|
36
|
+
return symbols.empty?
|
37
|
+
end
|
38
|
+
|
39
|
+
# Count the number of elements in the sequence.
|
40
|
+
# @return [Fixnum] the number of elements
|
41
|
+
def size()
|
42
|
+
return symbols.size
|
43
|
+
end
|
44
|
+
|
45
|
+
# Append a grammar symbol at the end of the sequence.
|
46
|
+
# @param aSymbol [Object] The symbol to append.
|
47
|
+
def <<(aSymbol)
|
48
|
+
symbols << aSymbol
|
49
|
+
if aSymbol.is_a?(ProductionRef)
|
50
|
+
@memo_references ||= []
|
51
|
+
@memo_references << aSymbol
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
# Retrieve the element from the sequence at given position.
|
56
|
+
# @param anIndex [Fixnum] A zero-based index of the element to access.
|
57
|
+
def [](anIndex)
|
58
|
+
return symbols[anIndex]
|
59
|
+
end
|
60
|
+
|
61
|
+
# Equality testing.
|
62
|
+
# @param other [SymbolSequence or Array] the other sequence
|
63
|
+
# to compare to.
|
64
|
+
# @return true when each item from self equals the corresponding
|
65
|
+
# item from 'other'
|
66
|
+
def ==(other)
|
67
|
+
return true if object_id == other.object_id
|
68
|
+
|
69
|
+
case other
|
70
|
+
when SymbolSequence
|
71
|
+
same = symbols == other.symbols
|
72
|
+
when Array
|
73
|
+
same = symbols == other
|
74
|
+
else
|
75
|
+
same = false
|
76
|
+
end
|
77
|
+
|
78
|
+
return same
|
79
|
+
end
|
80
|
+
|
81
|
+
|
82
|
+
# Select the references to production appearing in the rhs.
|
83
|
+
# @return [Array of ProductionRef]
|
84
|
+
def references()
|
85
|
+
@memo_references ||= symbols.select { |symb| symb.is_a?(ProductionRef) }
|
86
|
+
return @memo_references
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
# Select the references of the given production appearing in the rhs.
|
91
|
+
# @param aProduction [Production]
|
92
|
+
# @return [Array of ProductionRef]
|
93
|
+
def references_of(aProduction)
|
94
|
+
return [] if references.empty?
|
95
|
+
result = references.select { |a_ref| a_ref == aProduction }
|
96
|
+
return result
|
97
|
+
end
|
98
|
+
|
99
|
+
|
100
|
+
# Emit a text representation of the symbol sequence.
|
101
|
+
# Text is of the form: space-separated sequence of symbols.
|
102
|
+
# @return [String]
|
103
|
+
def to_string()
|
104
|
+
rhs_text = symbols.map do |elem|
|
105
|
+
case elem
|
106
|
+
when String then "'#{elem}'"
|
107
|
+
else elem.to_s
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
return rhs_text.join(' ')
|
112
|
+
end
|
113
|
+
|
114
|
+
# Insert at position the elements from another sequence.
|
115
|
+
# @param position [Fixnum] A zero-based index at which the symbols are inserted.
|
116
|
+
# @param another [SymbolSequence] A production with a two-elements rhs
|
117
|
+
# (a single digram).
|
118
|
+
def insert_at(position, another)
|
119
|
+
klone = another.dup
|
120
|
+
symbols.insert(position, *klone.symbols)
|
121
|
+
invalidate_refs
|
122
|
+
end
|
123
|
+
|
124
|
+
# Given that the production P passed as argument has exactly 2 symbols
|
125
|
+
# in its rhs s1 s2, substitute in the rhs of self all occurrences of
|
126
|
+
# s1 s2 by a reference to P.
|
127
|
+
# @param index [Fixnum] the position of a two symbol sequence to be replaced
|
128
|
+
# by the production
|
129
|
+
# @param aProduction [Production or ProductionRef] a production that
|
130
|
+
# consists exactly of one digram (= 2 symbols).
|
131
|
+
def reduce_step(index, aProduction)
|
132
|
+
if symbols[index].is_a?(ProductionRef)
|
133
|
+
symbols[index].bind_to(aProduction)
|
134
|
+
else
|
135
|
+
new_ref = ProductionRef.new(aProduction)
|
136
|
+
symbols[index] = new_ref
|
137
|
+
@memo_references ||= []
|
138
|
+
@memo_references << new_ref
|
139
|
+
end
|
140
|
+
index1 = index + 1
|
141
|
+
if symbols[index1].is_a?(ProductionRef)
|
142
|
+
symbols[index1].unbind
|
143
|
+
invalidate_refs
|
144
|
+
end
|
145
|
+
delete_at(index1)
|
146
|
+
end
|
147
|
+
|
148
|
+
# Remove the element at given position
|
149
|
+
# @param position [Fixnum] a zero-based index.
|
150
|
+
def delete_at(position)
|
151
|
+
invalidate_refs if symbols[position].is_a?(ProductionRef)
|
152
|
+
symbols.delete_at(position)
|
153
|
+
end
|
154
|
+
|
155
|
+
|
156
|
+
# Part of the 'visitee' role in Visitor design pattern.
|
157
|
+
# @param aVisitor [GrammarVisitor]
|
158
|
+
def accept(aVisitor)
|
159
|
+
aVisitor.start_visit_rhs(self)
|
160
|
+
|
161
|
+
# Let's proceed with the visit of productions
|
162
|
+
symbols.each do |a_symb|
|
163
|
+
if a_symb.is_a?(ProductionRef)
|
164
|
+
a_symb.accept(aVisitor)
|
165
|
+
else
|
166
|
+
aVisitor.visit_terminal(a_symb)
|
167
|
+
end
|
168
|
+
end
|
169
|
+
|
170
|
+
aVisitor.end_visit_rhs(self)
|
171
|
+
end
|
172
|
+
|
173
|
+
private
|
174
|
+
|
175
|
+
def invalidate_refs()
|
176
|
+
@memo_references = nil
|
177
|
+
@lookup_references = nil
|
178
|
+
end
|
179
|
+
|
180
|
+
end # class
|
181
|
+
|
182
|
+
end # module
|
@@ -74,7 +74,7 @@ describe SequiturGrammar do
|
|
74
74
|
expect(p_a.rhs).to eq([:a, :b, :c])
|
75
75
|
expect(instance.start.rhs).to eq([p_a, p_a])
|
76
76
|
end
|
77
|
-
|
77
|
+
|
78
78
|
it 'should cope with a pattern that caused an exception' do
|
79
79
|
input = 'aaac' # This sequence raised an exception
|
80
80
|
|
@@ -102,6 +102,78 @@ describe SequiturGrammar do
|
|
102
102
|
expect(p3.rhs).to eq(['b', p2, 'e'])
|
103
103
|
end
|
104
104
|
|
105
|
+
it 'should work with strings instead of single char input tokens' do
|
106
|
+
# Raw input is sequence of chars
|
107
|
+
raw_input = 'bbebeebebebbebee'
|
108
|
+
|
109
|
+
# Convert them into multichar strings
|
110
|
+
input = raw_input.chars.map do |ch|
|
111
|
+
'letter_' + ch
|
112
|
+
end
|
113
|
+
|
114
|
+
# Creation
|
115
|
+
instance = SequiturGrammar.new(input.to_enum)
|
116
|
+
|
117
|
+
# Expectations:
|
118
|
+
# S: P3 P2 P3
|
119
|
+
# P1: b e
|
120
|
+
# P2: P1 P1
|
121
|
+
# P3: b P2 e
|
122
|
+
expect(instance.productions.size).to eq(4)
|
123
|
+
(p1, p2, p3) = instance.productions[1..3]
|
124
|
+
expect(instance.start.rhs).to eq([p3, p2, p3])
|
125
|
+
expect(p1.rhs).to eq(%w(letter_b letter_e))
|
126
|
+
expect(p2.rhs).to eq([p1, p1])
|
127
|
+
expect(p3.rhs).to eq(['letter_b',p2, 'letter_e'])
|
128
|
+
end
|
129
|
+
|
130
|
+
it 'should work with Symbol instead of single char input tokens' do
|
131
|
+
# Raw input is sequence of single characters
|
132
|
+
raw_input = 'bbebeebebebbebee'
|
133
|
+
|
134
|
+
# Convert them into symbols
|
135
|
+
input = raw_input.chars.map(&:to_sym)
|
136
|
+
|
137
|
+
# Creation
|
138
|
+
instance = SequiturGrammar.new(input.to_enum)
|
139
|
+
|
140
|
+
# Expectations:
|
141
|
+
# S: P3 P2 P3
|
142
|
+
# P1: b e
|
143
|
+
# P2: P1 P1
|
144
|
+
# P3: b P2 e
|
145
|
+
expect(instance.productions.size).to eq(4)
|
146
|
+
(p1, p2, p3) = instance.productions[1..3]
|
147
|
+
expect(instance.start.rhs).to eq([p3, p2, p3])
|
148
|
+
expect(p1.rhs).to eq([:b, :e])
|
149
|
+
expect(p2.rhs).to eq([p1, p1])
|
150
|
+
expect(p3.rhs).to eq([:b, p2, :e])
|
151
|
+
end
|
152
|
+
|
153
|
+
|
154
|
+
it 'should work with integer values as input tokens' do
|
155
|
+
# Raw input is sequence of hex digits
|
156
|
+
raw_input = 'bbebeebebebbebee'
|
157
|
+
|
158
|
+
# Convert them into Fixnums
|
159
|
+
input = raw_input.chars.map { |ch| ch.to_i(16) }
|
160
|
+
|
161
|
+
# Creation
|
162
|
+
instance = SequiturGrammar.new(input.to_enum)
|
163
|
+
|
164
|
+
# Expectations:
|
165
|
+
# S: P3 P2 P3
|
166
|
+
# P1: b e
|
167
|
+
# P2: P1 P1
|
168
|
+
# P3: b P2 e
|
169
|
+
expect(instance.productions.size).to eq(4)
|
170
|
+
(p1, p2, p3) = instance.productions[1..3]
|
171
|
+
expect(instance.start.rhs).to eq([p3, p2, p3])
|
172
|
+
expect(p1.rhs).to eq([0xb, 0xe])
|
173
|
+
expect(p2.rhs).to eq([p1, p1])
|
174
|
+
expect(p3.rhs).to eq([0xb, p2, 0xe])
|
175
|
+
end
|
176
|
+
|
105
177
|
it 'should cope with the example from sequitur.info website' do
|
106
178
|
input = 'abcabdabcabd'
|
107
179
|
instance = SequiturGrammar.new(input.chars)
|
@@ -153,7 +225,7 @@ SNIPPET
|
|
153
225
|
# 2 → h o t hot
|
154
226
|
# 3 → 10 1 ,↵pease_porridge_
|
155
227
|
# 4 → c 11 cold
|
156
|
-
# 5 → 12 _ t h 8 t 10 n 12 9 d a y s _ 11 . ↵
|
228
|
+
# 5 → 12 _ t h 8 t 10 n 12 9 d a y s _ 11 . ↵
|
157
229
|
# in_the_pot,↵nine_days_old.↵
|
158
230
|
# 6 → s o m 9 l i k 9 i t _ some_like_it_
|
159
231
|
# 7 → 10 6 ,↵some_like_it_
|
@@ -196,7 +268,7 @@ SNIPPET
|
|
196
268
|
].flatten
|
197
269
|
expect(p12.rhs).to eq(p12_expectation) # Rule 5 above
|
198
270
|
end
|
199
|
-
|
271
|
+
|
200
272
|
it 'should work with a sequence of Ruby Symbols' do
|
201
273
|
input = 'abcabdabcabd'.chars.map(&:to_sym)
|
202
274
|
instance = SequiturGrammar.new(input.to_enum)
|