sequitur 0.1.10 → 0.1.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/CHANGELOG.md +4 -0
- data/README.md +58 -1
- data/Rakefile +31 -31
- data/examples/integer_sample.rb +33 -0
- data/examples/porridge.rb +41 -0
- data/examples/simple_case.rb +27 -0
- data/examples/symbol_sample.rb +28 -0
- data/examples/word_sample.rb +30 -0
- data/lib/sequitur.rb +1 -1
- data/lib/sequitur/constants.rb +1 -1
- data/lib/sequitur/digram.rb +52 -52
- data/lib/sequitur/dynamic_grammar.rb +106 -106
- data/lib/sequitur/formatter/base_formatter.rb +39 -39
- data/lib/sequitur/formatter/base_text.rb +95 -95
- data/lib/sequitur/formatter/debug.rb +131 -131
- data/lib/sequitur/grammar_visitor.rb +110 -110
- data/lib/sequitur/production.rb +243 -243
- data/lib/sequitur/production_ref.rb +119 -119
- data/lib/sequitur/sequitur_grammar.rb +158 -158
- data/lib/sequitur/symbol_sequence.rb +182 -182
- data/spec/sequitur/sequitur_grammar_spec.rb +75 -3
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YjAyNWQ2ZDE4MmE2NGU3MTVjZTMxNDc2YjI4ZmE3NTBlMWRlZWM0MQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NGMzZDRiYjNiYzAwNjE0MzFhOGI0YWQyNjU4YmRjMzNkMDY1ZGEzOA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NjBhMGRlYWIyNmI1MWJmNmE0NDc5MzU4NTc4MjI1ZDVhYjIwZDA2NjAyYzdj
|
10
|
+
NGMzYmU4NmYzNDQ2NTI4MWJhYTQxYTBkYTU1NGVjODcyNjIwNjQxM2Q5ZTUz
|
11
|
+
NGU3YWQzODk2ODA5OWFkZDY0ZThkNjhmMzMzNTdhMTI2ZmZiYmM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NDA2OTE2YTAxYmZmODg1NTU3YWFmNTlmNWIwNzU2NjNlMTEzZmE2NGFiMDkw
|
14
|
+
OTYxMTkzMmNmZDJkZGJiZmE2ZDljNzdjY2EzZDk2YmYyMmM4YWJmNDBhNWI2
|
15
|
+
YWJjYWE1NWYwZmE3Y2Q0ZDMzMGYyMjdhMWQzZDI5NzI1MDgyMzc=
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
### 0.1.11 / 2014-10-07
|
2
|
+
* [CHANGE] File `README.md`: Added an example showing that Sequitur can work on a sequence of integers.
|
3
|
+
* [NEW] Folder `examples`: Added a few code samples.
|
4
|
+
|
1
5
|
### 0.1.10 / 2014-10-05
|
2
6
|
* [CHANGE] Code refactoring for performance. Impacted classes: `SequiturGrammar`, `SymbolSequence` and `Production`.
|
3
7
|
|
data/README.md
CHANGED
@@ -15,6 +15,13 @@ The following are good entry points to learn about the algorithm:
|
|
15
15
|
[Sequitur algorithm home](http://sequitur.info/)
|
16
16
|
[Wikipedia](http://en.wikipedia.org/wiki/Sequitur_algorithm)
|
17
17
|
|
18
|
+
### Highlights ###
|
19
|
+
* Pure Ruby implementation
|
20
|
+
* No runtime dependency on other gems,
|
21
|
+
* Test suite with 100% coverage,
|
22
|
+
* Documentation: 100% coverage (according to YARD), green badge from inch.io
|
23
|
+
* Algorithm works with different input token types (not limited to single characters)
|
24
|
+
|
18
25
|
### The theory in a nutshell ###
|
19
26
|
Given a sequence of input tokens (say, characters), the Sequitur algorithm
|
20
27
|
will represent that input sequence as a set of rules. As the algorithm detects
|
@@ -192,7 +199,57 @@ $[sudo] gem install sequitur
|
|
192
199
|
|
193
200
|
|
194
201
|
|
195
|
-
###
|
202
|
+
### Good to know ###
|
203
|
+
The above examples might give the impression that the input stream must consist of single
|
204
|
+
character tokens. This is simply not true.
|
205
|
+
This implementation is flexible enough to cope with other kinds of input values.
|
206
|
+
The next example shows how integer values can be correctly processed by Sequitur.
|
207
|
+
Assume that the input is the array of Fixnums *[1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]*.
|
208
|
+
Then the Sequitur algorithm will generate the rule set:
|
209
|
+
```
|
210
|
+
start : P1 P2 P3 P3 5.
|
211
|
+
P1 : 1 2.
|
212
|
+
P2 : P1 3.
|
213
|
+
P3 : P2 4.
|
214
|
+
```
|
215
|
+
|
216
|
+
|
217
|
+
```ruby
|
218
|
+
require 'sequitur' # Load the Sequitur library
|
219
|
+
|
220
|
+
#
|
221
|
+
# Purpose: demo of Sequitur with a stream of integer values
|
222
|
+
#
|
223
|
+
input_sequence = [1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
|
224
|
+
|
225
|
+
# Generate the grammar from the sequence
|
226
|
+
grammar = Sequitur.build_from(input_sequence)
|
227
|
+
|
228
|
+
|
229
|
+
# Use a formatter to display the grammar rules on the console output
|
230
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
231
|
+
|
232
|
+
# Now render the rules
|
233
|
+
formatter.render(grammar.visitor)
|
234
|
+
|
235
|
+
# Rendered output is:
|
236
|
+
# start : P1 P2 P3 P3 5.
|
237
|
+
# P1 : 1 2.
|
238
|
+
# P2 : P1 3.
|
239
|
+
# P3 : P2 4.
|
240
|
+
|
241
|
+
# Playing a bit with the API
|
242
|
+
# Access last symbol from rhs of start production:
|
243
|
+
last_symbol_p0 = grammar.start.rhs.symbols[-1]
|
244
|
+
puts last_symbol_p0 # => 5
|
245
|
+
|
246
|
+
# Access first symbol from rhs of P1 production:
|
247
|
+
first_symbol_p1 = grammar.productions[1].rhs.symbols[0]
|
248
|
+
|
249
|
+
puts first_symbol_p1 # => 1
|
250
|
+
```
|
251
|
+
|
252
|
+
More examples are available in the examples folder.
|
196
253
|
|
197
254
|
|
198
255
|
Copyright
|
data/Rakefile
CHANGED
@@ -1,31 +1,31 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require_relative './lib/sequitur/constants'
|
3
|
-
|
4
|
-
namespace :gem do
|
5
|
-
|
6
|
-
desc 'Push the gem to rubygems.org'
|
7
|
-
task :push do
|
8
|
-
system("gem push sequitur-#{Sequitur::Version}.gem")
|
9
|
-
end
|
10
|
-
|
11
|
-
end # namespace
|
12
|
-
|
13
|
-
# Testing-specific tasks
|
14
|
-
|
15
|
-
# RSpec as testing tool
|
16
|
-
require 'rspec/core/rake_task'
|
17
|
-
desc 'Run RSpec'
|
18
|
-
RSpec::Core::RakeTask.new do |spec|
|
19
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
# Run RSpec tests
|
24
|
-
desc 'Run tests, with RSpec'
|
25
|
-
task test: [:spec]
|
26
|
-
|
27
|
-
|
28
|
-
# Default rake task
|
29
|
-
task default: :test
|
30
|
-
|
31
|
-
# End of file
|
1
|
+
require 'rubygems'
|
2
|
+
require_relative './lib/sequitur/constants'
|
3
|
+
|
4
|
+
namespace :gem do
|
5
|
+
|
6
|
+
desc 'Push the gem to rubygems.org'
|
7
|
+
task :push do
|
8
|
+
system("gem push sequitur-#{Sequitur::Version}.gem")
|
9
|
+
end
|
10
|
+
|
11
|
+
end # namespace
|
12
|
+
|
13
|
+
# Testing-specific tasks
|
14
|
+
|
15
|
+
# RSpec as testing tool
|
16
|
+
require 'rspec/core/rake_task'
|
17
|
+
desc 'Run RSpec'
|
18
|
+
RSpec::Core::RakeTask.new do |spec|
|
19
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
# Run RSpec tests
|
24
|
+
desc 'Run tests, with RSpec'
|
25
|
+
task test: [:spec]
|
26
|
+
|
27
|
+
|
28
|
+
# Default rake task
|
29
|
+
task default: :test
|
30
|
+
|
31
|
+
# End of file
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
#
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of integer values
|
5
|
+
#
|
6
|
+
input_sequence = [1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
|
7
|
+
|
8
|
+
# Generate the grammar from the sequence
|
9
|
+
grammar = Sequitur.build_from(input_sequence)
|
10
|
+
|
11
|
+
|
12
|
+
# Use a formatter to display the grammar rules on the console output
|
13
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
14
|
+
|
15
|
+
# Now render the rules
|
16
|
+
formatter.render(grammar.visitor)
|
17
|
+
|
18
|
+
# Rendered output is:
|
19
|
+
# start : P1 P2 P3 P3 5.
|
20
|
+
# P1 : 1 2.
|
21
|
+
# P2 : P1 3.
|
22
|
+
# P3 : P2 4.
|
23
|
+
|
24
|
+
# Playing a bit with the API
|
25
|
+
# Access last symbol from rhs of start production:
|
26
|
+
last_symbol_p0 = grammar.start.rhs.symbols[-1]
|
27
|
+
puts last_symbol_p0 # => 5
|
28
|
+
|
29
|
+
# Access first symbol from rhs of P1 production:
|
30
|
+
first_symbol_p1 = grammar.productions[1].rhs.symbols[0]
|
31
|
+
|
32
|
+
puts first_symbol_p1 # => 1
|
33
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
|
4
|
+
# Purpose: demo to show that sequitur gem works on example from sequitur.info website
|
5
|
+
input_sequence =
|
6
|
+
input = <<-SNIPPET
|
7
|
+
pease porridge hot,
|
8
|
+
pease porridge cold,
|
9
|
+
pease porridge in the pot,
|
10
|
+
nine days old.
|
11
|
+
|
12
|
+
some like it hot,
|
13
|
+
some like it cold,
|
14
|
+
some like it in the pot,
|
15
|
+
nine days old.
|
16
|
+
SNIPPET
|
17
|
+
|
18
|
+
grammar = Sequitur.build_from(input_sequence)
|
19
|
+
|
20
|
+
# To display the grammar rules on the console output
|
21
|
+
# We use a formatter
|
22
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
23
|
+
|
24
|
+
# Now render the rules.
|
25
|
+
formatter.render(grammar.visitor)
|
26
|
+
|
27
|
+
# Rendered output is:
|
28
|
+
# start : P2 P8 P3 P10 P3 P12 P9 P8 P11 P10 P11 P12.
|
29
|
+
# P1 : e .
|
30
|
+
# P2 : p e a s P4 r r i d g P1.
|
31
|
+
# P3 : P5 P2.
|
32
|
+
# P4 : P1 p o.
|
33
|
+
# P5 : ,
|
34
|
+
# .
|
35
|
+
# P6 : i n.
|
36
|
+
# P7 : o l d.
|
37
|
+
# P8 : h o t.
|
38
|
+
# P9 : s o m P1 l i k P1 i t .
|
39
|
+
# P10 : c P7.
|
40
|
+
# P11 : P5 P9.
|
41
|
+
# P12 : P6 t h P4 t P5 n P6 P1 d a y s P7 .
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of single characters
|
5
|
+
input_sequence = 'ababcabcdabcde' # Let's analyze this string
|
6
|
+
|
7
|
+
# The SEQUITUR algorithm will detect the repeated 'ab' pattern
|
8
|
+
# and will generate a context-free grammar that represents the input string
|
9
|
+
grammar = Sequitur.build_from(input_sequence)
|
10
|
+
|
11
|
+
# To display the grammar rules on the console output
|
12
|
+
# We use a formatter
|
13
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
14
|
+
|
15
|
+
# Now render the rules. Each rule is displayed with the format:
|
16
|
+
# rule_id : a_sequence_grammar_symbols.
|
17
|
+
# Where:
|
18
|
+
# - rule_id is either 'start' or a name like 'Pxxxx' (xxxx is a sequential number)
|
19
|
+
# - a grammar symbol is either a terminal symbol
|
20
|
+
# (i.e. a character from the input) or a rule id
|
21
|
+
formatter.render(grammar.visitor)
|
22
|
+
|
23
|
+
# Rendered output is:
|
24
|
+
# start : P1 P2 P3 P3 e.
|
25
|
+
# P1 : a b.
|
26
|
+
# P2 : P1 c.
|
27
|
+
# P3 : P2 d.
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
#
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of Symbol values
|
5
|
+
#
|
6
|
+
input_sequence = [
|
7
|
+
:aa, :bb, :aa, :bb,
|
8
|
+
:cc, :aa, :bb, :cc,
|
9
|
+
:dd, :aa, :bb, :cc,
|
10
|
+
:dd, :ee
|
11
|
+
]
|
12
|
+
|
13
|
+
# Generate the grammar from the sequence
|
14
|
+
grammar = Sequitur.build_from(input_sequence)
|
15
|
+
|
16
|
+
|
17
|
+
# Use a formatter to display the grammar rules on the console output
|
18
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
19
|
+
|
20
|
+
# Now render the rules
|
21
|
+
formatter.render(grammar.visitor)
|
22
|
+
|
23
|
+
# Rendered output is:
|
24
|
+
# start : P1 P2 P3 P3 ee.
|
25
|
+
# P1 : aa bb.
|
26
|
+
# P2 : P1 cc.
|
27
|
+
# P3 : P2 dd.
|
28
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
#
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of text words
|
5
|
+
#
|
6
|
+
|
7
|
+
# Raw input is one String containing repeated sentences...
|
8
|
+
raw_input = <<-SNIPPET
|
9
|
+
Error: unknown character '?' at position 6
|
10
|
+
Error: illegal character '%' at position 20
|
11
|
+
Error: unknown character '/' at position 9
|
12
|
+
SNIPPET
|
13
|
+
|
14
|
+
# Convert into a sequence of words
|
15
|
+
input_sequence = raw_input.scan(/\w+/)
|
16
|
+
# Generate the grammar from the sequence
|
17
|
+
grammar = Sequitur.build_from(input_sequence)
|
18
|
+
|
19
|
+
|
20
|
+
# Use a formatter to display the grammar rules on the console output
|
21
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
22
|
+
|
23
|
+
# Now render the rules
|
24
|
+
formatter.render(grammar.visitor)
|
25
|
+
|
26
|
+
# Rendered output is:
|
27
|
+
# start : P2 6 Error illegal P1 20 P2 9.
|
28
|
+
# P1 : character at position.
|
29
|
+
# P2 : Error unknown P1.
|
30
|
+
|
data/lib/sequitur.rb
CHANGED
data/lib/sequitur/constants.rb
CHANGED
data/lib/sequitur/digram.rb
CHANGED
@@ -1,52 +1,52 @@
|
|
1
|
-
# File: digram.rb
|
2
|
-
|
3
|
-
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
|
-
|
5
|
-
# In linguistics, a digram is a sequence of two letters.
|
6
|
-
# In Sequitur, a digram is a sequence of two consecutive symbols that
|
7
|
-
# appear in a production rule. Each symbol in a digram
|
8
|
-
# can be a terminal or not.
|
9
|
-
class Digram
|
10
|
-
# The sequence of two consecutive grammar symbols.
|
11
|
-
# The two symbols should respond to the :hash message.
|
12
|
-
attr_reader(:symbols)
|
13
|
-
|
14
|
-
# A unique hash key of the digram
|
15
|
-
attr_reader(:key)
|
16
|
-
|
17
|
-
# The production in which the digram occurs
|
18
|
-
attr_reader(:production)
|
19
|
-
|
20
|
-
# Constructor.
|
21
|
-
# A digram represents a sequence of two symbols
|
22
|
-
# (that appears in a rhs of a production).
|
23
|
-
# Terminal symbols must respond to the :hash message.
|
24
|
-
# @param symbol1 [StringOrSymbol] First element of the digram
|
25
|
-
# @param symbol2 [StringOrSymbol] Second element of the digram
|
26
|
-
# @param aProduction [Production] Production in which the RHS
|
27
|
-
# the sequence symbol1 symbol2 appears.
|
28
|
-
def initialize(symbol1, symbol2, aProduction)
|
29
|
-
@symbols = [symbol1, symbol2]
|
30
|
-
@key = symbol1.hash.to_s(16) + ':' + symbol2.hash.to_s(16)
|
31
|
-
@production = aProduction
|
32
|
-
end
|
33
|
-
|
34
|
-
# Equality testing.
|
35
|
-
# true iff keys of both digrams are equal, false otherwise
|
36
|
-
# @param other [Digram] another to compare with
|
37
|
-
# @return [true/false]
|
38
|
-
def ==(other)
|
39
|
-
return key == other.key
|
40
|
-
end
|
41
|
-
|
42
|
-
# Does the digram consist of the same symbol twice?
|
43
|
-
# @return [true/false] true when symbols.first == symbols.last
|
44
|
-
def repeating?()
|
45
|
-
return symbols[0] == symbols[1]
|
46
|
-
end
|
47
|
-
|
48
|
-
end # class
|
49
|
-
|
50
|
-
end # module
|
51
|
-
|
52
|
-
# End of file
|
1
|
+
# File: digram.rb
|
2
|
+
|
3
|
+
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
|
+
|
5
|
+
# In linguistics, a digram is a sequence of two letters.
|
6
|
+
# In Sequitur, a digram is a sequence of two consecutive symbols that
|
7
|
+
# appear in a production rule. Each symbol in a digram
|
8
|
+
# can be a terminal or not.
|
9
|
+
class Digram
|
10
|
+
# The sequence of two consecutive grammar symbols.
|
11
|
+
# The two symbols should respond to the :hash message.
|
12
|
+
attr_reader(:symbols)
|
13
|
+
|
14
|
+
# A unique hash key of the digram
|
15
|
+
attr_reader(:key)
|
16
|
+
|
17
|
+
# The production in which the digram occurs
|
18
|
+
attr_reader(:production)
|
19
|
+
|
20
|
+
# Constructor.
|
21
|
+
# A digram represents a sequence of two symbols
|
22
|
+
# (that appears in a rhs of a production).
|
23
|
+
# Terminal symbols must respond to the :hash message.
|
24
|
+
# @param symbol1 [StringOrSymbol] First element of the digram
|
25
|
+
# @param symbol2 [StringOrSymbol] Second element of the digram
|
26
|
+
# @param aProduction [Production] Production in which the RHS
|
27
|
+
# the sequence symbol1 symbol2 appears.
|
28
|
+
def initialize(symbol1, symbol2, aProduction)
|
29
|
+
@symbols = [symbol1, symbol2]
|
30
|
+
@key = symbol1.hash.to_s(16) + ':' + symbol2.hash.to_s(16)
|
31
|
+
@production = aProduction
|
32
|
+
end
|
33
|
+
|
34
|
+
# Equality testing.
|
35
|
+
# true iff keys of both digrams are equal, false otherwise
|
36
|
+
# @param other [Digram] another to compare with
|
37
|
+
# @return [true/false]
|
38
|
+
def ==(other)
|
39
|
+
return key == other.key
|
40
|
+
end
|
41
|
+
|
42
|
+
# Does the digram consist of the same symbol twice?
|
43
|
+
# @return [true/false] true when symbols.first == symbols.last
|
44
|
+
def repeating?()
|
45
|
+
return symbols[0] == symbols[1]
|
46
|
+
end
|
47
|
+
|
48
|
+
end # class
|
49
|
+
|
50
|
+
end # module
|
51
|
+
|
52
|
+
# End of file
|