sequitur 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/CHANGELOG.md +4 -0
- data/README.md +58 -1
- data/Rakefile +31 -31
- data/examples/integer_sample.rb +33 -0
- data/examples/porridge.rb +41 -0
- data/examples/simple_case.rb +27 -0
- data/examples/symbol_sample.rb +28 -0
- data/examples/word_sample.rb +30 -0
- data/lib/sequitur.rb +1 -1
- data/lib/sequitur/constants.rb +1 -1
- data/lib/sequitur/digram.rb +52 -52
- data/lib/sequitur/dynamic_grammar.rb +106 -106
- data/lib/sequitur/formatter/base_formatter.rb +39 -39
- data/lib/sequitur/formatter/base_text.rb +95 -95
- data/lib/sequitur/formatter/debug.rb +131 -131
- data/lib/sequitur/grammar_visitor.rb +110 -110
- data/lib/sequitur/production.rb +243 -243
- data/lib/sequitur/production_ref.rb +119 -119
- data/lib/sequitur/sequitur_grammar.rb +158 -158
- data/lib/sequitur/symbol_sequence.rb +182 -182
- data/spec/sequitur/sequitur_grammar_spec.rb +75 -3
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YjAyNWQ2ZDE4MmE2NGU3MTVjZTMxNDc2YjI4ZmE3NTBlMWRlZWM0MQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NGMzZDRiYjNiYzAwNjE0MzFhOGI0YWQyNjU4YmRjMzNkMDY1ZGEzOA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NjBhMGRlYWIyNmI1MWJmNmE0NDc5MzU4NTc4MjI1ZDVhYjIwZDA2NjAyYzdj
|
10
|
+
NGMzYmU4NmYzNDQ2NTI4MWJhYTQxYTBkYTU1NGVjODcyNjIwNjQxM2Q5ZTUz
|
11
|
+
NGU3YWQzODk2ODA5OWFkZDY0ZThkNjhmMzMzNTdhMTI2ZmZiYmM=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NDA2OTE2YTAxYmZmODg1NTU3YWFmNTlmNWIwNzU2NjNlMTEzZmE2NGFiMDkw
|
14
|
+
OTYxMTkzMmNmZDJkZGJiZmE2ZDljNzdjY2EzZDk2YmYyMmM4YWJmNDBhNWI2
|
15
|
+
YWJjYWE1NWYwZmE3Y2Q0ZDMzMGYyMjdhMWQzZDI5NzI1MDgyMzc=
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
### 0.1.11 / 2014-10-07
|
2
|
+
* [CHANGE] File `README.md`: Added an example showing that Sequitur can work on a sequence of integers.
|
3
|
+
* [NEW] Folder `examples`: Added a few code samples.
|
4
|
+
|
1
5
|
### 0.1.10 / 2014-10-05
|
2
6
|
* [CHANGE] Code refactoring for performance. Impacted classes: `SequiturGrammar`, `SymbolSequence` and `Production`.
|
3
7
|
|
data/README.md
CHANGED
@@ -15,6 +15,13 @@ The following are good entry points to learn about the algorithm:
|
|
15
15
|
[Sequitur algorithm home](http://sequitur.info/)
|
16
16
|
[Wikipedia](http://en.wikipedia.org/wiki/Sequitur_algorithm)
|
17
17
|
|
18
|
+
### Highlights ###
|
19
|
+
* Pure Ruby implementation
|
20
|
+
* No runtime dependency on other gems,
|
21
|
+
* Test suite with 100% coverage,
|
22
|
+
* Documentation: 100% coverage (according to YARD), green badge from inch.io
|
23
|
+
* Algorithm works with different input token types (not limited to single characters)
|
24
|
+
|
18
25
|
### The theory in a nutshell ###
|
19
26
|
Given a sequence of input tokens (say, characters), the Sequitur algorithm
|
20
27
|
will represent that input sequence as a set of rules. As the algorithm detects
|
@@ -192,7 +199,57 @@ $[sudo] gem install sequitur
|
|
192
199
|
|
193
200
|
|
194
201
|
|
195
|
-
###
|
202
|
+
### Good to know ###
|
203
|
+
The above examples might give the impression that the input stream must consist of single
|
204
|
+
character tokens. This is simply not true.
|
205
|
+
This implementation is flexible enough to cope with other kinds of input values.
|
206
|
+
The next example shows how integer values can be correctly processed by Sequitur.
|
207
|
+
Assume that the input is the array of Fixnums *[1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]*.
|
208
|
+
Then the Sequitur algorithm will generate the rule set:
|
209
|
+
```
|
210
|
+
start : P1 P2 P3 P3 5.
|
211
|
+
P1 : 1 2.
|
212
|
+
P2 : P1 3.
|
213
|
+
P3 : P2 4.
|
214
|
+
```
|
215
|
+
|
216
|
+
|
217
|
+
```ruby
|
218
|
+
require 'sequitur' # Load the Sequitur library
|
219
|
+
|
220
|
+
#
|
221
|
+
# Purpose: demo of Sequitur with a stream of integer values
|
222
|
+
#
|
223
|
+
input_sequence = [1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
|
224
|
+
|
225
|
+
# Generate the grammar from the sequence
|
226
|
+
grammar = Sequitur.build_from(input_sequence)
|
227
|
+
|
228
|
+
|
229
|
+
# Use a formatter to display the grammar rules on the console output
|
230
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
231
|
+
|
232
|
+
# Now render the rules
|
233
|
+
formatter.render(grammar.visitor)
|
234
|
+
|
235
|
+
# Rendered output is:
|
236
|
+
# start : P1 P2 P3 P3 5.
|
237
|
+
# P1 : 1 2.
|
238
|
+
# P2 : P1 3.
|
239
|
+
# P3 : P2 4.
|
240
|
+
|
241
|
+
# Playing a bit with the API
|
242
|
+
# Access last symbol from rhs of start production:
|
243
|
+
last_symbol_p0 = grammar.start.rhs.symbols[-1]
|
244
|
+
puts last_symbol_p0 # => 5
|
245
|
+
|
246
|
+
# Access first symbol from rhs of P1 production:
|
247
|
+
first_symbol_p1 = grammar.productions[1].rhs.symbols[0]
|
248
|
+
|
249
|
+
puts first_symbol_p1 # => 1
|
250
|
+
```
|
251
|
+
|
252
|
+
More examples are available in the examples folder.
|
196
253
|
|
197
254
|
|
198
255
|
Copyright
|
data/Rakefile
CHANGED
@@ -1,31 +1,31 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require_relative './lib/sequitur/constants'
|
3
|
-
|
4
|
-
namespace :gem do
|
5
|
-
|
6
|
-
desc 'Push the gem to rubygems.org'
|
7
|
-
task :push do
|
8
|
-
system("gem push sequitur-#{Sequitur::Version}.gem")
|
9
|
-
end
|
10
|
-
|
11
|
-
end # namespace
|
12
|
-
|
13
|
-
# Testing-specific tasks
|
14
|
-
|
15
|
-
# RSpec as testing tool
|
16
|
-
require 'rspec/core/rake_task'
|
17
|
-
desc 'Run RSpec'
|
18
|
-
RSpec::Core::RakeTask.new do |spec|
|
19
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
20
|
-
end
|
21
|
-
|
22
|
-
|
23
|
-
# Run RSpec tests
|
24
|
-
desc 'Run tests, with RSpec'
|
25
|
-
task test: [:spec]
|
26
|
-
|
27
|
-
|
28
|
-
# Default rake task
|
29
|
-
task default: :test
|
30
|
-
|
31
|
-
# End of file
|
1
|
+
require 'rubygems'
|
2
|
+
require_relative './lib/sequitur/constants'
|
3
|
+
|
4
|
+
namespace :gem do
|
5
|
+
|
6
|
+
desc 'Push the gem to rubygems.org'
|
7
|
+
task :push do
|
8
|
+
system("gem push sequitur-#{Sequitur::Version}.gem")
|
9
|
+
end
|
10
|
+
|
11
|
+
end # namespace
|
12
|
+
|
13
|
+
# Testing-specific tasks
|
14
|
+
|
15
|
+
# RSpec as testing tool
|
16
|
+
require 'rspec/core/rake_task'
|
17
|
+
desc 'Run RSpec'
|
18
|
+
RSpec::Core::RakeTask.new do |spec|
|
19
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
# Run RSpec tests
|
24
|
+
desc 'Run tests, with RSpec'
|
25
|
+
task test: [:spec]
|
26
|
+
|
27
|
+
|
28
|
+
# Default rake task
|
29
|
+
task default: :test
|
30
|
+
|
31
|
+
# End of file
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
#
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of integer values
|
5
|
+
#
|
6
|
+
input_sequence = [1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5]
|
7
|
+
|
8
|
+
# Generate the grammar from the sequence
|
9
|
+
grammar = Sequitur.build_from(input_sequence)
|
10
|
+
|
11
|
+
|
12
|
+
# Use a formatter to display the grammar rules on the console output
|
13
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
14
|
+
|
15
|
+
# Now render the rules
|
16
|
+
formatter.render(grammar.visitor)
|
17
|
+
|
18
|
+
# Rendered output is:
|
19
|
+
# start : P1 P2 P3 P3 5.
|
20
|
+
# P1 : 1 2.
|
21
|
+
# P2 : P1 3.
|
22
|
+
# P3 : P2 4.
|
23
|
+
|
24
|
+
# Playing a bit with the API
|
25
|
+
# Access last symbol from rhs of start production:
|
26
|
+
last_symbol_p0 = grammar.start.rhs.symbols[-1]
|
27
|
+
puts last_symbol_p0 # => 5
|
28
|
+
|
29
|
+
# Access first symbol from rhs of P1 production:
|
30
|
+
first_symbol_p1 = grammar.productions[1].rhs.symbols[0]
|
31
|
+
|
32
|
+
puts first_symbol_p1 # => 1
|
33
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
|
4
|
+
# Purpose: demo showing that the sequitur gem works on the example from the sequitur.info website
|
5
|
+
input_sequence =
|
6
|
+
input = <<-SNIPPET
|
7
|
+
pease porridge hot,
|
8
|
+
pease porridge cold,
|
9
|
+
pease porridge in the pot,
|
10
|
+
nine days old.
|
11
|
+
|
12
|
+
some like it hot,
|
13
|
+
some like it cold,
|
14
|
+
some like it in the pot,
|
15
|
+
nine days old.
|
16
|
+
SNIPPET
|
17
|
+
|
18
|
+
grammar = Sequitur.build_from(input_sequence)
|
19
|
+
|
20
|
+
# To display the grammar rules on the console output
|
21
|
+
# We use a formatter
|
22
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
23
|
+
|
24
|
+
# Now render the rules.
|
25
|
+
formatter.render(grammar.visitor)
|
26
|
+
|
27
|
+
# Rendered output is:
|
28
|
+
# start : P2 P8 P3 P10 P3 P12 P9 P8 P11 P10 P11 P12.
|
29
|
+
# P1 : e .
|
30
|
+
# P2 : p e a s P4 r r i d g P1.
|
31
|
+
# P3 : P5 P2.
|
32
|
+
# P4 : P1 p o.
|
33
|
+
# P5 : ,
|
34
|
+
# .
|
35
|
+
# P6 : i n.
|
36
|
+
# P7 : o l d.
|
37
|
+
# P8 : h o t.
|
38
|
+
# P9 : s o m P1 l i k P1 i t .
|
39
|
+
# P10 : c P7.
|
40
|
+
# P11 : P5 P9.
|
41
|
+
# P12 : P6 t h P4 t P5 n P6 P1 d a y s P7 .
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of single characters
|
5
|
+
input_sequence = 'ababcabcdabcde' # Let's analyze this string
|
6
|
+
|
7
|
+
# The SEQUITUR algorithm will detect the repeated 'ab' pattern
|
8
|
+
# and will generate a context-free grammar that represents the input string
|
9
|
+
grammar = Sequitur.build_from(input_sequence)
|
10
|
+
|
11
|
+
# To display the grammar rules on the console output
|
12
|
+
# We use a formatter
|
13
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
14
|
+
|
15
|
+
# Now render the rules. Each rule is displayed with the format:
|
16
|
+
# rule_id : a_sequence_grammar_symbols.
|
17
|
+
# Where:
|
18
|
+
# - rule_id is either 'start' or a name like 'Pxxxx' (xxxx is a sequential number)
|
19
|
+
# - a grammar symbol is either a terminal symbol
|
20
|
+
# (i.e. a character from the input) or a rule id
|
21
|
+
formatter.render(grammar.visitor)
|
22
|
+
|
23
|
+
# Rendered output is:
|
24
|
+
# start : P1 P2 P3 P3 e.
|
25
|
+
# P1 : a b.
|
26
|
+
# P2 : P1 c.
|
27
|
+
# P3 : P2 d.
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
#
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of Symbol values
|
5
|
+
#
|
6
|
+
input_sequence = [
|
7
|
+
:aa, :bb, :aa, :bb,
|
8
|
+
:cc, :aa, :bb, :cc,
|
9
|
+
:dd, :aa, :bb, :cc,
|
10
|
+
:dd, :ee
|
11
|
+
]
|
12
|
+
|
13
|
+
# Generate the grammar from the sequence
|
14
|
+
grammar = Sequitur.build_from(input_sequence)
|
15
|
+
|
16
|
+
|
17
|
+
# Use a formatter to display the grammar rules on the console output
|
18
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
19
|
+
|
20
|
+
# Now render the rules
|
21
|
+
formatter.render(grammar.visitor)
|
22
|
+
|
23
|
+
# Rendered output is:
|
24
|
+
# start : P1 P2 P3 P3 ee.
|
25
|
+
# P1 : aa bb.
|
26
|
+
# P2 : P1 cc.
|
27
|
+
# P3 : P2 dd.
|
28
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'sequitur' # Load the Sequitur library
|
2
|
+
|
3
|
+
#
|
4
|
+
# Purpose: show how to apply Sequitur on a stream of text words
|
5
|
+
#
|
6
|
+
|
7
|
+
# Raw input is one String containing repeated sentences...
|
8
|
+
raw_input = <<-SNIPPET
|
9
|
+
Error: unknown character '?' at position 6
|
10
|
+
Error: illegal character '%' at position 20
|
11
|
+
Error: unknown character '/' at position 9
|
12
|
+
SNIPPET
|
13
|
+
|
14
|
+
# Convert into a sequence of words
|
15
|
+
input_sequence = raw_input.scan(/\w+/)
|
16
|
+
# Generate the grammar from the sequence
|
17
|
+
grammar = Sequitur.build_from(input_sequence)
|
18
|
+
|
19
|
+
|
20
|
+
# Use a formatter to display the grammar rules on the console output
|
21
|
+
formatter = Sequitur::Formatter::BaseText.new(STDOUT)
|
22
|
+
|
23
|
+
# Now render the rules
|
24
|
+
formatter.render(grammar.visitor)
|
25
|
+
|
26
|
+
# Rendered output is:
|
27
|
+
# start : P2 6 Error illegal P1 20 P2 9.
|
28
|
+
# P1 : character at position.
|
29
|
+
# P2 : Error unknown P1.
|
30
|
+
|
data/lib/sequitur.rb
CHANGED
data/lib/sequitur/constants.rb
CHANGED
data/lib/sequitur/digram.rb
CHANGED
@@ -1,52 +1,52 @@
|
|
1
|
-
# File: digram.rb
|
2
|
-
|
3
|
-
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
|
-
|
5
|
-
# In linguistics, a digram is a sequence of two letters.
|
6
|
-
# In Sequitur, a digram is a sequence of two consecutive symbols that
|
7
|
-
# appear in a production rule. Each symbol in a digram
|
8
|
-
# can be a terminal or not.
|
9
|
-
class Digram
|
10
|
-
# The sequence of two consecutive grammar symbols.
|
11
|
-
# The two symbols should respond to the :hash message.
|
12
|
-
attr_reader(:symbols)
|
13
|
-
|
14
|
-
# An unique hash key of the digram
|
15
|
-
attr_reader(:key)
|
16
|
-
|
17
|
-
# The production in which the digram occurs
|
18
|
-
attr_reader(:production)
|
19
|
-
|
20
|
-
# Constructor.
|
21
|
-
# A digram represents a sequence of two symbols
|
22
|
-
# (that appears in a rhs of a production).
|
23
|
-
# Terminal symbols must respond to the :hash message.
|
24
|
-
# @param symbol1 [StringOrSymbol] First element of the digram
|
25
|
-
# @param symbol2 [StringOrSymbol] Second element of the digram
|
26
|
-
# @param aProduction [Production] Production in which the RHS
|
27
|
-
# the sequence symbol1 symbol2 appears.
|
28
|
-
def initialize(symbol1, symbol2, aProduction)
|
29
|
-
@symbols = [symbol1, symbol2]
|
30
|
-
@key = symbol1.hash.to_s(16) + ':' + symbol2.hash.to_s(16)
|
31
|
-
@production = aProduction
|
32
|
-
end
|
33
|
-
|
34
|
-
# Equality testing.
|
35
|
-
# true iff keys of both digrams are equal, false otherwise
|
36
|
-
# @param other [Digram] another to compare with
|
37
|
-
# @return [true/false]
|
38
|
-
def ==(other)
|
39
|
-
return key == other.key
|
40
|
-
end
|
41
|
-
|
42
|
-
# Does the digram consists of twice the same symbols?
|
43
|
-
# @return [true/false] true when symbols.first == symbols.last
|
44
|
-
def repeating?()
|
45
|
-
return symbols[0] == symbols[1]
|
46
|
-
end
|
47
|
-
|
48
|
-
end # class
|
49
|
-
|
50
|
-
end # module
|
51
|
-
|
52
|
-
# End of file
|
1
|
+
# File: digram.rb
|
2
|
+
|
3
|
+
module Sequitur # Module for classes implementing the Sequitur algorithm
|
4
|
+
|
5
|
+
# In linguistics, a digram is a sequence of two letters.
|
6
|
+
# In Sequitur, a digram is a sequence of two consecutive symbols that
|
7
|
+
# appear in a production rule. Each symbol in a digram
|
8
|
+
# can be a terminal or not.
|
9
|
+
class Digram
|
10
|
+
# The sequence of two consecutive grammar symbols.
|
11
|
+
# The two symbols should respond to the :hash message.
|
12
|
+
attr_reader(:symbols)
|
13
|
+
|
14
|
+
# A unique hash key of the digram
|
15
|
+
attr_reader(:key)
|
16
|
+
|
17
|
+
# The production in which the digram occurs
|
18
|
+
attr_reader(:production)
|
19
|
+
|
20
|
+
# Constructor.
|
21
|
+
# A digram represents a sequence of two symbols
|
22
|
+
# (that appears in a rhs of a production).
|
23
|
+
# Terminal symbols must respond to the :hash message.
|
24
|
+
# @param symbol1 [StringOrSymbol] First element of the digram
|
25
|
+
# @param symbol2 [StringOrSymbol] Second element of the digram
|
26
|
+
# @param aProduction [Production] Production in whose RHS
|
27
|
+
# the sequence symbol1 symbol2 appears.
|
28
|
+
def initialize(symbol1, symbol2, aProduction)
|
29
|
+
@symbols = [symbol1, symbol2]
|
30
|
+
@key = symbol1.hash.to_s(16) + ':' + symbol2.hash.to_s(16)
|
31
|
+
@production = aProduction
|
32
|
+
end
|
33
|
+
|
34
|
+
# Equality testing.
|
35
|
+
# true iff keys of both digrams are equal, false otherwise
|
36
|
+
# @param other [Digram] another to compare with
|
37
|
+
# @return [true/false]
|
38
|
+
def ==(other)
|
39
|
+
return key == other.key
|
40
|
+
end
|
41
|
+
|
42
|
+
# Does the digram consist of the same symbol twice?
|
43
|
+
# @return [true/false] true when symbols.first == symbols.last
|
44
|
+
def repeating?()
|
45
|
+
return symbols[0] == symbols[1]
|
46
|
+
end
|
47
|
+
|
48
|
+
end # class
|
49
|
+
|
50
|
+
end # module
|
51
|
+
|
52
|
+
# End of file
|