raingrams 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +9 -0
- data/Manifest.txt +10 -10
- data/README.txt +9 -7
- data/Rakefile +3 -6
- data/TODO.txt +6 -0
- data/lib/raingrams/bigram_model.rb +3 -7
- data/lib/raingrams/extensions/object.rb +4 -1
- data/lib/raingrams/extensions/string.rb +3 -0
- data/lib/raingrams/extensions.rb +0 -5
- data/lib/raingrams/hexagram_model.rb +3 -7
- data/lib/raingrams/model.rb +622 -61
- data/lib/raingrams/ngram.rb +50 -9
- data/lib/raingrams/ngram_set.rb +43 -0
- data/lib/raingrams/open_vocabulary/model.rb +12 -0
- data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
- data/lib/raingrams/open_vocabulary.rb +0 -1
- data/lib/raingrams/pentagram_model.rb +3 -7
- data/lib/raingrams/probability_table.rb +153 -0
- data/lib/raingrams/quadgram_model.rb +3 -7
- data/lib/raingrams/raingrams.rb +10 -20
- data/lib/raingrams/tokens/start_sentence.rb +2 -2
- data/lib/raingrams/tokens/stop_sentence.rb +2 -2
- data/lib/raingrams/tokens/token.rb +49 -5
- data/lib/raingrams/tokens/unknown.rb +2 -2
- data/lib/raingrams/tokens.rb +1 -0
- data/lib/raingrams/trigram_model.rb +3 -7
- data/lib/raingrams/version.rb +1 -1
- data/lib/raingrams.rb +1 -1
- data/spec/ngram_set_spec.rb +54 -0
- data/spec/ngram_spec.rb +29 -0
- data/spec/probability_table_spec.rb +94 -0
- data/spec/raingrams_spec.rb +9 -0
- data/spec/spec_helper.rb +5 -0
- data/tasks/spec.rb +7 -0
- metadata +65 -55
- data/lib/raingrams/extensions/class.rb +0 -7
- data/lib/raingrams/extensions/false_class.rb +0 -7
- data/lib/raingrams/extensions/nil_class.rb +0 -7
- data/lib/raingrams/extensions/symbol.rb +0 -7
- data/lib/raingrams/extensions/true_class.rb +0 -7
- data/lib/raingrams/multigram_model.rb +0 -165
- data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
- data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
- data/lib/raingrams/unigram_model.rb +0 -70
- data/test/test_raingrams.rb +0 -0
data/lib/raingrams/ngram.rb
CHANGED
@@ -1,20 +1,53 @@
|
|
1
|
+
require 'raingrams/extensions'
|
2
|
+
|
1
3
|
module Raingrams
|
2
4
|
class Ngram < Array
|
3
5
|
|
4
|
-
|
5
|
-
|
6
|
+
#
|
7
|
+
# Creates a new Ngram object with the specified _objects_.
|
8
|
+
#
|
9
|
+
def initialize(objects)
|
10
|
+
super(objects.map { |obj| obj.to_gram })
|
11
|
+
end
|
12
|
+
|
13
|
+
#
|
14
|
+
# Creates a new Ngram object from the specified _objects_.
|
15
|
+
#
|
16
|
+
def self.[](*objects)
|
17
|
+
self.new(objects)
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# Creates a new Ngram object by appending the specified _grams_ to the
|
22
|
+
# ngram.
|
23
|
+
#
|
24
|
+
def +(grams)
|
25
|
+
if grams.kind_of?(Array)
|
26
|
+
return self.class.new(super(grams.map { |gram|
|
27
|
+
gram.to_gram
|
28
|
+
}))
|
29
|
+
else
|
30
|
+
return self.class.new(super([grams.to_gram]))
|
31
|
+
end
|
6
32
|
end
|
7
33
|
|
8
|
-
def
|
9
|
-
|
34
|
+
def <<(gram)
|
35
|
+
super(gram.to_gram)
|
10
36
|
end
|
11
37
|
|
38
|
+
#
|
39
|
+
# Returns the prefix of the ngram.
|
40
|
+
#
|
12
41
|
def prefix
|
13
42
|
self[0...length-1]
|
14
43
|
end
|
15
44
|
|
16
|
-
|
17
|
-
|
45
|
+
#
|
46
|
+
# Returns +true+ if the ngram is prefixed by the specified
|
47
|
+
# _smaller_ngram_.
|
48
|
+
#
|
49
|
+
def prefixed_by?(smaller_ngram)
|
50
|
+
prefix == smaller_ngram
|
18
51
|
end
|
19
52
|
|
20
53
|
def postfix
|
@@ -22,21 +55,25 @@ module Raingrams
|
|
22
55
|
end
|
23
56
|
|
24
57
|
def postfixed_by?(ngram)
|
25
|
-
postfix==ngram
|
58
|
+
postfix == ngram
|
26
59
|
end
|
27
60
|
|
28
61
|
def starts_with?(obj)
|
29
|
-
self
|
62
|
+
self.first == obj.to_gram
|
30
63
|
end
|
31
64
|
|
32
65
|
def ends_with?(obj)
|
33
|
-
self
|
66
|
+
self.last == obj.to_gram
|
34
67
|
end
|
35
68
|
|
36
69
|
def include?(obj)
|
37
70
|
super(obj.to_gram)
|
38
71
|
end
|
39
72
|
|
73
|
+
#
# Returns +true+ if the ngram contains every one of the specified
# _grams_, returns +false+ otherwise. The order of the arguments and
# repeated arguments do not matter; with no arguments the result is
# +true+.
#
# Each gram is checked through +include?+, so whatever normalization
# +include?+ applies to its argument is applied here as well.
#
def includes?(*grams)
  grams.all? { |gram| include?(gram) }
end
|
76
|
+
|
40
77
|
def flatten
|
41
78
|
self.dup
|
42
79
|
end
|
@@ -49,5 +86,9 @@ module Raingrams
|
|
49
86
|
join(', ')
|
50
87
|
end
|
51
88
|
|
89
|
+
def inspect
|
90
|
+
'Ngram[' + self.map { |gram| gram.inspect }.join(', ') + ']'
|
91
|
+
end
|
92
|
+
|
52
93
|
end
|
53
94
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'raingrams/ngram'
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module Raingrams
|
6
|
+
class NgramSet < Set
|
7
|
+
|
8
|
+
def select(&block)
|
9
|
+
selected_ngrams = self.class.new
|
10
|
+
|
11
|
+
each do |ngram|
|
12
|
+
selected_ngrams << ngram if block.call(ngram)
|
13
|
+
end
|
14
|
+
|
15
|
+
return selected_ngrams
|
16
|
+
end
|
17
|
+
|
18
|
+
def prefixed_by(prefix)
|
19
|
+
select { |ngram| ngram.prefixed_by?(prefix) }
|
20
|
+
end
|
21
|
+
|
22
|
+
def postfixed_by(postfix)
|
23
|
+
select { |ngram| ngram.postfixed_by?(postfix) }
|
24
|
+
end
|
25
|
+
|
26
|
+
def starts_with(gram)
|
27
|
+
select { |ngram| ngram.starts_with?(gram) }
|
28
|
+
end
|
29
|
+
|
30
|
+
def ends_with(gram)
|
31
|
+
select { |ngram| ngram.ends_with?(gram) }
|
32
|
+
end
|
33
|
+
|
34
|
+
def including(gram)
|
35
|
+
select { |ngram| ngram.include?(gram) }
|
36
|
+
end
|
37
|
+
|
38
|
+
def includes(*grams)
|
39
|
+
select { |ngram| ngram.includes?(*grams) }
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
@@ -7,14 +7,18 @@ module Raingrams
|
|
7
7
|
# The fixed lexicon of this model
|
8
8
|
attr_reader :lexicon
|
9
9
|
|
10
|
-
def initialize(
|
11
|
-
@lexicon =
|
10
|
+
def initialize(options={},&block)
|
11
|
+
@lexicon = (options[:lexicon] || [])
|
12
12
|
|
13
|
-
|
13
|
+
@lexicon.map! do |word|
|
14
|
+
word.to_gram
|
15
|
+
end
|
16
|
+
|
17
|
+
super(options,&block)
|
14
18
|
end
|
15
19
|
|
16
20
|
def within_lexicon?(gram)
|
17
|
-
@lexicon.include?(gram)
|
21
|
+
@lexicon.include?(gram.to_gram)
|
18
22
|
end
|
19
23
|
|
20
24
|
def train_ngram(ngram)
|
@@ -1,13 +1,9 @@
|
|
1
|
-
require 'raingrams/
|
1
|
+
require 'raingrams/model'
|
2
2
|
|
3
3
|
module Raingrams
|
4
|
-
class PentagramModel <
|
4
|
+
class PentagramModel < Model
|
5
5
|
|
6
|
-
|
7
|
-
opts[:ngram_size] = 5
|
8
|
-
|
9
|
-
super(opts,&block)
|
10
|
-
end
|
6
|
+
ngram_size 5
|
11
7
|
|
12
8
|
end
|
13
9
|
end
|
@@ -0,0 +1,153 @@
|
|
1
|
+
module Raingrams
  #
  # Records how often each gram was seen and derives a probability for
  # each gram from those frequencies. Probabilities are only recomputed
  # by #build, and only when the table has been marked dirty by #count
  # or #set_count.
  #
  class ProbabilityTable

    # Indicates whether the table needs to be rebuilt
    attr_reader :dirty

    # Frequencies of grams
    attr_reader :frequencies

    # Probabilities of grams
    attr_reader :probabilities

    #
    # Creates a new empty ProbabilityTable object.
    #
    def initialize
      @dirty = false
      @total = 0
      @frequencies = {}
      @probabilities = {}
    end

    #
    # Returns +true+ if the probability table is dirty and needs to be
    # rebuilt, returns +false+ otherwise.
    #
    def dirty?
      @dirty == true
    end

    #
    # Returns +true+ if the probability table contains the specified _gram_,
    # returns +false+ otherwise.
    #
    def has_gram?(gram)
      @frequencies.has_key?(gram)
    end

    #
    # Returns the grams within the probability table.
    #
    def grams
      @frequencies.keys
    end

    #
    # Iterates over each gram in the probability table, passing each to the
    # given _block_.
    #
    def each_gram(&block)
      @frequencies.each_key(&block)
    end

    #
    # Returns the frequency of the specified _gram_. Returns +0+ by default.
    #
    def frequency_of(gram)
      @frequencies[gram] || 0
    end

    #
    # Returns the probability of the specified _gram_ occurring. Returns
    # <tt>0.0</tt> by default. The value reflects the last call to #build.
    #
    def probability_of(gram)
      @probabilities[gram] || 0.0
    end

    alias [] probability_of

    #
    # Sets the frequency of the specified _gram_ to the specified _value_
    # and marks the probability table as dirty. Returns the _value_.
    #
    def set_count(gram,value)
      @dirty = true
      @frequencies[gram] = value
    end

    #
    # Increments the frequency of the specified _gram_ and marks the
    # probability table as dirty. Returns the new frequency.
    #
    def count(gram)
      @dirty = true
      @frequencies[gram] = frequency_of(gram) + 1
    end

    #
    # Returns the sum of all recorded frequencies. The sum is recomputed
    # while the table is dirty and the cached value is returned otherwise.
    # Returns +0+ for a table without any frequencies.
    #
    def total
      if @dirty
        # Seed the fold with 0 so that an empty table sums to 0 and not nil.
        @total = @frequencies.values.inject(0) do |sum,freq|
          sum + freq
        end
      end

      return @total
    end

    #
    # Builds the probability table using the recorded frequencies, if the
    # table is marked as dirty. Returns the probability table itself.
    #
    def build
      if @dirty
        current_total = total.to_f

        @frequencies.each do |gram,frequency|
          # A zero total would turn every probability into NaN, so such
          # grams are given a probability of 0.0 instead.
          @probabilities[gram] = if current_total.zero?
                                   0.0
                                 else
                                   frequency.to_f / current_total
                                 end
        end

        @dirty = false
      end

      return self
    end

    #
    # Returns +true+ if the total of all frequencies is zero, returns
    # +false+ otherwise. Uses #total so that frequencies counted since the
    # last rebuild are taken into account.
    #
    def empty?
      total == 0
    end

    #
    # Clears the probability table. Both the frequencies and the
    # probabilities are emptied, so the table is no longer dirty afterwards.
    #
    def clear
      @total = 0
      @frequencies.clear
      @probabilities.clear
      @dirty = false

      return self
    end

    #
    # Returns the frequencies (while the table is dirty) or the built
    # probabilities in a human readable form.
    #
    def inspect
      if @dirty
        "#<ProbabilityTable @total=#{@total} @frequencies=#{@frequencies.inspect}>"
      else
        @probabilities.inspect
      end
    end

  end
end
|
@@ -1,13 +1,9 @@
|
|
1
|
-
require 'raingrams/
|
1
|
+
require 'raingrams/model'
|
2
2
|
|
3
3
|
module Raingrams
|
4
|
-
class QuadgramModel <
|
4
|
+
class QuadgramModel < Model
|
5
5
|
|
6
|
-
|
7
|
-
opts[:ngram_size] = 4
|
8
|
-
|
9
|
-
super(opts,&block)
|
10
|
-
end
|
6
|
+
ngram_size 4
|
11
7
|
|
12
8
|
end
|
13
9
|
end
|
data/lib/raingrams/raingrams.rb
CHANGED
@@ -1,31 +1,21 @@
|
|
1
|
-
require 'raingrams/
|
2
|
-
require 'raingrams/
|
3
|
-
require 'raingrams/open_vocabulary/unigram_model'
|
4
|
-
require 'raingrams/open_vocabulary/multigram_model'
|
1
|
+
require 'raingrams/model'
|
2
|
+
require 'raingrams/open_vocabulary/model'
|
5
3
|
|
6
4
|
module Raingrams
|
7
|
-
def Raingrams.closed_vocabulary_model(
|
8
|
-
|
9
|
-
return UnigramModel.new(opts,&block)
|
10
|
-
else
|
11
|
-
return MultigramModel.new(opts,&block)
|
12
|
-
end
|
5
|
+
def Raingrams.closed_vocabulary_model(options={},&block)
|
6
|
+
Model.new(options,&block)
|
13
7
|
end
|
14
8
|
|
15
|
-
def Raingrams.open_vocabulary_model(
|
16
|
-
|
17
|
-
return OpenVocabulary::UnigramModel.new(opts,&block)
|
18
|
-
else
|
19
|
-
return OpenVocabulary::MultigramModel.new(opts,&block)
|
20
|
-
end
|
9
|
+
def Raingrams.open_vocabulary_model(options={},&block)
|
10
|
+
OpenVocabulary::Model.new(options,&block)
|
21
11
|
end
|
22
12
|
|
23
|
-
def Raingrams.model(
|
24
|
-
case
|
13
|
+
def Raingrams.model(options={},&block)
|
14
|
+
case options[:vocabulary]
|
25
15
|
when :open, 'open'
|
26
|
-
return Raingrams.open_vocabulary_model(
|
16
|
+
return Raingrams.open_vocabulary_model(options,&block)
|
27
17
|
else
|
28
|
-
return Raingrams.closed_vocabulary_model(
|
18
|
+
return Raingrams.closed_vocabulary_model(options,&block)
|
29
19
|
end
|
30
20
|
end
|
31
21
|
end
|
@@ -2,16 +2,60 @@ module Raingrams
|
|
2
2
|
module Tokens
|
3
3
|
#
# A gram wrapped in an object. Two tokens are considered equal when
# their grams are equal.
#
class Token

  # Gram form of the token
  attr_reader :gram

  #
  # Creates a new Token object with the specified _gram_.
  #
  def initialize(gram)
    @gram = gram
  end

  #
  # Returns the token itself, as a token already is in gram form.
  #
  def to_gram
    self
  end

  #
  # Creates an Array of the specified _length_ containing the token.
  #
  def *(length)
    [self] * length
  end

  #
  # Returns +true+ if the token has the same gram as the _other_ token,
  # returns +false+ otherwise. Objects that are not tokens are never
  # equal to a token.
  #
  def eql?(other)
    other.kind_of?(Token) && (@gram == other.gram)
  end

  alias == eql?

  #
  # Returns the hash code of the token, derived from its gram. Tokens
  # that are +eql?+ must share a hash code, otherwise equal tokens are
  # treated as distinct Hash keys and Set members.
  #
  def hash
    @gram.hash
  end

  #
  # Returns the String form of the token.
  #
  def to_s
    @gram.to_s
  end

  #
  # Returns the Symbol form of the token.
  #
  def to_sym
    @gram.to_sym
  end

  #
  # Returns the inspected form of the token, which is the String form
  # of its gram.
  #
  def inspect
    @gram.to_s
  end

end
|
data/lib/raingrams/tokens.rb
CHANGED
@@ -1,13 +1,9 @@
|
|
1
|
-
require 'raingrams/
|
1
|
+
require 'raingrams/model'
|
2
2
|
|
3
3
|
module Raingrams
|
4
|
-
class TrigramModel <
|
4
|
+
class TrigramModel < Model
|
5
5
|
|
6
|
-
|
7
|
-
opts[:ngram_size] = 3
|
8
|
-
|
9
|
-
super(opts,&block)
|
10
|
-
end
|
6
|
+
ngram_size 3
|
11
7
|
|
12
8
|
end
|
13
9
|
end
|
data/lib/raingrams/version.rb
CHANGED
data/lib/raingrams.rb
CHANGED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'raingrams/ngram_set'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe NgramSet do
|
6
|
+
before(:all) do
|
7
|
+
@ngrams = NgramSet[
|
8
|
+
Ngram[:the, :dog],
|
9
|
+
Ngram[:dog, :jumped],
|
10
|
+
Ngram[:jumped, :through],
|
11
|
+
Ngram[:through, :the],
|
12
|
+
Ngram[:the, :hoop]
|
13
|
+
]
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should select ngrams from the set" do
|
17
|
+
@ngrams.select { |ngram|
|
18
|
+
ngram.starts_with?(:the)
|
19
|
+
}.should == NgramSet[Ngram[:the, :dog], Ngram[:the, :hoop]]
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should select ngrams with a specified prefixed" do
|
23
|
+
@ngrams.prefixed_by(Ngram[:dog]).should == NgramSet[
|
24
|
+
Ngram[:dog, :jumped]
|
25
|
+
]
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should select ngrams with a specified postfix" do
|
29
|
+
@ngrams.postfixed_by(Ngram[:through]).should == NgramSet[
|
30
|
+
Ngram[:jumped, :through]
|
31
|
+
]
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should select ngrams starting with a specified gram" do
|
35
|
+
@ngrams.starts_with(:jumped).should == NgramSet[Ngram[:jumped, :through]]
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should select ngrams ending with a specified gram" do
|
39
|
+
@ngrams.ends_with(:dog).should == NgramSet[Ngram[:the, :dog]]
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should select ngrams including a specified gram" do
|
43
|
+
@ngrams.including(:dog).should == NgramSet[
|
44
|
+
Ngram[:the, :dog],
|
45
|
+
Ngram[:dog, :jumped]
|
46
|
+
]
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should select ngrams which includes specified grams" do
|
50
|
+
@ngrams.includes(:the, :dog).should == NgramSet[
|
51
|
+
Ngram[:the, :dog],
|
52
|
+
]
|
53
|
+
end
|
54
|
+
end
|
data/spec/ngram_spec.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'raingrams/ngram'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe Ngram do
|
6
|
+
before(:all) do
|
7
|
+
@ngram = Ngram[:one, :two, :three]
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should have a prefix" do
|
11
|
+
@ngram.prefix.should == Ngram[:one, :two]
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should have a postfix" do
|
15
|
+
@ngram.postfix.should == Ngram[:two, :three]
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should begin with a gram" do
|
19
|
+
@ngram.starts_with?(:one).should == true
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should end with a gram" do
|
23
|
+
@ngram.ends_with?(:three).should == true
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should include certain grams" do
|
27
|
+
@ngram.includes?(:one, :three).should == true
|
28
|
+
end
|
29
|
+
end
|