raingrams 0.0.9 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +9 -0
- data/Manifest.txt +10 -10
- data/README.txt +9 -7
- data/Rakefile +3 -6
- data/TODO.txt +6 -0
- data/lib/raingrams/bigram_model.rb +3 -7
- data/lib/raingrams/extensions/object.rb +4 -1
- data/lib/raingrams/extensions/string.rb +3 -0
- data/lib/raingrams/extensions.rb +0 -5
- data/lib/raingrams/hexagram_model.rb +3 -7
- data/lib/raingrams/model.rb +622 -61
- data/lib/raingrams/ngram.rb +50 -9
- data/lib/raingrams/ngram_set.rb +43 -0
- data/lib/raingrams/open_vocabulary/model.rb +12 -0
- data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
- data/lib/raingrams/open_vocabulary.rb +0 -1
- data/lib/raingrams/pentagram_model.rb +3 -7
- data/lib/raingrams/probability_table.rb +153 -0
- data/lib/raingrams/quadgram_model.rb +3 -7
- data/lib/raingrams/raingrams.rb +10 -20
- data/lib/raingrams/tokens/start_sentence.rb +2 -2
- data/lib/raingrams/tokens/stop_sentence.rb +2 -2
- data/lib/raingrams/tokens/token.rb +49 -5
- data/lib/raingrams/tokens/unknown.rb +2 -2
- data/lib/raingrams/tokens.rb +1 -0
- data/lib/raingrams/trigram_model.rb +3 -7
- data/lib/raingrams/version.rb +1 -1
- data/lib/raingrams.rb +1 -1
- data/spec/ngram_set_spec.rb +54 -0
- data/spec/ngram_spec.rb +29 -0
- data/spec/probability_table_spec.rb +94 -0
- data/spec/raingrams_spec.rb +9 -0
- data/spec/spec_helper.rb +5 -0
- data/tasks/spec.rb +7 -0
- metadata +65 -55
- data/lib/raingrams/extensions/class.rb +0 -7
- data/lib/raingrams/extensions/false_class.rb +0 -7
- data/lib/raingrams/extensions/nil_class.rb +0 -7
- data/lib/raingrams/extensions/symbol.rb +0 -7
- data/lib/raingrams/extensions/true_class.rb +0 -7
- data/lib/raingrams/multigram_model.rb +0 -165
- data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
- data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
- data/lib/raingrams/unigram_model.rb +0 -70
- data/test/test_raingrams.rb +0 -0
data/lib/raingrams/ngram.rb
CHANGED
@@ -1,20 +1,53 @@
|
|
1
|
+
require 'raingrams/extensions'
|
2
|
+
|
1
3
|
module Raingrams
|
2
4
|
class Ngram < Array
|
3
5
|
|
4
|
-
|
5
|
-
|
6
|
+
#
|
7
|
+
# Creates a new Ngram object with the specified _objects_.
|
8
|
+
#
|
9
|
+
def initialize(objects)
|
10
|
+
super(objects.map { |obj| obj.to_gram })
|
11
|
+
end
|
12
|
+
|
13
|
+
#
|
14
|
+
# Creates a new Ngram object from the specified _objects_.
|
15
|
+
#
|
16
|
+
def self.[](*objects)
|
17
|
+
self.new(objects)
|
18
|
+
end
|
19
|
+
|
20
|
+
#
|
21
|
+
# Creates a new Ngram object by appending the specified _grams_ to the
|
22
|
+
# ngram.
|
23
|
+
#
|
24
|
+
def +(grams)
|
25
|
+
if grams.kind_of?(Array)
|
26
|
+
return self.class.new(super(grams.map { |gram|
|
27
|
+
gram.to_gram
|
28
|
+
}))
|
29
|
+
else
|
30
|
+
return self.class.new(super([grams.to_gram]))
|
31
|
+
end
|
6
32
|
end
|
7
33
|
|
8
|
-
def
|
9
|
-
|
34
|
+
def <<(gram)
|
35
|
+
super(gram.to_gram)
|
10
36
|
end
|
11
37
|
|
38
|
+
#
|
39
|
+
# Returns the prefix of the ngram.
|
40
|
+
#
|
12
41
|
def prefix
|
13
42
|
self[0...length-1]
|
14
43
|
end
|
15
44
|
|
16
|
-
|
17
|
-
|
45
|
+
#
|
46
|
+
# Returns +true+ if the ngram is prefixed by the specified
|
47
|
+
# _smaller_ngram_.
|
48
|
+
#
|
49
|
+
def prefixed_by?(smaller_ngram)
|
50
|
+
prefix == smaller_ngram
|
18
51
|
end
|
19
52
|
|
20
53
|
def postfix
|
@@ -22,21 +55,25 @@ module Raingrams
|
|
22
55
|
end
|
23
56
|
|
24
57
|
def postfixed_by?(ngram)
|
25
|
-
postfix==ngram
|
58
|
+
postfix == ngram
|
26
59
|
end
|
27
60
|
|
28
61
|
def starts_with?(obj)
|
29
|
-
self
|
62
|
+
self.first == obj.to_gram
|
30
63
|
end
|
31
64
|
|
32
65
|
def ends_with?(obj)
|
33
|
-
self
|
66
|
+
self.last == obj.to_gram
|
34
67
|
end
|
35
68
|
|
36
69
|
def include?(obj)
|
37
70
|
super(obj.to_gram)
|
38
71
|
end
|
39
72
|
|
73
|
+
#
# Returns +true+ if the ngram contains every one of the specified
# _grams_, regardless of the order in which they are given, returns
# +false+ otherwise. Returns +true+ when no _grams_ are given.
#
def includes?(*grams)
  # Each gram is checked through include? instead of comparing the result
  # of Array#& against _grams_: the intersection follows the receiver's
  # ordering and drops duplicates, so that comparison fails whenever the
  # grams are listed out of order or repeated.
  grams.all? { |gram| include?(gram) }
end
|
76
|
+
|
40
77
|
def flatten
|
41
78
|
self.dup
|
42
79
|
end
|
@@ -49,5 +86,9 @@ module Raingrams
|
|
49
86
|
join(', ')
|
50
87
|
end
|
51
88
|
|
89
|
+
def inspect
|
90
|
+
'Ngram[' + self.map { |gram| gram.inspect }.join(', ') + ']'
|
91
|
+
end
|
92
|
+
|
52
93
|
end
|
53
94
|
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require 'raingrams/ngram'
|
2
|
+
|
3
|
+
require 'set'
|
4
|
+
|
5
|
+
module Raingrams
  #
  # A Set of ngrams that can be filtered by the predicates the ngrams
  # respond to (prefixed_by?, postfixed_by?, starts_with?, ends_with?,
  # include? and includes?). Every filter returns a new set of the
  # receiver's class.
  #
  class NgramSet < Set

    #
    # Returns a new set, of the same class as the receiver, containing the
    # ngrams for which the given _block_ returns a true value. Returns an
    # Enumerator when no _block_ is given.
    #
    def select(&block)
      # without a block there is nothing to call; hand back an Enumerator
      # instead of failing on block.call
      return enum_for(:select) unless block

      selected_ngrams = self.class.new

      each do |ngram|
        selected_ngrams << ngram if block.call(ngram)
      end

      return selected_ngrams
    end

    #
    # Selects the ngrams which are prefixed by the specified _prefix_.
    #
    def prefixed_by(prefix)
      select { |ngram| ngram.prefixed_by?(prefix) }
    end

    #
    # Selects the ngrams which are postfixed by the specified _postfix_.
    #
    def postfixed_by(postfix)
      select { |ngram| ngram.postfixed_by?(postfix) }
    end

    #
    # Selects the ngrams which start with the specified _gram_.
    #
    def starts_with(gram)
      select { |ngram| ngram.starts_with?(gram) }
    end

    #
    # Selects the ngrams which end with the specified _gram_.
    #
    def ends_with(gram)
      select { |ngram| ngram.ends_with?(gram) }
    end

    #
    # Selects the ngrams which include the specified _gram_.
    #
    def including(gram)
      select { |ngram| ngram.include?(gram) }
    end

    #
    # Selects the ngrams which include all of the specified _grams_.
    #
    def includes(*grams)
      select { |ngram| ngram.includes?(*grams) }
    end

  end
end
|
@@ -7,14 +7,18 @@ module Raingrams
|
|
7
7
|
# The fixed lexicon of this model
|
8
8
|
attr_reader :lexicon
|
9
9
|
|
10
|
-
def initialize(
|
11
|
-
@lexicon =
|
10
|
+
def initialize(options={},&block)
|
11
|
+
@lexicon = (options[:lexicon] || [])
|
12
12
|
|
13
|
-
|
13
|
+
@lexicon.map! do |word|
|
14
|
+
word.to_gram
|
15
|
+
end
|
16
|
+
|
17
|
+
super(options,&block)
|
14
18
|
end
|
15
19
|
|
16
20
|
def within_lexicon?(gram)
|
17
|
-
@lexicon.include?(gram)
|
21
|
+
@lexicon.include?(gram.to_gram)
|
18
22
|
end
|
19
23
|
|
20
24
|
def train_ngram(ngram)
|
@@ -1,13 +1,9 @@
|
|
1
|
-
require 'raingrams/
|
1
|
+
require 'raingrams/model'
|
2
2
|
|
3
3
|
module Raingrams
|
4
|
-
class PentagramModel <
|
4
|
+
class PentagramModel < Model
|
5
5
|
|
6
|
-
|
7
|
-
opts[:ngram_size] = 5
|
8
|
-
|
9
|
-
super(opts,&block)
|
10
|
-
end
|
6
|
+
ngram_size 5
|
11
7
|
|
12
8
|
end
|
13
9
|
end
|
@@ -0,0 +1,153 @@
|
|
1
|
+
module Raingrams
  #
  # Records how often each gram has been counted and, once built, the
  # probability of each gram relative to the total of all frequencies.
  #
  class ProbabilityTable

    # Indicates whether the table needs to be rebuilt
    attr_reader :dirty

    # Frequencies of grams
    attr_reader :frequencies

    # Probabilities of grams
    attr_reader :probabilities

    #
    # Creates a new empty ProbabilityTable object.
    #
    def initialize
      @dirty = false
      @total = 0
      @frequencies = {}
      @probabilities = {}
    end

    #
    # Returns +true+ if the probability table is dirty and needs to be
    # rebuilt, returns +false+ otherwise.
    #
    def dirty?
      @dirty == true
    end

    #
    # Returns +true+ if the probability table contains the specified _gram_,
    # returns +false+ otherwise.
    #
    def has_gram?(gram)
      @frequencies.has_key?(gram)
    end

    #
    # Returns the grams within the probability table.
    #
    def grams
      @frequencies.keys
    end

    #
    # Iterates over each gram in the probability table, passing each to the
    # given _block_.
    #
    def each_gram(&block)
      @frequencies.each_key(&block)
    end

    #
    # Returns the frequency of the specified _gram_. Returns +0+ by default.
    #
    def frequency_of(gram)
      @frequencies[gram] || 0
    end

    #
    # Returns the probability of the specified _gram_ occurring. Returns
    # <tt>0.0</tt> by default.
    #
    def probability_of(gram)
      @probabilities[gram] || 0.0
    end

    alias [] probability_of

    #
    # Sets the frequency of the specified _gram_ to the specified _value_
    # and marks the probability table as dirty. Returns the _value_.
    #
    def set_count(gram,value)
      @dirty = true
      @frequencies[gram] = value
    end

    #
    # Increments the frequency of the specified _gram_ and marks the
    # probability table as dirty. Returns the new frequency.
    #
    def count(gram)
      @dirty = true

      return @frequencies[gram] = frequency_of(gram) + 1
    end

    #
    # Returns the total of all frequencies. The sum is recalculated only
    # while the probability table is marked as dirty, otherwise the
    # previously calculated total is returned.
    #
    def total
      if @dirty
        # seeded with 0 so that a table without frequencies sums to 0
        # instead of nil
        @total = @frequencies.values.inject(0) do |sum,freq|
          sum + freq
        end
      end

      return @total
    end

    #
    # Builds the probability table using the recorded frequencies, if the
    # table is marked as dirty. Returns the probability table.
    #
    def build
      if @dirty
        current_total = total.to_f

        @frequencies.each do |gram,count|
          # a zero total would otherwise yield NaN probabilities
          @probabilities[gram] = if current_total.zero?
                                   0.0
                                 else
                                   count.to_f / current_total
                                 end
        end

        @dirty = false
      end

      return self
    end

    #
    # Returns +true+ if the total of all frequencies is zero, returns
    # +false+ otherwise.
    #
    def empty?
      # go through #total rather than @total, which is stale until the
      # sum has been recalculated after a count
      total == 0
    end

    #
    # Clears the probability table. Returns the probability table.
    #
    def clear
      @total = 0
      @frequencies.clear
      @probabilities.clear

      # nothing is left to rebuild once both tables are empty
      @dirty = false

      return self
    end

    #
    # Inspects the recorded frequencies while the table is dirty, and the
    # built probabilities otherwise.
    #
    def inspect
      if @dirty
        "#<ProbabilityTable @total=#{@total} @frequencies=#{@frequencies.inspect}>"
      else
        @probabilities.inspect
      end
    end

  end
end
|
@@ -1,13 +1,9 @@
|
|
1
|
-
require 'raingrams/
|
1
|
+
require 'raingrams/model'
|
2
2
|
|
3
3
|
module Raingrams
|
4
|
-
class QuadgramModel <
|
4
|
+
class QuadgramModel < Model
|
5
5
|
|
6
|
-
|
7
|
-
opts[:ngram_size] = 4
|
8
|
-
|
9
|
-
super(opts,&block)
|
10
|
-
end
|
6
|
+
ngram_size 4
|
11
7
|
|
12
8
|
end
|
13
9
|
end
|
data/lib/raingrams/raingrams.rb
CHANGED
@@ -1,31 +1,21 @@
|
|
1
|
-
require 'raingrams/
|
2
|
-
require 'raingrams/
|
3
|
-
require 'raingrams/open_vocabulary/unigram_model'
|
4
|
-
require 'raingrams/open_vocabulary/multigram_model'
|
1
|
+
require 'raingrams/model'
|
2
|
+
require 'raingrams/open_vocabulary/model'
|
5
3
|
|
6
4
|
module Raingrams
|
7
|
-
def Raingrams.closed_vocabulary_model(
|
8
|
-
|
9
|
-
return UnigramModel.new(opts,&block)
|
10
|
-
else
|
11
|
-
return MultigramModel.new(opts,&block)
|
12
|
-
end
|
5
|
+
def Raingrams.closed_vocabulary_model(options={},&block)
|
6
|
+
Model.new(options,&block)
|
13
7
|
end
|
14
8
|
|
15
|
-
def Raingrams.open_vocabulary_model(
|
16
|
-
|
17
|
-
return OpenVocabulary::UnigramModel.new(opts,&block)
|
18
|
-
else
|
19
|
-
return OpenVocabulary::MultigramModel.new(opts,&block)
|
20
|
-
end
|
9
|
+
def Raingrams.open_vocabulary_model(options={},&block)
|
10
|
+
OpenVocabulary::Model.new(options,&block)
|
21
11
|
end
|
22
12
|
|
23
|
-
def Raingrams.model(
|
24
|
-
case
|
13
|
+
def Raingrams.model(options={},&block)
|
14
|
+
case options[:vocabulary]
|
25
15
|
when :open, 'open'
|
26
|
-
return Raingrams.open_vocabulary_model(
|
16
|
+
return Raingrams.open_vocabulary_model(options,&block)
|
27
17
|
else
|
28
|
-
return Raingrams.closed_vocabulary_model(
|
18
|
+
return Raingrams.closed_vocabulary_model(options,&block)
|
29
19
|
end
|
30
20
|
end
|
31
21
|
end
|
@@ -2,16 +2,60 @@ module Raingrams
|
|
2
2
|
module Tokens
|
3
3
|
class Token
|
4
4
|
|
5
|
-
|
5
|
+
# Gram form of the token
|
6
|
+
attr_reader :gram
|
7
|
+
|
8
|
+
#
|
9
|
+
# Creates a new Token object with the specified _gram_.
|
10
|
+
#
|
11
|
+
def initialize(gram)
|
12
|
+
@gram = gram
|
13
|
+
end
|
14
|
+
|
15
|
+
def to_gram
|
16
|
+
self
|
17
|
+
end
|
18
|
+
|
19
|
+
#
|
20
|
+
# Creates an Array of the specified _length_ containing the token.
|
21
|
+
#
|
22
|
+
def *(length)
|
6
23
|
[self] * length
|
7
24
|
end
|
8
25
|
|
9
|
-
|
10
|
-
|
26
|
+
#
|
27
|
+
# Returns +true+ if the token has the same gram as the _other_ token,
|
28
|
+
# returns +false+ otherwise.
|
29
|
+
#
|
30
|
+
def eql?(other)
  # only another Token can be equal; anything else yields false
  other.kind_of?(Token) && @gram == other.gram
end

#
# Returns the hash code of the token's gram, so that tokens which are
# eql? to one another also produce the same hash code and can be looked
# up as Hash keys or Set members.
#
def hash
  @gram.hash
end
|
37
|
+
|
38
|
+
alias == eql?
|
39
|
+
|
40
|
+
#
|
41
|
+
# Returns the String form of the token.
|
42
|
+
#
|
43
|
+
def to_s
|
44
|
+
@gram.to_s
|
45
|
+
end
|
46
|
+
|
47
|
+
#
|
48
|
+
# Returns the Symbol form of the token.
|
49
|
+
#
|
50
|
+
def to_sym
|
51
|
+
@gram.to_sym
|
11
52
|
end
|
12
53
|
|
13
|
-
|
14
|
-
|
54
|
+
#
|
55
|
+
# Returns the String form of the token.
|
56
|
+
#
|
57
|
+
def inspect
|
58
|
+
@gram.to_s
|
15
59
|
end
|
16
60
|
|
17
61
|
end
|
data/lib/raingrams/tokens.rb
CHANGED
@@ -1,13 +1,9 @@
|
|
1
|
-
require 'raingrams/
|
1
|
+
require 'raingrams/model'
|
2
2
|
|
3
3
|
module Raingrams
|
4
|
-
class TrigramModel <
|
4
|
+
class TrigramModel < Model
|
5
5
|
|
6
|
-
|
7
|
-
opts[:ngram_size] = 3
|
8
|
-
|
9
|
-
super(opts,&block)
|
10
|
-
end
|
6
|
+
ngram_size 3
|
11
7
|
|
12
8
|
end
|
13
9
|
end
|
data/lib/raingrams/version.rb
CHANGED
data/lib/raingrams.rb
CHANGED
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'raingrams/ngram_set'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe NgramSet do
|
6
|
+
before(:all) do
|
7
|
+
@ngrams = NgramSet[
|
8
|
+
Ngram[:the, :dog],
|
9
|
+
Ngram[:dog, :jumped],
|
10
|
+
Ngram[:jumped, :through],
|
11
|
+
Ngram[:through, :the],
|
12
|
+
Ngram[:the, :hoop]
|
13
|
+
]
|
14
|
+
end
|
15
|
+
|
16
|
+
it "should select ngrams from the set" do
|
17
|
+
@ngrams.select { |ngram|
|
18
|
+
ngram.starts_with?(:the)
|
19
|
+
}.should == NgramSet[Ngram[:the, :dog], Ngram[:the, :hoop]]
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should select ngrams with a specified prefixed" do
|
23
|
+
@ngrams.prefixed_by(Ngram[:dog]).should == NgramSet[
|
24
|
+
Ngram[:dog, :jumped]
|
25
|
+
]
|
26
|
+
end
|
27
|
+
|
28
|
+
it "should select ngrams with a specified postfix" do
|
29
|
+
@ngrams.postfixed_by(Ngram[:through]).should == NgramSet[
|
30
|
+
Ngram[:jumped, :through]
|
31
|
+
]
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should select ngrams starting with a specified gram" do
|
35
|
+
@ngrams.starts_with(:jumped).should == NgramSet[Ngram[:jumped, :through]]
|
36
|
+
end
|
37
|
+
|
38
|
+
it "should select ngrams ending with a specified gram" do
|
39
|
+
@ngrams.ends_with(:dog).should == NgramSet[Ngram[:the, :dog]]
|
40
|
+
end
|
41
|
+
|
42
|
+
it "should select ngrams including a specified gram" do
|
43
|
+
@ngrams.including(:dog).should == NgramSet[
|
44
|
+
Ngram[:the, :dog],
|
45
|
+
Ngram[:dog, :jumped]
|
46
|
+
]
|
47
|
+
end
|
48
|
+
|
49
|
+
it "should select ngrams which includes specified grams" do
|
50
|
+
@ngrams.includes(:the, :dog).should == NgramSet[
|
51
|
+
Ngram[:the, :dog],
|
52
|
+
]
|
53
|
+
end
|
54
|
+
end
|
data/spec/ngram_spec.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'raingrams/ngram'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe Ngram do
|
6
|
+
before(:all) do
|
7
|
+
@ngram = Ngram[:one, :two, :three]
|
8
|
+
end
|
9
|
+
|
10
|
+
it "should have a prefix" do
|
11
|
+
@ngram.prefix.should == Ngram[:one, :two]
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should have a postfix" do
|
15
|
+
@ngram.postfix.should == Ngram[:two, :three]
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should begin with a gram" do
|
19
|
+
@ngram.starts_with?(:one).should == true
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should end with a gram" do
|
23
|
+
@ngram.ends_with?(:three).should == true
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should include certain grams" do
|
27
|
+
@ngram.includes?(:one, :three).should == true
|
28
|
+
end
|
29
|
+
end
|