raingrams 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/History.txt +9 -0
  2. data/Manifest.txt +10 -10
  3. data/README.txt +9 -7
  4. data/Rakefile +3 -6
  5. data/TODO.txt +6 -0
  6. data/lib/raingrams/bigram_model.rb +3 -7
  7. data/lib/raingrams/extensions/object.rb +4 -1
  8. data/lib/raingrams/extensions/string.rb +3 -0
  9. data/lib/raingrams/extensions.rb +0 -5
  10. data/lib/raingrams/hexagram_model.rb +3 -7
  11. data/lib/raingrams/model.rb +622 -61
  12. data/lib/raingrams/ngram.rb +50 -9
  13. data/lib/raingrams/ngram_set.rb +43 -0
  14. data/lib/raingrams/open_vocabulary/model.rb +12 -0
  15. data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
  16. data/lib/raingrams/open_vocabulary.rb +0 -1
  17. data/lib/raingrams/pentagram_model.rb +3 -7
  18. data/lib/raingrams/probability_table.rb +153 -0
  19. data/lib/raingrams/quadgram_model.rb +3 -7
  20. data/lib/raingrams/raingrams.rb +10 -20
  21. data/lib/raingrams/tokens/start_sentence.rb +2 -2
  22. data/lib/raingrams/tokens/stop_sentence.rb +2 -2
  23. data/lib/raingrams/tokens/token.rb +49 -5
  24. data/lib/raingrams/tokens/unknown.rb +2 -2
  25. data/lib/raingrams/tokens.rb +1 -0
  26. data/lib/raingrams/trigram_model.rb +3 -7
  27. data/lib/raingrams/version.rb +1 -1
  28. data/lib/raingrams.rb +1 -1
  29. data/spec/ngram_set_spec.rb +54 -0
  30. data/spec/ngram_spec.rb +29 -0
  31. data/spec/probability_table_spec.rb +94 -0
  32. data/spec/raingrams_spec.rb +9 -0
  33. data/spec/spec_helper.rb +5 -0
  34. data/tasks/spec.rb +7 -0
  35. metadata +65 -55
  36. data/lib/raingrams/extensions/class.rb +0 -7
  37. data/lib/raingrams/extensions/false_class.rb +0 -7
  38. data/lib/raingrams/extensions/nil_class.rb +0 -7
  39. data/lib/raingrams/extensions/symbol.rb +0 -7
  40. data/lib/raingrams/extensions/true_class.rb +0 -7
  41. data/lib/raingrams/multigram_model.rb +0 -165
  42. data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
  43. data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
  44. data/lib/raingrams/unigram_model.rb +0 -70
  45. data/test/test_raingrams.rb +0 -0
@@ -1,20 +1,53 @@
1
+ require 'raingrams/extensions'
2
+
1
3
  module Raingrams
2
4
  class Ngram < Array
3
5
 
4
- def initialize(objs)
5
- super(objs.map { |obj| obj.to_gram })
6
+ #
7
+ # Creates a new Ngram object with the specified _objects_.
8
+ #
9
+ def initialize(objects)
10
+ super(objects.map { |obj| obj.to_gram })
11
+ end
12
+
13
+ #
14
+ # Creates a new Ngram object from the specified _objects_.
15
+ #
16
+ def self.[](*objects)
17
+ self.new(objects)
18
+ end
19
+
20
+ #
21
+ # Creates a new Ngram object by appending the specified _grams_ to the
22
+ # ngram.
23
+ #
24
+ def +(grams)
25
+ if grams.kind_of?(Array)
26
+ return self.class.new(super(grams.map { |gram|
27
+ gram.to_gram
28
+ }))
29
+ else
30
+ return self.class.new(super([grams.to_gram]))
31
+ end
6
32
  end
7
33
 
8
- def self.[](*objs)
9
- self.new(objs)
34
+ def <<(gram)
35
+ super(gram.to_gram)
10
36
  end
11
37
 
38
+ #
39
+ # Returns the prefix of the ngram.
40
+ #
12
41
  def prefix
13
42
  self[0...length-1]
14
43
  end
15
44
 
16
- def prefixed_by?(ngram)
17
- prefix==ngram
45
+ #
46
+ # Returns +true+ if the ngram is prefixed by the specified
47
+ # _smaller_ngram_.
48
+ #
49
+ def prefixed_by?(smaller_ngram)
50
+ prefix == smaller_ngram
18
51
  end
19
52
 
20
53
  def postfix
@@ -22,21 +55,25 @@ module Raingrams
22
55
  end
23
56
 
24
57
  def postfixed_by?(ngram)
25
- postfix==ngram
58
+ postfix == ngram
26
59
  end
27
60
 
28
61
  def starts_with?(obj)
29
- self[0]==obj.to_gram
62
+ self.first == obj.to_gram
30
63
  end
31
64
 
32
65
  def ends_with?(obj)
33
- self[-1]==obj.to_gram
66
+ self.last == obj.to_gram
34
67
  end
35
68
 
36
69
  def include?(obj)
37
70
  super(obj.to_gram)
38
71
  end
39
72
 
73
+ def includes?(*grams)
74
+ (self & grams) == grams
75
+ end
76
+
40
77
  def flatten
41
78
  self.dup
42
79
  end
@@ -49,5 +86,9 @@ module Raingrams
49
86
  join(', ')
50
87
  end
51
88
 
89
+ def inspect
90
+ 'Ngram[' + self.map { |gram| gram.inspect }.join(', ') + ']'
91
+ end
92
+
52
93
  end
53
94
  end
@@ -0,0 +1,43 @@
1
+ require 'raingrams/ngram'
2
+
3
+ require 'set'
4
+
5
+ module Raingrams
6
+ class NgramSet < Set
7
+
8
+ def select(&block)
9
+ selected_ngrams = self.class.new
10
+
11
+ each do |ngram|
12
+ selected_ngrams << ngram if block.call(ngram)
13
+ end
14
+
15
+ return selected_ngrams
16
+ end
17
+
18
+ def prefixed_by(prefix)
19
+ select { |ngram| ngram.prefixed_by?(prefix) }
20
+ end
21
+
22
+ def postfixed_by(postfix)
23
+ select { |ngram| ngram.postfixed_by?(postfix) }
24
+ end
25
+
26
+ def starts_with(gram)
27
+ select { |ngram| ngram.starts_with?(gram) }
28
+ end
29
+
30
+ def ends_with(gram)
31
+ select { |ngram| ngram.ends_with?(gram) }
32
+ end
33
+
34
+ def including(gram)
35
+ select { |ngram| ngram.include?(gram) }
36
+ end
37
+
38
+ def includes(*grams)
39
+ select { |ngram| ngram.includes?(*grams) }
40
+ end
41
+
42
+ end
43
+ end
@@ -0,0 +1,12 @@
1
+ require 'raingrams/open_vocabulary/open_model'
2
+ require 'raingrams/model'
3
+
4
+ module Raingrams
5
+ module OpenVocabulary
6
+ class Model < Raingrams::Model
7
+
8
+ include OpenModel
9
+
10
+ end
11
+ end
12
+ end
@@ -7,14 +7,18 @@ module Raingrams
7
7
  # The fixed lexicon of this model
8
8
  attr_reader :lexicon
9
9
 
10
- def initialize(opts={},&block)
11
- @lexicon = opts[:lexicon] || []
10
+ def initialize(options={},&block)
11
+ @lexicon = (options[:lexicon] || [])
12
12
 
13
- super(opts,&block)
13
+ @lexicon.map! do |word|
14
+ word.to_gram
15
+ end
16
+
17
+ super(options,&block)
14
18
  end
15
19
 
16
20
  def within_lexicon?(gram)
17
- @lexicon.include?(gram)
21
+ @lexicon.include?(gram.to_gram)
18
22
  end
19
23
 
20
24
  def train_ngram(ngram)
@@ -1,4 +1,3 @@
1
- require 'raingrams/openvocabulary/unigram_model'
2
1
  require 'raingrams/openvocabulary/multigram_model'
3
2
  require 'raingrams/openvocabulary/bigram_model'
4
3
  require 'raingrams/openvocabulary/trigram_model'
@@ -1,13 +1,9 @@
1
- require 'raingrams/multigram_model'
1
+ require 'raingrams/model'
2
2
 
3
3
  module Raingrams
4
- class PentagramModel < MultigramModel
4
+ class PentagramModel < Model
5
5
 
6
- def initialize(opts={},&block)
7
- opts[:ngram_size] = 5
8
-
9
- super(opts,&block)
10
- end
6
+ ngram_size 5
11
7
 
12
8
  end
13
9
  end
@@ -0,0 +1,153 @@
1
+ module Raingrams
2
+ class ProbabilityTable
3
+
4
+ # Indicates whether the table needs to be rebuilt
5
+ attr_reader :dirty
6
+
7
+ # Frequencies of grams
8
+ attr_reader :frequencies
9
+
10
+ # Probabilities of grams
11
+ attr_reader :probabilities
12
+
13
+ #
14
+ # Creates a new empty ProbabilityTable object.
15
+ #
16
+ def initialize
17
+ @dirty = false
18
+ @total = 0
19
+ @frequencies = {}
20
+ @probabilities = {}
21
+ end
22
+
23
+ #
24
+ # Returns +true+ if the probability table is dirty and needs to be
25
+ # rebuilt, returns +false+ otherwise.
26
+ #
27
+ def dirty?
28
+ @dirty == true
29
+ end
30
+
31
+ #
32
+ # Returns +true+ if the probability table contains the specified _gram_,
33
+ # returns +false+ otherwise.
34
+ #
35
+ def has_gram?(gram)
36
+ @frequencies.has_key?(gram)
37
+ end
38
+
39
+ #
40
+ # Returns the grams within the probability table.
41
+ #
42
+ def grams
43
+ @frequencies.keys
44
+ end
45
+
46
+ #
47
+ # Iterates over each gram in the probability table, passing each to the
48
+ # given _block_.
49
+ #
50
+ def each_gram(&block)
51
+ @frequencies.each_key(&block)
52
+ end
53
+
54
+ #
55
+ # Returns the frequency of the specified _gram_. Returns +0+ by default.
56
+ #
57
+ def frequency_of(gram)
58
+ @frequencies[gram] || 0
59
+ end
60
+
61
+ #
62
+ # Returns the probability of the specified _gram_ occurring. Returns
63
+ # <tt>0.0</tt> by default.
64
+ #
65
+ def probability_of(gram)
66
+ @probabilities[gram] || 0.0
67
+ end
68
+
69
+ alias [] probability_of
70
+
71
+ #
72
+ # Sets the frequency of the specified _gram_ to the specified _value_.
73
+ #
74
+ def set_count(gram,value)
75
+ @dirty = true
76
+ @frequencies[gram] = value
77
+ end
78
+
79
+ #
80
+ # Increments the frequency of the specified _gram_ and marks the
81
+ # probability table as dirty.
82
+ #
83
+ def count(gram)
84
+ @dirty = true
85
+
86
+ unless @frequencies.has_key?(gram)
87
+ @frequencies[gram] = 0
88
+ end
89
+
90
+ return @frequencies[gram] += 1
91
+ end
92
+
93
+ #
94
+ # Returns the total, recalculated via the summation of the frequencies
95
+ # when the probability table is marked as dirty.
96
+ #
97
+ def total
98
+ if @dirty
99
+ @total = @frequencies.values.inject do |sum,freq|
100
+ sum + freq
101
+ end
102
+ end
103
+
104
+ return @total
105
+ end
106
+
107
+ #
108
+ # Builds the probability table using the recorded frequencies, if the
109
+ # table is marked as dirty.
110
+ #
111
+ def build
112
+ if @dirty
113
+ current_total = total.to_f
114
+
115
+ @frequencies.each do |gram,count|
116
+ @probabilities[gram] = count.to_f / current_total
117
+ end
118
+
119
+ @dirty = false
120
+ end
121
+
122
+ return self
123
+ end
124
+
125
+ #
126
+ # Returns +true+ if the probability table is empty, returns +false+
127
+ # otherwise.
128
+ #
129
+ def empty?
130
+ @total == 0
131
+ end
132
+
133
+ #
134
+ # Clears the probability table.
135
+ #
136
+ def clear
137
+ @total = 0
138
+ @frequencies.clear
139
+ @probabilities.clear
140
+
141
+ return self
142
+ end
143
+
144
+ def inspect
145
+ if @dirty
146
+ "#<ProbabilityTable @total=#{@total} @frequencies=#{@frequencies.inspect}>"
147
+ else
148
+ @probabilities.inspect
149
+ end
150
+ end
151
+
152
+ end
153
+ end
@@ -1,13 +1,9 @@
1
- require 'raingrams/multigram_model'
1
+ require 'raingrams/model'
2
2
 
3
3
  module Raingrams
4
- class QuadgramModel < MultigramModel
4
+ class QuadgramModel < Model
5
5
 
6
- def initialize(opts={},&block)
7
- opts[:ngram_size] = 4
8
-
9
- super(opts,&block)
10
- end
6
+ ngram_size 4
11
7
 
12
8
  end
13
9
  end
@@ -1,31 +1,21 @@
1
- require 'raingrams/unigram_model'
2
- require 'raingrams/multigram_model'
3
- require 'raingrams/open_vocabulary/unigram_model'
4
- require 'raingrams/open_vocabulary/multigram_model'
1
+ require 'raingrams/model'
2
+ require 'raingrams/open_vocabulary/model'
5
3
 
6
4
  module Raingrams
7
- def Raingrams.closed_vocabulary_model(opts={},&block)
8
- if opts[:ngram_size]==1
9
- return UnigramModel.new(opts,&block)
10
- else
11
- return MultigramModel.new(opts,&block)
12
- end
5
+ def Raingrams.closed_vocabulary_model(options={},&block)
6
+ Model.new(options,&block)
13
7
  end
14
8
 
15
- def Raingrams.open_vocabulary_model(opts={},&block)
16
- if opts[:ngram_size]==1
17
- return OpenVocabulary::UnigramModel.new(opts,&block)
18
- else
19
- return OpenVocabulary::MultigramModel.new(opts,&block)
20
- end
9
+ def Raingrams.open_vocabulary_model(options={},&block)
10
+ OpenVocabulary::Model.new(options,&block)
21
11
  end
22
12
 
23
- def Raingrams.model(opts={},&block)
24
- case opts[:vocabulary]
13
+ def Raingrams.model(options={},&block)
14
+ case options[:vocabulary]
25
15
  when :open, 'open'
26
- return Raingrams.open_vocabulary_model(opts,&block)
16
+ return Raingrams.open_vocabulary_model(options,&block)
27
17
  else
28
- return Raingrams.closed_vocabulary_model(opts,&block)
18
+ return Raingrams.closed_vocabulary_model(options,&block)
29
19
  end
30
20
  end
31
21
  end
@@ -4,8 +4,8 @@ module Raingrams
4
4
  module Tokens
5
5
  class StartSentence < Token
6
6
 
7
- def self.to_s
8
- '<s>'
7
+ def initialize
8
+ super('<s>')
9
9
  end
10
10
 
11
11
  end
@@ -4,8 +4,8 @@ module Raingrams
4
4
  module Tokens
5
5
  class StopSentence < Token
6
6
 
7
- def self.to_s
8
- '</s>'
7
+ def initialize
8
+ super('</s>')
9
9
  end
10
10
 
11
11
  end
@@ -2,16 +2,60 @@ module Raingrams
2
2
  module Tokens
3
3
  class Token
4
4
 
5
- def self.*(length)
5
+ # Gram form of the token
6
+ attr_reader :gram
7
+
8
+ #
9
+ # Creates a new Token object with the specified _gram_.
10
+ #
11
+ def initialize(gram)
12
+ @gram = gram
13
+ end
14
+
15
+ def to_gram
16
+ self
17
+ end
18
+
19
+ #
20
+ # Creates an Array of the specified _length_ containing the token.
21
+ #
22
+ def *(length)
6
23
  [self] * length
7
24
  end
8
25
 
9
- def self.to_sym
10
- self.to_s.to_sym
26
+ #
27
+ # Returns +true+ if the token has the same gram as the _other_ token,
28
+ # returns +false+ otherwise.
29
+ #
30
+ def eql?(other)
31
+ if other.kind_of?(Token)
32
+ return (@gram == other.gram)
33
+ end
34
+
35
+ return false
36
+ end
37
+
38
+ alias == eql?
39
+
40
+ #
41
+ # Returns the String form of the token.
42
+ #
43
+ def to_s
44
+ @gram.to_s
45
+ end
46
+
47
+ #
48
+ # Returns the Symbol form of the token.
49
+ #
50
+ def to_sym
51
+ @gram.to_sym
11
52
  end
12
53
 
13
- def self.inspect
14
- self.to_s
54
+ #
55
+ # Returns the String form of the token.
56
+ #
57
+ def inspect
58
+ @gram.to_s
15
59
  end
16
60
 
17
61
  end
@@ -4,8 +4,8 @@ module Raingrams
4
4
  module Tokens
5
5
  class Unknown < Token
6
6
 
7
- def self.to_s
8
- '<unknown>'
7
+ def initialize
8
+ super('<unknown>')
9
9
  end
10
10
 
11
11
  end
@@ -2,3 +2,4 @@ require 'raingrams/tokens/token'
2
2
  require 'raingrams/tokens/start_sentence'
3
3
  require 'raingrams/tokens/stop_sentence'
4
4
  require 'raingrams/tokens/unknown'
5
+ require 'raingrams/tokens/tokens'
@@ -1,13 +1,9 @@
1
- require 'raingrams/multigram_model'
1
+ require 'raingrams/model'
2
2
 
3
3
  module Raingrams
4
- class TrigramModel < MultigramModel
4
+ class TrigramModel < Model
5
5
 
6
- def initialize(opts={},&block)
7
- opts[:ngram_size] = 3
8
-
9
- super(opts,&block)
10
- end
6
+ ngram_size 3
11
7
 
12
8
  end
13
9
  end
@@ -1,3 +1,3 @@
1
1
  module Raingrams
2
- VERSION = '0.0.9'
2
+ VERSION = '0.1.0'
3
3
  end
data/lib/raingrams.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'raingrams/extensions'
2
2
  require 'raingrams/raingrams'
3
3
  require 'raingrams/ngram'
4
- require 'raingrams/unigram_model'
4
+ require 'raingrams/model'
5
5
  require 'raingrams/bigram_model'
6
6
  require 'raingrams/trigram_model'
7
7
  require 'raingrams/quadgram_model'
@@ -0,0 +1,54 @@
1
+ require 'raingrams/ngram_set'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe NgramSet do
6
+ before(:all) do
7
+ @ngrams = NgramSet[
8
+ Ngram[:the, :dog],
9
+ Ngram[:dog, :jumped],
10
+ Ngram[:jumped, :through],
11
+ Ngram[:through, :the],
12
+ Ngram[:the, :hoop]
13
+ ]
14
+ end
15
+
16
+ it "should select ngrams from the set" do
17
+ @ngrams.select { |ngram|
18
+ ngram.starts_with?(:the)
19
+ }.should == NgramSet[Ngram[:the, :dog], Ngram[:the, :hoop]]
20
+ end
21
+
22
+ it "should select ngrams with a specified prefixed" do
23
+ @ngrams.prefixed_by(Ngram[:dog]).should == NgramSet[
24
+ Ngram[:dog, :jumped]
25
+ ]
26
+ end
27
+
28
+ it "should select ngrams with a specified postfix" do
29
+ @ngrams.postfixed_by(Ngram[:through]).should == NgramSet[
30
+ Ngram[:jumped, :through]
31
+ ]
32
+ end
33
+
34
+ it "should select ngrams starting with a specified gram" do
35
+ @ngrams.starts_with(:jumped).should == NgramSet[Ngram[:jumped, :through]]
36
+ end
37
+
38
+ it "should select ngrams ending with a specified gram" do
39
+ @ngrams.ends_with(:dog).should == NgramSet[Ngram[:the, :dog]]
40
+ end
41
+
42
+ it "should select ngrams including a specified gram" do
43
+ @ngrams.including(:dog).should == NgramSet[
44
+ Ngram[:the, :dog],
45
+ Ngram[:dog, :jumped]
46
+ ]
47
+ end
48
+
49
+ it "should select ngrams which includes specified grams" do
50
+ @ngrams.includes(:the, :dog).should == NgramSet[
51
+ Ngram[:the, :dog],
52
+ ]
53
+ end
54
+ end
@@ -0,0 +1,29 @@
1
+ require 'raingrams/ngram'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Ngram do
6
+ before(:all) do
7
+ @ngram = Ngram[:one, :two, :three]
8
+ end
9
+
10
+ it "should have a prefix" do
11
+ @ngram.prefix.should == Ngram[:one, :two]
12
+ end
13
+
14
+ it "should have a postfix" do
15
+ @ngram.postfix.should == Ngram[:two, :three]
16
+ end
17
+
18
+ it "should begin with a gram" do
19
+ @ngram.starts_with?(:one).should == true
20
+ end
21
+
22
+ it "should end with a gram" do
23
+ @ngram.ends_with?(:three).should == true
24
+ end
25
+
26
+ it "should include certain grams" do
27
+ @ngram.includes?(:one, :three).should == true
28
+ end
29
+ end