raingrams 0.0.9 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45)
  1. data/History.txt +9 -0
  2. data/Manifest.txt +10 -10
  3. data/README.txt +9 -7
  4. data/Rakefile +3 -6
  5. data/TODO.txt +6 -0
  6. data/lib/raingrams/bigram_model.rb +3 -7
  7. data/lib/raingrams/extensions/object.rb +4 -1
  8. data/lib/raingrams/extensions/string.rb +3 -0
  9. data/lib/raingrams/extensions.rb +0 -5
  10. data/lib/raingrams/hexagram_model.rb +3 -7
  11. data/lib/raingrams/model.rb +622 -61
  12. data/lib/raingrams/ngram.rb +50 -9
  13. data/lib/raingrams/ngram_set.rb +43 -0
  14. data/lib/raingrams/open_vocabulary/model.rb +12 -0
  15. data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
  16. data/lib/raingrams/open_vocabulary.rb +0 -1
  17. data/lib/raingrams/pentagram_model.rb +3 -7
  18. data/lib/raingrams/probability_table.rb +153 -0
  19. data/lib/raingrams/quadgram_model.rb +3 -7
  20. data/lib/raingrams/raingrams.rb +10 -20
  21. data/lib/raingrams/tokens/start_sentence.rb +2 -2
  22. data/lib/raingrams/tokens/stop_sentence.rb +2 -2
  23. data/lib/raingrams/tokens/token.rb +49 -5
  24. data/lib/raingrams/tokens/unknown.rb +2 -2
  25. data/lib/raingrams/tokens.rb +1 -0
  26. data/lib/raingrams/trigram_model.rb +3 -7
  27. data/lib/raingrams/version.rb +1 -1
  28. data/lib/raingrams.rb +1 -1
  29. data/spec/ngram_set_spec.rb +54 -0
  30. data/spec/ngram_spec.rb +29 -0
  31. data/spec/probability_table_spec.rb +94 -0
  32. data/spec/raingrams_spec.rb +9 -0
  33. data/spec/spec_helper.rb +5 -0
  34. data/tasks/spec.rb +7 -0
  35. metadata +65 -55
  36. data/lib/raingrams/extensions/class.rb +0 -7
  37. data/lib/raingrams/extensions/false_class.rb +0 -7
  38. data/lib/raingrams/extensions/nil_class.rb +0 -7
  39. data/lib/raingrams/extensions/symbol.rb +0 -7
  40. data/lib/raingrams/extensions/true_class.rb +0 -7
  41. data/lib/raingrams/multigram_model.rb +0 -165
  42. data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
  43. data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
  44. data/lib/raingrams/unigram_model.rb +0 -70
  45. data/test/test_raingrams.rb +0 -0
@@ -1,20 +1,53 @@
1
+ require 'raingrams/extensions'
2
+
1
3
  module Raingrams
2
4
  class Ngram < Array
3
5
 
4
- def initialize(objs)
5
- super(objs.map { |obj| obj.to_gram })
6
+ #
7
+ # Creates a new Ngram object with the specified _objects_.
8
+ #
9
+ def initialize(objects)
10
+ super(objects.map { |obj| obj.to_gram })
11
+ end
12
+
13
+ #
14
+ # Creates a new Ngram object from the specified _objects_.
15
+ #
16
+ def self.[](*objects)
17
+ self.new(objects)
18
+ end
19
+
20
+ #
21
+ # Creates a new Ngram object by appending the specified _grams_ to the
22
+ # ngram.
23
+ #
24
+ def +(grams)
25
+ if grams.kind_of?(Array)
26
+ return self.class.new(super(grams.map { |gram|
27
+ gram.to_gram
28
+ }))
29
+ else
30
+ return self.class.new(super([grams.to_gram]))
31
+ end
6
32
  end
7
33
 
8
- def self.[](*objs)
9
- self.new(objs)
34
+ def <<(gram)
35
+ super(gram.to_gram)
10
36
  end
11
37
 
38
+ #
39
+ # Returns the prefix of the ngram.
40
+ #
12
41
  def prefix
13
42
  self[0...length-1]
14
43
  end
15
44
 
16
- def prefixed_by?(ngram)
17
- prefix==ngram
45
+ #
46
+ # Returns +true+ if the ngram is prefixed by the specified
47
+ # _smaller_ngram_.
48
+ #
49
+ def prefixed_by?(smaller_ngram)
50
+ prefix == smaller_ngram
18
51
  end
19
52
 
20
53
  def postfix
@@ -22,21 +55,25 @@ module Raingrams
22
55
  end
23
56
 
24
57
  def postfixed_by?(ngram)
25
- postfix==ngram
58
+ postfix == ngram
26
59
  end
27
60
 
28
61
  def starts_with?(obj)
29
- self[0]==obj.to_gram
62
+ self.first == obj.to_gram
30
63
  end
31
64
 
32
65
  def ends_with?(obj)
33
- self[-1]==obj.to_gram
66
+ self.last == obj.to_gram
34
67
  end
35
68
 
36
69
  def include?(obj)
37
70
  super(obj.to_gram)
38
71
  end
39
72
 
73
+ def includes?(*grams)
74
+ (self & grams) == grams
75
+ end
76
+
40
77
  def flatten
41
78
  self.dup
42
79
  end
@@ -49,5 +86,9 @@ module Raingrams
49
86
  join(', ')
50
87
  end
51
88
 
89
+ def inspect
90
+ 'Ngram[' + self.map { |gram| gram.inspect }.join(', ') + ']'
91
+ end
92
+
52
93
  end
53
94
  end
@@ -0,0 +1,43 @@
1
+ require 'raingrams/ngram'
2
+
3
+ require 'set'
4
+
5
+ module Raingrams
6
+ class NgramSet < Set
7
+
8
+ def select(&block)
9
+ selected_ngrams = self.class.new
10
+
11
+ each do |ngram|
12
+ selected_ngrams << ngram if block.call(ngram)
13
+ end
14
+
15
+ return selected_ngrams
16
+ end
17
+
18
+ def prefixed_by(prefix)
19
+ select { |ngram| ngram.prefixed_by?(prefix) }
20
+ end
21
+
22
+ def postfixed_by(postfix)
23
+ select { |ngram| ngram.postfixed_by?(postfix) }
24
+ end
25
+
26
+ def starts_with(gram)
27
+ select { |ngram| ngram.starts_with?(gram) }
28
+ end
29
+
30
+ def ends_with(gram)
31
+ select { |ngram| ngram.ends_with?(gram) }
32
+ end
33
+
34
+ def including(gram)
35
+ select { |ngram| ngram.include?(gram) }
36
+ end
37
+
38
+ def includes(*grams)
39
+ select { |ngram| ngram.includes?(*grams) }
40
+ end
41
+
42
+ end
43
+ end
@@ -0,0 +1,12 @@
1
+ require 'raingrams/open_vocabulary/open_model'
2
+ require 'raingrams/model'
3
+
4
+ module Raingrams
5
+ module OpenVocabulary
6
+ class Model < Raingrams::Model
7
+
8
+ include OpenModel
9
+
10
+ end
11
+ end
12
+ end
@@ -7,14 +7,18 @@ module Raingrams
7
7
  # The fixed lexicon of this model
8
8
  attr_reader :lexicon
9
9
 
10
- def initialize(opts={},&block)
11
- @lexicon = opts[:lexicon] || []
10
+ def initialize(options={},&block)
11
+ @lexicon = (options[:lexicon] || [])
12
12
 
13
- super(opts,&block)
13
+ @lexicon.map! do |word|
14
+ word.to_gram
15
+ end
16
+
17
+ super(options,&block)
14
18
  end
15
19
 
16
20
  def within_lexicon?(gram)
17
- @lexicon.include?(gram)
21
+ @lexicon.include?(gram.to_gram)
18
22
  end
19
23
 
20
24
  def train_ngram(ngram)
@@ -1,4 +1,3 @@
1
- require 'raingrams/openvocabulary/unigram_model'
2
1
  require 'raingrams/openvocabulary/multigram_model'
3
2
  require 'raingrams/openvocabulary/bigram_model'
4
3
  require 'raingrams/openvocabulary/trigram_model'
@@ -1,13 +1,9 @@
1
- require 'raingrams/multigram_model'
1
+ require 'raingrams/model'
2
2
 
3
3
  module Raingrams
4
- class PentagramModel < MultigramModel
4
+ class PentagramModel < Model
5
5
 
6
- def initialize(opts={},&block)
7
- opts[:ngram_size] = 5
8
-
9
- super(opts,&block)
10
- end
6
+ ngram_size 5
11
7
 
12
8
  end
13
9
  end
@@ -0,0 +1,153 @@
1
+ module Raingrams
2
+ class ProbabilityTable
3
+
4
+ # Indicates whether the table needs to be rebuilt
5
+ attr_reader :dirty
6
+
7
+ # Frequencies of grams
8
+ attr_reader :frequencies
9
+
10
+ # Probabilities of grams
11
+ attr_reader :probabilities
12
+
13
+ #
14
+ # Creates a new empty ProbabilityTable object.
15
+ #
16
+ def initialize
17
+ @dirty = false
18
+ @total = 0
19
+ @frequencies = {}
20
+ @probabilities = {}
21
+ end
22
+
23
+ #
24
+ # Returns +true+ if the probability table is dirty and needs to be
25
+ # rebuilt, returns +false+ otherwise.
26
+ #
27
+ def dirty?
28
+ @dirty == true
29
+ end
30
+
31
+ #
32
+ # Returns +true+ if the probability table contains the specified _gram_,
33
+ # returns +false+ otherwise.
34
+ #
35
+ def has_gram?(gram)
36
+ @frequencies.has_key?(gram)
37
+ end
38
+
39
+ #
40
+ # Returns the grams within the probability table.
41
+ #
42
+ def grams
43
+ @frequencies.keys
44
+ end
45
+
46
+ #
47
+ # Iterates over each gram in the probability table, passing each to the
48
+ # given _block_.
49
+ #
50
+ def each_gram(&block)
51
+ @frequencies.each_key(&block)
52
+ end
53
+
54
+ #
55
+ # Returns the frequency of the specified _gram_. Returns +0+ by default.
56
+ #
57
+ def frequency_of(gram)
58
+ @frequencies[gram] || 0
59
+ end
60
+
61
+ #
62
+ # Returns the probability of the specified _gram_ occurring. Returns
63
+ # <tt>0.0</tt> by default.
64
+ #
65
+ def probability_of(gram)
66
+ @probabilities[gram] || 0.0
67
+ end
68
+
69
+ alias [] probability_of
70
+
71
+ #
72
+ # Sets the frequency of the specified _gram_ to the specified _value_.
73
+ #
74
+ def set_count(gram,value)
75
+ @dirty = true
76
+ @frequencies[gram] = value
77
+ end
78
+
79
+ #
80
+ # Increments the frequency of the specified _gram_ and marks the
81
+ # probability table as dirty.
82
+ #
83
+ def count(gram)
84
+ @dirty = true
85
+
86
+ unless @frequencies.has_key?(gram)
87
+ @frequencies[gram] = 0
88
+ end
89
+
90
+ return @frequencies[gram] += 1
91
+ end
92
+
93
+ #
94
+ # Calculates the total via the summation of the frequencies. The sum is
95
+ # only recomputed when the probability table is marked as dirty.
96
+ #
97
+ def total
98
+ if @dirty
99
+ @total = @frequencies.values.inject do |sum,freq|
100
+ sum + freq
101
+ end
102
+ end
103
+
104
+ return @total
105
+ end
106
+
107
+ #
108
+ # Builds the probability table using the recorded frequencies, if the
109
+ # table is marked as dirty.
110
+ #
111
+ def build
112
+ if @dirty
113
+ current_total = total.to_f
114
+
115
+ @frequencies.each do |gram,count|
116
+ @probabilities[gram] = count.to_f / current_total
117
+ end
118
+
119
+ @dirty = false
120
+ end
121
+
122
+ return self
123
+ end
124
+
125
+ #
126
+ # Returns +true+ if the probability table is empty, returns +false+
127
+ # otherwise.
128
+ #
129
+ def empty?
130
+ @total == 0
131
+ end
132
+
133
+ #
134
+ # Clears the probability table.
135
+ #
136
+ def clear
137
+ @total = 0
138
+ @frequencies.clear
139
+ @probabilities.clear
140
+
141
+ return self
142
+ end
143
+
144
+ def inspect
145
+ if @dirty
146
+ "#<ProbabilityTable @total=#{@total} @frequencies=#{@frequencies.inspect}>"
147
+ else
148
+ @probabilities.inspect
149
+ end
150
+ end
151
+
152
+ end
153
+ end
@@ -1,13 +1,9 @@
1
- require 'raingrams/multigram_model'
1
+ require 'raingrams/model'
2
2
 
3
3
  module Raingrams
4
- class QuadgramModel < MultigramModel
4
+ class QuadgramModel < Model
5
5
 
6
- def initialize(opts={},&block)
7
- opts[:ngram_size] = 4
8
-
9
- super(opts,&block)
10
- end
6
+ ngram_size 4
11
7
 
12
8
  end
13
9
  end
@@ -1,31 +1,21 @@
1
- require 'raingrams/unigram_model'
2
- require 'raingrams/multigram_model'
3
- require 'raingrams/open_vocabulary/unigram_model'
4
- require 'raingrams/open_vocabulary/multigram_model'
1
+ require 'raingrams/model'
2
+ require 'raingrams/open_vocabulary/model'
5
3
 
6
4
  module Raingrams
7
- def Raingrams.closed_vocabulary_model(opts={},&block)
8
- if opts[:ngram_size]==1
9
- return UnigramModel.new(opts,&block)
10
- else
11
- return MultigramModel.new(opts,&block)
12
- end
5
+ def Raingrams.closed_vocabulary_model(options={},&block)
6
+ Model.new(options,&block)
13
7
  end
14
8
 
15
- def Raingrams.open_vocabulary_model(opts={},&block)
16
- if opts[:ngram_size]==1
17
- return OpenVocabulary::UnigramModel.new(opts,&block)
18
- else
19
- return OpenVocabulary::MultigramModel.new(opts,&block)
20
- end
9
+ def Raingrams.open_vocabulary_model(options={},&block)
10
+ OpenVocabulary::Model.new(options,&block)
21
11
  end
22
12
 
23
- def Raingrams.model(opts={},&block)
24
- case opts[:vocabulary]
13
+ def Raingrams.model(options={},&block)
14
+ case options[:vocabulary]
25
15
  when :open, 'open'
26
- return Raingrams.open_vocabulary_model(opts,&block)
16
+ return Raingrams.open_vocabulary_model(options,&block)
27
17
  else
28
- return Raingrams.closed_vocabulary_model(opts,&block)
18
+ return Raingrams.closed_vocabulary_model(options,&block)
29
19
  end
30
20
  end
31
21
  end
@@ -4,8 +4,8 @@ module Raingrams
4
4
  module Tokens
5
5
  class StartSentence < Token
6
6
 
7
- def self.to_s
8
- '<s>'
7
+ def initialize
8
+ super('<s>')
9
9
  end
10
10
 
11
11
  end
@@ -4,8 +4,8 @@ module Raingrams
4
4
  module Tokens
5
5
  class StopSentence < Token
6
6
 
7
- def self.to_s
8
- '</s>'
7
+ def initialize
8
+ super('</s>')
9
9
  end
10
10
 
11
11
  end
@@ -2,16 +2,60 @@ module Raingrams
2
2
  module Tokens
3
3
  class Token
4
4
 
5
- def self.*(length)
5
+ # Gram form of the token
6
+ attr_reader :gram
7
+
8
+ #
9
+ # Creates a new Token object with the specified _gram_.
10
+ #
11
+ def initialize(gram)
12
+ @gram = gram
13
+ end
14
+
15
+ def to_gram
16
+ self
17
+ end
18
+
19
+ #
20
+ # Creates an Array of the specified _length_ containing the token.
21
+ #
22
+ def *(length)
6
23
  [self] * length
7
24
  end
8
25
 
9
- def self.to_sym
10
- self.to_s.to_sym
26
+ #
27
+ # Returns +true+ if the token has the same gram as the _other_ token,
28
+ # returns +false+ otherwise.
29
+ #
30
+ def eql?(other)
31
+ if other.kind_of?(Token)
32
+ return (@gram == other.gram)
33
+ end
34
+
35
+ return false
36
+ end
37
+
38
+ alias == eql?
39
+
40
+ #
41
+ # Returns the String form of the token.
42
+ #
43
+ def to_s
44
+ @gram.to_s
45
+ end
46
+
47
+ #
48
+ # Returns the Symbol form of the token.
49
+ #
50
+ def to_sym
51
+ @gram.to_sym
11
52
  end
12
53
 
13
- def self.inspect
14
- self.to_s
54
+ #
55
+ # Returns the String form of the token.
56
+ #
57
+ def inspect
58
+ @gram.to_s
15
59
  end
16
60
 
17
61
  end
@@ -4,8 +4,8 @@ module Raingrams
4
4
  module Tokens
5
5
  class Unknown < Token
6
6
 
7
- def self.to_s
8
- '<unknown>'
7
+ def initialize
8
+ super('<unknown>')
9
9
  end
10
10
 
11
11
  end
@@ -2,3 +2,4 @@ require 'raingrams/tokens/token'
2
2
  require 'raingrams/tokens/start_sentence'
3
3
  require 'raingrams/tokens/stop_sentence'
4
4
  require 'raingrams/tokens/unknown'
5
+ require 'raingrams/tokens/tokens'
@@ -1,13 +1,9 @@
1
- require 'raingrams/multigram_model'
1
+ require 'raingrams/model'
2
2
 
3
3
  module Raingrams
4
- class TrigramModel < MultigramModel
4
+ class TrigramModel < Model
5
5
 
6
- def initialize(opts={},&block)
7
- opts[:ngram_size] = 3
8
-
9
- super(opts,&block)
10
- end
6
+ ngram_size 3
11
7
 
12
8
  end
13
9
  end
@@ -1,3 +1,3 @@
1
1
  module Raingrams
2
- VERSION = '0.0.9'
2
+ VERSION = '0.1.0'
3
3
  end
data/lib/raingrams.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'raingrams/extensions'
2
2
  require 'raingrams/raingrams'
3
3
  require 'raingrams/ngram'
4
- require 'raingrams/unigram_model'
4
+ require 'raingrams/model'
5
5
  require 'raingrams/bigram_model'
6
6
  require 'raingrams/trigram_model'
7
7
  require 'raingrams/quadgram_model'
@@ -0,0 +1,54 @@
1
+ require 'raingrams/ngram_set'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe NgramSet do
6
+ before(:all) do
7
+ @ngrams = NgramSet[
8
+ Ngram[:the, :dog],
9
+ Ngram[:dog, :jumped],
10
+ Ngram[:jumped, :through],
11
+ Ngram[:through, :the],
12
+ Ngram[:the, :hoop]
13
+ ]
14
+ end
15
+
16
+ it "should select ngrams from the set" do
17
+ @ngrams.select { |ngram|
18
+ ngram.starts_with?(:the)
19
+ }.should == NgramSet[Ngram[:the, :dog], Ngram[:the, :hoop]]
20
+ end
21
+
22
+ it "should select ngrams with a specified prefixed" do
23
+ @ngrams.prefixed_by(Ngram[:dog]).should == NgramSet[
24
+ Ngram[:dog, :jumped]
25
+ ]
26
+ end
27
+
28
+ it "should select ngrams with a specified postfix" do
29
+ @ngrams.postfixed_by(Ngram[:through]).should == NgramSet[
30
+ Ngram[:jumped, :through]
31
+ ]
32
+ end
33
+
34
+ it "should select ngrams starting with a specified gram" do
35
+ @ngrams.starts_with(:jumped).should == NgramSet[Ngram[:jumped, :through]]
36
+ end
37
+
38
+ it "should select ngrams ending with a specified gram" do
39
+ @ngrams.ends_with(:dog).should == NgramSet[Ngram[:the, :dog]]
40
+ end
41
+
42
+ it "should select ngrams including a specified gram" do
43
+ @ngrams.including(:dog).should == NgramSet[
44
+ Ngram[:the, :dog],
45
+ Ngram[:dog, :jumped]
46
+ ]
47
+ end
48
+
49
+ it "should select ngrams which includes specified grams" do
50
+ @ngrams.includes(:the, :dog).should == NgramSet[
51
+ Ngram[:the, :dog],
52
+ ]
53
+ end
54
+ end
@@ -0,0 +1,29 @@
1
+ require 'raingrams/ngram'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Ngram do
6
+ before(:all) do
7
+ @ngram = Ngram[:one, :two, :three]
8
+ end
9
+
10
+ it "should have a prefix" do
11
+ @ngram.prefix.should == Ngram[:one, :two]
12
+ end
13
+
14
+ it "should have a postfix" do
15
+ @ngram.postfix.should == Ngram[:two, :three]
16
+ end
17
+
18
+ it "should begin with a gram" do
19
+ @ngram.starts_with?(:one).should == true
20
+ end
21
+
22
+ it "should end with a gram" do
23
+ @ngram.ends_with?(:three).should == true
24
+ end
25
+
26
+ it "should include certain grams" do
27
+ @ngram.includes?(:one, :three).should == true
28
+ end
29
+ end