raingrams 0.0.9 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. data/History.txt +9 -0
  2. data/Manifest.txt +10 -10
  3. data/README.txt +9 -7
  4. data/Rakefile +3 -6
  5. data/TODO.txt +6 -0
  6. data/lib/raingrams/bigram_model.rb +3 -7
  7. data/lib/raingrams/extensions/object.rb +4 -1
  8. data/lib/raingrams/extensions/string.rb +3 -0
  9. data/lib/raingrams/extensions.rb +0 -5
  10. data/lib/raingrams/hexagram_model.rb +3 -7
  11. data/lib/raingrams/model.rb +622 -61
  12. data/lib/raingrams/ngram.rb +50 -9
  13. data/lib/raingrams/ngram_set.rb +43 -0
  14. data/lib/raingrams/open_vocabulary/model.rb +12 -0
  15. data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
  16. data/lib/raingrams/open_vocabulary.rb +0 -1
  17. data/lib/raingrams/pentagram_model.rb +3 -7
  18. data/lib/raingrams/probability_table.rb +153 -0
  19. data/lib/raingrams/quadgram_model.rb +3 -7
  20. data/lib/raingrams/raingrams.rb +10 -20
  21. data/lib/raingrams/tokens/start_sentence.rb +2 -2
  22. data/lib/raingrams/tokens/stop_sentence.rb +2 -2
  23. data/lib/raingrams/tokens/token.rb +49 -5
  24. data/lib/raingrams/tokens/unknown.rb +2 -2
  25. data/lib/raingrams/tokens.rb +1 -0
  26. data/lib/raingrams/trigram_model.rb +3 -7
  27. data/lib/raingrams/version.rb +1 -1
  28. data/lib/raingrams.rb +1 -1
  29. data/spec/ngram_set_spec.rb +54 -0
  30. data/spec/ngram_spec.rb +29 -0
  31. data/spec/probability_table_spec.rb +94 -0
  32. data/spec/raingrams_spec.rb +9 -0
  33. data/spec/spec_helper.rb +5 -0
  34. data/tasks/spec.rb +7 -0
  35. metadata +65 -55
  36. data/lib/raingrams/extensions/class.rb +0 -7
  37. data/lib/raingrams/extensions/false_class.rb +0 -7
  38. data/lib/raingrams/extensions/nil_class.rb +0 -7
  39. data/lib/raingrams/extensions/symbol.rb +0 -7
  40. data/lib/raingrams/extensions/true_class.rb +0 -7
  41. data/lib/raingrams/multigram_model.rb +0 -165
  42. data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
  43. data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
  44. data/lib/raingrams/unigram_model.rb +0 -70
  45. data/test/test_raingrams.rb +0 -0
data/spec/probability_table_spec.rb ADDED
@@ -0,0 +1,94 @@
1
+ require 'raingrams/probability_table'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe ProbabilityTable do
6
+ before(:all) do
7
+ @grams = [:a, :b, :a, :a, :b, :c, :d, 2, 3, :a]
8
+
9
+ @table = ProbabilityTable.new
10
+ @grams.each { |g| @table.count(g) }
11
+ end
12
+
13
+ describe "empty table" do
14
+ before(:all) do
15
+ @empty_table = ProbabilityTable.new
16
+ end
17
+
18
+ it "should not be dirty" do
19
+ @empty_table.should_not be_dirty
20
+ end
21
+
22
+ it "should be empty" do
23
+ @empty_table.should be_empty
24
+ end
25
+
26
+ it "should not have any frequencies" do
27
+ @empty_table.frequencies.should be_empty
28
+ end
29
+
30
+ it "should have no probabilities" do
31
+ @empty_table.probabilities.should be_empty
32
+ end
33
+
34
+ it "should have no grams" do
35
+ @empty_table.grams.should be_empty
36
+ end
37
+ end
38
+
39
+ describe "un-built table" do
40
+ it "should be dirty" do
41
+ @table.should be_dirty
42
+ end
43
+
44
+ it "should have the observed grams" do
45
+ (@table.grams - @grams.uniq).should be_empty
46
+ end
47
+
48
+ it "should have non-zero frequencies" do
49
+ @table.frequencies.each_value do |freq|
50
+ freq.should > 0
51
+ end
52
+ end
53
+
54
+ it "should have non-zero frequencies for grams it has observed" do
55
+ @grams.uniq.each do |g|
56
+ @table.frequency_of(g).should > 0
57
+ end
58
+ end
59
+
60
+ it "should return a zero frequency for unknown grams" do
61
+ @table.frequency_of(:x).should == 0
62
+ end
63
+
64
+ it "should not have any probabilities yet" do
65
+ @table.probabilities.should be_empty
66
+ end
67
+ end
68
+
69
+ describe "built table" do
70
+ before(:all) do
71
+ @table.build
72
+ end
73
+
74
+ it "should not be dirty" do
75
+ @table.should_not be_dirty
76
+ end
77
+
78
+ it "should return a zero probability for unknown grams" do
79
+ @table.probability_of(:x).should == 0.0
80
+ end
81
+
82
+ it "should have non-zero probabilities" do
83
+ @table.probabilities.each_value do |prob|
84
+ prob.should > 0.0
85
+ end
86
+ end
87
+
88
+ it "should have non-zero probabilities for grams it has observed" do
89
+ @grams.uniq.each do |g|
90
+ @table.probability_of(g).should > 0.0
91
+ end
92
+ end
93
+ end
94
+ end
data/spec/raingrams_spec.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'raingrams/version'
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Raingrams do
6
+ it "should have a VERSION constant" do
7
+ Raingrams.const_defined?('VERSION').should == true
8
+ end
9
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,5 @@
1
+ require 'rubygems'
2
+ gem 'rspec', '>=1.1.3'
3
+ require 'spec'
4
+
5
+ include Raingrams
data/tasks/spec.rb ADDED
@@ -0,0 +1,7 @@
1
+ require 'spec/rake/spectask'
2
+
3
+ desc "Run all specifications"
4
+ Spec::Rake::SpecTask.new(:spec) do |t|
5
+ t.libs += ['lib', 'spec']
6
+ t.spec_opts = ['--colour', '--format', 'specdoc']
7
+ end
metadata CHANGED
@@ -1,51 +1,54 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.4
3
- specification_version: 1
4
2
  name: raingrams
5
3
  version: !ruby/object:Gem::Version
6
- version: 0.0.9
7
- date: 2008-01-09 00:00:00 -08:00
8
- summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
9
- require_paths:
10
- - lib
11
- email: postmodern.mod3@gmail.com
12
- homepage: " by Postmodern Modulus III"
13
- rubyforge_project: raingrams
14
- description: "== FEATURES/PROBLEMS: * Supports all non-zero ngram sizes. * Supports text and non-text grams. * Supports Open and Closed vocabulary models. == REQUIREMENTS: == INSTALL: $ sudo gem install raingrams"
15
- autorequire:
16
- default_executable:
17
- bindir: bin
18
- has_rdoc: true
19
- required_ruby_version: !ruby/object:Gem::Version::Requirement
20
- requirements:
21
- - - ">"
22
- - !ruby/object:Gem::Version
23
- version: 0.0.0
24
- version:
4
+ version: 0.1.0
25
5
  platform: ruby
26
- signing_key:
27
- cert_chain:
28
- post_install_message:
29
6
  authors:
30
7
  - Postmodern Modulus III
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-10-06 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.7.0
24
+ version:
25
+ description: Raingrams is a flexible and general-purpose ngrams library written in Ruby. Raingrams supports any non-zero ngram size, text/non-text grams, multiple parsing styles and open/closed vocabulary models.
26
+ email:
27
+ - postmodern.mod3@gmail.com
28
+ executables: []
29
+
30
+ extensions: []
31
+
32
+ extra_rdoc_files:
33
+ - History.txt
34
+ - LICENSE.txt
35
+ - Manifest.txt
36
+ - README.txt
37
+ - TODO.txt
31
38
  files:
32
39
  - History.txt
33
40
  - LICENSE.txt
34
41
  - Manifest.txt
35
42
  - README.txt
43
+ - TODO.txt
36
44
  - Rakefile
37
45
  - lib/raingrams.rb
38
46
  - lib/raingrams/version.rb
39
47
  - lib/raingrams/raingrams.rb
40
48
  - lib/raingrams/exceptions/prefix_frequency_missing.rb
41
49
  - lib/raingrams/exceptions.rb
42
- - lib/raingrams/extensions/class.rb
43
- - lib/raingrams/extensions/false_class.rb
44
- - lib/raingrams/extensions/nil_class.rb
45
50
  - lib/raingrams/extensions/object.rb
46
51
  - lib/raingrams/extensions/string.rb
47
- - lib/raingrams/extensions/symbol.rb
48
- - lib/raingrams/extensions/true_class.rb
49
52
  - lib/raingrams/extensions.rb
50
53
  - lib/raingrams/tokens/token.rb
51
54
  - lib/raingrams/tokens/start_sentence.rb
@@ -53,47 +56,54 @@ files:
53
56
  - lib/raingrams/tokens/unknown.rb
54
57
  - lib/raingrams/tokens.rb
55
58
  - lib/raingrams/ngram.rb
59
+ - lib/raingrams/ngram_set.rb
60
+ - lib/raingrams/probability_table.rb
56
61
  - lib/raingrams/model.rb
57
- - lib/raingrams/unigram_model.rb
58
- - lib/raingrams/multigram_model.rb
59
62
  - lib/raingrams/bigram_model.rb
60
63
  - lib/raingrams/trigram_model.rb
61
64
  - lib/raingrams/quadgram_model.rb
62
65
  - lib/raingrams/pentagram_model.rb
63
66
  - lib/raingrams/hexagram_model.rb
64
67
  - lib/raingrams/open_vocabulary/open_model.rb
65
- - lib/raingrams/open_vocabulary/unigram_model.rb
66
- - lib/raingrams/open_vocabulary/multigram_model.rb
68
+ - lib/raingrams/open_vocabulary/model.rb
67
69
  - lib/raingrams/open_vocabulary/bigram_model.rb
68
70
  - lib/raingrams/open_vocabulary/trigram_model.rb
69
71
  - lib/raingrams/open_vocabulary/quadgram_model.rb
70
72
  - lib/raingrams/open_vocabulary/pentagram_model.rb
71
73
  - lib/raingrams/open_vocabulary/hexagram_model.rb
72
74
  - lib/raingrams/open_vocabulary.rb
73
- - test/test_raingrams.rb
74
- test_files:
75
- - test/test_raingrams.rb
75
+ - tasks/spec.rb
76
+ - spec/spec_helper.rb
77
+ - spec/ngram_spec.rb
78
+ - spec/ngram_set_spec.rb
79
+ - spec/probability_table_spec.rb
80
+ - spec/raingrams_spec.rb
81
+ has_rdoc: true
82
+ homepage: http://raingrams.rubyforge.org/
83
+ post_install_message:
76
84
  rdoc_options:
77
85
  - --main
78
86
  - README.txt
79
- extra_rdoc_files:
80
- - History.txt
81
- - LICENSE.txt
82
- - Manifest.txt
83
- - README.txt
84
- executables: []
85
-
86
- extensions: []
87
-
87
+ require_paths:
88
+ - lib
89
+ required_ruby_version: !ruby/object:Gem::Requirement
90
+ requirements:
91
+ - - ">="
92
+ - !ruby/object:Gem::Version
93
+ version: "0"
94
+ version:
95
+ required_rubygems_version: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - ">="
98
+ - !ruby/object:Gem::Version
99
+ version: "0"
100
+ version:
88
101
  requirements: []
89
102
 
90
- dependencies:
91
- - !ruby/object:Gem::Dependency
92
- name: hoe
93
- version_requirement:
94
- version_requirements: !ruby/object:Gem::Version::Requirement
95
- requirements:
96
- - - ">="
97
- - !ruby/object:Gem::Version
98
- version: 1.4.0
99
- version:
103
+ rubyforge_project: raingrams
104
+ rubygems_version: 1.3.0
105
+ signing_key:
106
+ specification_version: 2
107
+ summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
108
+ test_files: []
109
+
@@ -1,7 +0,0 @@
1
- class Class
2
-
3
- def to_gram
4
- self
5
- end
6
-
7
- end
@@ -1,7 +0,0 @@
1
- class FalseClass
2
-
3
- def to_gram
4
- self
5
- end
6
-
7
- end
@@ -1,7 +0,0 @@
1
- class NilClass
2
-
3
- def to_gram
4
- self
5
- end
6
-
7
- end
@@ -1,7 +0,0 @@
1
- class Symbol
2
-
3
- def to_gram
4
- self
5
- end
6
-
7
- end
@@ -1,7 +0,0 @@
1
- class TrueClass
2
-
3
- def to_gram
4
- self
5
- end
6
-
7
- end
@@ -1,165 +0,0 @@
1
- require 'raingrams/model'
2
- require 'raingrams/tokens/start_sentence'
3
- require 'raingrams/tokens/stop_sentence'
4
- require 'raingrams/exceptions/prefix_frequency_missing'
5
-
6
- module Raingrams
7
- class MultigramModel < Model
8
-
9
- # Frequencies of n-1 grams
10
- attr_reader :prefix_frequency
11
-
12
- def initialize(opts={},&block)
13
- @prefix_frequency = Hash.new { |hash,key| 0 }
14
-
15
- super(opts) { |model| model.build(&block) }
16
- end
17
-
18
- def ngrams_from_words(words)
19
- return (0...(words.length-@ngram_size+1)).map do |index|
20
- Ngram.new(words[index,@ngram_size])
21
- end
22
- end
23
-
24
- def ngrams_from_fragment(fragment)
25
- ngrams_from_words(parse_sentence(fragment))
26
- end
27
-
28
- def ngrams_from_sentence(sentence)
29
- ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
30
- end
31
-
32
- def ngrams_from_text(text)
33
- parse_text(text).inject([]) do |ngrams,sentence|
34
- ngrams + ngrams_from_sentence(sentence)
35
- end
36
- end
37
-
38
- def common_ngrams_from_words(words)
39
- ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
40
- end
41
-
42
- def common_ngrams_from_fragment(fragment)
43
- ngrams_from_fragment(words).select { |ngram| has_ngram?(ngram) }
44
- end
45
-
46
- def common_ngrams_from_sentence(sentence)
47
- ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
48
- end
49
-
50
- def common_ngrams_from_text(text)
51
- ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
52
- end
53
-
54
- def train_with_ngram(ngram)
55
- @prefix_frequency[ngram.prefix] += 1
56
- return super(ngram)
57
- end
58
-
59
- def train_with_sentence(sentence)
60
- train_with_ngrams(ngrams_from_sentence(sentence))
61
- end
62
-
63
- def train_with_text(text)
64
- train_with_ngrams(ngrams_from_text(text))
65
- end
66
-
67
- def build(&block)
68
- clear_probabilities
69
-
70
- block.call(self) if block
71
-
72
- @frequency.each do |ngram,count|
73
- prefix = ngram.prefix
74
-
75
- unless @prefix_frequency[prefix]
76
- raise(PrefixFrequencyMissing,"the model is missing the frequency of the ngram prefix #{prefix}",caller)
77
- end
78
-
79
- @probability[ngram] = count.to_f / @prefix_frequency[prefix].to_f
80
- end
81
-
82
- return self
83
- end
84
-
85
- def ngrams_prefixed_by(prefix)
86
- ngrams_with { |ngram| ngram.prefixed_by?(prefix) }
87
- end
88
-
89
- def ngrams_postfixed_by(postfix)
90
- ngrams_with { |ngram| ngram.prefixed_by?(postfix) }
91
- end
92
-
93
- def ngrams_preceeding(gram)
94
- ngrams_ending_with(gram).map do |ngram|
95
- ngrams_postfixed_by(ngram.prefix)
96
- end
97
- end
98
-
99
- def ngrams_following(gram)
100
- ngrams_starting_with(gram).map do |ngram|
101
- ngrams_prefixed_by(ngram.postfix)
102
- end
103
- end
104
-
105
- def grams_preceeding(gram)
106
- ngrams_ending_with(gram).map do |ngram|
107
- ngram[-2]
108
- end
109
- end
110
-
111
- def grams_following(gram)
112
- ngrams_starting_with(gram).map do |ngram|
113
- ngram[1]
114
- end
115
- end
116
-
117
- def fragment_probability(fragment)
118
- probability_of_ngrams(ngrams_from_fragment(fragment))
119
- end
120
-
121
- def sentence_probability(sentence)
122
- probability_of_ngrams(ngrams_from_sentence(sentence))
123
- end
124
-
125
- def text_probability(text)
126
- probability_of_ngrams(ngrams_from_text(text))
127
- end
128
-
129
- def common_fragment_probability(fragment)
130
- probability_of_ngrams(common_ngrams_from_fragment(fragment))
131
- end
132
-
133
- def common_sentence_probability(sentence)
134
- probability_of_ngrams(common_ngrams_from_sentence(sentence))
135
- end
136
-
137
- def common_text_probability(fragment)
138
- probability_of_ngrams(common_ngrams_from_text(text))
139
- end
140
-
141
- def similar_fragment_probability(other,fragment)
142
- common_fragment_probability(fragment) * other.common_fragment_probability(fragment)
143
- end
144
-
145
- def similar_sentence_probability(other,sentence)
146
- common_sentence_probability(sentence) * other.common_sentence_probability(sentence)
147
- end
148
-
149
- def similar_text_probability(other,text)
150
- common_text_probability(text) * other.common_text_probability(text)
151
- end
152
-
153
- def clear
154
- @prefix_frequency.clear
155
- return super
156
- end
157
-
158
- protected
159
-
160
- def wrap_sentence(sentence)
161
- (Tokens::StartSentence * @ngram_size) + sentence.to_a + (Tokens::StopSentence * @ngram_size)
162
- end
163
-
164
- end
165
- end
@@ -1,12 +0,0 @@
1
- require 'raingrams/multigrammodel'
2
- require 'raingrams/openvocabulary/openmodel'
3
-
4
- module Raingrams
5
- module OpenVocabulary
6
- class MultigramModel < Raingrams::MultigramModel
7
-
8
- include OpenModel
9
-
10
- end
11
- end
12
- end
@@ -1,12 +0,0 @@
1
- require 'raingrams/unigram_model'
2
- require 'raingrams/openvocabulary/open_model'
3
-
4
- module Raingrams
5
- module OpenVocabulary
6
- class UnigramModel < Raingrams::UnigramModel
7
-
8
- include OpenModel
9
-
10
- end
11
- end
12
- end
@@ -1,70 +0,0 @@
1
- require 'raingrams/model'
2
-
3
- module Raingrams
4
- class UnigramModel < Model
5
-
6
- def initialize(opts={},&block)
7
- opts[:ngram_size] = 1
8
-
9
- super(opts) { |model| model.build(&block) }
10
- end
11
-
12
- def ngrams_from_words(words)
13
- words.map { |word| Ngram[word] }
14
- end
15
-
16
- def ngrams_from_fragment(fragment)
17
- ngrams_from_words(parse_sentence(fragment))
18
- end
19
-
20
- def ngrams_from_sentence(sentence)
21
- ngrams_from_fragment(sentence)
22
- end
23
-
24
- def ngrams_from_text(text)
25
- parse_text(text).inject([]) do |ngrams,sentence|
26
- ngrams + ngrams_from_sentence(sentence)
27
- end
28
- end
29
-
30
- def train_with_sentence(sentence)
31
- train_with_ngrams(ngrams_from_sentence(sentence))
32
- end
33
-
34
- def train_with_text(text)
35
- train_with_ngrams(ngrams_from_text(text))
36
- end
37
-
38
- def gram_count
39
- @frequency.values.inject do |sum,count|
40
- sum + count
41
- end
42
- end
43
-
44
- def build(&block)
45
- clear_probabilities
46
-
47
- block.call(self) if block
48
-
49
- total_count = gram_count.to_f
50
- @frequency.each do |ngram,count|
51
- @probability[ngram] = count.to_f / total_count
52
- end
53
-
54
- return self
55
- end
56
-
57
- def fragment_probability(fragment)
58
- probability_of_ngrams(ngrams_from_fragment(fragment))
59
- end
60
-
61
- def sentence_probability(sentence)
62
- probability_of_ngrams(ngrams_from_sentence(sentence))
63
- end
64
-
65
- def text_probability(text)
66
- probability_of_ngrams(ngrams_from_text(text))
67
- end
68
-
69
- end
70
- end
File without changes