raingrams 0.0.9 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +9 -0
- data/Manifest.txt +10 -10
- data/README.txt +9 -7
- data/Rakefile +3 -6
- data/TODO.txt +6 -0
- data/lib/raingrams/bigram_model.rb +3 -7
- data/lib/raingrams/extensions/object.rb +4 -1
- data/lib/raingrams/extensions/string.rb +3 -0
- data/lib/raingrams/extensions.rb +0 -5
- data/lib/raingrams/hexagram_model.rb +3 -7
- data/lib/raingrams/model.rb +622 -61
- data/lib/raingrams/ngram.rb +50 -9
- data/lib/raingrams/ngram_set.rb +43 -0
- data/lib/raingrams/open_vocabulary/model.rb +12 -0
- data/lib/raingrams/open_vocabulary/open_model.rb +8 -4
- data/lib/raingrams/open_vocabulary.rb +0 -1
- data/lib/raingrams/pentagram_model.rb +3 -7
- data/lib/raingrams/probability_table.rb +153 -0
- data/lib/raingrams/quadgram_model.rb +3 -7
- data/lib/raingrams/raingrams.rb +10 -20
- data/lib/raingrams/tokens/start_sentence.rb +2 -2
- data/lib/raingrams/tokens/stop_sentence.rb +2 -2
- data/lib/raingrams/tokens/token.rb +49 -5
- data/lib/raingrams/tokens/unknown.rb +2 -2
- data/lib/raingrams/tokens.rb +1 -0
- data/lib/raingrams/trigram_model.rb +3 -7
- data/lib/raingrams/version.rb +1 -1
- data/lib/raingrams.rb +1 -1
- data/spec/ngram_set_spec.rb +54 -0
- data/spec/ngram_spec.rb +29 -0
- data/spec/probability_table_spec.rb +94 -0
- data/spec/raingrams_spec.rb +9 -0
- data/spec/spec_helper.rb +5 -0
- data/tasks/spec.rb +7 -0
- metadata +65 -55
- data/lib/raingrams/extensions/class.rb +0 -7
- data/lib/raingrams/extensions/false_class.rb +0 -7
- data/lib/raingrams/extensions/nil_class.rb +0 -7
- data/lib/raingrams/extensions/symbol.rb +0 -7
- data/lib/raingrams/extensions/true_class.rb +0 -7
- data/lib/raingrams/multigram_model.rb +0 -165
- data/lib/raingrams/open_vocabulary/multigram_model.rb +0 -12
- data/lib/raingrams/open_vocabulary/unigram_model.rb +0 -12
- data/lib/raingrams/unigram_model.rb +0 -70
- data/test/test_raingrams.rb +0 -0
@@ -0,0 +1,94 @@
|
|
1
|
+
require 'raingrams/probability_table'
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe ProbabilityTable do
|
6
|
+
before(:all) do
|
7
|
+
@grams = [:a, :b, :a, :a, :b, :c, :d, 2, 3, :a]
|
8
|
+
|
9
|
+
@table = ProbabilityTable.new
|
10
|
+
@grams.each { |g| @table.count(g) }
|
11
|
+
end
|
12
|
+
|
13
|
+
describe "empty table" do
|
14
|
+
before(:all) do
|
15
|
+
@empty_table = ProbabilityTable.new
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should not be dirty" do
|
19
|
+
@empty_table.should_not be_dirty
|
20
|
+
end
|
21
|
+
|
22
|
+
it "should be empty" do
|
23
|
+
@empty_table.should be_empty
|
24
|
+
end
|
25
|
+
|
26
|
+
it "should not have any frequencies" do
|
27
|
+
@empty_table.frequencies.should be_empty
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should have no probabilities" do
|
31
|
+
@empty_table.probabilities.should be_empty
|
32
|
+
end
|
33
|
+
|
34
|
+
it "should have no grams" do
|
35
|
+
@empty_table.grams.should be_empty
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
describe "un-built table" do
|
40
|
+
it "should be dirty" do
|
41
|
+
@table.should be_dirty
|
42
|
+
end
|
43
|
+
|
44
|
+
it "should have the observed grams" do
|
45
|
+
(@table.grams - @grams.uniq).should be_empty
|
46
|
+
end
|
47
|
+
|
48
|
+
it "should have non-zero frequencies" do
|
49
|
+
@table.frequencies.each_value do |freq|
|
50
|
+
freq.should > 0
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should have non-zero frequencies for grams it has observed" do
|
55
|
+
@grams.uniq.each do |g|
|
56
|
+
@table.frequency_of(g).should > 0
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
it "should return a zero frequency for unknown grams" do
|
61
|
+
@table.frequency_of(:x).should == 0
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should not have any probabilities yet" do
|
65
|
+
@table.probabilities.should be_empty
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
describe "built table" do
|
70
|
+
before(:all) do
|
71
|
+
@table.build
|
72
|
+
end
|
73
|
+
|
74
|
+
it "should not be dirty" do
|
75
|
+
@table.should_not be_dirty
|
76
|
+
end
|
77
|
+
|
78
|
+
it "should return a zero probability for unknown grams" do
|
79
|
+
@table.probability_of(:x).should == 0.0
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should have non-zero probabilities" do
|
83
|
+
@table.probabilities.each_value do |prob|
|
84
|
+
prob.should > 0.0
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
it "should have non-zero probabilities for grams it has observed" do
|
89
|
+
@grams.uniq.each do |g|
|
90
|
+
@table.probability_of(g).should > 0.0
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
data/spec/spec_helper.rb
ADDED
data/tasks/spec.rb
ADDED
metadata
CHANGED
@@ -1,51 +1,54 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.4
|
3
|
-
specification_version: 1
|
4
2
|
name: raingrams
|
5
3
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.0
|
7
|
-
date: 2008-01-09 00:00:00 -08:00
|
8
|
-
summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
|
9
|
-
require_paths:
|
10
|
-
- lib
|
11
|
-
email: postmodern.mod3@gmail.com
|
12
|
-
homepage: " by Postmodern Modulus III"
|
13
|
-
rubyforge_project: raingrams
|
14
|
-
description: "== FEATURES/PROBLEMS: * Supports all non-zero ngram sizes. * Supports text and non-text grams. * Supports Open and Closed vocabulary models. == REQUIREMENTS: == INSTALL: $ sudo gem install raingrams"
|
15
|
-
autorequire:
|
16
|
-
default_executable:
|
17
|
-
bindir: bin
|
18
|
-
has_rdoc: true
|
19
|
-
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
-
requirements:
|
21
|
-
- - ">"
|
22
|
-
- !ruby/object:Gem::Version
|
23
|
-
version: 0.0.0
|
24
|
-
version:
|
4
|
+
version: 0.1.0
|
25
5
|
platform: ruby
|
26
|
-
signing_key:
|
27
|
-
cert_chain:
|
28
|
-
post_install_message:
|
29
6
|
authors:
|
30
7
|
- Postmodern Modulus III
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-10-06 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.7.0
|
24
|
+
version:
|
25
|
+
description: Raingrams is a flexible and general-purpose ngrams library written in Ruby. Raingrams supports any non-zero ngram size, text/non-text grams, multiple parsing styles and open/closed vocabulary models.
|
26
|
+
email:
|
27
|
+
- postmodern.mod3@gmail.com
|
28
|
+
executables: []
|
29
|
+
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files:
|
33
|
+
- History.txt
|
34
|
+
- LICENSE.txt
|
35
|
+
- Manifest.txt
|
36
|
+
- README.txt
|
37
|
+
- TODO.txt
|
31
38
|
files:
|
32
39
|
- History.txt
|
33
40
|
- LICENSE.txt
|
34
41
|
- Manifest.txt
|
35
42
|
- README.txt
|
43
|
+
- TODO.txt
|
36
44
|
- Rakefile
|
37
45
|
- lib/raingrams.rb
|
38
46
|
- lib/raingrams/version.rb
|
39
47
|
- lib/raingrams/raingrams.rb
|
40
48
|
- lib/raingrams/exceptions/prefix_frequency_missing.rb
|
41
49
|
- lib/raingrams/exceptions.rb
|
42
|
-
- lib/raingrams/extensions/class.rb
|
43
|
-
- lib/raingrams/extensions/false_class.rb
|
44
|
-
- lib/raingrams/extensions/nil_class.rb
|
45
50
|
- lib/raingrams/extensions/object.rb
|
46
51
|
- lib/raingrams/extensions/string.rb
|
47
|
-
- lib/raingrams/extensions/symbol.rb
|
48
|
-
- lib/raingrams/extensions/true_class.rb
|
49
52
|
- lib/raingrams/extensions.rb
|
50
53
|
- lib/raingrams/tokens/token.rb
|
51
54
|
- lib/raingrams/tokens/start_sentence.rb
|
@@ -53,47 +56,54 @@ files:
|
|
53
56
|
- lib/raingrams/tokens/unknown.rb
|
54
57
|
- lib/raingrams/tokens.rb
|
55
58
|
- lib/raingrams/ngram.rb
|
59
|
+
- lib/raingrams/ngram_set.rb
|
60
|
+
- lib/raingrams/probability_table.rb
|
56
61
|
- lib/raingrams/model.rb
|
57
|
-
- lib/raingrams/unigram_model.rb
|
58
|
-
- lib/raingrams/multigram_model.rb
|
59
62
|
- lib/raingrams/bigram_model.rb
|
60
63
|
- lib/raingrams/trigram_model.rb
|
61
64
|
- lib/raingrams/quadgram_model.rb
|
62
65
|
- lib/raingrams/pentagram_model.rb
|
63
66
|
- lib/raingrams/hexagram_model.rb
|
64
67
|
- lib/raingrams/open_vocabulary/open_model.rb
|
65
|
-
- lib/raingrams/open_vocabulary/
|
66
|
-
- lib/raingrams/open_vocabulary/multigram_model.rb
|
68
|
+
- lib/raingrams/open_vocabulary/model.rb
|
67
69
|
- lib/raingrams/open_vocabulary/bigram_model.rb
|
68
70
|
- lib/raingrams/open_vocabulary/trigram_model.rb
|
69
71
|
- lib/raingrams/open_vocabulary/quadgram_model.rb
|
70
72
|
- lib/raingrams/open_vocabulary/pentagram_model.rb
|
71
73
|
- lib/raingrams/open_vocabulary/hexagram_model.rb
|
72
74
|
- lib/raingrams/open_vocabulary.rb
|
73
|
-
-
|
74
|
-
|
75
|
-
-
|
75
|
+
- tasks/spec.rb
|
76
|
+
- spec/spec_helper.rb
|
77
|
+
- spec/ngram_spec.rb
|
78
|
+
- spec/ngram_set_spec.rb
|
79
|
+
- spec/probability_table_spec.rb
|
80
|
+
- spec/raingrams_spec.rb
|
81
|
+
has_rdoc: true
|
82
|
+
homepage: http://raingrams.rubyforge.org/
|
83
|
+
post_install_message:
|
76
84
|
rdoc_options:
|
77
85
|
- --main
|
78
86
|
- README.txt
|
79
|
-
|
80
|
-
-
|
81
|
-
|
82
|
-
|
83
|
-
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
87
|
+
require_paths:
|
88
|
+
- lib
|
89
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
90
|
+
requirements:
|
91
|
+
- - ">="
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: "0"
|
94
|
+
version:
|
95
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ">="
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: "0"
|
100
|
+
version:
|
88
101
|
requirements: []
|
89
102
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
- !ruby/object:Gem::Version
|
98
|
-
version: 1.4.0
|
99
|
-
version:
|
103
|
+
rubyforge_project: raingrams
|
104
|
+
rubygems_version: 1.3.0
|
105
|
+
signing_key:
|
106
|
+
specification_version: 2
|
107
|
+
summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
|
108
|
+
test_files: []
|
109
|
+
|
@@ -1,165 +0,0 @@
|
|
1
|
-
require 'raingrams/model'
|
2
|
-
require 'raingrams/tokens/start_sentence'
|
3
|
-
require 'raingrams/tokens/stop_sentence'
|
4
|
-
require 'raingrams/exceptions/prefix_frequency_missing'
|
5
|
-
|
6
|
-
module Raingrams
|
7
|
-
class MultigramModel < Model
|
8
|
-
|
9
|
-
# Frequencies of n-1 grams
|
10
|
-
attr_reader :prefix_frequency
|
11
|
-
|
12
|
-
def initialize(opts={},&block)
|
13
|
-
@prefix_frequency = Hash.new { |hash,key| 0 }
|
14
|
-
|
15
|
-
super(opts) { |model| model.build(&block) }
|
16
|
-
end
|
17
|
-
|
18
|
-
def ngrams_from_words(words)
|
19
|
-
return (0...(words.length-@ngram_size+1)).map do |index|
|
20
|
-
Ngram.new(words[index,@ngram_size])
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
def ngrams_from_fragment(fragment)
|
25
|
-
ngrams_from_words(parse_sentence(fragment))
|
26
|
-
end
|
27
|
-
|
28
|
-
def ngrams_from_sentence(sentence)
|
29
|
-
ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
|
30
|
-
end
|
31
|
-
|
32
|
-
def ngrams_from_text(text)
|
33
|
-
parse_text(text).inject([]) do |ngrams,sentence|
|
34
|
-
ngrams + ngrams_from_sentence(sentence)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def common_ngrams_from_words(words)
|
39
|
-
ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
|
40
|
-
end
|
41
|
-
|
42
|
-
def common_ngrams_from_fragment(fragment)
|
43
|
-
ngrams_from_fragment(words).select { |ngram| has_ngram?(ngram) }
|
44
|
-
end
|
45
|
-
|
46
|
-
def common_ngrams_from_sentence(sentence)
|
47
|
-
ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
|
48
|
-
end
|
49
|
-
|
50
|
-
def common_ngrams_from_text(text)
|
51
|
-
ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
|
52
|
-
end
|
53
|
-
|
54
|
-
def train_with_ngram(ngram)
|
55
|
-
@prefix_frequency[ngram.prefix] += 1
|
56
|
-
return super(ngram)
|
57
|
-
end
|
58
|
-
|
59
|
-
def train_with_sentence(sentence)
|
60
|
-
train_with_ngrams(ngrams_from_sentence(sentence))
|
61
|
-
end
|
62
|
-
|
63
|
-
def train_with_text(text)
|
64
|
-
train_with_ngrams(ngrams_from_text(text))
|
65
|
-
end
|
66
|
-
|
67
|
-
def build(&block)
|
68
|
-
clear_probabilities
|
69
|
-
|
70
|
-
block.call(self) if block
|
71
|
-
|
72
|
-
@frequency.each do |ngram,count|
|
73
|
-
prefix = ngram.prefix
|
74
|
-
|
75
|
-
unless @prefix_frequency[prefix]
|
76
|
-
raise(PrefixFrequencyMissing,"the model is missing the frequency of the ngram prefix #{prefix}",caller)
|
77
|
-
end
|
78
|
-
|
79
|
-
@probability[ngram] = count.to_f / @prefix_frequency[prefix].to_f
|
80
|
-
end
|
81
|
-
|
82
|
-
return self
|
83
|
-
end
|
84
|
-
|
85
|
-
def ngrams_prefixed_by(prefix)
|
86
|
-
ngrams_with { |ngram| ngram.prefixed_by?(prefix) }
|
87
|
-
end
|
88
|
-
|
89
|
-
def ngrams_postfixed_by(postfix)
|
90
|
-
ngrams_with { |ngram| ngram.prefixed_by?(postfix) }
|
91
|
-
end
|
92
|
-
|
93
|
-
def ngrams_preceeding(gram)
|
94
|
-
ngrams_ending_with(gram).map do |ngram|
|
95
|
-
ngrams_postfixed_by(ngram.prefix)
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
def ngrams_following(gram)
|
100
|
-
ngrams_starting_with(gram).map do |ngram|
|
101
|
-
ngrams_prefixed_by(ngram.postfix)
|
102
|
-
end
|
103
|
-
end
|
104
|
-
|
105
|
-
def grams_preceeding(gram)
|
106
|
-
ngrams_ending_with(gram).map do |ngram|
|
107
|
-
ngram[-2]
|
108
|
-
end
|
109
|
-
end
|
110
|
-
|
111
|
-
def grams_following(gram)
|
112
|
-
ngrams_starting_with(gram).map do |ngram|
|
113
|
-
ngram[1]
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
def fragment_probability(fragment)
|
118
|
-
probability_of_ngrams(ngrams_from_fragment(fragment))
|
119
|
-
end
|
120
|
-
|
121
|
-
def sentence_probability(sentence)
|
122
|
-
probability_of_ngrams(ngrams_from_sentence(sentence))
|
123
|
-
end
|
124
|
-
|
125
|
-
def text_probability(text)
|
126
|
-
probability_of_ngrams(ngrams_from_text(text))
|
127
|
-
end
|
128
|
-
|
129
|
-
def common_fragment_probability(fragment)
|
130
|
-
probability_of_ngrams(common_ngrams_from_fragment(fragment))
|
131
|
-
end
|
132
|
-
|
133
|
-
def common_sentence_probability(sentence)
|
134
|
-
probability_of_ngrams(common_ngrams_from_sentence(sentence))
|
135
|
-
end
|
136
|
-
|
137
|
-
def common_text_probability(fragment)
|
138
|
-
probability_of_ngrams(common_ngrams_from_text(text))
|
139
|
-
end
|
140
|
-
|
141
|
-
def similar_fragment_probability(other,fragment)
|
142
|
-
common_fragment_probability(fragment) * other.common_fragment_probability(fragment)
|
143
|
-
end
|
144
|
-
|
145
|
-
def similar_sentence_probability(other,sentence)
|
146
|
-
common_sentence_probability(sentence) * other.common_sentence_probability(sentence)
|
147
|
-
end
|
148
|
-
|
149
|
-
def similar_text_probability(other,text)
|
150
|
-
common_text_probability(text) * other.common_text_probability(text)
|
151
|
-
end
|
152
|
-
|
153
|
-
def clear
|
154
|
-
@prefix_frequency.clear
|
155
|
-
return super
|
156
|
-
end
|
157
|
-
|
158
|
-
protected
|
159
|
-
|
160
|
-
def wrap_sentence(sentence)
|
161
|
-
(Tokens::StartSentence * @ngram_size) + sentence.to_a + (Tokens::StopSentence * @ngram_size)
|
162
|
-
end
|
163
|
-
|
164
|
-
end
|
165
|
-
end
|
@@ -1,70 +0,0 @@
|
|
1
|
-
require 'raingrams/model'
|
2
|
-
|
3
|
-
module Raingrams
|
4
|
-
class UnigramModel < Model
|
5
|
-
|
6
|
-
def initialize(opts={},&block)
|
7
|
-
opts[:ngram_size] = 1
|
8
|
-
|
9
|
-
super(opts) { |model| model.build(&block) }
|
10
|
-
end
|
11
|
-
|
12
|
-
def ngrams_from_words(words)
|
13
|
-
words.map { |word| Ngram[word] }
|
14
|
-
end
|
15
|
-
|
16
|
-
def ngrams_from_fragment(fragment)
|
17
|
-
ngrams_from_words(parse_sentence(fragment))
|
18
|
-
end
|
19
|
-
|
20
|
-
def ngrams_from_sentence(sentence)
|
21
|
-
ngrams_from_fragment(sentence)
|
22
|
-
end
|
23
|
-
|
24
|
-
def ngrams_from_text(text)
|
25
|
-
parse_text(text).inject([]) do |ngrams,sentence|
|
26
|
-
ngrams + ngrams_from_sentence(sentence)
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
def train_with_sentence(sentence)
|
31
|
-
train_with_ngrams(ngrams_from_sentence(sentence))
|
32
|
-
end
|
33
|
-
|
34
|
-
def train_with_text(text)
|
35
|
-
train_with_ngrams(ngrams_from_text(text))
|
36
|
-
end
|
37
|
-
|
38
|
-
def gram_count
|
39
|
-
@frequency.values.inject do |sum,count|
|
40
|
-
sum + count
|
41
|
-
end
|
42
|
-
end
|
43
|
-
|
44
|
-
def build(&block)
|
45
|
-
clear_probabilities
|
46
|
-
|
47
|
-
block.call(self) if block
|
48
|
-
|
49
|
-
total_count = gram_count.to_f
|
50
|
-
@frequency.each do |ngram,count|
|
51
|
-
@probability[ngram] = count.to_f / total_count
|
52
|
-
end
|
53
|
-
|
54
|
-
return self
|
55
|
-
end
|
56
|
-
|
57
|
-
def fragment_probability(fragment)
|
58
|
-
probability_of_ngrams(ngrams_from_fragment(fragment))
|
59
|
-
end
|
60
|
-
|
61
|
-
def sentence_probability(sentence)
|
62
|
-
probability_of_ngrams(ngrams_from_sentence(sentence))
|
63
|
-
end
|
64
|
-
|
65
|
-
def text_probability(text)
|
66
|
-
probability_of_ngrams(ngrams_from_text(text))
|
67
|
-
end
|
68
|
-
|
69
|
-
end
|
70
|
-
end
|
data/test/test_raingrams.rb
DELETED
File without changes
|