raingrams 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +42 -0
- data/README.txt +46 -0
- data/Rakefile +17 -0
- data/lib/raingrams/bigram_model.rb +13 -0
- data/lib/raingrams/exceptions/prefix_frequency_missing.rb +4 -0
- data/lib/raingrams/exceptions.rb +1 -0
- data/lib/raingrams/extensions/class.rb +7 -0
- data/lib/raingrams/extensions/false_class.rb +7 -0
- data/lib/raingrams/extensions/nil_class.rb +7 -0
- data/lib/raingrams/extensions/object.rb +7 -0
- data/lib/raingrams/extensions/string.rb +7 -0
- data/lib/raingrams/extensions/symbol.rb +7 -0
- data/lib/raingrams/extensions/true_class.rb +7 -0
- data/lib/raingrams/extensions.rb +7 -0
- data/lib/raingrams/hexagram_model.rb +13 -0
- data/lib/raingrams/model.rb +161 -0
- data/lib/raingrams/multigram_model.rb +165 -0
- data/lib/raingrams/ngram.rb +53 -0
- data/lib/raingrams/open_vocabulary/bigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/hexagram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/multigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/open_model.rb +34 -0
- data/lib/raingrams/open_vocabulary/pentagram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/quadgram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/trigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/unigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary.rb +7 -0
- data/lib/raingrams/pentagram_model.rb +13 -0
- data/lib/raingrams/quadgram_model.rb +13 -0
- data/lib/raingrams/raingrams.rb +31 -0
- data/lib/raingrams/tokens/start_sentence.rb +13 -0
- data/lib/raingrams/tokens/stop_sentence.rb +13 -0
- data/lib/raingrams/tokens/token.rb +19 -0
- data/lib/raingrams/tokens/unknown.rb +13 -0
- data/lib/raingrams/tokens.rb +4 -0
- data/lib/raingrams/trigram_model.rb +13 -0
- data/lib/raingrams/unigram_model.rb +70 -0
- data/lib/raingrams/version.rb +3 -0
- data/lib/raingrams.rb +10 -0
- data/test/test_raingrams.rb +0 -0
- metadata +99 -0
data/History.txt
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2007-2008 Hal Brodigan
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/Manifest.txt
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
History.txt
|
2
|
+
LICENSE.txt
|
3
|
+
Manifest.txt
|
4
|
+
README.txt
|
5
|
+
Rakefile
|
6
|
+
lib/raingrams.rb
|
7
|
+
lib/raingrams/version.rb
|
8
|
+
lib/raingrams/raingrams.rb
|
9
|
+
lib/raingrams/exceptions/prefix_frequency_missing.rb
|
10
|
+
lib/raingrams/exceptions.rb
|
11
|
+
lib/raingrams/extensions/class.rb
|
12
|
+
lib/raingrams/extensions/false_class.rb
|
13
|
+
lib/raingrams/extensions/nil_class.rb
|
14
|
+
lib/raingrams/extensions/object.rb
|
15
|
+
lib/raingrams/extensions/string.rb
|
16
|
+
lib/raingrams/extensions/symbol.rb
|
17
|
+
lib/raingrams/extensions/true_class.rb
|
18
|
+
lib/raingrams/extensions.rb
|
19
|
+
lib/raingrams/tokens/token.rb
|
20
|
+
lib/raingrams/tokens/start_sentence.rb
|
21
|
+
lib/raingrams/tokens/stop_sentence.rb
|
22
|
+
lib/raingrams/tokens/unknown.rb
|
23
|
+
lib/raingrams/tokens.rb
|
24
|
+
lib/raingrams/ngram.rb
|
25
|
+
lib/raingrams/model.rb
|
26
|
+
lib/raingrams/unigram_model.rb
|
27
|
+
lib/raingrams/multigram_model.rb
|
28
|
+
lib/raingrams/bigram_model.rb
|
29
|
+
lib/raingrams/trigram_model.rb
|
30
|
+
lib/raingrams/quadgram_model.rb
|
31
|
+
lib/raingrams/pentagram_model.rb
|
32
|
+
lib/raingrams/hexagram_model.rb
|
33
|
+
lib/raingrams/open_vocabulary/open_model.rb
|
34
|
+
lib/raingrams/open_vocabulary/unigram_model.rb
|
35
|
+
lib/raingrams/open_vocabulary/multigram_model.rb
|
36
|
+
lib/raingrams/open_vocabulary/bigram_model.rb
|
37
|
+
lib/raingrams/open_vocabulary/trigram_model.rb
|
38
|
+
lib/raingrams/open_vocabulary/quadgram_model.rb
|
39
|
+
lib/raingrams/open_vocabulary/pentagram_model.rb
|
40
|
+
lib/raingrams/open_vocabulary/hexagram_model.rb
|
41
|
+
lib/raingrams/open_vocabulary.rb
|
42
|
+
test/test_raingrams.rb
|
data/README.txt
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
Raingrams
|
2
|
+
by Postmodern Modulus III
|
3
|
+
http://rubyforge.net/projects/raingrams/
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Raingrams is a flexible and general-purpose ngrams library written in Ruby.
|
8
|
+
Raingrams supports any non-zero ngram size, text/non-text grams, multiple
|
9
|
+
parsing styles and open/closed vocabulary models.
|
10
|
+
|
11
|
+
== FEATURES/PROBLEMS:
|
12
|
+
|
13
|
+
* Supports all non-zero ngram sizes.
|
14
|
+
* Supports text and non-text grams.
|
15
|
+
* Supports Open and Closed vocabulary models.
|
16
|
+
|
17
|
+
== REQUIREMENTS:
|
18
|
+
|
19
|
+
== INSTALL:
|
20
|
+
|
21
|
+
$ sudo gem install raingrams
|
22
|
+
|
23
|
+
== LICENSE:
|
24
|
+
|
25
|
+
The MIT License
|
26
|
+
|
27
|
+
Copyright (c) 2007-2008 Hal Brodigan
|
28
|
+
|
29
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
30
|
+
a copy of this software and associated documentation files (the
|
31
|
+
'Software'), to deal in the Software without restriction, including
|
32
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
33
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
34
|
+
permit persons to whom the Software is furnished to do so, subject to
|
35
|
+
the following conditions:
|
36
|
+
|
37
|
+
The above copyright notice and this permission notice shall be
|
38
|
+
included in all copies or substantial portions of the Software.
|
39
|
+
|
40
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
41
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
42
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
43
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
44
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
45
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
46
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hoe'
|
5
|
+
require './lib/raingrams/version.rb'
|
6
|
+
|
7
|
+
# Gem specification, built by Hoe from README.txt and History.txt.
Hoe.new('raingrams', Raingrams::VERSION) do |p|
  p.rubyforge_name = 'raingrams'
  p.author = 'Postmodern Modulus III'
  p.email = 'postmodern.mod3@gmail.com'
  p.summary = 'Raingrams is a flexible and general-purpose ngrams library written in Ruby'
  p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
  # The first README paragraph is "<name>\n<byline>\n<url>". Skip both the
  # name and the byline so that the homepage is the URL line rather than
  # the "by ..." byline.
  p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[2..-1]
  p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
end
|
16
|
+
|
17
|
+
# vim: syntax=Ruby
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'raingrams/exceptions/prefix_frequency_missing.rb'
|
@@ -0,0 +1,7 @@
|
|
1
|
+
require 'raingrams/extensions/class'
|
2
|
+
require 'raingrams/extensions/nil_class'
|
3
|
+
require 'raingrams/extensions/true_class'
|
4
|
+
require 'raingrams/extensions/false_class'
|
5
|
+
require 'raingrams/extensions/symbol'
|
6
|
+
require 'raingrams/extensions/string'
|
7
|
+
require 'raingrams/extensions/object'
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require 'raingrams/ngram'
|
2
|
+
require 'raingrams/tokens/start_sentence'
|
3
|
+
require 'raingrams/tokens/stop_sentence'
|
4
|
+
require 'raingrams/exceptions/prefix_frequency_missing'
|
5
|
+
|
6
|
+
module Raingrams
  #
  # Base class of all ngram models. Holds the parsing options, the table of
  # observed ngram frequencies and the table of ngram probabilities, and
  # provides the text parsing and ngram look-up helpers shared by subclasses.
  #
  class Model

    # Size of ngrams to use
    attr_reader :ngram_size

    # Ignore case of parsed text
    attr_reader :ignore_case

    # Ignore the punctuation of parsed text
    attr_reader :ignore_punc

    # Ignore URLs
    attr_reader :ignore_urls

    # Ignore Phone numbers
    attr_reader :ignore_phone_numbers

    # Ignore References
    attr_reader :ignore_references

    # Convert Acronyms to names within parsed text
    attr_reader :convert_acronyms

    # Convert Abbreviations to names within parsed text
    attr_reader :convert_abbrev

    # Frequencies of observed ngrams
    attr_reader :frequency

    # Normalized table of observed ngrams
    attr_reader :probability

    #
    # Creates a new model from the given _opts_ Hash. Recognized keys are
    # <tt>:ngram_size</tt>, <tt>:ignore_case</tt>, <tt>:ignore_punc</tt>
    # (defaults to +true+), <tt>:ignore_urls</tt>,
    # <tt>:ignore_phone_numbers</tt>, <tt>:ignore_references</tt>,
    # <tt>:convert_acronyms</tt> and <tt>:convert_abbrev</tt>; every
    # option other than <tt>:ngram_size</tt> and <tt>:ignore_punc</tt>
    # defaults to +false+. If a _block_ is given it is called with the
    # new model.
    #
    def initialize(opts={},&block)
      @ngram_size = opts[:ngram_size]
      @ignore_case = opts[:ignore_case] || false
      # `opts[:ignore_punc] || true` could never be false; only fall back
      # to the default when the option was not supplied.
      @ignore_punc = (opts[:ignore_punc].nil? ? true : opts[:ignore_punc])
      @ignore_urls = opts[:ignore_urls] || false
      @ignore_phone_numbers = opts[:ignore_phone_numbers] || false
      @ignore_references = opts[:ignore_references] || false
      @convert_acronyms = opts[:convert_acronyms] || false
      @convert_abbrev = opts[:convert_abbrev] || false

      # Reading an unseen ngram yields a zero count/probability without
      # storing a key.
      @frequency = Hash.new { |hash,key| 0 }
      @probability = Hash.new { |hash,key| 0.0 }

      block.call(self) if block
    end

    #
    # Splits the given _sentence_ into an Array of String tokens, applying
    # the model's ignore options. The trailing sentence terminator
    # (<tt>.</tt>, <tt>?</tt> or <tt>!</tt>) is always removed.
    #
    def parse_sentence(sentence)
      # gsub returns a fresh String, so the in-place edits below never
      # touch the caller's object.
      sentence = sentence.to_s.gsub(/[\.\?!]$/,'')

      # consume the whole URL, not only the scheme and first word
      sentence.gsub!(/\s*\w+:\/\/\S*\s*/,' ') if @ignore_urls

      sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ') if @ignore_phone_numbers

      # bracketed numeric references such as "[12]"; the brackets must be
      # escaped, otherwise they form a character class matching any digit
      sentence.gsub!(/\s*\[\d+\]\s*/,' ') if @ignore_references

      sentence.downcase! if @ignore_case

      if @ignore_punc
        sentence.scan(/\w+[\.'\-\_]?\w*/)
      else
        # no capture group here: String#scan would otherwise return an
        # Array of one-element Arrays instead of an Array of Strings
        sentence.scan(/\w+|[-_,\.;'"]/)
      end
    end

    #
    # Splits the given _text_ into an Array of sentence Strings, breaking
    # on <tt>.</tt>, <tt>?</tt> and <tt>!</tt>. The _block_ argument is
    # accepted but not used.
    #
    def parse_text(text,&block)
      text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
    end

    #
    # Increments the observed frequency of the given _ngram_. Returns the
    # model.
    #
    def train_with_ngram(ngram)
      @frequency[ngram] += 1
      return self
    end

    #
    # Trains the model with each of the given _ngrams_. Returns the model.
    #
    def train_with_ngrams(ngrams=[])
      ngrams.each { |ngram| train_with_ngram(ngram) }
      return self
    end

    #
    # Returns the Array of all observed ngrams.
    #
    def ngrams
      @frequency.keys
    end

    #
    # Returns +true+ if the given _ngram_ has been observed by the model.
    #
    def has_ngram?(ngram)
      # direct hash look-up instead of building and scanning the key list
      @frequency.has_key?(ngram)
    end

    #
    # Passes every observed ngram to the given _block_.
    #
    def each_ngram(&block)
      ngrams.each(&block)
    end

    #
    # Returns the observed ngrams for which the given _block_ is true.
    #
    def ngrams_with(&block)
      ngrams.select(&block)
    end

    #
    # Returns the unique grams found in all observed ngrams.
    #
    def vocabulary
      ngrams.flatten.uniq
    end

    #
    # Returns +true+ if any observed ngram includes the given _gram_,
    # +false+ otherwise.
    #
    def within_vocabulary?(gram)
      # the original called the non-existent each_ngrams
      ngrams.any? { |ngram| ngram.include?(gram) }
    end

    #
    # Returns the observed ngrams whose first gram is the gram form of
    # _obj_.
    #
    def ngrams_starting_with(obj)
      ngrams_with { |ngram| ngram.starts_with?(obj.to_gram) }
    end

    #
    # Returns the observed ngrams whose last gram is the given _gram_.
    #
    def ngrams_ending_with(gram)
      ngrams_with { |ngram| ngram.ends_with?(gram) }
    end

    #
    # Returns the probabilities of the given _ngrams_, in order. Ngrams
    # with no stored probability yield <tt>0.0</tt>.
    #
    def probabilities_for(ngrams)
      ngrams.map { |ngram| @probability[ngram] }
    end

    #
    # Returns the probability of the given _ngram_.
    #
    def probability_of_ngram(ngram)
      @probability[ngram]
    end

    #
    # Returns the product of the probabilities of the given _ngrams_.
    # Returns +nil+ when _ngrams_ is empty.
    #
    def probability_of_ngrams(ngrams)
      probabilities_for(ngrams).inject { |joint,prob| joint * prob }
    end

    #
    # Returns the product of the probabilities of all observed ngrams
    # that start with the given _gram_.
    #
    def probability_of_gram(gram)
      probability_of_ngrams(ngrams_starting_with(gram))
    end

    #
    # Removes all observed frequencies and probabilities. Returns the
    # model.
    #
    def clear
      @frequency.clear

      clear_probabilities
      return self
    end

    protected

    #
    # Empties the probability table. Returns the model.
    #
    def clear_probabilities
      @probability.clear
      return self
    end

  end
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require 'raingrams/model'
|
2
|
+
require 'raingrams/tokens/start_sentence'
|
3
|
+
require 'raingrams/tokens/stop_sentence'
|
4
|
+
require 'raingrams/exceptions/prefix_frequency_missing'
|
5
|
+
|
6
|
+
module Raingrams
  #
  # A model of ngrams with more than one gram. In addition to the ngram
  # frequencies kept by Model, it tracks the frequency of every ngram
  # prefix (the first n-1 grams) and derives each ngram's probability as
  # its frequency divided by the frequency of its prefix.
  #
  class MultigramModel < Model

    # Frequencies of n-1 grams
    attr_reader :prefix_frequency

    #
    # Creates a new model with the given _opts_ (see Model#initialize).
    # If a _block_ is given it is passed the model before the
    # probabilities are built.
    #
    def initialize(opts={},&block)
      @prefix_frequency = Hash.new { |hash,key| 0 }

      super(opts) { |model| model.build(&block) }
    end

    #
    # Returns every run of +ngram_size+ consecutive _words_ as an Ngram.
    # Returns an empty Array when there are fewer words than +ngram_size+.
    #
    def ngrams_from_words(words)
      last_start = words.length - @ngram_size

      (0..last_start).map do |index|
        Ngram.new(words[index,@ngram_size])
      end
    end

    #
    # Returns the ngrams of the given sentence _fragment_, without
    # start/stop sentence tokens.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams of the given _sentence_, wrapped in start/stop
    # sentence tokens.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
    end

    #
    # Returns the ngrams of every sentence within the given _text_.
    #
    def ngrams_from_text(text)
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams + ngrams_from_sentence(sentence)
      end
    end

    #
    # Returns the ngrams of _words_ that the model has already observed.
    #
    def common_ngrams_from_words(words)
      ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _fragment_ that the model has already
    # observed.
    #
    def common_ngrams_from_fragment(fragment)
      # the original passed the undefined local `words` here
      ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _sentence_ that the model has already
    # observed.
    #
    def common_ngrams_from_sentence(sentence)
      ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _text_ that the model has already observed.
    #
    def common_ngrams_from_text(text)
      ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Increments the frequency of the given _ngram_ and of its prefix.
    # Returns the model.
    #
    def train_with_ngram(ngram)
      @prefix_frequency[ngram.prefix] += 1
      return super(ngram)
    end

    #
    # Trains the model with the ngrams of the given _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the ngrams of the given _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Clears the probability table, passes the model to the optional
    # _block_ (typically used for training), then recomputes the
    # probability of every observed ngram. Raises PrefixFrequencyMissing
    # if an observed ngram's prefix has no positive recorded frequency.
    # Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      @frequency.each do |ngram,count|
        prefix = ngram.prefix
        prefix_count = @prefix_frequency[prefix]

        # A missing prefix reads as the default 0, which is truthy in
        # Ruby, so the count itself must be tested; otherwise the guard
        # never fires and the division below is by zero.
        unless prefix_count && prefix_count > 0
          raise(PrefixFrequencyMissing,"the model is missing the frequency of the ngram prefix #{prefix}",caller)
        end

        @probability[ngram] = count.to_f / prefix_count.to_f
      end

      return self
    end

    #
    # Returns the observed ngrams whose prefix equals the given _prefix_.
    #
    def ngrams_prefixed_by(prefix)
      ngrams_with { |ngram| ngram.prefixed_by?(prefix) }
    end

    #
    # Returns the observed ngrams whose postfix equals the given
    # _postfix_.
    #
    def ngrams_postfixed_by(postfix)
      # the original tested prefixed_by?, duplicating ngrams_prefixed_by
      ngrams_with { |ngram| ngram.postfixed_by?(postfix) }
    end

    #
    # For every observed ngram ending with _gram_, collects the observed
    # ngrams postfixed by that ngram's prefix. Returns an Array of Arrays.
    #
    def ngrams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngrams_postfixed_by(ngram.prefix)
      end
    end

    #
    # For every observed ngram starting with _gram_, collects the
    # observed ngrams prefixed by that ngram's postfix. Returns an Array
    # of Arrays.
    #
    def ngrams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngrams_prefixed_by(ngram.postfix)
      end
    end

    #
    # Returns the second-to-last gram of every observed ngram ending with
    # _gram_.
    #
    def grams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngram[-2]
      end
    end

    #
    # Returns the second gram of every observed ngram starting with
    # _gram_.
    #
    def grams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngram[1]
      end
    end

    #
    # Returns the joint probability of the ngrams of _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the ngrams of _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the ngrams of _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the joint probability of the already observed ngrams of
    # _fragment_.
    #
    def common_fragment_probability(fragment)
      probability_of_ngrams(common_ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the already observed ngrams of
    # _sentence_.
    #
    def common_sentence_probability(sentence)
      probability_of_ngrams(common_ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the already observed ngrams of the
    # given text (passed as _fragment_).
    #
    def common_text_probability(fragment)
      # the original referenced the undefined local `text`
      probability_of_ngrams(common_ngrams_from_text(fragment))
    end

    #
    # Returns the product of this model's and the _other_ model's common
    # fragment probabilities for _fragment_.
    #
    def similar_fragment_probability(other,fragment)
      common_fragment_probability(fragment) * other.common_fragment_probability(fragment)
    end

    #
    # Returns the product of this model's and the _other_ model's common
    # sentence probabilities for _sentence_.
    #
    def similar_sentence_probability(other,sentence)
      common_sentence_probability(sentence) * other.common_sentence_probability(sentence)
    end

    #
    # Returns the product of this model's and the _other_ model's common
    # text probabilities for _text_.
    #
    def similar_text_probability(other,text)
      common_text_probability(text) * other.common_text_probability(text)
    end

    #
    # Removes all prefix frequencies, ngram frequencies and
    # probabilities. Returns the model.
    #
    def clear
      @prefix_frequency.clear
      return super
    end

    protected

    #
    # Surrounds the given _sentence_ (an Array of words) with start and
    # stop sentence tokens.
    # NOTE(review): relies on the token classes implementing <tt>*</tt>
    # to produce +ngram_size+ tokens -- defined outside this file, confirm.
    #
    def wrap_sentence(sentence)
      (Tokens::StartSentence * @ngram_size) + sentence.to_a + (Tokens::StopSentence * @ngram_size)
    end

  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Raingrams
  #
  # An ordered sequence of grams, stored as an Array. Objects are
  # converted with +to_gram+ on the way in and when compared against.
  #
  class Ngram < Array

    #
    # Creates a new Ngram holding the gram form of each of the given
    # _objs_.
    #
    def initialize(objs)
      grams = objs.map { |obj| obj.to_gram }

      super(grams)
    end

    #
    # Creates a new Ngram from the argument list.
    #
    def self.[](*objs)
      new(objs)
    end

    #
    # Returns every gram except the last.
    #
    def prefix
      self[0..-2]
    end

    #
    # Returns +true+ if the prefix equals the given _ngram_.
    #
    def prefixed_by?(ngram)
      prefix == ngram
    end

    #
    # Returns every gram except the first.
    #
    def postfix
      slice(1..-1)
    end

    #
    # Returns +true+ if the postfix equals the given _ngram_.
    #
    def postfixed_by?(ngram)
      postfix == ngram
    end

    #
    # Returns +true+ if the first gram equals the gram form of _obj_.
    #
    def starts_with?(obj)
      first == obj.to_gram
    end

    #
    # Returns +true+ if the last gram equals the gram form of _obj_.
    #
    def ends_with?(obj)
      last == obj.to_gram
    end

    #
    # Returns +true+ if the gram form of _obj_ is one of the grams.
    #
    def include?(obj)
      gram = obj.to_gram

      super(gram)
    end

    #
    # Returns a copy of the ngram; no flattening is performed.
    #
    def flatten
      dup
    end

    #
    # Returns the ngram itself, unchanged.
    #
    def flatten!
      self
    end

    #
    # Returns the grams joined by <tt>', '</tt>.
    #
    def to_s
      join(', ')
    end

  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'raingrams/tokens/unknown'
|
2
|
+
|
3
|
+
module Raingrams
  module OpenVocabulary
    #
    # Mixin adding a fixed lexicon to a model: grams outside of the
    # lexicon are replaced by the Tokens::Unknown token when training.
    # NOTE(review): presumably mixed into Model subclasses, since both
    # +initialize+ and +train_with_ngram+ call +super+ -- confirm against
    # the open vocabulary model classes.
    #
    module OpenModel

      # The fixed lexicon of this model
      attr_reader :lexicon

      #
      # Stores the <tt>:lexicon</tt> option (defaults to an empty Array)
      # and passes _opts_ and the _block_ on to the next +initialize+.
      #
      def initialize(opts={},&block)
        @lexicon = opts[:lexicon] || []

        super(opts,&block)
      end

      #
      # Returns +true+ if the given _gram_ is part of the lexicon.
      #
      def within_lexicon?(gram)
        @lexicon.include?(gram)
      end

      #
      # Trains the model with the given _ngram_ after replacing every
      # gram outside of the lexicon with Tokens::Unknown.
      #
      # This overrides +train_with_ngram+, the method the training
      # pipeline actually calls; the previous override was named
      # +train_ngram+, which nothing invoked and whose +super+ had no
      # target.
      #
      def train_with_ngram(ngram)
        grams = ngram.map do |gram|
          within_lexicon?(gram) ? gram : Tokens::Unknown
        end

        # map returns a plain Array; rebuild an Ngram so that methods
        # such as Ngram#prefix stay available further up the chain.
        # NOTE(review): Ngram is expected to be loaded by the model class
        # this module is mixed into -- confirm.
        return super(Ngram.new(grams))
      end

      #
      # Backward-compatible name for train_with_ngram.
      #
      def train_ngram(ngram)
        train_with_ngram(ngram)
      end

    end
  end
end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# Load every open vocabulary model. The files live under
# raingrams/open_vocabulary/ (with an underscore).
require 'raingrams/open_vocabulary/unigram_model'
require 'raingrams/open_vocabulary/multigram_model'
require 'raingrams/open_vocabulary/bigram_model'
require 'raingrams/open_vocabulary/trigram_model'
require 'raingrams/open_vocabulary/quadgram_model'
require 'raingrams/open_vocabulary/pentagram_model'
require 'raingrams/open_vocabulary/hexagram_model'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'raingrams/unigram_model'
|
2
|
+
require 'raingrams/multigram_model'
|
3
|
+
require 'raingrams/open_vocabulary/unigram_model'
|
4
|
+
require 'raingrams/open_vocabulary/multigram_model'
|
5
|
+
|
6
|
+
module Raingrams
  #
  # Creates a closed vocabulary model from the given _opts_ and _block_:
  # a UnigramModel when <tt>:ngram_size</tt> is 1, a MultigramModel
  # otherwise.
  #
  def self.closed_vocabulary_model(opts={},&block)
    model_class = (opts[:ngram_size]==1 ? UnigramModel : MultigramModel)

    return model_class.new(opts,&block)
  end

  #
  # Creates an open vocabulary model from the given _opts_ and _block_:
  # an OpenVocabulary::UnigramModel when <tt>:ngram_size</tt> is 1, an
  # OpenVocabulary::MultigramModel otherwise.
  #
  def self.open_vocabulary_model(opts={},&block)
    model_class = if opts[:ngram_size]==1
                    OpenVocabulary::UnigramModel
                  else
                    OpenVocabulary::MultigramModel
                  end

    return model_class.new(opts,&block)
  end

  #
  # Creates a model from the given _opts_ and _block_. An open
  # vocabulary model is built when <tt>:vocabulary</tt> is
  # <tt>:open</tt> or <tt>'open'</tt>, a closed vocabulary model
  # otherwise.
  #
  def self.model(opts={},&block)
    if [:open, 'open'].include?(opts[:vocabulary])
      return open_vocabulary_model(opts,&block)
    end

    return closed_vocabulary_model(opts,&block)
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'raingrams/model'
|
2
|
+
|
3
|
+
module Raingrams
  #
  # A model of single grams. The probability of each gram is its
  # frequency divided by the total number of observed grams.
  #
  class UnigramModel < Model

    #
    # Creates a new unigram model with the given _opts_ (see
    # Model#initialize); <tt>:ngram_size</tt> is always forced to 1. If a
    # _block_ is given it is passed the model before the probabilities
    # are built.
    #
    def initialize(opts={},&block)
      # merge into a copy so that the caller's options hash is not
      # modified
      super(opts.merge(:ngram_size => 1)) { |model| model.build(&block) }
    end

    #
    # Returns one single-gram Ngram per word of _words_.
    #
    def ngrams_from_words(words)
      words.map { |word| Ngram[word] }
    end

    #
    # Returns the ngrams of the given sentence _fragment_.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams of the given _sentence_; no start/stop tokens
    # are added.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_fragment(sentence)
    end

    #
    # Returns the ngrams of every sentence within the given _text_.
    #
    def ngrams_from_text(text)
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams + ngrams_from_sentence(sentence)
      end
    end

    #
    # Trains the model with the ngrams of the given _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the ngrams of the given _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the total number of grams observed by the model, or 0 when
    # nothing has been observed yet.
    #
    def gram_count
      # seed with 0 so that an untrained model yields 0 rather than nil
      @frequency.values.inject(0) do |sum,count|
        sum + count
      end
    end

    #
    # Clears the probability table, passes the model to the optional
    # _block_ (typically used for training), then recomputes the
    # probability of every observed ngram. Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      total_count = gram_count.to_f
      @frequency.each do |ngram,count|
        @probability[ngram] = count.to_f / total_count
      end

      return self
    end

    #
    # Returns the joint probability of the ngrams of _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the ngrams of _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the ngrams of _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

  end
end
|
data/lib/raingrams.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'raingrams/extensions'
|
2
|
+
require 'raingrams/raingrams'
|
3
|
+
require 'raingrams/ngram'
|
4
|
+
require 'raingrams/unigram_model'
|
5
|
+
require 'raingrams/bigram_model'
|
6
|
+
require 'raingrams/trigram_model'
|
7
|
+
require 'raingrams/quadgram_model'
|
8
|
+
require 'raingrams/pentagram_model'
|
9
|
+
require 'raingrams/hexagram_model'
|
10
|
+
require 'raingrams/raingrams'
|
File without changes
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.4
|
3
|
+
specification_version: 1
|
4
|
+
name: raingrams
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.0.9
|
7
|
+
date: 2008-01-09 00:00:00 -08:00
|
8
|
+
summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: postmodern.mod3@gmail.com
|
12
|
+
homepage: " by Postmodern Modulus III"
|
13
|
+
rubyforge_project: raingrams
|
14
|
+
description: "== FEATURES/PROBLEMS: * Supports all non-zero ngram sizes. * Supports text and non-text grams. * Supports Open and Closed vocabulary models. == REQUIREMENTS: == INSTALL: $ sudo gem install raingrams"
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Postmodern Modulus III
|
31
|
+
files:
|
32
|
+
- History.txt
|
33
|
+
- LICENSE.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
- Rakefile
|
37
|
+
- lib/raingrams.rb
|
38
|
+
- lib/raingrams/version.rb
|
39
|
+
- lib/raingrams/raingrams.rb
|
40
|
+
- lib/raingrams/exceptions/prefix_frequency_missing.rb
|
41
|
+
- lib/raingrams/exceptions.rb
|
42
|
+
- lib/raingrams/extensions/class.rb
|
43
|
+
- lib/raingrams/extensions/false_class.rb
|
44
|
+
- lib/raingrams/extensions/nil_class.rb
|
45
|
+
- lib/raingrams/extensions/object.rb
|
46
|
+
- lib/raingrams/extensions/string.rb
|
47
|
+
- lib/raingrams/extensions/symbol.rb
|
48
|
+
- lib/raingrams/extensions/true_class.rb
|
49
|
+
- lib/raingrams/extensions.rb
|
50
|
+
- lib/raingrams/tokens/token.rb
|
51
|
+
- lib/raingrams/tokens/start_sentence.rb
|
52
|
+
- lib/raingrams/tokens/stop_sentence.rb
|
53
|
+
- lib/raingrams/tokens/unknown.rb
|
54
|
+
- lib/raingrams/tokens.rb
|
55
|
+
- lib/raingrams/ngram.rb
|
56
|
+
- lib/raingrams/model.rb
|
57
|
+
- lib/raingrams/unigram_model.rb
|
58
|
+
- lib/raingrams/multigram_model.rb
|
59
|
+
- lib/raingrams/bigram_model.rb
|
60
|
+
- lib/raingrams/trigram_model.rb
|
61
|
+
- lib/raingrams/quadgram_model.rb
|
62
|
+
- lib/raingrams/pentagram_model.rb
|
63
|
+
- lib/raingrams/hexagram_model.rb
|
64
|
+
- lib/raingrams/open_vocabulary/open_model.rb
|
65
|
+
- lib/raingrams/open_vocabulary/unigram_model.rb
|
66
|
+
- lib/raingrams/open_vocabulary/multigram_model.rb
|
67
|
+
- lib/raingrams/open_vocabulary/bigram_model.rb
|
68
|
+
- lib/raingrams/open_vocabulary/trigram_model.rb
|
69
|
+
- lib/raingrams/open_vocabulary/quadgram_model.rb
|
70
|
+
- lib/raingrams/open_vocabulary/pentagram_model.rb
|
71
|
+
- lib/raingrams/open_vocabulary/hexagram_model.rb
|
72
|
+
- lib/raingrams/open_vocabulary.rb
|
73
|
+
- test/test_raingrams.rb
|
74
|
+
test_files:
|
75
|
+
- test/test_raingrams.rb
|
76
|
+
rdoc_options:
|
77
|
+
- --main
|
78
|
+
- README.txt
|
79
|
+
extra_rdoc_files:
|
80
|
+
- History.txt
|
81
|
+
- LICENSE.txt
|
82
|
+
- Manifest.txt
|
83
|
+
- README.txt
|
84
|
+
executables: []
|
85
|
+
|
86
|
+
extensions: []
|
87
|
+
|
88
|
+
requirements: []
|
89
|
+
|
90
|
+
dependencies:
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: hoe
|
93
|
+
version_requirement:
|
94
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 1.4.0
|
99
|
+
version:
|