raingrams 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/LICENSE.txt +21 -0
- data/Manifest.txt +42 -0
- data/README.txt +46 -0
- data/Rakefile +17 -0
- data/lib/raingrams/bigram_model.rb +13 -0
- data/lib/raingrams/exceptions/prefix_frequency_missing.rb +4 -0
- data/lib/raingrams/exceptions.rb +1 -0
- data/lib/raingrams/extensions/class.rb +7 -0
- data/lib/raingrams/extensions/false_class.rb +7 -0
- data/lib/raingrams/extensions/nil_class.rb +7 -0
- data/lib/raingrams/extensions/object.rb +7 -0
- data/lib/raingrams/extensions/string.rb +7 -0
- data/lib/raingrams/extensions/symbol.rb +7 -0
- data/lib/raingrams/extensions/true_class.rb +7 -0
- data/lib/raingrams/extensions.rb +7 -0
- data/lib/raingrams/hexagram_model.rb +13 -0
- data/lib/raingrams/model.rb +161 -0
- data/lib/raingrams/multigram_model.rb +165 -0
- data/lib/raingrams/ngram.rb +53 -0
- data/lib/raingrams/open_vocabulary/bigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/hexagram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/multigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/open_model.rb +34 -0
- data/lib/raingrams/open_vocabulary/pentagram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/quadgram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/trigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary/unigram_model.rb +12 -0
- data/lib/raingrams/open_vocabulary.rb +7 -0
- data/lib/raingrams/pentagram_model.rb +13 -0
- data/lib/raingrams/quadgram_model.rb +13 -0
- data/lib/raingrams/raingrams.rb +31 -0
- data/lib/raingrams/tokens/start_sentence.rb +13 -0
- data/lib/raingrams/tokens/stop_sentence.rb +13 -0
- data/lib/raingrams/tokens/token.rb +19 -0
- data/lib/raingrams/tokens/unknown.rb +13 -0
- data/lib/raingrams/tokens.rb +4 -0
- data/lib/raingrams/trigram_model.rb +13 -0
- data/lib/raingrams/unigram_model.rb +70 -0
- data/lib/raingrams/version.rb +3 -0
- data/lib/raingrams.rb +10 -0
- data/test/test_raingrams.rb +0 -0
- metadata +99 -0
data/History.txt
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2007-2008 Hal Brodigan
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/Manifest.txt
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
History.txt
|
2
|
+
LICENSE.txt
|
3
|
+
Manifest.txt
|
4
|
+
README.txt
|
5
|
+
Rakefile
|
6
|
+
lib/raingrams.rb
|
7
|
+
lib/raingrams/version.rb
|
8
|
+
lib/raingrams/raingrams.rb
|
9
|
+
lib/raingrams/exceptions/prefix_frequency_missing.rb
|
10
|
+
lib/raingrams/exceptions.rb
|
11
|
+
lib/raingrams/extensions/class.rb
|
12
|
+
lib/raingrams/extensions/false_class.rb
|
13
|
+
lib/raingrams/extensions/nil_class.rb
|
14
|
+
lib/raingrams/extensions/object.rb
|
15
|
+
lib/raingrams/extensions/string.rb
|
16
|
+
lib/raingrams/extensions/symbol.rb
|
17
|
+
lib/raingrams/extensions/true_class.rb
|
18
|
+
lib/raingrams/extensions.rb
|
19
|
+
lib/raingrams/tokens/token.rb
|
20
|
+
lib/raingrams/tokens/start_sentence.rb
|
21
|
+
lib/raingrams/tokens/stop_sentence.rb
|
22
|
+
lib/raingrams/tokens/unknown.rb
|
23
|
+
lib/raingrams/tokens.rb
|
24
|
+
lib/raingrams/ngram.rb
|
25
|
+
lib/raingrams/model.rb
|
26
|
+
lib/raingrams/unigram_model.rb
|
27
|
+
lib/raingrams/multigram_model.rb
|
28
|
+
lib/raingrams/bigram_model.rb
|
29
|
+
lib/raingrams/trigram_model.rb
|
30
|
+
lib/raingrams/quadgram_model.rb
|
31
|
+
lib/raingrams/pentagram_model.rb
|
32
|
+
lib/raingrams/hexagram_model.rb
|
33
|
+
lib/raingrams/open_vocabulary/open_model.rb
|
34
|
+
lib/raingrams/open_vocabulary/unigram_model.rb
|
35
|
+
lib/raingrams/open_vocabulary/multigram_model.rb
|
36
|
+
lib/raingrams/open_vocabulary/bigram_model.rb
|
37
|
+
lib/raingrams/open_vocabulary/trigram_model.rb
|
38
|
+
lib/raingrams/open_vocabulary/quadgram_model.rb
|
39
|
+
lib/raingrams/open_vocabulary/pentagram_model.rb
|
40
|
+
lib/raingrams/open_vocabulary/hexagram_model.rb
|
41
|
+
lib/raingrams/open_vocabulary.rb
|
42
|
+
test/test_raingrams.rb
|
data/README.txt
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
Raingrams
|
2
|
+
by Postmodern Modulus III
|
3
|
+
http://rubyforge.org/projects/raingrams/
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Raingrams is a flexible and general-purpose ngrams library written in Ruby.
|
8
|
+
Raingrams supports any non-zero ngram size, text/non-text grams, multiple
|
9
|
+
parsing styles and open/closed vocabulary models.
|
10
|
+
|
11
|
+
== FEATURES/PROBLEMS:
|
12
|
+
|
13
|
+
* Supports all non-zero ngram sizes.
|
14
|
+
* Supports text and non-text grams.
|
15
|
+
* Supports Open and Closed vocabulary models.
|
16
|
+
|
17
|
+
== REQUIREMENTS:
|
18
|
+
|
19
|
+
== INSTALL:
|
20
|
+
|
21
|
+
$ sudo gem install raingrams
|
22
|
+
|
23
|
+
== LICENSE:
|
24
|
+
|
25
|
+
The MIT License
|
26
|
+
|
27
|
+
Copyright (c) 2007-2008 Hal Brodigan
|
28
|
+
|
29
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
30
|
+
a copy of this software and associated documentation files (the
|
31
|
+
'Software'), to deal in the Software without restriction, including
|
32
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
33
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
34
|
+
permit persons to whom the Software is furnished to do so, subject to
|
35
|
+
the following conditions:
|
36
|
+
|
37
|
+
The above copyright notice and this permission notice shall be
|
38
|
+
included in all copies or substantial portions of the Software.
|
39
|
+
|
40
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
41
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
42
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
43
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
44
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
45
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
46
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hoe'
|
5
|
+
require './lib/raingrams/version.rb'
|
6
|
+
|
7
|
+
# Gem specification for raingrams, driven by Hoe.
Hoe.new('raingrams', Raingrams::VERSION) do |p|
  p.rubyforge_name = 'raingrams'
  p.author = 'Postmodern Modulus III'
  p.email = 'postmodern.mod3@gmail.com'
  p.summary = 'Raingrams is a flexible and general-purpose ngrams library written in Ruby'
  p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")

  # The first README paragraph has three lines: the project name, the
  # "by <author>" line and the project URL. Start at index 2 so the author
  # line is not published as the homepage, and strip the surrounding
  # whitespace from the URL line(s).
  p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[2..-1].map { |line| line.strip }

  p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
end
|
16
|
+
|
17
|
+
# vim: syntax=Ruby
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'raingrams/exceptions/prefix_frequency_missing.rb'
|
@@ -0,0 +1,7 @@
|
|
1
|
+
require 'raingrams/extensions/class'
|
2
|
+
require 'raingrams/extensions/nil_class'
|
3
|
+
require 'raingrams/extensions/true_class'
|
4
|
+
require 'raingrams/extensions/false_class'
|
5
|
+
require 'raingrams/extensions/symbol'
|
6
|
+
require 'raingrams/extensions/string'
|
7
|
+
require 'raingrams/extensions/object'
|
@@ -0,0 +1,161 @@
|
|
1
|
+
require 'raingrams/ngram'
|
2
|
+
require 'raingrams/tokens/start_sentence'
|
3
|
+
require 'raingrams/tokens/stop_sentence'
|
4
|
+
require 'raingrams/exceptions/prefix_frequency_missing'
|
5
|
+
|
6
|
+
module Raingrams
  #
  # Base class of the ngram models. Holds the table of observed ngram
  # frequencies and the probability table (which subclasses fill in from
  # their +build+ method), plus the text parsing helpers shared by the
  # concrete models.
  #
  class Model

    # Size of ngrams to use
    attr_reader :ngram_size

    # Ignore case of parsed text
    attr_reader :ignore_case

    # Ignore the punctuation of parsed text
    attr_reader :ignore_punc

    # Ignore URLs
    attr_reader :ignore_urls

    # Ignore Phone numbers
    attr_reader :ignore_phone_numbers

    # Ignore References
    attr_reader :ignore_references

    # Convert Acronyms to names within parsed text
    attr_reader :convert_acronyms

    # Convert Abbreviations to names within parsed text
    attr_reader :convert_abbrev

    # Frequencies of observed ngrams
    attr_reader :frequency

    # Normalized table of observed ngrams
    attr_reader :probability

    #
    # Creates a new model from the given _opts_ (<tt>:ngram_size</tt>,
    # <tt>:ignore_case</tt>, <tt>:ignore_punc</tt>, <tt>:ignore_urls</tt>,
    # <tt>:ignore_phone_numbers</tt>, <tt>:ignore_references</tt>,
    # <tt>:convert_acronyms</tt>, <tt>:convert_abbrev</tt>). Every flag
    # defaults to +false+ except <tt>:ignore_punc</tt>, which defaults to
    # +true+. If a _block_ is given it is called with the new model.
    #
    def initialize(opts={},&block)
      @ngram_size = opts[:ngram_size]
      @ignore_case = opts[:ignore_case] || false

      # `opts[:ignore_punc] || true` could never be false; only fall back
      # to the default when the option was not supplied at all.
      @ignore_punc = opts[:ignore_punc].nil? ? true : opts[:ignore_punc]

      @ignore_urls = opts[:ignore_urls] || false
      @ignore_phone_numbers = opts[:ignore_phone_numbers] || false
      @ignore_references = opts[:ignore_references] || false
      @convert_acronyms = opts[:convert_acronyms] || false
      @convert_abbrev = opts[:convert_abbrev] || false

      # Both tables answer a default for unknown ngrams without storing it.
      @frequency = Hash.new { |hash,key| 0 }
      @probability = Hash.new { |hash,key| 0.0 }

      block.call(self) if block
    end

    #
    # Splits the _sentence_ into an Array of word Strings, applying the
    # ignore_* options of the model. When punctuation is not ignored the
    # punctuation marks <tt>- _ , . ; ' "</tt> are returned as tokens too.
    #
    def parse_sentence(sentence)
      # work on a copy, with the sentence terminator removed
      sentence = sentence.to_s.gsub(/[\.\?!]$/,'')

      if @ignore_urls
        # consume the whole URL, not just the word characters after "://"
        sentence.gsub!(/\s*\w+:\/\/\S*\s*/,' ')
      end

      if @ignore_phone_numbers
        sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
      end

      if @ignore_references
        # bracketed numeric references such as "[12]"; the brackets must be
        # escaped, otherwise they form a character class matching any digit
        sentence.gsub!(/\s*\[\d+\]\s*/,' ')
      end

      if @ignore_case
        sentence.downcase!
      end

      if @ignore_punc
        return sentence.scan(/\w+[\.'\-\_]?\w*/)
      else
        # non-capturing group, so that scan returns a flat Array of Strings
        return sentence.scan(/(?:\w+|[-_,\.;'"])/)
      end
    end

    #
    # Splits the _text_ into an Array of sentence Strings. A sentence starts
    # at the first character that is neither whitespace nor a terminator and
    # runs up to the next <tt>.</tt>, <tt>?</tt> or <tt>!</tt>. The _block_
    # is accepted but not used.
    #
    def parse_text(text,&block)
      text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
    end

    #
    # Increments the frequency of the _ngram_. Returns the model.
    #
    def train_with_ngram(ngram)
      @frequency[ngram] += 1
      return self
    end

    #
    # Trains the model with each of the _ngrams_. Returns the model.
    #
    def train_with_ngrams(ngrams=[])
      ngrams.each { |ngram| train_with_ngram(ngram) }
      return self
    end

    #
    # Returns the Array of all observed ngrams.
    #
    def ngrams
      @frequency.keys
    end

    #
    # Returns +true+ if the _ngram_ has been observed by the model.
    #
    def has_ngram?(ngram)
      ngrams.include?(ngram)
    end

    #
    # Passes each observed ngram to the _block_.
    #
    def each_ngram(&block)
      ngrams.each(&block)
    end

    #
    # Returns the observed ngrams for which the _block_ returns true.
    #
    def ngrams_with(&block)
      ngrams.select(&block)
    end

    #
    # Returns the unique grams found within the observed ngrams.
    #
    def vocabulary
      ngrams.flatten.uniq
    end

    #
    # Returns +true+ if any observed ngram includes the _gram_,
    # +false+ otherwise.
    #
    def within_vocabulary?(gram)
      ngrams.any? { |ngram| ngram.include?(gram) }
    end

    #
    # Returns the observed ngrams whose first gram is _obj_.
    #
    def ngrams_starting_with(obj)
      ngrams_with { |ngram| ngram.starts_with?(obj.to_gram) }
    end

    #
    # Returns the observed ngrams whose last gram is _gram_.
    #
    def ngrams_ending_with(gram)
      ngrams_with { |ngram| ngram.ends_with?(gram) }
    end

    #
    # Returns the probabilities of the given _ngrams_, in order.
    #
    def probabilities_for(ngrams)
      ngrams.map { |ngram| @probability[ngram] }
    end

    #
    # Returns the probability of the _ngram_ (0.0 when it is unknown).
    #
    def probability_of_ngram(ngram)
      @probability[ngram]
    end

    #
    # Returns the product of the probabilities of the _ngrams_.
    # Returns +nil+ when _ngrams_ is empty.
    #
    def probability_of_ngrams(ngrams)
      probabilities_for(ngrams).inject { |joint,prob| joint * prob }
    end

    #
    # Returns the joint probability of all ngrams starting with the _gram_.
    #
    def probability_of_gram(gram)
      probability_of_ngrams(ngrams_starting_with(gram))
    end

    #
    # Clears the frequency and probability tables. Returns the model.
    #
    def clear
      @frequency.clear

      clear_probabilities
      return self
    end

    protected

    #
    # Clears the probability table only. Returns the model.
    #
    def clear_probabilities
      @probability.clear
      return self
    end

  end
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
require 'raingrams/model'
|
2
|
+
require 'raingrams/tokens/start_sentence'
|
3
|
+
require 'raingrams/tokens/stop_sentence'
|
4
|
+
require 'raingrams/exceptions/prefix_frequency_missing'
|
5
|
+
|
6
|
+
module Raingrams
  #
  # An ngram model for ngrams of more than one gram. Besides the ngram
  # frequencies it tracks the frequency of every ngram prefix (the first
  # n-1 grams), and computes each ngram probability as
  # <tt>frequency(ngram) / frequency(prefix)</tt>.
  #
  class MultigramModel < Model

    # Frequencies of n-1 grams
    attr_reader :prefix_frequency

    #
    # Creates a new model from the given _opts_. If a _block_ is given it
    # is passed the model (for training) before the probabilities are built.
    #
    def initialize(opts={},&block)
      @prefix_frequency = Hash.new { |hash,key| 0 }

      super(opts) { |model| model.build(&block) }
    end

    #
    # Returns every run of +ngram_size+ consecutive _words_ as an Ngram.
    # Returns an empty Array when there are fewer words than +ngram_size+.
    #
    def ngrams_from_words(words)
      last_start = words.length - @ngram_size

      return (0..last_start).map { |index| Ngram.new(words[index,@ngram_size]) }
    end

    #
    # Returns the ngrams of the _fragment_, without sentence padding.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams of the _sentence_, padded with the start and stop
    # sentence tokens.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
    end

    #
    # Returns the ngrams of every sentence within the _text_.
    #
    def ngrams_from_text(text)
      # concat appends in place instead of copying the accumulator each time
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams.concat(ngrams_from_sentence(sentence))
      end
    end

    #
    # Returns the ngrams of the _words_ which the model has observed.
    #
    def common_ngrams_from_words(words)
      ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of the _fragment_ which the model has observed.
    #
    def common_ngrams_from_fragment(fragment)
      ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of the _sentence_ which the model has observed.
    #
    def common_ngrams_from_sentence(sentence)
      ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of the _text_ which the model has observed.
    #
    def common_ngrams_from_text(text)
      ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Increments the frequency of the _ngram_ and of its prefix.
    # Returns the model.
    #
    def train_with_ngram(ngram)
      @prefix_frequency[ngram.prefix] += 1
      return super(ngram)
    end

    #
    # Trains the model with the ngrams of the _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the ngrams of the _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Clears the probability table, passes the model to the optional
    # _block_ (for training) and then recomputes the probability of every
    # observed ngram. Raises PrefixFrequencyMissing if an observed ngram
    # has no recorded prefix frequency. Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      @frequency.each do |ngram,count|
        prefix = ngram.prefix
        prefix_count = @prefix_frequency[prefix]

        # The table answers 0 for unknown prefixes and 0 is truthy in Ruby,
        # so the count itself must be tested; otherwise the division below
        # would silently yield Infinity.
        unless prefix_count && prefix_count > 0
          raise(PrefixFrequencyMissing,"the model is missing the frequency of the ngram prefix #{prefix}",caller)
        end

        @probability[ngram] = count.to_f / prefix_count.to_f
      end

      return self
    end

    #
    # Returns the observed ngrams whose prefix is _prefix_.
    #
    def ngrams_prefixed_by(prefix)
      ngrams_with { |ngram| ngram.prefixed_by?(prefix) }
    end

    #
    # Returns the observed ngrams whose postfix is _postfix_.
    #
    def ngrams_postfixed_by(postfix)
      ngrams_with { |ngram| ngram.postfixed_by?(postfix) }
    end

    #
    # For each observed ngram ending with the _gram_, returns the Array of
    # observed ngrams that are postfixed by its prefix.
    #
    def ngrams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngrams_postfixed_by(ngram.prefix)
      end
    end

    #
    # For each observed ngram starting with the _gram_, returns the Array
    # of observed ngrams that are prefixed by its postfix.
    #
    def ngrams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngrams_prefixed_by(ngram.postfix)
      end
    end

    #
    # Returns the second to last gram of every observed ngram ending with
    # the _gram_.
    #
    def grams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngram[-2]
      end
    end

    #
    # Returns the second gram of every observed ngram starting with the
    # _gram_.
    #
    def grams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngram[1]
      end
    end

    #
    # Returns the joint probability of the ngrams of the _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the ngrams of the _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the ngrams of the _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the joint probability of the observed ngrams of the
    # _fragment_.
    #
    def common_fragment_probability(fragment)
      probability_of_ngrams(common_ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the observed ngrams of the
    # _sentence_.
    #
    def common_sentence_probability(sentence)
      probability_of_ngrams(common_ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the observed ngrams of the _text_.
    #
    def common_text_probability(text)
      probability_of_ngrams(common_ngrams_from_text(text))
    end

    #
    # Returns the product of the common probability of the _fragment_ in
    # this model and in the _other_ model.
    #
    def similar_fragment_probability(other,fragment)
      common_fragment_probability(fragment) * other.common_fragment_probability(fragment)
    end

    #
    # Returns the product of the common probability of the _sentence_ in
    # this model and in the _other_ model.
    #
    def similar_sentence_probability(other,sentence)
      common_sentence_probability(sentence) * other.common_sentence_probability(sentence)
    end

    #
    # Returns the product of the common probability of the _text_ in this
    # model and in the _other_ model.
    #
    def similar_text_probability(other,text)
      common_text_probability(text) * other.common_text_probability(text)
    end

    #
    # Clears the prefix frequencies along with the rest of the model.
    # Returns the model.
    #
    def clear
      @prefix_frequency.clear
      return super
    end

    protected

    #
    # Pads the parsed _sentence_ with +ngram_size+ start tokens and
    # +ngram_size+ stop tokens.
    #
    # NOTE(review): relies on <tt>Tokens::StartSentence * n</tt> and
    # <tt>Tokens::StopSentence * n</tt> returning Arrays; those tokens are
    # defined elsewhere - confirm.
    #
    def wrap_sentence(sentence)
      (Tokens::StartSentence * @ngram_size) + sentence.to_a + (Tokens::StopSentence * @ngram_size)
    end

  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Raingrams
  #
  # An Array of grams. Every object stored in an Ngram is first converted
  # with its +to_gram+ method.
  #
  class Ngram < Array

    #
    # Creates a new Ngram from the given _objs_, converting each of them
    # with +to_gram+.
    #
    def initialize(objs)
      grams = objs.map { |obj| obj.to_gram }

      super(grams)
    end

    #
    # Creates a new Ngram from the argument list.
    #
    def self.[](*objs)
      new(objs)
    end

    #
    # Returns everything but the last gram.
    #
    def prefix
      self[0..-2]
    end

    #
    # Returns +true+ if the prefix equals the given _ngram_.
    #
    def prefixed_by?(ngram)
      leading = prefix

      leading == ngram
    end

    #
    # Returns everything but the first gram.
    #
    def postfix
      self[1..-1]
    end

    #
    # Returns +true+ if the postfix equals the given _ngram_.
    #
    def postfixed_by?(ngram)
      trailing = postfix

      trailing == ngram
    end

    #
    # Returns +true+ if the first gram equals the gram of _obj_.
    #
    def starts_with?(obj)
      first == obj.to_gram
    end

    #
    # Returns +true+ if the last gram equals the gram of _obj_.
    #
    def ends_with?(obj)
      last == obj.to_gram
    end

    #
    # Returns +true+ if the Ngram contains the gram of _obj_.
    #
    def include?(obj)
      gram = obj.to_gram

      super(gram)
    end

    #
    # An Ngram is never flattened; returns a copy of itself.
    #
    def flatten
      dup
    end

    #
    # An Ngram is never flattened; returns itself.
    #
    def flatten!
      self
    end

    #
    # Returns the grams joined by <tt>', '</tt>.
    #
    def to_s
      join(', ')
    end

  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'raingrams/tokens/unknown'
|
2
|
+
|
3
|
+
module Raingrams
  module OpenVocabulary
    #
    # Mixin which turns a model into an open vocabulary model: every gram
    # that is not part of the fixed lexicon is trained as Tokens::Unknown.
    #
    module OpenModel

      # The fixed lexicon of this model
      attr_reader :lexicon

      #
      # Reads the <tt>:lexicon</tt> option (defaults to an empty Array) and
      # passes the _opts_ and the _block_ on to the model being extended.
      #
      def initialize(opts={},&block)
        @lexicon = opts[:lexicon] || []

        super(opts,&block)
      end

      #
      # Returns +true+ if the _gram_ is part of the lexicon.
      #
      def within_lexicon?(gram)
        @lexicon.include?(gram)
      end

      #
      # Trains the model with the _ngram_, after replacing every gram that
      # is outside of the lexicon with Tokens::Unknown.
      #
      # The models train through +train_with_ngram+, so this is the method
      # that has to be overridden for the substitution to take effect. The
      # mapped grams are wrapped in an Ngram again, since the multigram
      # model asks the ngram for its prefix.
      #
      def train_with_ngram(ngram)
        grams = ngram.map do |gram|
          if within_lexicon?(gram)
            gram
          else
            Tokens::Unknown
          end
        end

        return super(Ngram.new(grams))
      end

      #
      # Former name of +train_with_ngram+, kept for backwards compatibility.
      #
      def train_ngram(ngram)
        train_with_ngram(ngram)
      end

    end
  end
end
|
@@ -0,0 +1,7 @@
|
|
1
|
+
# Loads every open vocabulary model. The files live in the
# raingrams/open_vocabulary/ directory (with an underscore).
require 'raingrams/open_vocabulary/unigram_model'
require 'raingrams/open_vocabulary/multigram_model'
require 'raingrams/open_vocabulary/bigram_model'
require 'raingrams/open_vocabulary/trigram_model'
require 'raingrams/open_vocabulary/quadgram_model'
require 'raingrams/open_vocabulary/pentagram_model'
require 'raingrams/open_vocabulary/hexagram_model'
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'raingrams/unigram_model'
|
2
|
+
require 'raingrams/multigram_model'
|
3
|
+
require 'raingrams/open_vocabulary/unigram_model'
|
4
|
+
require 'raingrams/open_vocabulary/multigram_model'
|
5
|
+
|
6
|
+
module Raingrams
  #
  # Creates a closed vocabulary model from the given _opts_: a
  # UnigramModel when <tt>:ngram_size</tt> is 1, a MultigramModel otherwise.
  #
  def self.closed_vocabulary_model(opts={},&block)
    model_class = (opts[:ngram_size] == 1) ? UnigramModel : MultigramModel

    model_class.new(opts,&block)
  end

  #
  # Creates an open vocabulary model from the given _opts_: an
  # OpenVocabulary::UnigramModel when <tt>:ngram_size</tt> is 1, an
  # OpenVocabulary::MultigramModel otherwise.
  #
  def self.open_vocabulary_model(opts={},&block)
    model_class = if opts[:ngram_size] == 1
                    OpenVocabulary::UnigramModel
                  else
                    OpenVocabulary::MultigramModel
                  end

    model_class.new(opts,&block)
  end

  #
  # Creates a model from the given _opts_. The <tt>:vocabulary</tt> option
  # selects an open vocabulary model when it is <tt>:open</tt> or
  # <tt>'open'</tt>; any other value selects a closed vocabulary model.
  #
  def self.model(opts={},&block)
    if [:open, 'open'].include?(opts[:vocabulary])
      return open_vocabulary_model(opts,&block)
    end

    closed_vocabulary_model(opts,&block)
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'raingrams/model'
|
2
|
+
|
3
|
+
module Raingrams
  #
  # An ngram model for single grams. The probability of a gram is its
  # frequency divided by the total number of grams observed.
  #
  class UnigramModel < Model

    #
    # Creates a new model from the given _opts_; the <tt>:ngram_size</tt>
    # is always 1. If a _block_ is given it is passed the model (for
    # training) before the probabilities are built.
    #
    def initialize(opts={},&block)
      # merge into a copy, so that the caller's Hash is left untouched
      super(opts.merge(:ngram_size => 1)) { |model| model.build(&block) }
    end

    #
    # Returns an Ngram for each of the _words_.
    #
    def ngrams_from_words(words)
      words.map { |word| Ngram[word] }
    end

    #
    # Returns the ngrams of the _fragment_.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams of the _sentence_. Unigram sentences are not
    # padded, so this is the same as ngrams_from_fragment.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_fragment(sentence)
    end

    #
    # Returns the ngrams of every sentence within the _text_.
    #
    def ngrams_from_text(text)
      # concat appends in place instead of copying the accumulator each time
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams.concat(ngrams_from_sentence(sentence))
      end
    end

    #
    # Trains the model with the ngrams of the _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the ngrams of the _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the total number of grams observed; 0 for an untrained model.
    #
    def gram_count
      # seed the sum, otherwise inject returns nil for an empty table
      @frequency.values.inject(0) do |sum,count|
        sum + count
      end
    end

    #
    # Clears the probability table, passes the model to the optional
    # _block_ (for training) and then recomputes the probability of every
    # observed ngram. Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      total_count = gram_count.to_f
      @frequency.each do |ngram,count|
        @probability[ngram] = count.to_f / total_count
      end

      return self
    end

    #
    # Returns the joint probability of the ngrams of the _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the ngrams of the _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the ngrams of the _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

  end
end
|
data/lib/raingrams.rb
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'raingrams/extensions'
|
2
|
+
require 'raingrams/raingrams'
|
3
|
+
require 'raingrams/ngram'
|
4
|
+
require 'raingrams/unigram_model'
|
5
|
+
require 'raingrams/bigram_model'
|
6
|
+
require 'raingrams/trigram_model'
|
7
|
+
require 'raingrams/quadgram_model'
|
8
|
+
require 'raingrams/pentagram_model'
|
9
|
+
require 'raingrams/hexagram_model'
|
10
|
+
require 'raingrams/raingrams'
|
File without changes
|
metadata
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.4
|
3
|
+
specification_version: 1
|
4
|
+
name: raingrams
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.0.9
|
7
|
+
date: 2008-01-09 00:00:00 -08:00
|
8
|
+
summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: postmodern.mod3@gmail.com
|
12
|
+
homepage: " by Postmodern Modulus III"
|
13
|
+
rubyforge_project: raingrams
|
14
|
+
description: "== FEATURES/PROBLEMS: * Supports all non-zero ngram sizes. * Supports text and non-text grams. * Supports Open and Closed vocabulary models. == REQUIREMENTS: == INSTALL: $ sudo gem install raingrams"
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Postmodern Modulus III
|
31
|
+
files:
|
32
|
+
- History.txt
|
33
|
+
- LICENSE.txt
|
34
|
+
- Manifest.txt
|
35
|
+
- README.txt
|
36
|
+
- Rakefile
|
37
|
+
- lib/raingrams.rb
|
38
|
+
- lib/raingrams/version.rb
|
39
|
+
- lib/raingrams/raingrams.rb
|
40
|
+
- lib/raingrams/exceptions/prefix_frequency_missing.rb
|
41
|
+
- lib/raingrams/exceptions.rb
|
42
|
+
- lib/raingrams/extensions/class.rb
|
43
|
+
- lib/raingrams/extensions/false_class.rb
|
44
|
+
- lib/raingrams/extensions/nil_class.rb
|
45
|
+
- lib/raingrams/extensions/object.rb
|
46
|
+
- lib/raingrams/extensions/string.rb
|
47
|
+
- lib/raingrams/extensions/symbol.rb
|
48
|
+
- lib/raingrams/extensions/true_class.rb
|
49
|
+
- lib/raingrams/extensions.rb
|
50
|
+
- lib/raingrams/tokens/token.rb
|
51
|
+
- lib/raingrams/tokens/start_sentence.rb
|
52
|
+
- lib/raingrams/tokens/stop_sentence.rb
|
53
|
+
- lib/raingrams/tokens/unknown.rb
|
54
|
+
- lib/raingrams/tokens.rb
|
55
|
+
- lib/raingrams/ngram.rb
|
56
|
+
- lib/raingrams/model.rb
|
57
|
+
- lib/raingrams/unigram_model.rb
|
58
|
+
- lib/raingrams/multigram_model.rb
|
59
|
+
- lib/raingrams/bigram_model.rb
|
60
|
+
- lib/raingrams/trigram_model.rb
|
61
|
+
- lib/raingrams/quadgram_model.rb
|
62
|
+
- lib/raingrams/pentagram_model.rb
|
63
|
+
- lib/raingrams/hexagram_model.rb
|
64
|
+
- lib/raingrams/open_vocabulary/open_model.rb
|
65
|
+
- lib/raingrams/open_vocabulary/unigram_model.rb
|
66
|
+
- lib/raingrams/open_vocabulary/multigram_model.rb
|
67
|
+
- lib/raingrams/open_vocabulary/bigram_model.rb
|
68
|
+
- lib/raingrams/open_vocabulary/trigram_model.rb
|
69
|
+
- lib/raingrams/open_vocabulary/quadgram_model.rb
|
70
|
+
- lib/raingrams/open_vocabulary/pentagram_model.rb
|
71
|
+
- lib/raingrams/open_vocabulary/hexagram_model.rb
|
72
|
+
- lib/raingrams/open_vocabulary.rb
|
73
|
+
- test/test_raingrams.rb
|
74
|
+
test_files:
|
75
|
+
- test/test_raingrams.rb
|
76
|
+
rdoc_options:
|
77
|
+
- --main
|
78
|
+
- README.txt
|
79
|
+
extra_rdoc_files:
|
80
|
+
- History.txt
|
81
|
+
- LICENSE.txt
|
82
|
+
- Manifest.txt
|
83
|
+
- README.txt
|
84
|
+
executables: []
|
85
|
+
|
86
|
+
extensions: []
|
87
|
+
|
88
|
+
requirements: []
|
89
|
+
|
90
|
+
dependencies:
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: hoe
|
93
|
+
version_requirement:
|
94
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
95
|
+
requirements:
|
96
|
+
- - ">="
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 1.4.0
|
99
|
+
version:
|