raingrams 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. data/History.txt +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/Manifest.txt +42 -0
  4. data/README.txt +46 -0
  5. data/Rakefile +17 -0
  6. data/lib/raingrams/bigram_model.rb +13 -0
  7. data/lib/raingrams/exceptions/prefix_frequency_missing.rb +4 -0
  8. data/lib/raingrams/exceptions.rb +1 -0
  9. data/lib/raingrams/extensions/class.rb +7 -0
  10. data/lib/raingrams/extensions/false_class.rb +7 -0
  11. data/lib/raingrams/extensions/nil_class.rb +7 -0
  12. data/lib/raingrams/extensions/object.rb +7 -0
  13. data/lib/raingrams/extensions/string.rb +7 -0
  14. data/lib/raingrams/extensions/symbol.rb +7 -0
  15. data/lib/raingrams/extensions/true_class.rb +7 -0
  16. data/lib/raingrams/extensions.rb +7 -0
  17. data/lib/raingrams/hexagram_model.rb +13 -0
  18. data/lib/raingrams/model.rb +161 -0
  19. data/lib/raingrams/multigram_model.rb +165 -0
  20. data/lib/raingrams/ngram.rb +53 -0
  21. data/lib/raingrams/open_vocabulary/bigram_model.rb +12 -0
  22. data/lib/raingrams/open_vocabulary/hexagram_model.rb +12 -0
  23. data/lib/raingrams/open_vocabulary/multigram_model.rb +12 -0
  24. data/lib/raingrams/open_vocabulary/open_model.rb +34 -0
  25. data/lib/raingrams/open_vocabulary/pentagram_model.rb +12 -0
  26. data/lib/raingrams/open_vocabulary/quadgram_model.rb +12 -0
  27. data/lib/raingrams/open_vocabulary/trigram_model.rb +12 -0
  28. data/lib/raingrams/open_vocabulary/unigram_model.rb +12 -0
  29. data/lib/raingrams/open_vocabulary.rb +7 -0
  30. data/lib/raingrams/pentagram_model.rb +13 -0
  31. data/lib/raingrams/quadgram_model.rb +13 -0
  32. data/lib/raingrams/raingrams.rb +31 -0
  33. data/lib/raingrams/tokens/start_sentence.rb +13 -0
  34. data/lib/raingrams/tokens/stop_sentence.rb +13 -0
  35. data/lib/raingrams/tokens/token.rb +19 -0
  36. data/lib/raingrams/tokens/unknown.rb +13 -0
  37. data/lib/raingrams/tokens.rb +4 -0
  38. data/lib/raingrams/trigram_model.rb +13 -0
  39. data/lib/raingrams/unigram_model.rb +70 -0
  40. data/lib/raingrams/version.rb +3 -0
  41. data/lib/raingrams.rb +10 -0
  42. data/test/test_raingrams.rb +0 -0
  43. metadata +99 -0
data/History.txt ADDED
@@ -0,0 +1,7 @@
1
+ == 0.0.9 / 2008-01-09
2
+
3
+ * Initial release.
4
+ * Supports all non-zero ngram sizes.
5
+ * Supports text and non-text grams.
6
+ * Supports Open and Closed vocabulary models.
7
+
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2007-2008 Hal Brodigan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,42 @@
1
+ History.txt
2
+ LICENSE.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ lib/raingrams.rb
7
+ lib/raingrams/version.rb
8
+ lib/raingrams/raingrams.rb
9
+ lib/raingrams/exceptions/prefix_frequency_missing.rb
10
+ lib/raingrams/exceptions.rb
11
+ lib/raingrams/extensions/class.rb
12
+ lib/raingrams/extensions/false_class.rb
13
+ lib/raingrams/extensions/nil_class.rb
14
+ lib/raingrams/extensions/object.rb
15
+ lib/raingrams/extensions/string.rb
16
+ lib/raingrams/extensions/symbol.rb
17
+ lib/raingrams/extensions/true_class.rb
18
+ lib/raingrams/extensions.rb
19
+ lib/raingrams/tokens/token.rb
20
+ lib/raingrams/tokens/start_sentence.rb
21
+ lib/raingrams/tokens/stop_sentence.rb
22
+ lib/raingrams/tokens/unknown.rb
23
+ lib/raingrams/tokens.rb
24
+ lib/raingrams/ngram.rb
25
+ lib/raingrams/model.rb
26
+ lib/raingrams/unigram_model.rb
27
+ lib/raingrams/multigram_model.rb
28
+ lib/raingrams/bigram_model.rb
29
+ lib/raingrams/trigram_model.rb
30
+ lib/raingrams/quadgram_model.rb
31
+ lib/raingrams/pentagram_model.rb
32
+ lib/raingrams/hexagram_model.rb
33
+ lib/raingrams/open_vocabulary/open_model.rb
34
+ lib/raingrams/open_vocabulary/unigram_model.rb
35
+ lib/raingrams/open_vocabulary/multigram_model.rb
36
+ lib/raingrams/open_vocabulary/bigram_model.rb
37
+ lib/raingrams/open_vocabulary/trigram_model.rb
38
+ lib/raingrams/open_vocabulary/quadgram_model.rb
39
+ lib/raingrams/open_vocabulary/pentagram_model.rb
40
+ lib/raingrams/open_vocabulary/hexagram_model.rb
41
+ lib/raingrams/open_vocabulary.rb
42
+ test/test_raingrams.rb
data/README.txt ADDED
@@ -0,0 +1,46 @@
1
+ Raingrams
2
+ by Postmodern Modulus III
3
+ http://rubyforge.org/projects/raingrams/
4
+
5
+ == DESCRIPTION:
6
+
7
+ Raingrams is a flexible and general-purpose ngrams library written in Ruby.
8
+ Raingrams supports any non-zero ngram size, text/non-text grams, multiple
9
+ parsing styles and open/closed vocabulary models.
10
+
11
+ == FEATURES/PROBLEMS:
12
+
13
+ * Supports all non-zero ngram sizes.
14
+ * Supports text and non-text grams.
15
+ * Supports Open and Closed vocabulary models.
16
+
17
+ == REQUIREMENTS:
18
+
19
+ == INSTALL:
20
+
21
+ $ sudo gem install raingrams
22
+
23
+ == LICENSE:
24
+
25
+ The MIT License
26
+
27
+ Copyright (c) 2007-2008 Hal Brodigan
28
+
29
+ Permission is hereby granted, free of charge, to any person obtaining
30
+ a copy of this software and associated documentation files (the
31
+ 'Software'), to deal in the Software without restriction, including
32
+ without limitation the rights to use, copy, modify, merge, publish,
33
+ distribute, sublicense, and/or sell copies of the Software, and to
34
+ permit persons to whom the Software is furnished to do so, subject to
35
+ the following conditions:
36
+
37
+ The above copyright notice and this permission notice shall be
38
+ included in all copies or substantial portions of the Software.
39
+
40
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
41
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
43
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
44
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
45
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
46
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './lib/raingrams/version.rb'
6
+
7
# Gem specification, driven by Hoe. Description, URL and change log are
# extracted from README.txt and History.txt.
Hoe.new('raingrams', Raingrams::VERSION) do |p|
  p.rubyforge_name = 'raingrams'
  p.author = 'Postmodern Modulus III'
  p.email = 'postmodern.mod3@gmail.com'
  p.summary = 'Raingrams is a flexible and general-purpose ngrams library written in Ruby'
  p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
  # The first README paragraph is three lines: project name, "by <author>",
  # and the project URL. Skipping only the first line made the author line
  # the gem homepage, so skip the first two lines instead.
  p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/)[2..-1]
  p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
end
16
+
17
+ # vim: syntax=Ruby
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
+ module Raingrams
4
+ class BigramModel < MultigramModel
5
+
6
+ def initialize(opts={},&block)
7
+ opts[:ngram_size] = 2
8
+
9
+ super(opts,&block)
10
+ end
11
+
12
+ end
13
+ end
@@ -0,0 +1,4 @@
1
+ module Raingrams
2
+ class PrefixFrequencyMissing < RuntimeError
3
+ end
4
+ end
@@ -0,0 +1 @@
1
+ require 'raingrams/exceptions/prefix_frequency_missing.rb'
@@ -0,0 +1,7 @@
1
+ class Class
2
+
3
+ def to_gram
4
+ self
5
+ end
6
+
7
+ end
@@ -0,0 +1,7 @@
1
+ class FalseClass
2
+
3
+ def to_gram
4
+ self
5
+ end
6
+
7
+ end
@@ -0,0 +1,7 @@
1
+ class NilClass
2
+
3
+ def to_gram
4
+ self
5
+ end
6
+
7
+ end
@@ -0,0 +1,7 @@
1
+ class Object
2
+
3
+ def to_gram
4
+ self.dup.freeze
5
+ end
6
+
7
+ end
@@ -0,0 +1,7 @@
1
+ class String
2
+
3
+ def to_gram
4
+ intern
5
+ end
6
+
7
+ end
@@ -0,0 +1,7 @@
1
+ class Symbol
2
+
3
+ def to_gram
4
+ self
5
+ end
6
+
7
+ end
@@ -0,0 +1,7 @@
1
+ class TrueClass
2
+
3
+ def to_gram
4
+ self
5
+ end
6
+
7
+ end
@@ -0,0 +1,7 @@
1
+ require 'raingrams/extensions/class'
2
+ require 'raingrams/extensions/nil_class'
3
+ require 'raingrams/extensions/true_class'
4
+ require 'raingrams/extensions/false_class'
5
+ require 'raingrams/extensions/symbol'
6
+ require 'raingrams/extensions/string'
7
+ require 'raingrams/extensions/object'
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
+ module Raingrams
4
+ class HexagramModel < MultigramModel
5
+
6
+ def initialize(opts={},&block)
7
+ opts[:ngram_size] = 6
8
+
9
+ super(opts,&block)
10
+ end
11
+
12
+ end
13
+ end
@@ -0,0 +1,161 @@
1
+ require 'raingrams/ngram'
2
+ require 'raingrams/tokens/start_sentence'
3
+ require 'raingrams/tokens/stop_sentence'
4
+ require 'raingrams/exceptions/prefix_frequency_missing'
5
+
6
module Raingrams
  #
  # Base class of all ngram models. Holds the parsing options, the table of
  # observed ngram frequencies and the table of ngram probabilities, and
  # provides the query helpers shared by the concrete models.
  #
  class Model

    # Size of ngrams to use
    attr_reader :ngram_size

    # Ignore case of parsed text
    attr_reader :ignore_case

    # Ignore the punctuation of parsed text
    attr_reader :ignore_punc

    # Ignore URLs
    attr_reader :ignore_urls

    # Ignore Phone numbers
    attr_reader :ignore_phone_numbers

    # Ignore References
    attr_reader :ignore_references

    # Convert Acronyms to names within parsed text
    attr_reader :convert_acronyms

    # Convert Abbreviations to names within parsed text
    attr_reader :convert_abbrev

    # Frequencies of observed ngrams
    attr_reader :frequency

    # Normalized table of observed ngrams
    attr_reader :probability

    #
    # Creates a new model from the given _opts_ and passes the model to
    # the optional _block_.
    #
    # Recognised options: <tt>:ngram_size</tt>, <tt>:ignore_case</tt>,
    # <tt>:ignore_punc</tt> (defaults to +true+), <tt>:ignore_urls</tt>,
    # <tt>:ignore_phone_numbers</tt>, <tt>:ignore_references</tt>,
    # <tt>:convert_acronyms</tt> and <tt>:convert_abbrev</tt>. Every
    # option other than <tt>:ignore_punc</tt> defaults to +false+.
    #
    def initialize(opts={},&block)
      @ngram_size = opts[:ngram_size]
      @ignore_case = opts[:ignore_case] || false

      # `opts[:ignore_punc] || true` could never evaluate to false, so
      # only fall back to the default when the option was not given.
      @ignore_punc = opts[:ignore_punc].nil? ? true : opts[:ignore_punc]

      @ignore_urls = opts[:ignore_urls] || false
      @ignore_phone_numbers = opts[:ignore_phone_numbers] || false

      # previously never assigned, which made the option unusable
      @ignore_references = opts[:ignore_references] || false

      @convert_acronyms = opts[:convert_acronyms] || false
      @convert_abbrev = opts[:convert_abbrev] || false

      # Default blocks return a value without storing it, so lookups of
      # unseen ngrams do not add keys to either table.
      @frequency = Hash.new { |hash,key| 0 }
      @probability = Hash.new { |hash,key| 0.0 }

      block.call(self) if block
    end

    #
    # Splits the given _sentence_ into an Array of String tokens,
    # applying the ignore_* options of the model. A single trailing
    # '.', '?' or '!' is always removed.
    #
    def parse_sentence(sentence)
      # gsub returns a new String, so the in-place edits below never
      # modify the caller's object
      sentence = sentence.to_s.gsub(/[\.\?!]$/,'')

      if @ignore_urls
        # \S* consumes the whole URL; \w* stopped at the first '.' or '/'
        sentence.gsub!(/\s*\w+:\/\/\S*\s*/,' ')
      end

      if @ignore_phone_numbers
        sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
      end

      if @ignore_references
        # matches bracketed references such as "[12]"; the unescaped
        # [\d+] was a character class matching any single digit or '+'
        sentence.gsub!(/\s*\[\d+\]\s*/,' ')
      end

      if @ignore_case
        sentence.downcase!
      end

      if @ignore_punc
        return sentence.scan(/\w+[\.'\-\_]?\w*/)
      else
        # no capture group: with one, scan returns an Array of
        # single-element Arrays instead of an Array of Strings
        return sentence.scan(/\w+|[-_,\.;'"]/)
      end
    end

    #
    # Splits the given _text_ into an Array of sentence Strings, breaking
    # on '.', '?' and '!'. The _block_ argument is accepted but not used.
    #
    def parse_text(text,&block)
      text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
    end

    #
    # Increments the observed frequency of the given _ngram_.
    # Returns the model.
    #
    def train_with_ngram(ngram)
      @frequency[ngram] += 1
      return self
    end

    #
    # Trains the model with each of the given _ngrams_.
    # Returns the model.
    #
    def train_with_ngrams(ngrams=[])
      ngrams.each { |ngram| train_with_ngram(ngram) }
      return self
    end

    #
    # Returns all ngrams observed by the model.
    #
    def ngrams
      @frequency.keys
    end

    #
    # Returns +true+ if the given _ngram_ has been observed by the model.
    #
    def has_ngram?(ngram)
      # direct Hash lookup instead of scanning the Array of keys
      @frequency.key?(ngram)
    end

    #
    # Passes every observed ngram to the given _block_.
    #
    def each_ngram(&block)
      ngrams.each(&block)
    end

    #
    # Returns the observed ngrams for which the given _block_ is true.
    #
    def ngrams_with(&block)
      ngrams.select(&block)
    end

    #
    # Returns the unique grams found within the observed ngrams.
    #
    def vocabulary
      ngrams.flatten.uniq
    end

    #
    # Returns +true+ if any observed ngram includes the given _gram_.
    #
    def within_vocabulary?(gram)
      # the original called the undefined method each_ngrams
      ngrams.any? { |ngram| ngram.include?(gram) }
    end

    #
    # Returns the observed ngrams which start with the gram form of _obj_.
    #
    def ngrams_starting_with(obj)
      ngrams_with { |ngram| ngram.starts_with?(obj.to_gram) }
    end

    #
    # Returns the observed ngrams which end with the given _gram_.
    #
    def ngrams_ending_with(gram)
      ngrams_with { |ngram| ngram.ends_with?(gram) }
    end

    #
    # Returns the probabilities of the given _ngrams_, in the same order.
    #
    def probabilities_for(ngrams)
      ngrams.map { |ngram| @probability[ngram] }
    end

    #
    # Returns the probability of the given _ngram_ (0.0 when unknown).
    #
    def probability_of_ngram(ngram)
      @probability[ngram]
    end

    #
    # Returns the product of the probabilities of the given _ngrams_.
    # Returns +nil+ when _ngrams_ is empty.
    #
    def probability_of_ngrams(ngrams)
      probabilities_for(ngrams).inject { |joint,prob| joint * prob }
    end

    #
    # Returns the joint probability of all observed ngrams starting with
    # the given _gram_.
    #
    def probability_of_gram(gram)
      probability_of_ngrams(ngrams_starting_with(gram))
    end

    #
    # Clears the frequency and probability tables. Returns the model.
    #
    def clear
      @frequency.clear

      clear_probabilities
      return self
    end

    protected

    #
    # Clears only the probability table. Returns the model.
    #
    def clear_probabilities
      @probability.clear
      return self
    end

  end
end
@@ -0,0 +1,165 @@
1
+ require 'raingrams/model'
2
+ require 'raingrams/tokens/start_sentence'
3
+ require 'raingrams/tokens/stop_sentence'
4
+ require 'raingrams/exceptions/prefix_frequency_missing'
5
+
6
module Raingrams
  #
  # Model for ngrams of size two or more. In addition to the ngram
  # frequencies kept by Model, it records the frequency of every ngram
  # prefix (the ngram without its last gram), and computes the probability
  # of an ngram as its frequency divided by the frequency of its prefix.
  #
  class MultigramModel < Model

    # Frequencies of n-1 grams
    attr_reader :prefix_frequency

    #
    # Creates a new model from the given _opts_. The optional _block_ is
    # passed the model for training, after which the probabilities are
    # built.
    #
    def initialize(opts={},&block)
      @prefix_frequency = Hash.new { |hash,key| 0 }

      super(opts) { |model| model.build(&block) }
    end

    #
    # Returns every ngram of the model's size found by sliding over the
    # given _words_. Returns an empty Array when there are too few words.
    #
    def ngrams_from_words(words)
      window_count = words.length - @ngram_size + 1

      return (0...window_count).map { |index|
        Ngram.new(words[index,@ngram_size])
      }
    end

    #
    # Returns the ngrams of the given sentence _fragment_, without
    # start/stop sentence tokens.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams of the given _sentence_, wrapped in start/stop
    # sentence tokens.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
    end

    #
    # Returns the ngrams of every sentence within the given _text_.
    #
    def ngrams_from_text(text)
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams + ngrams_from_sentence(sentence)
      end
    end

    #
    # Returns the ngrams of _words_ that the model has observed.
    #
    def common_ngrams_from_words(words)
      ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _fragment_ that the model has observed.
    #
    def common_ngrams_from_fragment(fragment)
      # the original passed the undefined local `words` here
      ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _sentence_ that the model has observed.
    #
    def common_ngrams_from_sentence(sentence)
      ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _text_ that the model has observed.
    #
    def common_ngrams_from_text(text)
      ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Increments the frequency of the given _ngram_ and of its prefix.
    # Returns the model.
    #
    def train_with_ngram(ngram)
      @prefix_frequency[ngram.prefix] += 1
      return super(ngram)
    end

    #
    # Trains the model with the ngrams of the given _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the ngrams of the given _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Clears the probability table, passes the model to the optional
    # _block_ and then recomputes the probability of every observed ngram.
    # Raises PrefixFrequencyMissing if an observed ngram has no recorded
    # prefix frequency. Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      @frequency.each do |ngram,count|
        prefix = ngram.prefix

        # @prefix_frequency yields 0 for unknown keys and 0 is truthy in
        # Ruby, so test for the key itself; the original check could never
        # fail and allowed a division by zero below.
        unless @prefix_frequency.key?(prefix)
          raise(PrefixFrequencyMissing,"the model is missing the frequency of the ngram prefix #{prefix}",caller)
        end

        @probability[ngram] = count.to_f / @prefix_frequency[prefix].to_f
      end

      return self
    end

    #
    # Returns the observed ngrams whose prefix equals the given _prefix_.
    #
    def ngrams_prefixed_by(prefix)
      ngrams_with { |ngram| ngram.prefixed_by?(prefix) }
    end

    #
    # Returns the observed ngrams whose postfix equals the given _postfix_.
    #
    def ngrams_postfixed_by(postfix)
      # the original called prefixed_by? here, duplicating
      # ngrams_prefixed_by
      ngrams_with { |ngram| ngram.postfixed_by?(postfix) }
    end

    #
    # For every observed ngram ending with _gram_, returns the Array of
    # observed ngrams whose postfix is that ngram's prefix.
    #
    def ngrams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngrams_postfixed_by(ngram.prefix)
      end
    end

    #
    # For every observed ngram starting with _gram_, returns the Array of
    # observed ngrams whose prefix is that ngram's postfix.
    #
    def ngrams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngrams_prefixed_by(ngram.postfix)
      end
    end

    #
    # Returns the second-to-last gram of every observed ngram ending with
    # the given _gram_.
    #
    def grams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngram[-2]
      end
    end

    #
    # Returns the second gram of every observed ngram starting with the
    # given _gram_.
    #
    def grams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngram[1]
      end
    end

    #
    # Returns the joint probability of the ngrams of _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the ngrams of _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the ngrams of _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the joint probability of the observed ngrams of _fragment_.
    #
    def common_fragment_probability(fragment)
      probability_of_ngrams(common_ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the observed ngrams of _sentence_.
    #
    def common_sentence_probability(sentence)
      probability_of_ngrams(common_ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the observed ngrams of the given
    # text (passed as _fragment_).
    #
    def common_text_probability(fragment)
      # the original referenced the undefined local `text`
      probability_of_ngrams(common_ngrams_from_text(fragment))
    end

    #
    # Returns the product of the common fragment probabilities of this
    # model and the _other_ model.
    #
    def similar_fragment_probability(other,fragment)
      common_fragment_probability(fragment) * other.common_fragment_probability(fragment)
    end

    #
    # Returns the product of the common sentence probabilities of this
    # model and the _other_ model.
    #
    def similar_sentence_probability(other,sentence)
      common_sentence_probability(sentence) * other.common_sentence_probability(sentence)
    end

    #
    # Returns the product of the common text probabilities of this model
    # and the _other_ model.
    #
    def similar_text_probability(other,text)
      common_text_probability(text) * other.common_text_probability(text)
    end

    #
    # Clears the prefix frequencies along with the frequency and
    # probability tables. Returns the model.
    #
    def clear
      @prefix_frequency.clear
      return super
    end

    protected

    #
    # Surrounds the given _sentence_ (an Array of words) with ngram_size
    # start-sentence and stop-sentence tokens.
    #
    def wrap_sentence(sentence)
      (Tokens::StartSentence * @ngram_size) + sentence.to_a + (Tokens::StopSentence * @ngram_size)
    end

  end
end
@@ -0,0 +1,53 @@
1
+ module Raingrams
2
+ class Ngram < Array
3
+
4
+ def initialize(objs)
5
+ super(objs.map { |obj| obj.to_gram })
6
+ end
7
+
8
+ def self.[](*objs)
9
+ self.new(objs)
10
+ end
11
+
12
+ def prefix
13
+ self[0...length-1]
14
+ end
15
+
16
+ def prefixed_by?(ngram)
17
+ prefix==ngram
18
+ end
19
+
20
+ def postfix
21
+ self[1..-1]
22
+ end
23
+
24
+ def postfixed_by?(ngram)
25
+ postfix==ngram
26
+ end
27
+
28
+ def starts_with?(obj)
29
+ self[0]==obj.to_gram
30
+ end
31
+
32
+ def ends_with?(obj)
33
+ self[-1]==obj.to_gram
34
+ end
35
+
36
+ def include?(obj)
37
+ super(obj.to_gram)
38
+ end
39
+
40
+ def flatten
41
+ self.dup
42
+ end
43
+
44
+ def flatten!
45
+ self
46
+ end
47
+
48
+ def to_s
49
+ join(', ')
50
+ end
51
+
52
+ end
53
+ end
@@ -0,0 +1,12 @@
1
require 'raingrams/bigram_model'
# the library directory is open_vocabulary/, not openvocabulary/
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    # Bigram model that mixes in OpenModel.
    class BigramModel < Raingrams::BigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/hexagram_model'
# the library directory is open_vocabulary/, not openvocabulary/
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    # Hexagram model that mixes in OpenModel.
    class HexagramModel < Raingrams::HexagramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
# both paths were missing their underscores and the open_vocabulary/
# directory name, so neither file could be loaded
require 'raingrams/multigram_model'
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    # Multigram model that mixes in OpenModel.
    class MultigramModel < Raingrams::MultigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,34 @@
1
require 'raingrams/ngram'
require 'raingrams/tokens/unknown'

module Raingrams
  module OpenVocabulary
    #
    # Mixin for models with a fixed lexicon: while training, every gram
    # that is not in the lexicon is replaced by Tokens::Unknown.
    #
    module OpenModel

      # The fixed lexicon of this model
      attr_reader :lexicon

      #
      # Stores the <tt>:lexicon</tt> option (an empty Array by default)
      # and passes _opts_ and _block_ on to the model's own initializer.
      # The lexicon is set first so it is available to any training done
      # by the initializer chain.
      #
      def initialize(opts={},&block)
        @lexicon = opts[:lexicon] || []

        super(opts,&block)
      end

      #
      # Returns +true+ if the given _gram_ is in the lexicon.
      #
      # NOTE(review): grams are compared as-is; String#to_gram interns
      # words into Symbols, so a lexicon of Strings would presumably never
      # match parsed text -- confirm the expected lexicon format.
      #
      def within_lexicon?(gram)
        @lexicon.include?(gram)
      end

      #
      # Trains the model with the given _ngram_ after replacing every
      # gram outside the lexicon with Tokens::Unknown.
      #
      # This overrides train_with_ngram, the method the models call while
      # training. The original override was named train_ngram, which
      # nothing calls and which has no super implementation, so unknown
      # grams were never substituted.
      #
      def train_with_ngram(ngram)
        grams = ngram.map do |gram|
          if within_lexicon?(gram)
            gram
          else
            Tokens::Unknown
          end
        end

        # map returns a plain Array; rebuild an Ngram because
        # MultigramModel#train_with_ngram calls Ngram#prefix on it
        return super(Ngram.new(grams))
      end

      #
      # Former name of train_with_ngram, kept so existing callers still
      # work.
      #
      def train_ngram(ngram)
        train_with_ngram(ngram)
      end

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/pentagram_model'
# the library directory is open_vocabulary/, not openvocabulary/
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    # Pentagram model that mixes in OpenModel.
    class PentagramModel < Raingrams::PentagramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/quadgram_model'
# the library directory is open_vocabulary/, not openvocabulary/
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    # Quadgram model that mixes in OpenModel.
    class QuadgramModel < Raingrams::QuadgramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/trigram_model'
# the library directory is open_vocabulary/, not openvocabulary/
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    # Trigram model that mixes in OpenModel.
    class TrigramModel < Raingrams::TrigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/unigram_model'
# the library directory is open_vocabulary/, not openvocabulary/
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    # Unigram model that mixes in OpenModel.
    class UnigramModel < Raingrams::UnigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,7 @@
1
# Loads every open vocabulary model. The files live under
# lib/raingrams/open_vocabulary/ (with an underscore); the previous
# 'raingrams/openvocabulary/...' paths do not exist in the gem.
require 'raingrams/open_vocabulary/unigram_model'
require 'raingrams/open_vocabulary/multigram_model'
require 'raingrams/open_vocabulary/bigram_model'
require 'raingrams/open_vocabulary/trigram_model'
require 'raingrams/open_vocabulary/quadgram_model'
require 'raingrams/open_vocabulary/pentagram_model'
require 'raingrams/open_vocabulary/hexagram_model'
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
+ module Raingrams
4
+ class PentagramModel < MultigramModel
5
+
6
+ def initialize(opts={},&block)
7
+ opts[:ngram_size] = 5
8
+
9
+ super(opts,&block)
10
+ end
11
+
12
+ end
13
+ end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
+ module Raingrams
4
+ class QuadgramModel < MultigramModel
5
+
6
+ def initialize(opts={},&block)
7
+ opts[:ngram_size] = 4
8
+
9
+ super(opts,&block)
10
+ end
11
+
12
+ end
13
+ end
@@ -0,0 +1,31 @@
1
+ require 'raingrams/unigram_model'
2
+ require 'raingrams/multigram_model'
3
+ require 'raingrams/open_vocabulary/unigram_model'
4
+ require 'raingrams/open_vocabulary/multigram_model'
5
+
6
+ module Raingrams
7
+ def Raingrams.closed_vocabulary_model(opts={},&block)
8
+ if opts[:ngram_size]==1
9
+ return UnigramModel.new(opts,&block)
10
+ else
11
+ return MultigramModel.new(opts,&block)
12
+ end
13
+ end
14
+
15
+ def Raingrams.open_vocabulary_model(opts={},&block)
16
+ if opts[:ngram_size]==1
17
+ return OpenVocabulary::UnigramModel.new(opts,&block)
18
+ else
19
+ return OpenVocabulary::MultigramModel.new(opts,&block)
20
+ end
21
+ end
22
+
23
+ def Raingrams.model(opts={},&block)
24
+ case opts[:vocabulary]
25
+ when :open, 'open'
26
+ return Raingrams.open_vocabulary_model(opts,&block)
27
+ else
28
+ return Raingrams.closed_vocabulary_model(opts,&block)
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/tokens/token'
2
+
3
+ module Raingrams
4
+ module Tokens
5
+ class StartSentence < Token
6
+
7
+ def self.to_s
8
+ '<s>'
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/tokens/token'
2
+
3
+ module Raingrams
4
+ module Tokens
5
+ class StopSentence < Token
6
+
7
+ def self.to_s
8
+ '</s>'
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,19 @@
1
+ module Raingrams
2
+ module Tokens
3
+ class Token
4
+
5
+ def self.*(length)
6
+ [self] * length
7
+ end
8
+
9
+ def self.to_sym
10
+ self.to_s.to_sym
11
+ end
12
+
13
+ def self.inspect
14
+ self.to_s
15
+ end
16
+
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/tokens/token'
2
+
3
+ module Raingrams
4
+ module Tokens
5
+ class Unknown < Token
6
+
7
+ def self.to_s
8
+ '<unknown>'
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,4 @@
1
+ require 'raingrams/tokens/token'
2
+ require 'raingrams/tokens/start_sentence'
3
+ require 'raingrams/tokens/stop_sentence'
4
+ require 'raingrams/tokens/unknown'
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
+ module Raingrams
4
+ class TrigramModel < MultigramModel
5
+
6
+ def initialize(opts={},&block)
7
+ opts[:ngram_size] = 3
8
+
9
+ super(opts,&block)
10
+ end
11
+
12
+ end
13
+ end
@@ -0,0 +1,70 @@
1
+ require 'raingrams/model'
2
+
3
module Raingrams
  #
  # Model for ngrams of size one. The probability of a unigram is its
  # frequency divided by the total number of grams observed.
  #
  class UnigramModel < Model

    #
    # Creates a new unigram model from the given _opts_; the
    # <tt>:ngram_size</tt> option is always set to 1. The optional _block_
    # is passed the model for training, after which the probabilities are
    # built.
    #
    def initialize(opts={},&block)
      opts[:ngram_size] = 1

      super(opts) { |model| model.build(&block) }
    end

    #
    # Returns one single-gram Ngram for each of the given _words_.
    #
    def ngrams_from_words(words)
      words.map { |word| Ngram[word] }
    end

    #
    # Returns the unigrams of the given sentence _fragment_.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the unigrams of the given _sentence_; no start/stop tokens
    # are added.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_fragment(sentence)
    end

    #
    # Returns the unigrams of every sentence within the given _text_.
    #
    def ngrams_from_text(text)
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams + ngrams_from_sentence(sentence)
      end
    end

    #
    # Trains the model with the unigrams of the given _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the unigrams of the given _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the total number of grams observed by the model.
    #
    def gram_count
      # seed with 0 so an untrained model reports 0 instead of nil
      @frequency.values.inject(0) do |sum,count|
        sum + count
      end
    end

    #
    # Clears the probability table, passes the model to the optional
    # _block_ and then recomputes the probability of every observed
    # unigram. Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      total_count = gram_count.to_f
      @frequency.each do |ngram,count|
        @probability[ngram] = count.to_f / total_count
      end

      return self
    end

    #
    # Returns the joint probability of the unigrams of _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Returns the joint probability of the unigrams of _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Returns the joint probability of the unigrams of _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

  end
end
@@ -0,0 +1,3 @@
1
+ module Raingrams
2
+ VERSION = '0.0.9'
3
+ end
data/lib/raingrams.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'raingrams/extensions'
2
+ require 'raingrams/raingrams'
3
+ require 'raingrams/ngram'
4
+ require 'raingrams/unigram_model'
5
+ require 'raingrams/bigram_model'
6
+ require 'raingrams/trigram_model'
7
+ require 'raingrams/quadgram_model'
8
+ require 'raingrams/pentagram_model'
9
+ require 'raingrams/hexagram_model'
10
+ require 'raingrams/raingrams'
File without changes
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: raingrams
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.0.9
7
+ date: 2008-01-09 00:00:00 -08:00
8
+ summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
9
+ require_paths:
10
+ - lib
11
+ email: postmodern.mod3@gmail.com
12
+ homepage: " by Postmodern Modulus III"
13
+ rubyforge_project: raingrams
14
+ description: "== FEATURES/PROBLEMS: * Supports all non-zero ngram sizes. * Supports text and non-text grams. * Supports Open and Closed vocabulary models. == REQUIREMENTS: == INSTALL: $ sudo gem install raingrams"
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Postmodern Modulus III
31
+ files:
32
+ - History.txt
33
+ - LICENSE.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ - Rakefile
37
+ - lib/raingrams.rb
38
+ - lib/raingrams/version.rb
39
+ - lib/raingrams/raingrams.rb
40
+ - lib/raingrams/exceptions/prefix_frequency_missing.rb
41
+ - lib/raingrams/exceptions.rb
42
+ - lib/raingrams/extensions/class.rb
43
+ - lib/raingrams/extensions/false_class.rb
44
+ - lib/raingrams/extensions/nil_class.rb
45
+ - lib/raingrams/extensions/object.rb
46
+ - lib/raingrams/extensions/string.rb
47
+ - lib/raingrams/extensions/symbol.rb
48
+ - lib/raingrams/extensions/true_class.rb
49
+ - lib/raingrams/extensions.rb
50
+ - lib/raingrams/tokens/token.rb
51
+ - lib/raingrams/tokens/start_sentence.rb
52
+ - lib/raingrams/tokens/stop_sentence.rb
53
+ - lib/raingrams/tokens/unknown.rb
54
+ - lib/raingrams/tokens.rb
55
+ - lib/raingrams/ngram.rb
56
+ - lib/raingrams/model.rb
57
+ - lib/raingrams/unigram_model.rb
58
+ - lib/raingrams/multigram_model.rb
59
+ - lib/raingrams/bigram_model.rb
60
+ - lib/raingrams/trigram_model.rb
61
+ - lib/raingrams/quadgram_model.rb
62
+ - lib/raingrams/pentagram_model.rb
63
+ - lib/raingrams/hexagram_model.rb
64
+ - lib/raingrams/open_vocabulary/open_model.rb
65
+ - lib/raingrams/open_vocabulary/unigram_model.rb
66
+ - lib/raingrams/open_vocabulary/multigram_model.rb
67
+ - lib/raingrams/open_vocabulary/bigram_model.rb
68
+ - lib/raingrams/open_vocabulary/trigram_model.rb
69
+ - lib/raingrams/open_vocabulary/quadgram_model.rb
70
+ - lib/raingrams/open_vocabulary/pentagram_model.rb
71
+ - lib/raingrams/open_vocabulary/hexagram_model.rb
72
+ - lib/raingrams/open_vocabulary.rb
73
+ - test/test_raingrams.rb
74
+ test_files:
75
+ - test/test_raingrams.rb
76
+ rdoc_options:
77
+ - --main
78
+ - README.txt
79
+ extra_rdoc_files:
80
+ - History.txt
81
+ - LICENSE.txt
82
+ - Manifest.txt
83
+ - README.txt
84
+ executables: []
85
+
86
+ extensions: []
87
+
88
+ requirements: []
89
+
90
+ dependencies:
91
+ - !ruby/object:Gem::Dependency
92
+ name: hoe
93
+ version_requirement:
94
+ version_requirements: !ruby/object:Gem::Version::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: 1.4.0
99
+ version: