raingrams 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. data/History.txt +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/Manifest.txt +42 -0
  4. data/README.txt +46 -0
  5. data/Rakefile +17 -0
  6. data/lib/raingrams/bigram_model.rb +13 -0
  7. data/lib/raingrams/exceptions/prefix_frequency_missing.rb +4 -0
  8. data/lib/raingrams/exceptions.rb +1 -0
  9. data/lib/raingrams/extensions/class.rb +7 -0
  10. data/lib/raingrams/extensions/false_class.rb +7 -0
  11. data/lib/raingrams/extensions/nil_class.rb +7 -0
  12. data/lib/raingrams/extensions/object.rb +7 -0
  13. data/lib/raingrams/extensions/string.rb +7 -0
  14. data/lib/raingrams/extensions/symbol.rb +7 -0
  15. data/lib/raingrams/extensions/true_class.rb +7 -0
  16. data/lib/raingrams/extensions.rb +7 -0
  17. data/lib/raingrams/hexagram_model.rb +13 -0
  18. data/lib/raingrams/model.rb +161 -0
  19. data/lib/raingrams/multigram_model.rb +165 -0
  20. data/lib/raingrams/ngram.rb +53 -0
  21. data/lib/raingrams/open_vocabulary/bigram_model.rb +12 -0
  22. data/lib/raingrams/open_vocabulary/hexagram_model.rb +12 -0
  23. data/lib/raingrams/open_vocabulary/multigram_model.rb +12 -0
  24. data/lib/raingrams/open_vocabulary/open_model.rb +34 -0
  25. data/lib/raingrams/open_vocabulary/pentagram_model.rb +12 -0
  26. data/lib/raingrams/open_vocabulary/quadgram_model.rb +12 -0
  27. data/lib/raingrams/open_vocabulary/trigram_model.rb +12 -0
  28. data/lib/raingrams/open_vocabulary/unigram_model.rb +12 -0
  29. data/lib/raingrams/open_vocabulary.rb +7 -0
  30. data/lib/raingrams/pentagram_model.rb +13 -0
  31. data/lib/raingrams/quadgram_model.rb +13 -0
  32. data/lib/raingrams/raingrams.rb +31 -0
  33. data/lib/raingrams/tokens/start_sentence.rb +13 -0
  34. data/lib/raingrams/tokens/stop_sentence.rb +13 -0
  35. data/lib/raingrams/tokens/token.rb +19 -0
  36. data/lib/raingrams/tokens/unknown.rb +13 -0
  37. data/lib/raingrams/tokens.rb +4 -0
  38. data/lib/raingrams/trigram_model.rb +13 -0
  39. data/lib/raingrams/unigram_model.rb +70 -0
  40. data/lib/raingrams/version.rb +3 -0
  41. data/lib/raingrams.rb +10 -0
  42. data/test/test_raingrams.rb +0 -0
  43. metadata +99 -0
data/History.txt ADDED
@@ -0,0 +1,7 @@
1
+ == 0.0.9 / 2008-01-09
2
+
3
+ * Initial release.
4
+ * Supports all non-zero ngram sizes.
5
+ * Supports text and non-text grams.
6
+ * Supports Open and Closed vocabulary models.
7
+
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2007-2008 Hal Brodigan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,42 @@
1
+ History.txt
2
+ LICENSE.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ lib/raingrams.rb
7
+ lib/raingrams/version.rb
8
+ lib/raingrams/raingrams.rb
9
+ lib/raingrams/exceptions/prefix_frequency_missing.rb
10
+ lib/raingrams/exceptions.rb
11
+ lib/raingrams/extensions/class.rb
12
+ lib/raingrams/extensions/false_class.rb
13
+ lib/raingrams/extensions/nil_class.rb
14
+ lib/raingrams/extensions/object.rb
15
+ lib/raingrams/extensions/string.rb
16
+ lib/raingrams/extensions/symbol.rb
17
+ lib/raingrams/extensions/true_class.rb
18
+ lib/raingrams/extensions.rb
19
+ lib/raingrams/tokens/token.rb
20
+ lib/raingrams/tokens/start_sentence.rb
21
+ lib/raingrams/tokens/stop_sentence.rb
22
+ lib/raingrams/tokens/unknown.rb
23
+ lib/raingrams/tokens.rb
24
+ lib/raingrams/ngram.rb
25
+ lib/raingrams/model.rb
26
+ lib/raingrams/unigram_model.rb
27
+ lib/raingrams/multigram_model.rb
28
+ lib/raingrams/bigram_model.rb
29
+ lib/raingrams/trigram_model.rb
30
+ lib/raingrams/quadgram_model.rb
31
+ lib/raingrams/pentagram_model.rb
32
+ lib/raingrams/hexagram_model.rb
33
+ lib/raingrams/open_vocabulary/open_model.rb
34
+ lib/raingrams/open_vocabulary/unigram_model.rb
35
+ lib/raingrams/open_vocabulary/multigram_model.rb
36
+ lib/raingrams/open_vocabulary/bigram_model.rb
37
+ lib/raingrams/open_vocabulary/trigram_model.rb
38
+ lib/raingrams/open_vocabulary/quadgram_model.rb
39
+ lib/raingrams/open_vocabulary/pentagram_model.rb
40
+ lib/raingrams/open_vocabulary/hexagram_model.rb
41
+ lib/raingrams/open_vocabulary.rb
42
+ test/test_raingrams.rb
data/README.txt ADDED
@@ -0,0 +1,46 @@
1
+ Raingrams
2
+ by Postmodern Modulus III
3
+ http://rubyforge.org/projects/raingrams/
4
+
5
+ == DESCRIPTION:
6
+
7
+ Raingrams is a flexible and general-purpose ngrams library written in Ruby.
8
+ Raingrams supports any non-zero ngram size, text/non-text grams, multiple
9
+ parsing styles and open/closed vocabulary models.
10
+
11
+ == FEATURES/PROBLEMS:
12
+
13
+ * Supports all non-zero ngram sizes.
14
+ * Supports text and non-text grams.
15
+ * Supports Open and Closed vocabulary models.
16
+
17
+ == REQUIREMENTS:
18
+
19
+ == INSTALL:
20
+
21
+ $ sudo gem install raingrams
22
+
23
+ == LICENSE:
24
+
25
+ The MIT License
26
+
27
+ Copyright (c) 2007-2008 Hal Brodigan
28
+
29
+ Permission is hereby granted, free of charge, to any person obtaining
30
+ a copy of this software and associated documentation files (the
31
+ 'Software'), to deal in the Software without restriction, including
32
+ without limitation the rights to use, copy, modify, merge, publish,
33
+ distribute, sublicense, and/or sell copies of the Software, and to
34
+ permit persons to whom the Software is furnished to do so, subject to
35
+ the following conditions:
36
+
37
+ The above copyright notice and this permission notice shall be
38
+ included in all copies or substantial portions of the Software.
39
+
40
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
41
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
43
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
44
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
45
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
46
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,17 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './lib/raingrams/version.rb'
6
+
7
# Gem specification, driven by Hoe.
Hoe.new('raingrams', Raingrams::VERSION) do |p|
  p.rubyforge_name = 'raingrams'
  p.author = 'Postmodern Modulus III'
  p.email = 'postmodern.mod3@gmail.com'
  p.summary = 'Raingrams is a flexible and general-purpose ngrams library written in Ruby'
  p.description = p.paragraphs_of('README.txt', 2..5).join("\n\n")
  # The first README paragraph is "<name> / by <author> / <url>"; only its
  # last line is the project URL. Taking lines [1..-1] put the "by ..."
  # author line first, which is what ended up as the gem homepage.
  p.url = p.paragraphs_of('README.txt', 0).first.split(/\n/).last.strip
  p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
end
16
+
17
+ # vim: syntax=Ruby
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
module Raingrams
  #
  # A MultigramModel whose ngram size is fixed at two.
  #
  class BigramModel < MultigramModel

    #
    # Creates a new bigram model. Any +:ngram_size+ given in _opts_ is
    # overridden with 2; all other options and the optional _block_ are
    # handed on to the superclass initializer.
    #
    def initialize(opts={},&block)
      # merge into a copy so the caller's options hash is left untouched
      super(opts.merge(:ngram_size => 2),&block)
    end

  end
end
@@ -0,0 +1,4 @@
1
module Raingrams
  #
  # Error raised when a model has no recorded frequency for the prefix
  # of one of its ngrams.
  #
  class PrefixFrequencyMissing < RuntimeError; end
end
@@ -0,0 +1 @@
1
+ require 'raingrams/exceptions/prefix_frequency_missing.rb'
@@ -0,0 +1,7 @@
1
class Class

  # A class is used as a gram as-is (no conversion, no copy).
  def to_gram; self; end

end
@@ -0,0 +1,7 @@
1
class FalseClass

  # +false+ is used as a gram as-is.
  def to_gram; self; end

end
@@ -0,0 +1,7 @@
1
class NilClass

  # +nil+ is used as a gram as-is.
  def to_gram; self; end

end
@@ -0,0 +1,7 @@
1
class Object

  # Generic gram conversion: a frozen duplicate of the receiver, so the
  # original object stays mutable while the gram does not.
  def to_gram
    copy = dup
    copy.freeze
  end

end
@@ -0,0 +1,7 @@
1
class String

  # Strings become grams by being converted to their Symbol.
  def to_gram
    to_sym
  end

end
@@ -0,0 +1,7 @@
1
class Symbol

  # A Symbol is already in gram form and is returned unchanged.
  def to_gram; self; end

end
@@ -0,0 +1,7 @@
1
class TrueClass

  # +true+ is used as a gram as-is.
  def to_gram; self; end

end
@@ -0,0 +1,7 @@
1
+ require 'raingrams/extensions/class'
2
+ require 'raingrams/extensions/nil_class'
3
+ require 'raingrams/extensions/true_class'
4
+ require 'raingrams/extensions/false_class'
5
+ require 'raingrams/extensions/symbol'
6
+ require 'raingrams/extensions/string'
7
+ require 'raingrams/extensions/object'
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
module Raingrams
  #
  # A MultigramModel whose ngram size is fixed at six.
  #
  class HexagramModel < MultigramModel

    #
    # Creates a new hexagram model. Any +:ngram_size+ given in _opts_ is
    # overridden with 6; all other options and the optional _block_ are
    # handed on to the superclass initializer.
    #
    def initialize(opts={},&block)
      # merge into a copy so the caller's options hash is left untouched
      super(opts.merge(:ngram_size => 6),&block)
    end

  end
end
@@ -0,0 +1,161 @@
1
+ require 'raingrams/ngram'
2
+ require 'raingrams/tokens/start_sentence'
3
+ require 'raingrams/tokens/stop_sentence'
4
+ require 'raingrams/exceptions/prefix_frequency_missing'
5
+
6
module Raingrams
  #
  # Base class of all ngram models. Holds the parsing options, the table
  # of observed ngram frequencies and the table of ngram probabilities,
  # and provides the text tokenizing and ngram query helpers shared by
  # the concrete models. Subclasses fill in the probability table.
  #
  class Model

    # Size of ngrams to use
    attr_reader :ngram_size

    # Ignore case of parsed text
    attr_reader :ignore_case

    # Ignore the punctuation of parsed text
    attr_reader :ignore_punc

    # Ignore URLs
    attr_reader :ignore_urls

    # Ignore Phone numbers
    attr_reader :ignore_phone_numbers

    # Ignore References
    attr_reader :ignore_references

    # Convert Acronyms to names within parsed text
    attr_reader :convert_acronyms

    # Convert Abbreviations to names within parsed text
    attr_reader :convert_abbrev

    # Frequencies of observed ngrams
    attr_reader :frequency

    # Normalized table of observed ngrams
    attr_reader :probability

    #
    # Creates a new model from the given _opts_ and yields it to the
    # optional _block_.
    #
    # _opts_ may contain:
    # <tt>:ngram_size</tt>:: size of the ngrams (stored as given).
    # <tt>:ignore_case</tt>:: defaults to +false+.
    # <tt>:ignore_punc</tt>:: defaults to +true+.
    # <tt>:ignore_urls</tt>:: defaults to +false+.
    # <tt>:ignore_phone_numbers</tt>:: defaults to +false+.
    # <tt>:ignore_references</tt>:: defaults to +false+.
    # <tt>:convert_acronyms</tt>:: defaults to +false+.
    # <tt>:convert_abbrev</tt>:: defaults to +false+.
    #
    def initialize(opts={},&block)
      @ngram_size = opts[:ngram_size]
      @ignore_case = opts[:ignore_case] || false
      # `opts[:ignore_punc] || true` is always true, which made it
      # impossible to turn the option off; only fall back to the default
      # when no value was supplied.
      @ignore_punc = opts[:ignore_punc].nil? ? true : opts[:ignore_punc]
      @ignore_urls = opts[:ignore_urls] || false
      @ignore_phone_numbers = opts[:ignore_phone_numbers] || false
      # this option is read by parse_sentence but was never initialized
      @ignore_references = opts[:ignore_references] || false
      @convert_acronyms = opts[:convert_acronyms] || false
      @convert_abbrev = opts[:convert_abbrev] || false

      # unseen ngrams report a count of 0 / probability of 0.0 without
      # being stored in the tables
      @frequency = Hash.new { |hash,key| 0 }
      @probability = Hash.new { |hash,key| 0.0 }

      block.call(self) if block
    end

    #
    # Splits the given _sentence_ into an Array of token Strings,
    # honoring the ignore_* options of the model. One trailing
    # <tt>.</tt>, <tt>?</tt> or <tt>!</tt> is always dropped. The
    # argument itself is never modified.
    #
    def parse_sentence(sentence)
      sentence = sentence.to_s.gsub(/[\.\?!]$/,'')

      if @ignore_urls
        # consume everything up to the next whitespace; \w* stopped at the
        # first '.' or '/' and left most of the URL behind as tokens
        sentence.gsub!(/\s*\w+:\/\/\S*\s*/,' ')
      end

      if @ignore_phone_numbers
        sentence.gsub!(/\s*(\d-)?(\d{3}-)?\d{3}-\d{4}\s*/,' ')
      end

      if @ignore_references
        # NOTE(review): references are assumed to look like "[12]"; the
        # previous pattern [\d+] was a character class that removed any
        # single digit or plus sign instead.
        sentence.gsub!(/\s*\[\d+\]\s*/,' ')
      end

      if @ignore_case
        sentence.downcase!
      end

      if @ignore_punc
        return sentence.scan(/\w+[\.'\-\_]?\w*/)
      else
        # no capture group here: String#scan returns one Array per match
        # when the pattern has groups, which nested every token
        return sentence.scan(/\w+|[-_,\.;'"]/)
      end
    end

    #
    # Splits the given _text_ into an Array of sentence Strings, using
    # <tt>.</tt>, <tt>?</tt> and <tt>!</tt> as terminators. The _block_
    # argument is accepted but not used.
    #
    def parse_text(text,&block)
      text.to_s.scan(/[^\s\.\?!][^\.\?!]*/)
    end

    #
    # Increments the observed frequency of the given _ngram_.
    # Returns the model.
    #
    def train_with_ngram(ngram)
      @frequency[ngram] += 1
      return self
    end

    #
    # Trains the model with every ngram in _ngrams_. Returns the model.
    #
    def train_with_ngrams(ngrams=[])
      ngrams.each { |ngram| train_with_ngram(ngram) }
      return self
    end

    #
    # Returns all ngrams observed so far.
    #
    def ngrams
      @frequency.keys
    end

    #
    # Returns +true+ if the given _ngram_ has been observed.
    #
    def has_ngram?(ngram)
      # hash lookup rather than a linear scan of every key
      @frequency.has_key?(ngram)
    end

    #
    # Passes each observed ngram to the given _block_.
    #
    def each_ngram(&block)
      ngrams.each(&block)
    end

    #
    # Returns the observed ngrams for which the given _block_ is true.
    #
    def ngrams_with(&block)
      ngrams.select(&block)
    end

    #
    # Returns the distinct grams found in the observed ngrams.
    #
    def vocabulary
      ngrams.flatten.uniq
    end

    #
    # Returns +true+ if any observed ngram contains the given _gram_.
    #
    def within_vocabulary?(gram)
      # was `each_ngrams`, a method that does not exist
      each_ngram do |ngram|
        return true if ngram.include?(gram)
      end

      return false
    end

    #
    # Returns the observed ngrams whose first gram is _obj_ (converted
    # with +to_gram+).
    #
    def ngrams_starting_with(obj)
      ngrams_with { |ngram| ngram.starts_with?(obj.to_gram) }
    end

    #
    # Returns the observed ngrams whose last gram is _gram_.
    #
    def ngrams_ending_with(gram)
      ngrams_with { |ngram| ngram.ends_with?(gram) }
    end

    #
    # Returns the probabilities of the given _ngrams_, in order.
    #
    def probabilities_for(ngrams)
      ngrams.map { |ngram| @probability[ngram] }
    end

    #
    # Returns the probability of the given _ngram_ (0.0 if unknown).
    #
    def probability_of_ngram(ngram)
      @probability[ngram]
    end

    #
    # Returns the product of the probabilities of the given _ngrams_,
    # or +nil+ when _ngrams_ is empty.
    #
    def probability_of_ngrams(ngrams)
      probabilities_for(ngrams).inject { |joint,prob| joint * prob }
    end

    #
    # Returns the joint probability of all observed ngrams that start
    # with the given _gram_.
    #
    def probability_of_gram(gram)
      probability_of_ngrams(ngrams_starting_with(gram))
    end

    #
    # Removes all observed frequencies and probabilities.
    # Returns the model.
    #
    def clear
      @frequency.clear

      clear_probabilities
      return self
    end

    protected

    #
    # Empties the probability table only. Returns the model.
    #
    def clear_probabilities
      @probability.clear
      return self
    end

  end
end
@@ -0,0 +1,165 @@
1
+ require 'raingrams/model'
2
+ require 'raingrams/tokens/start_sentence'
3
+ require 'raingrams/tokens/stop_sentence'
4
+ require 'raingrams/exceptions/prefix_frequency_missing'
5
+
6
module Raingrams
  #
  # Model for ngrams of more than one gram. In addition to the ngram
  # frequencies it tracks the frequency of every ngram prefix (the
  # ngram without its last gram); the probability of an ngram is its
  # frequency divided by the frequency of its prefix.
  #
  class MultigramModel < Model

    # Frequencies of n-1 grams
    attr_reader :prefix_frequency

    #
    # Creates a new model from _opts_ (see Model#initialize). The model
    # is passed to the optional _block_ for training, after which the
    # probability table is built.
    #
    def initialize(opts={},&block)
      # must exist before super runs, since super triggers build/training
      @prefix_frequency = Hash.new { |hash,key| 0 }

      super(opts) { |model| model.build(&block) }
    end

    #
    # Returns every run of +ngram_size+ consecutive _words_ as an Ngram.
    # Returns an empty Array when there are fewer words than that.
    #
    def ngrams_from_words(words)
      return (0...(words.length-@ngram_size+1)).map do |index|
        Ngram.new(words[index,@ngram_size])
      end
    end

    #
    # Returns the ngrams of the parsed _fragment_, without sentence
    # start/stop tokens.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams of the parsed _sentence_, padded with sentence
    # start/stop tokens.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_words(wrap_sentence(parse_sentence(sentence)))
    end

    #
    # Returns the ngrams of every sentence found in _text_.
    #
    def ngrams_from_text(text)
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams + ngrams_from_sentence(sentence)
      end
    end

    #
    # Returns the ngrams of _words_ that the model has observed.
    #
    def common_ngrams_from_words(words)
      ngrams_from_words(words).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _fragment_ that the model has observed.
    #
    def common_ngrams_from_fragment(fragment)
      # previously passed an undefined local `words`
      ngrams_from_fragment(fragment).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _sentence_ that the model has observed.
    #
    def common_ngrams_from_sentence(sentence)
      ngrams_from_sentence(sentence).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Returns the ngrams of _text_ that the model has observed.
    #
    def common_ngrams_from_text(text)
      ngrams_from_text(text).select { |ngram| has_ngram?(ngram) }
    end

    #
    # Counts the given _ngram_ and its prefix. Returns the model.
    #
    def train_with_ngram(ngram)
      @prefix_frequency[ngram.prefix] += 1
      return super(ngram)
    end

    #
    # Trains the model with the ngrams of _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the ngrams of _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Clears the probability table, yields the model to the optional
    # _block_ (for training) and then recomputes the probability of
    # every observed ngram. Raises PrefixFrequencyMissing if an observed
    # ngram has no recorded prefix frequency. Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      @frequency.each do |ngram,count|
        prefix = ngram.prefix
        prefix_count = @prefix_frequency[prefix]

        # The counter hash answers 0 (never nil) for unknown prefixes, so
        # a truthiness test could not detect a missing entry and the
        # division below would have been by zero.
        unless prefix_count > 0
          raise(PrefixFrequencyMissing,"the model is missing the frequency of the ngram prefix #{prefix}",caller)
        end

        @probability[ngram] = count.to_f / prefix_count.to_f
      end

      return self
    end

    #
    # Returns the observed ngrams whose prefix equals _prefix_.
    #
    def ngrams_prefixed_by(prefix)
      ngrams_with { |ngram| ngram.prefixed_by?(prefix) }
    end

    #
    # Returns the observed ngrams whose postfix equals _postfix_.
    #
    def ngrams_postfixed_by(postfix)
      # previously tested prefixed_by?, duplicating ngrams_prefixed_by
      ngrams_with { |ngram| ngram.postfixed_by?(postfix) }
    end

    #
    # For each observed ngram ending with _gram_, returns the Array of
    # observed ngrams whose postfix is that ngram's prefix. The result
    # is an Array of Arrays.
    #
    def ngrams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngrams_postfixed_by(ngram.prefix)
      end
    end

    #
    # For each observed ngram starting with _gram_, returns the Array
    # of observed ngrams whose prefix is that ngram's postfix. The
    # result is an Array of Arrays.
    #
    def ngrams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngrams_prefixed_by(ngram.postfix)
      end
    end

    #
    # Returns the second-to-last gram of every observed ngram that ends
    # with _gram_.
    #
    def grams_preceeding(gram)
      ngrams_ending_with(gram).map do |ngram|
        ngram[-2]
      end
    end

    #
    # Returns the second gram of every observed ngram that starts with
    # _gram_.
    #
    def grams_following(gram)
      ngrams_starting_with(gram).map do |ngram|
        ngram[1]
      end
    end

    #
    # Joint probability of the ngrams of _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Joint probability of the ngrams of _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Joint probability of the ngrams of _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

    #
    # Joint probability of the observed ngrams of _fragment_.
    #
    def common_fragment_probability(fragment)
      probability_of_ngrams(common_ngrams_from_fragment(fragment))
    end

    #
    # Joint probability of the observed ngrams of _sentence_.
    #
    def common_sentence_probability(sentence)
      probability_of_ngrams(common_ngrams_from_sentence(sentence))
    end

    #
    # Joint probability of the observed ngrams of _text_.
    #
    def common_text_probability(text)
      # the parameter was named `fragment` while the body read `text`
      probability_of_ngrams(common_ngrams_from_text(text))
    end

    #
    # Product of this model's and the _other_ model's common
    # probability for _fragment_.
    #
    def similar_fragment_probability(other,fragment)
      common_fragment_probability(fragment) * other.common_fragment_probability(fragment)
    end

    #
    # Product of this model's and the _other_ model's common
    # probability for _sentence_.
    #
    def similar_sentence_probability(other,sentence)
      common_sentence_probability(sentence) * other.common_sentence_probability(sentence)
    end

    #
    # Product of this model's and the _other_ model's common
    # probability for _text_.
    #
    def similar_text_probability(other,text)
      common_text_probability(text) * other.common_text_probability(text)
    end

    #
    # Removes all prefix frequencies, ngram frequencies and
    # probabilities. Returns the model.
    #
    def clear
      @prefix_frequency.clear
      return super
    end

    protected

    #
    # Surrounds the parsed _sentence_ with +ngram_size+ start-sentence
    # tokens and +ngram_size+ stop-sentence tokens.
    #
    def wrap_sentence(sentence)
      (Tokens::StartSentence * @ngram_size) + sentence.to_a + (Tokens::StopSentence * @ngram_size)
    end

  end
end
@@ -0,0 +1,53 @@
1
module Raingrams
  #
  # An ordered sequence of grams. Every element is normalized with
  # +to_gram+ when the ngram is created.
  #
  class Ngram < Array

    # Builds an ngram from the elements of _objs_.
    def initialize(objs)
      grams = objs.map { |obj| obj.to_gram }
      super(grams)
    end

    class << self
      # Builds an ngram from the listed objects: <tt>Ngram[a, b]</tt>.
      def [](*objs)
        new(objs)
      end
    end

    # Everything but the last gram.
    def prefix
      slice(0...(length - 1))
    end

    # True when the prefix equals _ngram_.
    def prefixed_by?(ngram)
      leading = prefix
      leading == ngram
    end

    # Everything but the first gram.
    def postfix
      slice(1..-1)
    end

    # True when the postfix equals _ngram_.
    def postfixed_by?(ngram)
      trailing = postfix
      trailing == ngram
    end

    # True when the first gram equals the gram form of _obj_.
    def starts_with?(obj)
      first == obj.to_gram
    end

    # True when the last gram equals the gram form of _obj_.
    def ends_with?(obj)
      last == obj.to_gram
    end

    # True when the gram form of _obj_ is one of the grams.
    def include?(obj)
      gram = obj.to_gram
      super(gram)
    end

    # An ngram is never nested, so flattening is just a copy.
    def flatten
      dup
    end

    # An ngram is never nested, so there is nothing to flatten in place.
    def flatten!
      self
    end

    # The grams joined with ", ".
    def to_s
      join(', ')
    end

  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/bigram_model'
# the directory is lib/raingrams/open_vocabulary (with an underscore)
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    #
    # Raingrams::BigramModel with the OpenModel mixin included.
    #
    class BigramModel < Raingrams::BigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/hexagram_model'
# the directory is lib/raingrams/open_vocabulary (with an underscore)
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    #
    # Raingrams::HexagramModel with the OpenModel mixin included.
    #
    class HexagramModel < Raingrams::HexagramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
# both paths were missing their underscores and named files that the
# gem does not ship
require 'raingrams/multigram_model'
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    #
    # Raingrams::MultigramModel with the OpenModel mixin included.
    #
    class MultigramModel < Raingrams::MultigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,34 @@
1
require 'raingrams/ngram'
require 'raingrams/tokens/unknown'

module Raingrams
  module OpenVocabulary
    #
    # Mixin that gives a model an open vocabulary: while training, any
    # gram that is not part of the model's lexicon is replaced with the
    # Tokens::Unknown token.
    #
    module OpenModel

      # The fixed lexicon of this model
      attr_reader :lexicon

      #
      # Stores the <tt>:lexicon</tt> option (defaults to an empty Array)
      # and passes _opts_ and the optional _block_ on to the model class
      # the mixin is included into.
      #
      def initialize(opts={},&block)
        # set before super: the superclass initializer may already train
        @lexicon = opts[:lexicon] || []

        super(opts,&block)
      end

      #
      # Returns +true+ if the lexicon includes the given _gram_. The
      # comparison is done on the lexicon entries exactly as supplied.
      #
      def within_lexicon?(gram)
        @lexicon.include?(gram)
      end

      #
      # Trains the model with the given _ngram_ after replacing every
      # gram that is outside of the lexicon with Tokens::Unknown.
      #
      # This has to override +train_with_ngram+, the method all other
      # training methods funnel through; under its former name
      # +train_ngram+ it was never reached and its +super+ call had no
      # method to dispatch to.
      #
      def train_with_ngram(ngram)
        known = ngram.map do |gram|
          if within_lexicon?(gram)
            gram
          else
            Tokens::Unknown
          end
        end

        # map yields a plain Array; multigram models need a real Ngram
        # because they ask it for its prefix
        return super(Ngram.new(known))
      end

      #
      # Former name of train_with_ngram, kept for compatibility.
      #
      def train_ngram(ngram)
        train_with_ngram(ngram)
      end

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/pentagram_model'
# the directory is lib/raingrams/open_vocabulary (with an underscore)
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    #
    # Raingrams::PentagramModel with the OpenModel mixin included.
    #
    class PentagramModel < Raingrams::PentagramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/quadgram_model'
# the directory is lib/raingrams/open_vocabulary (with an underscore)
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    #
    # Raingrams::QuadgramModel with the OpenModel mixin included.
    #
    class QuadgramModel < Raingrams::QuadgramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/trigram_model'
# the directory is lib/raingrams/open_vocabulary (with an underscore)
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    #
    # Raingrams::TrigramModel with the OpenModel mixin included.
    #
    class TrigramModel < Raingrams::TrigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,12 @@
1
require 'raingrams/unigram_model'
# the directory is lib/raingrams/open_vocabulary (with an underscore)
require 'raingrams/open_vocabulary/open_model'

module Raingrams
  module OpenVocabulary
    #
    # Raingrams::UnigramModel with the OpenModel mixin included.
    #
    class UnigramModel < Raingrams::UnigramModel

      include OpenModel

    end
  end
end
@@ -0,0 +1,7 @@
1
# Loads every open vocabulary model. The files live in
# lib/raingrams/open_vocabulary/ (with an underscore).
require 'raingrams/open_vocabulary/unigram_model'
require 'raingrams/open_vocabulary/multigram_model'
require 'raingrams/open_vocabulary/bigram_model'
require 'raingrams/open_vocabulary/trigram_model'
require 'raingrams/open_vocabulary/quadgram_model'
require 'raingrams/open_vocabulary/pentagram_model'
require 'raingrams/open_vocabulary/hexagram_model'
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
module Raingrams
  #
  # A MultigramModel whose ngram size is fixed at five.
  #
  class PentagramModel < MultigramModel

    #
    # Creates a new pentagram model. Any +:ngram_size+ given in _opts_
    # is overridden with 5; all other options and the optional _block_
    # are handed on to the superclass initializer.
    #
    def initialize(opts={},&block)
      # merge into a copy so the caller's options hash is left untouched
      super(opts.merge(:ngram_size => 5),&block)
    end

  end
end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
module Raingrams
  #
  # A MultigramModel whose ngram size is fixed at four.
  #
  class QuadgramModel < MultigramModel

    #
    # Creates a new quadgram model. Any +:ngram_size+ given in _opts_ is
    # overridden with 4; all other options and the optional _block_ are
    # handed on to the superclass initializer.
    #
    def initialize(opts={},&block)
      # merge into a copy so the caller's options hash is left untouched
      super(opts.merge(:ngram_size => 4),&block)
    end

  end
end
@@ -0,0 +1,31 @@
1
+ require 'raingrams/unigram_model'
2
+ require 'raingrams/multigram_model'
3
+ require 'raingrams/open_vocabulary/unigram_model'
4
+ require 'raingrams/open_vocabulary/multigram_model'
5
+
6
module Raingrams
  #
  # Creates a closed vocabulary model: a UnigramModel when
  # <tt>opts[:ngram_size]</tt> is 1, a MultigramModel otherwise.
  # _opts_ and _block_ are passed to the model's initializer.
  #
  def self.closed_vocabulary_model(opts={},&block)
    model_class = (opts[:ngram_size] == 1) ? UnigramModel : MultigramModel

    model_class.new(opts,&block)
  end

  #
  # Creates an open vocabulary model: an OpenVocabulary::UnigramModel
  # when <tt>opts[:ngram_size]</tt> is 1, an
  # OpenVocabulary::MultigramModel otherwise. _opts_ and _block_ are
  # passed to the model's initializer.
  #
  def self.open_vocabulary_model(opts={},&block)
    model_class = if opts[:ngram_size] == 1
                    OpenVocabulary::UnigramModel
                  else
                    OpenVocabulary::MultigramModel
                  end

    model_class.new(opts,&block)
  end

  #
  # Creates an open vocabulary model when <tt>opts[:vocabulary]</tt>
  # is <tt>:open</tt> or <tt>'open'</tt>, and a closed vocabulary
  # model for any other value.
  #
  def self.model(opts={},&block)
    if [:open, 'open'].include?(opts[:vocabulary])
      open_vocabulary_model(opts,&block)
    else
      closed_vocabulary_model(opts,&block)
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/tokens/token'
2
+
3
module Raingrams
  module Tokens
    #
    # Token marking the start of a sentence.
    #
    class StartSentence < Token

      class << self
        # Textual form of the token.
        def to_s
          '<s>'
        end
      end

    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/tokens/token'
2
+
3
module Raingrams
  module Tokens
    #
    # Token marking the end of a sentence.
    #
    class StopSentence < Token

      class << self
        # Textual form of the token.
        def to_s
          '</s>'
        end
      end

    end
  end
end
@@ -0,0 +1,19 @@
1
module Raingrams
  module Tokens
    #
    # Base class of the special tokens. Tokens are used through the
    # class itself, so all behavior is defined at the class level.
    #
    class Token

      class << self
        # An Array repeating the token class, as <tt>[token] * length</tt>.
        def *(length)
          [self] * length
        end

        # The Symbol form of the token's textual form.
        def to_sym
          to_s.to_sym
        end

        # Tokens inspect as their textual form.
        def inspect
          to_s
        end
      end

    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'raingrams/tokens/token'
2
+
3
module Raingrams
  module Tokens
    #
    # Token standing in for an unknown gram.
    #
    class Unknown < Token

      class << self
        # Textual form of the token.
        def to_s
          '<unknown>'
        end
      end

    end
  end
end
@@ -0,0 +1,4 @@
1
+ require 'raingrams/tokens/token'
2
+ require 'raingrams/tokens/start_sentence'
3
+ require 'raingrams/tokens/stop_sentence'
4
+ require 'raingrams/tokens/unknown'
@@ -0,0 +1,13 @@
1
+ require 'raingrams/multigram_model'
2
+
3
module Raingrams
  #
  # A MultigramModel whose ngram size is fixed at three.
  #
  class TrigramModel < MultigramModel

    #
    # Creates a new trigram model. Any +:ngram_size+ given in _opts_ is
    # overridden with 3; all other options and the optional _block_ are
    # handed on to the superclass initializer.
    #
    def initialize(opts={},&block)
      # merge into a copy so the caller's options hash is left untouched
      super(opts.merge(:ngram_size => 3),&block)
    end

  end
end
@@ -0,0 +1,70 @@
1
+ require 'raingrams/model'
2
+
3
module Raingrams
  #
  # Model for ngrams of a single gram. The probability of an ngram is
  # its frequency divided by the total number of observed grams.
  #
  class UnigramModel < Model

    #
    # Creates a new unigram model. Any +:ngram_size+ given in _opts_ is
    # overridden with 1. The model is passed to the optional _block_
    # for training, after which the probability table is built.
    #
    def initialize(opts={},&block)
      # merge into a copy so the caller's options hash is left untouched
      super(opts.merge(:ngram_size => 1)) { |model| model.build(&block) }
    end

    #
    # Returns one single-gram Ngram per word in _words_.
    #
    def ngrams_from_words(words)
      words.map { |word| Ngram[word] }
    end

    #
    # Returns the ngrams of the parsed _fragment_.
    #
    def ngrams_from_fragment(fragment)
      ngrams_from_words(parse_sentence(fragment))
    end

    #
    # Returns the ngrams of the parsed _sentence_; identical to
    # ngrams_from_fragment, as no start/stop tokens are added.
    #
    def ngrams_from_sentence(sentence)
      ngrams_from_fragment(sentence)
    end

    #
    # Returns the ngrams of every sentence found in _text_.
    #
    def ngrams_from_text(text)
      parse_text(text).inject([]) do |ngrams,sentence|
        ngrams + ngrams_from_sentence(sentence)
      end
    end

    #
    # Trains the model with the ngrams of _sentence_.
    #
    def train_with_sentence(sentence)
      train_with_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Trains the model with the ngrams of _text_.
    #
    def train_with_text(text)
      train_with_ngrams(ngrams_from_text(text))
    end

    #
    # Returns the total number of grams observed, 0 for an untrained
    # model.
    #
    def gram_count
      # seeded with 0: an unseeded inject returns nil on an empty table
      @frequency.values.inject(0) do |sum,count|
        sum + count
      end
    end

    #
    # Clears the probability table, yields the model to the optional
    # _block_ (for training) and then recomputes the probability of
    # every observed ngram. Returns the model.
    #
    def build(&block)
      clear_probabilities

      block.call(self) if block

      total_count = gram_count.to_f
      @frequency.each do |ngram,count|
        @probability[ngram] = count.to_f / total_count
      end

      return self
    end

    #
    # Joint probability of the ngrams of _fragment_.
    #
    def fragment_probability(fragment)
      probability_of_ngrams(ngrams_from_fragment(fragment))
    end

    #
    # Joint probability of the ngrams of _sentence_.
    #
    def sentence_probability(sentence)
      probability_of_ngrams(ngrams_from_sentence(sentence))
    end

    #
    # Joint probability of the ngrams of _text_.
    #
    def text_probability(text)
      probability_of_ngrams(ngrams_from_text(text))
    end

  end
end
@@ -0,0 +1,3 @@
1
module Raingrams
  # Version of the raingrams library; also read by the Rakefile.
  VERSION = '0.0.9'
end
data/lib/raingrams.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'raingrams/extensions'
2
+ require 'raingrams/raingrams'
3
+ require 'raingrams/ngram'
4
+ require 'raingrams/unigram_model'
5
+ require 'raingrams/bigram_model'
6
+ require 'raingrams/trigram_model'
7
+ require 'raingrams/quadgram_model'
8
+ require 'raingrams/pentagram_model'
9
+ require 'raingrams/hexagram_model'
10
+ require 'raingrams/raingrams'
File without changes
metadata ADDED
@@ -0,0 +1,99 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: raingrams
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.0.9
7
+ date: 2008-01-09 00:00:00 -08:00
8
+ summary: Raingrams is a flexible and general-purpose ngrams library written in Ruby
9
+ require_paths:
10
+ - lib
11
+ email: postmodern.mod3@gmail.com
12
+ homepage: " by Postmodern Modulus III"
13
+ rubyforge_project: raingrams
14
+ description: "== FEATURES/PROBLEMS: * Supports all non-zero ngram sizes. * Supports text and non-text grams. * Supports Open and Closed vocabulary models. == REQUIREMENTS: == INSTALL: $ sudo gem install raingrams"
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Postmodern Modulus III
31
+ files:
32
+ - History.txt
33
+ - LICENSE.txt
34
+ - Manifest.txt
35
+ - README.txt
36
+ - Rakefile
37
+ - lib/raingrams.rb
38
+ - lib/raingrams/version.rb
39
+ - lib/raingrams/raingrams.rb
40
+ - lib/raingrams/exceptions/prefix_frequency_missing.rb
41
+ - lib/raingrams/exceptions.rb
42
+ - lib/raingrams/extensions/class.rb
43
+ - lib/raingrams/extensions/false_class.rb
44
+ - lib/raingrams/extensions/nil_class.rb
45
+ - lib/raingrams/extensions/object.rb
46
+ - lib/raingrams/extensions/string.rb
47
+ - lib/raingrams/extensions/symbol.rb
48
+ - lib/raingrams/extensions/true_class.rb
49
+ - lib/raingrams/extensions.rb
50
+ - lib/raingrams/tokens/token.rb
51
+ - lib/raingrams/tokens/start_sentence.rb
52
+ - lib/raingrams/tokens/stop_sentence.rb
53
+ - lib/raingrams/tokens/unknown.rb
54
+ - lib/raingrams/tokens.rb
55
+ - lib/raingrams/ngram.rb
56
+ - lib/raingrams/model.rb
57
+ - lib/raingrams/unigram_model.rb
58
+ - lib/raingrams/multigram_model.rb
59
+ - lib/raingrams/bigram_model.rb
60
+ - lib/raingrams/trigram_model.rb
61
+ - lib/raingrams/quadgram_model.rb
62
+ - lib/raingrams/pentagram_model.rb
63
+ - lib/raingrams/hexagram_model.rb
64
+ - lib/raingrams/open_vocabulary/open_model.rb
65
+ - lib/raingrams/open_vocabulary/unigram_model.rb
66
+ - lib/raingrams/open_vocabulary/multigram_model.rb
67
+ - lib/raingrams/open_vocabulary/bigram_model.rb
68
+ - lib/raingrams/open_vocabulary/trigram_model.rb
69
+ - lib/raingrams/open_vocabulary/quadgram_model.rb
70
+ - lib/raingrams/open_vocabulary/pentagram_model.rb
71
+ - lib/raingrams/open_vocabulary/hexagram_model.rb
72
+ - lib/raingrams/open_vocabulary.rb
73
+ - test/test_raingrams.rb
74
+ test_files:
75
+ - test/test_raingrams.rb
76
+ rdoc_options:
77
+ - --main
78
+ - README.txt
79
+ extra_rdoc_files:
80
+ - History.txt
81
+ - LICENSE.txt
82
+ - Manifest.txt
83
+ - README.txt
84
+ executables: []
85
+
86
+ extensions: []
87
+
88
+ requirements: []
89
+
90
+ dependencies:
91
+ - !ruby/object:Gem::Dependency
92
+ name: hoe
93
+ version_requirement:
94
+ version_requirements: !ruby/object:Gem::Version::Requirement
95
+ requirements:
96
+ - - ">="
97
+ - !ruby/object:Gem::Version
98
+ version: 1.4.0
99
+ version: