omnicat-bayes 0.2.2 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9226b652056d418b7f374a46c15e55c146f9f421
4
+ data.tar.gz: e370533de84b92342b0cc202ecf50c978749c9a4
5
+ SHA512:
6
+ metadata.gz: ecae56eab0769f6d582114d13326ec5e483431ac8f33ff2479578dff56a759e482e61e120e03619123c65ca78419f303cfb0bd685f8d712142a6e502fc3f6c28
7
+ data.tar.gz: 3cbf521720c7f804a70aacde837cac67599de3fd8e190540e4272909c085ebcedc3c1f5e322fb7f2160263432f22b17cc19f8d7f769a8b6b8be4cb75e1346ece
data/README.md CHANGED
@@ -27,11 +27,15 @@ See rdoc for detailed usage.
27
27
  Optional configuration sample:
28
28
 
29
29
  OmniCat.configure do |config|
30
+ # you can enable auto train mode by setting it to :unique or :continues
31
+ # unique: only unique docs will be added to training docs on prediction
32
+ # continues: always add docs to training docs on prediction
33
+ config.auto_train = :off
30
34
  config.exclude_tokens = ['something', 'anything'] # exclude token list
31
35
  config.token_patterns = {
32
- # exclude token Regex patterns
36
+ # exclude tokens with Regex patterns
33
37
  minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
34
- # include token Regex patterns
38
+ # include tokens with Regex patterns
35
39
  plus: [/[\p{L}\-0-9]{2,}/, /[\!\?]/, /[\:\)\(\;\-\|]{2,3}/]
36
40
  }
37
41
  end
@@ -96,9 +100,14 @@ Untrain category with multiple documents.
96
100
  Classify a document.
97
101
 
98
102
  result = bayes.classify('I feel so good and happy')
99
- => #<OmniCat::Result:0x007fd20296aad8 @category={:name=>"positive", :percentage=>73}, @scores={"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, @total_score=7.385426829679431e-09>
103
+ => #<OmniCat::Result:0x007febb152af68 @top_score_key="positive", @scores={"positive"=>#<OmniCat::Score:0x007febb152add8 @key="positive", @value=6.813226744186048e-09, @percentage=58>, "negative"=>#<OmniCat::Score:0x007febb152ac70 @key="negative", @value=4.875003449064939e-09, @percentage=42>}, @total_score=1.1688230193250986e-08>
100
104
  result.to_hash
101
- => {:category=>{:name=>"positive", :percentage=>73}, :scores=>{"positive"=>5.4253472222222225e-09, "negative"=>1.9600796074572086e-09}, :total_score=>7.385426829679431e-09}
105
+ => {:top_score_key=>"positive", :scores=>{"positive"=>{:key=>"positive", :value=>6.813226744186048e-09, :percentage=>58}, "negative"=>{:key=>"negative", :value=>4.875003449064939e-09, :percentage=>42}}, :total_score=>1.1688230193250986e-08}
106
+ result.top_score
107
+ => #<OmniCat::Score:0x007febb152add8 @key="positive", @value=6.813226744186048e-09, @percentage=58>
108
+ result.top_score.to_hash
109
+ => {:key=>"positive", :value=>6.813226744186048e-09, :percentage=>58}
110
+
102
111
 
103
112
  ### Classify batch
104
113
  Classify multiple documents at a time.
@@ -109,25 +118,26 @@ Classify multiple documents at a time.
109
118
  'a good piece of work'
110
119
  ]
111
120
  )
112
- => [#<OmniCat::Result:0x007fd2029341b8 @category={:name=>"negative", :percentage=>78}, @scores={"positive"=>2.5521869888765736e-14, "negative"=>9.074442627116706e-14}, @total_score=1.162662961599328e-13>, #<OmniCat::Result:0x007fd20292e7e0 @category={:name=>"positive", :percentage=>80}, @scores={"positive"=>2.411265432098765e-07, "negative"=>5.880238822371627e-08}, @total_score=2.999289314335928e-07>]
121
+ => [#<OmniCat::Result:0x007febb14f3680 @top_score_key="negative", @scores={"positive"=>#<OmniCat::Score:0x007febb14f34a0 @key="positive", @value=7.971480930520432e-14, @percentage=22>, "negative"=>#<OmniCat::Score:0x007febb14f32c0 @key="negative", @value=2.834304330851709e-13, @percentage=78>}, @total_score=3.6314524239037524e-13>, #<OmniCat::Result:0x007febb14f2aa0 @top_score_key="positive", @scores={"positive"=>#<OmniCat::Score:0x007febb14f2960 @key="positive", @value=3.802731206057328e-07, @percentage=72>, "negative"=>#<OmniCat::Score:0x007febb14f2820 @key="negative", @value=1.4625010347194818e-07, @percentage=28>}, @total_score=5.26523224077681e-07>]
113
122
 
114
123
  ### Convert to hash
115
124
  Convert full Bayes object to hash.
116
125
 
117
126
  # For storing, restoring model data
118
127
  bayes_hash = bayes.to_hash
119
- => {:categories=>{"positive"=>{:doc_count=>4, :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37}, "negative"=>{:doc_count=>4, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17}}, :category_count=>2, :doc_count=>8, :k_value=>1.0, :token_count=>54, :uniq_token_count=>43}
128
+ => {:categories=>{"positive"=>{:doc_count=>4, :docs=>{"28fd29bbf840c86db65e510ff3cd07a9"=>{:content=>"great if you are in a slap happy mood .", :content_md5=>"28fd29bbf840c86db65e510ff3cd07a9", :count=>1, :tokens=>{"great"=>1, "if"=>1, "you"=>1, "are"=>1, "in"=>1, "slap"=>1, "happy"=>1, "mood"=>1}}, "82b4cd9513f448dea0024f2d0e2ccd44"=>{:content=>"a feel-good picture in the best sense of the term...", :content_md5=>"82b4cd9513f448dea0024f2d0e2ccd44", :count=>1, :tokens=>{"feel-good"=>1, "picture"=>1, "in"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>1, "term"=>1}}, "f917bf1cf1256c78c5436d850dab3104"=>{:content=>"it is a feel-good movie about which you can actually feel good.", :content_md5=>"f917bf1cf1256c78c5436d850dab3104", :count=>1, :tokens=>{"it"=>1, "is"=>1, "feel-good"=>1, "movie"=>1, "about"=>1, "which"=>1, "you"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>1}}, "4343bbe84c035733708c3f58136f321e"=>{:content=>"love and money both of them are good choises", :content_md5=>"4343bbe84c035733708c3f58136f321e", :count=>1, :tokens=>{"love"=>1, "and"=>1, "money"=>1, "both"=>1, "of"=>1, "them"=>1, "are"=>1, "good"=>1, "choises"=>1}}}, :name=>"positive", :tokens=>{"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, :token_count=>37, :prior=>0.5}, "negative"=>{:doc_count=>4, :docs=>{"89b36e774579662591ea21b3283d9b35"=>{:content=>"bad tracking issue", :content_md5=>"89b36e774579662591ea21b3283d9b35", :count=>1, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1}}, "b0ec48bc87527e285b26d6cce8e278e7"=>{:content=>"simplistic , silly and tedious .", :content_md5=>"b0ec48bc87527e285b26d6cce8e278e7", :count=>1, :tokens=>{"simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1}}, 
"ae9d4fbaf40906614ca712a888648c5f"=>{:content=>"interesting , but not compelling . ", :content_md5=>"ae9d4fbaf40906614ca712a888648c5f", :count=>1, :tokens=>{"interesting"=>1, "but"=>1, "not"=>1, "compelling"=>1}}, "0e495f5d88d8049746a1b6961bf3cc90"=>{:content=>"seems clever but not especially compelling", :content_md5=>"0e495f5d88d8049746a1b6961bf3cc90", :count=>1, :tokens=>{"seems"=>1, "clever"=>1, "but"=>1, "not"=>1, "especially"=>1, "compelling"=>1}}}, :name=>"negative", :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, :token_count=>17, :prior=>0.5}}, :category_count=>2, :category_size_limit=>0, :doc_count=>8, :token_count=>54, :unique_token_count=>43, :k_value=>1.0}
120
129
 
121
130
  ### Load from hash
122
131
  Load full Bayes object from hash.
123
132
 
124
133
  another_bayes_obj = OmniCat::Classifiers::Bayes.new(bayes_hash)
125
- => #<OmniCat::Classifiers::Bayes:0x007fd20308cff0 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf78 @doc_count=4, @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007fd20308cf00 @doc_count=4, @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17>}, @category_count=2, @doc_count=8, @k_value=1.0, @token_count=54, @uniq_token_count=43>
134
+ => #<OmniCat::Classifiers::Bayes:0x007febb14d15a8 @categories={"positive"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007febb14d1530 @doc_count=4, @docs={"28fd29bbf840c86db65e510ff3cd07a9"=>{:content=>"great if you are in a slap happy mood .", :content_md5=>"28fd29bbf840c86db65e510ff3cd07a9", :count=>1, :tokens=>{"great"=>1, "if"=>1, "you"=>1, "are"=>1, "in"=>1, "slap"=>1, "happy"=>1, "mood"=>1}}, "82b4cd9513f448dea0024f2d0e2ccd44"=>{:content=>"a feel-good picture in the best sense of the term...", :content_md5=>"82b4cd9513f448dea0024f2d0e2ccd44", :count=>1, :tokens=>{"feel-good"=>1, "picture"=>1, "in"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>1, "term"=>1}}, "f917bf1cf1256c78c5436d850dab3104"=>{:content=>"it is a feel-good movie about which you can actually feel good.", :content_md5=>"f917bf1cf1256c78c5436d850dab3104", :count=>1, :tokens=>{"it"=>1, "is"=>1, "feel-good"=>1, "movie"=>1, "about"=>1, "which"=>1, "you"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>1}}, "4343bbe84c035733708c3f58136f321e"=>{:content=>"love and money both of them are good choises", :content_md5=>"4343bbe84c035733708c3f58136f321e", :count=>1, :tokens=>{"love"=>1, "and"=>1, "money"=>1, "both"=>1, "of"=>1, "them"=>1, "are"=>1, "good"=>1, "choises"=>1}}}, @name="positive", @tokens={"great"=>1, "if"=>1, "you"=>2, "are"=>2, "in"=>2, "slap"=>1, "happy"=>1, "mood"=>1, "feel-good"=>2, "picture"=>1, "the"=>2, "best"=>1, "sense"=>1, "of"=>2, "term"=>1, "it"=>1, "is"=>1, "movie"=>1, "about"=>1, "which"=>1, "can"=>1, "actually"=>1, "feel"=>1, "good"=>2, "love"=>1, "and"=>1, "money"=>1, "both"=>1, "them"=>1, "choises"=>1}, @token_count=37, @prior=0.5>, "negative"=>#<OmniCat::Classifiers::BayesInternals::Category:0x007febb14d14e0 @doc_count=4, @docs={"89b36e774579662591ea21b3283d9b35"=>{:content=>"bad tracking issue", :content_md5=>"89b36e774579662591ea21b3283d9b35", :count=>1, :tokens=>{"bad"=>1, "tracking"=>1, "issue"=>1}}, "b0ec48bc87527e285b26d6cce8e278e7"=>{:content=>"simplistic , 
silly and tedious .", :content_md5=>"b0ec48bc87527e285b26d6cce8e278e7", :count=>1, :tokens=>{"simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1}}, "ae9d4fbaf40906614ca712a888648c5f"=>{:content=>"interesting , but not compelling . ", :content_md5=>"ae9d4fbaf40906614ca712a888648c5f", :count=>1, :tokens=>{"interesting"=>1, "but"=>1, "not"=>1, "compelling"=>1}}, "0e495f5d88d8049746a1b6961bf3cc90"=>{:content=>"seems clever but not especially compelling", :content_md5=>"0e495f5d88d8049746a1b6961bf3cc90", :count=>1, :tokens=>{"seems"=>1, "clever"=>1, "but"=>1, "not"=>1, "especially"=>1, "compelling"=>1}}}, @name="negative", @tokens={"bad"=>1, "tracking"=>1, "issue"=>1, "simplistic"=>1, "silly"=>1, "and"=>1, "tedious"=>1, "interesting"=>1, "but"=>2, "not"=>2, "compelling"=>2, "seems"=>1, "clever"=>1, "especially"=>1}, @token_count=17, @prior=0.5>}, @category_count=2, @category_size_limit=0, @doc_count=8, @token_count=54, @unique_token_count=43, @k_value=1.0>
126
135
  another_bayes_obj.classify('best senses')
127
- => #<OmniCat::Result:0x007fd203075008 @category={:name=>"positive", :percentage=>57}, @scores={"positive"=>0.0002314814814814815, "negative"=>0.00017146776406035664}, @total_score=0.00040294924554183816>
136
+ => #<OmniCat::Result:0x007febb14c0fc8 @top_score_key="positive", @scores={"positive"=>#<OmniCat::Score:0x007febb14c0ed8 @key="positive", @value=0.00029069767441860465, @percentage=52>, "negative"=>#<OmniCat::Score:0x007febb14c0de8 @key="negative", @value=0.0002704164413196322, @percentage=48>}, @total_score=0.0005611141157382368>
128
137
 
129
- ## Todo
130
- * Implement all OmniCat(http://github.com/mustafaturan/omnicat) classifier strategy abstract methods
138
+ ### Best practices
139
+ For Bayes classification, always try to train the same number of documents for each category. So, do not activate auto training mode, because it upsets the balance of trained docs across categories and makes the algorithm go crazy :).
140
+ To get the best results in text classification, you should apply some cleaning steps such as spellchecking, stemming, and stop-word removal before training and prediction.
131
141
 
132
142
  ## Contributing
133
143
 
@@ -1,5 +1,5 @@
1
1
  module Omnicat
2
2
  module Bayes
3
- VERSION = '0.2.2'
3
+ VERSION = '0.3.0'
4
4
  end
5
5
  end
@@ -33,7 +33,7 @@ module OmniCat
33
33
  "Category with name '#{category_name}' is already exists!"
34
34
  else
35
35
  increment_category_count
36
- @categories[category_name] = ::OmniCat::Classifiers::BayesInternals::Category.new
36
+ @categories[category_name] = ::OmniCat::Classifiers::BayesInternals::Category.new(name: category_name)
37
37
  end
38
38
  end
39
39
 
@@ -52,24 +52,14 @@ module OmniCat
52
52
  # bayes.train("negative", "bad dog")
53
53
  # bayes.train("neutral", "how is the management gui")
54
54
  def train(category_name, doc_content)
55
- if category_exists?(category_name)
56
- increment_doc_counts(category_name)
57
- update_priors
58
- doc_key = Digest::MD5.hexdigest(doc_content)
59
- if doc = @categories[category_name].docs[doc_key]
60
- doc.increment_count
61
- else
62
- doc = OmniCat::Doc.new(content: doc_content)
63
- end
64
- @categories[category_name].docs[doc_key] = doc
65
- doc.tokens.each do |token, count|
66
- increment_token_counts(category_name, token, count)
67
- @categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i + count
68
- end
69
- else
70
- raise StandardError,
71
- "Category with name '#{category_name}' does not exist!"
55
+ category_must_exist(category_name)
56
+ doc = add_doc(category_name, doc_content)
57
+ doc.tokens.each do |token, count|
58
+ increment_token_counts(category_name, token, count)
59
+ @categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i + count
72
60
  end
61
+ increment_doc_counts(category_name)
62
+ update_priors
73
63
  end
74
64
 
75
65
  # Untrain the desired category with a document
@@ -87,26 +77,15 @@ module OmniCat
87
77
  # bayes.untrain("negative", "bad dog")
88
78
  # bayes.untrain("neutral", "how is the management gui")
89
79
  def untrain(category_name, doc_content)
90
- if category_exists?(category_name)
91
- doc_key = Digest::MD5.hexdigest(doc_content)
92
- if doc = @categories[category_name].docs[doc_key]
93
- @categories[category_name].docs[doc_key].decrement_count
94
- else
95
- raise StandardError,
96
- "Document is not found in #{category_name} documents!"
97
- end
98
- doc.tokens.each do |token, count|
99
- @categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i - count
100
- @categories[category_name].tokens.delete(token) if @categories[category_name].tokens[token] == 0
101
- decrement_token_counts(category_name, token, count)
102
- end
103
- @categories[category_name].docs.delete(doc_key) if @categories[category_name].docs[doc_key].count == 0
104
- decrement_doc_counts(category_name)
105
- update_priors
106
- else
107
- raise StandardError,
108
- "Category with name '#{category_name}' does not exist!"
80
+ category_must_exist(category_name)
81
+ doc = remove_doc(category_name, doc_content)
82
+ doc.tokens.each do |token, count|
83
+ @categories[category_name].tokens[token] = @categories[category_name].tokens[token].to_i - count
84
+ @categories[category_name].tokens.delete(token) if @categories[category_name].tokens[token] == 0
85
+ decrement_token_counts(category_name, token, count)
109
86
  end
87
+ decrement_doc_counts(category_name)
88
+ update_priors
110
89
  end
111
90
 
112
91
  # Classify the given document
@@ -126,25 +105,32 @@ module OmniCat
126
105
  # =>
127
106
  def classify(doc_content)
128
107
  return unless classifiable?
129
- score = -1000000
108
+ doc = ::OmniCat::Doc.new(content: doc_content)
130
109
  result = ::OmniCat::Result.new
131
110
  @categories.each do |category_name, category|
132
- result.scores[category_name] = doc_probability(category, doc_content)
133
- if result.scores[category_name] > score
134
- result.category[:name] = category_name
135
- score = result.scores[category_name]
136
- end
137
- result.total_score += result.scores[category_name]
111
+ result.add_score(
112
+ Score.new(
113
+ key: category.name,
114
+ value: doc_probability(category, doc)
115
+ )
116
+ )
138
117
  end
139
- result.total_score = 1 if result.total_score == 0
140
- result.category[:percentage] = (
141
- result.scores[result.category[:name]] * 100.0 /
142
- result.total_score
143
- ).floor
118
+ auto_train(@categories[result.top_score.key], doc)
119
+ result.calculate_percentages
144
120
  result
145
121
  end
146
122
 
147
123
  private
124
+ # nodoc
125
+ def auto_train(category, doc)
126
+ case ::OmniCat.config.auto_train
127
+ when :continues
128
+ train(category.name, doc.content)
129
+ when :unique
130
+ train(category.name, doc.content) unless category.docs.has_key?(doc.content_md5)
131
+ end
132
+ end
133
+
148
134
  # nodoc
149
135
  def update_priors
150
136
  @categories.each do |_, category|
@@ -164,36 +150,35 @@ module OmniCat
164
150
 
165
151
  # nodoc
166
152
  def modify_token_counts(category_name, token, count)
167
- modify_uniq_token_count(token, count < 0 ? -1 : 1)
153
+ modify_unique_token_count(token, count < 0 ? -1 : 1)
168
154
  @token_count += count
169
155
  @categories[category_name].token_count += count
170
156
  end
171
157
 
172
158
  # nodoc
173
- def increment_uniq_token_count(token)
174
- modify_uniq_token_count(token, 1)
159
+ def increment_unique_token_count(token)
160
+ modify_unique_token_count(token, 1)
175
161
  end
176
162
 
177
163
  # nodoc
178
- def decrement_uniq_token_count(token)
179
- modify_uniq_token_count(token, -1)
164
+ def decrement_unique_token_count(token)
165
+ modify_unique_token_count(token, -1)
180
166
  end
181
167
 
182
168
  # nodoc
183
- def modify_uniq_token_count(token, uniq_token_addition)
169
+ def modify_unique_token_count(token, uniq_token_addition)
184
170
  @categories.each do |_, category|
185
171
  if category.tokens.has_key?(token)
186
172
  uniq_token_addition = 0
187
173
  break
188
174
  end
189
175
  end
190
- @uniq_token_count += uniq_token_addition
176
+ @unique_token_count += uniq_token_addition
191
177
  end
192
178
 
193
179
  # nodoc
194
- def doc_probability(category, doc_content)
195
- score = k_value
196
- doc = OmniCat::Doc.new(content: doc_content)
180
+ def doc_probability(category, doc)
181
+ score = @k_value
197
182
  doc.tokens.each do |token, count|
198
183
  score *= token_probability(category, token, count)
199
184
  end
@@ -203,14 +188,51 @@ module OmniCat
203
188
  # nodoc
204
189
  def token_probability(category, token, count)
205
190
  if category.tokens[token].to_i == 0
206
- k_value / token_count
191
+ @k_value / (@unique_token_count * count)
207
192
  else
208
193
  count * (
209
- (category.tokens[token].to_i + k_value) /
210
- (category.token_count + uniq_token_count)
194
+ (category.tokens[token].to_i + @k_value) /
195
+ (category.token_count + @unique_token_count)
211
196
  )
212
197
  end
213
198
  end
199
+
200
+ # nodoc
201
+ def add_doc(category_name, doc_content)
202
+ doc_key = generate_doc_key(doc_content)
203
+ if doc = @categories[category_name].docs[doc_key]
204
+ doc.increment_count
205
+ else
206
+ @categories[category_name].docs[doc_key] = ::OmniCat::Doc.new(content: doc_content)
207
+ end
208
+ @categories[category_name].docs[doc_key]
209
+ end
210
+
211
+ # nodoc
212
+ def remove_doc(category_name, doc_content)
213
+ doc_key = generate_doc_key(doc_content)
214
+ doc = @categories[category_name].docs[doc_key]
215
+ unless doc
216
+ raise StandardError,
217
+ "Document is not found in #{category_name} documents!"
218
+ end
219
+ doc.decrement_count
220
+ @categories[category_name].docs.delete(doc_key) if doc.count == 0
221
+ doc
222
+ end
223
+
224
+ # nodoc
225
+ def generate_doc_key(doc_content)
226
+ Digest::MD5.hexdigest(doc_content)
227
+ end
228
+
229
+ # nodoc
230
+ def category_must_exist(category_name)
231
+ unless category_exists?(category_name)
232
+ raise StandardError,
233
+ "Category with name '#{category_name}' does not exist!"
234
+ end
235
+ end
214
236
  end
215
237
  end
216
238
  end
@@ -18,8 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ['lib']
20
20
 
21
- spec.add_dependency 'hashable', '~> 0.1.1'
22
- spec.add_dependency 'omnicat', '~> 0.2.2'
21
+ spec.add_dependency 'omnicat', '~> 0.3.0'
23
22
  spec.add_development_dependency 'bundler', '~> 1.3'
24
23
  spec.add_development_dependency 'rake'
25
24
  end
@@ -4,6 +4,7 @@ require File.expand_path(File.join(File.dirname(__FILE__), '..', '..', 'test_hel
4
4
  class TestBayes < Test::Unit::TestCase
5
5
  def setup
6
6
  OmniCat.configure do |config|
7
+ config.auto_train = :off
7
8
  config.exclude_tokens = ['are', 'at', 'by']
8
9
  config.token_patterns = {
9
10
  minus: [/[\s\t\n\r]+/, /(@[\w\d]+)/],
@@ -82,7 +83,7 @@ class TestBayes < Test::Unit::TestCase
82
83
  @bayes.categories['neutral'].token_count
83
84
  )
84
85
  end
85
-
86
+
86
87
  def test_untrain_with_doc_count_2
87
88
  @bayes.add_category 'neutral'
88
89
  @bayes.train 'neutral', 'how are you?? : :| :) ;-) :('
@@ -112,7 +113,8 @@ class TestBayes < Test::Unit::TestCase
112
113
  def test_untrain_with_missing_doc
113
114
  @bayes.add_category 'neutral'
114
115
  assert_raise(StandardError) {
115
- @bayes.untrain 'neutral', 'how are you?? : :| :) ;-) :(' }
116
+ @bayes.untrain 'neutral', 'how are you?? : :| :) ;-) :('
117
+ }
116
118
  end
117
119
 
118
120
  def test_train_batch
@@ -128,15 +130,15 @@ class TestBayes < Test::Unit::TestCase
128
130
  def test_train_missing_category
129
131
  assert_raise(StandardError) { @bayes.train 'neutral', 'how are you?' }
130
132
  end
131
-
133
+
132
134
  def test_unique_token_count
133
135
  @bayes.add_category 'positive'
134
136
  @bayes.train_batch 'positive', ['good job ever', 'valid syntax',
135
137
  'best moments of my good life']
136
- assert_equal(10,@bayes.uniq_token_count)
138
+ assert_equal(10,@bayes.unique_token_count)
137
139
  @bayes.untrain_batch 'positive', ['good job ever', 'valid syntax',
138
140
  'best moments of my good life']
139
- assert_equal(0,@bayes.uniq_token_count)
141
+ assert_equal(0,@bayes.unique_token_count)
140
142
  end
141
143
 
142
144
  def test_classifiability_error
@@ -154,11 +156,12 @@ class TestBayes < Test::Unit::TestCase
154
156
  @bayes.train('negative', 'bad work')
155
157
  assert_equal(
156
158
  'positive',
157
- @bayes.classify('very good position for this sentence').category[:name]
159
+ @bayes.classify('very good position for this sentence').top_score.key
158
160
  )
161
+ @bayes.train('negative', 'work')
159
162
  assert_equal(
160
163
  'negative',
161
- @bayes.classify('bad words').category[:name]
164
+ @bayes.classify('bad words').top_score.key
162
165
  )
163
166
  end
164
167
 
@@ -176,11 +179,11 @@ class TestBayes < Test::Unit::TestCase
176
179
 
177
180
  assert_equal(
178
181
  'positive',
179
- results[0].category[:name]
182
+ results[0].top_score.key
180
183
  )
181
184
  assert_equal(
182
185
  'negative',
183
- results[1].category[:name]
186
+ results[1].top_score.key
184
187
  )
185
188
  end
186
189
 
@@ -191,11 +194,21 @@ class TestBayes < Test::Unit::TestCase
191
194
  bayes1.train('positive', 'good job')
192
195
  bayes1.train('negative', 'bad work')
193
196
  h1 = bayes1.to_hash
194
-
195
197
  bayes2 = ::OmniCat::Classifiers::Bayes.new(h1)
196
198
  assert_equal(h1, bayes2.to_hash)
197
199
  end
198
200
 
201
+ def test_change_strategy
202
+ c1 = ::OmniCat::Classifier.new(::OmniCat::Classifiers::Bayes.new)
203
+ c1.add_category 'positive'
204
+ c1.add_category 'negative'
205
+ c1.train('positive', 'good job')
206
+ c1.train('negative', 'bad work')
207
+ h1 = c1.to_hash
208
+ c1.strategy = ::OmniCat::Classifiers::Bayes.new
209
+ assert_equal(h1, c1.to_hash)
210
+ end
211
+
199
212
  def test_classify_with_insufficient_categories
200
213
  assert_raise(StandardError) { @bayes.classify 'blank' }
201
214
  end
metadata CHANGED
@@ -1,78 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omnicat-bayes
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
5
- prerelease:
4
+ version: 0.3.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Mustafa Turan
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-07-11 00:00:00.000000000 Z
11
+ date: 2014-02-19 00:00:00.000000000 Z
13
12
  dependencies:
14
- - !ruby/object:Gem::Dependency
15
- name: hashable
16
- requirement: !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ~>
20
- - !ruby/object:Gem::Version
21
- version: 0.1.1
22
- type: :runtime
23
- prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
- requirements:
27
- - - ~>
28
- - !ruby/object:Gem::Version
29
- version: 0.1.1
30
13
  - !ruby/object:Gem::Dependency
31
14
  name: omnicat
32
15
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
16
  requirements:
35
- - - ~>
17
+ - - "~>"
36
18
  - !ruby/object:Gem::Version
37
- version: 0.2.2
19
+ version: 0.3.0
38
20
  type: :runtime
39
21
  prerelease: false
40
22
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
23
  requirements:
43
- - - ~>
24
+ - - "~>"
44
25
  - !ruby/object:Gem::Version
45
- version: 0.2.2
26
+ version: 0.3.0
46
27
  - !ruby/object:Gem::Dependency
47
28
  name: bundler
48
29
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
30
  requirements:
51
- - - ~>
31
+ - - "~>"
52
32
  - !ruby/object:Gem::Version
53
33
  version: '1.3'
54
34
  type: :development
55
35
  prerelease: false
56
36
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
37
  requirements:
59
- - - ~>
38
+ - - "~>"
60
39
  - !ruby/object:Gem::Version
61
40
  version: '1.3'
62
41
  - !ruby/object:Gem::Dependency
63
42
  name: rake
64
43
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
44
  requirements:
67
- - - ! '>='
45
+ - - ">="
68
46
  - !ruby/object:Gem::Version
69
47
  version: '0'
70
48
  type: :development
71
49
  prerelease: false
72
50
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
51
  requirements:
75
- - - ! '>='
52
+ - - ">="
76
53
  - !ruby/object:Gem::Version
77
54
  version: '0'
78
55
  description: Naive Bayes classifier strategy for OmniCat
@@ -82,7 +59,7 @@ executables: []
82
59
  extensions: []
83
60
  extra_rdoc_files: []
84
61
  files:
85
- - .gitignore
62
+ - ".gitignore"
86
63
  - Gemfile
87
64
  - LICENSE.txt
88
65
  - README.md
@@ -98,27 +75,26 @@ files:
98
75
  homepage: https://github.com/mustafaturan/omnicat-bayes
99
76
  licenses:
100
77
  - MIT
78
+ metadata: {}
101
79
  post_install_message:
102
80
  rdoc_options: []
103
81
  require_paths:
104
82
  - lib
105
83
  required_ruby_version: !ruby/object:Gem::Requirement
106
- none: false
107
84
  requirements:
108
- - - ! '>='
85
+ - - ">="
109
86
  - !ruby/object:Gem::Version
110
87
  version: '0'
111
88
  required_rubygems_version: !ruby/object:Gem::Requirement
112
- none: false
113
89
  requirements:
114
- - - ! '>='
90
+ - - ">="
115
91
  - !ruby/object:Gem::Version
116
92
  version: '0'
117
93
  requirements: []
118
94
  rubyforge_project:
119
- rubygems_version: 1.8.23
95
+ rubygems_version: 2.2.0
120
96
  signing_key:
121
- specification_version: 3
97
+ specification_version: 4
122
98
  summary: Naive Bayes text classification implementation as an OmniCat classifier strategy.
123
99
  test_files:
124
100
  - test/test_helper.rb