nbayes 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c4c286395465f5e97e4ecba9407ce021039e2809
4
+ data.tar.gz: 7ac74a9b05b4fcfb1051a9a9ed2359c860b8564e
5
+ SHA512:
6
+ metadata.gz: 6359cc15db183b65f376062297c9c1f143befd795365d969b92f0ea55fd831a3c93aba8247f52fa3da1e6348a5f7aedff85bec3b52f3d7da204c0bb5a79bfbca
7
+ data.tar.gz: ee66196fab0c55a70557947dac32005c868451ec2b4032b64cc2c9c2ee3da8eafe14fb13ab8ad968fba7ebd30352e9eb1619dc96e646fa6bfe0d862cbd089074
data/Gemfile CHANGED
@@ -6,9 +6,9 @@ source "http://rubygems.org"
6
6
  # Add dependencies to develop your gem here.
7
7
  # Include everything needed to run rake, tests, features, etc.
8
8
  group :development do
9
- gem "rspec", ">= 2.8.0"
10
- gem "rdoc", ">= 3.12"
11
- gem "bundler", ">= 1.0.0"
12
- gem "jeweler", ">= 1.8.3"
9
+ gem "rspec", ">= 3.9.0"
10
+ gem "rdoc", ">= 3.0.0"
11
+ gem "bundler", ">= 2.0.0"
12
+ gem "jeweler", ">= 2.3.0"
13
13
  end
14
14
  gem 'simplecov', :require => false, :group => :test
@@ -1,4 +1,4 @@
1
- Copyright (c) 2012 Oasic Technologies LLC
1
+ Copyright (c) 2012-2016 Oasic Technologies LLC
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
@@ -1,8 +1,10 @@
1
- == nbayes
1
+ # nbayes
2
2
 
3
- _gem install nbayes_
3
+ ```
4
+ gem install nbayes
5
+ ```
4
6
 
5
- NBayes is a full-featured, Ruby implementation of Naive Bayes. Some of the features include:
7
+ NBayes is a full-featured Ruby implementation of ``Naive Bayes``. Some of the features include:
6
8
 
7
9
  * allows prior distribution on classes to be assumed uniform (optional)
8
10
  * generic to work with all types of tokens, not just text
@@ -14,7 +16,7 @@ NBayes is a full-featured, Ruby implementation of Naive Bayes. Some of the feat
14
16
 
15
17
  For more information, view this blog post: http://blog.oasic.net/2012/06/naive-bayes-for-ruby.html
16
18
 
17
- == Contributing to nbayes
19
+ ## Contributing to nbayes
18
20
 
19
21
  * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
20
22
  * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it.
@@ -24,8 +26,12 @@ For more information, view this blog post: http://blog.oasic.net/2012/06/naive-b
24
26
  * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
25
27
  * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
26
28
 
27
- == Copyright
29
+ ## Acknowledgements
28
30
 
29
- Copyright (c) 2012 Oasic Technologies LLC. See LICENSE.txt for
30
- further details.
31
+ This project is supported by the GrammarBot [grammar checker](http://www.GrammarBot.io/)
32
+
33
+
34
+ ## Copyright
35
+
36
+ Copyright (c) 2012-2020 Oasic Technologies LLC. See LICENSE.txt for further details.
31
37
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.3
@@ -8,24 +8,44 @@ require 'yaml'
8
8
  # - allows binarized or standard NB
9
9
  # - allows Prior distribution on category to be assumed uniform (optional)
10
10
  # - generic to work with all types of tokens, not just text
11
- #
11
+
12
12
 
13
13
  module NBayes
14
14
 
15
- class Base
15
+ class Vocab
16
+ attr_accessor :log_size, :tokens
16
17
 
17
- attr_accessor :assume_uniform, :debug, :k, :vocab, :data, :log_vocab
18
- attr_reader :binarized
18
+ def initialize(options = {})
19
+ @tokens = Hash.new
20
+ # for smoothing, use log of vocab size, rather than vocab size
21
+ @log_size = options[:log_size]
22
+ end
19
23
 
20
- def initialize(options={})
21
- @debug = false
22
- @k = 1
23
- @binarized = options[:binarized] || false
24
- @log_vocab = false # for smoothing, use log of vocab size, rather than vocab size
25
- @assume_uniform = false
26
- @vocab = Hash.new # used to calculate vocab size (@vocab.keys.length)
24
+ def delete(token)
25
+ tokens.delete(token)
26
+ end
27
+
28
+ def each(&block)
29
+ tokens.keys.each(&block)
30
+ end
31
+
32
+ def size
33
+ if log_size
34
+ Math.log(tokens.count)
35
+ else
36
+ tokens.count
37
+ end
38
+ end
39
+
40
+ def seen_token(token)
41
+ tokens[token] = 1
42
+ end
43
+ end
44
+
45
+ class Data
46
+ attr_accessor :data
47
+ def initialize(options = {})
27
48
  @data = Hash.new
28
- @data.default_proc = get_default_proc()
29
49
  #@data = {
30
50
  # "category1": {
31
51
  # "tokens": Hash.new(0),
@@ -36,82 +56,199 @@ module NBayes
36
56
  #}
37
57
  end
38
58
 
59
+ def categories
60
+ data.keys
61
+ end
62
+
63
+ def token_trained?(token, category)
64
+ data[category] ? data[category][:tokens].has_key?(token) : false
65
+ end
66
+
67
+ def cat_data(category)
68
+ unless data[category].is_a? Hash
69
+ data[category] = new_category
70
+ end
71
+ data[category]
72
+ end
73
+
74
+ def category_stats
75
+ tmp = []
76
+ total_example_count = total_examples
77
+ self.each do |category|
78
+ e = example_count(category)
79
+ t = token_count(category)
80
+ tmp << "For category #{category}, %d examples (%.02f%% of the total) and %d total_tokens" % [e, 100.0 * e / total_example_count, t]
81
+ end
82
+ tmp.join("\n")
83
+ end
84
+
85
+ def each(&block)
86
+ data.keys.each(&block)
87
+ end
88
+
89
+ # Increment the number of training examples for this category
90
+ def increment_examples(category)
91
+ cat_data(category)[:examples] += 1
92
+ end
93
+
94
+ # Decrement the number of training examples for this category.
95
+ # Delete the category if the examples counter is 0.
96
+ def decrement_examples(category)
97
+ cat_data(category)[:examples] -= 1
98
+ delete_category(category) if cat_data(category)[:examples] < 1
99
+ end
100
+
101
+ def example_count(category)
102
+ cat_data(category)[:examples]
103
+ end
104
+
105
+ def token_count(category)
106
+ cat_data(category)[:total_tokens]
107
+ end
108
+
109
+ # XXX - Add Enumerable and see if I get inject?
110
+ # Total number of training instances
111
+ def total_examples
112
+ sum = 0
113
+ self.each {|category| sum += example_count(category) }
114
+ sum
115
+ end
116
+
117
+ # Add this token to this category
118
+ def add_token_to_category(category, token)
119
+ cat_data(category)[:tokens][token] += 1
120
+ cat_data(category)[:total_tokens] += 1
121
+ end
122
+
123
+ # Decrement the token counter in a category
124
+ # If the counter is 0, delete the token.
125
+ # If the total number of tokens is 0, delete the category.
126
+ def remove_token_from_category(category, token)
127
+ cat_data(category)[:tokens][token] -= 1
128
+ delete_token_from_category(category, token) if cat_data(category)[:tokens][token] < 1
129
+ cat_data(category)[:total_tokens] -= 1
130
+ delete_category(category) if cat_data(category)[:total_tokens] < 1
131
+ end
132
+
133
+ # How many times does this token appear in this category?
134
+ def count_of_token_in_category(category, token)
135
+ cat_data(category)[:tokens][token]
136
+ end
137
+
138
+ def delete_token_from_category(category, token)
139
+ count = count_of_token_in_category(category, token)
140
+ cat_data(category)[:tokens].delete(token)
141
+ # Update this category's total token count
142
+ cat_data(category)[:total_tokens] -= count
143
+ end
144
+
145
+ def purge_less_than(token, x)
146
+ return if token_count_across_categories(token) >= x
147
+ self.each do |category|
148
+ delete_token_from_category(category, token)
149
+ end
150
+ true # Let caller know we removed this token
151
+ end
152
+
153
+ # XXX - TODO - use count_of_token_in_category
154
+ # Return the total number of tokens we've seen across all categories
155
+ def token_count_across_categories(token)
156
+ data.keys.inject(0){|sum, cat| sum + @data[cat][:tokens][token] }
157
+ end
158
+
159
+ def reset_after_import
160
+ self.each {|category| cat_data(category)[:tokens].default = 0 }
161
+ end
162
+
163
+ def new_category
164
+ {
165
+ :tokens => Hash.new(0), # holds freq counts
166
+ :total_tokens => 0,
167
+ :examples => 0
168
+ }
169
+ end
170
+
171
+ def delete_category(category)
172
+ data.delete(category) if data.has_key?(category)
173
+ categories
174
+ end
175
+
176
+ end
177
+
178
+ class Base
179
+
180
+ attr_accessor :assume_uniform, :debug, :k, :vocab, :data
181
+ attr_reader :binarized
182
+
183
+ def initialize(options={})
184
+ @debug = false
185
+ @k = 1
186
+ @binarized = options[:binarized] || false
187
+ @assume_uniform = false
188
+ @vocab = Vocab.new(:log_size => options[:log_vocab])
189
+ @data = Data.new
190
+ end
39
191
 
40
192
  # Allows removal of low frequency words that increase processing time and may overfit
41
193
  # - tokens with a count less than x (measured by summing across all classes) are removed
42
194
  # Ex: nb.purge_less_than(2)
43
195
  #
44
- # NOTE: this does not decrement the "examples" count, so purging is not *always* the same
45
- # as if the item was never added in the first place, but usually so
196
+ # NOTE: this does not decrement the "examples" count, so purging is not *always* the same
197
+ # as if the item was never added in the first place, but usually so
46
198
  def purge_less_than(x)
47
199
  remove_list = {}
48
- @vocab.keys.each do |token|
49
- count = @data.keys.inject(0){|sum, cat| sum + @data[cat][:tokens][token] }
50
- next if count >= x
51
- @data.each do |cat, cat_data|
52
- count = cat_data[:tokens][token]
53
- cat_data[:tokens].delete(token) # delete and retrieve count
54
- cat_data[:total_tokens] -= count # subtract that count from cat counts
55
- end # each category hash
56
- #print "removing #{token}\n"
57
- remove_list[token]=1
200
+ @vocab.each do |token|
201
+ if data.purge_less_than(token, x)
202
+ # print "removing #{token}\n"
203
+ remove_list[token] = 1
204
+ end
58
205
  end # each vocab word
59
206
  remove_list.keys.each {|token| @vocab.delete(token) }
60
- #print "total vocab size is now #{vocab_size}\n"
207
+ # print "total vocab size is now #{vocab.size}\n"
61
208
  end
62
209
 
63
-
64
- # Returns the default proc used by the data hash
65
- # Separate method so that it can be used after data import
66
- def get_default_proc
67
- return lambda do |hash, category|
68
- hash[category]= {
69
- :tokens => Hash.new(0), # holds freq counts
70
- :total_tokens => 0,
71
- :examples => 0
72
- }
73
- end
210
+ # Delete an entire category from the classification data
211
+ def delete_category(category)
212
+ data.delete_category(category)
74
213
  end
75
214
 
76
- # called internally after yaml import to reset Hash defaults
77
- def reset_after_import
78
- @data.default_proc = get_default_proc()
79
- @data.each {|cat, cat_hash| cat_hash[:tokens].default=0 }
215
+ def train(tokens, category)
216
+ tokens = tokens.uniq if binarized
217
+ data.increment_examples(category)
218
+ tokens.each do |token|
219
+ vocab.seen_token(token)
220
+ data.add_token_to_category(category, token)
221
+ end
80
222
  end
81
223
 
82
- def train(tokens, category)
83
- cat_data = @data[category]
84
- cat_data[:examples]+=1
85
- tokens = tokens.uniq if binarized
86
- tokens.each do |w|
87
- @vocab[w]=1
88
- cat_data[:tokens][w]+=1
89
- cat_data[:total_tokens]+=1
224
+ # Be careful with this function:
225
+ # * It decrements the number of examples for the category.
226
+ # If the being-untrained category has no more examples, it is removed from the category list.
227
+ # * It untrains already-trained tokens; tokens that were never trained are ignored.
228
+ def untrain(tokens, category)
229
+ tokens = tokens.uniq if binarized
230
+ data.decrement_examples(category)
231
+
232
+ tokens.each do |token|
233
+ if data.token_trained?(token, category)
234
+ vocab.delete(token)
235
+ data.remove_token_from_category(category, token)
236
+ end
90
237
  end
91
238
  end
92
239
 
93
240
  def classify(tokens)
94
241
  print "classify: #{tokens.join(', ')}\n" if @debug
95
242
  probs = {}
96
- tokens = tokens.uniq if binarized
243
+ tokens = tokens.uniq if binarized
97
244
  probs = calculate_probabilities(tokens)
98
245
  print "results: #{probs.to_yaml}\n" if @debug
99
246
  probs.extend(NBayes::Result)
100
247
  probs
101
248
  end
102
249
 
103
- # Total number of training instances
104
- def total_examples
105
- sum = 0
106
- @data.each {|cat, cat_data| sum += cat_data[:examples] }
107
- sum
108
- end
109
-
110
- # Returns the size of the "vocab" - the number of unique tokens found in the text
111
- # This is used in the Laplacian smoothing.
112
- def vocab_size
113
- return Math.log(@vocab.keys.length) if @log_vocab
114
- @vocab.keys.length
250
+ def category_stats
251
+ data.category_stats
115
252
  end
116
253
 
117
254
  # Calculates the actual probability of a class given the tokens
@@ -119,21 +256,31 @@ module NBayes
119
256
  def calculate_probabilities(tokens)
120
257
  # P(class|words) = P(w1,...,wn|class) * P(class) / P(w1,...,wn)
121
258
  # = argmax P(w1,...,wn|class) * P(class)
122
- #
259
+ #
123
260
  # P(wi|class) = (count(wi, class) + k)/(count(w,class) + kV)
124
261
  prob_numerator = {}
125
- v_size = vocab_size
126
- @data.keys.each do |category|
127
- cat_data = @data[category]
128
- cat_prob = Math.log(cat_data[:examples]/total_examples().to_f)
129
- cat_prob = Math.log(1/@data.keys.length.to_f) if assume_uniform
130
- log_probs = 0
131
- cat_denominator = (cat_data[:total_tokens]+ @k*v_size).to_f
132
- tokens.each do |token|
133
- log_probs += Math.log( (cat_data[:tokens][token] + @k)/cat_denominator )
134
- end
135
- prob_numerator[category] = log_probs + cat_prob
262
+ v_size = vocab.size
263
+
264
+ cat_prob = Math.log(1 / data.categories.count.to_f)
265
+ total_example_count = data.total_examples.to_f
266
+
267
+ data.each do |category|
268
+ unless assume_uniform
269
+ cat_prob = Math.log(data.example_count(category) / total_example_count)
270
+ end
271
+
272
+ log_probs = 0
273
+ denominator = (data.token_count(category) + @k * v_size).to_f
274
+ tokens.each do |token|
275
+ numerator = data.count_of_token_in_category(category, token) + @k
276
+ log_probs += Math.log( numerator / denominator )
277
+ end
278
+ prob_numerator[category] = log_probs + cat_prob
136
279
  end
280
+ normalize(prob_numerator)
281
+ end
282
+
283
+ def normalize(prob_numerator)
137
284
  # calculate the denominator, which normalizes this into a probability; it's just the sum of all numerators from above
138
285
  normalizer = 0
139
286
  prob_numerator.each {|cat, numerator| normalizer += numerator }
@@ -148,37 +295,47 @@ module NBayes
148
295
  intermed = {}
149
296
  renormalizer = 0
150
297
  prob_numerator.each do |cat, numerator|
151
- intermed[cat]=normalizer/numerator.to_f
152
- renormalizer += intermed[cat]
298
+ intermed[cat] = normalizer / numerator.to_f
299
+ renormalizer += intermed[cat]
153
300
  end
154
301
  # calculate final probs
155
302
  final_probs = {}
156
303
  intermed.each do |cat, value|
157
- final_probs[cat]=value/renormalizer.to_f
304
+ final_probs[cat] = value / renormalizer.to_f
158
305
  end
159
306
  final_probs
160
307
  end
161
308
 
309
+ # called internally after yaml import to reset Hash defaults
310
+ def reset_after_import
311
+ data.reset_after_import
312
+ end
313
+
314
+ def self.from_yml(yml_data)
315
+ nbayes = YAML.load(yml_data)
316
+ nbayes.reset_after_import() # yaml does not properly set the defaults on the Hashes
317
+ nbayes
318
+ end
319
+
162
320
  # Loads class instance from a data file (e.g., yaml)
163
321
  def self.from(yml_file)
164
- nbayes = YAML.load_file(yml_file)
165
- nbayes.reset_after_import() # yaml does not properly set the defaults on the Hashes
166
- nbayes
322
+ File.open(yml_file, "rb") do |file|
323
+ self.from_yml(file.read)
324
+ end
167
325
  end
168
326
 
169
327
  # Load class instance
170
328
  def load(yml)
171
329
  if yml.nil?
172
- return NBayes::Base.new
330
+ nbayes = NBayes::Base.new
173
331
  elsif yml[0..2] == "---"
174
- nbayes = YAML.load(yml)
332
+ nbayes = self.class.from_yml(yml)
175
333
  else
176
- nbayes = YAML.load_file(yml_file)
334
+ nbayes = self.class.from(yml)
177
335
  end
178
- nbayes.reset_after_import() # yaml does not properly set the defaults on the Hashes
179
336
  nbayes
180
337
  end
181
-
338
+
182
339
  # Dumps class instance to a data file (e.g., yaml) or a string
183
340
  def dump(arg)
184
341
  if arg.instance_of? String
@@ -190,14 +347,11 @@ module NBayes
190
347
 
191
348
  end
192
349
 
193
-
194
350
  module Result
351
+ # Return the key having the largest value
195
352
  def max_class
196
- keys.max{|a,b| self[a] <=> self[b] }
353
+ keys.max{ |a,b| self[a] <=> self[b] }
197
354
  end
198
355
  end
199
356
 
200
357
  end
201
-
202
-
203
-
@@ -2,27 +2,28 @@
2
2
  # DO NOT EDIT THIS FILE DIRECTLY
3
3
  # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
4
  # -*- encoding: utf-8 -*-
5
+ # stub: nbayes 0.1.3 ruby lib
5
6
 
6
7
  Gem::Specification.new do |s|
7
- s.name = "nbayes"
8
- s.version = "0.1.1"
8
+ s.name = "nbayes".freeze
9
+ s.version = "0.1.3"
9
10
 
10
- s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
- s.authors = ["oasic"]
12
- s.date = "2012-07-13"
13
- s.description = "Ruby implementation of Naive Bayes that generates true probabilities per class, works with many token types, and provides lots of bells and whistles while being optimized for performance."
14
- s.email = "j@oasic.net"
11
+ s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
12
+ s.require_paths = ["lib".freeze]
13
+ s.authors = ["oasic".freeze]
14
+ s.date = "2020-06-26"
15
+ s.description = "Ruby implementation of Naive Bayes that generates true probabilities per class, works with many token types, and provides lots of bells and whistles while being optimized for performance.".freeze
16
+ s.email = "j@oasic.net".freeze
15
17
  s.extra_rdoc_files = [
16
18
  "LICENSE.txt",
17
- "README.rdoc"
19
+ "README.md"
18
20
  ]
19
21
  s.files = [
20
22
  ".document",
21
23
  ".rspec",
22
24
  "Gemfile",
23
- "Gemfile.lock",
24
25
  "LICENSE.txt",
25
- "README.rdoc",
26
+ "README.md",
26
27
  "Rakefile",
27
28
  "VERSION",
28
29
  "lib/nbayes.rb",
@@ -30,31 +31,30 @@ Gem::Specification.new do |s|
30
31
  "spec/nbayes_spec.rb",
31
32
  "spec/spec_helper.rb"
32
33
  ]
33
- s.homepage = "http://github.com/oasic/nbayes"
34
- s.licenses = ["MIT"]
35
- s.require_paths = ["lib"]
36
- s.rubygems_version = "1.8.15"
37
- s.summary = "Full-featured Ruby implementation of Naive Bayes classifier"
34
+ s.homepage = "http://github.com/oasic/nbayes".freeze
35
+ s.licenses = ["MIT".freeze]
36
+ s.rubygems_version = "2.6.14".freeze
37
+ s.summary = "Full-featured Ruby implementation of Naive Bayes classifier".freeze
38
38
 
39
39
  if s.respond_to? :specification_version then
40
- s.specification_version = 3
40
+ s.specification_version = 4
41
41
 
42
42
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
43
- s.add_development_dependency(%q<rspec>, [">= 2.8.0"])
44
- s.add_development_dependency(%q<rdoc>, [">= 3.12"])
45
- s.add_development_dependency(%q<bundler>, [">= 1.0.0"])
46
- s.add_development_dependency(%q<jeweler>, [">= 1.8.3"])
43
+ s.add_development_dependency(%q<rspec>.freeze, [">= 3.9.0"])
44
+ s.add_development_dependency(%q<rdoc>.freeze, [">= 3.0.0"])
45
+ s.add_development_dependency(%q<bundler>.freeze, [">= 2.0.0"])
46
+ s.add_development_dependency(%q<jeweler>.freeze, [">= 2.3.0"])
47
47
  else
48
- s.add_dependency(%q<rspec>, [">= 2.8.0"])
49
- s.add_dependency(%q<rdoc>, [">= 3.12"])
50
- s.add_dependency(%q<bundler>, [">= 1.0.0"])
51
- s.add_dependency(%q<jeweler>, [">= 1.8.3"])
48
+ s.add_dependency(%q<rspec>.freeze, [">= 3.9.0"])
49
+ s.add_dependency(%q<rdoc>.freeze, [">= 3.0.0"])
50
+ s.add_dependency(%q<bundler>.freeze, [">= 2.0.0"])
51
+ s.add_dependency(%q<jeweler>.freeze, [">= 2.3.0"])
52
52
  end
53
53
  else
54
- s.add_dependency(%q<rspec>, [">= 2.8.0"])
55
- s.add_dependency(%q<rdoc>, [">= 3.12"])
56
- s.add_dependency(%q<bundler>, [">= 1.0.0"])
57
- s.add_dependency(%q<jeweler>, [">= 1.8.3"])
54
+ s.add_dependency(%q<rspec>.freeze, [">= 3.9.0"])
55
+ s.add_dependency(%q<rdoc>.freeze, [">= 3.0.0"])
56
+ s.add_dependency(%q<bundler>.freeze, [">= 2.0.0"])
57
+ s.add_dependency(%q<jeweler>.freeze, [">= 2.3.0"])
58
58
  end
59
59
  end
60
60
 
@@ -1,161 +1,243 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
2
  require 'fileutils'
3
3
 
4
- describe "NBayes" do
5
- before do
6
- @nbayes = NBayes::Base.new
7
- end
8
-
9
- it "should assign equal probability to each class" do
10
- @nbayes.train( %w[a b c d e f g], 'classA' )
11
- @nbayes.train( %w[a b c d e f g], 'classB' )
12
- results = @nbayes.classify( %w[a b c] )
13
- results['classA'].should == 0.5
14
- results['classB'].should == 0.5
15
- end
16
-
17
- it "should handle more than 2 classes" do
18
- @nbayes.train( %w[a a a a], 'classA' )
19
- @nbayes.train( %w[b b b b], 'classB' )
20
- @nbayes.train( %w[c c], 'classC' )
21
- results = @nbayes.classify( %w[a a a a b c] )
22
- results.max_class.should == 'classA'
23
- results['classA'].should >= 0.4
24
- results['classB'].should <= 0.3
25
- results['classC'].should <= 0.3
26
- end
27
-
28
- it "should use smoothing by default to eliminate errors w/division by zero" do
29
- @nbayes.train( %w[a a a a], 'classA' )
30
- @nbayes.train( %w[b b b b], 'classB' )
31
- results = @nbayes.classify( %w[x y z] )
32
- results['classA'].should >= 0.0
33
- results['classB'].should >= 0.0
34
- end
35
-
36
- it "should optionally purge low frequency data" do
37
- 100.times do
38
- @nbayes.train( %w[a a a a], 'classA' )
39
- @nbayes.train( %w[b b b b], 'classB' )
4
+ describe NBayes do
5
+ let(:nbayes) { NBayes::Base.new }
6
+
7
+ describe 'should assign equal probability to each class' do
8
+ let(:results) { nbayes.classify(%w(a b c)) }
9
+
10
+ before do
11
+ nbayes.train(%w(a b c d e f g), 'classA')
12
+ nbayes.train(%w(a b c d e f g), 'classB')
13
+ end
14
+
15
+ specify { expect(results['classA']).to eq(0.5) }
16
+ specify { expect(results['classB']).to eq(0.5) }
17
+ end
18
+
19
+ describe 'should handle more than 2 classes' do
20
+ let(:results) { nbayes.classify(%w(a a a a b c)) }
21
+
22
+ before do
23
+ nbayes.train(%w(a a a a), 'classA')
24
+ nbayes.train(%w(b b b b), 'classB')
25
+ nbayes.train(%w(c c), 'classC')
26
+ end
27
+
28
+ specify { expect(results.max_class).to eq('classA') }
29
+ specify { expect(results['classA']).to be >= 0.4 }
30
+ specify { expect(results['classB']).to be <= 0.3 }
31
+ specify { expect(results['classC']).to be <= 0.3 }
32
+ end
33
+
34
+ describe 'should use smoothing by default to eliminate errors' do
35
+ context 'when dividing by zero' do
36
+ let(:results) { nbayes.classify(%w(x y z)) }
37
+
38
+ before do
39
+ nbayes.train(%w(a a a a), 'classA')
40
+ nbayes.train(%w(b b b b), 'classB')
41
+ end
42
+
43
+ specify { expect(results['classA']).to be >= 0.0 }
44
+ specify { expect(results['classB']).to be >= 0.0 }
45
+ end
46
+ end
47
+
48
+ describe 'should optionally purge low frequency data' do
49
+ let(:results) { nbayes.classify(%w(c)) }
50
+ let(:token_count) { nbayes.data.count_of_token_in_category('classB', 'c') }
51
+
52
+ before do
53
+ 100.times do
54
+ nbayes.train(%w(a a a a), 'classA')
55
+ nbayes.train(%w(b b b b), 'classB')
56
+ end
57
+ nbayes.train(%w(a), 'classA')
58
+ nbayes.train(%w(c b), 'classB')
59
+ end
60
+
61
+ context 'before purge' do
62
+ specify { expect(results.max_class).to eq('classB') }
63
+ specify { expect(results['classB']).to be > 0.5 }
64
+ specify { expect(token_count).to eq(1) }
65
+ end
66
+
67
+ context 'after purge' do
68
+ before { nbayes.purge_less_than(2) }
69
+
70
+ specify { expect(results['classA']).to eq(0.5) }
71
+ specify { expect(results['classB']).to eq(0.5) }
72
+ specify { expect(token_count).to be_zero }
73
+ end
74
+ end
75
+
76
+ it 'works on all tokens - not just strings' do
77
+ nbayes.train([1, 2, 3], 'low')
78
+ nbayes.train([5, 6, 7], 'high')
79
+ results = nbayes.classify([2])
80
+ expect(results.max_class).to eq('low')
81
+ results = nbayes.classify([6])
82
+ expect(results.max_class).to eq('high')
83
+ end
84
+
85
+ describe 'should optionally allow class distribution to be assumed uniform' do
86
+ context 'before uniform distribution' do
87
+ let(:before_results) { nbayes.classify(['a']) }
88
+
89
+ before do
90
+ nbayes.train(%w(a a a a b), 'classA')
91
+ nbayes.train(%w(a a a a), 'classA')
92
+ nbayes.train(%w(a a a a), 'classB')
93
+ end
94
+
95
+ specify { expect(before_results.max_class).to eq('classA') }
96
+ specify { expect(before_results['classA']).to be > 0.5 }
97
+
98
+ context 'and after uniform distribution assumption' do
99
+ let(:after_results) { nbayes.classify(['a']) }
100
+
101
+ before { nbayes.assume_uniform = true }
102
+
103
+ specify { expect(after_results.max_class).to eq('classB') }
104
+ specify { expect(after_results['classB']).to be > 0.5 }
105
+ end
40
106
  end
41
- @nbayes.train( %w[a], 'classA' )
42
- @nbayes.train( %w[c b], 'classB' )
43
- results = @nbayes.classify( %w[c] )
44
- results.max_class.should == 'classB'
45
- results['classB'].should > 0.5
46
- @nbayes.data['classB'][:tokens]['c'].should == 1
47
-
48
- @nbayes.purge_less_than(2) # this removes the entry for 'c' in 'classB' because it has freq of 1
49
- # NOTE: this does not decrement the 'example' count
50
- results = @nbayes.classify( %w[c] )
51
- @nbayes.data['classB'][:tokens]['c'].should == 0
52
- results['classA'].should == 0.5
53
- results['classB'].should == 0.5
54
- end
55
-
56
- it "works on all tokens - not just strings" do
57
- @nbayes.train( [1, 2, 3], 'low' )
58
- @nbayes.train( [5, 6, 7], 'high' )
59
- results = @nbayes.classify( [2] )
60
- results.max_class.should == 'low'
61
- results = @nbayes.classify( [6] )
62
- results.max_class.should == 'high'
63
- end
64
-
65
- it "should optionally allow class distribution to be assumed uniform" do
66
- # before uniform distribution
67
- @nbayes.train( %w[a a a a b], 'classA' )
68
- @nbayes.train( %w[a a a a], 'classA' )
69
- @nbayes.train( %w[a a a a], 'classB' )
70
- results = @nbayes.classify( ['a'] )
71
- results.max_class.should == 'classA'
72
- results['classA'].should > 0.5
73
- # after uniform distribution assumption
74
- @nbayes.assume_uniform = true
75
- results = @nbayes.classify( ['a'] )
76
- results.max_class.should == 'classB'
77
- results['classB'].should > 0.5
78
- end
79
-
80
- it "should allow log of vocab size in smoothing" do
81
-
82
107
  end
83
108
 
84
109
  # In binarized mode, the frequency count is set to 1 for each token in each instance
85
110
  # For text, this is "set of words" rather than "bag of words"
86
- it "should allow binarized mode" do
111
+ it 'should allow binarized mode' do
87
112
  # w/o binarized mode, token repetition can skew the results
88
- def train_it
89
- @nbayes.train( %w[a a a a a a a a a a a], 'classA' )
90
- @nbayes.train( %w[b b], 'classA' )
91
- @nbayes.train( %w[a c], 'classB' )
92
- @nbayes.train( %w[a c], 'classB' )
93
- @nbayes.train( %w[a c], 'classB' )
94
- end
95
- train_it
96
- results = @nbayes.classify( ['a'] )
97
- results.max_class.should == 'classA'
98
- results['classA'].should > 0.5
113
+ # def train_it
114
+ nbayes.train(%w(a a a a a a a a a a a), 'classA')
115
+ nbayes.train(%w(b b), 'classA')
116
+ nbayes.train(%w(a c), 'classB')
117
+ nbayes.train(%w(a c), 'classB')
118
+ nbayes.train(%w(a c), 'classB')
119
+ # end
120
+ # train_it
121
+ results = nbayes.classify(['a'])
122
+ expect(results.max_class).to eq('classA')
123
+ expect(results['classA']).to be > 0.5
99
124
  # this does not happen in binarized mode
100
- @nbayes = NBayes::Base.new(:binarized => true)
101
- train_it
102
- results = @nbayes.classify( ['a'] )
103
- results.max_class.should == 'classB'
104
- results['classB'].should > 0.5
125
+ nbayes = NBayes::Base.new(binarized: true)
126
+ nbayes.train(%w(a a a a a a a a a a a), 'classA')
127
+ nbayes.train(%w(b b), 'classA')
128
+ nbayes.train(%w(a c), 'classB')
129
+ nbayes.train(%w(a c), 'classB')
130
+ nbayes.train(%w(a c), 'classB')
131
+ results = nbayes.classify(['a'])
132
+ expect(results.max_class).to eq('classB')
133
+ expect(results['classB']).to be > 0.5
105
134
  end
106
135
 
107
- it "allows smoothing constant k to be set to any value" do
136
+ it 'allows smoothing constant k to be set to any value' do
108
137
  # increasing k increases smoothing
109
- @nbayes.train( %w[a a a c], 'classA' )
110
- @nbayes.train( %w[b b b d], 'classB' )
111
- @nbayes.k.should == 1
112
- results = @nbayes.classify( ['c'] )
138
+ nbayes.train(%w(a a a c), 'classA')
139
+ nbayes.train(%w(b b b d), 'classB')
140
+ expect(nbayes.k).to eq(1)
141
+ results = nbayes.classify(['c'])
113
142
  prob_k1 = results['classA']
114
- @nbayes.k = 5
115
- results = @nbayes.classify( ['c'] )
143
+ nbayes.k = 5
144
+ results = nbayes.classify(['c'])
116
145
  prob_k5 = results['classA']
117
- prob_k1.should > prob_k5 # increasing smoothing constant dampens the effect of the rare token 'c'
146
+ expect(prob_k1).to be > prob_k5 # increasing smoothing constant dampens the effect of the rare token 'c'
118
147
  end
119
148
 
120
- it "optionally allows using the log of vocab size during smoothing" do
121
- 10_000.times do
122
- @nbayes.train( [rand(100)], 'classA' )
123
- @nbayes.train( %w[b b b d], 'classB' )
149
+ it 'optionally allows using the log of vocab size during smoothing' do
150
+ 10_000.times do
151
+ nbayes.train([rand(100)], 'classA')
152
+ nbayes.train(%w(b b b d), 'classB')
124
153
  end
125
154
  end
126
155
 
127
- describe "saving" do
156
+ describe 'saving' do
157
+ let(:tmp_dir) { File.join(File.dirname(__FILE__), 'tmp') }
158
+ let(:yml_file) { File.join(tmp_dir, 'test.yml') }
159
+
160
+ before { FileUtils.mkdir(tmp_dir) unless File.exist?(tmp_dir) }
161
+
162
+ after { FileUtils.rm(yml_file) if File.exist?(yml_file) }
163
+
164
+ it 'should save to yaml and load from yaml' do
165
+ nbayes.train(%w(a a a a), 'classA')
166
+ nbayes.train(%w(b b b b), 'classB')
167
+ results = nbayes.classify(['b'])
168
+ expect(results['classB']).to be >= 0.5
169
+ nbayes.dump(yml_file)
170
+ expect(File.exist?(yml_file)).to eq(true)
171
+ nbayes2 = NBayes::Base.from(yml_file)
172
+ results = nbayes.classify(['b'])
173
+ expect(results['classB']).to be >= 0.5
174
+ end
175
+ end
176
+
177
+ it 'should dump to yaml string and load from yaml string' do
178
+ nbayes.train(%w(a a a a), 'classA')
179
+ nbayes.train(%w(b b b b), 'classB')
180
+ results = nbayes.classify(['b'])
181
+ expect(results['classB']).to be >= 0.5
182
+ yml = nbayes.dump(nbayes)
183
+ nbayes2 = NBayes::Base.new.load(yml)
184
+ results = nbayes.classify(['b'])
185
+ expect(results['classB']).to be >= 0.5
186
+ end
187
+
188
+ describe 'should delete a category' do
128
189
  before do
129
- @tmp_dir = File.join( File.dirname(__FILE__), 'tmp')
130
- FileUtils.mkdir(@tmp_dir) if !File.exists?(@tmp_dir)
131
- @yml_file = File.join(@tmp_dir, 'test.yml')
190
+ nbayes.train(%w(a a a a), 'classA')
191
+ nbayes.train(%w(b b b b), 'classB')
192
+ expect(nbayes.data.categories).to eq(%w(classA classB))
193
+ expect(nbayes.delete_category('classB')).to eq(['classA'])
132
194
  end
133
195
 
134
- after do
135
- FileUtils.rm(@yml_file) if File.exists?(@yml_file)
196
+ specify { expect(nbayes.data.categories).to eq(['classA']) }
197
+ end
198
+
199
+ describe 'should do nothing if asked to delete an inexistant category' do
200
+ before { nbayes.train(%w(a a a a), 'classA') }
201
+
202
+ specify { expect(nbayes.data.categories).to eq(['classA']) }
203
+ specify { expect(nbayes.delete_category('classB')).to eq(['classA']) }
204
+ specify { expect(nbayes.data.categories).to eq(['classA']) }
205
+ end
206
+
207
+ describe 'should untrain a class' do
208
+ let(:results) { nbayes.classify(%w(a b c)) }
209
+
210
+ before do
211
+ nbayes.train(%w(a b c d e f g), 'classA')
212
+ nbayes.train(%w(a b c d e f g), 'classB')
213
+ nbayes.train(%w(a b c d e f g), 'classB')
214
+ nbayes.untrain(%w(a b c d e f g), 'classB')
136
215
  end
137
216
 
138
- it "should save to yaml and load from yaml" do
139
- @nbayes.train( %w[a a a a], 'classA' )
140
- @nbayes.train( %w[b b b b], 'classB' )
141
- results = @nbayes.classify( ['b'] )
142
- results['classB'].should >= 0.5
143
- @nbayes.dump(@yml_file)
144
- File.exists?(@yml_file).should == true
145
- @nbayes2 = NBayes::Base.from(@yml_file)
146
- results = @nbayes.classify( ['b'] )
147
- results['classB'].should >= 0.5
217
+ specify { expect(results['classA']).to eq(0.5) }
218
+ specify { expect(results['classB']).to eq(0.5) }
219
+ end
220
+
221
+ describe 'should remove the category when the only example is untrained' do
222
+ before do
223
+ nbayes.train(%w(a b c d e f g), 'classA')
224
+ nbayes.untrain(%w(a b c d e f g), 'classA')
148
225
  end
226
+
227
+ specify { expect(nbayes.data.categories).to eq([]) }
149
228
  end
150
229
 
151
- it "should dump to yaml string and load from yaml string" do
152
- @nbayes.train( %w[a a a a], 'classA' )
153
- @nbayes.train( %w[b b b b], 'classB' )
154
- results = @nbayes.classify( ['b'] )
155
- results['classB'].should >= 0.5
156
- yml = @nbayes.dump(@nbayes)
157
- @nbayes2 = NBayes::Base.new.load(yml)
158
- results = @nbayes.classify( ['b'] )
159
- results['classB'].should >= 0.5
230
+ describe 'try untraining a non-existant category' do
231
+ let(:results) { nbayes.classify(%w(a b c)) }
232
+
233
+ before do
234
+ nbayes.train(%w(a b c d e f g), 'classA')
235
+ nbayes.train(%w(a b c d e f g), 'classB')
236
+ nbayes.untrain(%w(a b c d e f g), 'classC')
237
+ end
238
+
239
+ specify { expect(nbayes.data.categories).to eq(%w(classA classB)) }
240
+ specify { expect(results['classA']).to eq(0.5) }
241
+ specify { expect(results['classB']).to eq(0.5) }
160
242
  end
161
243
  end
metadata CHANGED
@@ -1,77 +1,86 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: nbayes
3
- version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.1.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.3
6
5
  platform: ruby
7
- authors:
6
+ authors:
8
7
  - oasic
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
-
13
- date: 2012-07-13 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
11
+ date: 2020-06-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
16
14
  name: rspec
17
- requirement: &id001 !ruby/object:Gem::Requirement
18
- none: false
19
- requirements:
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
20
17
  - - ">="
21
- - !ruby/object:Gem::Version
22
- version: 2.8.0
18
+ - !ruby/object:Gem::Version
19
+ version: 3.9.0
23
20
  type: :development
24
21
  prerelease: false
25
- version_requirements: *id001
26
- - !ruby/object:Gem::Dependency
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 3.9.0
27
+ - !ruby/object:Gem::Dependency
27
28
  name: rdoc
28
- requirement: &id002 !ruby/object:Gem::Requirement
29
- none: false
30
- requirements:
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
31
  - - ">="
32
- - !ruby/object:Gem::Version
33
- version: "3.12"
32
+ - !ruby/object:Gem::Version
33
+ version: 3.0.0
34
34
  type: :development
35
35
  prerelease: false
36
- version_requirements: *id002
37
- - !ruby/object:Gem::Dependency
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: 3.0.0
41
+ - !ruby/object:Gem::Dependency
38
42
  name: bundler
39
- requirement: &id003 !ruby/object:Gem::Requirement
40
- none: false
41
- requirements:
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
42
45
  - - ">="
43
- - !ruby/object:Gem::Version
44
- version: 1.0.0
46
+ - !ruby/object:Gem::Version
47
+ version: 2.0.0
45
48
  type: :development
46
49
  prerelease: false
47
- version_requirements: *id003
48
- - !ruby/object:Gem::Dependency
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 2.0.0
55
+ - !ruby/object:Gem::Dependency
49
56
  name: jeweler
50
- requirement: &id004 !ruby/object:Gem::Requirement
51
- none: false
52
- requirements:
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
53
59
  - - ">="
54
- - !ruby/object:Gem::Version
55
- version: 1.8.3
60
+ - !ruby/object:Gem::Version
61
+ version: 2.3.0
56
62
  type: :development
57
63
  prerelease: false
58
- version_requirements: *id004
59
- description: Ruby implementation of Naive Bayes that generates true probabilities per class, works with many token types, and provides lots of bells and whistles while being optimized for performance.
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 2.3.0
69
+ description: Ruby implementation of Naive Bayes that generates true probabilities
70
+ per class, works with many token types, and provides lots of bells and whistles
71
+ while being optimized for performance.
60
72
  email: j@oasic.net
61
73
  executables: []
62
-
63
74
  extensions: []
64
-
65
- extra_rdoc_files:
75
+ extra_rdoc_files:
66
76
  - LICENSE.txt
67
- - README.rdoc
68
- files:
69
- - .document
70
- - .rspec
77
+ - README.md
78
+ files:
79
+ - ".document"
80
+ - ".rspec"
71
81
  - Gemfile
72
- - Gemfile.lock
73
82
  - LICENSE.txt
74
- - README.rdoc
83
+ - README.md
75
84
  - Rakefile
76
85
  - VERSION
77
86
  - lib/nbayes.rb
@@ -79,34 +88,27 @@ files:
79
88
  - spec/nbayes_spec.rb
80
89
  - spec/spec_helper.rb
81
90
  homepage: http://github.com/oasic/nbayes
82
- licenses:
91
+ licenses:
83
92
  - MIT
93
+ metadata: {}
84
94
  post_install_message:
85
95
  rdoc_options: []
86
-
87
- require_paths:
96
+ require_paths:
88
97
  - lib
89
- required_ruby_version: !ruby/object:Gem::Requirement
90
- none: false
91
- requirements:
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
92
100
  - - ">="
93
- - !ruby/object:Gem::Version
94
- hash: 1596303061917544948
95
- segments:
96
- - 0
97
- version: "0"
98
- required_rubygems_version: !ruby/object:Gem::Requirement
99
- none: false
100
- requirements:
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ requirements:
101
105
  - - ">="
102
- - !ruby/object:Gem::Version
103
- version: "0"
106
+ - !ruby/object:Gem::Version
107
+ version: '0'
104
108
  requirements: []
105
-
106
109
  rubyforge_project:
107
- rubygems_version: 1.8.15
110
+ rubygems_version: 2.6.14
108
111
  signing_key:
109
- specification_version: 3
112
+ specification_version: 4
110
113
  summary: Full-featured Ruby implementation of Naive Bayes classifier
111
114
  test_files: []
112
-
@@ -1,37 +0,0 @@
1
- GEM
2
- remote: http://rubygems.org/
3
- specs:
4
- diff-lcs (1.1.3)
5
- git (1.2.5)
6
- jeweler (1.8.3)
7
- bundler (~> 1.0)
8
- git (>= 1.2.5)
9
- rake
10
- rdoc
11
- json (1.7.3)
12
- multi_json (1.3.6)
13
- rake (0.9.2.2)
14
- rdoc (3.12)
15
- json (~> 1.4)
16
- rspec (2.10.0)
17
- rspec-core (~> 2.10.0)
18
- rspec-expectations (~> 2.10.0)
19
- rspec-mocks (~> 2.10.0)
20
- rspec-core (2.10.1)
21
- rspec-expectations (2.10.0)
22
- diff-lcs (~> 1.1.3)
23
- rspec-mocks (2.10.1)
24
- simplecov (0.6.4)
25
- multi_json (~> 1.0)
26
- simplecov-html (~> 0.5.3)
27
- simplecov-html (0.5.3)
28
-
29
- PLATFORMS
30
- ruby
31
-
32
- DEPENDENCIES
33
- bundler (>= 1.0.0)
34
- jeweler (>= 1.8.3)
35
- rdoc (>= 3.12)
36
- rspec (>= 2.8.0)
37
- simplecov