nb 0.0.4 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0d55c868cf9d3df0866130593f9e7d78935d9fde
4
- data.tar.gz: 5e9cd9507bdea2a306e3a8fa9423153d0d4cfbb5
3
+ metadata.gz: 703ab07acadbf5f04d8d979d888790029cc0c6de
4
+ data.tar.gz: c468c1d63b8f628be6160e7041f6053c8431297a
5
5
  SHA512:
6
- metadata.gz: 299259d6e2322cfe07adca36f84899796632bbc7f36c407de3a6e28d763355ba7518cabc82c7f3e8339c8783fa9937cb14edfc062d50946832c0d112c72d4c44
7
- data.tar.gz: 243a9a559861ea695f66f21fe6b34bd5ecc89809b819306c1a9833f08bf51cad1131846180fa6dc55aaad404dc97a13b708ce281b9f95c499691da09e65a6e75
6
+ metadata.gz: 95c245d113ac2dd0a15c7c0d23599d8393b738ddf18ac8be176757409e9b46ca0d19ede1b47f4fdc60a7184690f347e836697231b945f10e1d90f8a8111fa461
7
+ data.tar.gz: 3f969b83d80f16baa624d874228f03d1c36935dfd469d8e1bd9428529bb4e5138e62c3160673b4836f381e762aabef1f2f0848a7d08e9869ec02406a1d9cd371
data/Gemfile CHANGED
@@ -7,4 +7,7 @@ group :development, :test do
7
7
  gem 'guard'
8
8
  gem 'guard-rspec'
9
9
  gem 'simplecov', require: false
10
+ gem 'fakeredis'
10
11
  end
12
+
13
+ gem 'redis'
data/Gemfile.ci CHANGED
@@ -8,4 +8,7 @@ group :development, :test do
8
8
  gem 'guard-rspec'
9
9
  gem 'simplecov', require: false
10
10
  gem 'coveralls', require: false
11
+ gem 'fakeredis'
11
12
  end
13
+
14
+ gem 'redis'
data/README.md CHANGED
@@ -4,7 +4,7 @@
4
4
  [![Build Status](https://travis-ci.org/forresty/nb.svg?branch=master)](https://travis-ci.org/forresty/nb)
5
5
  [![Gem Version](https://badge.fury.io/rb/nb.svg)](http://badge.fury.io/rb/nb)
6
6
 
7
- yet another Naive Bayes library
7
+ yet another Naive Bayes library with support for in-memory and Redis backends
8
8
 
9
9
  ## Installation
10
10
 
@@ -25,19 +25,19 @@ Or install it yourself as:
25
25
  ## Usage
26
26
 
27
27
  ```ruby
28
- bayes = NaiveBayes.new :love, :hate
28
+ classifier = NaiveBayes::Classifier.new :love, :hate
29
29
 
30
- bayes.train :love, 'I', 'love', 'you'
31
- bayes.train :hate, 'I', 'hate', 'you'
30
+ classifier.train :love, 'I', 'love', 'you'
31
+ classifier.train :hate, 'I', 'hate', 'you'
32
32
 
33
- bayes.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
34
- bayes.classify(*%w{ I love you }).should == [:love, 0.5]
35
- bayes.classify(*%w{ love }).should == [:love, 0.5]
33
+ classifier.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
34
+ classifier.classify(*%w{ I love you }).should == [:love, 0.5]
35
+ classifier.classify(*%w{ love }).should == [:love, 0.5]
36
36
  ```
37
37
 
38
- ### ability to view top tokens
38
+ ### Ability to view top tokens
39
39
 
40
- `bayes.top_tokens_of_category(:spam)`
40
+ `classifier.top_tokens_of_category(:spam)`
41
41
 
42
42
  ```
43
43
  +------------+------+--------------------+
@@ -61,6 +61,39 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
61
61
  +------------+------+--------------------+
62
62
  ```
63
63
 
64
+ ### Use Redis backend
65
+
66
+ ```ruby
67
+ classifier = NaiveBayes::Classifier.new(:spam, :ham, backend: :redis, host: 'localhost', port: 30000)
68
+ ```
69
+
70
+ It generates 2 + N keys in Redis, where N is the number of categories:
71
+
72
+ ```
73
+ 127.0.0.1:30000> keys *
74
+ 1) "nb:hash:tokens_count:ham"
75
+ 2) "nb:hash:tokens_count:spam"
76
+ 3) "nb:set:categories"
77
+ 4) "nb:hash:categories_count"
78
+ ```
79
+
80
+ ### Support default category
81
+
82
+ The default category is returned when the probability of every category is too low (i.e. the top score is 0.0):
83
+
84
+ ```ruby
85
+ @classifier = NaiveBayes::Classifier.new :spam, :ham
86
+ @classifier.default_category = :ham
87
+ ```
88
+
89
+ ```
90
+ bayes filter mark as spam: false
91
+ bayes classifications: [[:ham, 5.044818725004143e-80], [:spam, 1.938475275819746e-119]]
92
+
93
+ bayes filter mark as spam: false
94
+ bayes classifications: [[:spam, 0.0], [:ham, 0.0]]
95
+ ```
96
+
64
97
  ## Credits
65
98
 
66
99
  - [classifier gem](https://github.com/cardmagic/classifier)
@@ -74,3 +107,13 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
74
107
  3. Commit your changes (`git commit -am 'Add some feature'`)
75
108
  4. Push to the branch (`git push origin my-new-feature`)
76
109
  5. Create a new Pull Request
110
+
111
+ ## Changelog
112
+
113
+ ### 0.1.1 / 2014-12-15
114
+
115
+ - fix redis backend
116
+
117
+ ### 0.1.0 / 2014-12-15
118
+
119
+ - initial implementation of Redis backend
data/lib/nb.rb CHANGED
@@ -1 +1,3 @@
1
- require "nb/naive_bayes"
1
+ require_relative "nb/classifier"
2
+ require_relative "nb/backend/memory"
3
+ require_relative "nb/backend/redis"
@@ -0,0 +1,39 @@
1
+ module NaiveBayes
2
+ module Backend
3
+ class Memory
4
+ attr_accessor :categories, :tokens_count, :categories_count
5
+
6
+ def initialize(categories)
7
+ @categories = categories
8
+
9
+ clear!
10
+ end
11
+
12
+ def clear!
13
+ @tokens_count = {}
14
+ @categories_count = {}
15
+
16
+ @categories.each do |category|
17
+ @tokens_count[category] = Hash.new(0)
18
+ @categories_count[category] = 0
19
+ end
20
+ end
21
+
22
+ def train(category, *tokens)
23
+ tokens.uniq.each do |token|
24
+ @tokens_count[category][token] += 1
25
+ end
26
+
27
+ @categories_count[category] += 1
28
+ end
29
+
30
+ def untrain(category, *tokens)
31
+ tokens.uniq.each do |token|
32
+ @tokens_count[category][token] -= 1
33
+ end
34
+
35
+ @categories_count[category] -= 1
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,102 @@
1
+ require "redis"
2
+
3
+ module NaiveBayes
4
+ module Backend
5
+ class Redis
6
+ class RedisHash
7
+ def initialize(redis, hash_name)
8
+ @redis = redis
9
+ @hash_name = hash_name
10
+ end
11
+
12
+ def [](key)
13
+ value = @redis.hget @hash_name, key
14
+ value.to_f
15
+ end
16
+
17
+ def []=(key, value)
18
+ @redis.hset @hash_name, key, value
19
+ end
20
+
21
+ def incr(key)
22
+ @redis.hincrby @hash_name, key, 1
23
+ end
24
+
25
+ def decr(key)
26
+ @redis.hdecrby @hash_name, key, 1
27
+ end
28
+
29
+ def values
30
+ @redis.hvals(@hash_name).map(&:to_f)
31
+ end
32
+
33
+ def map
34
+ out = []
35
+
36
+ if block_given?
37
+ @redis.hkeys(@hash_name).each { |k| out << yield(k, self.[](k)) }
38
+ else
39
+ out = to_enum :map
40
+ end
41
+
42
+ out
43
+ end
44
+ end
45
+
46
+ def initialize(categories, options={})
47
+ @redis = ::Redis.new(options)
48
+
49
+ @_categories = categories
50
+
51
+ setup
52
+ end
53
+
54
+ def categories
55
+ @redis.smembers("nb:set:categories").map(&:to_sym)
56
+ end
57
+
58
+ def categories_count
59
+ @categories_count ||= RedisHash.new(@redis, "nb:hash:categories_count")
60
+ end
61
+
62
+ def tokens_count
63
+ @tokens_count ||= Hash.new
64
+ end
65
+
66
+ def clear!
67
+ @redis.flushall
68
+
69
+ setup
70
+
71
+ categories.each do |category|
72
+ self.categories_count[category] = 0
73
+ end
74
+ end
75
+
76
+ def setup
77
+ @redis.sadd "nb:set:categories", @_categories
78
+
79
+ categories.each do |category|
80
+ # @tokens_count[category] = Hash.new(0)
81
+ self.tokens_count[category] = RedisHash.new(@redis, "nb:hash:tokens_count:#{category}")
82
+ end
83
+ end
84
+
85
+ def train(category, *tokens)
86
+ tokens.uniq.each do |token|
87
+ self.tokens_count[category].incr(token)
88
+ end
89
+
90
+ self.categories_count.incr(category)
91
+ end
92
+
93
+ def untrain(category, *tokens)
94
+ tokens.uniq.each do |token|
95
+ self.tokens_count[category][token].decr(token)
96
+ end
97
+
98
+ self.categories_count.decr(category)
99
+ end
100
+ end
101
+ end
102
+ end
@@ -0,0 +1,127 @@
1
+ require "yaml"
2
+
3
+ module NaiveBayes
4
+ class Classifier
5
+ attr_accessor :default_category
6
+ attr_accessor :backend
7
+
8
+ def initialize(*categories)
9
+ if categories.last.is_a?(Hash)
10
+ options = categories.pop
11
+ else
12
+ options = {}
13
+ end
14
+
15
+ options[:backend] ||= :memory
16
+
17
+ case options[:backend]
18
+ when :memory
19
+ @backend = Backend::Memory.new(categories)
20
+ when :redis
21
+ options[:host] ||= 'localhost'
22
+ options[:port] ||= 6379
23
+
24
+ @backend = Backend::Redis.new(categories, host: options[:host], port: options[:port])
25
+ else
26
+ raise "unsupported backend: #{options[:backend]}"
27
+ end
28
+
29
+ @default_category = categories.first
30
+ end
31
+
32
+ def train(category, *tokens)
33
+ backend.train(category, *tokens)
34
+ end
35
+
36
+ def untrain(category, *tokens)
37
+ backend.untrain(category, *tokens)
38
+ end
39
+
40
+ def clear!
41
+ backend.clear!
42
+ end
43
+
44
+ def classify(*tokens)
45
+ result = classifications(*tokens).first
46
+
47
+ if result.last == 0.0
48
+ [@default_category, 0.0]
49
+ else
50
+ result
51
+ end
52
+ end
53
+
54
+ def classifications(*tokens)
55
+ scores = {}
56
+ backend.categories.each do |category|
57
+ scores[category] = probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
58
+ end
59
+ scores.sort_by { |k, v| -v }
60
+ end
61
+
62
+ def top_tokens_of_category(category, count=20)
63
+ backend.tokens_count[category].map { |k, v| [k, v, probability_of_a_token_in_category(k, category)] }.sort_by { |i| -i.last }.first(count)
64
+ end
65
+
66
+ def probability_of_a_token_in_category(token, category)
67
+ probability_of_a_token_given_a_category(token, category) / backend.categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
68
+ end
69
+
70
+ def probability_of_a_token_given_a_category(token, category)
71
+ return assumed_probability if backend.tokens_count[category][token] == 0
72
+
73
+ backend.tokens_count[category][token].to_f / backend.categories_count[category]
74
+ end
75
+
76
+ def probability_of_tokens_given_a_category(tokens, category)
77
+ tokens.inject(1.0) do |product, token|
78
+ product * probability_of_a_token_given_a_category(token, category)
79
+ end
80
+ end
81
+
82
+ def probability_of_a_category(category)
83
+ backend.categories_count[category].to_f / total_number_of_items
84
+ end
85
+
86
+ # def total_number_of_tokens
87
+ # @tokens_count.values.inject(0) { |sum, hash| sum + hash.values.inject(&:+) }
88
+ # end
89
+
90
+ def total_number_of_items
91
+ backend.categories_count.values.inject(&:+)
92
+ end
93
+
94
+ # If we have only trained a little, a category may not have seen a given token yet.
95
+ # Giving it a probability of 0 may not be accurate, so we use an assumed probability
96
+ # that gets smaller the more we train.
97
+ def assumed_probability
98
+ 0.5 / (total_number_of_items.to_f / 2)
99
+ end
100
+
101
+ def data
102
+ {
103
+ :categories => backend.categories,
104
+ :tokens_count => backend.tokens_count,
105
+ :categories_count => backend.categories_count
106
+ }
107
+ end
108
+
109
+ def save(yaml_file)
110
+ raise 'only memory backend can save' unless backend == :memory
111
+
112
+ File.write(yaml_file, data.to_yaml)
113
+ end
114
+
115
+ class << self
116
+ # will load into a memory-backed classifier
117
+ def load_yaml(yaml_file)
118
+ data = YAML.load_file(yaml_file)
119
+
120
+ new(data[:categories], backend: :memory).tap do |classifier|
121
+ classifier.tokens_count = data[:tokens_count]
122
+ classifier.categories_count = data[:categories_count]
123
+ end
124
+ end
125
+ end
126
+ end
127
+ end
@@ -1,3 +1,3 @@
1
- class NaiveBayes
2
- VERSION = "0.0.4"
1
+ module NaiveBayes
2
+ VERSION = "0.1.1"
3
3
  end
@@ -0,0 +1,13 @@
1
+ require "spec_helper"
2
+
3
+ module NaiveBayes
4
+ module Backend
5
+ describe Memory do
6
+ subject { Memory.new [:ham, :spam] }
7
+
8
+ it { should respond_to :categories= }
9
+ it { should respond_to :train }
10
+ it { should respond_to :untrain }
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,12 @@
1
+ require "spec_helper"
2
+
3
+ module NaiveBayes
4
+ module Backend
5
+ describe Redis do
6
+ subject { Redis.new [:ham, :spam] }
7
+
8
+ it { should respond_to :train }
9
+ it { should respond_to :untrain }
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,152 @@
1
+ require "spec_helper"
2
+
3
+ module NaiveBayes
4
+ describe Classifier do
5
+ let(:classifier) { Classifier.new(:love, :hate) }
6
+ subject { classifier }
7
+
8
+ it { should respond_to :train }
9
+ it { should respond_to :untrain }
10
+ it { should respond_to :save }
11
+ it { should respond_to :classify }
12
+ it { should respond_to :classifications }
13
+ it { should respond_to :probability_of_a_token_given_a_category }
14
+ it { should respond_to :probability_of_tokens_given_a_category }
15
+ it { should respond_to :probability_of_a_category }
16
+ it { should respond_to :probability_of_a_token_in_category }
17
+ # it { should respond_to :total_number_of_tokens }
18
+ it { should respond_to :total_number_of_items }
19
+ it { should respond_to :top_tokens_of_category }
20
+ it { should respond_to :default_category= }
21
+
22
+ it { should respond_to :clear! }
23
+
24
+ [:memory, :redis].each do |backend|
25
+ describe "with backend #{backend}" do
26
+
27
+ let(:classifier) { Classifier.new(:love, :hate, backend: backend) }
28
+
29
+ subject { classifier }
30
+
31
+ before(:each) do
32
+ subject.clear!
33
+ end
34
+
35
+ # describe '#total_number_of_tokens' do
36
+ # it 'calculates correctly' do
37
+ # bayes.train :love, 'I', 'love', 'you'
38
+ # bayes.train :hate, 'I', 'hate', 'you'
39
+ #
40
+ # bayes.total_number_of_tokens.should == 6
41
+ #
42
+ # bayes.train :love, 'I', 'love', 'you', 'more'
43
+ #
44
+ # bayes.total_number_of_tokens.should == 10
45
+ # end
46
+ # end
47
+
48
+ describe '#categories_count and #tokens_count' do
49
+ it 'must get it right' do
50
+ subject.backend.categories_count[:love].should == 0
51
+
52
+ subject.train :love, 'I', 'love', 'you'
53
+ subject.train :hate, 'I', 'hate', 'you'
54
+
55
+ subject.backend.categories_count[:love].should == 1
56
+ subject.backend.tokens_count[:hate]['you'].should == 1
57
+ subject.backend.tokens_count[:hate]['love'].should == 0
58
+ end
59
+ end
60
+
61
+ describe '#probability_of_a_token_in_category' do
62
+ it 'calculates correctly' do
63
+ subject.train :love, 'I', 'love', 'you'
64
+ subject.train :hate, 'I', 'hate', 'you'
65
+
66
+ subject.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
67
+ subject.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
68
+ subject.probability_of_a_token_in_category('I', :love).should == 0.5
69
+
70
+ subject.train :love, 'hate', 'is', 'love'
71
+ subject.train :love, 'hate', 'is', 'love'
72
+ subject.train :love, 'hate', 'is', 'love'
73
+
74
+ subject.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
75
+ subject.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
76
+ end
77
+ end
78
+
79
+ describe '#total_number_of_items' do
80
+ it 'calculates correctly' do
81
+ subject.train :love, 'I', 'love', 'you'
82
+ subject.train :hate, 'I', 'hate', 'you'
83
+
84
+ subject.total_number_of_items.should == 2
85
+
86
+ subject.train :love, 'I', 'love', 'you', 'more'
87
+
88
+ subject.total_number_of_items.should == 3
89
+ end
90
+ end
91
+
92
+ describe '#probability_of_a_category' do
93
+ it 'calculates correctly' do
94
+ subject.train :love, 'I', 'love', 'you'
95
+ subject.train :hate, 'I', 'hate', 'you'
96
+
97
+ subject.probability_of_a_category(:love).should == 0.5
98
+ end
99
+ end
100
+
101
+ describe '#probability_of_token_given_a_category' do
102
+ it 'calculates correctly' do
103
+ subject.train :love, 'I', 'love', 'you'
104
+ subject.train :hate, 'I', 'hate', 'you'
105
+
106
+ subject.probability_of_a_token_given_a_category('love', :love).should == 1
107
+ subject.probability_of_a_token_given_a_category('you', :hate).should == 1
108
+
109
+ subject.train :love, 'I', 'love', 'you', 'more'
110
+
111
+ subject.probability_of_a_token_given_a_category('more', :love).should == 0.5
112
+ # bayes.probability_of_token_given_a_category('more', :hate).should == 0
113
+ end
114
+ end
115
+
116
+ describe '#classifications' do
117
+ it 'calculates correctly' do
118
+ subject.train :love, 'I', 'love', 'you'
119
+ subject.train :hate, 'I', 'hate', 'you'
120
+
121
+ subject.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
122
+ subject.classify(*%w{ I love you }).should == [:love, 0.5]
123
+ subject.classify(*%w{ love }).should == [:love, 0.5]
124
+
125
+ subject.train :love, 'I', 'love', 'you'
126
+ subject.train :love, 'I', 'love', 'you'
127
+ subject.train :love, 'I', 'love', 'you'
128
+
129
+ subject.classify(*%w{ I love you }).should == [:love, 0.8]
130
+ subject.classify(*%w{ love }).should == [:love, 0.8]
131
+ subject.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
132
+ end
133
+ end
134
+
135
+ describe '#top_tokens_of_category' do
136
+ it 'finds to tokens' do
137
+ subject.train :love, 'I', 'love', 'you'
138
+ subject.train :hate, 'I', 'hate', 'you'
139
+
140
+ subject.top_tokens_of_category(:love).count.should == 3
141
+ end
142
+ end
143
+ end
144
+ end
145
+
146
+ describe 'class methods' do
147
+ subject { Classifier }
148
+
149
+ it { should respond_to :load_yaml }
150
+ end
151
+ end
152
+ end
@@ -10,3 +10,4 @@ rescue LoadError
10
10
  end
11
11
 
12
12
  require "nb"
13
+ require "fakeredis"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Forrest Ye
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-19 00:00:00.000000000 Z
11
+ date: 2014-12-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -54,10 +54,14 @@ files:
54
54
  - README.md
55
55
  - Rakefile
56
56
  - lib/nb.rb
57
- - lib/nb/naive_bayes.rb
57
+ - lib/nb/backend/memory.rb
58
+ - lib/nb/backend/redis.rb
59
+ - lib/nb/classifier.rb
58
60
  - lib/nb/version.rb
59
61
  - nb.gemspec
60
- - spec/nb/naive_bayes_spec.rb
62
+ - spec/nb/backend/memory_spec.rb
63
+ - spec/nb/backend/redis_spec.rb
64
+ - spec/nb/classifier_spec.rb
61
65
  - spec/spec_helper.rb
62
66
  homepage: https://github.com/forresty/nb
63
67
  licenses:
@@ -79,10 +83,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
83
  version: '0'
80
84
  requirements: []
81
85
  rubyforge_project:
82
- rubygems_version: 2.0.2
86
+ rubygems_version: 2.4.4
83
87
  signing_key:
84
88
  specification_version: 4
85
89
  summary: yet another Naive Bayes library
86
90
  test_files:
87
- - spec/nb/naive_bayes_spec.rb
91
+ - spec/nb/backend/memory_spec.rb
92
+ - spec/nb/backend/redis_spec.rb
93
+ - spec/nb/classifier_spec.rb
88
94
  - spec/spec_helper.rb
@@ -1,112 +0,0 @@
1
- require "yaml"
2
-
3
- class NaiveBayes
4
- attr_accessor :categories, :tokens_count, :categories_count, :default_category
5
-
6
- def initialize(*categories)
7
- @categories = categories
8
- @tokens_count = {}
9
- @categories_count = {}
10
- @default_category = @categories.first
11
-
12
- categories.each do |category|
13
- @tokens_count[category] = Hash.new(0)
14
- @categories_count[category] = 0
15
- end
16
- end
17
-
18
- def train(category, *tokens)
19
- tokens.uniq.each do |token|
20
- @tokens_count[category][token] += 1
21
- end
22
- @categories_count[category] += 1
23
- end
24
-
25
- def untrain(category, *tokens)
26
- tokens.uniq.each do |token|
27
- @tokens_count[category][token] -= 1
28
- end
29
- @categories_count[category] -= 1
30
- end
31
-
32
- def classify(*tokens)
33
- result = classifications(*tokens).first
34
-
35
- if result.last == 0.0
36
- [@default_category, 0.0]
37
- else
38
- result
39
- end
40
- end
41
-
42
- def classifications(*tokens)
43
- scores = {}
44
- @categories.each do |category|
45
- scores[category] = probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
46
- end
47
- scores.sort_by { |k, v| -v }
48
- end
49
-
50
- def top_tokens_of_category(category, count=20)
51
- tokens_count[category].map { |k, v| [k, v, probability_of_a_token_in_category(k, category)] }.sort_by { |i| -i.last }.first(count)
52
- end
53
-
54
- def probability_of_a_token_in_category(token, category)
55
- probability_of_a_token_given_a_category(token, category) / @categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
56
- end
57
-
58
- def probability_of_a_token_given_a_category(token, category)
59
- return assumed_probability if @tokens_count[category][token] == 0
60
-
61
- @tokens_count[category][token].to_f / @categories_count[category]
62
- end
63
-
64
- def probability_of_tokens_given_a_category(tokens, category)
65
- tokens.inject(1.0) do |product, token|
66
- product * probability_of_a_token_given_a_category(token, category)
67
- end
68
- end
69
-
70
- def probability_of_a_category(category)
71
- @categories_count[category].to_f / total_number_of_items
72
- end
73
-
74
- # def total_number_of_tokens
75
- # @tokens_count.values.inject(0) { |sum, hash| sum + hash.values.inject(&:+) }
76
- # end
77
-
78
- def total_number_of_items
79
- @categories_count.values.inject(&:+)
80
- end
81
-
82
- # If we have only trained a little bit a class may not have had a feature yet
83
- # give it a probability of 0 may not be true so we produce a assumed probability
84
- # which gets smaller more we train
85
- def assumed_probability
86
- 0.5 / (total_number_of_items.to_f / 2)
87
- end
88
-
89
- def data
90
- {
91
- :categories => @categories,
92
- :tokens_count => @tokens_count,
93
- :categories_count => @categories_count
94
- }
95
- end
96
-
97
- def save(yaml_file)
98
- File.write(yaml_file, data.to_yaml)
99
- end
100
-
101
- class << self
102
- def load_yaml(yaml_file)
103
- data = YAML.load_file(yaml_file)
104
-
105
- new.tap do |bayes|
106
- bayes.categories = data[:categories]
107
- bayes.tokens_count = data[:tokens_count]
108
- bayes.categories_count = data[:categories_count]
109
- end
110
- end
111
- end
112
- end
@@ -1,113 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe NaiveBayes do
4
- it { should respond_to :train }
5
- it { should respond_to :untrain }
6
- it { should respond_to :save }
7
- it { should respond_to :classify }
8
- it { should respond_to :classifications }
9
- it { should respond_to :probability_of_a_token_given_a_category }
10
- it { should respond_to :probability_of_tokens_given_a_category }
11
- it { should respond_to :probability_of_a_category }
12
- it { should respond_to :probability_of_a_token_in_category }
13
- # it { should respond_to :total_number_of_tokens }
14
- it { should respond_to :total_number_of_items }
15
- it { should respond_to :top_tokens_of_category }
16
- it { should respond_to :default_category= }
17
-
18
- let(:bayes) { NaiveBayes.new(:love, :hate) }
19
- subject { bayes }
20
-
21
- # describe '#total_number_of_tokens' do
22
- # it 'calculates correctly' do
23
- # bayes.train :love, 'I', 'love', 'you'
24
- # bayes.train :hate, 'I', 'hate', 'you'
25
- #
26
- # bayes.total_number_of_tokens.should == 6
27
- #
28
- # bayes.train :love, 'I', 'love', 'you', 'more'
29
- #
30
- # bayes.total_number_of_tokens.should == 10
31
- # end
32
- # end
33
-
34
- describe '#probability_of_a_token_in_category' do
35
- it 'calculates correctly' do
36
- bayes.train :love, 'I', 'love', 'you'
37
- bayes.train :hate, 'I', 'hate', 'you'
38
-
39
- bayes.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
40
- bayes.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
41
- bayes.probability_of_a_token_in_category('I', :love).should == 0.5
42
-
43
- bayes.train :love, 'hate', 'is', 'love'
44
- bayes.train :love, 'hate', 'is', 'love'
45
- bayes.train :love, 'hate', 'is', 'love'
46
-
47
- bayes.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
48
- bayes.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
49
- end
50
- end
51
-
52
- describe '#total_number_of_items' do
53
- it 'calculates correctly' do
54
- bayes.train :love, 'I', 'love', 'you'
55
- bayes.train :hate, 'I', 'hate', 'you'
56
-
57
- bayes.total_number_of_items.should == 2
58
-
59
- bayes.train :love, 'I', 'love', 'you', 'more'
60
-
61
- bayes.total_number_of_items.should == 3
62
- end
63
- end
64
-
65
- describe '#probability_of_a_category' do
66
- it 'calculates correctly' do
67
- bayes.train :love, 'I', 'love', 'you'
68
- bayes.train :hate, 'I', 'hate', 'you'
69
-
70
- bayes.probability_of_a_category(:love).should == 0.5
71
- end
72
- end
73
-
74
- describe '#probability_of_token_given_a_category' do
75
- it 'calculates correctly' do
76
- bayes.train :love, 'I', 'love', 'you'
77
- bayes.train :hate, 'I', 'hate', 'you'
78
-
79
- bayes.probability_of_a_token_given_a_category('love', :love).should == 1
80
- bayes.probability_of_a_token_given_a_category('you', :hate).should == 1
81
-
82
- bayes.train :love, 'I', 'love', 'you', 'more'
83
-
84
- bayes.probability_of_a_token_given_a_category('more', :love).should == 0.5
85
- # bayes.probability_of_token_given_a_category('more', :hate).should == 0
86
- end
87
- end
88
-
89
- describe '#classifications' do
90
- it 'calculates correctly' do
91
- bayes.train :love, 'I', 'love', 'you'
92
- bayes.train :hate, 'I', 'hate', 'you'
93
-
94
- bayes.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
95
- bayes.classify(*%w{ I love you }).should == [:love, 0.5]
96
- bayes.classify(*%w{ love }).should == [:love, 0.5]
97
-
98
- bayes.train :love, 'I', 'love', 'you'
99
- bayes.train :love, 'I', 'love', 'you'
100
- bayes.train :love, 'I', 'love', 'you'
101
-
102
- bayes.classify(*%w{ I love you }).should == [:love, 0.8]
103
- bayes.classify(*%w{ love }).should == [:love, 0.8]
104
- bayes.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
105
- end
106
- end
107
-
108
- describe 'class methods' do
109
- subject { NaiveBayes }
110
-
111
- it { should respond_to :load_yaml }
112
- end
113
- end