nb 0.0.4 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0d55c868cf9d3df0866130593f9e7d78935d9fde
4
- data.tar.gz: 5e9cd9507bdea2a306e3a8fa9423153d0d4cfbb5
3
+ metadata.gz: 703ab07acadbf5f04d8d979d888790029cc0c6de
4
+ data.tar.gz: c468c1d63b8f628be6160e7041f6053c8431297a
5
5
  SHA512:
6
- metadata.gz: 299259d6e2322cfe07adca36f84899796632bbc7f36c407de3a6e28d763355ba7518cabc82c7f3e8339c8783fa9937cb14edfc062d50946832c0d112c72d4c44
7
- data.tar.gz: 243a9a559861ea695f66f21fe6b34bd5ecc89809b819306c1a9833f08bf51cad1131846180fa6dc55aaad404dc97a13b708ce281b9f95c499691da09e65a6e75
6
+ metadata.gz: 95c245d113ac2dd0a15c7c0d23599d8393b738ddf18ac8be176757409e9b46ca0d19ede1b47f4fdc60a7184690f347e836697231b945f10e1d90f8a8111fa461
7
+ data.tar.gz: 3f969b83d80f16baa624d874228f03d1c36935dfd469d8e1bd9428529bb4e5138e62c3160673b4836f381e762aabef1f2f0848a7d08e9869ec02406a1d9cd371
data/Gemfile CHANGED
@@ -7,4 +7,7 @@ group :development, :test do
7
7
  gem 'guard'
8
8
  gem 'guard-rspec'
9
9
  gem 'simplecov', require: false
10
+ gem 'fakeredis'
10
11
  end
12
+
13
+ gem 'redis'
data/Gemfile.ci CHANGED
@@ -8,4 +8,7 @@ group :development, :test do
8
8
  gem 'guard-rspec'
9
9
  gem 'simplecov', require: false
10
10
  gem 'coveralls', require: false
11
+ gem 'fakeredis'
11
12
  end
13
+
14
+ gem 'redis'
data/README.md CHANGED
@@ -4,7 +4,7 @@
4
4
  [![Build Status](https://travis-ci.org/forresty/nb.svg?branch=master)](https://travis-ci.org/forresty/nb)
5
5
  [![Gem Version](https://badge.fury.io/rb/nb.svg)](http://badge.fury.io/rb/nb)
6
6
 
7
- yet another Naive Bayes library
7
+ yet another Naive Bayes library, with support for in-memory and Redis backends
8
8
 
9
9
  ## Installation
10
10
 
@@ -25,19 +25,19 @@ Or install it yourself as:
25
25
  ## Usage
26
26
 
27
27
  ```ruby
28
- bayes = NaiveBayes.new :love, :hate
28
+ classifier = NaiveBayes::Classifier.new :love, :hate
29
29
 
30
- bayes.train :love, 'I', 'love', 'you'
31
- bayes.train :hate, 'I', 'hate', 'you'
30
+ classifier.train :love, 'I', 'love', 'you'
31
+ classifier.train :hate, 'I', 'hate', 'you'
32
32
 
33
- bayes.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
34
- bayes.classify(*%w{ I love you }).should == [:love, 0.5]
35
- bayes.classify(*%w{ love }).should == [:love, 0.5]
33
+ classifier.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
34
+ classifier.classify(*%w{ I love you }).should == [:love, 0.5]
35
+ classifier.classify(*%w{ love }).should == [:love, 0.5]
36
36
  ```
37
37
 
38
- ### ability to view top tokens
38
+ ### Ability to view top tokens
39
39
 
40
- `bayes.top_tokens_of_category(:spam)`
40
+ `classifier.top_tokens_of_category(:spam)`
41
41
 
42
42
  ```
43
43
  +------------+------+--------------------+
@@ -61,6 +61,39 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
61
61
  +------------+------+--------------------+
62
62
  ```
63
63
 
64
+ ### Use Redis backend
65
+
66
+ ```ruby
67
+ classifier = NaiveBayes::Classifier.new(:spam, :ham, backend: :redis, host: 'localhost', port: 30000)
68
+ ```
69
+
70
+ It generates 2 + N keys in Redis, where N is the number of categories (one token-count hash per category):
71
+
72
+ ```
73
+ 127.0.0.1:30000> keys *
74
+ 1) "nb:hash:tokens_count:ham"
75
+ 2) "nb:hash:tokens_count:spam"
76
+ 3) "nb:set:categories"
77
+ 4) "nb:hash:categories_count"
78
+ ```
79
+
80
+ ### Support default category
81
+
82
+ When the probability of every category is so low that the top score is 0.0, `classify` returns the default category instead:
83
+
84
+ ```ruby
85
+ @classifier = NaiveBayes::Classifier.new :spam, :ham
86
+ @classifier.default_category = :ham
87
+ ```
88
+
89
+ ```
90
+ bayes filter mark as spam: false
91
+ bayes classifications: [[:ham, 5.044818725004143e-80], [:spam, 1.938475275819746e-119]]
92
+
93
+ bayes filter mark as spam: false
94
+ bayes classifications: [[:spam, 0.0], [:ham, 0.0]]
95
+ ```
96
+
64
97
  ## Credits
65
98
 
66
99
  - [classifier gem](https://github.com/cardmagic/classifier)
@@ -74,3 +107,13 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
74
107
  3. Commit your changes (`git commit -am 'Add some feature'`)
75
108
  4. Push to the branch (`git push origin my-new-feature`)
76
109
  5. Create a new Pull Request
110
+
111
+ ## Changelog
112
+
113
+ ### 0.1.1 / 2014-12-15
114
+
115
+ - fix redis backend
116
+
117
+ ### 0.1.0 / 2014-12-15
118
+
119
+ - initial implementation of the Redis backend
data/lib/nb.rb CHANGED
@@ -1 +1,3 @@
1
- require "nb/naive_bayes"
1
+ require_relative "nb/classifier"
2
+ require_relative "nb/backend/memory"
3
+ require_relative "nb/backend/redis"
@@ -0,0 +1,39 @@
1
+ module NaiveBayes
2
+ module Backend
3
+ class Memory
4
+ attr_accessor :categories, :tokens_count, :categories_count
5
+
6
+ def initialize(categories)
7
+ @categories = categories
8
+
9
+ clear!
10
+ end
11
+
12
+ def clear!
13
+ @tokens_count = {}
14
+ @categories_count = {}
15
+
16
+ @categories.each do |category|
17
+ @tokens_count[category] = Hash.new(0)
18
+ @categories_count[category] = 0
19
+ end
20
+ end
21
+
22
+ def train(category, *tokens)
23
+ tokens.uniq.each do |token|
24
+ @tokens_count[category][token] += 1
25
+ end
26
+
27
+ @categories_count[category] += 1
28
+ end
29
+
30
+ def untrain(category, *tokens)
31
+ tokens.uniq.each do |token|
32
+ @tokens_count[category][token] -= 1
33
+ end
34
+
35
+ @categories_count[category] -= 1
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,102 @@
1
+ require "redis"
2
+
3
+ module NaiveBayes
4
+ module Backend
5
+ class Redis
6
+ class RedisHash
7
+ def initialize(redis, hash_name)
8
+ @redis = redis
9
+ @hash_name = hash_name
10
+ end
11
+
12
+ def [](key)
13
+ value = @redis.hget @hash_name, key
14
+ value.to_f
15
+ end
16
+
17
+ def []=(key, value)
18
+ @redis.hset @hash_name, key, value
19
+ end
20
+
21
+ def incr(key)
22
+ @redis.hincrby @hash_name, key, 1
23
+ end
24
+
25
+ def decr(key)
26
+ @redis.hdecrby @hash_name, key, 1
27
+ end
28
+
29
+ def values
30
+ @redis.hvals(@hash_name).map(&:to_f)
31
+ end
32
+
33
+ def map
34
+ out = []
35
+
36
+ if block_given?
37
+ @redis.hkeys(@hash_name).each { |k| out << yield(k, self.[](k)) }
38
+ else
39
+ out = to_enum :map
40
+ end
41
+
42
+ out
43
+ end
44
+ end
45
+
46
+ def initialize(categories, options={})
47
+ @redis = ::Redis.new(options)
48
+
49
+ @_categories = categories
50
+
51
+ setup
52
+ end
53
+
54
+ def categories
55
+ @redis.smembers("nb:set:categories").map(&:to_sym)
56
+ end
57
+
58
+ def categories_count
59
+ @categories_count ||= RedisHash.new(@redis, "nb:hash:categories_count")
60
+ end
61
+
62
+ def tokens_count
63
+ @tokens_count ||= Hash.new
64
+ end
65
+
66
+ def clear!
67
+ @redis.flushall
68
+
69
+ setup
70
+
71
+ categories.each do |category|
72
+ self.categories_count[category] = 0
73
+ end
74
+ end
75
+
76
+ def setup
77
+ @redis.sadd "nb:set:categories", @_categories
78
+
79
+ categories.each do |category|
80
+ # @tokens_count[category] = Hash.new(0)
81
+ self.tokens_count[category] = RedisHash.new(@redis, "nb:hash:tokens_count:#{category}")
82
+ end
83
+ end
84
+
85
+ def train(category, *tokens)
86
+ tokens.uniq.each do |token|
87
+ self.tokens_count[category].incr(token)
88
+ end
89
+
90
+ self.categories_count.incr(category)
91
+ end
92
+
93
+ def untrain(category, *tokens)
94
+ tokens.uniq.each do |token|
95
+ self.tokens_count[category][token].decr(token)
96
+ end
97
+
98
+ self.categories_count.decr(category)
99
+ end
100
+ end
101
+ end
102
+ end
@@ -0,0 +1,127 @@
1
+ require "yaml"
2
+
3
+ module NaiveBayes
4
+ class Classifier
5
+ attr_accessor :default_category
6
+ attr_accessor :backend
7
+
8
+ def initialize(*categories)
9
+ if categories.last.is_a?(Hash)
10
+ options = categories.pop
11
+ else
12
+ options = {}
13
+ end
14
+
15
+ options[:backend] ||= :memory
16
+
17
+ case options[:backend]
18
+ when :memory
19
+ @backend = Backend::Memory.new(categories)
20
+ when :redis
21
+ options[:host] ||= 'localhost'
22
+ options[:port] ||= 6379
23
+
24
+ @backend = Backend::Redis.new(categories, host: options[:host], port: options[:port])
25
+ else
26
+ raise "unsupported backend: #{options[:backend]}"
27
+ end
28
+
29
+ @default_category = categories.first
30
+ end
31
+
32
+ def train(category, *tokens)
33
+ backend.train(category, *tokens)
34
+ end
35
+
36
+ def untrain(category, *tokens)
37
+ backend.untrain(category, *tokens)
38
+ end
39
+
40
+ def clear!
41
+ backend.clear!
42
+ end
43
+
44
+ def classify(*tokens)
45
+ result = classifications(*tokens).first
46
+
47
+ if result.last == 0.0
48
+ [@default_category, 0.0]
49
+ else
50
+ result
51
+ end
52
+ end
53
+
54
+ def classifications(*tokens)
55
+ scores = {}
56
+ backend.categories.each do |category|
57
+ scores[category] = probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
58
+ end
59
+ scores.sort_by { |k, v| -v }
60
+ end
61
+
62
+ def top_tokens_of_category(category, count=20)
63
+ backend.tokens_count[category].map { |k, v| [k, v, probability_of_a_token_in_category(k, category)] }.sort_by { |i| -i.last }.first(count)
64
+ end
65
+
66
+ def probability_of_a_token_in_category(token, category)
67
+ probability_of_a_token_given_a_category(token, category) / backend.categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
68
+ end
69
+
70
+ def probability_of_a_token_given_a_category(token, category)
71
+ return assumed_probability if backend.tokens_count[category][token] == 0
72
+
73
+ backend.tokens_count[category][token].to_f / backend.categories_count[category]
74
+ end
75
+
76
+ def probability_of_tokens_given_a_category(tokens, category)
77
+ tokens.inject(1.0) do |product, token|
78
+ product * probability_of_a_token_given_a_category(token, category)
79
+ end
80
+ end
81
+
82
+ def probability_of_a_category(category)
83
+ backend.categories_count[category].to_f / total_number_of_items
84
+ end
85
+
86
+ # def total_number_of_tokens
87
+ # @tokens_count.values.inject(0) { |sum, hash| sum + hash.values.inject(&:+) }
88
+ # end
89
+
90
+ def total_number_of_items
91
+ backend.categories_count.values.inject(&:+)
92
+ end
93
+
94
+ # If we have only trained a little, a category may not have seen a given token yet.
95
+ # Giving it a probability of 0 may be wrong, so we produce an assumed probability
96
+ # which gets smaller the more we train.
97
+ def assumed_probability
98
+ 0.5 / (total_number_of_items.to_f / 2)
99
+ end
100
+
101
+ def data
102
+ {
103
+ :categories => backend.categories,
104
+ :tokens_count => backend.tokens_count,
105
+ :categories_count => backend.categories_count
106
+ }
107
+ end
108
+
109
+ def save(yaml_file)
110
+ raise 'only memory backend can save' unless backend == :memory
111
+
112
+ File.write(yaml_file, data.to_yaml)
113
+ end
114
+
115
+ class << self
116
+ # will load into a memory-backed classifier
117
+ def load_yaml(yaml_file)
118
+ data = YAML.load_file(yaml_file)
119
+
120
+ new(data[:categories], backend: :memory).tap do |classifier|
121
+ classifier.tokens_count = data[:tokens_count]
122
+ classifier.categories_count = data[:categories_count]
123
+ end
124
+ end
125
+ end
126
+ end
127
+ end
@@ -1,3 +1,3 @@
1
- class NaiveBayes
2
- VERSION = "0.0.4"
1
+ module NaiveBayes
2
+ VERSION = "0.1.1"
3
3
  end
@@ -0,0 +1,13 @@
1
+ require "spec_helper"
2
+
3
+ module NaiveBayes
4
+ module Backend
5
+ describe Memory do
6
+ subject { Memory.new [:ham, :spam] }
7
+
8
+ it { should respond_to :categories= }
9
+ it { should respond_to :train }
10
+ it { should respond_to :untrain }
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,12 @@
1
+ require "spec_helper"
2
+
3
+ module NaiveBayes
4
+ module Backend
5
+ describe Redis do
6
+ subject { Redis.new [:ham, :spam] }
7
+
8
+ it { should respond_to :train }
9
+ it { should respond_to :untrain }
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,152 @@
1
+ require "spec_helper"
2
+
3
+ module NaiveBayes
4
+ describe Classifier do
5
+ let(:classifier) { Classifier.new(:love, :hate) }
6
+ subject { classifier }
7
+
8
+ it { should respond_to :train }
9
+ it { should respond_to :untrain }
10
+ it { should respond_to :save }
11
+ it { should respond_to :classify }
12
+ it { should respond_to :classifications }
13
+ it { should respond_to :probability_of_a_token_given_a_category }
14
+ it { should respond_to :probability_of_tokens_given_a_category }
15
+ it { should respond_to :probability_of_a_category }
16
+ it { should respond_to :probability_of_a_token_in_category }
17
+ # it { should respond_to :total_number_of_tokens }
18
+ it { should respond_to :total_number_of_items }
19
+ it { should respond_to :top_tokens_of_category }
20
+ it { should respond_to :default_category= }
21
+
22
+ it { should respond_to :clear! }
23
+
24
+ [:memory, :redis].each do |backend|
25
+ describe "with backend #{backend}" do
26
+
27
+ let(:classifier) { Classifier.new(:love, :hate, backend: backend) }
28
+
29
+ subject { classifier }
30
+
31
+ before(:each) do
32
+ subject.clear!
33
+ end
34
+
35
+ # describe '#total_number_of_tokens' do
36
+ # it 'calculates correctly' do
37
+ # bayes.train :love, 'I', 'love', 'you'
38
+ # bayes.train :hate, 'I', 'hate', 'you'
39
+ #
40
+ # bayes.total_number_of_tokens.should == 6
41
+ #
42
+ # bayes.train :love, 'I', 'love', 'you', 'more'
43
+ #
44
+ # bayes.total_number_of_tokens.should == 10
45
+ # end
46
+ # end
47
+
48
+ describe '#categories_count and #tokens_count' do
49
+ it 'must get it right' do
50
+ subject.backend.categories_count[:love].should == 0
51
+
52
+ subject.train :love, 'I', 'love', 'you'
53
+ subject.train :hate, 'I', 'hate', 'you'
54
+
55
+ subject.backend.categories_count[:love].should == 1
56
+ subject.backend.tokens_count[:hate]['you'].should == 1
57
+ subject.backend.tokens_count[:hate]['love'].should == 0
58
+ end
59
+ end
60
+
61
+ describe '#probability_of_a_token_in_category' do
62
+ it 'calculates correctly' do
63
+ subject.train :love, 'I', 'love', 'you'
64
+ subject.train :hate, 'I', 'hate', 'you'
65
+
66
+ subject.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
67
+ subject.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
68
+ subject.probability_of_a_token_in_category('I', :love).should == 0.5
69
+
70
+ subject.train :love, 'hate', 'is', 'love'
71
+ subject.train :love, 'hate', 'is', 'love'
72
+ subject.train :love, 'hate', 'is', 'love'
73
+
74
+ subject.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
75
+ subject.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
76
+ end
77
+ end
78
+
79
+ describe '#total_number_of_items' do
80
+ it 'calculates correctly' do
81
+ subject.train :love, 'I', 'love', 'you'
82
+ subject.train :hate, 'I', 'hate', 'you'
83
+
84
+ subject.total_number_of_items.should == 2
85
+
86
+ subject.train :love, 'I', 'love', 'you', 'more'
87
+
88
+ subject.total_number_of_items.should == 3
89
+ end
90
+ end
91
+
92
+ describe '#probability_of_a_category' do
93
+ it 'calculates correctly' do
94
+ subject.train :love, 'I', 'love', 'you'
95
+ subject.train :hate, 'I', 'hate', 'you'
96
+
97
+ subject.probability_of_a_category(:love).should == 0.5
98
+ end
99
+ end
100
+
101
+ describe '#probability_of_token_given_a_category' do
102
+ it 'calculates correctly' do
103
+ subject.train :love, 'I', 'love', 'you'
104
+ subject.train :hate, 'I', 'hate', 'you'
105
+
106
+ subject.probability_of_a_token_given_a_category('love', :love).should == 1
107
+ subject.probability_of_a_token_given_a_category('you', :hate).should == 1
108
+
109
+ subject.train :love, 'I', 'love', 'you', 'more'
110
+
111
+ subject.probability_of_a_token_given_a_category('more', :love).should == 0.5
112
+ # bayes.probability_of_token_given_a_category('more', :hate).should == 0
113
+ end
114
+ end
115
+
116
+ describe '#classifications' do
117
+ it 'calculates correctly' do
118
+ subject.train :love, 'I', 'love', 'you'
119
+ subject.train :hate, 'I', 'hate', 'you'
120
+
121
+ subject.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
122
+ subject.classify(*%w{ I love you }).should == [:love, 0.5]
123
+ subject.classify(*%w{ love }).should == [:love, 0.5]
124
+
125
+ subject.train :love, 'I', 'love', 'you'
126
+ subject.train :love, 'I', 'love', 'you'
127
+ subject.train :love, 'I', 'love', 'you'
128
+
129
+ subject.classify(*%w{ I love you }).should == [:love, 0.8]
130
+ subject.classify(*%w{ love }).should == [:love, 0.8]
131
+ subject.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
132
+ end
133
+ end
134
+
135
+ describe '#top_tokens_of_category' do
136
+ it 'finds to tokens' do
137
+ subject.train :love, 'I', 'love', 'you'
138
+ subject.train :hate, 'I', 'hate', 'you'
139
+
140
+ subject.top_tokens_of_category(:love).count.should == 3
141
+ end
142
+ end
143
+ end
144
+ end
145
+
146
+ describe 'class methods' do
147
+ subject { Classifier }
148
+
149
+ it { should respond_to :load_yaml }
150
+ end
151
+ end
152
+ end
@@ -10,3 +10,4 @@ rescue LoadError
10
10
  end
11
11
 
12
12
  require "nb"
13
+ require "fakeredis"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Forrest Ye
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-11-19 00:00:00.000000000 Z
11
+ date: 2014-12-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -54,10 +54,14 @@ files:
54
54
  - README.md
55
55
  - Rakefile
56
56
  - lib/nb.rb
57
- - lib/nb/naive_bayes.rb
57
+ - lib/nb/backend/memory.rb
58
+ - lib/nb/backend/redis.rb
59
+ - lib/nb/classifier.rb
58
60
  - lib/nb/version.rb
59
61
  - nb.gemspec
60
- - spec/nb/naive_bayes_spec.rb
62
+ - spec/nb/backend/memory_spec.rb
63
+ - spec/nb/backend/redis_spec.rb
64
+ - spec/nb/classifier_spec.rb
61
65
  - spec/spec_helper.rb
62
66
  homepage: https://github.com/forresty/nb
63
67
  licenses:
@@ -79,10 +83,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
79
83
  version: '0'
80
84
  requirements: []
81
85
  rubyforge_project:
82
- rubygems_version: 2.0.2
86
+ rubygems_version: 2.4.4
83
87
  signing_key:
84
88
  specification_version: 4
85
89
  summary: yet another Naive Bayes library
86
90
  test_files:
87
- - spec/nb/naive_bayes_spec.rb
91
+ - spec/nb/backend/memory_spec.rb
92
+ - spec/nb/backend/redis_spec.rb
93
+ - spec/nb/classifier_spec.rb
88
94
  - spec/spec_helper.rb
@@ -1,112 +0,0 @@
1
- require "yaml"
2
-
3
- class NaiveBayes
4
- attr_accessor :categories, :tokens_count, :categories_count, :default_category
5
-
6
- def initialize(*categories)
7
- @categories = categories
8
- @tokens_count = {}
9
- @categories_count = {}
10
- @default_category = @categories.first
11
-
12
- categories.each do |category|
13
- @tokens_count[category] = Hash.new(0)
14
- @categories_count[category] = 0
15
- end
16
- end
17
-
18
- def train(category, *tokens)
19
- tokens.uniq.each do |token|
20
- @tokens_count[category][token] += 1
21
- end
22
- @categories_count[category] += 1
23
- end
24
-
25
- def untrain(category, *tokens)
26
- tokens.uniq.each do |token|
27
- @tokens_count[category][token] -= 1
28
- end
29
- @categories_count[category] -= 1
30
- end
31
-
32
- def classify(*tokens)
33
- result = classifications(*tokens).first
34
-
35
- if result.last == 0.0
36
- [@default_category, 0.0]
37
- else
38
- result
39
- end
40
- end
41
-
42
- def classifications(*tokens)
43
- scores = {}
44
- @categories.each do |category|
45
- scores[category] = probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
46
- end
47
- scores.sort_by { |k, v| -v }
48
- end
49
-
50
- def top_tokens_of_category(category, count=20)
51
- tokens_count[category].map { |k, v| [k, v, probability_of_a_token_in_category(k, category)] }.sort_by { |i| -i.last }.first(count)
52
- end
53
-
54
- def probability_of_a_token_in_category(token, category)
55
- probability_of_a_token_given_a_category(token, category) / @categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
56
- end
57
-
58
- def probability_of_a_token_given_a_category(token, category)
59
- return assumed_probability if @tokens_count[category][token] == 0
60
-
61
- @tokens_count[category][token].to_f / @categories_count[category]
62
- end
63
-
64
- def probability_of_tokens_given_a_category(tokens, category)
65
- tokens.inject(1.0) do |product, token|
66
- product * probability_of_a_token_given_a_category(token, category)
67
- end
68
- end
69
-
70
- def probability_of_a_category(category)
71
- @categories_count[category].to_f / total_number_of_items
72
- end
73
-
74
- # def total_number_of_tokens
75
- # @tokens_count.values.inject(0) { |sum, hash| sum + hash.values.inject(&:+) }
76
- # end
77
-
78
- def total_number_of_items
79
- @categories_count.values.inject(&:+)
80
- end
81
-
82
- # If we have only trained a little bit a class may not have had a feature yet
83
- # give it a probability of 0 may not be true so we produce a assumed probability
84
- # which gets smaller more we train
85
- def assumed_probability
86
- 0.5 / (total_number_of_items.to_f / 2)
87
- end
88
-
89
- def data
90
- {
91
- :categories => @categories,
92
- :tokens_count => @tokens_count,
93
- :categories_count => @categories_count
94
- }
95
- end
96
-
97
- def save(yaml_file)
98
- File.write(yaml_file, data.to_yaml)
99
- end
100
-
101
- class << self
102
- def load_yaml(yaml_file)
103
- data = YAML.load_file(yaml_file)
104
-
105
- new.tap do |bayes|
106
- bayes.categories = data[:categories]
107
- bayes.tokens_count = data[:tokens_count]
108
- bayes.categories_count = data[:categories_count]
109
- end
110
- end
111
- end
112
- end
@@ -1,113 +0,0 @@
1
- require "spec_helper"
2
-
3
- describe NaiveBayes do
4
- it { should respond_to :train }
5
- it { should respond_to :untrain }
6
- it { should respond_to :save }
7
- it { should respond_to :classify }
8
- it { should respond_to :classifications }
9
- it { should respond_to :probability_of_a_token_given_a_category }
10
- it { should respond_to :probability_of_tokens_given_a_category }
11
- it { should respond_to :probability_of_a_category }
12
- it { should respond_to :probability_of_a_token_in_category }
13
- # it { should respond_to :total_number_of_tokens }
14
- it { should respond_to :total_number_of_items }
15
- it { should respond_to :top_tokens_of_category }
16
- it { should respond_to :default_category= }
17
-
18
- let(:bayes) { NaiveBayes.new(:love, :hate) }
19
- subject { bayes }
20
-
21
- # describe '#total_number_of_tokens' do
22
- # it 'calculates correctly' do
23
- # bayes.train :love, 'I', 'love', 'you'
24
- # bayes.train :hate, 'I', 'hate', 'you'
25
- #
26
- # bayes.total_number_of_tokens.should == 6
27
- #
28
- # bayes.train :love, 'I', 'love', 'you', 'more'
29
- #
30
- # bayes.total_number_of_tokens.should == 10
31
- # end
32
- # end
33
-
34
- describe '#probability_of_a_token_in_category' do
35
- it 'calculates correctly' do
36
- bayes.train :love, 'I', 'love', 'you'
37
- bayes.train :hate, 'I', 'hate', 'you'
38
-
39
- bayes.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
40
- bayes.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
41
- bayes.probability_of_a_token_in_category('I', :love).should == 0.5
42
-
43
- bayes.train :love, 'hate', 'is', 'love'
44
- bayes.train :love, 'hate', 'is', 'love'
45
- bayes.train :love, 'hate', 'is', 'love'
46
-
47
- bayes.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
48
- bayes.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
49
- end
50
- end
51
-
52
- describe '#total_number_of_items' do
53
- it 'calculates correctly' do
54
- bayes.train :love, 'I', 'love', 'you'
55
- bayes.train :hate, 'I', 'hate', 'you'
56
-
57
- bayes.total_number_of_items.should == 2
58
-
59
- bayes.train :love, 'I', 'love', 'you', 'more'
60
-
61
- bayes.total_number_of_items.should == 3
62
- end
63
- end
64
-
65
- describe '#probability_of_a_category' do
66
- it 'calculates correctly' do
67
- bayes.train :love, 'I', 'love', 'you'
68
- bayes.train :hate, 'I', 'hate', 'you'
69
-
70
- bayes.probability_of_a_category(:love).should == 0.5
71
- end
72
- end
73
-
74
- describe '#probability_of_token_given_a_category' do
75
- it 'calculates correctly' do
76
- bayes.train :love, 'I', 'love', 'you'
77
- bayes.train :hate, 'I', 'hate', 'you'
78
-
79
- bayes.probability_of_a_token_given_a_category('love', :love).should == 1
80
- bayes.probability_of_a_token_given_a_category('you', :hate).should == 1
81
-
82
- bayes.train :love, 'I', 'love', 'you', 'more'
83
-
84
- bayes.probability_of_a_token_given_a_category('more', :love).should == 0.5
85
- # bayes.probability_of_token_given_a_category('more', :hate).should == 0
86
- end
87
- end
88
-
89
- describe '#classifications' do
90
- it 'calculates correctly' do
91
- bayes.train :love, 'I', 'love', 'you'
92
- bayes.train :hate, 'I', 'hate', 'you'
93
-
94
- bayes.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
95
- bayes.classify(*%w{ I love you }).should == [:love, 0.5]
96
- bayes.classify(*%w{ love }).should == [:love, 0.5]
97
-
98
- bayes.train :love, 'I', 'love', 'you'
99
- bayes.train :love, 'I', 'love', 'you'
100
- bayes.train :love, 'I', 'love', 'you'
101
-
102
- bayes.classify(*%w{ I love you }).should == [:love, 0.8]
103
- bayes.classify(*%w{ love }).should == [:love, 0.8]
104
- bayes.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
105
- end
106
- end
107
-
108
- describe 'class methods' do
109
- subject { NaiveBayes }
110
-
111
- it { should respond_to :load_yaml }
112
- end
113
- end