nb 0.0.4 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/Gemfile.ci +3 -0
- data/README.md +52 -9
- data/lib/nb.rb +3 -1
- data/lib/nb/backend/memory.rb +39 -0
- data/lib/nb/backend/redis.rb +102 -0
- data/lib/nb/classifier.rb +127 -0
- data/lib/nb/version.rb +2 -2
- data/spec/nb/backend/memory_spec.rb +13 -0
- data/spec/nb/backend/redis_spec.rb +12 -0
- data/spec/nb/classifier_spec.rb +152 -0
- data/spec/spec_helper.rb +1 -0
- metadata +12 -6
- data/lib/nb/naive_bayes.rb +0 -112
- data/spec/nb/naive_bayes_spec.rb +0 -113
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 703ab07acadbf5f04d8d979d888790029cc0c6de
|
4
|
+
data.tar.gz: c468c1d63b8f628be6160e7041f6053c8431297a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 95c245d113ac2dd0a15c7c0d23599d8393b738ddf18ac8be176757409e9b46ca0d19ede1b47f4fdc60a7184690f347e836697231b945f10e1d90f8a8111fa461
|
7
|
+
data.tar.gz: 3f969b83d80f16baa624d874228f03d1c36935dfd469d8e1bd9428529bb4e5138e62c3160673b4836f381e762aabef1f2f0848a7d08e9869ec02406a1d9cd371
|
data/Gemfile
CHANGED
data/Gemfile.ci
CHANGED
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
[![Build Status](https://travis-ci.org/forresty/nb.svg?branch=master)](https://travis-ci.org/forresty/nb)
|
5
5
|
[![Gem Version](https://badge.fury.io/rb/nb.svg)](http://badge.fury.io/rb/nb)
|
6
6
|
|
7
|
-
yet another Naive Bayes library
|
7
|
+
Yet another Naive Bayes library, with support for in-memory and Redis backends
|
8
8
|
|
9
9
|
## Installation
|
10
10
|
|
@@ -25,19 +25,19 @@ Or install it yourself as:
|
|
25
25
|
## Usage
|
26
26
|
|
27
27
|
```ruby
|
28
|
-
|
28
|
+
classifier = NaiveBayes::Classifier.new :love, :hate
|
29
29
|
|
30
|
-
|
31
|
-
|
30
|
+
classifier.train :love, 'I', 'love', 'you'
|
31
|
+
classifier.train :hate, 'I', 'hate', 'you'
|
32
32
|
|
33
|
-
|
34
|
-
|
35
|
-
|
33
|
+
classifier.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
|
34
|
+
classifier.classify(*%w{ I love you }).should == [:love, 0.5]
|
35
|
+
classifier.classify(*%w{ love }).should == [:love, 0.5]
|
36
36
|
```
|
37
37
|
|
38
|
-
###
|
38
|
+
### Ability to view top tokens
|
39
39
|
|
40
|
-
`
|
40
|
+
`classifier.top_tokens_of_category(:spam)`
|
41
41
|
|
42
42
|
```
|
43
43
|
+------------+------+--------------------+
|
@@ -61,6 +61,39 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
|
|
61
61
|
+------------+------+--------------------+
|
62
62
|
```
|
63
63
|
|
64
|
+
### Use Redis backend
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
classifier = NaiveBayes::Classifier.new(:spam, :ham, backend: :redis, host: 'localhost', port: 30000)
|
68
|
+
```
|
69
|
+
|
70
|
+
It creates 2 + N keys in Redis, where N is the number of categories:
|
71
|
+
|
72
|
+
```
|
73
|
+
127.0.0.1:30000> keys *
|
74
|
+
1) "nb:hash:tokens_count:ham"
|
75
|
+
2) "nb:hash:tokens_count:spam"
|
76
|
+
3) "nb:set:categories"
|
77
|
+
4) "nb:hash:categories_count"
|
78
|
+
```
|
79
|
+
|
80
|
+
### Support default category
|
81
|
+
|
82
|
+
The default category is returned when the best score is exactly 0.0, i.e. when every category's probability is too low to be represented:
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
@classifier = NaiveBayes::Classifier.new :spam, :ham
|
86
|
+
@classifier.default_category = :ham
|
87
|
+
```
|
88
|
+
|
89
|
+
```
|
90
|
+
bayes filter mark as spam: false
|
91
|
+
bayes classifications: [[:ham, 5.044818725004143e-80], [:spam, 1.938475275819746e-119]]
|
92
|
+
|
93
|
+
bayes filter mark as spam: false
|
94
|
+
bayes classifications: [[:spam, 0.0], [:ham, 0.0]]
|
95
|
+
```
|
96
|
+
|
64
97
|
## Credits
|
65
98
|
|
66
99
|
- [classifier gem](https://github.com/cardmagic/classifier)
|
@@ -74,3 +107,13 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
|
|
74
107
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
75
108
|
4. Push to the branch (`git push origin my-new-feature`)
|
76
109
|
5. Create a new Pull Request
|
110
|
+
|
111
|
+
## Changelog
|
112
|
+
|
113
|
+
### 0.1.1 / 2014-12-15
|
114
|
+
|
115
|
+
- fix redis backend
|
116
|
+
|
117
|
+
### 0.1.0 / 2014-12-15
|
118
|
+
|
119
|
+
- initial implementation of the Redis backend
|
data/lib/nb.rb
CHANGED
@@ -0,0 +1,39 @@
|
|
1
|
+
module NaiveBayes
  module Backend
    # In-process storage for classifier statistics.
    #
    # Keeps two plain Ruby hashes:
    # * +tokens_count+     - category => { token => count } (inner hashes default to 0)
    # * +categories_count+ - category => number of trained items
    class Memory
      attr_accessor :categories, :tokens_count, :categories_count

      # @param categories [Array] the categories this backend keeps counters for
      def initialize(categories)
        @categories = categories
        clear!
      end

      # Resets every counter to zero for all known categories.
      #
      # @return [Array] the categories collection
      def clear!
        @tokens_count = @categories.each_with_object({}) do |category, table|
          table[category] = Hash.new(0)
        end
        @categories_count = @categories.each_with_object({}) do |category, table|
          table[category] = 0
        end

        @categories
      end

      # Records one training item for +category+; duplicate tokens within the
      # item are counted once.
      #
      # @return [Integer] the updated item count of +category+
      def train(category, *tokens)
        shift_counts(category, tokens, 1)
      end

      # Reverts one training item for +category+; duplicate tokens within the
      # item are counted once.
      #
      # @return [Integer] the updated item count of +category+
      def untrain(category, *tokens)
        shift_counts(category, tokens, -1)
      end

      private

      # Adds +delta+ to each distinct token counter and to the item counter
      # of +category+.
      def shift_counts(category, tokens, delta)
        tokens.uniq.each do |token|
          @tokens_count[category][token] += delta
        end

        @categories_count[category] += delta
      end
    end
  end
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require "redis"
|
2
|
+
|
3
|
+
module NaiveBayes
  module Backend
    # Redis-backed storage for classifier statistics.
    #
    # Keys used:
    # * "nb:set:categories"              - set of category names
    # * "nb:hash:categories_count"       - hash: category => trained item count
    # * "nb:hash:tokens_count:<category>" - hash: token => count, one per category
    class Redis
      CATEGORIES_KEY = "nb:set:categories"
      CATEGORIES_COUNT_KEY = "nb:hash:categories_count"
      TOKENS_COUNT_KEY_PREFIX = "nb:hash:tokens_count:"

      # Minimal Hash-like view over a single Redis hash.
      # Values are read back through +to_f+, so a missing field reads as 0.0.
      class RedisHash
        # @param redis [::Redis] client used for all commands
        # @param hash_name [String] key of the Redis hash to wrap
        def initialize(redis, hash_name)
          @redis = redis
          @hash_name = hash_name
        end

        # @return [Float] stored value of +key+ (0.0 when the field is absent)
        def [](key)
          @redis.hget(@hash_name, key).to_f
        end

        def []=(key, value)
          @redis.hset @hash_name, key, value
        end

        # Increments the field +key+ by one.
        def incr(key)
          @redis.hincrby @hash_name, key, 1
        end

        # Decrements the field +key+ by one.
        # Redis has no HDECRBY command, so this is HINCRBY with a negative step.
        def decr(key)
          @redis.hincrby @hash_name, key, -1
        end

        # @return [Array<Float>] all stored values
        def values
          @redis.hvals(@hash_name).map(&:to_f)
        end

        # Yields each (field, value) pair and collects the block results;
        # returns an enumerator when called without a block.
        def map
          return to_enum(:map) unless block_given?

          @redis.hkeys(@hash_name).map { |key| yield(key, self[key]) }
        end
      end

      # @param categories [Array] categories to register
      # @param options [Hash] passed straight to +::Redis.new+
      def initialize(categories, options={})
        @redis = ::Redis.new(options)
        @_categories = categories

        setup
      end

      # @return [Array<Symbol>] categories currently registered in Redis
      def categories
        @redis.smembers(CATEGORIES_KEY).map(&:to_sym)
      end

      # @return [RedisHash] category => trained item count
      def categories_count
        @categories_count ||= RedisHash.new(@redis, CATEGORIES_COUNT_KEY)
      end

      # @return [Hash{Symbol => RedisHash}] category => token counters
      def tokens_count
        @tokens_count ||= Hash.new
      end

      # Removes every key owned by this backend, then re-registers the
      # categories with zeroed counters.
      #
      # Only the "nb:" keys listed on the class are deleted; FLUSHALL is
      # deliberately not used because it would wipe unrelated data living on
      # the same Redis server.
      def clear!
        owned_keys = [CATEGORIES_KEY, CATEGORIES_COUNT_KEY]
        owned_keys.concat(categories.map { |category| tokens_key(category) })
        @redis.del(*owned_keys)
        tokens_count.clear

        setup

        categories.each do |category|
          categories_count[category] = 0
        end
      end

      # Registers the configured categories in Redis and builds one
      # RedisHash of token counters per registered category.
      def setup
        # SADD with no members is an error in Redis, so skip it when empty.
        @redis.sadd(CATEGORIES_KEY, @_categories) unless @_categories.empty?

        categories.each do |category|
          tokens_count[category] = RedisHash.new(@redis, tokens_key(category))
        end
      end

      # Records one training item for +category+; duplicate tokens within the
      # item are counted once.
      def train(category, *tokens)
        tokens.uniq.each do |token|
          tokens_count[category].incr(token)
        end

        categories_count.incr(category)
      end

      # Reverts one training item for +category+; duplicate tokens within the
      # item are counted once.
      def untrain(category, *tokens)
        tokens.uniq.each do |token|
          # decr must be sent to the RedisHash itself, not to the Float
          # returned by RedisHash#[].
          tokens_count[category].decr(token)
        end

        categories_count.decr(category)
      end

      private

      # @return [String] Redis key of the token-count hash of +category+
      def tokens_key(category)
        "#{TOKENS_COUNT_KEY_PREFIX}#{category}"
      end
    end
  end
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require "yaml"
|
2
|
+
|
3
|
+
module NaiveBayes
  # Naive Bayes classifier. All counters live in a backend object
  # (+Backend::Memory+ or +Backend::Redis+); this class only does the math.
  class Classifier
    attr_accessor :default_category
    attr_accessor :backend

    # @param categories [Array] category names, optionally followed by an
    #   options hash: +backend:+ (:memory, the default, or :redis) and, for
    #   Redis, +host:+ (default 'localhost') and +port:+ (default 6379)
    # @raise [RuntimeError] when the requested backend is not supported
    def initialize(*categories)
      options = categories.last.is_a?(Hash) ? categories.pop : {}
      backend_name = options[:backend] || :memory

      @backend =
        case backend_name
        when :memory
          Backend::Memory.new(categories)
        when :redis
          Backend::Redis.new(categories,
                             host: options[:host] || 'localhost',
                             port: options[:port] || 6379)
        else
          raise "unsupported backend: #{backend_name}"
        end

      # The first category is the fallback answer of #classify.
      @default_category = categories.first
    end

    # Records one training item made of +tokens+ for +category+.
    def train(category, *tokens)
      backend.train(category, *tokens)
    end

    # Reverts one training item made of +tokens+ for +category+.
    def untrain(category, *tokens)
      backend.untrain(category, *tokens)
    end

    # Resets all counters held by the backend.
    def clear!
      backend.clear!
    end

    # @return [Array] +[category, score]+ of the best match, or
    #   +[default_category, 0.0]+ when the best score is 0.0 or there is
    #   nothing to score
    def classify(*tokens)
      best = classifications(*tokens).first
      return [@default_category, 0.0] if best.nil? || best.last == 0.0

      best
    end

    # @return [Array<Array>] +[category, score]+ pairs, highest score first.
    #   Before any training every score is 0.0 (the unguarded formulas would
    #   divide by zero and yield NaN, which cannot be sorted).
    def classifications(*tokens)
      untrained = total_number_of_items.zero?

      scores = {}
      backend.categories.each do |category|
        scores[category] =
          if untrained
            0.0
          else
            probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
          end
      end
      scores.sort_by { |_category, score| -score }
    end

    # @return [Array<Array>] up to +count+ triples
    #   +[token, count, probability_of_a_token_in_category]+, most
    #   characteristic tokens first
    def top_tokens_of_category(category, count=20)
      rows = backend.tokens_count[category].map do |token, occurrences|
        [token, occurrences, probability_of_a_token_in_category(token, category)]
      end
      rows.sort_by { |row| -row.last }.first(count)
    end

    # Share of +category+ in the sum of P(token | c) over all categories.
    def probability_of_a_token_in_category(token, category)
      probability_of_a_token_given_a_category(token, category) / backend.categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
    end

    # P(token | category); falls back to #assumed_probability for tokens
    # never seen in +category+.
    def probability_of_a_token_given_a_category(token, category)
      return assumed_probability if backend.tokens_count[category][token] == 0

      backend.tokens_count[category][token].to_f / backend.categories_count[category]
    end

    # Product of P(token | category) over all +tokens+ (1.0 for no tokens).
    def probability_of_tokens_given_a_category(tokens, category)
      tokens.inject(1.0) do |product, token|
        product * probability_of_a_token_given_a_category(token, category)
      end
    end

    # P(category): trained items of +category+ over all trained items;
    # 0.0 when nothing has been trained yet.
    def probability_of_a_category(category)
      total = total_number_of_items
      return 0.0 if total.zero?

      backend.categories_count[category].to_f / total
    end

    # @return [Numeric] number of trained items across all categories
    #   (0 when there are no categories)
    def total_number_of_items
      backend.categories_count.values.inject(0) { |sum, count| sum + count }
    end

    # If we have only trained a little bit a class may not have had a feature yet
    # give it a probability of 0 may not be true so we produce a assumed probability
    # which gets smaller more we train.
    # NOTE: evaluates to Infinity while nothing has been trained.
    def assumed_probability
      0.5 / (total_number_of_items.to_f / 2)
    end

    # @return [Hash] snapshot of the backend state under the keys
    #   :categories, :tokens_count and :categories_count
    def data
      {
        :categories => backend.categories,
        :tokens_count => backend.tokens_count,
        :categories_count => backend.categories_count
      }
    end

    # Writes #data to +yaml_file+ as YAML.
    #
    # @raise [RuntimeError] unless the backend is a Backend::Memory
    def save(yaml_file)
      # Compare against the backend class; the backend is an object, never
      # the :memory symbol.
      raise 'only memory backend can save' unless backend.is_a?(Backend::Memory)

      File.write(yaml_file, data.to_yaml)
    end

    class << self
      # Loads a file written by #save into a new memory-backed classifier.
      def load_yaml(yaml_file)
        # Symbols must be permitted explicitly: category names and the
        # top-level keys are symbols.
        data = YAML.safe_load(File.read(yaml_file), permitted_classes: [Symbol], aliases: true)

        new(*data[:categories], backend: :memory).tap do |classifier|
          # Re-create the per-category hashes with a default of 0 so tokens
          # that were never seen read as 0 again after a round trip.
          classifier.backend.tokens_count =
            data[:tokens_count].each_with_object({}) do |(category, counts), restored|
              restored[category] = Hash.new(0).merge!(counts)
            end
          classifier.backend.categories_count = data[:categories_count]
        end
      end
    end
  end
end
|
data/lib/nb/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.
|
1
|
+
module NaiveBayes
|
2
|
+
VERSION = "0.1.1"
|
3
3
|
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module NaiveBayes
  describe Classifier do
    let(:classifier) { Classifier.new(:love, :hate) }
    subject { classifier }

    # Public methods every classifier instance is expected to expose.
    [
      :train,
      :untrain,
      :save,
      :classify,
      :classifications,
      :probability_of_a_token_given_a_category,
      :probability_of_tokens_given_a_category,
      :probability_of_a_category,
      :probability_of_a_token_in_category,
      :total_number_of_items,
      :top_tokens_of_category,
      :default_category=,
      :clear!
    ].each do |api_method|
      it { should respond_to api_method }
    end

    # The same behavioural examples are run once per backend.
    [:memory, :redis].each do |backend|
      describe "with backend #{backend}" do
        let(:classifier) { Classifier.new(:love, :hate, backend: backend) }

        subject { classifier }

        before(:each) do
          subject.clear!
        end

        # Trains one item per category: the baseline for most examples.
        def train_one_of_each
          subject.train :love, 'I', 'love', 'you'
          subject.train :hate, 'I', 'hate', 'you'
        end

        # Trains the same :love item +times+ times in a row.
        def train_love_repeatedly(times, *tokens)
          times.times { subject.train :love, *tokens }
        end

        describe '#categories_count and #tokens_count' do
          it 'must get it right' do
            store = subject.backend
            store.categories_count[:love].should == 0

            train_one_of_each

            store.categories_count[:love].should == 1
            store.tokens_count[:hate]['you'].should == 1
            store.tokens_count[:hate]['love'].should == 0
          end
        end

        describe '#probability_of_a_token_in_category' do
          it 'calculates correctly' do
            train_one_of_each

            subject.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
            subject.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
            subject.probability_of_a_token_in_category('I', :love).should == 0.5

            train_love_repeatedly 3, 'hate', 'is', 'love'

            subject.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
            subject.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
          end
        end

        describe '#total_number_of_items' do
          it 'calculates correctly' do
            train_one_of_each

            subject.total_number_of_items.should == 2

            subject.train :love, 'I', 'love', 'you', 'more'

            subject.total_number_of_items.should == 3
          end
        end

        describe '#probability_of_a_category' do
          it 'calculates correctly' do
            train_one_of_each

            subject.probability_of_a_category(:love).should == 0.5
          end
        end

        describe '#probability_of_token_given_a_category' do
          it 'calculates correctly' do
            train_one_of_each

            subject.probability_of_a_token_given_a_category('love', :love).should == 1
            subject.probability_of_a_token_given_a_category('you', :hate).should == 1

            subject.train :love, 'I', 'love', 'you', 'more'

            subject.probability_of_a_token_given_a_category('more', :love).should == 0.5
          end
        end

        describe '#classifications' do
          it 'calculates correctly' do
            train_one_of_each

            subject.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
            subject.classify(*%w{ I love you }).should == [:love, 0.5]
            subject.classify(*%w{ love }).should == [:love, 0.5]

            train_love_repeatedly 3, 'I', 'love', 'you'

            subject.classify(*%w{ I love you }).should == [:love, 0.8]
            subject.classify(*%w{ love }).should == [:love, 0.8]
            subject.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
          end
        end

        describe '#top_tokens_of_category' do
          it 'finds to tokens' do
            train_one_of_each

            subject.top_tokens_of_category(:love).count.should == 3
          end
        end
      end
    end

    describe 'class methods' do
      subject { Classifier }

      it { should respond_to :load_yaml }
    end
  end
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Forrest Ye
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -54,10 +54,14 @@ files:
|
|
54
54
|
- README.md
|
55
55
|
- Rakefile
|
56
56
|
- lib/nb.rb
|
57
|
-
- lib/nb/
|
57
|
+
- lib/nb/backend/memory.rb
|
58
|
+
- lib/nb/backend/redis.rb
|
59
|
+
- lib/nb/classifier.rb
|
58
60
|
- lib/nb/version.rb
|
59
61
|
- nb.gemspec
|
60
|
-
- spec/nb/
|
62
|
+
- spec/nb/backend/memory_spec.rb
|
63
|
+
- spec/nb/backend/redis_spec.rb
|
64
|
+
- spec/nb/classifier_spec.rb
|
61
65
|
- spec/spec_helper.rb
|
62
66
|
homepage: https://github.com/forresty/nb
|
63
67
|
licenses:
|
@@ -79,10 +83,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
79
83
|
version: '0'
|
80
84
|
requirements: []
|
81
85
|
rubyforge_project:
|
82
|
-
rubygems_version: 2.
|
86
|
+
rubygems_version: 2.4.4
|
83
87
|
signing_key:
|
84
88
|
specification_version: 4
|
85
89
|
summary: yet another Naive Bayes library
|
86
90
|
test_files:
|
87
|
-
- spec/nb/
|
91
|
+
- spec/nb/backend/memory_spec.rb
|
92
|
+
- spec/nb/backend/redis_spec.rb
|
93
|
+
- spec/nb/classifier_spec.rb
|
88
94
|
- spec/spec_helper.rb
|
data/lib/nb/naive_bayes.rb
DELETED
@@ -1,112 +0,0 @@
|
|
1
|
-
require "yaml"
|
2
|
-
|
3
|
-
class NaiveBayes
|
4
|
-
attr_accessor :categories, :tokens_count, :categories_count, :default_category
|
5
|
-
|
6
|
-
def initialize(*categories)
|
7
|
-
@categories = categories
|
8
|
-
@tokens_count = {}
|
9
|
-
@categories_count = {}
|
10
|
-
@default_category = @categories.first
|
11
|
-
|
12
|
-
categories.each do |category|
|
13
|
-
@tokens_count[category] = Hash.new(0)
|
14
|
-
@categories_count[category] = 0
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
def train(category, *tokens)
|
19
|
-
tokens.uniq.each do |token|
|
20
|
-
@tokens_count[category][token] += 1
|
21
|
-
end
|
22
|
-
@categories_count[category] += 1
|
23
|
-
end
|
24
|
-
|
25
|
-
def untrain(category, *tokens)
|
26
|
-
tokens.uniq.each do |token|
|
27
|
-
@tokens_count[category][token] -= 1
|
28
|
-
end
|
29
|
-
@categories_count[category] -= 1
|
30
|
-
end
|
31
|
-
|
32
|
-
def classify(*tokens)
|
33
|
-
result = classifications(*tokens).first
|
34
|
-
|
35
|
-
if result.last == 0.0
|
36
|
-
[@default_category, 0.0]
|
37
|
-
else
|
38
|
-
result
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def classifications(*tokens)
|
43
|
-
scores = {}
|
44
|
-
@categories.each do |category|
|
45
|
-
scores[category] = probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
|
46
|
-
end
|
47
|
-
scores.sort_by { |k, v| -v }
|
48
|
-
end
|
49
|
-
|
50
|
-
def top_tokens_of_category(category, count=20)
|
51
|
-
tokens_count[category].map { |k, v| [k, v, probability_of_a_token_in_category(k, category)] }.sort_by { |i| -i.last }.first(count)
|
52
|
-
end
|
53
|
-
|
54
|
-
def probability_of_a_token_in_category(token, category)
|
55
|
-
probability_of_a_token_given_a_category(token, category) / @categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
|
56
|
-
end
|
57
|
-
|
58
|
-
def probability_of_a_token_given_a_category(token, category)
|
59
|
-
return assumed_probability if @tokens_count[category][token] == 0
|
60
|
-
|
61
|
-
@tokens_count[category][token].to_f / @categories_count[category]
|
62
|
-
end
|
63
|
-
|
64
|
-
def probability_of_tokens_given_a_category(tokens, category)
|
65
|
-
tokens.inject(1.0) do |product, token|
|
66
|
-
product * probability_of_a_token_given_a_category(token, category)
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
def probability_of_a_category(category)
|
71
|
-
@categories_count[category].to_f / total_number_of_items
|
72
|
-
end
|
73
|
-
|
74
|
-
# def total_number_of_tokens
|
75
|
-
# @tokens_count.values.inject(0) { |sum, hash| sum + hash.values.inject(&:+) }
|
76
|
-
# end
|
77
|
-
|
78
|
-
def total_number_of_items
|
79
|
-
@categories_count.values.inject(&:+)
|
80
|
-
end
|
81
|
-
|
82
|
-
# If we have only trained a little bit a class may not have had a feature yet
|
83
|
-
# give it a probability of 0 may not be true so we produce a assumed probability
|
84
|
-
# which gets smaller more we train
|
85
|
-
def assumed_probability
|
86
|
-
0.5 / (total_number_of_items.to_f / 2)
|
87
|
-
end
|
88
|
-
|
89
|
-
def data
|
90
|
-
{
|
91
|
-
:categories => @categories,
|
92
|
-
:tokens_count => @tokens_count,
|
93
|
-
:categories_count => @categories_count
|
94
|
-
}
|
95
|
-
end
|
96
|
-
|
97
|
-
def save(yaml_file)
|
98
|
-
File.write(yaml_file, data.to_yaml)
|
99
|
-
end
|
100
|
-
|
101
|
-
class << self
|
102
|
-
def load_yaml(yaml_file)
|
103
|
-
data = YAML.load_file(yaml_file)
|
104
|
-
|
105
|
-
new.tap do |bayes|
|
106
|
-
bayes.categories = data[:categories]
|
107
|
-
bayes.tokens_count = data[:tokens_count]
|
108
|
-
bayes.categories_count = data[:categories_count]
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
end
|
data/spec/nb/naive_bayes_spec.rb
DELETED
@@ -1,113 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe NaiveBayes do
|
4
|
-
it { should respond_to :train }
|
5
|
-
it { should respond_to :untrain }
|
6
|
-
it { should respond_to :save }
|
7
|
-
it { should respond_to :classify }
|
8
|
-
it { should respond_to :classifications }
|
9
|
-
it { should respond_to :probability_of_a_token_given_a_category }
|
10
|
-
it { should respond_to :probability_of_tokens_given_a_category }
|
11
|
-
it { should respond_to :probability_of_a_category }
|
12
|
-
it { should respond_to :probability_of_a_token_in_category }
|
13
|
-
# it { should respond_to :total_number_of_tokens }
|
14
|
-
it { should respond_to :total_number_of_items }
|
15
|
-
it { should respond_to :top_tokens_of_category }
|
16
|
-
it { should respond_to :default_category= }
|
17
|
-
|
18
|
-
let(:bayes) { NaiveBayes.new(:love, :hate) }
|
19
|
-
subject { bayes }
|
20
|
-
|
21
|
-
# describe '#total_number_of_tokens' do
|
22
|
-
# it 'calculates correctly' do
|
23
|
-
# bayes.train :love, 'I', 'love', 'you'
|
24
|
-
# bayes.train :hate, 'I', 'hate', 'you'
|
25
|
-
#
|
26
|
-
# bayes.total_number_of_tokens.should == 6
|
27
|
-
#
|
28
|
-
# bayes.train :love, 'I', 'love', 'you', 'more'
|
29
|
-
#
|
30
|
-
# bayes.total_number_of_tokens.should == 10
|
31
|
-
# end
|
32
|
-
# end
|
33
|
-
|
34
|
-
describe '#probability_of_a_token_in_category' do
|
35
|
-
it 'calculates correctly' do
|
36
|
-
bayes.train :love, 'I', 'love', 'you'
|
37
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
38
|
-
|
39
|
-
bayes.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
|
40
|
-
bayes.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
|
41
|
-
bayes.probability_of_a_token_in_category('I', :love).should == 0.5
|
42
|
-
|
43
|
-
bayes.train :love, 'hate', 'is', 'love'
|
44
|
-
bayes.train :love, 'hate', 'is', 'love'
|
45
|
-
bayes.train :love, 'hate', 'is', 'love'
|
46
|
-
|
47
|
-
bayes.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
|
48
|
-
bayes.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
describe '#total_number_of_items' do
|
53
|
-
it 'calculates correctly' do
|
54
|
-
bayes.train :love, 'I', 'love', 'you'
|
55
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
56
|
-
|
57
|
-
bayes.total_number_of_items.should == 2
|
58
|
-
|
59
|
-
bayes.train :love, 'I', 'love', 'you', 'more'
|
60
|
-
|
61
|
-
bayes.total_number_of_items.should == 3
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
describe '#probability_of_a_category' do
|
66
|
-
it 'calculates correctly' do
|
67
|
-
bayes.train :love, 'I', 'love', 'you'
|
68
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
69
|
-
|
70
|
-
bayes.probability_of_a_category(:love).should == 0.5
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
describe '#probability_of_token_given_a_category' do
|
75
|
-
it 'calculates correctly' do
|
76
|
-
bayes.train :love, 'I', 'love', 'you'
|
77
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
78
|
-
|
79
|
-
bayes.probability_of_a_token_given_a_category('love', :love).should == 1
|
80
|
-
bayes.probability_of_a_token_given_a_category('you', :hate).should == 1
|
81
|
-
|
82
|
-
bayes.train :love, 'I', 'love', 'you', 'more'
|
83
|
-
|
84
|
-
bayes.probability_of_a_token_given_a_category('more', :love).should == 0.5
|
85
|
-
# bayes.probability_of_token_given_a_category('more', :hate).should == 0
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
describe '#classifications' do
|
90
|
-
it 'calculates correctly' do
|
91
|
-
bayes.train :love, 'I', 'love', 'you'
|
92
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
93
|
-
|
94
|
-
bayes.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
|
95
|
-
bayes.classify(*%w{ I love you }).should == [:love, 0.5]
|
96
|
-
bayes.classify(*%w{ love }).should == [:love, 0.5]
|
97
|
-
|
98
|
-
bayes.train :love, 'I', 'love', 'you'
|
99
|
-
bayes.train :love, 'I', 'love', 'you'
|
100
|
-
bayes.train :love, 'I', 'love', 'you'
|
101
|
-
|
102
|
-
bayes.classify(*%w{ I love you }).should == [:love, 0.8]
|
103
|
-
bayes.classify(*%w{ love }).should == [:love, 0.8]
|
104
|
-
bayes.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
describe 'class methods' do
|
109
|
-
subject { NaiveBayes }
|
110
|
-
|
111
|
-
it { should respond_to :load_yaml }
|
112
|
-
end
|
113
|
-
end
|