nb 0.0.4 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/Gemfile.ci +3 -0
- data/README.md +52 -9
- data/lib/nb.rb +3 -1
- data/lib/nb/backend/memory.rb +39 -0
- data/lib/nb/backend/redis.rb +102 -0
- data/lib/nb/classifier.rb +127 -0
- data/lib/nb/version.rb +2 -2
- data/spec/nb/backend/memory_spec.rb +13 -0
- data/spec/nb/backend/redis_spec.rb +12 -0
- data/spec/nb/classifier_spec.rb +152 -0
- data/spec/spec_helper.rb +1 -0
- metadata +12 -6
- data/lib/nb/naive_bayes.rb +0 -112
- data/spec/nb/naive_bayes_spec.rb +0 -113
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 703ab07acadbf5f04d8d979d888790029cc0c6de
|
4
|
+
data.tar.gz: c468c1d63b8f628be6160e7041f6053c8431297a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 95c245d113ac2dd0a15c7c0d23599d8393b738ddf18ac8be176757409e9b46ca0d19ede1b47f4fdc60a7184690f347e836697231b945f10e1d90f8a8111fa461
|
7
|
+
data.tar.gz: 3f969b83d80f16baa624d874228f03d1c36935dfd469d8e1bd9428529bb4e5138e62c3160673b4836f381e762aabef1f2f0848a7d08e9869ec02406a1d9cd371
|
data/Gemfile
CHANGED
data/Gemfile.ci
CHANGED
data/README.md
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
[](https://travis-ci.org/forresty/nb)
|
5
5
|
[](http://badge.fury.io/rb/nb)
|
6
6
|
|
7
|
-
yet another Naive Bayes library
|
7
|
+
yet another Naive Bayes library with support for in-memory and Redis backends
|
8
8
|
|
9
9
|
## Installation
|
10
10
|
|
@@ -25,19 +25,19 @@ Or install it yourself as:
|
|
25
25
|
## Usage
|
26
26
|
|
27
27
|
```ruby
|
28
|
-
|
28
|
+
classifier = NaiveBayes::Classifier.new :love, :hate
|
29
29
|
|
30
|
-
|
31
|
-
|
30
|
+
classifier.train :love, 'I', 'love', 'you'
|
31
|
+
classifier.train :hate, 'I', 'hate', 'you'
|
32
32
|
|
33
|
-
|
34
|
-
|
35
|
-
|
33
|
+
classifier.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
|
34
|
+
classifier.classify(*%w{ I love you }).should == [:love, 0.5]
|
35
|
+
classifier.classify(*%w{ love }).should == [:love, 0.5]
|
36
36
|
```
|
37
37
|
|
38
|
-
###
|
38
|
+
### Ability to view top tokens
|
39
39
|
|
40
|
-
`
|
40
|
+
`classifier.top_tokens_of_category(:spam)`
|
41
41
|
|
42
42
|
```
|
43
43
|
+------------+------+--------------------+
|
@@ -61,6 +61,39 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
|
|
61
61
|
+------------+------+--------------------+
|
62
62
|
```
|
63
63
|
|
64
|
+
### Use Redis backend
|
65
|
+
|
66
|
+
```ruby
|
67
|
+
classifier = NaiveBayes::Classifier.new(:spam, :ham, backend: :redis, host: 'localhost', port: 30000)
|
68
|
+
```
|
69
|
+
|
70
|
+
It generates 2 + N keys in Redis, where N is the number of categories:
|
71
|
+
|
72
|
+
```
|
73
|
+
127.0.0.1:30000> keys *
|
74
|
+
1) "nb:hash:tokens_count:ham"
|
75
|
+
2) "nb:hash:tokens_count:spam"
|
76
|
+
3) "nb:set:categories"
|
77
|
+
4) "nb:hash:categories_count"
|
78
|
+
```
|
79
|
+
|
80
|
+
### Support default category
|
81
|
+
|
82
|
+
A default category is returned when the probability of every category is too low (i.e. it underflows to 0.0):
|
83
|
+
|
84
|
+
```ruby
|
85
|
+
@classifier = NaiveBayes::Classifier.new :spam, :ham
|
86
|
+
@classifier.default_category = :ham
|
87
|
+
```
|
88
|
+
|
89
|
+
```
|
90
|
+
bayes filter mark as spam: false
|
91
|
+
bayes classifications: [[:ham, 5.044818725004143e-80], [:spam, 1.938475275819746e-119]]
|
92
|
+
|
93
|
+
bayes filter mark as spam: false
|
94
|
+
bayes classifications: [[:spam, 0.0], [:ham, 0.0]]
|
95
|
+
```
|
96
|
+
|
64
97
|
## Credits
|
65
98
|
|
66
99
|
- [classifier gem](https://github.com/cardmagic/classifier)
|
@@ -74,3 +107,13 @@ bayes.classify(*%w{ love }).should == [:love, 0.5]
|
|
74
107
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
75
108
|
4. Push to the branch (`git push origin my-new-feature`)
|
76
109
|
5. Create a new Pull Request
|
110
|
+
|
111
|
+
## Changelog
|
112
|
+
|
113
|
+
### 0.1.1 / 2014-12-15
|
114
|
+
|
115
|
+
- fix redis backend
|
116
|
+
|
117
|
+
### 0.1.0 / 2014-12-15
|
118
|
+
|
119
|
+
- initial implementation of redis backend
|
data/lib/nb.rb
CHANGED
@@ -0,0 +1,39 @@
|
|
1
|
+
module NaiveBayes
|
2
|
+
module Backend
|
3
|
+
class Memory
|
4
|
+
attr_accessor :categories, :tokens_count, :categories_count
|
5
|
+
|
6
|
+
def initialize(categories)
|
7
|
+
@categories = categories
|
8
|
+
|
9
|
+
clear!
|
10
|
+
end
|
11
|
+
|
12
|
+
def clear!
|
13
|
+
@tokens_count = {}
|
14
|
+
@categories_count = {}
|
15
|
+
|
16
|
+
@categories.each do |category|
|
17
|
+
@tokens_count[category] = Hash.new(0)
|
18
|
+
@categories_count[category] = 0
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
def train(category, *tokens)
|
23
|
+
tokens.uniq.each do |token|
|
24
|
+
@tokens_count[category][token] += 1
|
25
|
+
end
|
26
|
+
|
27
|
+
@categories_count[category] += 1
|
28
|
+
end
|
29
|
+
|
30
|
+
def untrain(category, *tokens)
|
31
|
+
tokens.uniq.each do |token|
|
32
|
+
@tokens_count[category][token] -= 1
|
33
|
+
end
|
34
|
+
|
35
|
+
@categories_count[category] -= 1
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
require "redis"
|
2
|
+
|
3
|
+
module NaiveBayes
|
4
|
+
module Backend
|
5
|
+
class Redis
|
6
|
+
class RedisHash
|
7
|
+
def initialize(redis, hash_name)
|
8
|
+
@redis = redis
|
9
|
+
@hash_name = hash_name
|
10
|
+
end
|
11
|
+
|
12
|
+
def [](key)
|
13
|
+
value = @redis.hget @hash_name, key
|
14
|
+
value.to_f
|
15
|
+
end
|
16
|
+
|
17
|
+
def []=(key, value)
|
18
|
+
@redis.hset @hash_name, key, value
|
19
|
+
end
|
20
|
+
|
21
|
+
def incr(key)
|
22
|
+
@redis.hincrby @hash_name, key, 1
|
23
|
+
end
|
24
|
+
|
25
|
+
def decr(key)
|
26
|
+
@redis.hdecrby @hash_name, key, 1
|
27
|
+
end
|
28
|
+
|
29
|
+
def values
|
30
|
+
@redis.hvals(@hash_name).map(&:to_f)
|
31
|
+
end
|
32
|
+
|
33
|
+
def map
|
34
|
+
out = []
|
35
|
+
|
36
|
+
if block_given?
|
37
|
+
@redis.hkeys(@hash_name).each { |k| out << yield(k, self.[](k)) }
|
38
|
+
else
|
39
|
+
out = to_enum :map
|
40
|
+
end
|
41
|
+
|
42
|
+
out
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def initialize(categories, options={})
|
47
|
+
@redis = ::Redis.new(options)
|
48
|
+
|
49
|
+
@_categories = categories
|
50
|
+
|
51
|
+
setup
|
52
|
+
end
|
53
|
+
|
54
|
+
def categories
|
55
|
+
@redis.smembers("nb:set:categories").map(&:to_sym)
|
56
|
+
end
|
57
|
+
|
58
|
+
def categories_count
|
59
|
+
@categories_count ||= RedisHash.new(@redis, "nb:hash:categories_count")
|
60
|
+
end
|
61
|
+
|
62
|
+
def tokens_count
|
63
|
+
@tokens_count ||= Hash.new
|
64
|
+
end
|
65
|
+
|
66
|
+
def clear!
|
67
|
+
@redis.flushall
|
68
|
+
|
69
|
+
setup
|
70
|
+
|
71
|
+
categories.each do |category|
|
72
|
+
self.categories_count[category] = 0
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
def setup
|
77
|
+
@redis.sadd "nb:set:categories", @_categories
|
78
|
+
|
79
|
+
categories.each do |category|
|
80
|
+
# @tokens_count[category] = Hash.new(0)
|
81
|
+
self.tokens_count[category] = RedisHash.new(@redis, "nb:hash:tokens_count:#{category}")
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def train(category, *tokens)
|
86
|
+
tokens.uniq.each do |token|
|
87
|
+
self.tokens_count[category].incr(token)
|
88
|
+
end
|
89
|
+
|
90
|
+
self.categories_count.incr(category)
|
91
|
+
end
|
92
|
+
|
93
|
+
def untrain(category, *tokens)
|
94
|
+
tokens.uniq.each do |token|
|
95
|
+
self.tokens_count[category][token].decr(token)
|
96
|
+
end
|
97
|
+
|
98
|
+
self.categories_count.decr(category)
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require "yaml"
|
2
|
+
|
3
|
+
module NaiveBayes
|
4
|
+
class Classifier
|
5
|
+
attr_accessor :default_category
|
6
|
+
attr_accessor :backend
|
7
|
+
|
8
|
+
def initialize(*categories)
|
9
|
+
if categories.last.is_a?(Hash)
|
10
|
+
options = categories.pop
|
11
|
+
else
|
12
|
+
options = {}
|
13
|
+
end
|
14
|
+
|
15
|
+
options[:backend] ||= :memory
|
16
|
+
|
17
|
+
case options[:backend]
|
18
|
+
when :memory
|
19
|
+
@backend = Backend::Memory.new(categories)
|
20
|
+
when :redis
|
21
|
+
options[:host] ||= 'localhost'
|
22
|
+
options[:port] ||= 6379
|
23
|
+
|
24
|
+
@backend = Backend::Redis.new(categories, host: options[:host], port: options[:port])
|
25
|
+
else
|
26
|
+
raise "unsupported backend: #{options[:backend]}"
|
27
|
+
end
|
28
|
+
|
29
|
+
@default_category = categories.first
|
30
|
+
end
|
31
|
+
|
32
|
+
def train(category, *tokens)
|
33
|
+
backend.train(category, *tokens)
|
34
|
+
end
|
35
|
+
|
36
|
+
def untrain(category, *tokens)
|
37
|
+
backend.untrain(category, *tokens)
|
38
|
+
end
|
39
|
+
|
40
|
+
def clear!
|
41
|
+
backend.clear!
|
42
|
+
end
|
43
|
+
|
44
|
+
def classify(*tokens)
|
45
|
+
result = classifications(*tokens).first
|
46
|
+
|
47
|
+
if result.last == 0.0
|
48
|
+
[@default_category, 0.0]
|
49
|
+
else
|
50
|
+
result
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def classifications(*tokens)
|
55
|
+
scores = {}
|
56
|
+
backend.categories.each do |category|
|
57
|
+
scores[category] = probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
|
58
|
+
end
|
59
|
+
scores.sort_by { |k, v| -v }
|
60
|
+
end
|
61
|
+
|
62
|
+
def top_tokens_of_category(category, count=20)
|
63
|
+
backend.tokens_count[category].map { |k, v| [k, v, probability_of_a_token_in_category(k, category)] }.sort_by { |i| -i.last }.first(count)
|
64
|
+
end
|
65
|
+
|
66
|
+
def probability_of_a_token_in_category(token, category)
|
67
|
+
probability_of_a_token_given_a_category(token, category) / backend.categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
|
68
|
+
end
|
69
|
+
|
70
|
+
def probability_of_a_token_given_a_category(token, category)
|
71
|
+
return assumed_probability if backend.tokens_count[category][token] == 0
|
72
|
+
|
73
|
+
backend.tokens_count[category][token].to_f / backend.categories_count[category]
|
74
|
+
end
|
75
|
+
|
76
|
+
def probability_of_tokens_given_a_category(tokens, category)
|
77
|
+
tokens.inject(1.0) do |product, token|
|
78
|
+
product * probability_of_a_token_given_a_category(token, category)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def probability_of_a_category(category)
|
83
|
+
backend.categories_count[category].to_f / total_number_of_items
|
84
|
+
end
|
85
|
+
|
86
|
+
# def total_number_of_tokens
|
87
|
+
# @tokens_count.values.inject(0) { |sum, hash| sum + hash.values.inject(&:+) }
|
88
|
+
# end
|
89
|
+
|
90
|
+
def total_number_of_items
|
91
|
+
backend.categories_count.values.inject(&:+)
|
92
|
+
end
|
93
|
+
|
94
|
+
# If we have only trained a little, a category may not have seen a token yet.
|
95
|
+
# Giving it a probability of 0 may be wrong, so we produce an assumed probability
|
96
|
+
# that gets smaller the more we train.
|
97
|
+
def assumed_probability
|
98
|
+
0.5 / (total_number_of_items.to_f / 2)
|
99
|
+
end
|
100
|
+
|
101
|
+
def data
|
102
|
+
{
|
103
|
+
:categories => backend.categories,
|
104
|
+
:tokens_count => backend.tokens_count,
|
105
|
+
:categories_count => backend.categories_count
|
106
|
+
}
|
107
|
+
end
|
108
|
+
|
109
|
+
def save(yaml_file)
|
110
|
+
raise 'only memory backend can save' unless backend == :memory
|
111
|
+
|
112
|
+
File.write(yaml_file, data.to_yaml)
|
113
|
+
end
|
114
|
+
|
115
|
+
class << self
|
116
|
+
# will load into a memory-backed classifier
|
117
|
+
def load_yaml(yaml_file)
|
118
|
+
data = YAML.load_file(yaml_file)
|
119
|
+
|
120
|
+
new(data[:categories], backend: :memory).tap do |classifier|
|
121
|
+
classifier.tokens_count = data[:tokens_count]
|
122
|
+
classifier.categories_count = data[:categories_count]
|
123
|
+
end
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
data/lib/nb/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.
|
1
|
+
module NaiveBayes
|
2
|
+
VERSION = "0.1.1"
|
3
3
|
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module NaiveBayes
|
4
|
+
describe Classifier do
|
5
|
+
let(:classifier) { Classifier.new(:love, :hate) }
|
6
|
+
subject { classifier }
|
7
|
+
|
8
|
+
it { should respond_to :train }
|
9
|
+
it { should respond_to :untrain }
|
10
|
+
it { should respond_to :save }
|
11
|
+
it { should respond_to :classify }
|
12
|
+
it { should respond_to :classifications }
|
13
|
+
it { should respond_to :probability_of_a_token_given_a_category }
|
14
|
+
it { should respond_to :probability_of_tokens_given_a_category }
|
15
|
+
it { should respond_to :probability_of_a_category }
|
16
|
+
it { should respond_to :probability_of_a_token_in_category }
|
17
|
+
# it { should respond_to :total_number_of_tokens }
|
18
|
+
it { should respond_to :total_number_of_items }
|
19
|
+
it { should respond_to :top_tokens_of_category }
|
20
|
+
it { should respond_to :default_category= }
|
21
|
+
|
22
|
+
it { should respond_to :clear! }
|
23
|
+
|
24
|
+
[:memory, :redis].each do |backend|
|
25
|
+
describe "with backend #{backend}" do
|
26
|
+
|
27
|
+
let(:classifier) { Classifier.new(:love, :hate, backend: backend) }
|
28
|
+
|
29
|
+
subject { classifier }
|
30
|
+
|
31
|
+
before(:each) do
|
32
|
+
subject.clear!
|
33
|
+
end
|
34
|
+
|
35
|
+
# describe '#total_number_of_tokens' do
|
36
|
+
# it 'calculates correctly' do
|
37
|
+
# bayes.train :love, 'I', 'love', 'you'
|
38
|
+
# bayes.train :hate, 'I', 'hate', 'you'
|
39
|
+
#
|
40
|
+
# bayes.total_number_of_tokens.should == 6
|
41
|
+
#
|
42
|
+
# bayes.train :love, 'I', 'love', 'you', 'more'
|
43
|
+
#
|
44
|
+
# bayes.total_number_of_tokens.should == 10
|
45
|
+
# end
|
46
|
+
# end
|
47
|
+
|
48
|
+
describe '#categories_count and #tokens_count' do
|
49
|
+
it 'must get it right' do
|
50
|
+
subject.backend.categories_count[:love].should == 0
|
51
|
+
|
52
|
+
subject.train :love, 'I', 'love', 'you'
|
53
|
+
subject.train :hate, 'I', 'hate', 'you'
|
54
|
+
|
55
|
+
subject.backend.categories_count[:love].should == 1
|
56
|
+
subject.backend.tokens_count[:hate]['you'].should == 1
|
57
|
+
subject.backend.tokens_count[:hate]['love'].should == 0
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
describe '#probability_of_a_token_in_category' do
|
62
|
+
it 'calculates correctly' do
|
63
|
+
subject.train :love, 'I', 'love', 'you'
|
64
|
+
subject.train :hate, 'I', 'hate', 'you'
|
65
|
+
|
66
|
+
subject.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
|
67
|
+
subject.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
|
68
|
+
subject.probability_of_a_token_in_category('I', :love).should == 0.5
|
69
|
+
|
70
|
+
subject.train :love, 'hate', 'is', 'love'
|
71
|
+
subject.train :love, 'hate', 'is', 'love'
|
72
|
+
subject.train :love, 'hate', 'is', 'love'
|
73
|
+
|
74
|
+
subject.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
|
75
|
+
subject.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
describe '#total_number_of_items' do
|
80
|
+
it 'calculates correctly' do
|
81
|
+
subject.train :love, 'I', 'love', 'you'
|
82
|
+
subject.train :hate, 'I', 'hate', 'you'
|
83
|
+
|
84
|
+
subject.total_number_of_items.should == 2
|
85
|
+
|
86
|
+
subject.train :love, 'I', 'love', 'you', 'more'
|
87
|
+
|
88
|
+
subject.total_number_of_items.should == 3
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
describe '#probability_of_a_category' do
|
93
|
+
it 'calculates correctly' do
|
94
|
+
subject.train :love, 'I', 'love', 'you'
|
95
|
+
subject.train :hate, 'I', 'hate', 'you'
|
96
|
+
|
97
|
+
subject.probability_of_a_category(:love).should == 0.5
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
describe '#probability_of_token_given_a_category' do
|
102
|
+
it 'calculates correctly' do
|
103
|
+
subject.train :love, 'I', 'love', 'you'
|
104
|
+
subject.train :hate, 'I', 'hate', 'you'
|
105
|
+
|
106
|
+
subject.probability_of_a_token_given_a_category('love', :love).should == 1
|
107
|
+
subject.probability_of_a_token_given_a_category('you', :hate).should == 1
|
108
|
+
|
109
|
+
subject.train :love, 'I', 'love', 'you', 'more'
|
110
|
+
|
111
|
+
subject.probability_of_a_token_given_a_category('more', :love).should == 0.5
|
112
|
+
# bayes.probability_of_token_given_a_category('more', :hate).should == 0
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
describe '#classifications' do
|
117
|
+
it 'calculates correctly' do
|
118
|
+
subject.train :love, 'I', 'love', 'you'
|
119
|
+
subject.train :hate, 'I', 'hate', 'you'
|
120
|
+
|
121
|
+
subject.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
|
122
|
+
subject.classify(*%w{ I love you }).should == [:love, 0.5]
|
123
|
+
subject.classify(*%w{ love }).should == [:love, 0.5]
|
124
|
+
|
125
|
+
subject.train :love, 'I', 'love', 'you'
|
126
|
+
subject.train :love, 'I', 'love', 'you'
|
127
|
+
subject.train :love, 'I', 'love', 'you'
|
128
|
+
|
129
|
+
subject.classify(*%w{ I love you }).should == [:love, 0.8]
|
130
|
+
subject.classify(*%w{ love }).should == [:love, 0.8]
|
131
|
+
subject.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
|
132
|
+
end
|
133
|
+
end
|
134
|
+
|
135
|
+
describe '#top_tokens_of_category' do
|
136
|
+
it 'finds to tokens' do
|
137
|
+
subject.train :love, 'I', 'love', 'you'
|
138
|
+
subject.train :hate, 'I', 'hate', 'you'
|
139
|
+
|
140
|
+
subject.top_tokens_of_category(:love).count.should == 3
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
describe 'class methods' do
|
147
|
+
subject { Classifier }
|
148
|
+
|
149
|
+
it { should respond_to :load_yaml }
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Forrest Ye
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -54,10 +54,14 @@ files:
|
|
54
54
|
- README.md
|
55
55
|
- Rakefile
|
56
56
|
- lib/nb.rb
|
57
|
-
- lib/nb/
|
57
|
+
- lib/nb/backend/memory.rb
|
58
|
+
- lib/nb/backend/redis.rb
|
59
|
+
- lib/nb/classifier.rb
|
58
60
|
- lib/nb/version.rb
|
59
61
|
- nb.gemspec
|
60
|
-
- spec/nb/
|
62
|
+
- spec/nb/backend/memory_spec.rb
|
63
|
+
- spec/nb/backend/redis_spec.rb
|
64
|
+
- spec/nb/classifier_spec.rb
|
61
65
|
- spec/spec_helper.rb
|
62
66
|
homepage: https://github.com/forresty/nb
|
63
67
|
licenses:
|
@@ -79,10 +83,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
79
83
|
version: '0'
|
80
84
|
requirements: []
|
81
85
|
rubyforge_project:
|
82
|
-
rubygems_version: 2.
|
86
|
+
rubygems_version: 2.4.4
|
83
87
|
signing_key:
|
84
88
|
specification_version: 4
|
85
89
|
summary: yet another Naive Bayes library
|
86
90
|
test_files:
|
87
|
-
- spec/nb/
|
91
|
+
- spec/nb/backend/memory_spec.rb
|
92
|
+
- spec/nb/backend/redis_spec.rb
|
93
|
+
- spec/nb/classifier_spec.rb
|
88
94
|
- spec/spec_helper.rb
|
data/lib/nb/naive_bayes.rb
DELETED
@@ -1,112 +0,0 @@
|
|
1
|
-
require "yaml"
|
2
|
-
|
3
|
-
class NaiveBayes
|
4
|
-
attr_accessor :categories, :tokens_count, :categories_count, :default_category
|
5
|
-
|
6
|
-
def initialize(*categories)
|
7
|
-
@categories = categories
|
8
|
-
@tokens_count = {}
|
9
|
-
@categories_count = {}
|
10
|
-
@default_category = @categories.first
|
11
|
-
|
12
|
-
categories.each do |category|
|
13
|
-
@tokens_count[category] = Hash.new(0)
|
14
|
-
@categories_count[category] = 0
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
def train(category, *tokens)
|
19
|
-
tokens.uniq.each do |token|
|
20
|
-
@tokens_count[category][token] += 1
|
21
|
-
end
|
22
|
-
@categories_count[category] += 1
|
23
|
-
end
|
24
|
-
|
25
|
-
def untrain(category, *tokens)
|
26
|
-
tokens.uniq.each do |token|
|
27
|
-
@tokens_count[category][token] -= 1
|
28
|
-
end
|
29
|
-
@categories_count[category] -= 1
|
30
|
-
end
|
31
|
-
|
32
|
-
def classify(*tokens)
|
33
|
-
result = classifications(*tokens).first
|
34
|
-
|
35
|
-
if result.last == 0.0
|
36
|
-
[@default_category, 0.0]
|
37
|
-
else
|
38
|
-
result
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def classifications(*tokens)
|
43
|
-
scores = {}
|
44
|
-
@categories.each do |category|
|
45
|
-
scores[category] = probability_of_tokens_given_a_category(tokens, category) * probability_of_a_category(category)
|
46
|
-
end
|
47
|
-
scores.sort_by { |k, v| -v }
|
48
|
-
end
|
49
|
-
|
50
|
-
def top_tokens_of_category(category, count=20)
|
51
|
-
tokens_count[category].map { |k, v| [k, v, probability_of_a_token_in_category(k, category)] }.sort_by { |i| -i.last }.first(count)
|
52
|
-
end
|
53
|
-
|
54
|
-
def probability_of_a_token_in_category(token, category)
|
55
|
-
probability_of_a_token_given_a_category(token, category) / @categories.inject(0.0) { |r, c| r + probability_of_a_token_given_a_category(token, c) }
|
56
|
-
end
|
57
|
-
|
58
|
-
def probability_of_a_token_given_a_category(token, category)
|
59
|
-
return assumed_probability if @tokens_count[category][token] == 0
|
60
|
-
|
61
|
-
@tokens_count[category][token].to_f / @categories_count[category]
|
62
|
-
end
|
63
|
-
|
64
|
-
def probability_of_tokens_given_a_category(tokens, category)
|
65
|
-
tokens.inject(1.0) do |product, token|
|
66
|
-
product * probability_of_a_token_given_a_category(token, category)
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
def probability_of_a_category(category)
|
71
|
-
@categories_count[category].to_f / total_number_of_items
|
72
|
-
end
|
73
|
-
|
74
|
-
# def total_number_of_tokens
|
75
|
-
# @tokens_count.values.inject(0) { |sum, hash| sum + hash.values.inject(&:+) }
|
76
|
-
# end
|
77
|
-
|
78
|
-
def total_number_of_items
|
79
|
-
@categories_count.values.inject(&:+)
|
80
|
-
end
|
81
|
-
|
82
|
-
# If we have only trained a little bit a class may not have had a feature yet
|
83
|
-
# give it a probability of 0 may not be true so we produce a assumed probability
|
84
|
-
# which gets smaller more we train
|
85
|
-
def assumed_probability
|
86
|
-
0.5 / (total_number_of_items.to_f / 2)
|
87
|
-
end
|
88
|
-
|
89
|
-
def data
|
90
|
-
{
|
91
|
-
:categories => @categories,
|
92
|
-
:tokens_count => @tokens_count,
|
93
|
-
:categories_count => @categories_count
|
94
|
-
}
|
95
|
-
end
|
96
|
-
|
97
|
-
def save(yaml_file)
|
98
|
-
File.write(yaml_file, data.to_yaml)
|
99
|
-
end
|
100
|
-
|
101
|
-
class << self
|
102
|
-
def load_yaml(yaml_file)
|
103
|
-
data = YAML.load_file(yaml_file)
|
104
|
-
|
105
|
-
new.tap do |bayes|
|
106
|
-
bayes.categories = data[:categories]
|
107
|
-
bayes.tokens_count = data[:tokens_count]
|
108
|
-
bayes.categories_count = data[:categories_count]
|
109
|
-
end
|
110
|
-
end
|
111
|
-
end
|
112
|
-
end
|
data/spec/nb/naive_bayes_spec.rb
DELETED
@@ -1,113 +0,0 @@
|
|
1
|
-
require "spec_helper"
|
2
|
-
|
3
|
-
describe NaiveBayes do
|
4
|
-
it { should respond_to :train }
|
5
|
-
it { should respond_to :untrain }
|
6
|
-
it { should respond_to :save }
|
7
|
-
it { should respond_to :classify }
|
8
|
-
it { should respond_to :classifications }
|
9
|
-
it { should respond_to :probability_of_a_token_given_a_category }
|
10
|
-
it { should respond_to :probability_of_tokens_given_a_category }
|
11
|
-
it { should respond_to :probability_of_a_category }
|
12
|
-
it { should respond_to :probability_of_a_token_in_category }
|
13
|
-
# it { should respond_to :total_number_of_tokens }
|
14
|
-
it { should respond_to :total_number_of_items }
|
15
|
-
it { should respond_to :top_tokens_of_category }
|
16
|
-
it { should respond_to :default_category= }
|
17
|
-
|
18
|
-
let(:bayes) { NaiveBayes.new(:love, :hate) }
|
19
|
-
subject { bayes }
|
20
|
-
|
21
|
-
# describe '#total_number_of_tokens' do
|
22
|
-
# it 'calculates correctly' do
|
23
|
-
# bayes.train :love, 'I', 'love', 'you'
|
24
|
-
# bayes.train :hate, 'I', 'hate', 'you'
|
25
|
-
#
|
26
|
-
# bayes.total_number_of_tokens.should == 6
|
27
|
-
#
|
28
|
-
# bayes.train :love, 'I', 'love', 'you', 'more'
|
29
|
-
#
|
30
|
-
# bayes.total_number_of_tokens.should == 10
|
31
|
-
# end
|
32
|
-
# end
|
33
|
-
|
34
|
-
describe '#probability_of_a_token_in_category' do
|
35
|
-
it 'calculates correctly' do
|
36
|
-
bayes.train :love, 'I', 'love', 'you'
|
37
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
38
|
-
|
39
|
-
bayes.probability_of_a_token_in_category('love', :love).should == 2.0/3 # 1 / ( 1 + 0.5 )
|
40
|
-
bayes.probability_of_a_token_in_category('hate', :love).should == 1.0/3 # 0.5 / ( 1 + 0.5 )
|
41
|
-
bayes.probability_of_a_token_in_category('I', :love).should == 0.5
|
42
|
-
|
43
|
-
bayes.train :love, 'hate', 'is', 'love'
|
44
|
-
bayes.train :love, 'hate', 'is', 'love'
|
45
|
-
bayes.train :love, 'hate', 'is', 'love'
|
46
|
-
|
47
|
-
bayes.probability_of_a_token_in_category('love', :love).should == 5.0/6 # 1 / ( 1 + 0.2 )
|
48
|
-
bayes.probability_of_a_token_in_category('hate', :love).should == 3.0/7 # 0.75 / ( 0.75 + 1 )
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
describe '#total_number_of_items' do
|
53
|
-
it 'calculates correctly' do
|
54
|
-
bayes.train :love, 'I', 'love', 'you'
|
55
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
56
|
-
|
57
|
-
bayes.total_number_of_items.should == 2
|
58
|
-
|
59
|
-
bayes.train :love, 'I', 'love', 'you', 'more'
|
60
|
-
|
61
|
-
bayes.total_number_of_items.should == 3
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
describe '#probability_of_a_category' do
|
66
|
-
it 'calculates correctly' do
|
67
|
-
bayes.train :love, 'I', 'love', 'you'
|
68
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
69
|
-
|
70
|
-
bayes.probability_of_a_category(:love).should == 0.5
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
describe '#probability_of_token_given_a_category' do
|
75
|
-
it 'calculates correctly' do
|
76
|
-
bayes.train :love, 'I', 'love', 'you'
|
77
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
78
|
-
|
79
|
-
bayes.probability_of_a_token_given_a_category('love', :love).should == 1
|
80
|
-
bayes.probability_of_a_token_given_a_category('you', :hate).should == 1
|
81
|
-
|
82
|
-
bayes.train :love, 'I', 'love', 'you', 'more'
|
83
|
-
|
84
|
-
bayes.probability_of_a_token_given_a_category('more', :love).should == 0.5
|
85
|
-
# bayes.probability_of_token_given_a_category('more', :hate).should == 0
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
describe '#classifications' do
|
90
|
-
it 'calculates correctly' do
|
91
|
-
bayes.train :love, 'I', 'love', 'you'
|
92
|
-
bayes.train :hate, 'I', 'hate', 'you'
|
93
|
-
|
94
|
-
bayes.classifications(*%w{ I love you }).should == [[:love, 0.5], [:hate, 0.25]]
|
95
|
-
bayes.classify(*%w{ I love you }).should == [:love, 0.5]
|
96
|
-
bayes.classify(*%w{ love }).should == [:love, 0.5]
|
97
|
-
|
98
|
-
bayes.train :love, 'I', 'love', 'you'
|
99
|
-
bayes.train :love, 'I', 'love', 'you'
|
100
|
-
bayes.train :love, 'I', 'love', 'you'
|
101
|
-
|
102
|
-
bayes.classify(*%w{ I love you }).should == [:love, 0.8]
|
103
|
-
bayes.classify(*%w{ love }).should == [:love, 0.8]
|
104
|
-
bayes.classify(*%w{ only love }).first.should == :love #[:love, 0.16], (0.2 * 1) * 0.8
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
describe 'class methods' do
|
109
|
-
subject { NaiveBayes }
|
110
|
-
|
111
|
-
it { should respond_to :load_yaml }
|
112
|
-
end
|
113
|
-
end
|