bayes_on_redis 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +13 -0
- data/lib/bayes_on_redis.rb +74 -0
- metadata +68 -0
data/README.markdown
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# bayes_on_redis
|
2
|
+
|
3
|
+
Bayesian classifier on top of Redis
|
4
|
+
|
5
|
+
## Why on Redis?
|
6
|
+
|
7
|
+
Redis is perfect for building a fast Bayesian filter.
|
8
|
+
|
9
|
+
## Getting started
|
10
|
+
|
11
|
+
## Contributing
|
12
|
+
|
13
|
+
[Fork the project](http://github.com/didip/bayes_on_redis) and send pull requests.
|
@@ -0,0 +1,74 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "redis"
|
3
|
+
|
4
|
+
# Naive Bayes style text classifier that stores its word counts in Redis.
#
# Storage layout:
# * CATEGORIES_KEY is a Redis set holding every trained (downcased) category.
# * "BayesOnRedis:cat:<category>" is a Redis hash mapping word => count.
class BayesOnRedis
  CATEGORIES_KEY = "BayesOnRedis:categories"

  # Stand-in count for a word that a category has never seen, so that the
  # logarithm in #classify stays finite.
  UNSEEN_WORD_SCORE = 0.1

  # Opens the Redis connection used by every other method.
  #
  # options - Hash read for the keys :redis_host, :redis_port and :redis_db.
  def initialize(options)
    @redis = Redis.new(:host => options[:redis_host], :port => options[:redis_port], :db => options[:redis_db])
  end

  # Flushes the currently selected Redis database (delegates to Redis#flushdb).
  def flushdb
    @redis.flushdb
  end

  # Registers +category+ and adds the word counts of +text+ to it.
  #
  # category - String category name; it is downcased before use.
  # text     - String of whitespace separated words; it is downcased too.
  #
  # Returns the Hash of word => count extracted from +text+.
  def train(category, text)
    category = category.downcase
    @redis.sadd(CATEGORIES_KEY, category)

    key = redis_category_key(category)
    count_occurance(text).each do |word, count|
      @redis.hincrby(key, word, count)
    end
  end

  # Subtracts the word counts of +text+ from +category+, never going below 0.
  #
  # category - String category name; it is downcased before use.
  # text     - String of whitespace separated words; it is downcased too.
  #
  # Returns the Hash of word => count extracted from +text+.
  def untrain(category, text)
    key = redis_category_key(category.downcase)

    count_occurance(text).each do |word, count|
      stored = @redis.hget(key, word)
      # A word this category was never trained with has nothing to subtract.
      next if stored.nil?

      # Redis hands hash values back as Strings, so convert before doing math.
      remaining = [stored.to_i - count, 0].max
      @redis.hset(key, word, remaining)
    end
  end

  # Scores +text+ against every known category.
  #
  # Each distinct word of +text+ contributes log(word_count / category_total)
  # exactly once, regardless of how often it occurs in +text+. Words unknown to
  # a category use UNSEEN_WORD_SCORE in place of the word count.
  #
  # A category whose counts sum to zero or less is removed from the category
  # set and left out of the result; scoring it would divide by zero.
  #
  # Returns a Hash of category => score. Higher scores are better matches.
  def classify(text)
    scores = {}
    # The word list does not depend on the category, so build it only once.
    occurrences = count_occurance(text)

    @redis.smembers(CATEGORIES_KEY).each do |category|
      key = redis_category_key(category)
      total = @redis.hvals(key).inject(0) { |sum, value| sum + value.to_i }

      if total <= 0
        @redis.srem(CATEGORIES_KEY, category)
        next
      end

      scores[category] = occurrences.inject(0) do |score, (word, _count)|
        seen = @redis.hget(key, word).to_i
        seen = UNSEEN_WORD_SCORE if seen <= 0
        score + Math.log(seen / total.to_f)
      end
    end

    scores
  end

  # Returns the name of the best scoring category for +text+, or nil when
  # there is no trained category to choose from.
  def classify_for_human(text)
    best = classify(text).max_by { |_category, score| score }
    best && best.first
  end

  private

  # Builds the Redis key of the hash that stores the word counts of +category+.
  def redis_category_key(category)
    "BayesOnRedis:cat:#{category}"
  end

  # Splits +text+ on whitespace after downcasing it.
  #
  # Returns a Hash of word => number of occurrences (default value 0).
  def count_occurance(text)
    text.downcase.split.inject(Hash.new(0)) do |container, word|
      container[word] += 1
      container
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bayes_on_redis
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Didip Kerabat
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-11-24 00:00:00 -08:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: The bayes_on_redis library provides Bayesian classification of a given text, similar to many SPAM/HAM filters.
|
23
|
+
email: didipk@gmail.com
|
24
|
+
executables: []
|
25
|
+
|
26
|
+
extensions: []
|
27
|
+
|
28
|
+
extra_rdoc_files: []
|
29
|
+
|
30
|
+
files:
|
31
|
+
- README.markdown
|
32
|
+
- lib/bayes_on_redis.rb
|
33
|
+
has_rdoc: true
|
34
|
+
homepage: https://github.com/didip/bayes_on_redis
|
35
|
+
licenses: []
|
36
|
+
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options: []
|
39
|
+
|
40
|
+
require_paths:
|
41
|
+
- lib
|
42
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
none: false
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
hash: 3
|
48
|
+
segments:
|
49
|
+
- 0
|
50
|
+
version: "0"
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
hash: 3
|
57
|
+
segments:
|
58
|
+
- 0
|
59
|
+
version: "0"
|
60
|
+
requirements: []
|
61
|
+
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.3.7
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Bayesian filter on top of Redis
|
67
|
+
test_files: []
|
68
|
+
|