bayes_on_redis 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. data/README.markdown +13 -0
  2. data/lib/bayes_on_redis.rb +74 -0
  3. metadata +68 -0
data/README.markdown ADDED
@@ -0,0 +1,13 @@
1
+ # bayes_on_redis
2
+
3
+ Bayesian classifier on top of Redis
4
+
5
+ ## Why on Redis?
6
+
7
+ Redis is perfect for building a fast Bayesian filter.
8
+
9
+ ## Getting started
10
+
11
+ ## Contributing
12
+
13
+ [Fork the project](http://github.com/didip/bayes_on_redis) and send pull requests.
@@ -0,0 +1,74 @@
1
+ require "rubygems"
2
+ require "redis"
3
+
4
# Naive Bayes style text classifier that keeps its training data in Redis.
#
# Storage layout:
# * CATEGORIES_KEY is a Redis set holding every (downcased) category name.
# * "BayesOnRedis:cat:<category>" is a Redis hash mapping word => occurrence count.
class BayesOnRedis
  CATEGORIES_KEY = "BayesOnRedis:categories"

  # Stand-in count used while scoring a word that has no positive count in a
  # category, so that the logarithm stays finite.
  UNSEEN_WORD_SCORE = 0.1

  # options - Hash with the Redis connection settings, read from the keys
  #           :redis_host, :redis_port and :redis_db.
  def initialize(options)
    @redis = Redis.new(:host => options[:redis_host], :port => options[:redis_port], :db => options[:redis_db])
  end

  # Flushes the Redis database this instance is connected to.
  def flushdb
    @redis.flushdb
  end

  # Trains a category with the words of text.
  #
  # category - String category name; stored downcased.
  # text     - String of whitespace-separated words; counted downcased.
  def train(category, text)
    category = category.downcase
    @redis.sadd(CATEGORIES_KEY, category)

    key = redis_category_key(category)
    count_occurance(text).each do |word, count|
      @redis.hincrby(key, word, count)
    end
  end

  # Reverts a previous training: lowers the stored count of every word in
  # text by its number of occurrences, never going below zero.
  #
  # category - String category name; matched downcased.
  # text     - String of whitespace-separated words; counted downcased.
  def untrain(category, text)
    key = redis_category_key(category.downcase)

    count_occurance(text).each do |word, count|
      stored = @redis.hget(key, word)
      # A word that was never trained has nothing to decrement; skipping it
      # also avoids creating a zero-count field for it.
      next if stored.nil?

      # HGET hands the count back as a String, so convert before doing math.
      remaining = stored.to_i - count
      remaining = 0 if remaining < 0
      @redis.hset(key, word, remaining)
    end
  end

  # Scores text against every known category.
  #
  # Each distinct word of text contributes log(count_in_category / total_count)
  # once, regardless of how often it occurs in text. Categories whose total
  # word count is zero or less are removed from the category set and left out
  # of the result (scoring them would divide by zero).
  #
  # Returns a Hash of category name => score; higher means a better match.
  def classify(text)
    scores = {}
    occurrences = count_occurance(text) # same for every category, so compute once

    @redis.smembers(CATEGORIES_KEY).each do |category|
      key = redis_category_key(category)
      words_count_per_category = @redis.hvals(key).inject(0) { |sum, score| sum + score.to_i }

      if words_count_per_category <= 0
        @redis.srem(CATEGORIES_KEY, category)
        next
      end

      scores[category] = 0

      occurrences.each do |word, _count|
        tmp_score = @redis.hget(key, word).to_i
        tmp_score = UNSEEN_WORD_SCORE if tmp_score <= 0

        scores[category] += Math.log(tmp_score / words_count_per_category.to_f)
      end
    end

    scores
  end

  # Returns the name of the best-scoring category for text, or nil when there
  # is no trained category to choose from.
  def classify_for_human(text)
    best = classify(text).max_by { |_category, score| score }
    best && best.first
  end

  private

  # Name of the Redis hash holding the word counts of category.
  def redis_category_key(category)
    "BayesOnRedis:cat:#{category}"
  end

  # Splits text on whitespace and counts each word.
  # Incoming text is always downcased.
  #
  # Returns a Hash of word => number of occurrences.
  def count_occurance(text)
    counts = Hash.new(0)
    text.downcase.split.each { |word| counts[word] += 1 }
    counts
  end
end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bayes_on_redis
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Didip Kerabat
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-24 00:00:00 -08:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: The bayes_on_redis library provides Bayesian classification of a given text, similar to many SPAM/HAM filters.
23
+ email: didipk@gmail.com
24
+ executables: []
25
+
26
+ extensions: []
27
+
28
+ extra_rdoc_files: []
29
+
30
+ files:
31
+ - README.markdown
32
+ - lib/bayes_on_redis.rb
33
+ has_rdoc: true
34
+ homepage: https://github.com/didip/bayes_on_redis
35
+ licenses: []
36
+
37
+ post_install_message:
38
+ rdoc_options: []
39
+
40
+ require_paths:
41
+ - lib
42
+ required_ruby_version: !ruby/object:Gem::Requirement
43
+ none: false
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ hash: 3
48
+ segments:
49
+ - 0
50
+ version: "0"
51
+ required_rubygems_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ hash: 3
57
+ segments:
58
+ - 0
59
+ version: "0"
60
+ requirements: []
61
+
62
+ rubyforge_project:
63
+ rubygems_version: 1.3.7
64
+ signing_key:
65
+ specification_version: 3
66
+ summary: Bayesian filter on top of Redis
67
+ test_files: []
68
+