judgee 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in judgee.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Railsmechanic
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,54 @@
1
+ # Judgee
2
+
3
+ A simple Bayesian Classifier with additive smoothing built in.
4
+ The primary focus of judgee is on performance and a minimal but flexible feature set.
5
+ So it's up to you to do stemming, text analysis, etc.
6
+
7
+
8
+ ## Backed by Redis
9
+
10
+ [Redis](http://redis.io/) is an open source, BSD licensed, advanced key-value store, which is often referred to as a data structure server.
11
+ It supports strings, hashes, lists, sets and sorted sets, and offers incredible performance.
12
+
13
+
14
+ ## Installation
15
+
16
+ gem install judgee
17
+
18
+
19
+ ## Getting started
20
+
21
+ # Require Judgee
22
+ require "judgee"
23
+
24
+ # Create an instance of Judgee.
25
+ # It assumes that your Redis instance is running on localhost at port 6379.
26
+ judgee = Judgee::Classifier.new
27
+
28
+ # If Redis is running on another host in your network, simply pass in your options
29
+ judgee = Judgee::Classifier.new(:host => "10.0.1.1", :port => 6380)
30
+
31
+ # It also supports Unix sockets
32
+ judgee = Judgee::Classifier.new(:path => "/tmp/redis.sock")
33
+
34
+
35
+
36
+ # Now you can train the classifier
37
+ judgee.train(:spam, ["bad", "worse", "stupid", "idiotic"])
38
+ judgee.train(:ham, ["good", "better", "best", "lovely"])
39
+
40
+ # After training, classify your text sample
41
+ judgee.classify(["good", "better", "best", "worse"]) # => :ham
42
+
43
+
44
+ # Want to untrain some words?
45
+ judgee.untrain(:spam, ["bad", "worse"])
46
+
47
+
48
+ ## Information on Performance
49
+
50
+ If you read the source code, you might stumble upon the confusing method names.
51
+ There are two methods for training (train, train_fast), two methods for untraining (untrain, untrain_fast) and two methods for classification (classify, classify_fast).
52
+ The difference is quite simple. As the name suggests, all methods with the suffix '_fast' are (really) faster (3x to 10x) in processing the data, but virtually unreadable.
53
+
54
+ So use the '_fast' methods if you need performance (e.g. in production), and the methods without the suffix for learning purposes.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ lib = File.expand_path('../lib', __FILE__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require 'judgee/version'
6
+
7
# Gem specification for judgee.
# The version is read from Judgee::VERSION and the packaged file list is taken
# from `git ls-files`, so the gem must be built from inside a git checkout.
Gem::Specification.new do |gem|
  gem.name          = "judgee"
  gem.version       = Judgee::VERSION
  gem.authors       = ["Railsmechanic"]
  gem.email         = ["info@railsmechanic.de"]
  gem.description   = %q{A simple Bayesian Classifier with additive smoothing and its focus on performance.}
  gem.summary       = %q{Judgee is a simple Bayesian Classifier with additive smoothing, which uses Redis for persistence.}
  gem.homepage      = "https://github.com/railsmechanic/judgee"
  # The gem ships an MIT LICENSE.txt; declare it so it appears in the gem metadata.
  gem.license       = "MIT"

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Regular dependencies
  gem.add_dependency "redis"

  # Development dependencies
  gem.add_development_dependency "rspec"
end
@@ -0,0 +1,3 @@
1
+
2
+ require "judgee/version"
3
+ require "judgee/classifier"
@@ -0,0 +1,138 @@
1
+ # encoding: UTF-8
2
+
3
+ require "redis"
4
+
5
module Judgee
  # Bayesian text classifier with additive smoothing that keeps its word
  # counts in Redis.
  #
  # Storage layout:
  # * the set "judgee:categories" holds the normalized name of every trained category;
  # * one hash per category, "judgee:category:<name>", maps a word to its count.
  #
  # Every operation exists in two flavours: a plain one that talks to Redis
  # word by word, and a "_fast" one that batches the Redis calls (HMGET/HMSET).
  class Classifier

    ###
    # Constants
    ###
    CATEGORIES_KEY = "judgee:categories".freeze
    CATEGORY_KEY   = "judgee:category".freeze
    ALPHA          = 1.0 # additive smoothing constant

    attr_reader :redis

    # options - Hash handed through unchanged to Redis.new.
    def initialize(options={})
      @redis = Redis.new(options)
    end

    # Registers the category and adds the words in data to its counts,
    # issuing one HINCRBY per distinct word.
    #
    # category - category identifier (normalized via #category_name)
    # data     - a single word or an Array of words
    #
    # Returns "OK".
    def train(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        redis.hincrby(key, word, word_count)
      end
      "OK"
    end

    # Same result as #train, but reads and writes all counts with a single
    # HMGET/HMSET pair instead of one call per word.
    #
    # Returns "OK".
    def train_fast(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      occurances = count_occurance(data)
      # HMGET/HMSET need at least one field, so an empty sample is a no-op.
      return "OK" if occurances.empty?

      key     = redis_category_key(category)
      words   = occurances.keys
      updated = words.zip(redis.hmget(key, words)).flat_map do |word, stored|
        [word, occurances[word] + stored.to_i]
      end
      redis.hmset(key, updated)
      "OK"
    end

    # Subtracts the words in data from the category's counts. A word whose
    # count would drop to zero or below is removed from the hash.
    #
    # Returns "OK".
    def untrain(category, data)
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        new_count = redis.hget(key, word).to_i - word_count
        if new_count > 0
          redis.hset(key, word, new_count)
        else
          redis.hdel(key, word)
        end
      end
      "OK"
    end

    # Same result as #untrain, using batched HMGET/HMSET/HDEL calls.
    #
    # Returns "OK".
    def untrain_fast(category, data)
      occurances = count_occurance(data)
      # HMGET needs at least one field, so an empty sample is a no-op.
      return "OK" if occurances.empty?

      key       = redis_category_key(category)
      words     = occurances.keys
      remaining = words.zip(redis.hmget(key, words)).map do |word, stored|
        [word, stored.to_i - occurances[word]]
      end
      # Only positive counts are written back; exhausted words are deleted
      # directly instead of being stored as zero/negative values first.
      kept, dropped = remaining.partition { |_word, count| count > 0 }
      redis.hmset(key, kept.flatten) unless kept.empty?
      redis.hdel(key, dropped.map(&:first)) unless dropped.empty?
      "OK"
    end

    # Scores the sample against every trained category and returns the one
    # with the lowest accumulated |log-probability|.
    #
    # data - a single word or an Array of words
    #
    # Returns the category as a Symbol, or nil when no category has been
    # trained or data contains no words.
    def classify(data)
      categories = redis.smembers(CATEGORIES_KEY)
      occurances = count_occurance(data)
      return nil if categories.empty? || occurances.empty?

      smoothing = ALPHA * data.length
      result    = Hash.new(0)
      occurances.each do |word, word_count|
        # The denominator depends only on the word, so the per-category
        # counts are fetched once and reused for every category.
        counts      = categories.map { |category| redis.hget(redis_category_key(category), word).to_i }
        denominator = (counts.inject(0, :+) + smoothing).to_f
        categories.zip(counts).each do |category, count|
          numerator = (count + ALPHA).to_f
          result[category] += (word_count * Math.log(numerator / denominator)).abs
        end
      end

      best_category(result)
    end

    # Same result as #classify, but fetches all counts of a category with a
    # single HMGET.
    #
    # Returns the category as a Symbol, or nil when no category has been
    # trained or data contains no words.
    def classify_fast(data)
      categories = redis.smembers(CATEGORIES_KEY)
      occurances = count_occurance(data)
      return nil if categories.empty? || occurances.empty?

      words     = occurances.keys
      smoothing = ALPHA * data.length
      # counts[i][j] is the count of words[j] in categories[i].
      counts = categories.map do |category|
        redis.hmget(redis_category_key(category), words).map(&:to_f)
      end
      denominators = counts.transpose.map { |per_word| per_word.inject(0.0, :+) + smoothing }

      scores = categories.zip(counts).map do |category, category_counts|
        score = 0
        words.each_with_index do |word, index|
          ratio = (category_counts[index] + ALPHA) / denominators[index]
          score += (occurances[word] * Math.log(ratio)).abs
        end
        [category, score]
      end

      best_category(scores)
    end

    # Flushes the whole Redis database, but only when explicitly confirmed
    # by passing true.
    def flushdb(flush_db=false)
      redis.flushdb if flush_db
    end

    # Deletes the category's word hash and removes it from the category set.
    def flush_category(category)
      redis.del(redis_category_key(category))
      redis.srem(CATEGORIES_KEY, category_name(category))
    end

    private

    # Builds a bag of words (word => number of occurrences) from a single
    # word or a (possibly nested) Array of words. Entries are converted with
    # to_s and stripped; blank entries are dropped.
    #
    # Raises ArgumentError when the input cannot be processed.
    def count_occurance(data='')
      words = [data].flatten.map { |word| word.to_s.strip }.reject(&:empty?)
      words.each_with_object(Hash.new(0)) { |word, bag| bag[word] += 1 }
    rescue
      raise ArgumentError, 'input must be a single String or an Array of Strings'
    end

    # Normalizes a category identifier: downcased, stripped, runs of
    # whitespace/non-word characters replaced by "_", trailing "_" removed.
    def category_name(category)
      category.to_s.downcase.strip.gsub(/[\s\W]+/,'_').gsub(/_+$/,'')
    end

    # Redis key of the hash that stores the word counts of a category.
    def redis_category_key(category)
      "#{CATEGORY_KEY}:#{category_name(category)}"
    end

    # Returns, as a Symbol, the category with the lowest score from a
    # collection of [category, score] pairs (first one wins on ties).
    def best_category(scores)
      scores.min_by(&:last).first.to_sym
    end

  end
end
@@ -0,0 +1,5 @@
1
+ # encoding: UTF-8
2
+
3
module Judgee
  # Version string of the judgee gem; frozen so it cannot be mutated at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,36 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'judgee'
4
+ require 'redis'
5
+
6
# Integration spec for Judgee::Classifier.
# NOTE: it talks to the default Redis connection and flushes that database
# before every example, so it must not be pointed at a Redis holding real data.
describe Judgee::Classifier do

  before :each do
    @judgee = Judgee::Classifier.new
    @redis = Redis.new
    @redis.flushdb
    @spam_category = :spam_spec
    @ham_category = :ham_spec
    @spam_data = %w(money rich quick big viagra penis)
    @ham_data = %w(mail google gmail maps ruby)
    @judgee.flush_category(@spam_category)
    @judgee.flush_category(@ham_category)
  end

  describe "training" do
    it "should add category to 'judgee:categories' set" do
      @judgee.train(@ham_category, @ham_data)
      @judgee.train(@spam_category, @spam_data)
      # Use the classifier's own key constant rather than a duplicated copy.
      categories = @redis.smembers(Judgee::Classifier::CATEGORIES_KEY)
      categories.length.should eq 2
      # The set stores the bare normalized category names, without any key prefix.
      categories.should include("spam_spec")
      categories.should include("ham_spec")
    end
  end

  # TODO

end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: judgee
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Railsmechanic
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redis
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: A simple Bayesian Classifier with additive smoothing and its focus on
47
+ performance.
48
+ email:
49
+ - info@railsmechanic.de
50
+ executables: []
51
+ extensions: []
52
+ extra_rdoc_files: []
53
+ files:
54
+ - Gemfile
55
+ - LICENSE.txt
56
+ - README.md
57
+ - Rakefile
58
+ - judgee.gemspec
59
+ - lib/judgee.rb
60
+ - lib/judgee/classifier.rb
61
+ - lib/judgee/version.rb
62
+ - spec/judgee_spec.rb
63
+ homepage: https://github.com/railsmechanic/judgee
64
+ licenses: []
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ! '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 1.8.24
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Judgee is a simple Bayesian Classifier with additive smoothing, which uses
87
+ Redis for persistence.
88
+ test_files:
89
+ - spec/judgee_spec.rb