judgee 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in judgee.gemspec
4
+ gemspec
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Railsmechanic
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,54 @@
1
+ # Judgee
2
+
3
+ A simple Bayesian Classifier with additive smoothing built in.
4
+ Judgee focuses primarily on performance and on a minimal but flexible feature set.
5
+ So it's up to you to do stemming, text analysis, etc.
6
+
7
+
8
+ ## Backed by Redis
9
+
10
+ [Redis](http://redis.io/) is an open source, BSD licensed, advanced key-value store, which is often referred to as a data structure server.
11
+ It supports strings, hashes, lists, sets and sorted sets, and offers incredible performance.
12
+
13
+
14
+ ## Installation
15
+
16
+ gem install judgee
17
+
18
+
19
+ ## Getting started
20
+
21
+ # Require Judgee
22
+ require "judgee"
23
+
24
+ # Create an instance of Judgee.
25
+ # It assumes that your Redis instance is running on localhost at port 6379.
26
+ judgee = Judgee::Classifier.new
27
+
28
+ # If Redis is running on another host in your network, simply pass in your options
29
+ judgee = Judgee::Classifier.new(:host => "10.0.1.1", :port => 6380)
30
+
31
+ # It also supports Unix sockets
32
+ judgee = Judgee::Classifier.new(:path => "/tmp/redis.sock")
33
+
34
+
35
+
36
+ # Now you can train the classifier
37
+ judgee.train(:spam, ["bad", "worse", "stupid", "idiotic"])
38
+ judgee.train(:ham, ["good", "better", "best", "lovely"])
39
+
40
+ # After training, classify your text sample
41
+ judgee.classify(["good", "better", "best", "worse"]) # => :ham
42
+
43
+
44
+ # Want to untrain some words?
45
+ judgee.untrain(:spam, ["bad", "worse"])
46
+
47
+
48
+ ## Information on Performance
49
+
50
+ If you read the source code, you might stumble upon the confusing method names.
51
+ There are two methods for training (train, train_fast), two methods for untraining (untrain, untrain_fast) and two methods for classification (classify, classify_fast).
52
+ The difference is quite simple. As the name suggests, all methods with the suffix '_fast' are (really) faster (3x to 10x) in processing the data, but virtually unreadable.
53
+
54
+ So use the '_fast' methods if you need performance (e.g. in production), and the methods without the suffix for learning purposes.
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,29 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
# Put lib/ on the load path so the version constant can be read straight from
# the working tree, without the gem being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'judgee/version'

Gem::Specification.new do |gem|
  gem.name          = "judgee"
  gem.version       = Judgee::VERSION
  gem.authors       = ["Railsmechanic"]
  gem.email         = ["info@railsmechanic.de"]
  gem.description   = %q{A simple Bayesian Classifier with additive smoothing and its focus on performance.}
  gem.summary       = %q{Judgee is a simple Bayesian Classifier with additive smoothing, which uses Redis for persistence.}
  gem.homepage      = "https://github.com/railsmechanic/judgee"
  # The package ships an MIT LICENSE.txt; declare it so it shows up in the gem metadata.
  gem.license       = "MIT"

  # Package exactly the files tracked by git.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Regular dependencies
  gem.add_dependency "redis"

  # Development dependencies
  gem.add_development_dependency "rspec"
end
@@ -0,0 +1,3 @@
1
+
2
+ require "judgee/version"
3
+ require "judgee/classifier"
@@ -0,0 +1,138 @@
1
+ # encoding: UTF-8
2
+
3
+ require "redis"
4
+
5
module Judgee
  # Bayesian text classifier with additive smoothing that keeps its word
  # counters in Redis.
  #
  # Storage layout:
  # * the set CATEGORIES_KEY holds the normalized names of all trained categories
  # * one hash per category ("judgee:category:<name>") maps word => occurrence count
  #
  # The classifier does no tokenizing or stemming: callers pass a single word
  # or an (optionally nested) Array of words. A String is treated as ONE word.
  class Classifier

    ###
    # Constants
    ###
    CATEGORIES_KEY = "judgee:categories".freeze
    CATEGORY_KEY   = "judgee:category".freeze
    # Additive smoothing constant added to every word count.
    ALPHA          = 1.0

    attr_reader :redis

    # @param options [Hash] passed unchanged to Redis.new (e.g. :host, :port, :path)
    def initialize(options={})
      @redis = Redis.new(options)
    end

    # Registers the category and adds the given words to its counters, using
    # one HINCRBY per distinct word.
    #
    # @param category [#to_s] category label
    # @param data [String, Array<String>] word(s) to learn
    # @return [String] always "OK"
    # @raise [ArgumentError] if the input cannot be converted to words
    def train(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        redis.hincrby(key, word, word_count)
      end
      "OK"
    end

    # Same result as #train, but reads all affected counters with one HMGET and
    # writes them back with one HMSET. The read-modify-write is not atomic.
    #
    # @param category [#to_s] category label
    # @param data [String, Array<String>] word(s) to learn
    # @return [String] always "OK"
    # @raise [ArgumentError] if the input cannot be converted to words
    def train_fast(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      occurances = count_occurance(data)
      # HMGET/HMSET need at least one field; without words there is nothing to store.
      return "OK" if occurances.empty?

      key     = redis_category_key(category)
      stored  = redis.hmget(key, occurances.keys)
      updated = occurances.keys.zip(stored).map do |word, stored_count|
        [word, occurances[word] + stored_count.to_i]
      end
      redis.hmset(key, updated.flatten)
      "OK"
    end

    # Subtracts the given words from the category counters, one word at a
    # time. Counters that would drop to zero or below are deleted.
    #
    # @param category [#to_s] category label
    # @param data [String, Array<String>] word(s) to forget
    # @return [String] always "OK"
    # @raise [ArgumentError] if the input cannot be converted to words
    def untrain(category, data)
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        remaining = redis.hget(key, word).to_i - word_count
        if remaining > 0
          redis.hset(key, word, remaining)
        else
          redis.hdel(key, word)
        end
      end
      "OK"
    end

    # Same result as #untrain, using one HMGET plus at most one HMSET and one
    # HDEL. The read-modify-write is not atomic.
    #
    # @param category [#to_s] category label
    # @param data [String, Array<String>] word(s) to forget
    # @return [String] always "OK"
    # @raise [ArgumentError] if the input cannot be converted to words
    def untrain_fast(category, data)
      occurances = count_occurance(data)
      return "OK" if occurances.empty?

      key       = redis_category_key(category)
      stored    = redis.hmget(key, occurances.keys)
      remaining = occurances.keys.zip(stored).map do |word, stored_count|
        [word, stored_count.to_i - occurances[word]]
      end
      # Only positive counters are written back; exhausted ones are removed
      # without ever storing a zero or negative value.
      kept, exhausted = remaining.partition { |_word, count| count > 0 }
      redis.hmset(key, kept.flatten) unless kept.empty?
      redis.hdel(key, exhausted.map(&:first)) unless exhausted.empty?
      "OK"
    end

    # Picks the best matching category for the given words, reading each
    # counter with its own HGET.
    #
    # @param data [String, Array<String>] word(s) to classify
    # @return [Symbol, nil] the winning category, or nil when there are no
    #   words or no trained categories
    # @raise [ArgumentError] if the input cannot be converted to words
    def classify(data)
      occurances = count_occurance(data)
      categories = redis.smembers(CATEGORIES_KEY)
      return nil if occurances.empty? || categories.empty?

      word_counts = {}
      categories.each do |category|
        key = redis_category_key(category)
        word_counts[category] = occurances.keys.map { |word| redis.hget(key, word).to_i }
      end
      best_category(occurances, word_counts)
    end

    # Same result as #classify, but fetches all counters of a category with a
    # single HMGET.
    #
    # @param data [String, Array<String>] word(s) to classify
    # @return [Symbol, nil] the winning category, or nil when there are no
    #   words or no trained categories
    # @raise [ArgumentError] if the input cannot be converted to words
    def classify_fast(data)
      occurances = count_occurance(data)
      categories = redis.smembers(CATEGORIES_KEY)
      return nil if occurances.empty? || categories.empty?

      word_counts = {}
      categories.each do |category|
        word_counts[category] = redis.hmget(redis_category_key(category), occurances.keys).map(&:to_f)
      end
      best_category(occurances, word_counts)
    end

    # Flushes the currently selected Redis database, i.e. ALL of its keys and
    # not only the ones written by Judgee. Does nothing unless called with a
    # truthy argument.
    #
    # @param flush_db [Boolean] explicit confirmation
    def flushdb(flush_db=false)
      redis.flushdb if flush_db
    end

    # Deletes the counters of one category and removes it from the category set.
    #
    # @param category [#to_s] category label
    def flush_category(category)
      redis.del(redis_category_key(category))
      redis.srem(CATEGORIES_KEY, category_name(category))
    end

    private

    # Scores every category and returns the one with the lowest score.
    #
    # For each word the score adds
    #   occurrences * |log((count_in_category + ALPHA) / (count_in_all_categories + ALPHA * number_of_words))|
    #
    # @param occurances [Hash{String=>Integer}] word => occurrences in the input
    # @param word_counts [Hash{String=>Array<Numeric>}] category => stored counts,
    #   in the same order as occurances.keys
    # @return [Symbol]
    def best_category(occurances, word_counts)
      frequencies = occurances.values
      smoothing   = ALPHA * frequencies.inject(0, :+)
      # The per-word totals across all categories are identical for every
      # category, so they are computed once.
      totals = word_counts.values.transpose.map { |counts| counts.inject(0, :+) + smoothing }

      scores = word_counts.map do |category, counts|
        score = 0.0
        counts.each_with_index do |count, index|
          score += (frequencies[index] * Math.log((count + ALPHA) / totals[index])).abs
        end
        [category, score]
      end
      scores.min_by(&:last).first.to_sym
    end

    # Builds a bag of words from the input: values are stringified and
    # stripped, blank entries are dropped, nested arrays are flattened.
    #
    # @param data [String, Array<String>]
    # @return [Hash{String=>Integer}] word => number of occurrences
    # @raise [ArgumentError] if any step of the conversion fails
    def count_occurance(data='')
      words = [data].flatten.map { |word| word.to_s.strip }.reject(&:empty?)
      words.each_with_object(Hash.new(0)) { |word, bag| bag[word] += 1 }
    rescue
      raise ArgumentError, 'input must be a single String or an Array of Strings'
    end

    # Normalizes a category label: lower-cased, stripped, runs of whitespace
    # and non-word characters collapsed to "_", trailing underscores removed.
    def category_name(category)
      category.to_s.downcase.strip.gsub(/[\s\W]+/, '_').gsub(/_+$/, '')
    end

    # Redis key of the hash holding the word counters of a category.
    def redis_category_key(category)
      "#{CATEGORY_KEY}:#{category_name(category)}"
    end

  end
end
@@ -0,0 +1,5 @@
1
+ # encoding: UTF-8
2
+
3
module Judgee
  # Release version of the gem; frozen so it cannot be mutated at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,36 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'judgee'
4
+ require 'redis'
5
+
6
# Integration examples for Judgee::Classifier.
# NOTE: they use Redis.new / Judgee::Classifier.new without options and call
# flushdb before every example, so the connected database is wiped.
describe Judgee::Classifier do

  CATEGORIES_KEY = "judgee:categories"
  CATEGORY_KEY   = "judgee:category"

  before :each do
    @judgee = Judgee::Classifier.new
    @redis = Redis.new
    @redis.flushdb
    @spam_category = :spam_spec
    @ham_category = :ham_spec
    @spam_data = %w(money rich quick big viagra penis)
    @ham_data = %w(mail google gmail maps ruby)
    @judgee.flush_category(@spam_category)
    @judgee.flush_category(@ham_category)
  end

  describe "training" do
    it "should add category to 'judgee:categories' set" do
      @judgee.train(@ham_category, @ham_data)
      @judgee.train(@spam_category, @spam_data)
      categories = @redis.smembers(CATEGORIES_KEY)
      categories.length.should eq 2
      # The set stores the normalized category names themselves, not prefixed keys.
      categories.should include("spam_spec")
      categories.should include("ham_spec")
    end
  end

  # TODO

end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: judgee
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Railsmechanic
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: redis
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: rspec
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :development
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ description: A simple Bayesian Classifier with additive smoothing and its focus on
47
+ performance.
48
+ email:
49
+ - info@railsmechanic.de
50
+ executables: []
51
+ extensions: []
52
+ extra_rdoc_files: []
53
+ files:
54
+ - Gemfile
55
+ - LICENSE.txt
56
+ - README.md
57
+ - Rakefile
58
+ - judgee.gemspec
59
+ - lib/judgee.rb
60
+ - lib/judgee/classifier.rb
61
+ - lib/judgee/version.rb
62
+ - spec/judgee_spec.rb
63
+ homepage: https://github.com/railsmechanic/judgee
64
+ licenses: []
65
+ post_install_message:
66
+ rdoc_options: []
67
+ require_paths:
68
+ - lib
69
+ required_ruby_version: !ruby/object:Gem::Requirement
70
+ none: false
71
+ requirements:
72
+ - - ! '>='
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ required_rubygems_version: !ruby/object:Gem::Requirement
76
+ none: false
77
+ requirements:
78
+ - - ! '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 1.8.24
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Judgee is a simple Bayesian Classifier with additive smoothing, which uses
87
+ Redis for persistence.
88
+ test_files:
89
+ - spec/judgee_spec.rb