judgee 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +54 -0
- data/Rakefile +1 -0
- data/judgee.gemspec +29 -0
- data/lib/judgee.rb +3 -0
- data/lib/judgee/classifier.rb +138 -0
- data/lib/judgee/version.rb +5 -0
- data/spec/judgee_spec.rb +36 -0
- metadata +89 -0
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Railsmechanic
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# Judgee
|
2
|
+
|
3
|
+
A simple Bayesian Classifier with additive smoothing built in.
|
4
|
+
Judgee focuses primarily on performance and on a minimal but flexible feature set.
|
5
|
+
So it's up to you to do stemming, text analysis, etc.
|
6
|
+
|
7
|
+
|
8
|
+
## Backed by Redis
|
9
|
+
|
10
|
+
[Redis](http://redis.io/) is an open source, BSD licensed, advanced key-value store, which is often referred to as a data structure server.
|
11
|
+
It supports strings, hashes, lists, sets and sorted sets, and offers incredible performance.
|
12
|
+
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
gem install judgee
|
17
|
+
|
18
|
+
|
19
|
+
## Getting started
|
20
|
+
|
21
|
+
# Require Judgee
|
22
|
+
require "judgee"
|
23
|
+
|
24
|
+
# Create an instance of Judgee.
|
25
|
+
# It assumes that your Redis instance is running on localhost at port 6379.
|
26
|
+
judgee = Judgee::Classifier.new
|
27
|
+
|
28
|
+
# If Redis is running on another host in your network, simply pass in your options
|
29
|
+
judgee = Judgee::Classifier.new(:host => "10.0.1.1", :port => 6380)
|
30
|
+
|
31
|
+
# It also supports Unix sockets
|
32
|
+
judgee = Judgee::Classifier.new(:path => "/tmp/redis.sock")
|
33
|
+
|
34
|
+
|
35
|
+
|
36
|
+
# Now you can train the classifier
|
37
|
+
judgee.train(:spam, ["bad", "worse", "stupid", "idiotic"])
|
38
|
+
judgee.train(:ham, ["good", "better", "best", "lovely"])
|
39
|
+
|
40
|
+
# After training, classify your text sample
|
41
|
+
judgee.classify(["good", "better", "best", "worse"]) # => :ham
|
42
|
+
|
43
|
+
|
44
|
+
# Want to untrain some words?
|
45
|
+
judgee.untrain(:spam, ["bad", "worse"])
|
46
|
+
|
47
|
+
|
48
|
+
## Information on Performance
|
49
|
+
|
50
|
+
If you read the source code, you might stumble upon the confusing method names.
|
51
|
+
There are two methods for training (train, train_fast), two methods for untraining (untrain, untrain_fast) and two methods for classification (classify, classify_fast).
|
52
|
+
The difference is quite simple. As the name suggests, all methods with the suffix '_fast' are (really) faster (3x to 10x) at processing the data, but their source code is virtually unreadable.
|
53
|
+
|
54
|
+
So use the '_fast' methods if you need performance (e.g. in production), and use the methods without the suffix for learning purposes.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/judgee.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
lib = File.expand_path('../lib', __FILE__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'judgee/version'
|
6
|
+
|
7
|
+
# Gem specification for judgee.
#
# The file list is taken from `git ls-files`, so building the gem requires
# running inside a git checkout with the git executable available.
Gem::Specification.new do |gem|
  gem.name          = "judgee"
  gem.version       = Judgee::VERSION
  gem.authors       = ["Railsmechanic"]
  gem.email         = ["info@railsmechanic.de"]
  gem.description   = %q{A simple Bayesian Classifier with additive smoothing and its focus on performance.}
  gem.summary       = %q{Judgee is a simple Bayesian Classifier with additive smoothing, which uses Redis for persistence.}
  gem.homepage      = "https://github.com/railsmechanic/judgee"
  # The bundled LICENSE.txt is the MIT license; declare it so it shows up in the gem metadata.
  gem.license       = "MIT"

  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Regular dependencies
  gem.add_dependency "redis"

  # Development dependencies
  gem.add_development_dependency "rspec"
end
|
data/lib/judgee.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require "redis"
|
4
|
+
|
5
|
+
module Judgee
  # Bayesian-style text classifier with additive smoothing that keeps its
  # training data in Redis.
  #
  # Storage layout, as written by the methods below:
  # * CATEGORIES_KEY            - a set holding every normalized category name
  # * "#{CATEGORY_KEY}:<name>"  - a hash mapping word => occurrence count
  #
  # The *_fast variants do the same job as their plain counterparts but use
  # one HMGET/HMSET per call instead of one command per word.
  class Classifier

    ###
    # Constants
    ###
    CATEGORIES_KEY = "judgee:categories"
    CATEGORY_KEY = "judgee:category"
    ALPHA = 1.0


    attr_reader :redis

    # options - Hash handed unchanged to Redis.new.
    def initialize(options={})
      @redis = Redis.new(options)
    end


    # Registers +category+ and adds the word counts of +data+ to it, using one
    # HINCRBY per distinct word.
    #
    # category - anything responding to to_s; normalized by category_name.
    # data     - a single String or an Array of Strings.
    #
    # Returns "OK".
    def train(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        redis.hincrby(key, word, word_count)
      end
      "OK"
    end

    # Same result as #train, but reads the current counts with one HMGET and
    # writes the sums back with one HMSET (a read-modify-write, so two round
    # trips instead of one per word).
    #
    # Returns "OK".
    def train_fast(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      occurances = count_occurance(data)
      # Nothing to count: skip HMGET/HMSET, which would be called without fields.
      return "OK" if occurances.empty?

      key = redis_category_key(category)
      words = occurances.keys
      stored_counts = redis.hmget(key, words)
      new_occurances = words.zip(stored_counts).map do |word, stored_count|
        [word, occurances[word] + stored_count.to_i]
      end
      redis.hmset(key, new_occurances.flatten)
      "OK"
    end



    # Subtracts the word counts of +data+ from +category+, one word at a time.
    # A word whose count would drop to zero or below is removed from the hash.
    #
    # Returns "OK".
    def untrain(category, data)
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        new_count = redis.hget(key, word).to_i - word_count
        if new_count > 0
          redis.hset(key, word, new_count)
        else
          redis.hdel(key, word)
        end
      end
      "OK"
    end

    # Same result as #untrain, using one HMGET plus at most one HMSET and one
    # HDEL. Only counts that stay positive are written back; the rest are
    # deleted without being written first.
    #
    # Returns "OK".
    def untrain_fast(category, data)
      occurances = count_occurance(data)
      # Nothing to subtract: skip HMGET/HMSET, which would be called without fields.
      return "OK" if occurances.empty?

      key = redis_category_key(category)
      words = occurances.keys
      stored_counts = redis.hmget(key, words)
      updated = words.zip(stored_counts).map do |word, stored_count|
        [word, stored_count.to_i - occurances[word]]
      end
      remaining, exhausted = updated.partition { |_word, count| count > 0 }

      redis.hmset(key, remaining.flatten) unless remaining.empty?
      redis.hdel(key, exhausted.map(&:first)) unless exhausted.empty?
      "OK"
    end



    # Picks the best matching category for +data+, fetching every word count
    # with its own HGET.
    #
    # data - a single String or an Array of Strings.
    #
    # Returns the category name as a Symbol, or nil when no category has been
    # trained or +data+ contains no non-blank words.
    def classify(data)
      lowest_scoring_category(data) do |key, words|
        words.map { |word| redis.hget(key, word) }
      end
    end

    # Same result as #classify, fetching the counts of each category with a
    # single HMGET.
    def classify_fast(data)
      lowest_scoring_category(data) do |key, words|
        redis.hmget(key, words)
      end
    end



    # Flushes the currently selected Redis database, but only when +flush_db+
    # is truthy; with the default argument this is a no-op.
    def flushdb(flush_db=false)
      redis.flushdb if flush_db
    end



    # Deletes the word hash of +category+ and removes it from the category set.
    def flush_category(category)
      redis.del(redis_category_key(category))
      redis.srem(CATEGORIES_KEY, category_name(category))
    end


    private


    # Shared scoring routine behind #classify and #classify_fast.
    #
    # Yields (redis_hash_key, words) once per known category; the block must
    # return the stored counts for +words+ in the same order (nil or a numeric
    # String per word).
    #
    # For each category the score is the sum over all words of
    #   word_count * |log((count_in_category + ALPHA) / denominator)|
    # where denominator = count summed over all categories + ALPHA * sample size.
    # The category with the lowest score wins; ties go to the category listed
    # first by SMEMBERS.
    def lowest_scoring_category(data)
      categories = redis.smembers(CATEGORIES_KEY)
      occurances = count_occurance(data)
      return nil if categories.empty? || occurances.empty?

      words = occurances.keys
      counts = {}
      categories.each do |category|
        counts[category] = yield(redis_category_key(category), words).map(&:to_i)
      end

      # Sample size, measured the same way count_occurance reads its input;
      # for a flat Array this equals data.length.
      smoothing = ALPHA * [data].flatten.length
      # The denominator does not depend on the category, so compute it once per word.
      denominators = (0...words.length).map do |index|
        categories.inject(0) { |sum, category| sum + counts[category][index] } + smoothing
      end

      scores = categories.map do |category|
        score = 0
        words.each_with_index do |word, index|
          numerator = counts[category][index] + ALPHA
          score += (occurances[word] * Math.log(numerator / denominators[index])).abs
        end
        [category, score]
      end

      scores.min_by(&:last).first.to_sym
    end


    # Builds a bag of words (word => number of occurrences) from +data+.
    # Entries are converted with to_s and stripped; blank entries are dropped.
    #
    # Raises ArgumentError if the conversion of any entry fails.
    def count_occurance(data='')
      words = [data].flatten.map { |word| word.to_s.strip }.reject(&:empty?)
      words.each_with_object(Hash.new(0)) { |word, bag_of_words| bag_of_words[word] += 1 }
    rescue
      raise ArgumentError, 'input must be a single String or an Array of Strings'
    end


    # Normalizes a category: lower-cased, stripped, every run of non-word
    # characters replaced by "_", trailing underscores removed.
    def category_name(category)
      category.to_s.downcase.strip.gsub(/[\s\W]+/,'_').gsub(/_+$/,'')
    end


    # Redis key of the hash that stores the word counts of +category+.
    def redis_category_key(category)
      "#{CATEGORY_KEY}:#{category_name(category)}"
    end

  end
end
|
data/spec/judgee_spec.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'judgee'
|
4
|
+
require 'redis'
|
5
|
+
|
6
|
+
# Integration spec for Judgee::Classifier.
#
# NOTE: Both Judgee::Classifier.new and Redis.new are called without options,
# and the setup calls flushdb on that connection, so it wipes the selected
# database before every example.
describe Judgee::Classifier do

  before :each do
    @judgee = Judgee::Classifier.new
    @redis = Redis.new
    @redis.flushdb
    @spam_category = :spam_spec
    @ham_category = :ham_spec
    @spam_data = %w(money rich quick big viagra penis)
    @ham_data = %w(mail google gmail maps ruby)
    @judgee.flush_category(@spam_category)
    @judgee.flush_category(@ham_category)
  end

  describe "training" do
    it "should add category to 'judgee:categories' set" do
      @judgee.train(@ham_category, @ham_data)
      @judgee.train(@spam_category, @spam_data)
      # Use the classifier's own constant instead of redefining it inside the
      # describe block, where it would leak into the top-level namespace.
      categories = @redis.smembers(Judgee::Classifier::CATEGORIES_KEY)
      categories.length.should eq 2
      # The set stores the bare normalized category names, not prefixed keys.
      categories.should include("spam_spec")
      categories.should include("ham_spec")
    end
  end

  # TODO

end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: judgee
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Railsmechanic
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: redis
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: A simple Bayesian Classifier with additive smoothing and its focus on
|
47
|
+
performance.
|
48
|
+
email:
|
49
|
+
- info@railsmechanic.de
|
50
|
+
executables: []
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files: []
|
53
|
+
files:
|
54
|
+
- Gemfile
|
55
|
+
- LICENSE.txt
|
56
|
+
- README.md
|
57
|
+
- Rakefile
|
58
|
+
- judgee.gemspec
|
59
|
+
- lib/judgee.rb
|
60
|
+
- lib/judgee/classifier.rb
|
61
|
+
- lib/judgee/version.rb
|
62
|
+
- spec/judgee_spec.rb
|
63
|
+
homepage: https://github.com/railsmechanic/judgee
|
64
|
+
licenses: []
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ! '>='
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 1.8.24
|
84
|
+
signing_key:
|
85
|
+
specification_version: 3
|
86
|
+
summary: Judgee is a simple Bayesian Classifier with additive smoothing, which uses
|
87
|
+
Redis for persistence.
|
88
|
+
test_files:
|
89
|
+
- spec/judgee_spec.rb
|