judgee 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +54 -0
- data/Rakefile +1 -0
- data/judgee.gemspec +29 -0
- data/lib/judgee.rb +3 -0
- data/lib/judgee/classifier.rb +138 -0
- data/lib/judgee/version.rb +5 -0
- data/spec/judgee_spec.rb +36 -0
- metadata +89 -0
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Railsmechanic
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# Judgee
|
2
|
+
|
3
|
+
A simple Bayesian Classifier with additive smoothing built in.
|
4
|
+
The primary focus of judgee is on performance and a minimal but flexible feature set.
|
5
|
+
So it's up to you to do stemming, text analysis, etc.
|
6
|
+
|
7
|
+
|
8
|
+
## Backed by Redis
|
9
|
+
|
10
|
+
[Redis](http://redis.io/) is an open source, BSD licensed, advanced key-value store, which is often referred to as a data structure server.
|
11
|
+
It supports strings, hashes, lists, sets, sorted sets and offers an incredible performance.
|
12
|
+
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
gem install judgee
|
17
|
+
|
18
|
+
|
19
|
+
## Getting started
|
20
|
+
|
21
|
+
# Require Judgee
|
22
|
+
require "judgee"
|
23
|
+
|
24
|
+
# Create an instance of Judgee.
|
25
|
+
# It assumes that your Redis instance is running on localhost at port 6379.
|
26
|
+
judgee = Judgee::Classifier.new
|
27
|
+
|
28
|
+
# If Redis is running on another host in your network, simply pass in your options
|
29
|
+
judgee = Judgee::Classifier.new(:host => "10.0.1.1", :port => 6380)
|
30
|
+
|
31
|
+
# It also supports Unix sockets
|
32
|
+
judgee = Judgee::Classifier.new(:path => "/tmp/redis.sock")
|
33
|
+
|
34
|
+
|
35
|
+
|
36
|
+
# Now you can train the classifier
|
37
|
+
judgee.train(:spam, ["bad", "worse", "stupid", "idiotic"])
|
38
|
+
judgee.train(:ham, ["good", "better", "best", "lovely"])
|
39
|
+
|
40
|
+
# After training, classify your text sample
|
41
|
+
judgee.classify(["good", "better", "best", "worse"]) # => :ham
|
42
|
+
|
43
|
+
|
44
|
+
# Want to untrain some words?
|
45
|
+
judgee.untrain(:spam, ["bad", "worse"])
|
46
|
+
|
47
|
+
|
48
|
+
## Information on Performance
|
49
|
+
|
50
|
+
If you read the source code, you might stumble upon the confusing method names.
|
51
|
+
There are two methods for training (train, train_fast), two methods for untraining (untrain, untrain_fast) and two methods for classification (classify, classify_fast).
|
52
|
+
The difference is quite simple. As the name suggests, all methods with the suffix '_fast' are (really) faster (3x to 10x) in processing the data, but virtually unreadable.
|
53
|
+
|
54
|
+
So use the '_fast' methods if you need performance (e.g. in production), and the methods without the suffix for learning purposes.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/judgee.gemspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
# Make the gem's lib/ directory loadable so the version constant can be required.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'judgee/version'

# Gem specification for judgee.
Gem::Specification.new do |gem|
  gem.name          = "judgee"
  gem.version       = Judgee::VERSION
  gem.authors       = ["Railsmechanic"]
  gem.email         = ["info@railsmechanic.de"]
  gem.description   = %q{A simple Bayesian Classifier with additive smoothing and its focus on performance.}
  gem.summary       = %q{Judgee is a simple Bayesian Classifier with additive smoothing, which uses Redis for persistance.}
  # The homepage was previously assigned twice with the same value; once is enough.
  gem.homepage      = "https://github.com/railsmechanic/judgee"

  # Package every file tracked by git; executables and tests are derived from that list.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Regular dependencies
  gem.add_dependency "redis"

  # Development dependencies
  gem.add_development_dependency "rspec"
end
|
data/lib/judgee.rb
ADDED
@@ -0,0 +1,138 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require "redis"
|
4
|
+
|
5
|
+
module Judgee
  # A Bayesian-style classifier with additive smoothing that keeps its word
  # counts in Redis.
  #
  # Each category is stored as a Redis hash ("judgee:category:<name>") that maps
  # a word to the number of times it was trained; the names of all known
  # categories live in the Redis set "judgee:categories".
  #
  # Every operation exists in two flavours: a plain one that talks to Redis
  # word by word, and a "_fast" one that batches the reads/writes per category.
  class Classifier

    ###
    # Constants
    ###
    CATEGORIES_KEY = "judgee:categories"
    CATEGORY_KEY   = "judgee:category"
    ALPHA          = 1.0 # additive smoothing constant

    attr_reader :redis

    # @param options [Hash] passed unchanged to Redis.new
    def initialize(options={})
      @redis = Redis.new(options)
    end

    # Registers +category+ and adds the word counts of +data+ to it, issuing
    # one HINCRBY per distinct word.
    #
    # @param category [#to_s] category name (normalized by #category_name)
    # @param data [String, Array<String>] word(s) to train
    # @return [String] always "OK"
    def train(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        redis.hincrby(key, word, word_count)
      end
      "OK"
    end

    # Batched variant of #train: reads the stored counts with one HMGET and
    # writes the sums back with one HMSET.
    #
    # NOTE: this is a read-then-write sequence of two separate commands, unlike
    # the HINCRBY used by #train.
    #
    # @param category [#to_s] category name (normalized by #category_name)
    # @param data [String, Array<String>] word(s) to train
    # @return [String] always "OK"
    def train_fast(category, data)
      redis.sadd(CATEGORIES_KEY, category_name(category))
      occurances = count_occurance(data)
      # Nothing to store; matches #train, which also just registers the category.
      return "OK" if occurances.empty?

      key     = redis_category_key(category)
      words   = occurances.keys
      updated = words.zip(redis.hmget(key, words)).map do |word, stored|
        [word, occurances[word] + stored.to_i] # a missing field comes back as nil => 0
      end
      redis.hmset(key, updated.flatten)
      "OK"
    end

    # Subtracts the word counts of +data+ from +category+. A word whose count
    # would drop to zero or below is removed from the category hash.
    #
    # @param category [#to_s] category name (normalized by #category_name)
    # @param data [String, Array<String>] word(s) to untrain
    # @return [String] always "OK"
    def untrain(category, data)
      key = redis_category_key(category)
      count_occurance(data).each do |word, word_count|
        remaining = redis.hget(key, word).to_i - word_count
        if remaining > 0
          redis.hset(key, word, remaining)
        else
          redis.hdel(key, word)
        end
      end
      "OK"
    end

    # Batched variant of #untrain: one HMGET to read, then one HMSET for the
    # words that keep a positive count and one HDEL for the words that do not.
    #
    # @param category [#to_s] category name (normalized by #category_name)
    # @param data [String, Array<String>] word(s) to untrain
    # @return [String] always "OK"
    def untrain_fast(category, data)
      occurances = count_occurance(data)
      return "OK" if occurances.empty?

      key       = redis_category_key(category)
      words     = occurances.keys
      remaining = words.zip(redis.hmget(key, words)).map do |word, stored|
        [word, stored.to_i - occurances[word]]
      end
      kept, exhausted = remaining.partition { |_word, count| count > 0 }

      # Only positive counts are written; exhausted words are deleted outright
      # instead of being written first and removed afterwards.
      redis.hmset(key, kept.flatten) unless kept.empty?
      redis.hdel(key, exhausted.map(&:first)) unless exhausted.empty?
      "OK"
    end

    # Scores every known category against +data+ and returns the best match.
    #
    # For each word the score contribution is
    #   |word_count * log((count_in_category + ALPHA) / (count_in_all_categories + ALPHA * data.length))|
    # and the category with the smallest total wins.
    #
    # NOTE(review): the smoothing term uses data.length, which is the number of
    # characters when +data+ is a single String - kept as in the original.
    #
    # @param data [String, Array<String>] word(s) to classify
    # @return [Symbol, nil] the winning category, or nil when there are no
    #   trained categories or +data+ contains no usable words
    def classify(data)
      occurances = count_occurance(data)
      categories = redis.smembers(CATEGORIES_KEY)
      return nil if occurances.empty? || categories.empty?

      smoothing = ALPHA * data.length

      # The denominator only depends on the word, so compute it once per word
      # rather than once per (category, word) pair.
      denominators = occurances.keys.each_with_object({}) do |word, totals|
        seen = categories.map { |name| redis.hget(redis_category_key(name), word).to_i }
        totals[word] = (seen.inject(0, :+) + smoothing).to_f
      end

      scores = categories.each_with_object({}) do |category, result|
        key = redis_category_key(category)
        result[category] = occurances.inject(0.0) do |score, (word, word_count)|
          numerator = (redis.hget(key, word).to_i + ALPHA).to_f
          score + (word_count * Math.log(numerator / denominators[word])).abs
        end
      end

      scores.min_by(&:last).first.to_sym
    end

    # Batched variant of #classify: fetches all needed counts with a single
    # HMGET per category and computes the same scores from them.
    #
    # @param data [String, Array<String>] word(s) to classify
    # @return [Symbol, nil] the winning category, or nil when there are no
    #   trained categories or +data+ contains no usable words
    def classify_fast(data)
      occurances = count_occurance(data)
      categories = redis.smembers(CATEGORIES_KEY)
      return nil if occurances.empty? || categories.empty?

      words     = occurances.keys
      smoothing = ALPHA * data.length

      # category => stored counts, in the same order as +words+
      counts = categories.each_with_object({}) do |category, table|
        table[category] = redis.hmget(redis_category_key(category), words).map(&:to_f)
      end

      # Per-word totals across all categories, plus the smoothing term.
      denominators = counts.values.transpose.map do |column|
        column.inject(0.0, :+) + smoothing
      end

      scores = counts.each_with_object({}) do |(category, row), result|
        result[category] = row.each_with_index.inject(0.0) do |score, (count, index)|
          weight = occurances[words[index]]
          score + (weight * Math.log((count + ALPHA) / denominators[index])).abs
        end
      end

      scores.min_by(&:last).first.to_sym
    end

    # Flushes the whole current Redis database, but only when explicitly
    # confirmed by passing true.
    #
    # @param flush_db [Boolean] safety switch; nothing happens unless truthy
    def flushdb(flush_db=false)
      redis.flushdb if flush_db
    end

    # Removes the category's word hash and its entry in the categories set.
    #
    # @param category [#to_s] category name (normalized by #category_name)
    def flush_category(category)
      redis.del(redis_category_key(category))
      redis.srem(CATEGORIES_KEY, category_name(category))
    end

    private

    # Builds a bag of words: each stripped, non-empty word mapped to how often
    # it occurs in +data+. The returned hash yields 0 for unknown words.
    #
    # @param data [String, Array<String>] a word or a (possibly nested) list of words
    # @return [Hash{String => Integer}]
    # @raise [ArgumentError] if an element cannot be converted
    def count_occurance(data='')
      words = [data].flatten.map { |word| word.to_s.strip }.reject(&:empty?)
      words.each_with_object(Hash.new(0)) { |word, bag| bag[word] += 1 }
    rescue StandardError
      raise ArgumentError, 'input must be a single String or an Array of Strings'
    end

    # Normalizes a category name: lower-cased, stripped, every run of
    # non-word characters replaced by "_", trailing underscores removed.
    def category_name(category)
      category.to_s.downcase.strip.gsub(/[\s\W]+/,'_').gsub(/_+$/,'')
    end

    # Redis key of the hash holding the word counts for +category+.
    def redis_category_key(category)
      "#{CATEGORY_KEY}:#{category_name(category)}"
    end

  end
end
|
data/spec/judgee_spec.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'judgee'
|
4
|
+
require 'redis'
|
5
|
+
|
6
|
+
describe Judgee::Classifier do

  # Redis keys used by the classifier (mirrors the constants in Judgee::Classifier).
  CATEGORIES_KEY = "judgee:categories"
  CATEGORY_KEY   = "judgee:category"

  # Start every example from an empty Redis database and fresh fixtures.
  before :each do
    @judgee = Judgee::Classifier.new
    @redis  = Redis.new
    @redis.flushdb
    @spam_category = :spam_spec
    @ham_category  = :ham_spec
    @spam_data = %w(money rich quick big viagra penis)
    @ham_data  = %w(mail google gmail maps ruby)
    @judgee.flush_category(@spam_category)
    @judgee.flush_category(@ham_category)
  end

  describe "training" do
    it "should add category to 'judgee:categories' set" do
      @judgee.train(@ham_category, @ham_data)
      @judgee.train(@spam_category, @spam_data)
      categories = @redis.smembers(CATEGORIES_KEY)
      categories.length.should eq 2
      # The set holds the bare, normalized category names (train calls
      # SADD with category_name(category)), not prefixed keys.
      categories.should include("spam_spec")
      categories.should include("ham_spec")
    end
  end

  # TODO

end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: judgee
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Railsmechanic
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-03-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: redis
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: A simple Bayesian Classifier with additive smoothing and its focus on
|
47
|
+
performance.
|
48
|
+
email:
|
49
|
+
- info@railsmechanic.de
|
50
|
+
executables: []
|
51
|
+
extensions: []
|
52
|
+
extra_rdoc_files: []
|
53
|
+
files:
|
54
|
+
- Gemfile
|
55
|
+
- LICENSE.txt
|
56
|
+
- README.md
|
57
|
+
- Rakefile
|
58
|
+
- judgee.gemspec
|
59
|
+
- lib/judgee.rb
|
60
|
+
- lib/judgee/classifier.rb
|
61
|
+
- lib/judgee/version.rb
|
62
|
+
- spec/judgee_spec.rb
|
63
|
+
homepage: https://github.com/railsmechanic/judgee
|
64
|
+
licenses: []
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
require_paths:
|
68
|
+
- lib
|
69
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
71
|
+
requirements:
|
72
|
+
- - ! '>='
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
none: false
|
77
|
+
requirements:
|
78
|
+
- - ! '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 1.8.24
|
84
|
+
signing_key:
|
85
|
+
specification_version: 3
|
86
|
+
summary: Judgee is a simple Bayesian Classifier with additive smoothing, which uses
|
87
|
+
Redis for persistance.
|
88
|
+
test_files:
|
89
|
+
- spec/judgee_spec.rb
|