bayes_motel 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.md +55 -0
- data/Rakefile +53 -0
- data/lib/bayes_motel.rb +3 -0
- data/lib/bayes_motel/corpus.rb +97 -0
- data/lib/bayes_motel/persistence.rb +14 -0
- data/lib/bayes_motel/version.rb +3 -0
- data/test/100tweets.txt.gz +0 -0
- data/test/2000tweets.txt.gz +0 -0
- data/test/helper.rb +11 -0
- data/test/test_bayes_motel.rb +38 -0
- data/test/test_persistence.rb +22 -0
- data/test/test_training.rb +54 -0
- metadata +91 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2010 Mike Perham
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
bayes_motel
|
2
|
+
--------------
|
3
|
+
|
4
|
+
BayesMotel is a multi-variate Bayesian classification engine. There are two steps to Bayesian classification:
|
5
|
+
|
6
|
+
1. Training
|
7
|
+
You provide a set of variables along with the proper classification for that set.
|
8
|
+
2. Runtime
|
9
|
+
You provide a set of variables and ask for the proper classification according to the training in Step 1.
|
10
|
+
|
11
|
+
Commonly this is used for spam detection. You will provide a corpus of emails or other data along with a "Spam/NotSpam" classification. The library will determine which variables affect the classification and use that to judge future data.
|
12
|
+
|
13
|
+
|
14
|
+
Usage
|
15
|
+
=============
|
16
|
+
|
17
|
+
Step one is to create a corpus that you can train with a set of previously classified documents:
|
18
|
+
|
19
|
+
corpse = BayesMotel::Corpus.new('tweets')
|
20
|
+
spam_tweets.each do |tweet|
|
21
|
+
corpse.train(tweet, :spam)
|
22
|
+
end
|
23
|
+
good_tweets.each do |tweet|
|
24
|
+
corpse.train(tweet, :ham)
|
25
|
+
end
|
26
|
+
corpse.cleanup
|
27
|
+
|
28
|
+
In this example, we have a set of spammy tweets and a set of known good tweets. We pass in each tweet
|
29
|
+
to our train() method. Once we have completed training, we call cleanup which will run through the
|
30
|
+
internal data structures and clean up any variables that are too 'unique' to make a difference in classification (for instance, an :id variable will be unique for each tweet and so will be removed in the cleanup since it does not repeat enough times).
|
31
|
+
|
32
|
+
Step two is to use the trained corpus to get the category scores or a classification for a given document:
|
33
|
+
|
34
|
+
corpse.score(new_tweet)
|
35
|
+
=> { :spam => 12.4, :ham => 15.25 }
|
36
|
+
corpse.classify(new_tweet)
|
37
|
+
=> [:ham, 15.25]
|
38
|
+
|
39
|
+
|
40
|
+
Trivia
|
41
|
+
==============
|
42
|
+
|
43
|
+
Bates Motel is the motel in Alfred Hitchcock's masterpiece _Psycho_. Corpus is Latin for "body" but also means 'a canonical set of documents'. I'm not crazy, I just like puns.
|
44
|
+
|
45
|
+
|
46
|
+
Author
|
47
|
+
==============
|
48
|
+
|
49
|
+
Mike Perham, mperham AT gmail.com, @mperham, http://mikeperham.com
|
50
|
+
|
51
|
+
|
52
|
+
Copyright
|
53
|
+
==============
|
54
|
+
|
55
|
+
Copyright (c) 2010 Mike Perham. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# Build, test, coverage and documentation tasks for the bayes_motel gem.
require 'rubygems'
require 'rake'
# Resolve the version file relative to this Rakefile instead of the current
# directory: Ruby 1.9+ no longer has '.' on $LOAD_PATH, so a bare
# require 'lib/...' raises LoadError there.
require File.expand_path('../lib/bayes_motel/version', __FILE__)

# Gem packaging/release tasks; optional, only available when Jeweler is installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "bayes_motel"
    gem.summary = %Q{Bayesian classification engine}
    gem.description = %Q{http://www.mikeperham.com/2010/04/28/bayes_motel-bayesian-classification-for-ruby/}
    gem.email = "mperham@gmail.com"
    gem.homepage = "http://github.com/mperham/bayes_motel"
    gem.authors = ["Mike Perham"]
    gem.version = BayesMotel::VERSION
    gem.add_development_dependency "shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Runs every test/**/test_*.rb file with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task; replaced by a stub that aborts with instructions when RCov is missing.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# NOTE(review): :check_dependencies is not defined anywhere in this file; it is
# presumably provided by the Jeweler tasks above. Only depend on it when it
# exists so `rake test` still works after the Jeweler load was rescued.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# 'rake/rdoctask' is gone from current Rake releases; prefer the task class
# shipped with RDoc and fall back to the legacy one on old installations.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bayes_motel #{BayesMotel::VERSION}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/lib/bayes_motel.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
module BayesMotel
  # A trainable corpus of categorized documents.
  #
  # Documents are (possibly nested) hashes of variable => value. Training
  # counts, per category, how often each value was seen for each variable.
  # The counts live in +data+ with the shape:
  #
  #   { category => { variable_path => { value => count } } }
  #
  # where +variable_path+ is the key prefixed with "_" and, for nested hashes,
  # with the parent path (e.g. "_kids_bar").
  class Corpus
    # Values seen in fewer than this fraction of all trained documents are
    # folded into the 'other' bucket by #cleanup.
    RARE_VALUE_RATIO = 0.03

    attr_reader :name
    attr_reader :total_count
    attr_reader :data

    # name:: identifier of the corpus, stored as given.
    def initialize(name)
      @name = name
      @total_count = 0
      @data = {}
    end

    # Records one document under +category+.
    #
    # doc::      hash of variables; nested hashes are walked recursively.
    # category:: any hash-key-able label (e.g. :spam, :ham).
    def train(doc, category)
      @total_count += 1
      _training(doc, category)
    end

    # Returns a hash of category => score for +doc+. A category's score is the
    # sum, over the document's leaf variables, of
    # (times that value was trained for that category) / total_count.
    # Returns {} when nothing has been trained or the document has no leaves.
    def score(doc)
      _score(doc)
    end

    # The default classification algorithm just picks
    # the category with the highest score.
    #
    # Returns [category, score], or [:none, 0] when no category scores above 0.
    def classify(doc)
      results = score(doc)
      max = [:none, 0]
      results.each_pair do |k, v|
        max = [k, v] if v > max[1]
      end
      max
    end

    # Prunes the training data: values seen fewer than
    # (total_count * RARE_VALUE_RATIO).floor times are merged into an 'other'
    # bucket, and variables left empty or with only that bucket are removed.
    # Safe to call more than once. Returns the pruned data hash.
    def cleanup
      # Iterate over a snapshot: clean deletes entries from @data as it goes.
      @data.to_a.each do |k, v|
        clean(@data, k, v)
      end
      @data
    end

    private

    # Recursive worker for #score. +name+ is the variable path accumulated so
    # far and +odds+ collects category => { "path_value" => fraction } across
    # all nesting levels before being summed per category.
    def _score(variables, name='', odds={})
      variables.each_pair do |k, v|
        path = "#{name}_#{k}"
        case v
        when Hash
          _score(v, path, odds)
        else
          @data.each_pair do |category, raw_counts|
            cat = odds[category] ||= {}
            # A variable that was never trained (or was pruned) contributes 0.
            keys = raw_counts[path] || {}
            cat["#{path}_#{v}"] = Float(keys[v] || 0) / @total_count
          end
        end
      end
      odds.inject({}) do |memo, (category, parts)|
        memo[category] = parts.values.inject(0) { |sum, part| sum + part }
        memo
      end
    end

    # Recursive worker for #train: increments the counter of every leaf value
    # of +variables+ under +category+, descending into nested hashes.
    def _training(variables, category, name='')
      variables.each_pair do |k, v|
        case v
        when Hash
          _training(v, category, "#{name}_#{k}")
        else
          cat = (@data[category] ||= {})
          values = (cat["#{name}_#{k}"] ||= {})
          values[v] ||= 0
          values[v] += 1
        end
      end
    end

    # Prunes entry +k+ => +v+ of +hash+ (see #cleanup).
    #
    # For a nested hash, children are pruned first and the entry is dropped if
    # nothing but the 'other' bucket survives. For a counter, a count below
    # the threshold is moved into hash['other'].
    def clean(hash, k, v)
      case v
      when Hash
        # Snapshot the pairs first: pruning adds 'other' to +v+ and deletes
        # keys from it, and adding a key to a hash while iterating over it
        # raises RuntimeError on Ruby 1.9+.
        v.to_a.each do |key, value|
          clean(v, key, value)
        end
        hash.delete(k) if v.empty? || (v.size == 1 && v['other'])
      else
        # The bucket itself is never folded, otherwise a second cleanup would
        # double its count and then delete it.
        return if k == 'other'
        if v < (@total_count * RARE_VALUE_RATIO).floor
          hash['other'] = (hash['other'] || 0) + v
          hash.delete(k)
        end
      end
    end

  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module BayesMotel
  # Saves and restores a corpus as a Marshal dump on the local filesystem.
  module Persistence
    # TODO Make this a little more Ruby idiomatic and pluggable
    # for filesystems, databases, etc.

    # Dumps +corpus+ to a file whose path is corpus.name.
    #
    # The file is opened in binary mode: Marshal output is a byte stream and
    # text mode may translate line endings on some platforms.
    def self.write(corpus)
      File.open(corpus.name.to_s, 'wb') do |file|
        Marshal.dump(corpus, file)
      end
    end

    # Loads and returns the object previously written to the file +name+.
    #
    # WARNING: Marshal.load can instantiate arbitrary objects; only read
    # files this application wrote itself, never untrusted input.
    def self.read(name)
      File.open(name.to_s, 'rb') do |file|
        Marshal.load(file)
      end
    end
  end
end
|
Binary file
|
Binary file
|
data/test/helper.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Smoke tests: train a corpus from the gzipped tweet fixtures and prune it.
class TestBayesMotel < Test::Unit::TestCase

  should "allow basic training" do
    c = BayesMotel::Corpus.new('email')
    tweets.each do |tweet|
      c.train(tweet, :ham)
    end
    c.cleanup
    assert_equal tweets.size, c.total_count
  end

  should "allow big training" do
    c = BayesMotel::Corpus.new('email')
    big = tweets(2000)
    big.each do |tweet|
      c.train(tweet, :ham)
    end
    c.cleanup
    # Compare against the fixture that was actually trained, not the default one.
    assert_equal big.size, c.total_count
  end

  private

  # Returns the documents parsed from test/<n>tweets.txt.gz, one hash per line.
  # Results are memoized per +n+ so differently sized fixtures never alias.
  def tweets(n=100)
    @tweets ||= {}
    @tweets[n] ||= begin
      t = []
      Zlib::GzipReader.open("#{File.dirname(__FILE__)}/#{n}tweets.txt.gz") do |gz|
        gz.read.each_line do |line|
          # NOTE(review): eval executes each fixture line as Ruby code; this is
          # only acceptable because the fixtures ship with the gem. Never point
          # this at external data.
          hash = eval(line)
          hash.delete(:retweeted_status) if hash[:retweeted_status]
          t << hash
        end
      end
      t
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'fileutils'
|
3
|
+
|
4
|
+
# Round-trips a trained corpus through BayesMotel::Persistence.
class TestPersistence < Test::Unit::TestCase

  should "persist" do
    c1 = BayesMotel::Corpus.new('test1')
    c1.train({ :something => 'foo', :kids => { :bar => 'whiz', :id => 123 } }, :ham)
    c1.train({ :something => 'foo', :kids => { :bar => 'gee', :id => 145 } }, :spam)

    c2 = nil
    begin
      BayesMotel::Persistence.write(c1)
      c2 = BayesMotel::Persistence.read('test1')
    ensure
      # Remove the scratch file even when writing or reading raises.
      FileUtils.rm_f 'test1'
    end

    # The loaded corpus is a distinct object with identical state.
    assert c1 != c2
    c1.instance_variables.each do |var|
      v1 = c1.instance_variable_get(var)
      v2 = c2.instance_variable_get(var)
      assert_equal v1, v2
    end
  end

end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises Corpus#train, #score and #classify on small hand-built documents.
class TestTraining < Test::Unit::TestCase

  # One document per category yields one score per category, and they match.
  should "handle basic training" do
    corpus = BayesMotel::Corpus.new('test')
    [:ham, :spam].each { |label| corpus.train({ :something => 'foo' }, label) }

    scores = corpus.score({ :something => 'foo' })
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], scores[:ham]
  end

  # A variable that is absent from the scored document does not affect the result.
  should "not care about extra variables" do
    corpus = BayesMotel::Corpus.new('test')
    corpus.train({ :something => 'foo' }, :ham)
    corpus.train({ :something => 'foo', :fubwhiz => 'oh noes' }, :spam)

    scores = corpus.score({ :something => 'foo' })
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], scores[:ham]
  end

  # Two spam sightings against one ham sighting doubles the spam score.
  should "give more weight with more appearances" do
    corpus = BayesMotel::Corpus.new('test')
    corpus.train({ :something => 'foo' }, :ham)
    corpus.train({ :something => 'foo' }, :spam)
    corpus.train({ :something => 'foo', :fubwhiz => 'oh noes' }, :spam)

    sample = { :something => 'foo' }
    scores = corpus.score(sample)
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], 2 * scores[:ham]
    assert_equal [:spam, 2.0/3], corpus.classify(sample)
  end

  # Nested hashes are scored too; unknown values and variables add nothing.
  should "calculate score for nested documents" do
    corpus = BayesMotel::Corpus.new('test')
    ham_doc = { :something => 'foo', :kids => { :bar => 'whiz', :id => 123 } }
    spam_doc = { :something => 'foo', :kids => { :bar => 'gee', :id => 145 } }
    corpus.train(ham_doc, :ham)
    corpus.train(spam_doc, :spam)

    sample = { :something => 'foo', :kids => { :bar => 'gee', :id => 167, :ack => 'blag' }}
    scores = corpus.score(sample)
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], 2 * scores[:ham]

    assert_equal [:spam, 1.0], corpus.classify(sample)
  end
end
|
metadata
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bayes_motel
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Mike Perham
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-04-28 00:00:00 -05:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: shoulda
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 0
|
29
|
+
version: "0"
|
30
|
+
type: :development
|
31
|
+
version_requirements: *id001
|
32
|
+
description: http://www.mikeperham.com/2010/04/28/bayes_motel-bayesian-classification-for-ruby/
|
33
|
+
email: mperham@gmail.com
|
34
|
+
executables: []
|
35
|
+
|
36
|
+
extensions: []
|
37
|
+
|
38
|
+
extra_rdoc_files:
|
39
|
+
- LICENSE
|
40
|
+
- README.md
|
41
|
+
files:
|
42
|
+
- .document
|
43
|
+
- .gitignore
|
44
|
+
- LICENSE
|
45
|
+
- README.md
|
46
|
+
- Rakefile
|
47
|
+
- lib/bayes_motel.rb
|
48
|
+
- lib/bayes_motel/corpus.rb
|
49
|
+
- lib/bayes_motel/persistence.rb
|
50
|
+
- lib/bayes_motel/version.rb
|
51
|
+
- test/100tweets.txt.gz
|
52
|
+
- test/2000tweets.txt.gz
|
53
|
+
- test/helper.rb
|
54
|
+
- test/test_bayes_motel.rb
|
55
|
+
- test/test_persistence.rb
|
56
|
+
- test/test_training.rb
|
57
|
+
has_rdoc: true
|
58
|
+
homepage: http://github.com/mperham/bayes_motel
|
59
|
+
licenses: []
|
60
|
+
|
61
|
+
post_install_message:
|
62
|
+
rdoc_options:
|
63
|
+
- --charset=UTF-8
|
64
|
+
require_paths:
|
65
|
+
- lib
|
66
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 0
|
72
|
+
version: "0"
|
73
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
segments:
|
78
|
+
- 0
|
79
|
+
version: "0"
|
80
|
+
requirements: []
|
81
|
+
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 1.3.6
|
84
|
+
signing_key:
|
85
|
+
specification_version: 3
|
86
|
+
summary: Bayesian classification engine
|
87
|
+
test_files:
|
88
|
+
- test/helper.rb
|
89
|
+
- test/test_bayes_motel.rb
|
90
|
+
- test/test_persistence.rb
|
91
|
+
- test/test_training.rb
|