bayes_motel 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ README.md
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Mike Perham
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,55 @@
1
+ bayes_motel
2
+ --------------
3
+
4
+ BayesMotel is a multi-variate Bayesian classification engine. There are two steps to Bayesian classification:
5
+
6
+ 1. Training
7
+ You provide a set of variables along with the proper classification for that set.
8
+ 2. Runtime
9
+ You provide a set of variables and ask for the proper classification according to the training in Step 1.
10
+
11
+ Commonly this is used for spam detection. You will provide a corpus of emails or other data along with a "Spam/NotSpam" classification. The library will determine which variables affect the classification and use that to judge future data.
12
+
13
+
14
+ Usage
15
+ =============
16
+
17
+ Step one is to create a corpus that you can train with a set of previously classified documents:
18
+
19
+ corpse = BayesMotel::Corpus.new('tweets')
20
+ spam_tweets.each do |tweet|
21
+ corpse.train(tweet, :spam)
22
+ end
23
+ good_tweets.each do |tweet|
24
+ corpse.train(tweet, :ham)
25
+ end
26
+ corpse.cleanup
27
+
28
+ In this example, we have a set of spammy tweets and a set of known good tweets. We pass in each tweet
29
+ to our train() method. Once we have completed training, we call cleanup which will run through the
30
+ internal data structures and clean up any variables that are too 'unique' to make a difference in classification (for instance, an :id variable will be unique for each tweet and so will be removed in the cleanup since it does not repeat enough times).
31
+
32
+ Step two is to use the trained corpus to obtain either the per-category scores or a single classification for a given document:
33
+
34
+ corpse.score(new_tweet)
35
+ => { :spam => 12.4, :ham => 15.25 }
36
+ corpse.classify(new_tweet)
37
+ => [:ham, 15.25]
38
+
39
+
40
+ Trivia
41
+ ==============
42
+
43
+ Bates Motel is the motel in Alfred Hitchcock's masterpiece _Psycho_. Corpus is Latin for "body" but also means 'a canonical set of documents'. I'm not crazy, I just like puns.
44
+
45
+
46
+ Author
47
+ ==============
48
+
49
+ Mike Perham, mperham AT gmail.com, @mperham, http://mikeperham.com
50
+
51
+
52
+ Copyright
53
+ ==============
54
+
55
+ Copyright (c) 2010 Mike Perham. See LICENSE for details.
@@ -0,0 +1,53 @@
1
require 'rubygems'
require 'rake'
# Load the version file relative to this Rakefile. Since Ruby 1.9.2 the
# current directory is not on the load path, so a bare
# require 'lib/bayes_motel/version' raises LoadError.
require File.expand_path('lib/bayes_motel/version', File.dirname(__FILE__))

# Gem packaging / release tasks (optional: only defined when Jeweler loads).
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "bayes_motel"
    gem.summary = %Q{Bayesian classification engine}
    gem.description = %Q{http://www.mikeperham.com/2010/04/28/bayes_motel-bayesian-classification-for-ruby/}
    gem.email = "mperham@gmail.com"
    gem.homepage = "http://github.com/mperham/bayes_motel"
    gem.authors = ["Mike Perham"]
    gem.version = BayesMotel::VERSION
    gem.add_development_dependency "shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task (optional: replaced by an aborting stub when RCov is missing).
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only referenced when something has defined it
# (the Jeweler block above is the only candidate in this file); otherwise
# `rake test` would fail with an unknown-task error.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. Prefer RDoc's own task and fall back to the legacy
# rake/rdoctask, which newer Rake releases no longer provide.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end
rdoc_task_class.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bayes_motel #{BayesMotel::VERSION}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
@@ -0,0 +1,3 @@
1
+ require 'bayes_motel/version'
2
+ require 'bayes_motel/persistence'
3
+ require 'bayes_motel/corpus'
@@ -0,0 +1,97 @@
1
module BayesMotel
  # A trainable set of categorized documents.
  #
  # A document is a Hash of variable => value; a value may itself be a Hash,
  # in which case its variables are flattened into "_outer_inner" style keys.
  # Training counts, per category, how often each value of each variable was
  # seen. Scoring sums, per category, the frequency (count / total_count) of
  # every value present in the scored document.
  class Corpus
    # Fraction of +total_count+ below which a value count is considered too
    # rare to matter and is folded into the OTHER_KEY bucket by #cleanup.
    RARE_VALUE_RATIO = 0.03

    # Key under which #cleanup accumulates the counts of rare values.
    OTHER_KEY = 'other'

    attr_reader :name
    attr_reader :total_count
    attr_reader :data

    # name:: identifier of the corpus (Persistence uses it as the file name).
    def initialize(name)
      @name = name
      @total_count = 0
      @data = {}
    end

    # Records one document as belonging to +category+.
    #
    # doc::      Hash of variables, possibly nested.
    # category:: any object usable as a Hash key (e.g. :spam, :ham).
    def train(doc, category)
      @total_count += 1
      _training(doc, category)
    end

    # Returns a Hash of category => score for +doc+. Categories are the ones
    # seen during training; an untrained corpus yields an empty Hash.
    def score(doc)
      _score(doc)
    end
    # The README historically documented this method as +scores+.
    alias_method :scores, :score

    # The default classification algorithm just picks
    # the category with the highest score.
    #
    # Returns [category, score]. When no category scores above zero the
    # result is [:none, 0]; on ties the first category encountered wins.
    def classify(doc)
      score(doc).inject([:none, 0]) do |best, (category, value)|
        value > best[1] ? [category, value] : best
      end
    end

    # Prunes the training data: value counts below
    # (total_count * RARE_VALUE_RATIO).floor are merged into an 'other'
    # bucket, and variables (or categories) left empty or holding nothing
    # but that bucket are removed. Safe to call more than once.
    #
    # Returns the pruned data Hash.
    def cleanup
      # Iterate over a snapshot: clean deletes entries from @data.
      @data.to_a.each do |k, v|
        clean(@data, k, v)
      end
      @data
    end

    private

    # Walks +variables+, recording in +odds+ the frequency of every leaf
    # value per category, then returns category => sum of those frequencies.
    # +name+ is the flattened key prefix accumulated from enclosing hashes.
    def _score(variables, name='', odds={})
      variables.each_pair do |k, v|
        path = "#{name}_#{k}"
        case v
        when Hash
          _score(v, path, odds)
        else
          @data.each_pair do |category, raw_counts|
            counts = raw_counts[path] || {}
            # Values never seen for this category contribute 0.
            (odds[category] ||= {})["#{path}_#{v}"] = Float(counts[v] || 0) / @total_count
          end
        end
      end
      odds.inject({}) do |totals, (category, parts)|
        totals[category] = parts.values.inject(0) { |sum, part| sum + part }
        totals
      end
    end

    # Increments, for +category+, the counter of every leaf value found in
    # +variables+, recursing into nested hashes.
    def _training(variables, category, name='')
      variables.each_pair do |k, v|
        path = "#{name}_#{k}"
        case v
        when Hash
          _training(v, category, path)
        else
          counts = ((@data[category] ||= {})[path] ||= {})
          counts[v] = (counts[v] || 0) + 1
        end
      end
    end

    # Recursive worker for #cleanup. +hash+ is the container, +k+ the key
    # being examined and +v+ its value (a nested Hash or a numeric count).
    def clean(hash, k, v)
      case v
      when Hash
        # Snapshot the pairs first: the recursive call both adds the 'other'
        # key and deletes keys, and adding a key to a Hash while iterating it
        # raises RuntimeError on Ruby 1.9+.
        v.to_a.each do |key, value|
          clean(v, key, value)
        end
        hash.delete(k) if v.empty? || (v.size == 1 && v[OTHER_KEY])
      else
        # The bucket itself must never be folded into itself and then
        # deleted, otherwise a repeated cleanup would discard its count.
        return if k == OTHER_KEY

        if v < (@total_count * RARE_VALUE_RATIO).floor
          hash[OTHER_KEY] = (hash[OTHER_KEY] || 0) + v
          hash.delete(k)
        end
      end
    end

  end
end
@@ -0,0 +1,14 @@
1
module BayesMotel
  # Marshal-based storage of a corpus in a file named after the corpus.
  module Persistence
    # TODO Make this a little more Ruby idiomatic and pluggable
    # for filesystems, databases, etc.

    # Serializes +corpus+ with Marshal into the file named by corpus.name
    # (relative names resolve against the current working directory),
    # overwriting any existing file.
    def self.write(corpus)
      # Marshal output is binary: open in binary mode so no newline or
      # encoding translation can corrupt it.
      File.open(corpus.name.to_s, 'wb') do |file|
        Marshal.dump(corpus, file)
      end
    end

    # Loads and returns the object stored in the file called +name+.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; only read
    # files that this application wrote itself.
    def self.read(name)
      File.open(name.to_s, 'rb') do |file|
        Marshal.load(file)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module BayesMotel
  # Version of the bayes_motel gem. Frozen so that the shared constant
  # cannot be mutated in place by a caller.
  VERSION = "0.1.0".freeze
end
Binary file
Binary file
@@ -0,0 +1,11 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'zlib'
5
+
6
# Put the library under test (../lib) and this test directory itself on the
# load path, then load the library.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)
require 'bayes_motel'

# Reopened as a hook for shared test helpers; nothing is added at present.
class Test::Unit::TestCase
end
@@ -0,0 +1,38 @@
1
+ require 'helper'
2
+
3
# Smoke tests that train a corpus from the gzipped tweet fixtures that sit
# next to this file.
class TestBayesMotel < Test::Unit::TestCase

  should "allow basic training" do
    sample = tweets
    c = BayesMotel::Corpus.new('email')
    sample.each do |tweet|
      c.train(tweet, :ham)
    end
    c.cleanup
    assert_equal sample.size, c.total_count
  end

  should "allow big training" do
    sample = tweets(2000)
    c = BayesMotel::Corpus.new('email')
    sample.each do |tweet|
      c.train(tweet, :ham)
    end
    c.cleanup
    assert_equal sample.size, c.total_count
  end

  private

  # Returns the tweets stored in "<n>tweets.txt.gz" as an Array of Hashes,
  # one per line, with any :retweeted_status entry removed.
  #
  # Results are cached per +n+, so asking for a different fixture size
  # never returns a previously loaded fixture.
  def tweets(n=100)
    @tweets ||= {}
    @tweets[n] ||= begin
      t = []
      Zlib::GzipReader.open("#{File.dirname(__FILE__)}/#{n}tweets.txt.gz") do |gz|
        gz.read.each_line do |line|
          # NOTE(review): eval executes whatever the fixture contains. That
          # is acceptable only because the fixture ships with the gem; never
          # point this at data from an untrusted source.
          hash = eval(line)
          hash.delete(:retweeted_status) if hash[:retweeted_status]
          t << hash
        end
      end
      t
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require 'helper'
2
+ require 'fileutils'
3
+
4
# Round-trips a trained corpus through BayesMotel::Persistence.
class TestPersistence < Test::Unit::TestCase

  should "persist" do
    c1 = BayesMotel::Corpus.new('test1')
    c1.train({ :something => 'foo', :kids => { :bar => 'whiz', :id => 123 } }, :ham)
    c1.train({ :something => 'foo', :kids => { :bar => 'gee', :id => 145 } }, :spam)

    begin
      BayesMotel::Persistence.write(c1)
      c2 = BayesMotel::Persistence.read('test1')
    ensure
      # Remove the file written to the working directory even when the
      # write or read above raises, so a failure leaves nothing behind.
      FileUtils.rm_f 'test1'
    end

    # The loaded corpus is a different object with identical state.
    assert c1 != c2
    c1.instance_variables.each do |var|
      v1 = c1.instance_variable_get(var)
      v2 = c2.instance_variable_get(var)
      assert_equal v1, v2
    end
  end

end
@@ -0,0 +1,54 @@
1
+ require 'helper'
2
+
3
# Scoring and classification behaviour of BayesMotel::Corpus on small,
# hand-built training sets.
#
# Assertions pass the expected value first (the order assert_equal uses in
# its failure message), and float scores are compared with a tolerance
# instead of exact equality.
class TestTraining < Test::Unit::TestCase
  DELTA = 1e-9

  should "handle basic training" do
    c = BayesMotel::Corpus.new('test')
    c.train({ :something => 'foo' }, :ham)
    c.train({ :something => 'foo' }, :spam)

    results = c.score({ :something => 'foo' })
    assert results
    assert_equal 2, results.size
    assert_in_delta results[:ham], results[:spam], DELTA
  end

  should "not care about extra variables" do
    c = BayesMotel::Corpus.new('test')
    c.train({ :something => 'foo' }, :ham)
    c.train({ :something => 'foo', :fubwhiz => 'oh noes' }, :spam)

    results = c.score({ :something => 'foo' })
    assert results
    assert_equal 2, results.size
    assert_in_delta results[:ham], results[:spam], DELTA
  end

  should "give more weight with more appearances" do
    c = BayesMotel::Corpus.new('test')
    c.train({ :something => 'foo' }, :ham)
    c.train({ :something => 'foo' }, :spam)
    c.train({ :something => 'foo', :fubwhiz => 'oh noes' }, :spam)

    doc = { :something => 'foo' }
    results = c.score(doc)
    assert results
    assert_equal 2, results.size
    assert_in_delta 2*results[:ham], results[:spam], DELTA

    category, value = c.classify(doc)
    assert_equal :spam, category
    assert_in_delta 2.0/3, value, DELTA
  end

  should "calculate score for nested documents" do
    c = BayesMotel::Corpus.new('test')
    c.train({ :something => 'foo', :kids => { :bar => 'whiz', :id => 123 } }, :ham)
    c.train({ :something => 'foo', :kids => { :bar => 'gee', :id => 145 } }, :spam)

    doc = { :something => 'foo', :kids => { :bar => 'gee', :id => 167, :ack => 'blag' }}
    results = c.score(doc)
    assert results
    assert_equal 2, results.size
    assert_in_delta 2*results[:ham], results[:spam], DELTA

    category, value = c.classify(doc)
    assert_equal :spam, category
    assert_in_delta 1.0, value, DELTA
  end
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bayes_motel
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Mike Perham
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-04-28 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ description: http://www.mikeperham.com/2010/04/28/bayes_motel-bayesian-classification-for-ruby/
33
+ email: mperham@gmail.com
34
+ executables: []
35
+
36
+ extensions: []
37
+
38
+ extra_rdoc_files:
39
+ - LICENSE
40
+ - README.md
41
+ files:
42
+ - .document
43
+ - .gitignore
44
+ - LICENSE
45
+ - README.md
46
+ - Rakefile
47
+ - lib/bayes_motel.rb
48
+ - lib/bayes_motel/corpus.rb
49
+ - lib/bayes_motel/persistence.rb
50
+ - lib/bayes_motel/version.rb
51
+ - test/100tweets.txt.gz
52
+ - test/2000tweets.txt.gz
53
+ - test/helper.rb
54
+ - test/test_bayes_motel.rb
55
+ - test/test_persistence.rb
56
+ - test/test_training.rb
57
+ has_rdoc: true
58
+ homepage: http://github.com/mperham/bayes_motel
59
+ licenses: []
60
+
61
+ post_install_message:
62
+ rdoc_options:
63
+ - --charset=UTF-8
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 0
72
+ version: "0"
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirements: []
81
+
82
+ rubyforge_project:
83
+ rubygems_version: 1.3.6
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Bayesian classification engine
87
+ test_files:
88
+ - test/helper.rb
89
+ - test/test_bayes_motel.rb
90
+ - test/test_persistence.rb
91
+ - test/test_training.rb