bayes_motel 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
@@ -0,0 +1,21 @@
1
+ ## MAC OS
2
+ .DS_Store
3
+
4
+ ## TEXTMATE
5
+ *.tmproj
6
+ tmtags
7
+
8
+ ## EMACS
9
+ *~
10
+ \#*
11
+ .\#*
12
+
13
+ ## VIM
14
+ *.swp
15
+
16
+ ## PROJECT::GENERAL
17
+ coverage
18
+ rdoc
19
+ pkg
20
+
21
+ ## PROJECT::SPECIFIC
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010 Mike Perham
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,55 @@
1
+ bayes_motel
2
+ --------------
3
+
4
+ BayesMotel is a multi-variate Bayesian classification engine. There are two steps to Bayesian classification:
5
+
6
+ 1. Training
7
+ You provide a set of variables along with the proper classification for that set.
8
+ 2. Runtime
9
+ You provide a set of variables and ask for the proper classification according to the training in Step 1.
10
+
11
+ Commonly this is used for spam detection. You will provide a corpus of emails or other data along with a "Spam/NotSpam" classification. The library will determine which variables affect the classification and use that to judge future data.
12
+
13
+
14
+ Usage
15
+ =============
16
+
17
+ Step one is to create a corpus that you can train with a set of previously classified documents:
18
+
19
+ corpse = BayesMotel::Corpus.new('tweets')
20
+ spam_tweets.each do |tweet|
21
+ corpse.train(tweet, :spam)
22
+ end
23
+ good_tweets.each do |tweet|
24
+ corpse.train(tweet, :ham)
25
+ end
26
+ corpse.cleanup
27
+
28
+ In this example, we have a set of spammy tweets and a set of known good tweets. We pass in each tweet
29
+ to our train() method. Once we have completed training, we call cleanup which will run through the
30
+ internal data structures and clean up any variables that are too 'unique' to make a difference in classification (for instance, an :id variable will be unique for each tweet and so will be removed in the cleanup since it does not repeat enough times).
31
+
32
+ Step two is to use the trained corpus to compute the category scores, or a single classification, for a given document:
33
+
34
+ corpse.score(new_tweet)
35
+ => { :spam => 12.4, :ham => 15.25 }
36
+ corpse.classify(new_tweet)
37
+ => [:ham, 15.25]
38
+
39
+
40
+ Trivia
41
+ ==============
42
+
43
+ Bates Motel is the motel in Alfred Hitchcock's masterpiece _Psycho_. Corpus is Latin for "body" but also means 'a canonical set of documents'. I'm not crazy, I just like puns.
44
+
45
+
46
+ Author
47
+ ==============
48
+
49
+ Mike Perham, mperham AT gmail.com, @mperham, http://mikeperham.com
50
+
51
+
52
+ Copyright
53
+ ==============
54
+
55
+ Copyright (c) 2010 Mike Perham. See LICENSE for details.
@@ -0,0 +1,53 @@
1
# Build script: gem packaging (Jeweler), tests, coverage (RCov) and RDoc.
require 'rubygems'
require 'rake'
# Resolve the version file relative to this Rakefile: since Ruby 1.9.2 the
# current directory is not on the load path, so a bare 'lib/...' require fails.
require File.expand_path('../lib/bayes_motel/version', __FILE__)

# Gem packaging/release tasks. Jeweler is optional: without it only the
# packaging tasks are missing, the rest of the Rakefile keeps working.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "bayes_motel"
    gem.summary = %Q{Bayesian classification engine}
    gem.description = %Q{http://www.mikeperham.com/2010/04/28/bayes_motel-bayesian-classification-for-ruby/}
    gem.email = "mperham@gmail.com"
    gem.homepage = "http://github.com/mperham/bayes_motel"
    gem.authors = ["Mike Perham"]
    gem.version = BayesMotel::VERSION
    gem.add_development_dependency "shoulda", ">= 0"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task; when RCov is not installed, :rcov aborts with install instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is only defined when the Jeweler tasks above were loaded;
# depending on it unconditionally would make `rake test` fail without Jeweler.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. Newer Rake versions dropped rake/rdoctask in favour of the
# task shipped with RDoc itself, so prefer that and fall back to the old one.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bayes_motel #{BayesMotel::VERSION}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
@@ -0,0 +1,3 @@
1
+ require 'bayes_motel/version'
2
+ require 'bayes_motel/persistence'
3
+ require 'bayes_motel/corpus'
@@ -0,0 +1,97 @@
1
module BayesMotel
  # A named set of training data for multi-variate classification.
  #
  # Documents are Hashes of variable => value, optionally nested. Training
  # counts, per category, how often each value of each variable was seen;
  # scoring sums the relative frequencies of a document's values per category.
  #
  # Counts live in #data with the shape
  #   { category => { "_path_to_variable" => { value => count } } }
  # where the path is the nested keys joined (and prefixed) with "_".
  class Corpus
    # Default fraction of the trained documents below which a value's count is
    # treated as too rare to matter by #cleanup.
    RARE_RATIO = 0.03

    # Key of the bucket into which #cleanup folds the counts of rare values.
    OTHER = 'other'

    attr_reader :name
    attr_reader :total_count
    attr_reader :data

    # name:: identifier of the corpus (Persistence uses it as the file name).
    def initialize(name)
      @name = name
      @total_count = 0
      @data = {}
    end

    # Records one document as an example of +category+.
    #
    # doc::      Hash of variable => value; Hash values are walked recursively.
    # category:: any object usable as a Hash key (e.g. :spam).
    def train(doc, category)
      @total_count += 1
      _training(doc, category)
    end

    # Returns a Hash of category => score for +doc+. A category's score is the
    # sum, over the document's leaf variables, of
    # (times this value was trained for the category) / (total trained documents).
    # Returns an empty Hash when nothing has been trained.
    def score(doc)
      _score(doc)
    end

    # The default classification algorithm just picks
    # the category with the highest score.
    #
    # Returns [category, score], or [:none, 0] when no category scores above
    # zero. On a tie the category encountered first wins.
    def classify(doc)
      score(doc).inject([:none, 0]) do |best, (category, value)|
        value > best[1] ? [category, value] : best
      end
    end

    # Prunes values that are too 'unique' to help classification: every value
    # counted fewer than (total_count * min_ratio).floor times is folded into
    # an 'other' bucket, and variables left empty or holding only that bucket
    # are removed (as are categories left without variables).
    #
    # min_ratio:: rarity threshold as a fraction of the trained documents.
    #
    # Returns the cleaned #data Hash.
    def cleanup(min_ratio = RARE_RATIO)
      min_count = (@total_count * min_ratio).floor
      # Iterate over a snapshot: clean may delete categories from @data.
      @data.to_a.each do |category, counts|
        clean(@data, category, counts, min_count)
      end
      @data
    end

    private

    # Walks +variables+, storing in +odds+ one relative frequency per category
    # and leaf value, then returns the per-category sums. +name+ is the key
    # path accumulated so far.
    def _score(variables, name='', odds={})
      variables.each_pair do |k, v|
        path = "#{name}_#{k}"
        if v.is_a?(Hash)
          _score(v, path, odds)
        else
          @data.each_pair do |category, raw_counts|
            counts = raw_counts[path] || {}
            # Values never seen for this category contribute 0.
            (odds[category] ||= {})["#{path}_#{v}"] = Float(counts[v] || 0) / @total_count
          end
        end
      end
      # Plain left-to-right addition (not Array#sum) so floating point results
      # are exactly the ones callers have always received.
      odds.inject({}) do |totals, (category, parts)|
        totals[category] = parts.inject(0) { |sum, (_, part)| sum + part }
        totals
      end
    end

    # Walks +variables+ and increments the count of every leaf value under
    # +category+. +name+ is the key path accumulated so far.
    def _training(variables, category, name='')
      variables.each_pair do |k, v|
        path = "#{name}_#{k}"
        if v.is_a?(Hash)
          _training(v, category, path)
        else
          counts = ((@data[category] ||= {})[path] ||= {})
          counts[v] = (counts[v] || 0) + 1
        end
      end
    end

    # Recursively cleans entry +k+ => +v+ of +hash+.
    #
    # A Hash +v+ is cleaned first and then removed from +hash+ when it ends up
    # empty or holding only the 'other' bucket. A count +v+ below +min_count+
    # is moved into hash['other'].
    def clean(hash, k, v, min_count = (@total_count * RARE_RATIO).floor)
      case v
      when Hash
        # Iterate over a snapshot: the nested calls delete keys and may insert
        # 'other', and Ruby raises when a key is added to a Hash that is being
        # iterated.
        v.to_a.each do |key, value|
          clean(v, key, value, min_count)
        end
        hash.delete(k) if v.empty? || (v.size == 1 && v[OTHER])
      else
        # Never fold the bucket into itself: that would double its count and
        # then delete it, losing everything accumulated by an earlier cleanup.
        return if k == OTHER
        if v < min_count
          hash[OTHER] = (hash[OTHER] || 0) + v
          hash.delete(k)
        end
      end
    end

  end
end
@@ -0,0 +1,14 @@
1
module BayesMotel
  # Saves and restores corpora as Marshal dumps on the local filesystem.
  module Persistence
    # TODO Make this a little more Ruby idiomatic and pluggable
    # for filesystems, databases, etc.

    # Serializes +corpus+ into a file named after corpus.name (relative to the
    # current working directory unless the name is a path), replacing any
    # existing file.
    def self.write(corpus)
      # Marshal output is binary: text mode would translate newlines on
      # Windows and may transcode, corrupting the dump.
      File.open("#{corpus.name}", 'wb') do |file|
        Marshal.dump(corpus, file)
      end
    end

    # Loads and returns the object previously stored in the file +name+.
    #
    # NOTE: Marshal.load can instantiate arbitrary objects; only read files
    # that come from a trusted source.
    def self.read(name)
      File.open("#{name}", 'rb') do |file|
        Marshal.load(file)
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module BayesMotel
  # Version of the gem; frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
Binary file
Binary file
@@ -0,0 +1,11 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+ require 'zlib'
5
+
6
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
7
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
8
+ require 'bayes_motel'
9
+
10
+ class Test::Unit::TestCase
11
+ end
@@ -0,0 +1,38 @@
1
require 'helper'

# Smoke tests that train a corpus from the gzipped tweet fixtures stored next
# to this file.
class TestBayesMotel < Test::Unit::TestCase

  should "allow basic training" do
    c = BayesMotel::Corpus.new('email')
    sample = tweets
    sample.each do |tweet|
      c.train(tweet, :ham)
    end
    c.cleanup
    assert_equal sample.size, c.total_count
  end

  should "allow big training" do
    c = BayesMotel::Corpus.new('email')
    sample = tweets(2000)
    sample.each do |tweet|
      c.train(tweet, :ham)
    end
    c.cleanup
    assert_equal sample.size, c.total_count
  end

  private

  # Returns the tweets (one Hash per line) stored in "<n>tweets.txt.gz".
  #
  # Results are memoized per +n+, so asking for a different fixture size never
  # returns a previously loaded set.
  def tweets(n=100)
    @tweets ||= {}
    @tweets[n] ||= Zlib::GzipReader.open("#{File.dirname(__FILE__)}/#{n}tweets.txt.gz") do |gz|
      gz.read.each_line.map do |line|
        # NOTE: eval executes the fixture as Ruby code; acceptable only
        # because the fixture ships with the gem. Never do this with
        # external data.
        hash = eval(line)
        hash.delete(:retweeted_status) if hash[:retweeted_status]
        hash
      end
    end
  end
end
@@ -0,0 +1,22 @@
1
require 'helper'
require 'fileutils'

# Verifies that a corpus survives a write/read round trip.
class TestPersistence < Test::Unit::TestCase

  should "persist" do
    c1 = BayesMotel::Corpus.new('test1')
    c1.train({ :something => 'foo', :kids => { :bar => 'whiz', :id => 123 } }, :ham)
    c1.train({ :something => 'foo', :kids => { :bar => 'gee', :id => 145 } }, :spam)

    begin
      BayesMotel::Persistence.write(c1)
      c2 = BayesMotel::Persistence.read('test1')
    ensure
      # Remove the scratch file even when writing or reading raises, so a
      # failing run does not leave 'test1' behind in the working directory.
      FileUtils.rm_f 'test1'
    end

    # The loaded corpus is a distinct object carrying the same state.
    assert c1 != c2
    c1.instance_variables.each do |var|
      v1 = c1.instance_variable_get(var)
      v2 = c2.instance_variable_get(var)
      assert_equal v1, v2
    end
  end

end
@@ -0,0 +1,54 @@
1
require 'helper'

# Scoring and classification behaviour of small, hand-trained corpora.
class TestTraining < Test::Unit::TestCase

  should "handle basic training" do
    corpus = trained_corpus(
      [{ :something => 'foo' }, :ham],
      [{ :something => 'foo' }, :spam]
    )

    scores = corpus.score({ :something => 'foo' })
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], scores[:ham]
  end

  should "not care about extra variables" do
    corpus = trained_corpus(
      [{ :something => 'foo' }, :ham],
      [{ :something => 'foo', :fubwhiz => 'oh noes' }, :spam]
    )

    scores = corpus.score({ :something => 'foo' })
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], scores[:ham]
  end

  should "give more weight with more appearances" do
    corpus = trained_corpus(
      [{ :something => 'foo' }, :ham],
      [{ :something => 'foo' }, :spam],
      [{ :something => 'foo', :fubwhiz => 'oh noes' }, :spam]
    )

    sample = { :something => 'foo' }
    scores = corpus.score(sample)
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], 2*scores[:ham]
    assert_equal [:spam, 2.0/3], corpus.classify(sample)
  end

  should "calculate score for nested documents" do
    corpus = trained_corpus(
      [{ :something => 'foo', :kids => { :bar => 'whiz', :id => 123 } }, :ham],
      [{ :something => 'foo', :kids => { :bar => 'gee', :id => 145 } }, :spam]
    )

    sample = { :something => 'foo', :kids => { :bar => 'gee', :id => 167, :ack => 'blag' }}
    scores = corpus.score(sample)
    assert scores
    assert_equal 2, scores.size
    assert_equal scores[:spam], 2*scores[:ham]

    assert_equal [:spam, 1.0], corpus.classify(sample)
  end

  private

  # Builds a corpus named 'test' and trains it, in order, with the given
  # [document, category] pairs.
  def trained_corpus(*samples)
    corpus = BayesMotel::Corpus.new('test')
    samples.each do |document, category|
      corpus.train(document, category)
    end
    corpus
  end
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bayes_motel
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Mike Perham
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-04-28 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: shoulda
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ requirements:
25
+ - - ">="
26
+ - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ version: "0"
30
+ type: :development
31
+ version_requirements: *id001
32
+ description: http://www.mikeperham.com/2010/04/28/bayes_motel-bayesian-classification-for-ruby/
33
+ email: mperham@gmail.com
34
+ executables: []
35
+
36
+ extensions: []
37
+
38
+ extra_rdoc_files:
39
+ - LICENSE
40
+ - README.md
41
+ files:
42
+ - .document
43
+ - .gitignore
44
+ - LICENSE
45
+ - README.md
46
+ - Rakefile
47
+ - lib/bayes_motel.rb
48
+ - lib/bayes_motel/corpus.rb
49
+ - lib/bayes_motel/persistence.rb
50
+ - lib/bayes_motel/version.rb
51
+ - test/100tweets.txt.gz
52
+ - test/2000tweets.txt.gz
53
+ - test/helper.rb
54
+ - test/test_bayes_motel.rb
55
+ - test/test_persistence.rb
56
+ - test/test_training.rb
57
+ has_rdoc: true
58
+ homepage: http://github.com/mperham/bayes_motel
59
+ licenses: []
60
+
61
+ post_install_message:
62
+ rdoc_options:
63
+ - --charset=UTF-8
64
+ require_paths:
65
+ - lib
66
+ required_ruby_version: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 0
72
+ version: "0"
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ segments:
78
+ - 0
79
+ version: "0"
80
+ requirements: []
81
+
82
+ rubyforge_project:
83
+ rubygems_version: 1.3.6
84
+ signing_key:
85
+ specification_version: 3
86
+ summary: Bayesian classification engine
87
+ test_files:
88
+ - test/helper.rb
89
+ - test/test_bayes_motel.rb
90
+ - test/test_persistence.rb
91
+ - test/test_training.rb