classyfier 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +3 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/classyfier.gemspec +21 -0
- data/lib/classyfier.rb +2 -0
- data/lib/classyfier/naive_bayes.rb +108 -0
- data/lib/classyfier/version.rb +3 -0
- data/test/helper.rb +10 -0
- data/test/test_classyfier.rb +80 -0
- metadata +77 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/classyfier.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for classyfier.
# File lists are taken from the git index, so this must be evaluated inside
# the gem's git checkout.

# Put lib/ on the load path so the version constant can be required below.
$LOAD_PATH.push(File.expand_path("../lib", __FILE__))
require "classyfier/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "classyfier"
  spec.version     = Classyfier::VERSION
  spec.platform    = Gem::Platform::RUBY
  spec.authors     = ["Josh Cutler"]
  spec.email       = ["josh@codepresencelabs.com"]
  spec.homepage    = ""
  spec.summary     = "Simple Naive Bayesian Classifier"
  spec.description = "Simple Naive Bayesian Classifier"

  spec.rubyforge_project = "classyfier"

  # Contents: everything tracked by git; tests and executables by path pattern.
  tracked_files    = `git ls-files`.split("\n")
  tracked_tests    = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_binaries = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_binaries.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
|
data/lib/classyfier.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
module Classyfier
  module NaiveBayes
    # Naive Bayes classifier trained on hashes of attribute => value pairs.
    #
    # String values are split on whitespace and every token is counted as a
    # separate observation of that attribute; any other value is counted as-is.
    #
    # Options accepted by +new+:
    #   :laplacean_smoother - additive smoothing constant (default 0, i.e. none)
    #   :precision          - number of decimal places kept in returned scores
    #                         (default 5)
    class NaiveBayesClassifier
      # Number of training examples seen so far.
      attr_reader :data_size
      # attribute_counts: category => { "<name>_<key>" => { value => count } }
      # category_counts:  category => number of training examples
      attr_reader :attribute_counts, :category_counts

      def initialize(opts = {})
        @data_size = 0
        @attribute_counts = {}
        @category_counts = {}

        @laplacean_smoother = opts[:laplacean_smoother] || 0
        # @precision is the rounding scale 10**digits. The default of 100000.0
        # is 10**5; the previous code computed digits**10, which does not
        # round to decimal places at all.
        @precision = opts[:precision] ? Float(10**opts[:precision]) : 100000.0
      end

      # Records one training example.
      #
      # data_hash - Hash of attribute => value describing the example.
      # category  - label of the example (any object usable as a Hash key).
      def train(data_hash, category)
        @data_size += 1
        @category_counts[category] = (@category_counts[category] || 0) + 1

        _learn(data_hash, category)
      end

      # Returns [category, score] for the highest-scoring category, or
      # [:none, 0] when no category has a score greater than zero.
      def classify(data_hash)
        best = [:none, 0]
        _category_scores(data_hash).each_pair do |category, score|
          best = [category, score] if score > best[1]
        end
        best
      end

      # Returns a Hash of category => posterior probability for data_hash,
      # rounded to the configured precision.
      def category_scores(data_hash)
        _category_scores(data_hash)
      end

      private

      # Computes the rounded posterior of every category that has stored
      # attribute counts. +odds+ is filled with the per-category conditional
      # probabilities of each observed attribute value.
      def _category_scores(data_hash, name = '', odds = {})
        _each_attribute_value(data_hash) do |key, value|
          _calculate_conditional(key, value, name, odds)
        end

        # likelihood * prior per category, computed once; their sum is the
        # marginal probability of the data.
        joints = []
        marginal = 0
        odds.each_pair do |category, conditionals|
          likelihood = conditionals.values.inject(1) { |product, probability| product * probability }
          joint = likelihood * _prior(category)
          joints << [category, joint]
          marginal += joint
        end

        posteriors = {}
        joints.each do |category, joint|
          raw = joint / marginal
          # A zero marginal, or a zero denominator in a conditional, yields
          # NaN here; such a score is reported as 0 rather than raising from
          # Float#round.
          posteriors[category] = raw.finite? ? (raw * @precision).round / @precision : 0
        end
        posteriors
      end

      # Smoothed prior probability of +category+.
      def _prior(category)
        (@category_counts[category] + @laplacean_smoother) /
          (Float(@data_size) + @laplacean_smoother * @category_counts.size)
      end

      # Stores in odds[category] the smoothed conditional probability of
      # +value+ for attribute +key+, for every category with stored counts.
      # The smoothing denominator uses the number of distinct values that
      # category has seen for the attribute.
      def _calculate_conditional(key, value, name, odds)
        attribute = "#{name}_#{key}"
        @attribute_counts.each_pair do |category, raw_counts|
          value_counts = raw_counts[attribute] || {}
          total = value_counts.values.inject(0) { |sum, count| sum + count }
          numerator = Float((value_counts[value] || 0) + @laplacean_smoother)
          denominator = total + @laplacean_smoother * value_counts.length
          (odds[category] ||= {})["#{attribute}_#{value}"] = numerator / denominator
        end
      end

      # Adds every attribute value of data_hash to the counts of +category+.
      def _learn(data_hash, category, name = '')
        _each_attribute_value(data_hash) do |key, value|
          _store_learned_attribute(key, value, category, name)
        end
      end

      # Increments the count of +value+ for attribute +key+ in +category+.
      def _store_learned_attribute(key, value, category, name)
        by_attribute = (@attribute_counts[category] ||= {})
        by_value = (by_attribute["#{name}_#{key}"] ||= {})
        by_value[value] = (by_value[value] || 0) + 1
      end

      # Yields (key, value) once per observation in data_hash: once per
      # whitespace-separated token for String values, once for anything else.
      # split(" ") already discards runs of whitespace, so tokens are never
      # blank.
      def _each_attribute_value(data_hash)
        data_hash.each_pair do |key, value|
          if value.is_a?(String)
            value.split(" ").each { |token| yield key, token }
          else
            yield key, value
          end
        end
      end
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises NaiveBayesClassifier on a five-example training set, once without
# smoothing and once with a Laplacean smoother of 1.
class TestClassyfier < Test::Unit::TestCase

  context "after training on a few items with no smoothing" do
    setup do
      @classyfier = Classyfier::NaiveBayes::NaiveBayesClassifier.new
      [
        [{:name => "derp", :extension => "dta"}, :data],
        [{:name => "malerp", :extension => "dta"}, :data],
        [{:name => "hi", :extension => "dta"}, :data],
        [{:name => "derp malerp", :extension => "pdf"}, :not_data],
        [{:name => "foo bar", :extension => "dta"}, :not_data]
      ].each do |attributes, label|
        @classyfier.train(attributes, label)
      end
    end

    should "have stored the priors" do
      assert_equal(3, @classyfier.attribute_counts[:data]['_extension']['dta'])
      assert_equal(5, @classyfier.data_size)
      assert_equal(3, @classyfier.category_counts[:data])
      assert_equal(2, @classyfier.category_counts[:not_data])
    end

    context "and classifying an item" do
      setup do
        @scores  = @classyfier.classify({:name => "derp hi", :extension => "dta"})
        @scores1 = @classyfier.classify({:name => "malerp", :extension => "dta"})
        @scores2 = @classyfier.classify({:extension => "pdf"})
        @scores3 = @classyfier.classify({:extension => "unknown"})
      end

      should "return the proper probability" do
        # P(data | "derp hi", dta)
        #   = P("derp"|data) P("hi"|data) P(dta|data) P(data)
        #     / [ P("derp"|data) P("hi"|data) P(dta|data) P(data)
        #         + P("derp"|not) P("hi"|not) P(dta|not) P(not) ]
        #   = (1/3)(1/3)(3/3)(3/5) / ((1/3)(1/3)(3/3)(3/5) + (1/4)(0/4)(1/2)(2/5))
        assert_equal([:data, 1], @scores)
        # P(data | "malerp", dta)
        #   = P("malerp"|data) P(dta|data) P(data)
        #     / [ P("malerp"|data) P(dta|data) P(data)
        #         + P("malerp"|not_data) P(dta|not_data) P(not_data) ]
        #   = (.333*1*.6) / (.333*1*.6 + .25*.5*.4)
        assert_equal([:data, 0.8], @scores1)
        assert_equal([:not_data, 1], @scores2)
        assert_equal([:none, 0], @scores3)
      end
    end
  end

  context "after training on a few items with smoothing" do
    setup do
      @classyfier = Classyfier::NaiveBayes::NaiveBayesClassifier.new(:laplacean_smoother => 1)
      [
        [{:name => "derp", :extension => "dta"}, :data],
        [{:name => "malerp", :extension => "dta"}, :data],
        [{:name => "derp malerp", :extension => "pdf"}, :not_data],
        [{:name => "foo bar", :extension => "dta"}, :not_data],
        [{:name => "hi", :extension => "dta"}, :data]
      ].each do |attributes, label|
        @classyfier.train(attributes, label)
      end
    end

    should "have stored the priors" do
      assert_equal(3, @classyfier.attribute_counts[:data]['_extension']['dta'])
      assert_equal(5, @classyfier.data_size)
      assert_equal(3, @classyfier.category_counts[:data])
      assert_equal(2, @classyfier.category_counts[:not_data])
    end

    context "and classifying an item" do
      setup do
        @scores  = @classyfier.classify({:name => "derp hi", :extension => "dta"})
        @scores1 = @classyfier.classify({:name => "malerp", :extension => "dta"})
        @scores2 = @classyfier.classify({:extension => "pdf"})
        @scores3 = @classyfier.classify({:extension => "unknown"})
      end

      should "return the proper probability" do
        # Same decomposition as the unsmoothed case, with 1 added to every
        # count and the number of classes/values added to every total:
        # P(data | "derp hi", dta)
        #   = ((1+1)/(3+3))((1+1)/(3+3))((3+1)/(3+1))((3+1)/(5+2))
        #     / ( ((1+1)/(3+3))((1+1)/(3+3))((3+1)/(3+1))((3+1)/(5+2))
        #         + ((1+1)/(4+4))((0+1)/(4+4))((1+1)/(2+2))((2+1)/(5+2)) )
        assert_equal([:data, 0.90459], @scores)
        assert_equal([:data, 0.78049], @scores1)
        assert_equal([:not_data, 0.6], @scores2)
        assert_equal([:data, 0.57143], @scores3)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: classyfier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Josh Cutler
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-11-09 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Simple Naive Bayesian Classifier
|
23
|
+
email:
|
24
|
+
- josh@codepresencelabs.com
|
25
|
+
executables: []
|
26
|
+
|
27
|
+
extensions: []
|
28
|
+
|
29
|
+
extra_rdoc_files: []
|
30
|
+
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- Rakefile
|
35
|
+
- classyfier.gemspec
|
36
|
+
- lib/classyfier.rb
|
37
|
+
- lib/classyfier/naive_bayes.rb
|
38
|
+
- lib/classyfier/version.rb
|
39
|
+
- test/helper.rb
|
40
|
+
- test/test_classyfier.rb
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: ""
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
hash: 3
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 3
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: classyfier
|
71
|
+
rubygems_version: 1.3.7
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Simple Naive Bayesian Classifier
|
75
|
+
test_files:
|
76
|
+
- test/helper.rb
|
77
|
+
- test/test_classyfier.rb
|