classyfier 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +3 -0
- data/Gemfile +4 -0
- data/Rakefile +2 -0
- data/classyfier.gemspec +21 -0
- data/lib/classyfier.rb +2 -0
- data/lib/classyfier/naive_bayes.rb +108 -0
- data/lib/classyfier/version.rb +3 -0
- data/test/helper.rb +10 -0
- data/test/test_classyfier.rb +80 -0
- metadata +77 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Rakefile
ADDED
data/classyfier.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for classyfier.
#
# The version constant lives in lib/classyfier/version.rb, so lib/ is put on
# the load path before it is required. File lists are taken from git, which
# means the gem must be built from inside a git checkout.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push(lib_dir)
require "classyfier/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name        = "classyfier"
  spec.version     = Classyfier::VERSION
  spec.platform    = Gem::Platform::RUBY

  # Authorship
  spec.authors     = ["Josh Cutler"]
  spec.email       = ["josh@codepresencelabs.com"]
  spec.homepage    = ""

  # Description shown on the registry
  spec.summary     = "Simple Naive Bayesian Classifier"
  spec.description = "Simple Naive Bayesian Classifier"

  spec.rubyforge_project = "classyfier"

  # Packaged files: everything tracked by git; tests and executables are
  # derived from the conventional directories.
  tracked_files     = `git ls-files`.split("\n")
  tracked_tests     = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_binaries  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_binaries.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
|
data/lib/classyfier.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
module Classyfier
  module NaiveBayes
    # A small Naive Bayes classifier over hashes of attributes.
    #
    # A training example is a Hash of attribute => value together with a
    # category. String values are split on whitespace and each token is
    # counted as its own observation of the attribute; any other value is
    # counted as a single observation.
    #
    # Options accepted by +new+:
    #   :laplacean_smoother - additive smoothing constant (default 0 = none).
    #   :precision          - number of decimal places scores are rounded to
    #                         (default 5, i.e. a scale factor of 100000.0).
    class NaiveBayesClassifier
      # Number of examples passed to #train so far.
      attr_reader :data_size
      # attribute_counts: category => { "_attribute" => { value => count } }
      # category_counts:  category => number of training examples
      attr_reader :attribute_counts, :category_counts

      def initialize(opts = {})
        @data_size = 0
        @attribute_counts = {}
        @category_counts = {}

        @laplacean_smoother = opts[:laplacean_smoother] || 0
        # Scale factor used for rounding: 10**digits. (The exponent and base
        # were previously swapped, which made :precision => 2 round to
        # multiples of 1/1024 instead of two decimal places.)
        @precision = opts[:precision] ? Float(10**opts[:precision]) : 100000.0
      end

      # Records one training example.
      #
      # data_hash - Hash of attribute => value (Strings are tokenised).
      # category  - any object usable as a Hash key.
      def train(data_hash, category)
        @data_size += 1
        @category_counts[category] ||= 0
        @category_counts[category] += 1

        _learn(data_hash, category)
      end

      # Returns [category, score] for the highest-scoring category, or
      # [:none, 0] when no category has a score above zero. On ties the
      # category trained first wins (strict comparison).
      def classify(data_hash)
        _category_scores(data_hash).inject([:none, 0]) do |best, (category, score)|
          score > best[1] ? [category, score] : best
        end
      end

      # Returns a Hash of category => posterior probability (rounded to the
      # configured precision) for the given attributes.
      def category_scores(data_hash)
        _category_scores(data_hash)
      end

      private

      # Computes the rounded posterior for every known category.
      #
      # odds is filled with category => { feature_key => P(feature|category) }.
      def _category_scores(data_hash, name = '', odds = {})
        _each_feature(data_hash) do |key, value|
          _calculate_conditional(key, value, name, odds)
        end

        # Joint weight per category: P(features|category) * P(category).
        joint = {}
        odds.each_pair do |category, conditionals|
          likelihood = conditionals.values.inject(1) { |product, p| product * p }
          joint[category] = likelihood * _prior(category)
        end

        # The marginal is identical for every category, so compute it once.
        marginal = joint.values.inject(0) { |total, weight| total + weight }

        posteriors = {}
        joint.each_pair do |category, weight|
          score = weight / marginal
          # A zero marginal (no category can explain the data) yields NaN;
          # report that as a score of 0 rather than relying on a rescue.
          posteriors[category] =
            score.finite? ? (score * @precision).round / @precision : 0
        end
        posteriors
      end

      # Smoothed prior probability of a category.
      def _prior(category)
        (@category_counts[category] + @laplacean_smoother) /
          (Float(@data_size) + @laplacean_smoother * @category_counts.keys.size)
      end

      # Stores P(key = value | category) in odds for every trained category.
      def _calculate_conditional(key, value, name, odds)
        attribute = "#{name}_#{key}"
        @attribute_counts.each_pair do |category, raw_counts|
          counts = raw_counts[attribute] || {}
          observed = counts.values.inject(0) { |total, count| total + count }
          numerator = Float((counts[value] || 0) + @laplacean_smoother)
          denominator = observed + @laplacean_smoother * counts.keys.length

          # A category that never saw this attribute has no evidence for it.
          # Dividing by zero here would produce NaN/Infinity and zero out the
          # scores of every category, so treat it as a zero likelihood.
          conditional = denominator.zero? ? 0.0 : numerator / denominator
          (odds[category] ||= {})["#{attribute}_#{value}"] = conditional
        end
      end

      # Adds the counts for one training example to the category.
      def _learn(data_hash, category, name = '')
        _each_feature(data_hash) do |key, value|
          _store_learned_attribute(key, value, category, name)
        end
      end

      # Yields (key, value) once per observation in data_hash: once per
      # whitespace-separated token for Strings, once for anything else.
      # String#split(" ") never yields empty tokens, so no blank check is
      # needed. Returns data_hash.
      def _each_feature(data_hash)
        data_hash.each_pair do |key, value|
          case value
          when String
            value.split(" ").each { |token| yield key, token }
          else
            yield key, value
          end
        end
      end

      # Increments the count of value for the attribute within the category.
      def _store_learned_attribute(key, value, category, name)
        by_attribute = (@attribute_counts[category] ||= {})
        by_value = (by_attribute["#{name}_#{key}"] ||= {})
        by_value[value] ||= 0
        by_value[value] += 1
      end
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'helper'
|
2
|
+
|
3
|
+
# Exercises NaiveBayesClassifier on a tiny file-name data set, once without
# smoothing and once with a Laplace smoother of 1.
class TestClassyfier < Test::Unit::TestCase

  context "after training on a few items with no smoothing" do
    setup do
      @classyfier = Classyfier::NaiveBayes::NaiveBayesClassifier.new
      [
        [{:name => "derp",        :extension => "dta"}, :data],
        [{:name => "malerp",      :extension => "dta"}, :data],
        [{:name => "hi",          :extension => "dta"}, :data],
        [{:name => "derp malerp", :extension => "pdf"}, :not_data],
        [{:name => "foo bar",     :extension => "dta"}, :not_data]
      ].each { |attributes, category| @classyfier.train(attributes, category) }
    end

    should "have stored the priors" do
      assert_equal(3, @classyfier.attribute_counts[:data]['_extension']['dta'])
      assert_equal(5, @classyfier.data_size)
      assert_equal(3, @classyfier.category_counts[:data])
      assert_equal(2, @classyfier.category_counts[:not_data])
    end

    context "and classifying an item" do
      setup do
        @scores  = @classyfier.classify(:name => "derp hi", :extension => "dta")
        @scores1 = @classyfier.classify(:name => "malerp", :extension => "dta")
        @scores2 = @classyfier.classify(:extension => "pdf")
        @scores3 = @classyfier.classify(:extension => "unknown")
      end

      should "return the proper probability" do
        # "derp hi" with extension dta, by Bayes' rule:
        #   P(data|x) = P(x|data)P(data) / (P(x|data)P(data) + P(x|not)P(not))
        # with P(x|c) = P("derp"|c) P("hi"|c) P(dta|c):
        #   data: (1/3)(1/3)(3/3)(3/5)
        #   not:  (1/4)(0/4)(1/2)(2/5) = 0
        # so the posterior for data is 1.
        assert_equal([:data, 1], @scores)
        # "malerp" with extension dta:
        #   data: P("malerp"|data)P(dta|data)P(data) = .333 * 1 * .6
        #   not:  P("malerp"|not)P(dta|not)P(not)    = .25 * .5 * .4
        #   posterior = (.333*1*.6) / (.333*1*.6 + .25*.5*.4) = 0.8
        assert_equal([:data, 0.8], @scores1)
        assert_equal([:not_data, 1], @scores2)
        # An unseen value has zero likelihood everywhere without smoothing.
        assert_equal([:none, 0], @scores3)
      end
    end
  end

  context "after training on a few items with smoothing" do
    setup do
      @classyfier = Classyfier::NaiveBayes::NaiveBayesClassifier.new(:laplacean_smoother => 1)
      [
        [{:name => "derp",        :extension => "dta"}, :data],
        [{:name => "malerp",      :extension => "dta"}, :data],
        [{:name => "derp malerp", :extension => "pdf"}, :not_data],
        [{:name => "foo bar",     :extension => "dta"}, :not_data],
        [{:name => "hi",          :extension => "dta"}, :data]
      ].each { |attributes, category| @classyfier.train(attributes, category) }
    end

    should "have stored the priors" do
      assert_equal(3, @classyfier.attribute_counts[:data]['_extension']['dta'])
      assert_equal(5, @classyfier.data_size)
      assert_equal(3, @classyfier.category_counts[:data])
      assert_equal(2, @classyfier.category_counts[:not_data])
    end

    context "and classifying an item" do
      setup do
        @scores  = @classyfier.classify(:name => "derp hi", :extension => "dta")
        @scores1 = @classyfier.classify(:name => "malerp", :extension => "dta")
        @scores2 = @classyfier.classify(:extension => "pdf")
        @scores3 = @classyfier.classify(:extension => "unknown")
      end

      should "return the proper probability" do
        # Same derivation as the unsmoothed case, with 1 added to every count
        # and the number of distinct values added to every total:
        #   data: ((1+1)/(3+3)) ((1+1)/(3+3)) ((3+1)/(3+1)) ((3+1)/(5+2))
        #   not:  ((1+1)/(4+4)) ((0+1)/(4+4)) ((1+1)/(2+2)) ((2+1)/(5+2))
        #   posterior for data = data / (data + not)
        assert_equal([:data, 0.90459], @scores)
        assert_equal([:data, 0.78049], @scores1)
        assert_equal([:not_data, 0.6], @scores2)
        assert_equal([:data, 0.57143], @scores3)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: classyfier
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Josh Cutler
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-11-09 00:00:00 -05:00
|
19
|
+
default_executable:
|
20
|
+
dependencies: []
|
21
|
+
|
22
|
+
description: Simple Naive Bayesian Classifier
|
23
|
+
email:
|
24
|
+
- josh@codepresencelabs.com
|
25
|
+
executables: []
|
26
|
+
|
27
|
+
extensions: []
|
28
|
+
|
29
|
+
extra_rdoc_files: []
|
30
|
+
|
31
|
+
files:
|
32
|
+
- .gitignore
|
33
|
+
- Gemfile
|
34
|
+
- Rakefile
|
35
|
+
- classyfier.gemspec
|
36
|
+
- lib/classyfier.rb
|
37
|
+
- lib/classyfier/naive_bayes.rb
|
38
|
+
- lib/classyfier/version.rb
|
39
|
+
- test/helper.rb
|
40
|
+
- test/test_classyfier.rb
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: ""
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
hash: 3
|
56
|
+
segments:
|
57
|
+
- 0
|
58
|
+
version: "0"
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
none: false
|
61
|
+
requirements:
|
62
|
+
- - ">="
|
63
|
+
- !ruby/object:Gem::Version
|
64
|
+
hash: 3
|
65
|
+
segments:
|
66
|
+
- 0
|
67
|
+
version: "0"
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project: classyfier
|
71
|
+
rubygems_version: 1.3.7
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Simple Naive Bayesian Classifier
|
75
|
+
test_files:
|
76
|
+
- test/helper.rb
|
77
|
+
- test/test_classyfier.rb
|