classyfier 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Resolve gems over HTTPS rather than plaintext HTTP so that downloaded
# packages cannot be read or altered in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in classyfier.gemspec
gemspec
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
# Rake entry point: loads Bundler and registers the Rake tasks provided by
# Bundler's gem helper for this gem.
require "bundler"

Bundler::GemHelper.install_tasks
@@ -0,0 +1,21 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "classyfier/version"
4
+
5
# Gem specification for classyfier. The version comes from
# Classyfier::VERSION and the packaged file lists are taken from whatever
# git currently tracks, so this must be evaluated inside the git checkout.
Gem::Specification.new do |spec|
  # --- identity ---------------------------------------------------------
  spec.name     = "classyfier"
  spec.version  = Classyfier::VERSION
  spec.platform = Gem::Platform::RUBY

  # --- people and description -------------------------------------------
  spec.authors     = ["Josh Cutler"]
  spec.email       = ["josh@codepresencelabs.com"]
  spec.homepage    = ""
  spec.summary     = %q{Simple Naive Bayesian Classifier}
  spec.description = %q{Simple Naive Bayesian Classifier}

  # NOTE(review): rubyforge_project is believed to be deprecated in recent
  # RubyGems releases -- confirm before the next release and drop if so.
  spec.rubyforge_project = "classyfier"

  # --- packaged files (one path per line of `git ls-files` output) -------
  spec.files      = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")

  # Executables are listed by bare file name, not by their bin/ path.
  bin_paths = `git ls-files -- bin/*`.split("\n")
  spec.executables = bin_paths.map { |path| File.basename(path) }

  spec.require_paths = ["lib"]
end
data/lib/classyfier.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'classyfier/version'
2
+ require 'classyfier/naive_bayes'
@@ -0,0 +1,108 @@
1
module Classyfier
  module NaiveBayes
    # Naive Bayes classifier trained on hashes of attributes.
    #
    # Each training example is a hash of attribute => value plus a category.
    # String values are split on whitespace and every token is counted as a
    # separate observation of that attribute; any other value is counted
    # as-is. Classification returns, per category, the posterior probability
    # of the category given the attributes, rounded to a fixed number of
    # decimal places.
    class NaiveBayesClassifier
      # Number of examples passed to #train so far.
      attr_reader :data_size
      # attribute_counts: category => { "<name>_<key>" => { value => count } }
      # category_counts:  category => number of training examples
      attr_reader :attribute_counts, :category_counts

      # @param opts [Hash]
      #   :laplacean_smoother - additive smoothing constant (default 0, i.e.
      #                         no smoothing).
      #   :precision          - number of decimal places the posteriors are
      #                         rounded to (default 5).
      def initialize(opts = {})
        @data_size = 0
        @attribute_counts = {}
        @category_counts = {}

        @laplacean_smoother = opts[:laplacean_smoother] || 0
        # @precision is the rounding scale, 10**places. The default of
        # 100000.0 is 10**5; the option previously computed places**10,
        # which has base and exponent swapped.
        @precision = opts[:precision] ? Float(10**opts[:precision]) : 100000.0
      end

      # Records one training example.
      #
      # @param data_hash [Hash] attribute => value (Strings are tokenised)
      # @param category [Object] label the example belongs to
      def train(data_hash, category)
        @data_size += 1
        @category_counts[category] = (@category_counts[category] || 0) + 1

        _learn(data_hash, category)
      end

      # Picks the most probable category for the given attributes.
      #
      # @param data_hash [Hash] attribute => value
      # @return [Array] [category, posterior]; [:none, 0] when no category
      #   scores above zero. On ties the first category encountered wins.
      def classify(data_hash)
        best = [:none, 0]
        _category_scores(data_hash).each_pair do |category, score|
          best = [category, score] if score > best[1]
        end
        best
      end

      # @param data_hash [Hash] attribute => value
      # @return [Hash] category => rounded posterior probability
      def category_scores(data_hash)
        _category_scores(data_hash)
      end

      private

      # Computes the rounded posterior of every category that has attribute
      # counts. +odds+ accumulates category => { observation => conditional }.
      def _category_scores(data_hash, name = '', odds = {})
        _each_attribute_value(data_hash) do |key, value|
          _calculate_conditional(key, value, name, odds)
        end

        # Joint probability per category: product of conditionals times prior.
        joints = {}
        odds.each_pair do |category, conditionals|
          likelihood = conditionals.values.inject(1) { |product, prob| product * prob }
          joints[category] = likelihood * _prior(category)
        end

        # The marginal is the same for every category, so compute it once.
        # A plain left-to-right fold keeps the floating point result identical
        # to accumulating with += in category order.
        marginal = joints.values.inject(0) { |total, joint| total + joint }

        posteriors = {}
        joints.each_pair do |category, joint|
          posterior = joint / marginal
          # A zero marginal (or NaN/Infinity conditionals) makes the posterior
          # non-finite; such categories score 0 instead of raising on #round.
          posteriors[category] =
            posterior.finite? ? (posterior * @precision).round / @precision : 0
        end
        posteriors
      end

      # Smoothed prior probability of +category+.
      def _prior(category)
        (@category_counts[category] + @laplacean_smoother) /
          (Float(@data_size) + @laplacean_smoother * @category_counts.keys.size)
      end

      # Stores, for every category with attribute counts, the smoothed
      # conditional probability of observing +value+ for attribute +key+.
      #
      # NOTE: when a category has no counts at all for this attribute the
      # denominator is zero, so the stored value is NaN (no smoothing) or
      # Infinity (with smoothing).
      def _calculate_conditional(key, value, name, odds)
        attribute = "#{name}_#{key}"
        @attribute_counts.each_pair do |category, raw_counts|
          conditionals = (odds[category] ||= {})
          value_counts = raw_counts[attribute] || {}
          observed = value_counts.values.inject(0) { |total, count| total + count }

          numerator = (value_counts[value] || 0) + @laplacean_smoother
          denominator = observed + @laplacean_smoother * value_counts.size
          conditionals["#{attribute}_#{value}"] = Float(numerator) / denominator
        end
      end

      # Adds every observation in +data_hash+ to the counts of +category+.
      def _learn(data_hash, category, name = '')
        _each_attribute_value(data_hash) do |key, value|
          _store_learned_attribute(key, value, category, name)
        end
      end

      # Yields (key, value) for each observation in +data_hash+. String values
      # are split on whitespace and yielded token by token, skipping tokens
      # that are empty once stripped; other values are yielded unchanged.
      def _each_attribute_value(data_hash)
        data_hash.each_pair do |key, value|
          case value
          when String
            value.split(" ").each do |token|
              yield key, token unless token.strip.empty?
            end
          else
            yield key, value
          end
        end
      end

      # Increments the count of +value+ for attribute +key+ within +category+.
      def _store_learned_attribute(key, value, category, name)
        by_attribute = (@attribute_counts[category] ||= {})
        value_counts = (by_attribute["#{name}_#{key}"] ||= {})
        value_counts[value] = (value_counts[value] || 0) + 1
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Classyfier
  # Version string of the gem. Frozen so the constant cannot be mutated in
  # place by code that reads it.
  VERSION = "0.1.0".freeze
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'classyfier'
8
+
9
# Reopens Test::Unit::TestCase without adding anything to it.
# NOTE(review): presumably a placeholder for helpers shared by all test
# files -- confirm, or remove if it is never filled in.
class Test::Unit::TestCase; end
@@ -0,0 +1,80 @@
1
+ require 'helper'
2
+
3
# Exercises NaiveBayesClassifier on a small set of "files" described by
# name and extension, once without smoothing and once with a Laplacean
# smoother of 1.
class TestClassyfier < Test::Unit::TestCase

  # Builds a classifier with +opts+ and trains it on the five sample files
  # shared by both contexts (three :data, two :not_data).
  def trained_classifier(opts = {})
    classifier = Classyfier::NaiveBayes::NaiveBayesClassifier.new(opts)
    [
      [{:name => "derp",        :extension => "dta"}, :data],
      [{:name => "malerp",      :extension => "dta"}, :data],
      [{:name => "hi",          :extension => "dta"}, :data],
      [{:name => "derp malerp", :extension => "pdf"}, :not_data],
      [{:name => "foo bar",     :extension => "dta"}, :not_data]
    ].each { |attributes, category| classifier.train(attributes, category) }
    classifier
  end

  # Classifies the four probe items against @classyfier and stores the
  # results in @scores, @scores1, @scores2 and @scores3.
  def classify_probes
    @scores  = @classyfier.classify({:name => "derp hi", :extension => "dta"})
    @scores1 = @classyfier.classify({:name => "malerp", :extension => "dta"})
    @scores2 = @classyfier.classify({:extension => "pdf"})
    @scores3 = @classyfier.classify({:extension => "unknown"})
  end

  context "after training on a few items with no smoothing" do
    setup do
      @classyfier = trained_classifier
    end

    should "have stored the priors" do
      assert_equal 3, @classyfier.attribute_counts[:data]['_extension']['dta']
      assert_equal(5, @classyfier.data_size)
      assert_equal(3, @classyfier.category_counts[:data])
      assert_equal(2, @classyfier.category_counts[:not_data])
    end

    context "and classifying an item" do
      setup do
        classify_probes
      end

      should "return the proper probability" do
        # P(data|"derp hi") = P("derp hi"|data)P(data) / P("derp hi")
        #   = P("derp hi"|data)P(data) / (P("derp hi"|data)P(data) + P("derp hi"|not)P(not))
        #   = P("derp"|data)P("hi"|data)P(dta|data)P(data) /
        #     (P("derp"|data)P("hi"|data)P(dta|data)P(data) + P("derp"|not)P("hi"|not)P(dta|not)P(not))
        #   = (1/3)(1/3)(3/3)(3/5) / ((1/3)(1/3)(3/3)(3/5) + (1/4)(0/4)(1/2)(2/5))
        assert_equal([:data, 1], @scores)
        # P("malerp"|data)P(dta|data)P(data) /
        #   (P("malerp"|data)P(dta|data)P(data) + P("malerp"|not_data)P(dta|not_data)P(not_data))
        #   = (.333*1*.6) / (.333*1*.6 + .25*.5*.4)
        assert_equal([:data, 0.8], @scores1)
        assert_equal([:not_data, 1], @scores2)
        # Every category has likelihood 0 for an unseen extension.
        assert_equal([:none, 0], @scores3)
      end
    end
  end

  context "after training on a few items with smoothing" do
    setup do
      @classyfier = trained_classifier(:laplacean_smoother => 1)
    end

    should "have stored the priors" do
      assert_equal 3, @classyfier.attribute_counts[:data]['_extension']['dta']
      assert_equal(5, @classyfier.data_size)
      assert_equal(3, @classyfier.category_counts[:data])
      assert_equal(2, @classyfier.category_counts[:not_data])
    end

    context "and classifying an item" do
      setup do
        classify_probes
      end

      should "return the proper probability" do
        # P(data|"derp hi") = P("derp hi"|data)P(data) / P("derp hi")
        #   = P("derp"|data)P("hi"|data)P(dta|data)P(data) /
        #     (P("derp"|data)P("hi"|data)P(dta|data)P(data) + P("derp"|not)P("hi"|not)P(dta|not)P(not))
        #   = ((1+1)/(3+3))((1+1)/(3+3))((3+1)/(3+1))((3+1)/(5+2)) /
        #     (((1+1)/(3+3))((1+1)/(3+3))((3+1)/(3+1))((3+1)/(5+2)) +
        #      ((1+1)/(4+4))((0+1)/(4+4))((1+1)/(2+2))((2+1)/(5+2)))
        assert_equal([:data, 0.90459], @scores)
        assert_equal([:data, 0.78049], @scores1)
        assert_equal([:not_data, 0.6], @scores2)
        # Both categories give the unseen extension the same smoothed
        # conditional, so the posterior equals the smoothed prior (3+1)/(5+2).
        assert_equal([:data, 0.57143], @scores3)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: classyfier
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Josh Cutler
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-11-09 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Simple Naive Bayesian Classifier
23
+ email:
24
+ - josh@codepresencelabs.com
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files: []
30
+
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - Rakefile
35
+ - classyfier.gemspec
36
+ - lib/classyfier.rb
37
+ - lib/classyfier/naive_bayes.rb
38
+ - lib/classyfier/version.rb
39
+ - test/helper.rb
40
+ - test/test_classyfier.rb
41
+ has_rdoc: true
42
+ homepage: ""
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: classyfier
71
+ rubygems_version: 1.3.7
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Simple Naive Bayesian Classifier
75
+ test_files:
76
+ - test/helper.rb
77
+ - test/test_classyfier.rb