classyfier 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ pkg/*
2
+ *.gem
3
+ .bundle
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS rather than plain HTTP so downloads are protected
# in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in classyfier.gemspec
gemspec
data/Rakefile ADDED
@@ -0,0 +1,2 @@
1
+ require 'bundler'
2
+ Bundler::GemHelper.install_tasks
@@ -0,0 +1,21 @@
1
# -*- encoding: utf-8 -*-
# Gem specification for classyfier. The lib directory is appended to the load
# path so the version constant can be read without the gem being installed.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "classyfier/version"

Gem::Specification.new do |spec|
  # Identity
  spec.name     = "classyfier"
  spec.version  = Classyfier::VERSION
  spec.platform = Gem::Platform::RUBY

  # Authorship
  spec.authors  = ["Josh Cutler"]
  spec.email    = ["josh@codepresencelabs.com"]
  spec.homepage = ""

  # Descriptions
  spec.summary     = "Simple Naive Bayesian Classifier"
  spec.description = "Simple Naive Bayesian Classifier"

  spec.rubyforge_project = "classyfier"

  # The packaged file lists are taken from whatever git currently tracks.
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
data/lib/classyfier.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'classyfier/version'
2
+ require 'classyfier/naive_bayes'
@@ -0,0 +1,108 @@
1
module Classyfier
  module NaiveBayes
    # Naive Bayes classifier over hashes of attribute => value.
    #
    # String values are split on whitespace and each token is counted as a
    # separate observation of that attribute; any other value is counted
    # as-is.
    #
    # Options accepted by #initialize:
    #   :laplacean_smoother - additive smoothing constant (default 0)
    #   :precision          - number of decimal places kept in the returned
    #                         posteriors (default 5)
    class NaiveBayesClassifier
      attr_reader :data_size
      attr_reader :attribute_counts, :category_counts

      # opts - Hash of options, see the class comment.
      def initialize(opts = {})
        @data_size = 0
        @attribute_counts = {}
        @category_counts = {}

        @laplacean_smoother = opts[:laplacean_smoother] || 0
        # Posteriors are rounded by scaling with 10**precision, so the default
        # scale of 100000.0 keeps 5 decimal places.
        @precision = opts[:precision] ? Float(10**opts[:precision]) : 100000.0
      end

      # Records one training example.
      #
      # data_hash - Hash of attribute => value describing the example.
      # category  - label the example belongs to.
      def train(data_hash, category)
        @data_size += 1
        @category_counts[category] ||= 0
        @category_counts[category] += 1

        _learn(data_hash, category)
      end

      # Returns [category, posterior] for the category with the highest
      # posterior, or [:none, 0] when no category scores above zero.
      def classify(data_hash)
        best = [:none, 0]
        _category_scores(data_hash).each_pair do |category, score|
          best = [category, score] if score > best[1]
        end
        best
      end

      # Returns a Hash of category => rounded posterior for data_hash.
      def category_scores(data_hash)
        _category_scores(data_hash)
      end

      private

      # Computes the rounded posterior of every trained category.
      #
      # name - prefix put in front of attribute keys (callers in this class
      #        rely on the default '').
      # odds - Hash that collects category => { attribute/value => P(value|category) }.
      def _category_scores(data_hash, name = '', odds = {})
        _each_token(data_hash) do |key, token|
          _calculate_conditional(key, token, name, odds)
        end

        # Joint weight P(data|category) * P(category) per category. It is
        # computed once so the marginal is a single sum rather than being
        # rebuilt for every category.
        joints = {}
        odds.each_pair do |category, conditionals|
          likelihood = conditionals.values.inject(1) { |product, probability| product * probability }
          joints[category] = likelihood * _prior(category)
        end
        marginal = joints.values.inject(0) { |sum, joint| sum + joint }

        posteriors = {}
        joints.each_pair do |category, joint|
          scaled = (joint / marginal) * @precision
          # A zero or non-finite marginal gives NaN/Infinity, which cannot be
          # rounded; such categories score 0.
          posteriors[category] = scaled.finite? ? scaled.round / @precision : 0
        end
        posteriors
      end

      # Smoothed prior P(category) taken from the training counts.
      def _prior(category)
        (@category_counts[category] + @laplacean_smoother) /
          (Float(@data_size) + @laplacean_smoother * @category_counts.keys.size)
      end

      # Stores P(value|category) for one attribute value into odds, for every
      # category that has attribute counts.
      #
      # NOTE(review): when a category has no counts at all for the attribute
      # the denominator is 0, so the stored value is NaN (smoother 0) or
      # Infinity (smoother > 0); _category_scores maps non-finite posteriors
      # to 0.
      def _calculate_conditional(key, value, name, odds)
        attribute = "#{name}_#{key}"
        @attribute_counts.each_pair do |category, raw_counts|
          counts = raw_counts[attribute] || {}
          total = counts.values.inject(0) { |sum, count| sum + count }
          numerator = Float((counts[value] || 0) + @laplacean_smoother)
          denominator = total + @laplacean_smoother * counts.size
          (odds[category] ||= {})["#{attribute}_#{value}"] = numerator / denominator
        end
      end

      # Adds every attribute observation of data_hash to the counts of category.
      def _learn(data_hash, category, name = '')
        _each_token(data_hash) do |key, token|
          _store_learned_attribute(key, token, category, name)
        end
      end

      # Increments the count of value for the given attribute and category.
      def _store_learned_attribute(key, value, category, name)
        per_attribute = (@attribute_counts[category] ||= {})
        per_value = (per_attribute["#{name}_#{key}"] ||= {})
        per_value[value] ||= 0
        per_value[value] += 1
      end

      # Yields [key, observation] for every observation in data_hash: one per
      # whitespace-separated token for String values, the value itself
      # otherwise.
      def _each_token(data_hash)
        data_hash.each_pair do |key, value|
          if value.is_a?(String)
            value.split(" ").each do |token|
              yield key, token unless token.strip.empty?
            end
          else
            yield key, value
          end
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Classyfier
  # Released gem version; frozen so the shared constant cannot be mutated
  # in place.
  VERSION = "0.1.0".freeze
end
data/test/helper.rb ADDED
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'classyfier'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,80 @@
1
+ require 'helper'
2
+
3
+ class TestClassyfier < Test::Unit::TestCase
4
+
5
+ context "after training on a few items with no smoothing" do
6
+ setup do
7
+ @classyfier = Classyfier::NaiveBayes::NaiveBayesClassifier.new
8
+ @classyfier.train({:name => "derp", :extension => "dta"}, :data)
9
+ @classyfier.train({:name => "malerp", :extension => "dta"}, :data)
10
+ @classyfier.train({:name => "hi", :extension => "dta"}, :data)
11
+ @classyfier.train({:name => "derp malerp", :extension => "pdf"}, :not_data)
12
+ @classyfier.train({:name => "foo bar", :extension => "dta"}, :not_data)
13
+ end
14
+
15
+ should "have stored the priors" do
16
+ assert_equal 3, @classyfier.attribute_counts[:data]['_extension']['dta']
17
+ assert_equal(5, @classyfier.data_size)
18
+ assert_equal(3, @classyfier.category_counts[:data])
19
+ assert_equal(2, @classyfier.category_counts[:not_data])
20
+ end
21
+
22
+ context "and classifying an item" do
23
+ setup do
24
+ @scores = @classyfier.classify({:name => "derp hi", :extension => "dta"})
25
+ @scores1 = @classyfier.classify({:name => "malerp", :extension => "dta"})
26
+ @scores2 = @classyfier.classify({:extension => "pdf"})
27
+ @scores3 = @classyfier.classify({:extension => "unknown"})
28
+ end
29
+
30
+ should "return the proper probability" do
31
+ # P(data|"derp hi") = P("derp hi"|data)*P(data) / P("derp hi") = P("derp hi"|data)*P(data) / (P("derp hi"|data)P(data) + P("derp hi"|not)P(not)) =
32
+ # P("derp"|data)*P("hi"|data)*P(dta|data)*P(data) / (P("derp"|data)P("hi"|data)P(dta|data)P(data) + P("derp"|not)P("hi"|not)P(dta|not)P(not))
33
+ # (1/3)(1/3)(3/3)(3/5)/((1/3)(1/3)(3/3)(3/5) + (1/4)(0/4)(1/2)(2/5))
34
+ assert_equal([:data, 1], @scores)
35
+ # P("malerp"|data)P(dta|data)P(data) / (P("malerp"|data)P(dta|data)P(data) + P("malerp"|not_data)P(dta|not_data)P(not_data))
36
+ # (.333*1*.6)/(.333*1*.6+.25*.5*.4)
37
+ assert_equal([:data, 0.8], @scores1)
38
+ assert_equal([:not_data, 1], @scores2)
39
+ assert_equal([:none, 0], @scores3)
40
+ end
41
+ end
42
+ end
43
+
44
+ context "after training on a few items with smoothing" do
45
+ setup do
46
+ @classyfier = Classyfier::NaiveBayes::NaiveBayesClassifier.new(:laplacean_smoother => 1)
47
+ @classyfier.train({:name => "derp", :extension => "dta"}, :data)
48
+ @classyfier.train({:name => "malerp", :extension => "dta"}, :data)
49
+ @classyfier.train({:name => "derp malerp", :extension => "pdf"}, :not_data)
50
+ @classyfier.train({:name => "foo bar", :extension => "dta"}, :not_data)
51
+ @classyfier.train({:name => "hi", :extension => "dta"}, :data)
52
+ end
53
+
54
+ should "have stored the priors" do
55
+ assert_equal 3, @classyfier.attribute_counts[:data]['_extension']['dta']
56
+ assert_equal(5, @classyfier.data_size)
57
+ assert_equal(3, @classyfier.category_counts[:data])
58
+ assert_equal(2, @classyfier.category_counts[:not_data])
59
+ end
60
+
61
+ context "and classifying an item" do
62
+ setup do
63
+ @scores = @classyfier.classify({:name => "derp hi", :extension => "dta"})
64
+ @scores1 = @classyfier.classify({:name => "malerp", :extension => "dta"})
65
+ @scores2 = @classyfier.classify({:extension => "pdf"})
66
+ @scores3 = @classyfier.classify({:extension => "unknown"})
67
+ end
68
+
69
+ should "return the proper probability" do
70
+ # P(data|"derp hi") = P("derp hi"|data)*P(data) / P("derp hi") = P("derp hi"|data)*P(data) / (P("derp hi"|data)P(data) + P("derp hi"|not)P(not)) =
71
+ # P("derp"|data)*P("hi"|data)*P(dta|data)*P(data) / (P("derp"|data)P("hi"|data)P(dta|data)P(data) + P("derp"|not)P("hi"|not)P(dta|not)P(not))
72
+ #((1+1)/(3+3))((1+1)/(3+3))((3+1)/(3+1))((3+1)/(5+2))/(((1+1)/(3+3))((1+1)/(3+3))((3+1)/(3+1))((3+1)/(5+2)) + ((1+1)/(4+4))((0+1)/(4+4))((1+1)/(2+2))((2+1)/(5+2)))
73
+ assert_equal([:data, 0.90459], @scores)
74
+ assert_equal([:data, 0.78049], @scores1)
75
+ assert_equal([:not_data, 0.6], @scores2)
76
+ assert_equal([:data, 0.57143], @scores3)
77
+ end
78
+ end
79
+ end
80
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: classyfier
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Josh Cutler
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-11-09 00:00:00 -05:00
19
+ default_executable:
20
+ dependencies: []
21
+
22
+ description: Simple Naive Bayesian Classifier
23
+ email:
24
+ - josh@codepresencelabs.com
25
+ executables: []
26
+
27
+ extensions: []
28
+
29
+ extra_rdoc_files: []
30
+
31
+ files:
32
+ - .gitignore
33
+ - Gemfile
34
+ - Rakefile
35
+ - classyfier.gemspec
36
+ - lib/classyfier.rb
37
+ - lib/classyfier/naive_bayes.rb
38
+ - lib/classyfier/version.rb
39
+ - test/helper.rb
40
+ - test/test_classyfier.rb
41
+ has_rdoc: true
42
+ homepage: ""
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ hash: 3
56
+ segments:
57
+ - 0
58
+ version: "0"
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ none: false
61
+ requirements:
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ hash: 3
65
+ segments:
66
+ - 0
67
+ version: "0"
68
+ requirements: []
69
+
70
+ rubyforge_project: classyfier
71
+ rubygems_version: 1.3.7
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Simple Naive Bayesian Classifier
75
+ test_files:
76
+ - test/helper.rb
77
+ - test/test_classyfier.rb