cross_validation 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e36ebf38a97ebf8665474186f2c1ae3c82a3855d
4
+ data.tar.gz: 770c131f2d8e3cf359e7e26450cb7a24c704239c
5
+ SHA512:
6
+ metadata.gz: 34ce7b4484db03a2d09aeb2b1f40c586a84a63b6ba22d355220ee6cba8bf588f3c8a2ac867a82524d43aa9fba06863f4f44ca09af8a17cb7f56409d9105dce7f
7
+ data.tar.gz: d55546305d845c1f2c977f825f2d69a77b32369c3afc13b5262dd170b523f4efffa6f7fa9f6d9f7d9215b6b008156ff1ae2cb13993625268bc8d3e1f0b4b2f4c
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .bin
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ - jruby-19mode
6
+ - rbx-19mode
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in cross_validation.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jon-Michael Deldin
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # CrossValidation
2
+
3
+ [![Build Status](https://travis-ci.org/jmdeldin/cross_validation.png?branch=master)](https://travis-ci.org/jmdeldin/cross_validation)
4
+ [![Code Climate](https://codeclimate.com/github/jmdeldin/cross_validation.png)](https://codeclimate.com/github/jmdeldin/cross_validation)
5
+
6
+ This gem provides a k-fold cross-validation routine and confusion matrix
7
+ for evaluating machine learning classifiers.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'cross_validation'
14
+
15
+ And then execute:
16
+
17
+ $ bundle install --binstubs .bin
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install cross_validation
22
+
23
+ ## Usage
24
+
25
+ Cross-validation: see `test/test_runner.rb` for a complete, runnable example.
26
+
27
+ Confusion matrix: see `test/test_confusion_matrix.rb` for example usage.
28
+
29
+
30
+ ## Contributing
31
+
32
+ 1. Fork it
33
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
34
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
35
+ 4. Push to the branch (`git push origin my-new-feature`)
36
+ 5. Create new Pull Request
37
+
38
+ ## Questions
39
+
40
+ Send me an email at `dev@jmdeldin.com`.
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
# Defines the `test` task: puts the test/ directory on the load path and
# runs every file matching test/test_* verbosely.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push('test')
  test_task.test_files = FileList['test/test_*']
  test_task.verbose = true
end

# Running a bare `rake` executes the test suite.
task(:default => :test)
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
# Make lib/ loadable so the version constant can be read without the gem
# being installed.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)

require 'cross_validation'

Gem::Specification.new do |gem|
  gem.name          = 'cross_validation'
  gem.version       = CrossValidation::VERSION
  gem.authors       = ['Jon-Michael Deldin']
  gem.email         = ['dev@jmdeldin.com']
  gem.summary       = 'Performs k-fold cross-validation on machine learning classifiers.'
  gem.description   = gem.summary
  gem.homepage      = 'https://github.com/jmdeldin/cross_validation'
  # LICENSE.txt ships the MIT license; declare it so the packaged metadata
  # does not report an empty license list.
  gem.license       = 'MIT'

  # Package exactly what is tracked by git.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  gem.add_development_dependency('rake', '~> 10.0')
end
@@ -0,0 +1,5 @@
1
+ $LOAD_PATH.unshift File.dirname(__FILE__)
2
+
3
# Top-level namespace for the cross_validation gem.
module CrossValidation
  # Gem version string. Frozen so the constant cannot be mutated in place.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,100 @@
1
+ require_relative '../cross_validation'
2
+
3
module CrossValidation

  # Provides a confusion matrix (contingency table) for classification
  # results.
  #
  # The matrix keeps four counters -- true positives (:tp), true negatives
  # (:tn), false positives (:fp), and false negatives (:fn) -- and derives
  # accuracy, precision, recall, F-score, and error from them.
  #
  # See the following book for more details:
  #
  #   Speech and Language Processing: An introduction to natural language
  #   processing, computational linguistics, and speech recognition. Daniel
  #   Jurafsky & James H. Martin.
  class ConfusionMatrix
    # The counters tracked by the matrix. Each one is also exposed as a
    # reader method of the same name (#tp, #tn, #fp, #fn).
    FIELDS = [:tp, :tn, :fp, :fn].freeze

    # Initialize the confusion matrix with a callable (Proc, lambda, Method)
    # or a block. The callable receives a classification and its expected
    # value and must return one of :tp (true positive), :tn (true negative),
    # :fp (false positive), or :fn (false negative).
    #
    # When both a callable and a block are given, the callable wins.
    #
    # @param [#call] keys_proc
    # @raise [ArgumentError] if neither a callable nor a block is given
    def initialize(keys_proc = nil, &block)
      @keys_for = keys_proc || block

      # Fail here rather than on the first #store call.
      unless @keys_for.respond_to?(:call)
        fail ArgumentError, 'ConfusionMatrix requires a callable or a block'
      end

      @values = Hash[FIELDS.map { |field| [field, 0] }]
    end

    # Define one reader per counter.
    FIELDS.each do |field|
      define_method(field) { @values.fetch(field) }
    end

    # Save the result of classification
    #
    # @param [Object] actual The classified value
    # @param [Object] truth The known, expected value
    # @return [self]
    # @raise [IndexError] if the callable returns anything other than :tp,
    #   :tn, :fp, or :fn
    def store(actual, truth)
      key = @keys_for.call(actual, truth)

      unless @values.key?(key)
        fail IndexError, "#{key} not found in confusion matrix"
      end

      @values[key] += 1
      self
    end

    # Computes the accuracy of the classifier, defined as (tp + tn)/n
    #
    # @return [Float] NaN when nothing has been stored yet (0 / 0.0)
    def accuracy
      (tp + tn) / total
    end

    # Computes the precision of the classifier, defined as tp/(tp + fp)
    #
    # @return [Float] NaN when tp + fp is zero
    def precision
      tp / Float(tp + fp)
    end

    # Computes the recall of the classifier, defined as tp/(tp + fn)
    #
    # @return [Float] NaN when tp + fn is zero
    def recall
      tp / Float(tp + fn)
    end

    # Returns the F-measure of the classifier's precision and recall.
    #
    # @param [Float] beta Favor precision (<1), recall (>1), or both (1)
    # @return [Float] NaN when precision or recall is NaN, or when both are
    #   zero
    def fscore(beta)
      b2 = Float(beta**2)
      prec = precision
      rec = recall

      ((b2 + 1) * prec * rec) / (b2 * prec + rec)
    end

    # Returns an F-score that favors precision and recall equally.
    #
    # @return [Float]
    def f1
      fscore(1)
    end

    # Returns the classifier's error, defined as 1 - accuracy
    #
    # @return [Float]
    def error
      1.0 - accuracy
    end

    private

    # Returns the total number of classifications as a Float, since this value
    # is used as a divisor.
    #
    # @return [Float]
    def total
      Float(@values.values.reduce(:+))
    end
  end
end
@@ -0,0 +1,95 @@
1
+ require_relative '../cross_validation'
2
+
3
module CrossValidation
  # Partitions a list of documents, then for each partition trains a fresh
  # classifier on the remaining documents and classifies the held-out ones,
  # recording every result in a confusion matrix.
  class Runner
    # @return [Array] Array of documents to train and test on. It can be an
    #                 array of anything, as the +fetch_sample_value+ and
    #                 +fetch_sample_class+ lambdas specify what to feed into
    #                 the classifying method.
    attr_accessor :documents

    # @return [Proc] This instantiates your classifier.
    attr_accessor :classifier

    # @return [Fixnum] The value used for +k+ when +percentage+ is not set.
    #                  Ignored when +percentage+ is set.
    attr_accessor :folds

    # @return [Float] Fraction of the documents (e.g., 0.1) used to derive
    #                 +k+ as +documents.size * percentage+. Takes precedence
    #                 over +folds+ when both are set.
    attr_accessor :percentage

    # @return [ConfusionMatrix] Receives every (prediction, expected class)
    #                           pair via +store+.
    attr_accessor :matrix

    # @return [Proc] This receives an instantiated +classifier+ and a
    #                document, and it should call your classifier's training
    #                method.
    attr_accessor :training

    # @return [Proc] This receives a *trained* classifier and the value
    #                returned by +fetch_sample_value+ for a test document.
    #                It classifies that value. It's a +Proc+ because we
    #                create a new classifier with each partition.
    attr_accessor :classifying

    # @return [Proc] This receives a document and should return its value,
    #                i.e., whatever you're feeding into +classifying+.
    attr_accessor :fetch_sample_value

    # @return [Proc] When verifying the results of executing the +classifying+
    #                method, we need to determine what the actual class (e.g.,
    #                spam) of the document was. This +Proc+ receives a
    #                document and should return the document's class.
    attr_accessor :fetch_sample_class

    # Returns the k used to partition the documents: derived from
    # +percentage+ when it is set, otherwise +folds+. The value is memoized
    # on first use.
    #
    # The percentage-derived value is rounded so that an Integer is returned
    # and float noise (e.g., 28.999999999999996) does not truncate downwards.
    #
    # @return [Fixnum]
    def k
      @k ||= percentage ? (documents.size * percentage).round : folds
    end

    # Performs k-fold cross-validation and returns a confusion matrix.
    #
    # The algorithm is as follows (Mitchell, 1997, p147):
    #
    #   partitions = partition data into k-equal sized subsets (folds)
    #   for i = 1 -> k:
    #     T = data \ partitions[i]
    #     train(T)
    #     classify(partitions[i])
    #   output confusion matrix
    #
    # NOTE(review): +each_slice(k)+ yields consecutive groups of at most k
    # documents, so k acts as the *size* of each partition. The number of
    # partitions equals k only when documents.size is k * k (e.g., 100
    # documents with k = 10). Confirm whether +folds+ is meant to be a
    # partition count before relying on other combinations.
    #
    # @return [ConfusionMatrix] the +matrix+ given to the runner
    def run
      partitions = documents.each_slice(k).to_a

      partitions.each_with_index do |held_out, i|
        # Array#rotate puts partition i first, so dropping the first element
        # leaves the training partitions without mutating +partitions+.
        # Flatten exactly one level: a deeper flatten would also take apart
        # documents that are themselves arrays (e.g., [class, value] pairs).
        training_samples = partitions.rotate(i).drop(1).flatten(1)

        # A fresh classifier per partition keeps the runs independent.
        classifier_instance = classifier.call

        training_samples.each { |doc| training.call(classifier_instance, doc) }

        held_out.each do |doc|
          prediction = classifying.call(classifier_instance, fetch_sample_value.call(doc))
          matrix.store(prediction, fetch_sample_class.call(doc))
        end
      end

      matrix
    end

    # Configuring a cross-validation run is complicated. Let's make it easier
    # with a factory method.
    #
    # @yieldparam [Runner] runner the new, unconfigured runner
    # @return [Runner]
    def self.create
      new.tap { |runner| yield(runner) if block_given? }
    end
  end
end
@@ -0,0 +1,101 @@
1
+ require_relative 'test_helper'
2
+ require_relative '../lib/cross_validation/confusion_matrix'
3
+
4
# Unit tests for CrossValidation::ConfusionMatrix: the four counters, the
# bad-key error, and every derived metric.
class TestConfusionMatrix < MiniTest::Unit::TestCase
  # Tolerance used for floating-point comparisons.
  def delta
    1e-6
  end

  def setup
    @mat = CrossValidation::ConfusionMatrix.new(method(:keys_for))
  end

  def test_true_positives
    true_positive(@mat)
    assert_equal 1, @mat.tp
  end

  def test_true_negatives
    true_negative(@mat)
    assert_equal 1, @mat.tn
  end

  def test_false_positives
    false_positive(@mat)
    assert_equal 1, @mat.fp
  end

  def test_false_negatives
    false_negative(@mat)
    assert_equal 1, @mat.fn
  end

  def test_store_raises_index_error_on_bad_key
    bad_keys_for = ->(_actual, _expected) { :bad }
    mat = CrossValidation::ConfusionMatrix.new(bad_keys_for)
    assert_raises IndexError do
      mat.store(:ham, :spam)
    end
  end

  def test_accuracy
    true_positive(@mat)
    true_negative(@mat)
    false_negative(@mat)

    assert_in_delta 2.0/3.0, @mat.accuracy, delta
  end

  def test_precision
    true_positive(@mat)
    false_positive(@mat)

    assert_in_delta 0.5, @mat.precision, delta
  end

  def test_error
    true_positive(@mat)
    true_negative(@mat)
    false_positive(@mat)

    assert_in_delta 1/3.0, @mat.error, delta
  end

  # This test used to share the name test_precision with the one above,
  # which silently replaced it so only one of the two ever ran.
  def test_precision_with_several_false_positives
    true_positive(@mat)
    false_positive(@mat)
    false_positive(@mat)
    false_positive(@mat)

    assert_in_delta 0.25, @mat.precision, delta
  end

  def test_recall
    true_positive(@mat)
    false_negative(@mat)

    assert_in_delta 0.5, @mat.recall, delta
  end

  def test_fscore
    true_positive(@mat)
    true_negative(@mat)
    false_positive(@mat)

    assert_in_delta 2/3.0, @mat.fscore(1), delta
  end

  def test_f1score
    true_positive(@mat)
    true_negative(@mat)

    assert_in_delta 1.0, @mat.f1, delta
  end

  private

  # Each helper stores one (classified, expected) pair that keys_for maps to
  # the counter named by the helper.

  def true_positive(mat)
    mat.store(:spam, :spam)
  end

  def true_negative(mat)
    mat.store(:ham, :ham)
  end

  def false_positive(mat)
    mat.store(:ham, :spam)
  end

  def false_negative(mat)
    mat.store(:spam, :ham)
  end
end
@@ -0,0 +1,10 @@
1
+ require 'minitest/autorun'
2
+
3
# Dummy key function for use in testing confusion matrices. Maps a
# (classified, expected) pair of :spam/:ham symbols to a confusion-matrix
# key, and returns nil when +actual+ is neither :spam nor :ham.
#
# NOTE(review): a :spam classification of a non-:spam document is labelled
# :fn here, and a :ham classification of a non-:ham document :fp. That looks
# swapped relative to the usual convention with :spam as the positive class;
# the tests that use this helper depend on the current mapping.
def keys_for(actual, expected)
  case actual
  when :spam
    expected == :spam ? :tp : :fn
  when :ham
    expected == :ham ? :tn : :fp
  end
end
@@ -0,0 +1,86 @@
1
+ require_relative 'test_helper'
2
+ require_relative '../lib/cross_validation/confusion_matrix'
3
+ require_relative '../lib/cross_validation/runner'
4
+
5
# A deliberately naive classifier used as a test fixture.
class SpamClassifier
  # Accepts a training sample and does nothing with it.
  def train(klass, document)
    # Intentionally empty: this classifier is "that good". A real one would
    # learn from the sample here.
  end

  # Labels a document :spam when it mentions "viagra", otherwise :ham.
  def classify(document)
    if document =~ /viagra/
      :spam
    else
      :ham
    end
  end
end
16
+
17
+ # We just need to associate a class with a value. Feel free to use whatever
18
+ # data structure you like -- this is only used in user-defined training and
19
+ # classifying closures.
20
+ Sample = Struct.new(:klass, :value)
21
+
22
# Defines a test asserting that the DSL setter and getter for +attribute+
# round-trip +value+.
#
# Must be called from within a test-case class body: it uses define_method
# to add a "test_<attribute>_getter" method to that class.
#
# @param [Symbol] attribute Name of the Runner accessor to check
# @param [Object] value Value to assign and expect back
def check_dsl(attribute, value)
  runner = CrossValidation::Runner.create do |r|
    r.public_send("#{attribute}=", value)
  end

  define_method("test_#{attribute}_getter") do
    assert_equal value, runner.public_send(attribute)
  end
end
32
+
33
# Tests for CrossValidation::Runner: a full run over a small spam/ham corpus,
# the precedence of +percentage+ over +folds+, and the DSL accessors.
class TestRunner < MiniTest::Unit::TestCase
  def setup
    tpl = ['Buy some...', 'Would you like some...']
    @spam = tpl.map { |pfx| Sample.new(:spam, pfx + 'viagra!') }
    @ham = tpl.map { |pfx| Sample.new(:ham, pfx + 'penicillin!') }
    @corpus = @spam + @ham
    @corpus *= 25 # 100 is easier to deal with
  end

  def test_run
    runner = CrossValidation::Runner.create do |r|
      r.documents = @corpus
      r.folds = 10
      r.classifier = lambda { SpamClassifier.new }
      r.fetch_sample_class = lambda { |sample| sample.klass }
      r.fetch_sample_value = lambda { |sample| sample.value }
      r.matrix = CrossValidation::ConfusionMatrix.new(method(:keys_for))
      r.training = lambda { |classifier, doc|
        classifier.train doc.klass, doc.value
      }
      r.classifying = lambda { |classifier, doc|
        classifier.classify doc
      }
    end

    mat = runner.run

    assert_equal 50, mat.tp
    assert_equal 50, mat.tn
    # Every document is classified correctly, so nothing may land in the
    # error counters.
    assert_equal 0, mat.fp
    assert_equal 0, mat.fn
  end

  def test_percentage_takes_precedence_over_folds
    runner = CrossValidation::Runner.create do |r|
      r.documents = ['foo'] * 100
      r.folds = 20
      r.percentage = 0.1
    end

    assert_equal 10, runner.k
  end

  # Generate one getter/setter round-trip test per Runner accessor.
  [
    :documents,
    :folds,
    :percentage,
    :classifier,
    :fetch_sample_value,
    :fetch_sample_class,
    :matrix,
    :training,
    :classifying,
  ].each do |attribute|
    check_dsl(attribute, :foo)
  end
end
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cross_validation
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jon-Michael Deldin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '10.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '10.0'
27
+ description: Performs k-fold cross-validation on machine learning classifiers.
28
+ email:
29
+ - dev@jmdeldin.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - .gitignore
35
+ - .travis.yml
36
+ - Gemfile
37
+ - LICENSE.txt
38
+ - README.md
39
+ - Rakefile
40
+ - cross_validation.gemspec
41
+ - lib/cross_validation.rb
42
+ - lib/cross_validation/confusion_matrix.rb
43
+ - lib/cross_validation/runner.rb
44
+ - test/test_confusion_matrix.rb
45
+ - test/test_helper.rb
46
+ - test/test_runner.rb
47
+ homepage: https://github.com/jmdeldin/cross_validation
48
+ licenses: []
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - '>='
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.0.0.rc.2
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Performs k-fold cross-validation on machine learning classifiers.
70
+ test_files:
71
+ - test/test_confusion_matrix.rb
72
+ - test/test_helper.rb
73
+ - test/test_runner.rb