cross_validation 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e36ebf38a97ebf8665474186f2c1ae3c82a3855d
4
+ data.tar.gz: 770c131f2d8e3cf359e7e26450cb7a24c704239c
5
+ SHA512:
6
+ metadata.gz: 34ce7b4484db03a2d09aeb2b1f40c586a84a63b6ba22d355220ee6cba8bf588f3c8a2ac867a82524d43aa9fba06863f4f44ca09af8a17cb7f56409d9105dce7f
7
+ data.tar.gz: d55546305d845c1f2c977f825f2d69a77b32369c3afc13b5262dd170b523f4efffa6f7fa9f6d9f7d9215b6b008156ff1ae2cb13993625268bc8d3e1f0b4b2f4c
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .bin
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
5
+ - jruby-19mode
6
+ - rbx-19mode
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in cross_validation.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Jon-Michael Deldin
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # CrossValidation
2
+
3
+ [![Build Status](https://travis-ci.org/jmdeldin/cross_validation.png?branch=master)](https://travis-ci.org/jmdeldin/cross_validation)
4
+ [![Code Climate](https://codeclimate.com/github/jmdeldin/cross_validation.png)](https://codeclimate.com/github/jmdeldin/cross_validation)
5
+
6
+ This gem provides a k-fold cross-validation routine and confusion matrix
7
+ for evaluating machine learning classifiers.
8
+
9
+ ## Installation
10
+
11
+ Add this line to your application's Gemfile:
12
+
13
+ gem 'cross_validation'
14
+
15
+ And then execute:
16
+
17
+ $ bundle install --binstubs .bin
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install cross_validation
22
+
23
+ ## Usage
24
+
25
+ Cross-validation:
26
+
27
+ Confusion-matrix:
28
+
29
+
30
+ ## Contributing
31
+
32
+ 1. Fork it
33
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
34
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
35
+ 4. Push to the branch (`git push origin my-new-feature`)
36
+ 5. Create new Pull Request
37
+
38
+ ## Questions
39
+
40
+ Send me an email, `dev@jmdeldin.com`
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+
4
# Registers the `test` task, which runs every file matching test/test_*
# with the test directory on the load path.
Rake::TestTask.new(:test) do |task|
  task.verbose = true
  task.test_files = FileList['test/test_*']
  task.libs.push('test')
end

# Running plain `rake` runs the test suite.
task default: :test
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+
5
+ require 'cross_validation'
6
+
7
# Gem packaging metadata for cross_validation.
Gem::Specification.new do |gem|
  gem.name          = "cross_validation"
  gem.version       = CrossValidation::VERSION
  gem.authors       = ["Jon-Michael Deldin"]
  gem.email         = ["dev@jmdeldin.com"]
  # Kept on a single line so the summary and description never contain an
  # embedded newline or indentation.
  gem.summary       = 'Performs k-fold cross-validation on machine learning classifiers.'
  gem.description   = gem.summary
  gem.homepage      = 'https://github.com/jmdeldin/cross_validation'
  # LICENSE.txt ships the MIT license; declare it so the packaged metadata
  # does not end up with an empty license list.
  gem.license       = 'MIT'

  # Package exactly the files tracked by git.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency('rake', '~> 10.0')
end
@@ -0,0 +1,5 @@
1
+ $LOAD_PATH.unshift File.dirname(__FILE__)
2
+
3
# Top-level namespace for the cross_validation gem.
module CrossValidation
  # Version string of the gem; the gemspec reads this constant as the gem
  # version.
  VERSION = '0.0.1'
end
@@ -0,0 +1,100 @@
1
+ require_relative '../cross_validation'
2
+
3
module CrossValidation

  # Provides a confusion matrix (contingency table) for classification
  # results.
  #
  # See the following book for more details:
  #
  #   Speech and Language Processing: An introduction to natural language
  #   processing, computational linguistics, and speech recognition. Daniel
  #   Jurafsky & James H. Martin.
  class ConfusionMatrix
    # Initialize the confusion matrix with a Proc or a block. The callable
    # receives the classified value and the expected value, and must return a
    # symbol of :tp (true positive), :tn (true negative), :fp (false
    # positive), or :fn (false negative).
    #
    # When both a positional callable and a block are given, the positional
    # callable wins.
    #
    # See the unit test for an example Proc.
    #
    # @param [#call] keys_proc Anything responding to +call+ (Proc, lambda,
    #                          Method); may be omitted when a block is given
    # @raise [ArgumentError] if neither a callable nor a block is supplied
    def initialize(keys_proc = nil, &block)
      @keys_for = keys_proc || block

      unless @keys_for.respond_to?(:call)
        fail ArgumentError, 'a Proc (or block) returning :tp, :tn, :fp, or :fn is required'
      end

      @values = {:tp => 0, :tn => 0, :fp => 0, :fn => 0}
    end

    # Define one reader per cell of the matrix: #tp, #tn, #fp and #fn each
    # return the current count for that cell.
    [:tp, :tn, :fp, :fn].each do |field|
      define_method(field) { @values.fetch(field) }
    end

    # Save the result of classification
    #
    # @param [Object] actual The classified value
    # @param [Object] truth  The known, expected value
    # @raise [IndexError] if the keys Proc returns anything other than :tp,
    #                     :tn, :fp, or :fn
    # @return [self]
    def store(actual, truth)
      key = @keys_for.call(actual, truth)

      unless @values.key?(key)
        fail IndexError, "#{key} not found in confusion matrix"
      end

      @values[key] += 1

      self
    end

    # Computes the accuracy of the classifier, defined as (tp + tn)/n
    #
    # @return [Float] NaN when nothing has been stored yet (0 / 0.0)
    def accuracy
      (@values.fetch(:tp) + @values.fetch(:tn)) / total()
    end

    # Computes the precision of the classifier, defined as tp/(tp + fp)
    #
    # @return [Float] NaN when tp + fp is zero
    def precision
      @values.fetch(:tp) / Float(@values.fetch(:tp) + @values.fetch(:fp))
    end

    # Computes the recall of the classifier, defined as tp/(tp + fn)
    #
    # @return [Float] NaN when tp + fn is zero
    def recall
      @values.fetch(:tp) / Float(@values.fetch(:tp) + @values.fetch(:fn))
    end

    # Returns the F-measure of the classifier's precision and recall.
    #
    # @param [Float] beta Favor precision (<1), recall (>1), or both (1)
    # @return [Float] NaN when precision or recall is NaN, or when both are
    #                 zero
    def fscore(beta)
      b2 = Float(beta**2)
      ((b2 + 1) * precision * recall) / (b2 * precision + recall)
    end

    # Returns an F-score that favors precision and recall equally.
    #
    # @return [Float]
    def f1
      fscore(1)
    end

    # Returns the classifier's error, defined as 1 - accuracy.
    #
    # @return [Float]
    def error
      1.0 - accuracy()
    end

    private

    # Returns the total number of classifications as a Float, since this value
    # is used as a divisor.
    #
    # @return [Float]
    def total
      Float(@values.values.reduce(:+))
    end
  end
end
@@ -0,0 +1,95 @@
1
+ require_relative '../cross_validation'
2
+
3
module CrossValidation
  # Configures and performs a k-fold cross-validation run. All collaborators
  # (classifier factory, training and classifying callables, sample
  # accessors, and the confusion matrix) are supplied through the accessors
  # below, typically inside the block passed to +Runner.create+.
  class Runner
    # @return [Array] Array of documents to train and test on. It can be an
    #                 array of anything, as the +fetch_sample_value+ and
    #                 +fetch_sample_class+ lambdas specify what to feed into
    #                 the classifying method.
    attr_accessor :documents

    # @return [Proc] This instantiates your classifier.
    attr_accessor :classifier

    # @return [Fixnum] The number of folds to partition +documents+ into.
    #                  Mutually exclusive with +percentage+.
    attr_accessor :folds

    # @return [Float] The number of folds to partition +documents+ into as a
    #                 *percentage* of the documents. Mutually exclusive with
    #                 +folds+.
    attr_accessor :percentage

    # @return [ConfusionMatrix]
    attr_accessor :matrix

    # @return [Proc] This receives an instantiated +classifier+ and a
    #                document, and it should call your classifier's training
    #                method.
    attr_accessor :training

    # @return [Proc] This receives a *trained* classifier and a test document.
    #                It classifies the document. It's a +Proc+ because we
    #                create a new one with each partition.
    attr_accessor :classifying

    # @return [Proc] This receives a document and should return its value,
    #                i.e., whatever you're feeding into +classifying+.
    attr_accessor :fetch_sample_value

    # @return [Proc] When verifying the results of executing the +classifying+
    #                method, we need to determine what the actual class (e.g.,
    #                spam) of the document was. This +Proc+ receives a
    #                document and should return the document's class.
    attr_accessor :fetch_sample_class

    # Returns the number of folds to partition the documents into. When
    # +percentage+ is set it takes precedence over +folds+, and the product
    # is rounded so that floating-point noise (e.g. 100 * 0.29) cannot drop a
    # fold. The value is memoized on first use.
    #
    # @return [Fixnum]
    def k
      @k ||= percentage ? (documents.size * percentage).round : folds
    end

    # Performs k-fold cross-validation and returns a confusion matrix.
    #
    # The algorithm is as follows (Mitchell, 1997, p147):
    #
    #   partitions = partition data into k-equal sized subsets (folds)
    #   for i = 1 -> k:
    #     T = data \ partitions[i]
    #     train(T)
    #     classify(partitions[i])
    #   output confusion matrix
    #
    # @raise [TypeError] if neither +folds+ nor +percentage+ has been set
    # @raise [ArgumentError] if the number of folds is not positive
    # @return [ConfusionMatrix] the configured +matrix+, after every document
    #                           has been classified exactly once
    def run
      partitions = partition_into_folds(documents, Integer(k))

      partitions.each_with_index do |part, i|
        # Array#rotate puts the element i first, so all we have to do is rotate
        # then remove that element to get the training set. Array#drop does not
        # mutate the original array either. Flatten exactly one level: that
        # coalesces our list of folds into one list again without unpacking
        # documents that happen to be arrays themselves.
        training_samples = partitions.rotate(i).drop(1).flatten(1)

        # A fresh classifier per fold, so nothing learned from a held-out
        # fold carries over into the next iteration.
        classifier_instance = classifier.call

        # train it
        training_samples.each { |doc| training.call(classifier_instance, doc) }

        # fetch confusion keys
        part.each do |x|
          prediction = classifying.call(classifier_instance, fetch_sample_value.call(x))
          matrix.store(prediction, fetch_sample_class.call(x))
        end
      end

      matrix
    end

    # Configuring a cross-validation run is complicated. Let's make it easier
    # with a factory method.
    #
    # @yield [Runner] the new runner, ready to be configured
    # @return [Runner]
    def self.create
      new.tap { |r| yield(r) }
    end

    private

    # Splits +docs+ into +fold_count+ contiguous folds whose sizes differ by
    # at most one document. The fold count is capped at the number of
    # documents so that no fold is empty.
    #
    # @param [Array] docs
    # @param [Fixnum] fold_count
    # @raise [ArgumentError] if +fold_count+ is not positive
    # @return [Array<Array>]
    def partition_into_folds(docs, fold_count)
      fail ArgumentError, 'number of folds must be positive' unless fold_count > 0

      docs = docs.to_a
      fold_count = [fold_count, docs.size].min
      return [] if fold_count.zero?

      # The first +extra+ folds absorb the remainder, one document each.
      base, extra = docs.size.divmod(fold_count)
      offset = 0

      Array.new(fold_count) do |i|
        size = i < extra ? base + 1 : base
        fold = docs[offset, size]
        offset += size
        fold
      end
    end
  end
end
@@ -0,0 +1,101 @@
1
+ require_relative 'test_helper'
2
+ require_relative '../lib/cross_validation/confusion_matrix'
3
+
4
# Unit tests for CrossValidation::ConfusionMatrix. Each test stores a few
# classifications through the helpers at the bottom of the class and checks
# the resulting counts or derived metrics.
class TestConfusionMatrix < MiniTest::Unit::TestCase
  # Tolerance used for floating-point comparisons.
  def delta
    1e-6
  end

  def setup
    @mat = CrossValidation::ConfusionMatrix.new(method(:keys_for))
  end

  def test_true_positives
    true_positive(@mat)
    assert_equal 1, @mat.tp
  end

  def test_true_negatives
    true_negative(@mat)
    assert_equal 1, @mat.tn
  end

  def test_false_positives
    false_positive(@mat)
    assert_equal 1, @mat.fp
  end

  def test_false_negatives
    false_negative(@mat)
    assert_equal 1, @mat.fn
  end

  def test_store_raises_index_error_on_bad_key
    bad_keys_for = ->(actual, expected) { :bad }
    mat = CrossValidation::ConfusionMatrix.new(bad_keys_for)
    assert_raises IndexError do
      mat.store(:ham, :spam)
    end
  end

  def test_accuracy
    true_positive(@mat)
    true_negative(@mat)
    false_negative(@mat)

    assert_in_delta 2.0/3.0, @mat.accuracy, delta
  end

  def test_precision
    true_positive(@mat)
    false_positive(@mat)

    assert_in_delta 0.5, @mat.precision, delta
  end

  def test_error
    true_positive(@mat)
    true_negative(@mat)
    false_positive(@mat)

    assert_in_delta 1/3.0, @mat.error, delta
  end

  # This used to be a second method named test_precision, which replaced the
  # definition above so that only one of the two precision tests ever ran.
  def test_precision_with_multiple_false_positives
    true_positive(@mat)
    false_positive(@mat)
    false_positive(@mat)
    false_positive(@mat)

    assert_in_delta 0.25, @mat.precision, delta
  end

  def test_recall
    true_positive(@mat)
    false_negative(@mat)

    assert_in_delta 0.5, @mat.recall, delta
  end

  def test_fscore
    true_positive(@mat)
    true_negative(@mat)
    false_positive(@mat)

    assert_in_delta 2/3.0, @mat.fscore(1), delta
  end

  def test_f1score
    true_positive(@mat)
    true_negative(@mat)

    assert_in_delta 1.0, @mat.f1, delta
  end

  private

  # Helpers that store one classification whose key, as computed by the
  # keys_for test helper, lands in the cell named by the method.
  def true_positive(mat)  mat.store(:spam, :spam) end
  def true_negative(mat)  mat.store(:ham, :ham)   end
  def false_positive(mat) mat.store(:ham, :spam)  end
  def false_negative(mat) mat.store(:spam, :ham)  end
end
@@ -0,0 +1,10 @@
1
+ require 'minitest/autorun'
2
+
3
# Dummy method for use in testing confusion matrices. Maps a classified
# value and its expected value to one of the confusion-matrix keys, and
# returns nil when the classified value is neither :spam nor :ham.
#
# NOTE(review): a :spam prediction for a :ham document is labelled :fn here
# (and :ham for :spam is labelled :fp), which looks swapped if :spam is the
# positive class. The unit tests rely on this mapping -- confirm before
# changing it.
def keys_for(actual, expected)
  case actual
  when :spam
    expected == :spam ? :tp : :fn
  when :ham
    expected == :ham ? :tn : :fp
  end
end
@@ -0,0 +1,86 @@
1
+ require_relative 'test_helper'
2
+ require_relative '../lib/cross_validation/confusion_matrix'
3
+ require_relative '../lib/cross_validation/runner'
4
+
5
# A deliberately naive classifier used as a fixture: it learns nothing and
# flags any document containing "viagra" as spam.
class SpamClassifier
  # Training is intentionally a no-op (in reality, you should probably do
  # some work here).
  def train(klass, document)
  end

  # @return [Symbol] :spam when the document matches /viagra/, :ham otherwise
  def classify(document)
    if document =~ /viagra/
      :spam
    else
      :ham
    end
  end
end
16
+
17
+ # We just need to associate a class with a value. Feel free to use whatever
18
+ # data structure you like -- this is only used in user-defined training and
19
+ # classifying closures.
20
+ Sample = Struct.new(:klass, :value)
21
+
22
# Asserts the DSL's getter and setters work: builds a runner whose
# +attribute+ is assigned +value+ through the setter, then defines a test
# method on the calling test class that reads it back through the getter.
#
# @param [Symbol] attribute Name of the Runner accessor to exercise
# @param [Object] value     Value to write and expect to read back
def check_dsl(attribute, value)
  runner = CrossValidation::Runner.create do |r|
    # Use the supplied value rather than a hard-coded symbol so the caller
    # controls what is round-tripped.
    r.public_send("#{attribute}=", value)
  end

  define_method("test_#{attribute}_getter") do
    assert_equal value, runner.public_send(attribute)
  end
end
32
+
33
# Tests for CrossValidation::Runner: an end-to-end run over a small
# spam/ham corpus, the precedence of +percentage+ over +folds+, and the
# round-tripping of every configuration accessor.
class TestRunner < MiniTest::Unit::TestCase
  def setup
    tpl = ['Buy some...', 'Would you like some...']
    @spam = tpl.map { |pfx| Sample.new(:spam, pfx + 'viagra!') }
    @ham = tpl.map { |pfx| Sample.new(:ham, pfx + 'penicillin!') }
    @corpus = @spam + @ham
    @corpus *= 25 # 100 is easier to deal with
  end

  def test_run
    runner = CrossValidation::Runner.create do |r|
      r.documents = @corpus
      r.folds = 10
      r.classifier = lambda { SpamClassifier.new }
      r.fetch_sample_class = lambda { |sample| sample.klass }
      r.fetch_sample_value = lambda { |sample| sample.value }
      r.matrix = CrossValidation::ConfusionMatrix.new(method(:keys_for))
      r.training = lambda { |classifier, doc|
        classifier.train doc.klass, doc.value
      }
      r.classifying = lambda { |classifier, doc|
        classifier.classify doc
      }
    end

    mat = runner.run

    assert_equal 50, mat.tp
    assert_equal 50, mat.tn
    # The fixture classifier is always right on this corpus, so nothing may
    # be counted as a misclassification.
    assert_equal 0, mat.fp
    assert_equal 0, mat.fn
  end

  def test_percentage_takes_precedence_over_folds
    runner = CrossValidation::Runner.create do |r|
      r.documents = ['foo'] * 100
      r.folds = 20
      r.percentage = 0.1
    end

    assert_equal 10, runner.k
  end

  # Generate one getter test per configuration accessor.
  [
   :documents,
   :folds,
   :percentage,
   :classifier,
   :fetch_sample_value,
   :fetch_sample_class,
   :matrix,
   :training,
   :classifying,
  ].each do |attribute|
    check_dsl(attribute, :foo)
  end
end
metadata ADDED
@@ -0,0 +1,73 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cross_validation
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Jon-Michael Deldin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-04-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '10.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '10.0'
27
+ description: Performs k-fold cross-validation on machine learning classifiers.
28
+ email:
29
+ - dev@jmdeldin.com
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - .gitignore
35
+ - .travis.yml
36
+ - Gemfile
37
+ - LICENSE.txt
38
+ - README.md
39
+ - Rakefile
40
+ - cross_validation.gemspec
41
+ - lib/cross_validation.rb
42
+ - lib/cross_validation/confusion_matrix.rb
43
+ - lib/cross_validation/runner.rb
44
+ - test/test_confusion_matrix.rb
45
+ - test/test_helper.rb
46
+ - test/test_runner.rb
47
+ homepage: https://github.com/jmdeldin/cross_validation
48
+ licenses: []
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - '>='
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.0.0.rc.2
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Performs k-fold cross-validation on machine learning classifiers.
70
+ test_files:
71
+ - test/test_confusion_matrix.rb
72
+ - test/test_helper.rb
73
+ - test/test_runner.rb