harlequin 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Use the HTTPS endpoint of RubyGems rather than plain HTTP.
source "https://rubygems.org"

# Specify your gem's dependencies in harlequin.gemspec
gemspec
data/README.markdown ADDED
@@ -0,0 +1,32 @@
1
+ Harlequin is a gem that allows easy access to the linear and quadratic discriminant analysis functions of R. To use harlequin, initialize a DiscriminantAnalysis object with an array of variable names for analysis, and a classification variable name as a second argument, like so:
2
+
3
+ ```ruby
4
+ analysis = DiscriminantAnalysis.new([:weight, :height], :gender)
5
+ ```
6
+
7
+ Training rows should be formatted as hashes with pairs of the form ```variable_name => value```. For example, we can add some rows to the analysis above with
8
+
9
+ ```ruby
10
+ analysis.add_training_data(
11
+ { :weight => 200, :height => 72, :gender => 'male' },
12
+ { :weight => 205, :height => 71, :gender => 'male' },
13
+ { :weight => 140, :height => 63, :gender => 'female'},
14
+ { :weight => 130, :height => 61, :gender => 'female'}
15
+ )
16
+ ```
17
+ (Note that each classification value must appear in more than one training row, and the value of a variable must not be constant within a class.)
18
+
19
+ Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#init_qda_analysis```, respectively. Then we can predict the class of new rows, also given as hashes:
20
+
21
+ ```ruby
22
+ analysis.init_lda_analysis
23
+ analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
24
+ ```
25
+
26
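+ Multiple predictions can be computed at once by passing several sample hashes to ```#predict``` as separate arguments, just as multiple training rows are passed to ```#add_training_data```; the result is then an array of prediction hashes.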
27
+
28
+ In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
29
+
30
+ ```ruby
31
+ analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
32
+ ```
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/harlequin.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "harlequin/version"
4
+
5
# Gem metadata for harlequin. The version comes from Harlequin::VERSION and the
# file lists are taken from what git tracks at build time.
Gem::Specification.new do |spec|
  spec.name        = "harlequin"
  spec.version     = Harlequin::VERSION
  spec.authors     = ["Brian Stanwyck"]
  spec.email       = ["brian@highgroove.com"]
  spec.homepage    = ""
  spec.summary     = "Wrapper for discriminant analysis methods in R"
  spec.description = "harlequin is a Ruby wrapper for linear and quadratic discriminant analysis in R for statistical classification. Also allows means testing to determine significance of discriminant variables."

  spec.rubyforge_project = "harlequin"

  tracked_files    = `git ls-files`.split("\n")
  tracked_tests    = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_binaries = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  # Executables are listed by base name only, without the bin/ prefix.
  spec.executables   = tracked_binaries.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # rspec is needed only to run the specs; rinruby is required at runtime.
  spec.add_development_dependency "rspec"
  spec.add_dependency "rinruby"
end
@@ -0,0 +1,3 @@
1
module Harlequin
  # Version of the gem. Frozen so the string cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/harlequin.rb ADDED
@@ -0,0 +1,149 @@
1
+ module Harlequin
2
+ require 'rinruby'
3
+
4
+ R.echo false
5
+ R.eval "library(MASS)"
6
+ R.eval "library(alr3)"
7
+
8
# Linear and quadratic discriminant analysis driven through an R session
# (the RinRuby +R+ object).
#
# Training rows and samples are hashes of the form variable_name => value.
# Classification values are translated to the integer codes 1, 2, 3, ...
# (see +class_hash+) before they are handed to R, so they may be of any type
# usable as a hash key.
class DiscriminantAnalysis
  attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash

  # variables::               array of discriminant variable names.
  # classification_variable:: name of the variable holding each row's class.
  #
  # +variables+ (the reader) lists the discriminant variables followed by the
  # classification variable.
  def initialize(variables, classification_variable)
    @accuracy = nil
    # Build a new array; appending with << would modify the caller's array.
    @variables = variables + [classification_variable]
    @classification_variable = classification_variable
    @training_data = []
    @class_hash = {}
  end

  # Forgets all training rows and the class-value-to-code mapping.
  def clear_training_data
    @training_data = []
    @class_hash = {}
  end

  # Appends training rows. Rows may be given as separate arguments or as a
  # single array of rows.
  #
  # Every classification value not seen before receives the next integer code
  # (1, 2, 3, ...) in +class_hash+.
  #
  # Returns the classification value of every training row collected so far.
  def add_training_data(*new_data)
    @training_data += new_data.flatten

    class_values = @training_data.map { |row| row[@classification_variable] }
    class_values.each do |class_value|
      # Codes are handed out sequentially from 1, so the next free code is size + 1.
      @class_hash[class_value] = @class_hash.size + 1 unless @class_hash.key?(class_value)
    end
  end

  # Returns the class determined by the fitted discriminant analysis for one or
  # more sample points. Samples may be given as separate arguments or as a
  # single array. Uses the R object +fit+, which is created by
  # init_lda_analysis / init_qda_analysis.
  #
  # Each prediction is a hash { :class => original class value, :confidence =>
  # posterior probability of that class }. A single hash is returned for one
  # sample, an array of hashes otherwise.
  def predict(*samples)
    samples = samples.flatten
    return [] if samples.empty?

    assign_feature_samples(samples)

    sample_var_declarations = feature_variables.map { |var| "#{var} = #{var}_sample" }.join(',')
    R.eval "sample_points <- data.frame(#{sample_var_declarations})"

    R.eval "predictions <- predict(fit, sample_points)"
    R.eval "classes <- as.numeric(predictions$class)"

    R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
    prediction_matrix = R.pull "as.matrix(d)"

    # Each row is [class code, posterior for code 1, posterior for code 2, ...],
    # so the class code doubles as the index of its own posterior.
    class_values_by_code = @class_hash.invert
    predictions = prediction_matrix.to_a.map do |row|
      classification = row.first.to_i
      {
        :class => class_values_by_code[classification],
        :confidence => row[classification]
      }
    end

    predictions.count == 1 ? predictions.first : predictions
  end

  # Performs a test of difference of means between classes.
  # Since the t-test is two-sample, classification_variable must only have two states.
  #
  # The formula refers to the R variables assigned by init_lda_analysis /
  # init_qda_analysis, so one of them has to be called first.
  def t_test(variable)
    R.eval "t_test <- t.test(#{variable} ~ #{classification_variable})"

    t_statistic = R.pull "t_test$statistic"
    # The degrees of freedom are stored in the "parameter" component of the
    # test result; there is no "df" component.
    degrees_of_freedom = R.pull "t_test$parameter"
    p_value = R.pull "t_test$p.value"

    {
      :t_statistic => t_statistic,
      :degrees_of_freedom => degrees_of_freedom,
      :p_value => p_value
    }
  end

  # Plots the discriminant variables, coloured by class.
  #
  # Without arguments the training data (R object +analysis_data+) is plotted.
  # With +samples+ (labelled rows, i.e. hashes that include the classification
  # variable) those rows are plotted and coloured by their own class codes.
  def plot(samples = nil)
    if samples
      samples = [samples].flatten
      assign_feature_samples(samples)
      # Colours must come from the samples themselves, not the training vector,
      # and must be the numeric class codes R can use as colours.
      R.assign("#{classification_variable}_sample",
               samples.map { |sample| @class_hash[sample[classification_variable]] })
      plot_vars = feature_variables.map { |var| "#{var}_sample" }.join(',')
      color_var = "#{classification_variable}_sample"
    else
      plot_vars = feature_variables.map { |var| "analysis_data$#{var}" }.join(',')
      color_var = classification_variable.to_s
    end
    R.eval "plot(data.frame(#{plot_vars}), col=as.numeric(#{color_var}))"
  end

  # Plots the discriminant variables, coloured by predicted class.
  #
  # With +samples+ the classes are predicted for all samples in one call and the
  # samples are plotted with their predicted class codes as colours.
  def plot_predict(samples = nil)
    if samples
      samples = [samples].flatten
      # predict assigns the "<variable>_sample" vectors for the whole sample set;
      # calling it once per sample would overwrite them with a single point.
      results = [predict(*samples)].flatten
      plot_vars = feature_variables.map { |var| "#{var}_sample" }.join(',')
      R.assign("predictions", results.map { |result| @class_hash[result[:class]] })
    else
      plot_vars = feature_variables.map { |var| "analysis_data$#{var}" }.join(',')
      # NOTE(review): this colours by the actual training classes rather than by
      # predict(fit)$class - confirm that this is intended.
      R.eval "predictions <- as.numeric(analysis_data$#{classification_variable})"
    end
    R.eval "plot(data.frame(#{plot_vars}), col=predictions)"
  end

  # Defines init_lda_analysis and init_qda_analysis. Each one pushes the
  # training data to R, fits the model into the R object +fit+ and stores the
  # accuracy figures (also returned) in +accuracy+.
  %w[lda qda].each do |analysis_type|
    define_method("init_#{analysis_type}_analysis") do
      init_analysis
      R.eval <<-EOF
        analysis_data <- data.frame(#{@var_declarations})
        fit <- #{analysis_type}(#{classification_variable} ~ #{@non_class_variables}, data=analysis_data)
      EOF
      compute_accuracy
    end
  end

  private

  # All variables except the classification variable.
  def feature_variables
    variables - [classification_variable]
  end

  # Assigns one R vector "<variable>_sample" per discriminant variable.
  def assign_feature_samples(samples)
    feature_variables.each do |var|
      R.assign("#{var}_sample", samples.map { |sample| sample[var] })
    end
  end

  # Assigns one R vector per variable, named after the variable. The
  # classification variable is sent as integer class codes.
  def init_analysis
    variables.each do |variable|
      column =
        if variable == @classification_variable
          training_data.map { |point| @class_hash[point[variable]] }
        else
          training_data.map { |point| point[variable] }
        end
      R.assign(variable.to_s, column)
    end
    @var_declarations = variables.map(&:to_s).join(',')
    @non_class_variables = feature_variables.map(&:to_s).join('+')
  end

  # Cross-tabulates predicted class (rows) against actual class (columns) for
  # the training data and derives the accuracy figures from that table.
  # The false positive/negative figures only look at the first two classes.
  def compute_accuracy
    R.eval "ct <- table(predict(fit)$class, analysis_data$#{classification_variable})"
    percent_correct = R.pull "sum(diag(prop.table(ct)))"
    percent_false_positives = (R.pull "prop.table(ct)[1,2]") / (R.pull "prop.table(ct)[1,1] + prop.table(ct)[1,2]")
    percent_false_negatives = (R.pull "prop.table(ct)[2,1]") / (R.pull "prop.table(ct)[2,1] + prop.table(ct)[2,2]")

    correlation_coefficient = R.pull "sqrt(chisq.test(ct)$statistic/sum(ct))"

    @accuracy = {
      :percent_correct => percent_correct,
      :percent_false_negatives => percent_false_negatives,
      :percent_false_positives => percent_false_positives,
      :correlation_coefficient => correlation_coefficient
    }
  end
end
147
+ end
148
+
149
+ include Harlequin
@@ -0,0 +1,64 @@
1
+ require 'spec_helper'
2
# Exercises DiscriminantAnalysis with the rows of spec/lda_sample.csv.
# In that data the classes are the integers 1 and 2; the last example maps
# 1 to 'male' and everything else to 'female'.
describe Harlequin::DiscriminantAnalysis do
  before do
    @discriminant_analysis = DiscriminantAnalysis.new([:weight, :height], :gender)

    rows = CSV.read("spec/lda_sample.csv")
    rows.shift # drop the first row (presumably the CSV header)
    @training = rows.map do |weight, height, gender|
      { :weight => weight.to_f, :height => height.to_f, :gender => gender.to_i }
    end

    @discriminant_analysis.add_training_data(*@training)
    @discriminant_analysis.init_lda_analysis

    @male_sample = { :height => 73, :weight => 210 }
    @female_sample = { :height => 60, :weight => 140 }

    @male_prediction = @discriminant_analysis.predict(@male_sample)
    @female_prediction = @discriminant_analysis.predict(@female_sample)
  end

  it 'computes the accuracy of a given training set' do
    accuracy = @discriminant_analysis.accuracy
    accuracy[:percent_correct].should be > 0.5
  end

  it 'predicts inclusion in a set' do
    @male_prediction[:class].should eq(1)
    @female_prediction[:class].should eq(2)
  end

  it 'provides confidence scores for a prediction' do
    @male_prediction[:confidence].should be > 0.5
    @female_prediction[:confidence].should be > 0.5
  end

  it 'predicts for arrays of sample points' do
    predictions = @discriminant_analysis.predict(@male_sample, @female_sample)

    predicted_classes = predictions.map { |prediction| prediction[:class] }
    predicted_classes.should eq [1,2]

    predictions.each do |prediction|
      prediction[:confidence].should be > 0.5
    end
  end

  it 'clears training data from a DiscriminantAnalysis instance' do
    @discriminant_analysis.clear_training_data
    @discriminant_analysis.training_data.should be_empty
  end

  it 'accepts non-numeric classification values in training data' do
    @discriminant_analysis.clear_training_data

    # Replace the numeric class of every row with a string label.
    @training.map! do |row|
      row.merge(:gender => (row[:gender] == 1 ? 'male' : 'female'))
    end

    @discriminant_analysis.add_training_data(*@training)
    @discriminant_analysis.init_lda_analysis
    @discriminant_analysis.accuracy[:percent_correct].should be_within(0.001).of 0.9485

    @male_prediction = @discriminant_analysis.predict(@male_sample)
    @male_prediction[:class].should eq 'male'
  end
end
+ end