harlequin 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.markdown +32 -0
- data/Rakefile +1 -0
- data/harlequin.gemspec +25 -0
- data/lib/harlequin/version.rb +3 -0
- data/lib/harlequin.rb +149 -0
- data/spec/harlequin_spec.rb +64 -0
- data/spec/lda_sample.csv +2001 -0
- data/spec/spec_helper.rb +2 -0
- metadata +82 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
Harlequin is a gem that provides easy access to R's linear and quadratic discriminant analysis functions. To use Harlequin, initialize a `DiscriminantAnalysis` object with an array of the variable names to analyze as the first argument and the name of the classification variable as the second, like so:
|
2
|
+
|
3
|
+
```ruby
|
4
|
+
analysis = DiscriminantAnalysis.new([:weight, :height], :gender)
|
5
|
+
```
|
6
|
+
|
7
|
+
Training rows should be formatted as hashes with pairs of the form `variable_name => value`. For example, we can add some rows to the analysis above with
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
analysis.add_training_data(
|
11
|
+
{ :weight => 200, :height => 72, :gender => 'male' },
|
12
|
+
{ :weight => 205, :height => 71, :gender => 'male' },
|
13
|
+
{ :weight => 140, :height => 63, :gender => 'female'},
|
14
|
+
{ :weight => 130, :height => 61, :gender => 'female'}
|
15
|
+
)
|
16
|
+
```
|
17
|
+
(Note that each classification value must appear in more than one training row, and no variable may be constant within a class.)
|
18
|
+
|
19
|
+
Initialize a linear or quadratic analysis with `#init_lda_analysis` or `#init_qda_analysis`, respectively. Then we can predict the class of new rows, which are also given as hashes:
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
analysis.init_lda_analysis
|
23
|
+
analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
|
24
|
+
```
|
25
|
+
|
26
|
+
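Multiple predictions can be computed at once by passing several sample hashes to `#predict`, in the same way that multiple training rows are passed to `#add_training_data`; the result is then an array of prediction hashes.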
|
27
|
+
|
28
|
+
In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
|
32
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/harlequin.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "harlequin/version"

# Gem specification for harlequin. The file lists below are read from
# `git ls-files`, so the gem has to be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "harlequin"
  s.version     = Harlequin::VERSION
  s.authors     = ["Brian Stanwyck"]
  s.email       = ["brian@highgroove.com"]
  # NOTE(review): homepage is empty -- fill in the project URL before release.
  s.homepage    = ""
  s.summary     = %q{Wrapper for discriminant analysis methods in R}
  s.description = %q{harlequin is a Ruby wrapper for linear and quadratic discriminant analysis in R for statistical classification. Also allows means testing to determine significance of discriminant variables.}

  # NOTE(review): rubyforge_project is a legacy attribute; confirm it is still
  # accepted by the RubyGems version used to build the gem.
  s.rubyforge_project = "harlequin"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here
  s.add_development_dependency "rspec"
  # rinruby provides the R bridge used by lib/harlequin.rb
  s.add_dependency "rinruby"
end
|
data/lib/harlequin.rb
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
module Harlequin
|
2
|
+
require 'rinruby'
|
3
|
+
|
4
|
+
R.echo false
|
5
|
+
R.eval "library(MASS)"
|
6
|
+
R.eval "library(alr3)"
|
7
|
+
|
8
|
+
class DiscriminantAnalysis
|
9
|
+
attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
|
10
|
+
|
11
|
+
# Sets up an analysis with no training data.
#
# variables               - Array of names of the variables to analyse.
# classification_variable - name of the variable holding each row's class.
#
# @variables holds the given names followed by the classification variable.
# A new array is built for it, so the caller's array is left untouched
# (and may be frozen).
def initialize(variables, classification_variable)
  @accuracy = nil
  @variables = variables + [classification_variable]
  @classification_variable = classification_variable
  @training_data = []
  @class_hash = {}
end
|
18
|
+
|
19
|
+
# Discards all training rows and the class-value-to-id mapping, and drops
# the accuracy figures that were computed from them, so these three values
# are back to what #initialize sets.
# NOTE(review): nothing is cleared on the R side here (`fit` and
# `analysis_data` keep their last values) -- confirm that is acceptable.
def clear_training_data
  @accuracy = nil
  @training_data = []
  @class_hash = {}
end
|
23
|
+
|
24
|
+
# Appends rows to the training set and gives every class value not seen
# before the next integer id (1, 2, 3, ...), in order of first appearance.
#
# new_data - one or more hashes of the form variable_name => value; the
#            value stored under the classification variable is the class.
#
# Returns an array with the class value of every training row held so far.
def add_training_data(*new_data)
  @training_data += new_data

  # Rows added by earlier calls are already registered, so only the new
  # rows need to be inspected.
  new_data.each do |row|
    class_value = row[@classification_variable]
    next if @class_hash.key?(class_value)

    @class_hash[class_value] = (@class_hash.values.max || 0) + 1
  end

  @training_data.map { |row| row[@classification_variable] }
end
|
33
|
+
|
34
|
+
# Classifies one or more sample points with the model stored in R as `fit`
# (created by init_lda_analysis / init_qda_analysis).
#
# samples - one or more hashes of the form variable_name => value, with a
#           value for every variable except the classification variable.
#
# Returns a hash { :class => ..., :confidence => ... } when exactly one
# sample is given, an array of such hashes for several samples, and an
# empty array when no samples are given.
def predict(*samples)
  # Nothing to classify: skip the round trip to R.
  return [] if samples.empty?

  predictors = variables - [classification_variable]
  predictors.each do |var|
    R.assign("#{var}_sample", samples.map { |s| s[var] })
  end

  sample_var_declarations = predictors.map { |var| "#{var} = #{var}_sample" }.join(',')
  R.eval "sample_points <- data.frame(#{sample_var_declarations})"

  R.eval "predictions <- predict(fit, sample_points)"
  R.eval "classes <- as.numeric(predictions$class)"

  R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
  prediction_matrix = R.pull "as.matrix(d)"

  # Each row is [class id, posterior of class 1, posterior of class 2, ...],
  # so the class id doubles as the index of its own posterior.
  # This requires classes to be integers 1,2,3,...
  # TODO: implement this without requiring specific values for sample hashes
  classes_by_id = @class_hash.invert
  predictions = prediction_matrix.to_a.map do |row|
    classification = row.first.to_i
    {
      :class => classes_by_id[classification],
      :confidence => row[classification]
    }
  end

  predictions.count == 1 ? predictions.first : predictions
end
|
62
|
+
|
63
|
+
# Performs a test of difference of means between classes
# Since the t-test is two-sample, classification_variable must only have two states
#
# variable - name of the variable whose class means are compared. The R
#            vectors named after it and after the classification variable
#            are the ones assigned by init_analysis, so an analysis has to
#            be initialised first.
#
# Returns a hash with :t_statistic, :degrees_of_freedom and :p_value.
def t_test(variable)
  R.eval "t_test <- t.test(#{variable} ~ #{classification_variable})"

  {
    :t_statistic => R.pull("t_test$statistic"),
    # The result of t.test stores the degrees of freedom in its `parameter`
    # component; it has no component called `df`.
    :degrees_of_freedom => R.pull("t_test$parameter"),
    :p_value => R.pull("t_test$p.value")
  }
end
|
78
|
+
|
79
|
+
# Draws an R plot of the non-classification variables, colouring each point
# by its class.
#
# samples - optional Array of hashes (variable_name => value) that also
#           carry the classification variable. When omitted, the columns of
#           R's analysis_data are plotted instead.
def plot(samples = nil)
  predictors = variables - [classification_variable]

  if samples
    predictors.each do |var|
      R.assign("#{var}_sample", samples.map { |s| s[var] })
    end

    # Colour the samples by their own classes, encoded with the integer ids
    # used for the training data; values without an id are passed through.
    class_ids = samples.map do |s|
      value = s[classification_variable]
      @class_hash.fetch(value, value)
    end
    R.assign("#{classification_variable}_sample", class_ids)

    plot_vars = predictors.map { |var| "#{var}_sample" }.join(',')
    colour = "#{classification_variable}_sample"
  else
    plot_vars = predictors.map { |var| "analysis_data$#{var}" }.join(',')
    colour = classification_variable.to_s
  end

  R.eval "plot(data.frame(#{plot_vars}), col=as.numeric(#{colour}))"
end
|
90
|
+
|
91
|
+
# Draws an R plot of the non-classification variables, coloured by the R
# vector `predictions`.
#
# samples - optional Array of hashes (variable_name => value). When given,
#           the samples are classified with #predict and each point is
#           coloured by the integer id of its predicted class. When
#           omitted, the columns of R's analysis_data are plotted.
#
# NOTE(review): without samples the colours come from the recorded class in
# analysis_data, not from predict(fit) -- confirm that is intended.
def plot_predict(samples = nil)
  predictors = variables - [classification_variable]

  if samples
    # Classify everything in one call; #predict returns a bare hash for a
    # single sample, so normalise to an array.
    results = predict(*samples)
    results = [results] if results.is_a?(Hash)

    # Assigned after #predict, which writes the same *_sample vectors.
    predictors.each do |var|
      R.assign("#{var}_sample", samples.map { |s| s[var] })
    end
    plot_vars = predictors.map { |var| "#{var}_sample" }.join(',')

    # R needs plain numbers for col=, so send the class ids rather than the
    # prediction hashes.
    R.assign("predictions", results.map { |result| @class_hash[result[:class]] })
  else
    plot_vars = predictors.map { |var| "analysis_data$#{var}" }.join(',')
    R.eval "predictions <- as.numeric(analysis_data$#{classification_variable})"
  end

  R.eval "plot(data.frame(#{plot_vars}), col=predictions)"
end
|
105
|
+
|
106
|
+
# Defines #init_lda_analysis and #init_qda_analysis. Each one pushes the
# training data to R (init_analysis), builds the analysis_data frame, fits
# the model with R's lda() or qda() into `fit`, and then returns the hash
# produced by compute_accuracy.
['lda', 'qda'].each do |analysis_type|
  define_method("init_#{analysis_type}_analysis") do
    init_analysis
    R.eval <<-EOF
      analysis_data <- data.frame(#{@var_declarations})
      fit <- #{analysis_type}(#{classification_variable} ~ #{@non_class_variables}, data=analysis_data)
    EOF
    compute_accuracy
  end
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
# Copies the training data into R, one vector per variable, each named
# after its variable. Values of the classification variable are sent as
# their integer ids from @class_hash; all other values are sent unchanged.
#
# Also prepares the two strings used to build the R model call:
#   @var_declarations    - all variable names joined with ','
#   @non_class_variables - the non-classification names joined with '+'
def init_analysis
  variables.each do |variable|
    column = training_data.map do |point|
      value = point[variable]
      variable == @classification_variable ? @class_hash[value] : value
    end
    R.assign(variable.to_s, column)
  end

  @var_declarations = variables.map(&:to_s).join(',')
  @non_class_variables = (variables - [classification_variable]).map(&:to_s).join('+')
end
|
130
|
+
|
131
|
+
# Scores the fitted model against its own training data and stores the
# result in @accuracy (also returned).
#
# In R, `ct` is the table of predicted class (rows) against the class
# recorded in analysis_data (columns). From it are read:
#   :percent_correct         - sum of the diagonal proportions
#   :percent_false_positives - cell [1,2] as a share of row 1
#   :percent_false_negatives - cell [2,1] as a share of row 2
#   :correlation_coefficient - sqrt(chi-squared statistic / number of rows)
#
# NOTE(review): only cells of rows 1 and 2 are read for the false
# positive/negative rates, so those two figures appear to assume exactly
# two classes -- confirm before using with more.
def compute_accuracy
  R.eval "ct <- table(predict(fit)$class, analysis_data$#{classification_variable})"

  percent_correct = R.pull "sum(diag(prop.table(ct)))"

  false_positive_share = R.pull "prop.table(ct)[1,2]"
  first_row_total = R.pull "prop.table(ct)[1,1] + prop.table(ct)[1,2]"
  percent_false_positives = false_positive_share / first_row_total

  false_negative_share = R.pull "prop.table(ct)[2,1]"
  second_row_total = R.pull "prop.table(ct)[2,1] + prop.table(ct)[2,2]"
  percent_false_negatives = false_negative_share / second_row_total

  correlation_coefficient = R.pull "sqrt(chisq.test(ct)$statistic/sum(ct))"

  @accuracy = {
    :percent_correct => percent_correct,
    :percent_false_negatives => percent_false_negatives,
    :percent_false_positives => percent_false_positives,
    :correlation_coefficient => correlation_coefficient
  }
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
include Harlequin
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'spec_helper'
# CSV.read is used below; required here in case spec_helper does not load it.
require 'csv'

# Exercises DiscriminantAnalysis against spec/lda_sample.csv. Needs a
# working R installation reachable through rinruby.
describe Harlequin::DiscriminantAnalysis do
  before do
    @discriminant_analysis = DiscriminantAnalysis.new([:weight, :height], :gender)

    csv_data = CSV.read("spec/lda_sample.csv")
    # drop the first row of the file
    csv_data.shift
    @training = csv_data.map { |weight, height, gender| {:weight => weight.to_f, :height => height.to_f, :gender => gender.to_i} }

    @discriminant_analysis.add_training_data(*@training)
    @discriminant_analysis.init_lda_analysis

    @male_sample = { :height => 73, :weight => 210 }
    @female_sample = { :height => 60, :weight => 140 }

    @male_prediction = @discriminant_analysis.predict(@male_sample)
    @female_prediction = @discriminant_analysis.predict(@female_sample)
  end

  it 'computes the accuracy of a given training set' do
    @discriminant_analysis.accuracy[:percent_correct].should be > 0.5
  end

  it 'predicts inclusion in a set' do
    @male_prediction[:class].should eq(1)
    @female_prediction[:class].should eq(2)
  end

  it 'provides confidence scores for a prediction' do
    @male_prediction[:confidence].should be > 0.5
    @female_prediction[:confidence].should be > 0.5
  end

  it 'predicts for arrays of sample points' do
    samples = [@male_sample, @female_sample]
    predictions = @discriminant_analysis.predict(*samples)

    predictions.map { |row| row[:class] }.should eq [1,2]
    predictions.map { |row| row[:confidence] }.each do |confidence|
      confidence.should be > 0.5
    end
  end

  it 'clears training data from a DiscriminantAnalysis instance' do
    @discriminant_analysis.clear_training_data
    @discriminant_analysis.training_data.should be_empty
  end

  it 'accepts non-numeric classification values in training data' do
    @discriminant_analysis.clear_training_data

    # replace the numeric class codes with strings
    @training.map! do |row|
      gender_string = row[:gender] == 1 ? 'male' : 'female'
      row.merge(:gender => gender_string)
    end

    @discriminant_analysis.add_training_data(*@training)
    @discriminant_analysis.init_lda_analysis
    @discriminant_analysis.accuracy[:percent_correct].should be_within(0.001).of 0.9485

    @male_prediction = @discriminant_analysis.predict(@male_sample)
    @male_prediction[:class].should eq 'male'
  end
end
|