harlequin 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.markdown +32 -0
- data/Rakefile +1 -0
- data/harlequin.gemspec +25 -0
- data/lib/harlequin/version.rb +3 -0
- data/lib/harlequin.rb +149 -0
- data/spec/harlequin_spec.rb +64 -0
- data/spec/lda_sample.csv +2001 -0
- data/spec/spec_helper.rb +2 -0
- metadata +82 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
Harlequin is a gem that allows easy access to the linear and quadratic discriminant analysis functions of R. To use harlequin, initialize a DiscriminantAnalysis object with an array of variable names for analysis, and a classification variable name as a second argument, like so:
|
2
|
+
|
3
|
+
```ruby
|
4
|
+
analysis = DiscriminantAnalysis.new([:weight, :height], :gender)
|
5
|
+
```
|
6
|
+
|
7
|
+
Training rows should be formatted as hashes with pairs of the form ```variable_name => value```. For example, we can add some rows to the analysis above with
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
analysis.add_training_data(
|
11
|
+
{ :weight => 200, :height => 72, :gender => 'male' },
|
12
|
+
{ :weight => 205, :height => 71, :gender => 'male' },
|
13
|
+
{ :weight => 140, :height => 63, :gender => 'female'},
|
14
|
+
{ :weight => 130, :height => 61, :gender => 'female'}
|
15
|
+
)
|
16
|
+
```
|
17
|
+
(Note that every classification value must appear in more than one training row, and a variable's values must not be constant within a class.)
|
18
|
+
|
19
|
+
Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#init_qda_analysis```, respectively. Then we can predict the class of new rows, also given as hashes:
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
analysis.init_lda_analysis
|
23
|
+
analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
|
24
|
+
```
|
25
|
+
|
26
|
+
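Multiple predictions can be computed at once by passing several sample hashes to ```#predict```, in the same way that multiple training rows are passed to ```#add_training_data```; the result is then an array of prediction hashes.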
|
27
|
+
|
28
|
+
In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
|
32
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/harlequin.gemspec
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "harlequin/version"
|
4
|
+
|
5
|
+
# Gem metadata for harlequin. File lists are taken from the git index, so
# building the gem requires running inside a git checkout.
Gem::Specification.new do |spec|
  spec.name        = "harlequin"
  spec.version     = Harlequin::VERSION
  spec.authors     = ["Brian Stanwyck"]
  spec.email       = ["brian@highgroove.com"]
  spec.homepage    = ""
  spec.summary     = "Wrapper for discriminant analysis methods in R"
  spec.description = "harlequin is a Ruby wrapper for linear and quadratic discriminant analysis in R for statistical classification. Also allows means testing to determine significance of discriminant variables."

  spec.rubyforge_project = "harlequin"

  tracked_files      = `git ls-files`
  tracked_tests      = `git ls-files -- {test,spec,features}/*`
  tracked_binaries   = `git ls-files -- bin/*`

  spec.files         = tracked_files.split("\n")
  spec.test_files    = tracked_tests.split("\n")
  spec.executables   = tracked_binaries.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # specify any dependencies here

  spec.add_development_dependency "rspec"
  spec.add_dependency "rinruby"
end
|
data/lib/harlequin.rb
ADDED
@@ -0,0 +1,149 @@
|
|
1
|
+
module Harlequin
|
2
|
+
require 'rinruby'
|
3
|
+
|
4
|
+
R.echo false
|
5
|
+
R.eval "library(MASS)"
|
6
|
+
R.eval "library(alr3)"
|
7
|
+
|
8
|
+
class DiscriminantAnalysis
|
9
|
+
attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
|
10
|
+
|
11
|
+
# Sets up an analysis with no training data and no computed accuracy.
#
# @param variables [Array] names of the predictor variables (the README uses symbols)
# @param classification_variable [Object] name of the variable holding each row's class
def initialize(variables, classification_variable)
  @accuracy = nil
  # Build a new array rather than appending in place, so the array passed in
  # by the caller is not modified (and may be frozen).
  @variables = variables + [classification_variable]
  @classification_variable = classification_variable
  @training_data = []
  @class_hash = {}
end
|
18
|
+
|
19
|
+
# Discards every stored training row together with the class-label mapping.
# Both are replaced with fresh empty containers.
def clear_training_data
  @training_data = Array.new
  @class_hash = Hash.new
end
|
23
|
+
|
24
|
+
# Appends rows to the training set and registers any class labels not seen before.
#
# Each row is a hash keyed by variable name. Every distinct value found under
# the classification variable receives the next integer code (1, 2, 3, ...)
# in @class_hash, in order of first appearance.
#
# @param new_data [Array<Hash>] training rows to append
def add_training_data(*new_data)
  @training_data += new_data

  labels = @training_data.map { |row| row[@classification_variable] }
  labels.each do |label|
    next if @class_hash.keys.include?(label)

    highest_code = @class_hash.values.max
    next_code = highest_code ? highest_code + 1 : 1
    @class_hash.merge!({ label => next_code })
  end
end
|
33
|
+
|
34
|
+
# Classifies one or more sample points using the model currently stored in
# the R variable `fit` (created by init_lda_analysis / init_qda_analysis).
#
# @param samples [Array<Hash>] rows keyed by predictor variable name
# @return [Hash, Array<Hash>] a single {:class, :confidence} hash when exactly
#   one prediction comes back, otherwise an array of such hashes
def predict(*samples)
  predictors = variables - [classification_variable]

  # Ship each predictor column to R as "<name>_sample".
  predictors.each do |var|
    R.assign("#{var}_sample", samples.map { |sample| sample[var] })
  end

  frame_columns = predictors.map { |var| "#{var} = #{var}_sample" }.join(',')
  R.eval "sample_points <- data.frame(#{frame_columns})"

  R.eval "predictions <- predict(fit, sample_points)"
  R.eval "classes <- as.numeric(predictions$class)"

  R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
  prediction_matrix = R.pull "as.matrix(d)"

  # Each pulled row starts with the class code followed by the posterior
  # columns, so the class code is also the index of its own posterior.
  # This requires classes to be integers 1,2,3,...
  # TODO: implement this without requiring specific values for sample hashes
  labels_by_code = @class_hash.invert
  results = prediction_matrix.to_a.map do |row|
    code = row.first.to_i
    { :class => labels_by_code[code], :confidence => row[code] }
  end

  results.count == 1 ? results.first : results
end
|
62
|
+
|
63
|
+
# Performs a two-sample t-test for a difference in means of `variable`
# between the classes.
# Since the t-test is two-sample, classification_variable must only have two states.
#
# The R formula refers to R variables named after `variable` and the
# classification variable; init_analysis assigns those, so one of the
# init_*_analysis methods has to run first.
#
# @param variable [#to_s] name of the variable to test
# @return [Hash] :t_statistic, :degrees_of_freedom and :p_value as pulled from R
def t_test(variable)
  R.eval "t_test <- t.test(#{variable.to_s} ~ #{classification_variable.to_s})"

  t_statistic = R.pull "t_test$statistic"
  # R's htest result keeps the degrees of freedom in `parameter`;
  # it has no `df` element, so `t_test$df` would be NULL.
  degrees_of_freedom = R.pull "t_test$parameter"
  p_value = R.pull "t_test$p.value"

  {
    :t_statistic => t_statistic,
    :degrees_of_freedom => degrees_of_freedom,
    :p_value => p_value
  }
end
|
78
|
+
|
79
|
+
# Draws a scatter plot of the predictor variables in R, coloured by class.
#
# With no argument the training data held in R's `analysis_data` is plotted.
# With `samples`, those rows are plotted instead and coloured by their own
# class labels, translated to the integer codes assigned during training.
#
# @param samples [Array<Hash>, nil] labelled rows keyed by variable name
# @raise [KeyError] if a sample carries a class label absent from the training data
def plot(samples = nil)
  predictors = variables - [classification_variable]

  if samples
    predictors.each do |var|
      R.assign("#{var}_sample", samples.map { |s| s[var] })
    end

    # Colour by the samples' classes, not by the training column (which has a
    # different length and order). Labels are sent as integer codes so that
    # non-numeric labels such as 'male' produce valid colours.
    class_codes = samples.map { |s| @class_hash.fetch(s[classification_variable]) }
    R.assign("#{classification_variable}_sample", class_codes)

    plot_vars = predictors.map { |var| "#{var}_sample" }.join(',')
    colour_source = "#{classification_variable}_sample"
  else
    plot_vars = predictors.map { |var| "analysis_data$#{var}" }.join(',')
    colour_source = classification_variable.to_s
  end

  R.eval "plot(data.frame(#{plot_vars}), col=as.numeric(#{colour_source}))"
end
|
90
|
+
|
91
|
+
# Draws a scatter plot of the predictor variables in R, coloured via the R
# variable `predictions`.
#
# With `samples`, each point is coloured by the class that #predict assigns
# to it. With no argument the training data in `analysis_data` is plotted.
#
# @param samples [Array<Hash>, nil] rows keyed by predictor variable name
def plot_predict(samples = nil)
  predictors = variables - [classification_variable]

  if samples
    # Only predictor columns are sent; samples to be predicted need not
    # carry a class label.
    predictors.each do |var|
      R.assign("#{var}_sample", samples.map { |s| s[var] })
    end
    plot_vars = predictors.map { |var| "#{var}_sample" }.join(',')

    # #predict returns a bare hash for a single sample, so normalise to an array.
    predicted = [predict(*samples)].flatten
    # R needs a numeric vector for `col`; send the integer class codes rather
    # than the prediction hashes themselves.
    R.assign("predictions", predicted.map { |prediction| @class_hash[prediction[:class]] })
  else
    plot_vars = predictors.map { |var| "analysis_data$#{var}" }.join(',')
    # NOTE(review): this colours by the training classes, not by predicted
    # classes — confirm whether predict(fit)$class was intended.
    R.eval "predictions <- as.numeric(analysis_data$#{classification_variable.to_s})"
  end

  R.eval "plot(data.frame(#{plot_vars}), col=predictions)"
end
|
105
|
+
|
106
|
+
# Defines init_lda_analysis and init_qda_analysis. Each one pushes the
# training data to R, fits the matching model into the R variable `fit`
# and then refreshes @accuracy.
%w[lda qda].each do |analysis_type|
  define_method("init_#{analysis_type}_analysis") do
    init_analysis

    fit_script = <<-EOF
      analysis_data <- data.frame(#{@var_declarations})
      fit <- #{analysis_type}(#{classification_variable.to_s} ~ #{@non_class_variables}, data=analysis_data)
    EOF
    R.eval fit_script

    compute_accuracy
  end
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
# Copies every training column into an R variable of the same name and
# prepares the strings used to build the data frame and model formula.
# The class column is sent as the integer codes stored in @class_hash.
def init_analysis
  variables.each do |variable|
    column = training_data.map do |point|
      value = point[variable]
      variable == @classification_variable ? @class_hash[value] : value
    end
    R.assign(variable.to_s, column)
  end

  predictors = variables - [classification_variable]
  @var_declarations = variables.map { |name| name.to_s }.join(',')
  @non_class_variables = predictors.map(&:to_s).join('+')
end
|
130
|
+
|
131
|
+
# Cross-tabulates the fitted model's predictions on the training data against
# the training classes (R table `ct`: rows predicted, columns actual) and
# stores summary figures in @accuracy.
#
# The false positive/negative figures read cells [1,2] and [2,1] only, each
# divided by its row total.
def compute_accuracy
  R.eval "ct <- table(predict(fit)$class, analysis_data$#{classification_variable.to_s})"

  share_correct = R.pull "sum(diag(prop.table(ct)))"

  first_row_miss = R.pull "prop.table(ct)[1,2]"
  first_row_total = R.pull "prop.table(ct)[1,1] + prop.table(ct)[1,2]"
  share_false_positives = first_row_miss / first_row_total

  second_row_miss = R.pull "prop.table(ct)[2,1]"
  second_row_total = R.pull "prop.table(ct)[2,1] + prop.table(ct)[2,2]"
  share_false_negatives = second_row_miss / second_row_total

  association = R.pull "sqrt(chisq.test(ct)$statistic/sum(ct))"

  @accuracy = {
    :percent_correct => share_correct,
    :percent_false_negatives => share_false_negatives,
    :percent_false_positives => share_false_positives,
    :correlation_coefficient => association
  }
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
include Harlequin
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
describe Harlequin::DiscriminantAnalysis do
  # Fits an LDA model on the sample CSV before every example and precomputes
  # one prediction for each of two sample points.
  before do
    @discriminant_analysis = DiscriminantAnalysis.new([:weight, :height], :gender)

    # The first CSV row is skipped; the remaining rows are weight, height, gender.
    data_rows = CSV.read("spec/lda_sample.csv").drop(1)
    @training = data_rows.map do |weight, height, gender|
      { :weight => weight.to_f, :height => height.to_f, :gender => gender.to_i }
    end

    @discriminant_analysis.add_training_data(*@training)
    @discriminant_analysis.init_lda_analysis

    @male_sample = { :height => 73, :weight => 210 }
    @female_sample = { :height => 60, :weight => 140 }

    @male_prediction = @discriminant_analysis.predict(@male_sample)
    @female_prediction = @discriminant_analysis.predict(@female_sample)
  end

  it 'computes the accuracy of a given training set' do
    @discriminant_analysis.accuracy[:percent_correct].should be > 0.5
  end

  it 'predicts inclusion in a set' do
    @male_prediction[:class].should eq(1)
    @female_prediction[:class].should eq(2)
  end

  it 'provides confidence scores for a prediction' do
    @male_prediction[:confidence].should be > 0.5
    @female_prediction[:confidence].should be > 0.5
  end

  it 'predicts for arrays of sample points' do
    predictions = @discriminant_analysis.predict(@male_sample, @female_sample)

    predictions.map { |prediction| prediction[:class] }.should eq [1,2]
    predictions.each do |prediction|
      prediction[:confidence].should be > 0.5
    end
  end

  it 'clears training data from a DiscriminantAnalysis instance' do
    @discriminant_analysis.clear_training_data
    @discriminant_analysis.training_data.should be_empty
  end

  it 'accepts non-numeric classification values in training data' do
    @discriminant_analysis.clear_training_data

    # Swap the numeric class codes for string labels.
    @training.map! do |row|
      label = row[:gender] == 1 ? 'male' : 'female'
      row.merge(:gender => label)
    end

    @discriminant_analysis.add_training_data(*@training)
    @discriminant_analysis.init_lda_analysis
    @discriminant_analysis.accuracy[:percent_correct].should be_within(0.001).of 0.9485

    @male_prediction = @discriminant_analysis.predict(@male_sample)
    @male_prediction[:class].should eq 'male'
  end
end
|