harlequin 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "https://rubygems.org" # fetch gems over TLS instead of plain HTTP
2
+
3
+ # The gem's dependencies are declared in harlequin.gemspec
4
+ gemspec
data/README.markdown ADDED
@@ -0,0 +1,32 @@
1
+ Harlequin is a gem that provides easy access to R's linear and quadratic discriminant analysis functions. It requires a working R installation with the MASS and alr3 packages, which are loaded when the gem is required. To use harlequin, initialize a DiscriminantAnalysis object with an array of the variable names to analyze as the first argument and the name of the classification variable as the second, like so:
2
+
3
+ ```ruby
4
+ analysis = DiscriminantAnalysis.new([:weight, :height], :gender)
5
+ ```
6
+
7
+ Training rows should be formatted as hashes with pairs of the form ```variable_name => value```. For example, we can add some rows to the analysis above with
8
+
9
+ ```ruby
10
+ analysis.add_training_data(
11
+ { :weight => 200, :height => 72, :gender => 'male' },
12
+ { :weight => 205, :height => 71, :gender => 'male' },
13
+ { :weight => 140, :height => 63, :gender => 'female'},
14
+ { :weight => 130, :height => 61, :gender => 'female'}
15
+ )
16
+ ```
17
+ (Note that each classification value must appear in more than one training row, and a variable's values must not be constant within a class.)
18
+
19
+ Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#init_qda_analysis```, respectively. Then we can predict the class of new rows, also given as hashes:
20
+
21
+ ```ruby
22
+ analysis.init_lda_analysis
23
+ analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
24
+ ```
25
+
26
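+ Multiple predictions can be computed at once by passing several sample hashes to ```#predict```, in the same way as adding multiple training rows; the result is then an array of prediction hashes.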
27
+
28
+ In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
29
+
30
+ ```ruby
31
+ analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
32
+ ```
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/harlequin.gemspec ADDED
@@ -0,0 +1,25 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $LOAD_PATH.push File.expand_path("../lib", __FILE__)
3
+ require "harlequin/version"
4
+
5
+ Gem::Specification.new do |spec|
6
+   spec.name = "harlequin"
7
+   spec.version = Harlequin::VERSION
8
+   spec.authors = ["Brian Stanwyck"]
9
+   spec.email = ["brian@highgroove.com"]
10
+   spec.homepage = ""
11
+   spec.summary = "Wrapper for discriminant analysis methods in R"
12
+   spec.description = "harlequin is a Ruby wrapper for linear and quadratic discriminant analysis in R for statistical classification. Also allows means testing to determine significance of discriminant variables."
13
+
14
+   spec.rubyforge_project = "harlequin"
15
+
16
+   spec.files = `git ls-files`.split("\n")
17
+   spec.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+   spec.executables = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
19
+   spec.require_paths = ["lib"]
20
+
21
+   # Dependencies: rspec for development, rinruby at runtime
22
+
23
+   spec.add_development_dependency "rspec"
24
+   spec.add_dependency "rinruby"
25
+ end
@@ -0,0 +1,3 @@
1
+ module Harlequin
2
+   VERSION = "0.0.1".freeze # gem version; frozen so the shared constant cannot be mutated in place
3
+ end
data/lib/harlequin.rb ADDED
@@ -0,0 +1,200 @@
1
+ module Harlequin
2
+   require 'rinruby'
3
+
4
+   # Silence R's console echo, then load the R packages this module relies on.
5
+   R.echo false
6
+   R.eval "library(MASS)" # provides lda() and qda()
7
+   # NOTE(review): nothing in this file appears to call into alr3 -- confirm before removing.
8
+   R.eval "library(alr3)"
9
+
10
+   # Wrapper around R's linear (lda) and quadratic (qda) discriminant analysis.
11
+   #
12
+   # Training rows and samples are hashes of the form { variable_name => value }.
13
+   # Everything sent to R is stored in global R variables (analysis_data, fit,
14
+   # predictions, ...), so analyses sharing one R session overwrite each other.
15
+   class DiscriminantAnalysis
16
+     attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
17
+
18
+     # @param variables [Array<Symbol>] names of the predictor variables
19
+     # @param classification_variable [Symbol] name of the variable holding each row's class
20
+     def initialize(variables, classification_variable)
21
+       @accuracy = nil
22
+       # Build a new array: appending with << would also modify the caller's array.
23
+       @variables = variables + [classification_variable]
24
+       @classification_variable = classification_variable
25
+       @training_data = []
26
+       @class_hash = {}
27
+     end
28
+
29
+     # Discards all training rows together with the class-value-to-code mapping.
30
+     def clear_training_data
31
+       @training_data = []
32
+       @class_hash = {}
33
+     end
34
+
35
+     # Appends training rows and gives every class value not seen before the next
36
+     # integer code (1, 2, 3, ...); R is later handed these codes instead of the raw values.
37
+     #
38
+     # @param new_data [Array<Hash>] rows of the form { variable_name => value }
39
+     def add_training_data(*new_data)
40
+       @training_data += new_data
41
+
42
+       @training_data.each do |row|
43
+         class_value = row[@classification_variable]
44
+         next if @class_hash.key?(class_value)
45
+
46
+         @class_hash[class_value] = (@class_hash.values.max || 0) + 1
47
+       end
48
+     end
49
+
50
+     # Classifies samples with the model fitted by init_lda_analysis or init_qda_analysis.
51
+     #
52
+     # @param samples [Array<Hash>] one hash per sample, keyed by predictor variable name
53
+     # @return [Hash, Array<Hash>] { :class => ..., :confidence => ... } for a single
54
+     #   sample, or an array of such hashes when several samples are given
55
+     def predict(*samples)
56
+       predictor_variables.each do |var|
57
+         R.assign("#{var}_sample", samples.map { |s| s[var] })
58
+       end
59
+
60
+       sample_var_declarations = predictor_variables.map { |var| "#{var} = #{var}_sample" }.join(',')
61
+       R.eval "sample_points <- data.frame(#{sample_var_declarations})"
62
+
63
+       R.eval "predictions <- predict(fit, sample_points)"
64
+       R.eval "classes <- as.numeric(predictions$class)"
65
+
66
+       R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
67
+       prediction_matrix = R.pull "as.matrix(d)"
68
+
69
+       # Each row is [class code, posterior columns...]; indexing a row by its class code picks
70
+       # the predicted class's posterior (assumes R orders those columns by class code).
71
+       class_values = @class_hash.invert
72
+       predictions = prediction_matrix.to_a.map do |row|
73
+         classification = row.first.to_i
74
+         {
75
+           :class => class_values[classification],
76
+           :confidence => row[classification]
77
+         }
78
+       end
79
+
80
+       predictions.count == 1 ? predictions.first : predictions
81
+     end
82
+
83
+     # Runs R's two-sample t-test for a difference in means of +variable+ between classes.
84
+     # The classification variable must have exactly two states, and an init_*_analysis
85
+     # method must have run first so the training columns exist in R.
86
+     #
87
+     # @param variable [Symbol] predictor variable to test
88
+     # @return [Hash] :t_statistic, :degrees_of_freedom and :p_value
89
+     def t_test(variable)
90
+       R.eval "t_test <- t.test(#{variable} ~ #{classification_variable})"
91
+
92
+       t_statistic = R.pull "t_test$statistic"
93
+       # t.test stores the degrees of freedom under `parameter`; the result has no `df` element.
94
+       degrees_of_freedom = R.pull "t_test$parameter"
95
+       p_value = R.pull "t_test$p.value"
96
+
97
+       {
98
+         :t_statistic => t_statistic,
99
+         :degrees_of_freedom => degrees_of_freedom,
100
+         :p_value => p_value
101
+       }
102
+     end
103
+
104
+     # Scatter-plots the predictor variables, colored by class code.
105
+     #
106
+     # @param samples [Array<Hash>, nil] rows to plot (each must include the classification
107
+     #   variable); when nil, the training data already loaded into R is plotted
108
+     def plot(samples = nil)
109
+       if samples
110
+         predictor_variables.each do |var|
111
+           R.assign("#{var}_sample", samples.map { |s| s[var] })
112
+         end
113
+         # Color by the samples' own class codes; the training classes do not line up with these rows.
114
+         R.assign("#{classification_variable}_sample", samples.map { |s| @class_hash[s[classification_variable]] })
115
+         plot_vars = predictor_variables.map { |var| "#{var}_sample" }.join(',')
116
+         colors = "#{classification_variable}_sample"
117
+       else
118
+         plot_vars = predictor_variables.map { |var| "analysis_data$#{var}" }.join(',')
119
+         colors = "as.numeric(analysis_data$#{classification_variable})"
120
+       end
121
+       R.eval "plot(data.frame(#{plot_vars}), col=#{colors})"
122
+     end
123
+
124
+     # Scatter-plots the predictor variables, colored by predicted class code.
125
+     #
126
+     # @param samples [Array<Hash>, nil] rows to classify and plot; when nil, the training
127
+     #   data already loaded into R is plotted
128
+     def plot_predict(samples = nil)
129
+       if samples
130
+         plot_vars = predictor_variables.map { |var| "#{var}_sample" }.join(',')
131
+
132
+         # predict loads the *_sample vectors into R. Its results are hashes, which cannot
133
+         # be used as plot colors, so hand R the integer class codes instead.
134
+         predicted = [predict(*samples)].flatten
135
+         R.assign("predictions", predicted.map { |prediction| @class_hash[prediction[:class]] })
136
+       else
137
+         plot_vars = predictor_variables.map { |var| "analysis_data$#{var}" }.join(',')
138
+         # NOTE(review): this colors by the recorded training classes, not by model predictions -- confirm intent.
139
+         R.eval "predictions <- as.numeric(analysis_data$#{classification_variable})"
140
+       end
141
+       R.eval "plot(data.frame(#{plot_vars}), col=predictions)"
142
+     end
143
+
144
+     # Defines init_lda_analysis and init_qda_analysis: each loads the training data into R,
145
+     # fits the corresponding model into R's `fit`, and refreshes #accuracy.
146
+     %w[lda qda].each do |analysis_type|
147
+       define_method("init_#{analysis_type}_analysis") do
148
+         init_analysis
149
+         R.eval <<-EOF
150
+           analysis_data <- data.frame(#{@var_declarations})
151
+           fit <- #{analysis_type}(#{classification_variable} ~ #{@non_class_variables}, data=analysis_data)
152
+         EOF
153
+         compute_accuracy
154
+       end
155
+     end
156
+
157
+     private
158
+
159
+     # Names of all variables except the classification variable.
160
+     def predictor_variables
161
+       variables - [classification_variable]
162
+     end
163
+
164
+     # Copies each training column into an R vector named after its variable (class values
165
+     # are sent as their integer codes) and caches the strings used to build the R calls.
166
+     def init_analysis
167
+       variables.each do |variable|
168
+         if variable == @classification_variable
169
+           R.assign(variable.to_s, training_data.map { |point| @class_hash[point[variable]] })
170
+         else
171
+           R.assign(variable.to_s, training_data.map { |point| point[variable] })
172
+         end
173
+       end
174
+       @var_declarations = variables.map(&:to_s).join(',')
175
+       @non_class_variables = predictor_variables.map(&:to_s).join('+')
176
+     end
177
+
178
+     # Cross-tabulates predicted against recorded classes for the training data and stores
179
+     # summary rates in @accuracy. The two "false" rates read only rows 1 and 2 of the
180
+     # table, so they are meaningful for two-class problems.
181
+     def compute_accuracy
182
+       R.eval "ct <- table(predict(fit)$class, analysis_data$#{classification_variable})"
183
+       percent_correct = R.pull "sum(diag(prop.table(ct)))"
184
+       percent_false_positives = (R.pull "prop.table(ct)[1,2]") / (R.pull "prop.table(ct)[1,1] + prop.table(ct)[1,2]")
185
+       percent_false_negatives = (R.pull "prop.table(ct)[2,1]") / (R.pull "prop.table(ct)[2,1] + prop.table(ct)[2,2]")
186
+
187
+       correlation_coefficient = R.pull "sqrt(chisq.test(ct)$statistic/sum(ct))"
188
+
189
+       @accuracy = {
190
+         :percent_correct => percent_correct,
191
+         :percent_false_negatives => percent_false_negatives,
192
+         :percent_false_positives => percent_false_positives,
193
+         :correlation_coefficient => correlation_coefficient
194
+       }
195
+     end
196
+   end
197
+ end
198
+
199
+ # Mixed into the top level so DiscriminantAnalysis can be referenced without the Harlequin:: prefix.
200
+ include Harlequin
@@ -0,0 +1,69 @@
1
+ require 'spec_helper'
2
+ require 'csv' # CSV.read is used below; load the stdlib library explicitly
3
+
4
+ # Exercises DiscriminantAnalysis end to end through R, using the
5
+ # weight/height/gender rows in spec/lda_sample.csv as training data.
6
+ describe Harlequin::DiscriminantAnalysis do
7
+   before do
8
+     @discriminant_analysis = DiscriminantAnalysis.new([:weight, :height], :gender)
9
+
10
+     csv_data = CSV.read("spec/lda_sample.csv")
11
+     csv_data.shift # drop the first row (presumably a header -- the CSV file is not shown here)
12
+     @training = csv_data.map { |weight, height, gender| {:weight => weight.to_f, :height => height.to_f, :gender => gender.to_i} }
13
+
14
+     @discriminant_analysis.add_training_data(*@training)
15
+     @discriminant_analysis.init_lda_analysis
16
+
17
+     @male_sample = { :height => 73, :weight => 210 }
18
+     @female_sample = { :height => 60, :weight => 140 }
19
+
20
+     @male_prediction = @discriminant_analysis.predict(@male_sample)
21
+     @female_prediction = @discriminant_analysis.predict(@female_sample)
22
+   end
23
+
24
+   it 'computes the accuracy of a given training set' do
25
+     @discriminant_analysis.accuracy[:percent_correct].should be > 0.5
26
+   end
27
+
28
+   it 'predicts inclusion in a set' do
29
+     @male_prediction[:class].should eq(1)
30
+     @female_prediction[:class].should eq(2)
31
+   end
32
+
33
+   it 'provides confidence scores for a prediction' do
34
+     @male_prediction[:confidence].should be > 0.5
35
+     @female_prediction[:confidence].should be > 0.5
36
+   end
37
+
38
+   it 'predicts for arrays of sample points' do
39
+     samples = [@male_sample, @female_sample]
40
+     predictions = @discriminant_analysis.predict(*samples)
41
+
42
+     predictions.map { |row| row[:class] }.should eq [1,2]
43
+     predictions.map { |row| row[:confidence] }.each do |confidence|
44
+       confidence.should be > 0.5
45
+     end
46
+   end
47
+
48
+   it 'clears training data from a DiscriminantAnalysis instance' do
49
+     @discriminant_analysis.clear_training_data
50
+     @discriminant_analysis.training_data.should be_empty
51
+     @discriminant_analysis.class_hash.should be_empty # the class mapping is reset together with the rows
52
+   end
53
+
54
+   it 'accepts non-numeric classification values in training data' do
55
+     @discriminant_analysis.clear_training_data
56
+
57
+     @training.map! do |row|
58
+       gender_string = row[:gender] == 1 ? 'male' : 'female'
59
+       row.merge(:gender => gender_string)
60
+     end
61
+
62
+     @discriminant_analysis.add_training_data(*@training)
63
+     @discriminant_analysis.init_lda_analysis
64
+     @discriminant_analysis.accuracy[:percent_correct].should be_within(0.001).of 0.9485
65
+
66
+     @male_prediction = @discriminant_analysis.predict(@male_sample)
67
+     @male_prediction[:class].should eq 'male'
68
+   end
69
+ end