harlequin 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +9 -3
- data/harlequin.gemspec +1 -0
- data/lib/harlequin.rb +59 -23
- data/lib/harlequin/version.rb +1 -1
- data/spec/harlequin_spec.rb +8 -0
- data/spec/spec_helper.rb +1 -0
- metadata +18 -7
data/README.markdown
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# About #
|
1
2
|
Harlequin is a gem that allows easy access to the linear and quadratic discriminant analysis functions of R. To use harlequin, initialize a DiscriminantAnalysis object with an array of variable names for analysis, and a classification variable name as a second argument, like so:
|
2
3
|
|
3
4
|
```ruby
|
@@ -20,7 +21,7 @@ Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#ini
|
|
20
21
|
|
21
22
|
```ruby
|
22
23
|
analysis.init_lda_analysis
|
23
|
-
analysis.predict(:weight => 180, :height => 68)
|
24
|
+
analysis.predict(:weight => 180, :height => 68) # => {:class=>"male", :confidence=>0.9999999999666846}
|
24
25
|
```
|
25
26
|
|
26
27
|
Multiple predictions can be computed at once in the same way as adding multiple training rows.
|
@@ -28,5 +29,10 @@ Multiple predictions can be computed at once in the same way as adding multiple
|
|
28
29
|
In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
|
29
30
|
|
30
31
|
```ruby
|
31
|
-
analysis.t_test(:weight)
|
32
|
-
```
|
32
|
+
analysis.t_test(:weight) # => {:t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898}
|
33
|
+
```
|
34
|
+
|
35
|
+
# Requirements #
|
36
|
+
A Ruby script using Harlequin requires an R instance, so make sure you have a working copy of R installed on your system. The OS X binaries for R can be found [here](http://cran.r-project.org/bin/macosx/). See the RinRuby documentation for more details.
|
37
|
+
|
38
|
+
You will also need the additional R packages MASS, alr3, and RWeka (the last is loaded for k-nearest-neighbor analysis). These can be installed from the R command line by first choosing a mirror with ```chooseCRANmirror()``` and then installing with ```install.packages(c("MASS", "alr3", "RWeka"))```.
|
data/harlequin.gemspec
CHANGED
data/lib/harlequin.rb
CHANGED
@@ -4,6 +4,7 @@ module Harlequin
|
|
4
4
|
R.echo false
|
5
5
|
R.eval "library(MASS)"
|
6
6
|
R.eval "library(alr3)"
|
7
|
+
R.eval "library(RWeka)"
|
7
8
|
|
8
9
|
class DiscriminantAnalysis
|
9
10
|
attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
|
@@ -14,6 +15,7 @@ module Harlequin
|
|
14
15
|
@classification_variable = classification_variable
|
15
16
|
@training_data = []
|
16
17
|
@class_hash = {}
|
18
|
+
@current_analysis_type = nil
|
17
19
|
end
|
18
20
|
|
19
21
|
def clear_training_data
|
@@ -31,33 +33,54 @@ module Harlequin
|
|
31
33
|
end
|
32
34
|
end
|
33
35
|
|
34
|
-
# Returns the
|
36
|
+
# Returns the classification for an array of sample points.
|
35
37
|
def predict(*samples)
|
36
38
|
(variables - [classification_variable]).each do |var|
|
37
39
|
R.assign(var.to_s + "_sample", samples.map { |s| s[var] })
|
38
40
|
end
|
39
|
-
|
41
|
+
|
40
42
|
sample_var_declarations = (variables - [classification_variable]).map { |var| "#{var.to_s} = #{var.to_s}_sample" }.join(',')
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
43
|
+
|
44
|
+
if @current_analysis_type == :k_nearest_neighbor
|
45
|
+
R.eval <<-EOF
|
46
|
+
sample_points <- data.frame(#{sample_var_declarations})
|
47
|
+
|
48
|
+
predictions <- predict(fit, sample_points)
|
49
|
+
classes <- round(predictions)
|
50
|
+
d <- data.frame(classes, confidence=predictions)
|
51
|
+
EOF
|
52
|
+
|
53
|
+
prediction_matrix = R.pull "classes"
|
54
|
+
predictions = prediction_matrix.map do |row|
|
55
|
+
classification = row.to_i
|
56
|
+
{
|
57
|
+
:class => @class_hash.invert[classification]
|
58
|
+
}
|
59
|
+
end
|
60
|
+
|
61
|
+
predictions.count == 1 ? predictions.first : predictions
|
62
|
+
else
|
63
|
+
R.eval <<-EOF
|
64
|
+
sample_points <- data.frame(#{sample_var_declarations})
|
65
|
+
|
66
|
+
predictions <- predict(fit, sample_points)
|
67
|
+
classes <- as.numeric(predictions$class)
|
68
|
+
|
69
|
+
d <- data.frame(classes, confidence=predictions$posterior)
|
70
|
+
EOF
|
71
|
+
|
72
|
+
prediction_matrix = R.pull "as.matrix(d)"
|
73
|
+
predictions = prediction_matrix.to_a.map do |row|
|
74
|
+
classification = row.first.to_i
|
75
|
+
confidence = row[classification]
|
76
|
+
{
|
77
|
+
:class => @class_hash.invert[classification],
|
78
|
+
:confidence => confidence
|
79
|
+
}
|
80
|
+
end
|
81
|
+
|
82
|
+
predictions.count == 1 ? predictions.first : predictions
|
58
83
|
end
|
59
|
-
|
60
|
-
predictions.count == 1 ? predictions.first : predictions
|
61
84
|
end
|
62
85
|
|
63
86
|
# Performs a test of difference of means between classes
|
@@ -66,7 +89,7 @@ module Harlequin
|
|
66
89
|
R.eval "t_test <- t.test(#{variable.to_s} ~ #{classification_variable.to_s})"
|
67
90
|
|
68
91
|
t_statistic = R.pull "t_test$statistic"
|
69
|
-
degrees_of_freedom = R.pull "t_test$
|
92
|
+
degrees_of_freedom = R.pull "t_test$parameter"
|
70
93
|
p_value = R.pull "t_test$p.value"
|
71
94
|
|
72
95
|
{
|
@@ -110,9 +133,22 @@ module Harlequin
|
|
110
133
|
analysis_data <- data.frame(#{@var_declarations})
|
111
134
|
fit <- #{analysis_type}(#{classification_variable.to_s} ~ #{@non_class_variables}, data=analysis_data)
|
112
135
|
EOF
|
136
|
+
|
137
|
+
@current_analysis_type = analysis_type.to_sym
|
113
138
|
compute_accuracy
|
114
139
|
end
|
115
140
|
end
|
141
|
+
|
142
|
+
def init_knn_analysis
|
143
|
+
init_analysis
|
144
|
+
R.eval <<-EOF
|
145
|
+
analysis_data <- data.frame(#{@var_declarations})
|
146
|
+
fit <- IBk(#{classification_variable.to_s} ~ ., data=analysis_data)
|
147
|
+
EOF
|
148
|
+
|
149
|
+
@current_analysis_type = :k_nearest_neighbor
|
150
|
+
compute_accuracy
|
151
|
+
end
|
116
152
|
|
117
153
|
private
|
118
154
|
|
@@ -146,4 +182,4 @@ module Harlequin
|
|
146
182
|
end
|
147
183
|
end
|
148
184
|
|
149
|
-
include Harlequin
|
185
|
+
include Harlequin
|
data/lib/harlequin/version.rb
CHANGED
data/spec/harlequin_spec.rb
CHANGED
@@ -41,6 +41,14 @@ describe Harlequin::DiscriminantAnalysis do
|
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
|
+
it 'predicts for k-nearest neighbor classifiers' do
|
45
|
+
@discriminant_analysis.init_knn_analysis
|
46
|
+
samples = [@male_sample, @female_sample]
|
47
|
+
predictions = @discriminant_analysis.predict(*samples)
|
48
|
+
|
49
|
+
predictions.map { |row| row[:class] }.should eq [1,2]
|
50
|
+
end
|
51
|
+
|
44
52
|
it 'clears training data from a DiscriminantAnalysis instance' do
|
45
53
|
@discriminant_analysis.clear_training_data
|
46
54
|
@discriminant_analysis.training_data.should be_empty
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: harlequin
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2012-03-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &70229741113520 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,21 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70229741113520
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: pry
|
27
|
+
requirement: &70229741113100 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70229741113100
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rinruby
|
27
|
-
requirement: &
|
38
|
+
requirement: &70229741112680 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ! '>='
|
@@ -32,7 +43,7 @@ dependencies:
|
|
32
43
|
version: '0'
|
33
44
|
type: :runtime
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *70229741112680
|
36
47
|
description: harlequin is a Ruby wrapper for linear and quadratic discriminant analysis
|
37
48
|
in R for statistical classification. Also allows means testing to determine significance
|
38
49
|
of discriminant variables.
|
@@ -72,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
72
83
|
version: '0'
|
73
84
|
requirements: []
|
74
85
|
rubyforge_project: harlequin
|
75
|
-
rubygems_version: 1.8.
|
86
|
+
rubygems_version: 1.8.15
|
76
87
|
signing_key:
|
77
88
|
specification_version: 3
|
78
89
|
summary: Wrapper for discriminant analysis methods in R
|