harlequin 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,4 @@
1
+ # About #
1
2
  Harlequin is a gem that allows easy access to the linear and quadratic discriminant analysis functions of R. To use harlequin, initialize a DiscriminantAnalysis object with an array of variable names for analysis, and a classification variable name as a second argument, like so:
2
3
 
3
4
  ```ruby
@@ -20,7 +21,7 @@ Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#ini
20
21
 
21
22
  ```ruby
22
23
  analysis.init_lda_analysis
23
- analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
24
+ analysis.predict(:weight => 180, :height => 68) # => {:class=>"male", :confidence=>0.9999999999666846}
24
25
  ```
25
26
 
26
27
  Multiple predictions can be computed at once in the same way as adding multiple training rows.
@@ -28,5 +29,10 @@ Multiple predictions can be computed at once in the same way as adding multiple
28
29
  In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
29
30
 
30
31
  ```ruby
31
- analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
32
- ```
32
+ analysis.t_test(:weight) # => {:t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898}
33
+ ```
34
+
35
+ # Requirements #
36
+ A Ruby script using Harlequin requires an R instance, so make sure you have a working copy of R installed on your system. The OS X binaries for R can be found [here](http://cran.r-project.org/bin/macosx/). See the documentation for RinRuby for more details.
37
+
38
+ You will also need the additional R packages MASS and alr3. These can be installed from the R command line by first choosing a mirror with ```chooseCRANmirror()``` and then installing with ```install.packages(c("MASS", "alr3"))```.
@@ -21,5 +21,6 @@ Gem::Specification.new do |s|
21
21
  # specify any dependencies here
22
22
 
23
23
  s.add_development_dependency "rspec"
24
+ s.add_development_dependency "pry"
24
25
  s.add_dependency "rinruby"
25
26
  end
@@ -4,6 +4,7 @@ module Harlequin
4
4
  R.echo false
5
5
  R.eval "library(MASS)"
6
6
  R.eval "library(alr3)"
7
+ R.eval "library(RWeka)"
7
8
 
8
9
  class DiscriminantAnalysis
9
10
  attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
@@ -14,6 +15,7 @@ module Harlequin
14
15
  @classification_variable = classification_variable
15
16
  @training_data = []
16
17
  @class_hash = {}
18
+ @current_analysis_type = nil
17
19
  end
18
20
 
19
21
  def clear_training_data
@@ -31,33 +33,54 @@ module Harlequin
31
33
  end
32
34
  end
33
35
 
34
- # Returns the class determined by linear discriminant analysis for an array of sample points.
36
+ # Returns the classification for an array of sample points.
35
37
  def predict(*samples)
36
38
  (variables - [classification_variable]).each do |var|
37
39
  R.assign(var.to_s + "_sample", samples.map { |s| s[var] })
38
40
  end
39
-
41
+
40
42
  sample_var_declarations = (variables - [classification_variable]).map { |var| "#{var.to_s} = #{var.to_s}_sample" }.join(',')
41
- R.eval "sample_points <- data.frame(#{sample_var_declarations})"
42
-
43
- R.eval "predictions <- predict(fit, sample_points)"
44
- R.eval "classes <- as.numeric(predictions$class)"
45
-
46
- R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
47
- prediction_matrix = R.pull "as.matrix(d)"
48
-
49
- # This requires classes to be integers 1,2,3,...
50
- # TODO: implement this without requiring specific values for sample hashes
51
- predictions = prediction_matrix.to_a.map do |row|
52
- classification = row.first.to_i
53
- confidence = row[classification]
54
- {
55
- :class => @class_hash.invert[classification],
56
- :confidence => confidence
57
- }
43
+
44
+ if @current_analysis_type == :k_nearest_neighbor
45
+ R.eval <<-EOF
46
+ sample_points <- data.frame(#{sample_var_declarations})
47
+
48
+ predictions <- predict(fit, sample_points)
49
+ classes <- round(predictions)
50
+ d <- data.frame(classes, confidence=predictions)
51
+ EOF
52
+
53
+ prediction_matrix = R.pull "classes"
54
+ predictions = prediction_matrix.map do |row|
55
+ classification = row.to_i
56
+ {
57
+ :class => @class_hash.invert[classification]
58
+ }
59
+ end
60
+
61
+ predictions.count == 1 ? predictions.first : predictions
62
+ else
63
+ R.eval <<-EOF
64
+ sample_points <- data.frame(#{sample_var_declarations})
65
+
66
+ predictions <- predict(fit, sample_points)
67
+ classes <- as.numeric(predictions$class)
68
+
69
+ d <- data.frame(classes, confidence=predictions$posterior)
70
+ EOF
71
+
72
+ prediction_matrix = R.pull "as.matrix(d)"
73
+ predictions = prediction_matrix.to_a.map do |row|
74
+ classification = row.first.to_i
75
+ confidence = row[classification]
76
+ {
77
+ :class => @class_hash.invert[classification],
78
+ :confidence => confidence
79
+ }
80
+ end
81
+
82
+ predictions.count == 1 ? predictions.first : predictions
58
83
  end
59
-
60
- predictions.count == 1 ? predictions.first : predictions
61
84
  end
62
85
 
63
86
  # Performs a test of difference of means between classes
@@ -66,7 +89,7 @@ module Harlequin
66
89
  R.eval "t_test <- t.test(#{variable.to_s} ~ #{classification_variable.to_s})"
67
90
 
68
91
  t_statistic = R.pull "t_test$statistic"
69
- degrees_of_freedom = R.pull "t_test$df"
92
+ degrees_of_freedom = R.pull "t_test$parameter"
70
93
  p_value = R.pull "t_test$p.value"
71
94
 
72
95
  {
@@ -110,9 +133,22 @@ module Harlequin
110
133
  analysis_data <- data.frame(#{@var_declarations})
111
134
  fit <- #{analysis_type}(#{classification_variable.to_s} ~ #{@non_class_variables}, data=analysis_data)
112
135
  EOF
136
+
137
+ @current_analysis_type = analysis_type.to_sym
113
138
  compute_accuracy
114
139
  end
115
140
  end
141
+
142
+ def init_knn_analysis
143
+ init_analysis
144
+ R.eval <<-EOF
145
+ analysis_data <- data.frame(#{@var_declarations})
146
+ fit <- IBk(#{classification_variable.to_s} ~ ., data=analysis_data)
147
+ EOF
148
+
149
+ @current_analysis_type = :k_nearest_neighbor
150
+ compute_accuracy
151
+ end
116
152
 
117
153
  private
118
154
 
@@ -146,4 +182,4 @@ module Harlequin
146
182
  end
147
183
  end
148
184
 
149
- include Harlequin
185
+ include Harlequin
@@ -1,3 +1,3 @@
1
1
  module Harlequin
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -41,6 +41,14 @@ describe Harlequin::DiscriminantAnalysis do
41
41
  end
42
42
  end
43
43
 
44
+ it 'predicts for k-nearest neighbor classifiers' do
45
+ @discriminant_analysis.init_knn_analysis
46
+ samples = [@male_sample, @female_sample]
47
+ predictions = @discriminant_analysis.predict(*samples)
48
+
49
+ predictions.map { |row| row[:class] }.should eq [1,2]
50
+ end
51
+
44
52
  it 'clears training data from a DiscriminantAnalysis instance' do
45
53
  @discriminant_analysis.clear_training_data
46
54
  @discriminant_analysis.training_data.should be_empty
@@ -1,2 +1,3 @@
1
1
  require 'harlequin'
2
2
  require 'csv'
3
+ require 'pry'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: harlequin
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-30 00:00:00.000000000Z
12
+ date: 2012-03-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70351541259960 !ruby/object:Gem::Requirement
16
+ requirement: &70229741113520 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,21 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70351541259960
24
+ version_requirements: *70229741113520
25
+ - !ruby/object:Gem::Dependency
26
+ name: pry
27
+ requirement: &70229741113100 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70229741113100
25
36
  - !ruby/object:Gem::Dependency
26
37
  name: rinruby
27
- requirement: &70351541258940 !ruby/object:Gem::Requirement
38
+ requirement: &70229741112680 !ruby/object:Gem::Requirement
28
39
  none: false
29
40
  requirements:
30
41
  - - ! '>='
@@ -32,7 +43,7 @@ dependencies:
32
43
  version: '0'
33
44
  type: :runtime
34
45
  prerelease: false
35
- version_requirements: *70351541258940
46
+ version_requirements: *70229741112680
36
47
  description: harlequin is a Ruby wrapper for linear and quadratic discriminant analysis
37
48
  in R for statistical classification. Also allows means testing to determine significance
38
49
  of discriminant variables.
@@ -72,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
72
83
  version: '0'
73
84
  requirements: []
74
85
  rubyforge_project: harlequin
75
- rubygems_version: 1.8.10
86
+ rubygems_version: 1.8.15
76
87
  signing_key:
77
88
  specification_version: 3
78
89
  summary: Wrapper for discriminant analysis methods in R