harlequin 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ # About #
1
2
  Harlequin is a gem that allows easy access to the linear and quadratic discriminant analysis functions of R. To use harlequin, initialize a DiscriminantAnalysis object with an array of variable names for analysis, and a classification variable name as a second argument, like so:
2
3
 
3
4
  ```ruby
@@ -20,7 +21,7 @@ Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#ini
20
21
 
21
22
  ```ruby
22
23
  analysis.init_lda_analysis
23
- analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
24
+ analysis.predict(:weight => 180, :height => 68) # => {:class=>"male", :confidence=>0.9999999999666846}
24
25
  ```
25
26
 
26
27
  Multiple predictions can be computed at once in the same way as adding multiple training rows.
@@ -28,5 +29,10 @@ Multiple predictions can be computed at once in the same way as adding multiple
28
29
  In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
29
30
 
30
31
  ```ruby
31
- analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
32
- ```
32
+ analysis.t_test(:weight) # => {:t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898}
33
+ ```
34
+
35
+ # Requirements #
36
+ A Ruby script using Harlequin requires an R instance, so make sure you have a working copy of R installed on your system. The OSX binaries for R can be found [here](http://cran.r-project.org/bin/macosx/). See the documentation for Rinruby for more details.
37
+
38
+ You will also need the additional R packages MASS and alr3. These can be installed from the R command line by first choosing a mirror with ```chooseCRANmirror()``` and then installing with ```install.packages(c("MASS", "alr3"))```.
@@ -21,5 +21,6 @@ Gem::Specification.new do |s|
21
21
  # specify any dependencies here
22
22
 
23
23
  s.add_development_dependency "rspec"
24
+ s.add_development_dependency "pry"
24
25
  s.add_dependency "rinruby"
25
26
  end
@@ -4,6 +4,7 @@ module Harlequin
4
4
  R.echo false
5
5
  R.eval "library(MASS)"
6
6
  R.eval "library(alr3)"
7
+ R.eval "library(RWeka)"
7
8
 
8
9
  class DiscriminantAnalysis
9
10
  attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
@@ -14,6 +15,7 @@ module Harlequin
14
15
  @classification_variable = classification_variable
15
16
  @training_data = []
16
17
  @class_hash = {}
18
+ @current_analysis_type = nil
17
19
  end
18
20
 
19
21
  def clear_training_data
@@ -31,33 +33,54 @@ module Harlequin
31
33
  end
32
34
  end
33
35
 
34
- # Returns the class determined by linear discriminant analysis for an array of sample points.
36
+ # Returns the classification for an array of sample points.
35
37
  def predict(*samples)
36
38
  (variables - [classification_variable]).each do |var|
37
39
  R.assign(var.to_s + "_sample", samples.map { |s| s[var] })
38
40
  end
39
-
41
+
40
42
  sample_var_declarations = (variables - [classification_variable]).map { |var| "#{var.to_s} = #{var.to_s}_sample" }.join(',')
41
- R.eval "sample_points <- data.frame(#{sample_var_declarations})"
42
-
43
- R.eval "predictions <- predict(fit, sample_points)"
44
- R.eval "classes <- as.numeric(predictions$class)"
45
-
46
- R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
47
- prediction_matrix = R.pull "as.matrix(d)"
48
-
49
- # This requires classes to be integers 1,2,3,...
50
- # TODO: implement this without requiring specific values for sample hashes
51
- predictions = prediction_matrix.to_a.map do |row|
52
- classification = row.first.to_i
53
- confidence = row[classification]
54
- {
55
- :class => @class_hash.invert[classification],
56
- :confidence => confidence
57
- }
43
+
44
+ if @current_analysis_type == :k_nearest_neighbor
45
+ R.eval <<-EOF
46
+ sample_points <- data.frame(#{sample_var_declarations})
47
+
48
+ predictions <- predict(fit, sample_points)
49
+ classes <- round(predictions)
50
+ d <- data.frame(classes, confidence=predictions)
51
+ EOF
52
+
53
+ prediction_matrix = R.pull "classes"
54
+ predictions = prediction_matrix.map do |row|
55
+ classification = row.to_i
56
+ {
57
+ :class => @class_hash.invert[classification]
58
+ }
59
+ end
60
+
61
+ predictions.count == 1 ? predictions.first : predictions
62
+ else
63
+ R.eval <<-EOF
64
+ sample_points <- data.frame(#{sample_var_declarations})
65
+
66
+ predictions <- predict(fit, sample_points)
67
+ classes <- as.numeric(predictions$class)
68
+
69
+ d <- data.frame(classes, confidence=predictions$posterior)
70
+ EOF
71
+
72
+ prediction_matrix = R.pull "as.matrix(d)"
73
+ predictions = prediction_matrix.to_a.map do |row|
74
+ classification = row.first.to_i
75
+ confidence = row[classification]
76
+ {
77
+ :class => @class_hash.invert[classification],
78
+ :confidence => confidence
79
+ }
80
+ end
81
+
82
+ predictions.count == 1 ? predictions.first : predictions
58
83
  end
59
-
60
- predictions.count == 1 ? predictions.first : predictions
61
84
  end
62
85
 
63
86
  # Performs a test of difference of means between classes
@@ -66,7 +89,7 @@ module Harlequin
66
89
  R.eval "t_test <- t.test(#{variable.to_s} ~ #{classification_variable.to_s})"
67
90
 
68
91
  t_statistic = R.pull "t_test$statistic"
69
- degrees_of_freedom = R.pull "t_test$df"
92
+ degrees_of_freedom = R.pull "t_test$parameter"
70
93
  p_value = R.pull "t_test$p.value"
71
94
 
72
95
  {
@@ -110,9 +133,22 @@ module Harlequin
110
133
  analysis_data <- data.frame(#{@var_declarations})
111
134
  fit <- #{analysis_type}(#{classification_variable.to_s} ~ #{@non_class_variables}, data=analysis_data)
112
135
  EOF
136
+
137
+ @current_analysis_type = analysis_type.to_sym
113
138
  compute_accuracy
114
139
  end
115
140
  end
141
+
142
+ def init_knn_analysis
143
+ init_analysis
144
+ R.eval <<-EOF
145
+ analysis_data <- data.frame(#{@var_declarations})
146
+ fit <- IBk(#{classification_variable.to_s} ~ ., data=analysis_data)
147
+ EOF
148
+
149
+ @current_analysis_type = :k_nearest_neighbor
150
+ compute_accuracy
151
+ end
116
152
 
117
153
  private
118
154
 
@@ -146,4 +182,4 @@ module Harlequin
146
182
  end
147
183
  end
148
184
 
149
- include Harlequin
185
+ include Harlequin
@@ -1,3 +1,3 @@
1
1
  module Harlequin
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
@@ -41,6 +41,14 @@ describe Harlequin::DiscriminantAnalysis do
41
41
  end
42
42
  end
43
43
 
44
+ it 'predicts for k-nearest neighbor classifiers' do
45
+ @discriminant_analysis.init_knn_analysis
46
+ samples = [@male_sample, @female_sample]
47
+ predictions = @discriminant_analysis.predict(*samples)
48
+
49
+ predictions.map { |row| row[:class] }.should eq [1,2]
50
+ end
51
+
44
52
  it 'clears training data from a DiscriminantAnalysis instance' do
45
53
  @discriminant_analysis.clear_training_data
46
54
  @discriminant_analysis.training_data.should be_empty
@@ -1,2 +1,3 @@
1
1
  require 'harlequin'
2
2
  require 'csv'
3
+ require 'pry'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: harlequin
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-30 00:00:00.000000000Z
12
+ date: 2012-03-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
16
- requirement: &70351541259960 !ruby/object:Gem::Requirement
16
+ requirement: &70229741113520 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,21 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70351541259960
24
+ version_requirements: *70229741113520
25
+ - !ruby/object:Gem::Dependency
26
+ name: pry
27
+ requirement: &70229741113100 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70229741113100
25
36
  - !ruby/object:Gem::Dependency
26
37
  name: rinruby
27
- requirement: &70351541258940 !ruby/object:Gem::Requirement
38
+ requirement: &70229741112680 !ruby/object:Gem::Requirement
28
39
  none: false
29
40
  requirements:
30
41
  - - ! '>='
@@ -32,7 +43,7 @@ dependencies:
32
43
  version: '0'
33
44
  type: :runtime
34
45
  prerelease: false
35
- version_requirements: *70351541258940
46
+ version_requirements: *70229741112680
36
47
  description: harlequin is a Ruby wrapper for linear and quadratic discriminant analysis
37
48
  in R for statistical classification. Also allows means testing to determine significance
38
49
  of discriminant variables.
@@ -72,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
72
83
  version: '0'
73
84
  requirements: []
74
85
  rubyforge_project: harlequin
75
- rubygems_version: 1.8.10
86
+ rubygems_version: 1.8.15
76
87
  signing_key:
77
88
  specification_version: 3
78
89
  summary: Wrapper for discriminant analysis methods in R