RubyGems - harlequin - Versions diffs - 0.0.1 → 0.0.2 - Mend

harlequin 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

data/README.markdown CHANGED

@@ -1,3 +1,4 @@
+# About #
 Harlequin is a gem that allows easy access to the linear and quadratic discriminant analysis functions of R. To use harlequin, initialize a DiscriminantAnalysis object with an array of variable names for analysis, and a classification variable name as a second argument, like so:
 ```ruby
@@ -20,7 +21,7 @@ Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#ini
 ```ruby
 analysis.init_lda_analysis
-analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
+analysis.predict(:weight => 180, :height => 68) # => {:class=>"male", :confidence=>0.9999999999666846}
 ```
 Multiple predictions can be computed at once in the same way as adding multiple training rows.
@@ -28,5 +29,10 @@ Multiple predictions can be computed at once in the same way as adding multiple
 In order to assess the effectiveness of adding a variable, the DiscriminantAnalysis class includes access to the two-sample t-test for difference in means between classes. This currently works for binary classification only.
 ```ruby
-analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
-```
+analysis.t_test(:weight) # => {:t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898}
+```
+# Requirements #
+A Ruby script using Harlequin requires an R instance, so make sure you have a working copy of R installed on your system. The OSX binaries for R can be found [here](http://cran.r-project.org/bin/macosx/). See the documentation for Rinruby for more details.
+You will also need the additional R packages MASS and alr3. These can be installed from the R command line by first choosing a mirror with ```chooseCRANmirror()``` and then installing with ```install.packages(c("MASS", "alr3"))```.

data/harlequin.gemspec CHANGED

@@ -21,5 +21,6 @@ Gem::Specification.new do |s|
   # specify any dependencies here
   s.add_development_dependency "rspec"
+  s.add_development_dependency "pry"
   s.add_dependency             "rinruby"
 end

data/lib/harlequin.rb CHANGED

@@ -4,6 +4,7 @@ module Harlequin
   R.echo false
   R.eval "library(MASS)"
   R.eval "library(alr3)"
+  R.eval "library(RWeka)"
   class DiscriminantAnalysis
     attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
@@ -14,6 +15,7 @@ module Harlequin
       @classification_variable = classification_variable
       @training_data           = []
       @class_hash              = {}
+      @current_analysis_type   = nil
     end
     def clear_training_data
@@ -31,33 +33,54 @@ module Harlequin
       end
     end
-    # Returns the class determined by linear discriminant analysis for an array of sample points.
+    # Returns the classification for an array of sample points.
     def predict(*samples)
       (variables - [classification_variable]).each do |var|
         R.assign(var.to_s + "_sample", samples.map { |s| s[var] })
       end
       sample_var_declarations = (variables - [classification_variable]).map { |var| "#{var.to_s} = #{var.to_s}_sample" }.join(',')
-      R.eval "sample_points <- data.frame(#{sample_var_declarations})"
-      R.eval "predictions <- predict(fit, sample_points)"
-      R.eval "classes <- as.numeric(predictions$class)"
-      R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
-      prediction_matrix = R.pull "as.matrix(d)"
-      # This requires classes to be integers 1,2,3,...
-      # TODO: implement this without requiring specific values for sample hashes
-      predictions = prediction_matrix.to_a.map do |row|
-        classification = row.first.to_i
-        confidence = row[classification]
-        {
-          :class      => @class_hash.invert[classification],
-          :confidence => confidence
-        }
+      if @current_analysis_type == :k_nearest_neighbor
+        R.eval <<-EOF
+        sample_points <- data.frame(#{sample_var_declarations})
+        predictions <- predict(fit, sample_points)
+        classes <- round(predictions)
+        d <- data.frame(classes, confidence=predictions)
+        EOF
+        prediction_matrix = R.pull "classes"
+        predictions = prediction_matrix.map do |row|
+          classification = row.to_i
+          {
+            :class => @class_hash.invert[classification]
+          }
+        end
+        predictions.count == 1 ? predictions.first : predictions
+      else
+        R.eval <<-EOF
+        sample_points <- data.frame(#{sample_var_declarations})
+        predictions <- predict(fit, sample_points)
+        classes <- as.numeric(predictions$class)
+        d <- data.frame(classes, confidence=predictions$posterior)
+        EOF
+        prediction_matrix = R.pull "as.matrix(d)"
+        predictions = prediction_matrix.to_a.map do |row|
+          classification = row.first.to_i
+          confidence = row[classification]
+          {
+            :class      => @class_hash.invert[classification],
+            :confidence => confidence
+          }
+        end
+        predictions.count == 1 ? predictions.first : predictions
       end
-      predictions.count == 1 ? predictions.first : predictions
     end
     # Performs a test of difference of means between classes
@@ -66,7 +89,7 @@ module Harlequin
       R.eval "t_test <- t.test(#{variable.to_s} ~ #{classification_variable.to_s})"
       t_statistic        = R.pull "t_test$statistic"
-      degrees_of_freedom = R.pull "t_test$df"
+      degrees_of_freedom = R.pull "t_test$parameter"
       p_value            = R.pull "t_test$p.value"
       {
@@ -110,9 +133,22 @@ module Harlequin
           analysis_data <- data.frame(#{@var_declarations})
           fit <- #{analysis_type}(#{classification_variable.to_s} ~ #{@non_class_variables}, data=analysis_data)
         EOF
+        @current_analysis_type = analysis_type.to_sym
         compute_accuracy
       end
     end
+    def init_knn_analysis
+      init_analysis
+      R.eval <<-EOF
+        analysis_data <- data.frame(#{@var_declarations})
+        fit <- IBk(#{classification_variable.to_s} ~ ., data=analysis_data)
+      EOF
+      @current_analysis_type = :k_nearest_neighbor
+      compute_accuracy
+    end
     private
@@ -146,4 +182,4 @@ module Harlequin
   end
 end
-include Harlequin
+include Harlequin

data/lib/harlequin/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Harlequin
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end

data/spec/harlequin_spec.rb CHANGED

@@ -41,6 +41,14 @@ describe Harlequin::DiscriminantAnalysis do
     end
   end
+  it 'predicts for k-nearest neighbor classifiers' do
+    @discriminant_analysis.init_knn_analysis
+    samples = [@male_sample, @female_sample]
+    predictions = @discriminant_analysis.predict(*samples)
+    predictions.map { |row| row[:class] }.should eq [1,2]
+  end
   it 'clears training data from a DiscriminantAnalysis instance' do
     @discriminant_analysis.clear_training_data
     @discriminant_analysis.training_data.should be_empty

data/spec/spec_helper.rb CHANGED

@@ -1,2 +1,3 @@
 require 'harlequin'
 require 'csv'
+require 'pry'

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: harlequin
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-11-30 00:00:00.000000000Z
+date: 2012-03-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &70351541259960 !ruby/object:Gem::Requirement
+  requirement: &70229741113520 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,21 @@ dependencies:
         version: '0'
   type: :development
   prerelease: false
-  version_requirements: *70351541259960
+  version_requirements: *70229741113520
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: &70229741113100 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *70229741113100
 - !ruby/object:Gem::Dependency
   name: rinruby
-  requirement: &70351541258940 !ruby/object:Gem::Requirement
+  requirement: &70229741112680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,7 +43,7 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70351541258940
+  version_requirements: *70229741112680
 description: harlequin is a Ruby wrapper for linear and quadratic discriminant analysis
   in R for statistical classification. Also allows means testing to determine significance
   of discriminant variables.
@@ -72,7 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project: harlequin
-rubygems_version: 1.8.10
+rubygems_version: 1.8.15
 signing_key:
 specification_version: 3
 summary: Wrapper for discriminant analysis methods in R