RubyGems - harlequin - Versions diffs - 0.0.1 - Mend

harlequin 0.0.1

Files changed (11) hide show

data/.gitignore ADDED Viewed

@@ -0,0 +1,4 @@
+*.gem
+.bundle
+Gemfile.lock
+pkg/*

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+# Fetch gems over TLS so downloads cannot be tampered with in transit.
+source "https://rubygems.org"
+# Specify your gem's dependencies in harlequin.gemspec
+gemspec

data/README.markdown ADDED Viewed

@@ -0,0 +1,32 @@
+Harlequin is a gem that allows easy access to the linear and quadratic discriminant analysis functions of R. To use harlequin, initialize a DiscriminantAnalysis object with an array of variable names for analysis, and a classification variable name as a second argument, like so:
+```ruby
+analysis = DiscriminantAnalysis.new([:weight, :height], :gender)
+```
+Training rows should be formatted as hashes with pairs of the form ```variable_name => value```. For example, we can add some rows to the analysis above with
+```ruby
+analysis.add_training_data(
+                           { :weight => 200, :height => 72, :gender => 'male' },
+                           { :weight => 205, :height => 71, :gender => 'male' },
+                           { :weight => 140, :height => 63, :gender => 'female'},
+                           { :weight => 130, :height => 61, :gender => 'female'}
+                          )
+```
+(Note that the training data must contain at least two rows for each classification value, and that no variable may be constant within a class.)
+Initialize linear or quadratic analysis with ```#init_lda_analysis``` or ```#init_qda_analysis```, respectively. Then we can predict the class of new rows, also given as hashes:
+```ruby
+analysis.init_lda_analysis
+analysis.predict(:weight => 180, :height => 68) #=> {:class=>"male", :confidence=>0.9999999999666846}
+```
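+Multiple predictions can be computed at once by passing several sample hashes to ```#predict```, in the same way as adding multiple training rows; the result is then an array of prediction hashes.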
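+In order to assess the usefulness of adding a variable, the DiscriminantAnalysis class provides access to the two-sample t-test for a difference in means between classes. This currently works for binary classification only, and the analysis must be initialized first (see above), because the test runs on the training data that initialization sends to R.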
+```ruby
+analysis.t_test(:weight) #=> { :t_statistic=>12.0748, :degrees_of_freedom=>1.471, :p_value=>0.01898 }
+```

data/Rakefile ADDED Viewed

@@ -0,0 +1 @@
+require "bundler/gem_tasks"

data/harlequin.gemspec ADDED Viewed

@@ -0,0 +1,25 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "harlequin/version"
+Gem::Specification.new do |s| # gem metadata; the version comes from lib/harlequin/version.rb
+  s.name        = "harlequin"
+  s.version     = Harlequin::VERSION
+  s.authors     = ["Brian Stanwyck"]
+  s.email       = ["brian@highgroove.com"]
+  s.homepage    = "" # NOTE(review): homepage is blank — fill in the project URL
+  s.summary     = %q{Wrapper for discriminant analysis methods in R}
+  s.description = %q{harlequin is a Ruby wrapper for linear and quadratic discriminant analysis in R for statistical classification. Also allows means testing to determine significance of discriminant variables.}
+  s.rubyforge_project = "harlequin"
+  s.files         = `git ls-files`.split("\n") # shells out to git, so building needs a git checkout
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  # specify any dependencies here
+  s.add_development_dependency "rspec" # only needed to run the specs
+  s.add_dependency             "rinruby" # runtime bridge to R used by lib/harlequin.rb
+end

data/lib/harlequin/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Harlequin
+  VERSION = "0.0.1" # gem version string; harlequin.gemspec reads it as s.version
+end

data/lib/harlequin.rb ADDED Viewed

@@ -0,0 +1,149 @@
+module Harlequin
+  require 'rinruby'
+  R.echo false
+  R.eval "library(MASS)"
+  R.eval "library(alr3)"
+  class DiscriminantAnalysis
+    attr_reader :training_data, :variables, :classification_variable, :accuracy, :class_hash
+    def initialize(variables, classification_variable)
+      @accuracy                = nil
+      @variables               = variables << classification_variable
+      @classification_variable = classification_variable
+      @training_data           = []
+      @class_hash              = {}
+    end
+    def clear_training_data
+      @training_data = []
+      @class_hash    = {}
+    end
+    def add_training_data(*new_data)
+      @training_data += new_data
+      @training_data.map { |row| row[@classification_variable] }.each do |class_value|
+        unless @class_hash.keys.include? class_value
+          @class_hash.merge!({ class_value => (@class_hash.values.max ? @class_hash.values.max+1 : 1) })
+        end
+      end
+    end
+    # Returns the class determined by linear discriminant analysis for an array of sample points.
+    def predict(*samples)
+      (variables - [classification_variable]).each do |var|
+        R.assign(var.to_s + "_sample", samples.map { |s| s[var] })
+      end
+      sample_var_declarations = (variables - [classification_variable]).map { |var| "#{var.to_s} = #{var.to_s}_sample" }.join(',')
+      R.eval "sample_points <- data.frame(#{sample_var_declarations})"
+      R.eval "predictions <- predict(fit, sample_points)"
+      R.eval "classes <- as.numeric(predictions$class)"
+      R.eval "d <- data.frame(classes, confidence=predictions$posterior)"
+      prediction_matrix = R.pull "as.matrix(d)"
+      # This requires classes to be integers 1,2,3,...
+      # TODO: implement this without requiring specific values for sample hashes
+      predictions = prediction_matrix.to_a.map do |row|
+        classification = row.first.to_i
+        confidence = row[classification]
+        {
+          :class      => @class_hash.invert[classification],
+          :confidence => confidence
+        }
+      end
+      predictions.count == 1 ? predictions.first : predictions
+    end
+    # Performs a test of difference of means between classes
+    # Since the t-test is two-sample, classification_variable must only have two states
+    def t_test(variable)
+      R.eval "t_test <- t.test(#{variable.to_s} ~ #{classification_variable.to_s})"
+      t_statistic        = R.pull "t_test$statistic"
+      degrees_of_freedom = R.pull "t_test$df"
+      p_value            = R.pull "t_test$p.value"
+      {
+        :t_statistic        => t_statistic,
+        :degrees_of_freedom => degrees_of_freedom,
+        :p_value            => p_value
+      }
+    end
+    def plot(samples = nil)
+      if samples
+        variables.each do |var|
+          R.assign("#{var}_sample", samples.map { |s| s[var] })
+        end
+        plot_vars = (variables - [classification_variable]).map { |var| "#{var}_sample" }.join(',')
+      else
+        plot_vars = (variables - [classification_variable]).map { |var| "analysis_data$#{var}" }.join(',')
+      end
+      R.eval "plot(data.frame(#{plot_vars}), col=as.numeric(#{classification_variable.to_s}))"
+    end
+    def plot_predict(samples = nil)
+      if samples
+        variables.each do |var|
+          R.assign("#{var}_sample", samples.map { |s| s[var] })
+        end
+        plot_vars = (variables - [classification_variable]).map { |var| "#{var}_sample" }.join(',')
+        R.predictions = samples.map { |sample| predict(sample) }
+      else
+        plot_vars = (variables - [classification_variable]).map { |var| "analysis_data$#{var}" }.join(',')
+        R.eval "predictions <- as.numeric(analysis_data$#{classification_variable.to_s})"
+      end
+      R.eval "plot(data.frame(#{plot_vars}), col=predictions)"
+    end
+    ['lda', 'qda'].each do |analysis_type|
+      define_method("init_#{analysis_type}_analysis") do
+        init_analysis
+        R.eval <<-EOF
+          analysis_data <- data.frame(#{@var_declarations})
+          fit <- #{analysis_type}(#{classification_variable.to_s} ~ #{@non_class_variables}, data=analysis_data)
+        EOF
+        compute_accuracy
+      end
+    end
+    private
+    def init_analysis
+      variables.each do |variable|
+        if variable == @classification_variable
+          R.assign(variable.to_s, training_data.map { |point| @class_hash[point[variable]] })
+        else
+          R.assign(variable.to_s, training_data.map { |point| point[variable] })
+        end
+      end
+      @var_declarations = variables.map(&:to_s).join(',')
+      @non_class_variables = (variables - [classification_variable]).map { |variable| variable.to_s }.join('+')
+    end
+    def compute_accuracy
+      R.eval "ct <- table(predict(fit)$class, analysis_data$#{classification_variable.to_s})"
+      percent_correct = R.pull "sum(diag(prop.table(ct)))"
+      percent_false_positives = (R.pull "prop.table(ct)[1,2]") / (R.pull "prop.table(ct)[1,1] + prop.table(ct)[1,2]")
+      percent_false_negatives = (R.pull "prop.table(ct)[2,1]") / (R.pull "prop.table(ct)[2,1] + prop.table(ct)[2,2]")
+      correlation_coefficient = R.pull "sqrt(chisq.test(ct)$statistic/sum(ct))"
+      @accuracy = {
+        :percent_correct         => percent_correct,
+        :percent_false_negatives => percent_false_negatives,
+        :percent_false_positives => percent_false_positives,
+        :correlation_coefficient => correlation_coefficient
+      }
+    end
+  end
+end
+include Harlequin

data/spec/harlequin_spec.rb ADDED Viewed

@@ -0,0 +1,64 @@
+require 'spec_helper'
+describe Harlequin::DiscriminantAnalysis do # end-to-end specs; they need rinruby and a working R
+  before do
+    @discriminant_analysis = DiscriminantAnalysis.new([:weight, :height], :gender)
+    csv_data = CSV.read("spec/lda_sample.csv") # relative path, so run the specs from the gem root
+    csv_data.shift # drops the first row (presumably a header — confirm against the CSV)
+    @training = csv_data.map { |weight, height, gender| {:weight => weight.to_f, :height => height.to_f, :gender => gender.to_i} }
+    @discriminant_analysis.add_training_data(*@training)
+    @discriminant_analysis.init_lda_analysis
+    @male_sample   = { :height => 73, :weight => 210 }
+    @female_sample = { :height => 60, :weight => 140 }
+    @male_prediction   = @discriminant_analysis.predict(@male_sample)
+    @female_prediction = @discriminant_analysis.predict(@female_sample)
+  end
+  it 'computes the accuracy of a given training set' do
+    @discriminant_analysis.accuracy[:percent_correct].should be > 0.5
+  end
+  it 'predicts inclusion in a set' do
+    @male_prediction[:class].should eq(1) # predictions return the original training values, here the integer codes
+    @female_prediction[:class].should eq(2)
+  end
+  it 'provides confidence scores for a prediction' do
+    @male_prediction[:confidence].should be > 0.5
+    @female_prediction[:confidence].should be > 0.5
+  end
+  it 'predicts for arrays of sample points' do
+    samples = [@male_sample, @female_sample]
+    predictions = @discriminant_analysis.predict(*samples) # several samples yield an array of result hashes
+    predictions.map { |row| row[:class] }.should eq [1,2]
+    predictions.map { |row| row[:confidence] }.each do |confidence|
+      confidence.should be > 0.5
+    end
+  end
+  it 'clears training data from a DiscriminantAnalysis instance' do
+    @discriminant_analysis.clear_training_data
+    @discriminant_analysis.training_data.should be_empty
+  end
+  it 'accepts non-numeric classification values in training data' do
+    @discriminant_analysis.clear_training_data
+    @training.map! do |row| # swap the integer class codes for string labels
+      gender_string = row[:gender] == 1 ? 'male' : 'female'
+      row.merge(:gender => gender_string)
+    end
+    @discriminant_analysis.add_training_data(*@training)
+    @discriminant_analysis.init_lda_analysis # refit on the relabelled rows
+    @discriminant_analysis.accuracy[:percent_correct].should be_within(0.001).of 0.9485 # NOTE(review): value tied to the sample CSV — confirm if the data changes
+    @male_prediction = @discriminant_analysis.predict(@male_sample)
+    @male_prediction[:class].should eq 'male'
+  end
+end