RubyGems - statsample-glm - Versions diffs - 0.0.1 → 0.1.0 - Mend

statsample-glm 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

checksums.yaml +7 -0
data/.gitignore +51 -0
data/.rspec +1 -0
data/.travis.yml +2 -9
data/Gemfile +2 -20
data/LICENSE.txt +1 -1
data/README.rdoc +14 -11
data/Rakefile +16 -24
data/lib/statsample-glm.rb +1 -11
data/lib/statsample-glm/{regression.rb → glm.rb} +13 -46
data/lib/statsample-glm/glm/base.rb +99 -0
data/lib/statsample-glm/glm/irls/base.rb +54 -0
data/lib/statsample-glm/glm/irls/logistic.rb +46 -0
data/lib/statsample-glm/glm/irls/poisson.rb +48 -0
data/lib/statsample-glm/glm/logistic.rb +16 -0
data/lib/statsample-glm/glm/mle/base.rb +157 -0
data/lib/statsample-glm/glm/mle/logistic.rb +113 -0
data/lib/statsample-glm/glm/mle/normal.rb +94 -0
data/lib/statsample-glm/glm/mle/probit.rb +100 -0
data/lib/statsample-glm/glm/normal.rb +17 -0
data/lib/statsample-glm/glm/poisson.rb +17 -0
data/lib/statsample-glm/glm/probit.rb +16 -0
data/lib/statsample-glm/version.rb +5 -0
data/spec/data/logistic.csv +51 -0
data/spec/data/logistic_mle.csv +201 -0
data/spec/data/normal.csv +30 -0
data/spec/logistic_spec.rb +37 -0
data/spec/normal_spec.rb +15 -0
data/spec/poisson_spec.rb +32 -0
data/spec/probit_spec.rb +19 -0
data/spec/spec_helper.rb +50 -0
data/statsample-glm.gemspec +35 -0
metadata +71 -145
data/VERSION +0 -1
data/features/bio-statsample-glm.feature +0 -9
data/features/step_definitions/bio-statsample-glm_steps.rb +0 -0
data/features/support/env.rb +0 -15
data/lib/statsample-glm/regression/logistic.rb +0 -108
data/lib/statsample-glm/regression/poisson.rb +0 -90
data/test/helper.rb +0 -87
data/test/test_glm.rb +0 -4
data/test/test_glm_logistic.rb +0 -23
data/test/test_glm_poisson.rb +0 -25

data/VERSION DELETED

@@ -1 +0,0 @@
-0.0.1

data/features/bio-statsample-glm.feature DELETED

@@ -1,9 +0,0 @@
-Feature: something something
-  In order to something something
-  A user something something
-  something something something
-  Scenario: something something
-    Given inspiration
-    When I create a sweet new gem
-    Then everyone should see how awesome I am

data/features/step_definitions/bio-statsample-glm_steps.rb DELETED

File without changes

data/features/support/env.rb DELETED

@@ -1,15 +0,0 @@
-require 'bundler'
-begin
-  Bundler.setup(:default, :development)
-rescue Bundler::BundlerError => e
-  $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
-end
-$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
-require 'statsample-glm'
-require 'test/unit/assertions'
-World(Test::Unit::Assertions)

data/lib/statsample-glm/regression/logistic.rb DELETED

@@ -1,108 +0,0 @@
-module Statsample
-  module Regression
-    module GLM
-      class Logistic
-        attr_reader :se
-        # The fitted mean values
-        attr_reader :fit
-        # the _working_ residuals; that is the residuals in the final iteration of the IRWLS fit.
-        attr_reader :residuals
-        # The residuals degree of freedom
-        attr_reader :df
-        # Number of iterations used for convergence
-        attr_reader :iter
-        # Boolean. Tells whether the IRWLS for the given model converged or not
-        attr_reader :converged
-        def initialize(ds, y)
-          @ds=ds
-          @fields=@ds.fields
-          @x = ds.to_matrix
-          @y = y
-        end
-        # named vector/hash of coefficients
-        # === Parameter
-        # * *type*: symbol; (:array, default). Options = [:array, :hash]
-        def coefficients(type=:array)
-          if type==:array
-            #originally returned as vector; so pass it
-            @coefficients
-          elsif type==:hash
-            h={}
-            @fields.size.times {|i|
-              h[@fields[i]]=@coefficients[i]
-            }
-            h
-          end
-        end
-        def self.mu(x, b)
-          matrix_mul = x * b
-          numerator = matrix_mul.map { |y| Math.exp(y) }
-          denominator = numerator.map { |y| 1 + y }
-          numerator.each_with_index { |e, r, c|
-            numerator[r,c] = numerator[r,c].to_f / denominator[r,c].to_f
-          }
-        end
-        def self.w(x, b)
-          mus = mu(x,b).column_vectors.map(&:to_a).flatten
-          mus_intermediate = mus.collect { |x| 1 - x }
-          w = mus.zip(mus_intermediate).collect { |x| x.inject(:*) }
-          w_mat = Matrix.I(w.size)
-          w_enum = w.to_enum
-          return w_mat.map do |x|
-            x.eql?(1) ? w_enum.next : x
-          end
-        end
-        def self.h(x,b,y)
-          x_t = x.transpose
-          mu_flat = mu(x,b).column_vectors.map(&:to_a).flatten
-          column_data = y.zip(mu_flat).collect { |x| x.inject(:-) }
-          x_t * Matrix.column_vector(column_data)
-        end
-        def self.j(x,b)
-          w_matrix = w(x, b)
-          jacobian_matrix = x.transpose * w_matrix * x
-          jacobian_matrix.map { |x| -x }
-        end
-        def to_s
-          sprintf("Logistic Regression (Statsample::Regression::GLM;:Logistic)")
-        end
-        # = Iteratively reweighted least squares
-        #   Computes irwls for given model and parameters.
-        #
-        # == Usage
-        #    require 'statsample-glm'
-        #    x1=Statsample::Vector.new([0.537322309644812,-0.717124209978434,-0.519166718891331,0.434970973986765,-0.761822002215759,1.51170030921189,0.883854199811195,-0.908689798854196,1.70331977539793,-0.246971150634099,-1.59077593922623,-0.721548040910253,0.467025703920194,-0.510132788447137,0.430106510266798,-0.144353683251536,-1.54943800728303,0.849307651309298,-0.640304240933579,1.31462478279425,-0.399783455165345,0.0453055645017902,-2.58212161987746,-1.16484414309359,-1.08829266466281,-0.243893919684792,-1.96655661929441,0.301335373291024,-0.665832694463588,-0.0120650855753837,1.5116066367604,0.557300353673344,1.12829931872045,0.234443748015922,-2.03486690662651,0.275544751380246,-0.231465849558696,-0.356880153225012,-0.57746647541923,1.35758352580655,1.23971669378224,-0.662466275100489,0.313263561921793,-1.08783223256362,1.41964722846899,1.29325100940785,0.72153880625103,0.440580131022748,0.0351917814720056, -0.142353224879252],:scale)
-        #    x2=Statsample::Vector.new([-0.866655707911859,-0.367820249977585,0.361486610435,0.857332626245179,0.133438466268095,0.716104533073575,1.77206093023382,-0.10136697295802,-0.777086491435508,-0.204573554913706,0.963353531412233,-1.10103024900542,-0.404372761837392,-0.230226345183469,0.0363730246866971,-0.838265540390497,1.12543549657924,-0.57929175648001,-0.747060244805248,0.58946979365152,-0.531952663697324,1.53338594419818,0.521992029051441,1.41631763288724,0.611402316795129,-0.518355638373296,-0.515192557101107,-0.672697937866108,1.84347042325327,-0.21195540664804,-0.269869371631611,0.296155694010096,-2.18097898069634,-1.21314663927206,1.49193669881581,1.38969280369493,-0.400680808117106,-1.87282814976479,1.82394870451051,0.637864732838274,-0.141155946382493,0.0699950644281617,1.32568550595165,-0.412599258349398,0.14436832227506,-1.16507785388489,-2.16782049922428,0.24318371493798,0.258954871320764,-0.151966534521183],:scale)
-        #    y=Statsample::Vector.new([0,0,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1],:scale)
-        #    x=Statsample::Dataset.new({"i"=>intercept,"x1"=>x1,"x2"=>x2})
-        #    obj = Statsample::Regression.glm(x, y, :binomial)
-        #    #=> Logistic Regression object
-        #    obj.irlws
-        #    #=> Array of returned values
-        #    obj.coefficients
-        #    #=> named vector of coefficients
-        def irwls
-          x, y = @x, @y
-          #calling irwls on Regression and passing equivalent methods in lambdas.
-          #Ruby_level+=awesome!
-          @coefficients, @se, @fit, @residuals, @df, @iter, @converged = Statsample::Regression.irwls(
-              x,y, ->l,m{self.class.mu(l,m)}, ->l,m{self.class.w(l,m)},
-              ->l,m{self.class.j(l,m)}, ->k,l,m{self.class.h(k,l,m)}
-          )
-        end
-      end
-    end
-  end
-end

data/lib/statsample-glm/regression/poisson.rb DELETED

@@ -1,90 +0,0 @@
-module Statsample
-  module Regression
-    module GLM
-      class Poisson
-        attr_reader :se
-        # The fitted mean values
-        attr_reader :fit
-        # the _working_ residuals; that is the residuals in the final iteration of the IRWLS fit.
-        attr_reader :residuals
-        # The residuals degree of freedom
-        attr_reader :df
-        # Number of iterations used for convergence
-        attr_reader :iter
-        # Boolean. Tells whether the IRWLS for the given model converged or not
-        attr_reader :converged
-        def initialize(ds, y)
-          @ds=ds
-          @fields=@ds.fields
-          @x = ds.to_matrix
-          @y = y
-        end
-        # named vector/hash of coefficients
-        # === Parameter
-        # * *type*: symbol; (:array, default). Options = [:array, :hash]
-        def coefficients(type=:array)
-          if type==:array
-            @coefficients
-          elsif type==:hash
-            h={}
-            @fields.size.times {|i|
-              h[@fields[i]]=@coefficients[i]
-            }
-            h
-          end
-        end
-        def self.mu(x, b, link=:log)
-          if link.downcase.to_sym == :log
-            (x * b).map { |y| Math.exp(y) }
-          elsif link.downcase.to_sym == :sqrt
-            (x * b).collect { |y| y**2 }
-          end
-        end
-        def self.w(x, b)
-          poisson_mu = mu(x,b)
-          mu_flat = poisson_mu.column_vectors.map(&:to_a).flatten
-          w_mat = Matrix.I(mu_flat.size)
-          mu_enum = mu_flat.to_enum
-          return w_mat.map do |x|
-            x.eql?(1) ? mu_enum.next : x
-          end
-        end
-        def self.h(x, b, y)
-          x_t = x.transpose
-          mu_flat = mu(x,b).column_vectors.map(&:to_a).flatten
-          column_data = y.zip(mu_flat).collect { |x| x.inject(:-) }
-          x_t * Matrix.columns([column_data])
-        end
-        def self.j(x, b)
-          w_matrix = w(x, b)
-          jacobian_matrix = x.transpose * w_matrix * x
-          jacobian_matrix.map { |x| -x }
-        end
-        def to_s
-          sprintf("Logistic Regression (Statsample::Regression::GLM;:Logistic)")
-        end
-        def irwls
-          x,y = @x,@y
-          #calling irwls on Regression and passing equivalent methods in lambdas.
-          #Ruby_level+=awesome!
-          @coefficients, @se, @fit, @residuals, @df, @iter, @converged = Statsample::Regression.irwls(
-              x,y, ->l,m{self.class.mu(l,m)}, ->l,m{self.class.w(l,m)},
-              ->l,m{self.class.j(l,m)}, ->k,l,m{self.class.h(k,l,m)}
-          )
-        end
-      end
-    end
-  end
-end

data/test/helper.rb DELETED

@@ -1,87 +0,0 @@
-require 'rubygems'
-require 'bundler'
-begin
-  Bundler.setup(:default, :development)
-rescue Bundler::BundlerError => e
-  $stderr.puts e.message
-  $stderr.puts "Run `bundle install` to install missing gems"
-  exit e.status_code
-end
-require 'minitest/unit'
-require 'shoulda'
-require 'shoulda-context'
-require 'mocha/setup'
-$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-$LOAD_PATH.unshift(File.dirname(__FILE__))
-require 'statsample-glm'
-module MiniTest
-  class Unit
-    class TestCase
-      include Shoulda::Context::Assertions
-      include Shoulda::Context::InstanceMethods
-      extend Shoulda::Context::ClassMethods
-      def self.should_with_gsl(name,&block)
-        should(name) do
-          if Statsample.has_gsl?
-            instance_eval(&block)
-          else
-            skip("Requires GSL")
-          end
-        end
-      end
-    end
-  end
-  module Assertions
-    def assert_similar_vector(exp, obs, delta=1e-10,msg=nil)
-      msg||="Different vectors #{exp} - #{obs}"
-      assert_equal(exp.size, obs.size)
-      exp.data_with_nils.each_with_index {|v,i|
-        assert_in_delta(v,obs[i],delta)
-      }
-    end
-    def assert_similar_hash(exp, obs, delta=1e-10,msg=nil)
-      msg||="Different hash #{exp} - #{obs}"
-      assert_equal(exp.size, obs.size)
-      exp.each_key {|k|
-        assert_in_delta(exp[k],obs[k],delta)
-      }
-    end
-    def assert_equal_vector(exp,obs,delta=1e-10,msg=nil)
-      assert_equal(exp.size, obs.size, "Different size.#{msg}")
-      exp.size.times {|i|
-        assert_in_delta(exp[i],obs[i],delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
-      }
-    end
-    def assert_equal_matrix(exp,obs,delta=1e-10,msg=nil)
-      assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}")
-      assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}")
-      exp.row_size.times {|i|
-        exp.column_size.times {|j|
-          assert_in_delta(exp[i,j],obs[i,j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
-        }
-      }
-    end
-    alias :assert_raise :assert_raises unless method_defined? :assert_raise
-    alias :assert_not_equal :refute_equal unless method_defined? :assert_not_equal
-    alias :assert_not_same :refute_same unless method_defined? :assert_not_same
-    unless method_defined? :assert_nothing_raised
-      def assert_nothing_raised(msg=nil)
-        msg||="Nothing should be raised, but raised %s"
-        begin
-          yield
-          not_raised=true
-        rescue Exception => e
-          not_raised=false
-          msg=sprintf(msg,e)
-        end
-        assert(not_raised,msg)
-      end
-    end
-  end
-end
-MiniTest::Unit.autorun

data/test/test_glm.rb DELETED

@@ -1,4 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helper.rb'))
-class StatsampleRegressionGlm < MiniTest::Unit::TestCase
-end

data/test/test_glm_logistic.rb DELETED

@@ -1,23 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helper.rb'))
-class StatsampleRegressionGlmLogistic < MiniTest::Unit::TestCase
-  context("Example") do
-    setup do
-      x1=Statsample::Vector.new([0.537322309644812,-0.717124209978434,-0.519166718891331,0.434970973986765,-0.761822002215759,1.51170030921189,0.883854199811195,-0.908689798854196,1.70331977539793,-0.246971150634099,-1.59077593922623,-0.721548040910253,0.467025703920194,-0.510132788447137,0.430106510266798,-0.144353683251536,-1.54943800728303,0.849307651309298,-0.640304240933579,1.31462478279425,-0.399783455165345,0.0453055645017902,-2.58212161987746,-1.16484414309359,-1.08829266466281,-0.243893919684792,-1.96655661929441,0.301335373291024,-0.665832694463588,-0.0120650855753837,1.5116066367604,0.557300353673344,1.12829931872045,0.234443748015922,-2.03486690662651,0.275544751380246,-0.231465849558696,-0.356880153225012,-0.57746647541923,1.35758352580655,1.23971669378224,-0.662466275100489,0.313263561921793,-1.08783223256362,1.41964722846899,1.29325100940785,0.72153880625103,0.440580131022748,0.0351917814720056, -0.142353224879252],:scale)
-      x2=Statsample::Vector.new([-0.866655707911859,-0.367820249977585,0.361486610435,0.857332626245179,0.133438466268095,0.716104533073575,1.77206093023382,-0.10136697295802,-0.777086491435508,-0.204573554913706,0.963353531412233,-1.10103024900542,-0.404372761837392,-0.230226345183469,0.0363730246866971,-0.838265540390497,1.12543549657924,-0.57929175648001,-0.747060244805248,0.58946979365152,-0.531952663697324,1.53338594419818,0.521992029051441,1.41631763288724,0.611402316795129,-0.518355638373296,-0.515192557101107,-0.672697937866108,1.84347042325327,-0.21195540664804,-0.269869371631611,0.296155694010096,-2.18097898069634,-1.21314663927206,1.49193669881581,1.38969280369493,-0.400680808117106,-1.87282814976479,1.82394870451051,0.637864732838274,-0.141155946382493,0.0699950644281617,1.32568550595165,-0.412599258349398,0.14436832227506,-1.16507785388489,-2.16782049922428,0.24318371493798,0.258954871320764,-0.151966534521183],:scale)
-      @y_log=Statsample::Vector.new([0,0,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1],:scale)
-      @y_pois=Statsample::Vector.new([1,2,1,3,3,1,10,1,1,2,15,0,0,2,1,2,18,2,1,1,1,8,18,13,7,1,1,0,26,0,2,2,0,0,25,7,0,0,21,0,0,1,5,0,3,0,0,1,0,0],:scale)
-      intercept=Statsample::Vector.new([1]*50,:scale)
-      @df=Statsample::Dataset.new({"i"=>intercept,"x1"=>x1,"x2"=>x2})
-      @glm=Statsample::Regression.glm(@df,@y_log,:binomial)
-    end
-    should "report correct coefficientes as array" do
-      assert_similar_vector(@glm.coefficients,[0.675603176233325,-0.312493754568903,2.28671333346264])
-    end
-    should "report correct coefficientes as hash" do
-      assert_similar_hash(@glm.coefficients(:hash), {"i"=>0.675603176233325,"x1"=>-0.312493754568903,"x2"=>2.28671333346264})
-    end
-  end
-end

data/test/test_glm_poisson.rb DELETED

@@ -1,25 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helper.rb'))
-class StatsampleRegressionGlmPoisson < MiniTest::Unit::TestCase
-  context("Example") do
-    setup do
-      x1=Statsample::Vector.new([0.537322309644812,-0.717124209978434,-0.519166718891331,0.434970973986765,-0.761822002215759,1.51170030921189,0.883854199811195,-0.908689798854196,1.70331977539793,-0.246971150634099,-1.59077593922623,-0.721548040910253,0.467025703920194,-0.510132788447137,0.430106510266798,-0.144353683251536,-1.54943800728303,0.849307651309298,-0.640304240933579,1.31462478279425,-0.399783455165345,0.0453055645017902,-2.58212161987746,-1.16484414309359,-1.08829266466281,-0.243893919684792,-1.96655661929441,0.301335373291024,-0.665832694463588,-0.0120650855753837,1.5116066367604,0.557300353673344,1.12829931872045,0.234443748015922,-2.03486690662651,0.275544751380246,-0.231465849558696,-0.356880153225012,-0.57746647541923,1.35758352580655,1.23971669378224,-0.662466275100489,0.313263561921793,-1.08783223256362,1.41964722846899,1.29325100940785,0.72153880625103,0.440580131022748,0.0351917814720056, -0.142353224879252],:scale)
-      x2=Statsample::Vector.new([-0.866655707911859,-0.367820249977585,0.361486610435,0.857332626245179,0.133438466268095,0.716104533073575,1.77206093023382,-0.10136697295802,-0.777086491435508,-0.204573554913706,0.963353531412233,-1.10103024900542,-0.404372761837392,-0.230226345183469,0.0363730246866971,-0.838265540390497,1.12543549657924,-0.57929175648001,-0.747060244805248,0.58946979365152,-0.531952663697324,1.53338594419818,0.521992029051441,1.41631763288724,0.611402316795129,-0.518355638373296,-0.515192557101107,-0.672697937866108,1.84347042325327,-0.21195540664804,-0.269869371631611,0.296155694010096,-2.18097898069634,-1.21314663927206,1.49193669881581,1.38969280369493,-0.400680808117106,-1.87282814976479,1.82394870451051,0.637864732838274,-0.141155946382493,0.0699950644281617,1.32568550595165,-0.412599258349398,0.14436832227506,-1.16507785388489,-2.16782049922428,0.24318371493798,0.258954871320764,-0.151966534521183],:scale)
-      @y_log=Statsample::Vector.new([0,0,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1],:scale)
-      @y_pois=Statsample::Vector.new([1,2,1,3,3,1,10,1,1,2,15,0,0,2,1,2,18,2,1,1,1,8,18,13,7,1,1,0,26,0,2,2,0,0,25,7,0,0,21,0,0,1,5,0,3,0,0,1,0,0],:scale)
-      intercept=Statsample::Vector.new([1]*50,:scale)
-      @df=Statsample::Dataset.new({"i"=>intercept,"x1"=>x1,"x2"=>x2})
-      @glm=Statsample::Regression.glm(@df,@y_pois,:poisson)
-    end
-    should "report correct coefficientes as array" do
-      assert_similar_vector(@glm.coefficients,[0.32993246633711,-0.586359358356708,1.28511323439258])
-    end # should
-    should "report correct coefficientes as hash" do
-      assert_similar_hash(@glm.coefficients(:hash), {"i"=>0.32993246633711,"x1"=>-0.586359358356708, "x2"=>1.28511323439258})
-    end # should
-  end # context
-end # class