statsample-glm 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/.gitignore +51 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +2 -9
  5. data/Gemfile +2 -20
  6. data/LICENSE.txt +1 -1
  7. data/README.rdoc +14 -11
  8. data/Rakefile +16 -24
  9. data/lib/statsample-glm.rb +1 -11
  10. data/lib/statsample-glm/{regression.rb → glm.rb} +13 -46
  11. data/lib/statsample-glm/glm/base.rb +99 -0
  12. data/lib/statsample-glm/glm/irls/base.rb +54 -0
  13. data/lib/statsample-glm/glm/irls/logistic.rb +46 -0
  14. data/lib/statsample-glm/glm/irls/poisson.rb +48 -0
  15. data/lib/statsample-glm/glm/logistic.rb +16 -0
  16. data/lib/statsample-glm/glm/mle/base.rb +157 -0
  17. data/lib/statsample-glm/glm/mle/logistic.rb +113 -0
  18. data/lib/statsample-glm/glm/mle/normal.rb +94 -0
  19. data/lib/statsample-glm/glm/mle/probit.rb +100 -0
  20. data/lib/statsample-glm/glm/normal.rb +17 -0
  21. data/lib/statsample-glm/glm/poisson.rb +17 -0
  22. data/lib/statsample-glm/glm/probit.rb +16 -0
  23. data/lib/statsample-glm/version.rb +5 -0
  24. data/spec/data/logistic.csv +51 -0
  25. data/spec/data/logistic_mle.csv +201 -0
  26. data/spec/data/normal.csv +30 -0
  27. data/spec/logistic_spec.rb +37 -0
  28. data/spec/normal_spec.rb +15 -0
  29. data/spec/poisson_spec.rb +32 -0
  30. data/spec/probit_spec.rb +19 -0
  31. data/spec/spec_helper.rb +50 -0
  32. data/statsample-glm.gemspec +35 -0
  33. metadata +71 -145
  34. data/VERSION +0 -1
  35. data/features/bio-statsample-glm.feature +0 -9
  36. data/features/step_definitions/bio-statsample-glm_steps.rb +0 -0
  37. data/features/support/env.rb +0 -15
  38. data/lib/statsample-glm/regression/logistic.rb +0 -108
  39. data/lib/statsample-glm/regression/poisson.rb +0 -90
  40. data/test/helper.rb +0 -87
  41. data/test/test_glm.rb +0 -4
  42. data/test/test_glm_logistic.rb +0 -23
  43. data/test/test_glm_poisson.rb +0 -25
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.0.1
data/features/bio-statsample-glm.feature DELETED
@@ -1,9 +0,0 @@
1
- Feature: something something
2
- In order to something something
3
- A user something something
4
- something something something
5
-
6
- Scenario: something something
7
- Given inspiration
8
- When I create a sweet new gem
9
- Then everyone should see how awesome I am
data/features/support/env.rb DELETED
@@ -1,15 +0,0 @@
1
- require 'bundler'
2
- begin
3
- Bundler.setup(:default, :development)
4
- rescue Bundler::BundlerError => e
5
- $stderr.puts e.message
6
- $stderr.puts "Run `bundle install` to install missing gems"
7
- exit e.status_code
8
- end
9
-
10
- $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
11
- require 'statsample-glm'
12
-
13
- require 'test/unit/assertions'
14
-
15
- World(Test::Unit::Assertions)
data/lib/statsample-glm/regression/logistic.rb DELETED
@@ -1,108 +0,0 @@
1
- module Statsample
2
- module Regression
3
- module GLM
4
-
5
- class Logistic
6
-
7
- attr_reader :se
8
- # The fitted mean values
9
- attr_reader :fit
10
- # the _working_ residuals; that is the residuals in the final iteration of the IRWLS fit.
11
- attr_reader :residuals
12
- # The residuals degree of freedom
13
- attr_reader :df
14
- # Number of iterations used for convergence
15
- attr_reader :iter
16
- # Boolean. Tells whether the IRWLS for the given model converged or not
17
- attr_reader :converged
18
-
19
- def initialize(ds, y)
20
- @ds=ds
21
- @fields=@ds.fields
22
- @x = ds.to_matrix
23
- @y = y
24
- end
25
-
26
- # named vector/hash of coefficients
27
- # === Parameter
28
- # * *type*: symbol; (:array, default). Options = [:array, :hash]
29
- def coefficients(type=:array)
30
- if type==:array
31
- #originally returned as vector; so pass it
32
- @coefficients
33
- elsif type==:hash
34
- h={}
35
- @fields.size.times {|i|
36
- h[@fields[i]]=@coefficients[i]
37
- }
38
- h
39
- end
40
- end
41
- def self.mu(x, b)
42
- matrix_mul = x * b
43
- numerator = matrix_mul.map { |y| Math.exp(y) }
44
- denominator = numerator.map { |y| 1 + y }
45
-
46
- numerator.each_with_index { |e, r, c|
47
- numerator[r,c] = numerator[r,c].to_f / denominator[r,c].to_f
48
- }
49
- end
50
-
51
- def self.w(x, b)
52
- mus = mu(x,b).column_vectors.map(&:to_a).flatten
53
- mus_intermediate = mus.collect { |x| 1 - x }
54
- w = mus.zip(mus_intermediate).collect { |x| x.inject(:*) }
55
- w_mat = Matrix.I(w.size)
56
- w_enum = w.to_enum
57
- return w_mat.map do |x|
58
- x.eql?(1) ? w_enum.next : x
59
- end
60
- end
61
-
62
- def self.h(x,b,y)
63
- x_t = x.transpose
64
- mu_flat = mu(x,b).column_vectors.map(&:to_a).flatten
65
- column_data = y.zip(mu_flat).collect { |x| x.inject(:-) }
66
- x_t * Matrix.column_vector(column_data)
67
- end
68
-
69
- def self.j(x,b)
70
- w_matrix = w(x, b)
71
- jacobian_matrix = x.transpose * w_matrix * x
72
- jacobian_matrix.map { |x| -x }
73
- end
74
-
75
- def to_s
76
- sprintf("Logistic Regression (Statsample::Regression::GLM;:Logistic)")
77
- end
78
-
79
- # = Iteratively reweighted least squares
80
- # Computes irwls for given model and parameters.
81
- #
82
- # == Usage
83
- # require 'statsample-glm'
84
- # x1=Statsample::Vector.new([0.537322309644812,-0.717124209978434,-0.519166718891331,0.434970973986765,-0.761822002215759,1.51170030921189,0.883854199811195,-0.908689798854196,1.70331977539793,-0.246971150634099,-1.59077593922623,-0.721548040910253,0.467025703920194,-0.510132788447137,0.430106510266798,-0.144353683251536,-1.54943800728303,0.849307651309298,-0.640304240933579,1.31462478279425,-0.399783455165345,0.0453055645017902,-2.58212161987746,-1.16484414309359,-1.08829266466281,-0.243893919684792,-1.96655661929441,0.301335373291024,-0.665832694463588,-0.0120650855753837,1.5116066367604,0.557300353673344,1.12829931872045,0.234443748015922,-2.03486690662651,0.275544751380246,-0.231465849558696,-0.356880153225012,-0.57746647541923,1.35758352580655,1.23971669378224,-0.662466275100489,0.313263561921793,-1.08783223256362,1.41964722846899,1.29325100940785,0.72153880625103,0.440580131022748,0.0351917814720056, -0.142353224879252],:scale)
85
- # x2=Statsample::Vector.new([-0.866655707911859,-0.367820249977585,0.361486610435,0.857332626245179,0.133438466268095,0.716104533073575,1.77206093023382,-0.10136697295802,-0.777086491435508,-0.204573554913706,0.963353531412233,-1.10103024900542,-0.404372761837392,-0.230226345183469,0.0363730246866971,-0.838265540390497,1.12543549657924,-0.57929175648001,-0.747060244805248,0.58946979365152,-0.531952663697324,1.53338594419818,0.521992029051441,1.41631763288724,0.611402316795129,-0.518355638373296,-0.515192557101107,-0.672697937866108,1.84347042325327,-0.21195540664804,-0.269869371631611,0.296155694010096,-2.18097898069634,-1.21314663927206,1.49193669881581,1.38969280369493,-0.400680808117106,-1.87282814976479,1.82394870451051,0.637864732838274,-0.141155946382493,0.0699950644281617,1.32568550595165,-0.412599258349398,0.14436832227506,-1.16507785388489,-2.16782049922428,0.24318371493798,0.258954871320764,-0.151966534521183],:scale)
86
- # y=Statsample::Vector.new([0,0,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1],:scale)
87
- # x=Statsample::Dataset.new({"i"=>intercept,"x1"=>x1,"x2"=>x2})
88
- # obj = Statsample::Regression.glm(x, y, :binomial)
89
- # #=> Logistic Regression object
90
- # obj.irlws
91
- # #=> Array of returned values
92
- # obj.coefficients
93
- # #=> named vector of coefficients
94
-
95
- def irwls
96
- x, y = @x, @y
97
- #calling irwls on Regression and passing equivalent methods in lambdas.
98
- #Ruby_level+=awesome!
99
- @coefficients, @se, @fit, @residuals, @df, @iter, @converged = Statsample::Regression.irwls(
100
- x,y, ->l,m{self.class.mu(l,m)}, ->l,m{self.class.w(l,m)},
101
- ->l,m{self.class.j(l,m)}, ->k,l,m{self.class.h(k,l,m)}
102
- )
103
- end
104
- end
105
-
106
- end
107
- end
108
- end
data/lib/statsample-glm/regression/poisson.rb DELETED
@@ -1,90 +0,0 @@
1
- module Statsample
2
- module Regression
3
- module GLM
4
-
5
- class Poisson
6
-
7
- attr_reader :se
8
- # The fitted mean values
9
- attr_reader :fit
10
- # the _working_ residuals; that is the residuals in the final iteration of the IRWLS fit.
11
- attr_reader :residuals
12
- # The residuals degree of freedom
13
- attr_reader :df
14
- # Number of iterations used for convergence
15
- attr_reader :iter
16
- # Boolean. Tells whether the IRWLS for the given model converged or not
17
- attr_reader :converged
18
-
19
- def initialize(ds, y)
20
- @ds=ds
21
- @fields=@ds.fields
22
- @x = ds.to_matrix
23
- @y = y
24
- end
25
-
26
- # named vector/hash of coefficients
27
- # === Parameter
28
- # * *type*: symbol; (:array, default). Options = [:array, :hash]
29
- def coefficients(type=:array)
30
- if type==:array
31
- @coefficients
32
- elsif type==:hash
33
- h={}
34
- @fields.size.times {|i|
35
- h[@fields[i]]=@coefficients[i]
36
- }
37
- h
38
- end
39
- end
40
-
41
- def self.mu(x, b, link=:log)
42
- if link.downcase.to_sym == :log
43
- (x * b).map { |y| Math.exp(y) }
44
- elsif link.downcase.to_sym == :sqrt
45
- (x * b).collect { |y| y**2 }
46
- end
47
- end
48
-
49
- def self.w(x, b)
50
- poisson_mu = mu(x,b)
51
- mu_flat = poisson_mu.column_vectors.map(&:to_a).flatten
52
-
53
- w_mat = Matrix.I(mu_flat.size)
54
- mu_enum = mu_flat.to_enum
55
- return w_mat.map do |x|
56
- x.eql?(1) ? mu_enum.next : x
57
- end
58
- end
59
-
60
- def self.h(x, b, y)
61
- x_t = x.transpose
62
- mu_flat = mu(x,b).column_vectors.map(&:to_a).flatten
63
- column_data = y.zip(mu_flat).collect { |x| x.inject(:-) }
64
- x_t * Matrix.columns([column_data])
65
- end
66
-
67
- def self.j(x, b)
68
- w_matrix = w(x, b)
69
- jacobian_matrix = x.transpose * w_matrix * x
70
- jacobian_matrix.map { |x| -x }
71
- end
72
-
73
- def to_s
74
- sprintf("Logistic Regression (Statsample::Regression::GLM;:Logistic)")
75
- end
76
-
77
- def irwls
78
- x,y = @x,@y
79
- #calling irwls on Regression and passing equivalent methods in lambdas.
80
- #Ruby_level+=awesome!
81
- @coefficients, @se, @fit, @residuals, @df, @iter, @converged = Statsample::Regression.irwls(
82
- x,y, ->l,m{self.class.mu(l,m)}, ->l,m{self.class.w(l,m)},
83
- ->l,m{self.class.j(l,m)}, ->k,l,m{self.class.h(k,l,m)}
84
- )
85
- end
86
-
87
- end
88
- end
89
- end
90
- end
data/test/helper.rb DELETED
@@ -1,87 +0,0 @@
1
- require 'rubygems'
2
- require 'bundler'
3
- begin
4
- Bundler.setup(:default, :development)
5
- rescue Bundler::BundlerError => e
6
- $stderr.puts e.message
7
- $stderr.puts "Run `bundle install` to install missing gems"
8
- exit e.status_code
9
- end
10
- require 'minitest/unit'
11
- require 'shoulda'
12
- require 'shoulda-context'
13
- require 'mocha/setup'
14
-
15
-
16
- $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
17
- $LOAD_PATH.unshift(File.dirname(__FILE__))
18
- require 'statsample-glm'
19
- module MiniTest
20
- class Unit
21
- class TestCase
22
- include Shoulda::Context::Assertions
23
- include Shoulda::Context::InstanceMethods
24
- extend Shoulda::Context::ClassMethods
25
- def self.should_with_gsl(name,&block)
26
- should(name) do
27
- if Statsample.has_gsl?
28
- instance_eval(&block)
29
- else
30
- skip("Requires GSL")
31
- end
32
- end
33
- end
34
- end
35
- end
36
-
37
- module Assertions
38
- def assert_similar_vector(exp, obs, delta=1e-10,msg=nil)
39
- msg||="Different vectors #{exp} - #{obs}"
40
- assert_equal(exp.size, obs.size)
41
- exp.data_with_nils.each_with_index {|v,i|
42
- assert_in_delta(v,obs[i],delta)
43
- }
44
- end
45
- def assert_similar_hash(exp, obs, delta=1e-10,msg=nil)
46
- msg||="Different hash #{exp} - #{obs}"
47
- assert_equal(exp.size, obs.size)
48
- exp.each_key {|k|
49
- assert_in_delta(exp[k],obs[k],delta)
50
- }
51
- end
52
-
53
- def assert_equal_vector(exp,obs,delta=1e-10,msg=nil)
54
- assert_equal(exp.size, obs.size, "Different size.#{msg}")
55
- exp.size.times {|i|
56
- assert_in_delta(exp[i],obs[i],delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
57
- }
58
- end
59
- def assert_equal_matrix(exp,obs,delta=1e-10,msg=nil)
60
- assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}")
61
- assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}")
62
- exp.row_size.times {|i|
63
- exp.column_size.times {|j|
64
- assert_in_delta(exp[i,j],obs[i,j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
65
- }
66
- }
67
- end
68
- alias :assert_raise :assert_raises unless method_defined? :assert_raise
69
- alias :assert_not_equal :refute_equal unless method_defined? :assert_not_equal
70
- alias :assert_not_same :refute_same unless method_defined? :assert_not_same
71
- unless method_defined? :assert_nothing_raised
72
- def assert_nothing_raised(msg=nil)
73
- msg||="Nothing should be raised, but raised %s"
74
- begin
75
- yield
76
- not_raised=true
77
- rescue Exception => e
78
- not_raised=false
79
- msg=sprintf(msg,e)
80
- end
81
- assert(not_raised,msg)
82
- end
83
- end
84
- end
85
- end
86
-
87
- MiniTest::Unit.autorun
data/test/test_glm.rb DELETED
@@ -1,4 +0,0 @@
1
- require(File.expand_path(File.dirname(__FILE__)+'/helper.rb'))
2
-
3
- class StatsampleRegressionGlm < MiniTest::Unit::TestCase
4
- end
data/test/test_glm_logistic.rb DELETED
@@ -1,23 +0,0 @@
1
- require(File.expand_path(File.dirname(__FILE__)+'/helper.rb'))
2
-
3
- class StatsampleRegressionGlmLogistic < MiniTest::Unit::TestCase
4
-
5
- context("Example") do
6
- setup do
7
- x1=Statsample::Vector.new([0.537322309644812,-0.717124209978434,-0.519166718891331,0.434970973986765,-0.761822002215759,1.51170030921189,0.883854199811195,-0.908689798854196,1.70331977539793,-0.246971150634099,-1.59077593922623,-0.721548040910253,0.467025703920194,-0.510132788447137,0.430106510266798,-0.144353683251536,-1.54943800728303,0.849307651309298,-0.640304240933579,1.31462478279425,-0.399783455165345,0.0453055645017902,-2.58212161987746,-1.16484414309359,-1.08829266466281,-0.243893919684792,-1.96655661929441,0.301335373291024,-0.665832694463588,-0.0120650855753837,1.5116066367604,0.557300353673344,1.12829931872045,0.234443748015922,-2.03486690662651,0.275544751380246,-0.231465849558696,-0.356880153225012,-0.57746647541923,1.35758352580655,1.23971669378224,-0.662466275100489,0.313263561921793,-1.08783223256362,1.41964722846899,1.29325100940785,0.72153880625103,0.440580131022748,0.0351917814720056, -0.142353224879252],:scale)
8
- x2=Statsample::Vector.new([-0.866655707911859,-0.367820249977585,0.361486610435,0.857332626245179,0.133438466268095,0.716104533073575,1.77206093023382,-0.10136697295802,-0.777086491435508,-0.204573554913706,0.963353531412233,-1.10103024900542,-0.404372761837392,-0.230226345183469,0.0363730246866971,-0.838265540390497,1.12543549657924,-0.57929175648001,-0.747060244805248,0.58946979365152,-0.531952663697324,1.53338594419818,0.521992029051441,1.41631763288724,0.611402316795129,-0.518355638373296,-0.515192557101107,-0.672697937866108,1.84347042325327,-0.21195540664804,-0.269869371631611,0.296155694010096,-2.18097898069634,-1.21314663927206,1.49193669881581,1.38969280369493,-0.400680808117106,-1.87282814976479,1.82394870451051,0.637864732838274,-0.141155946382493,0.0699950644281617,1.32568550595165,-0.412599258349398,0.14436832227506,-1.16507785388489,-2.16782049922428,0.24318371493798,0.258954871320764,-0.151966534521183],:scale)
9
- @y_log=Statsample::Vector.new([0,0,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1],:scale)
10
- @y_pois=Statsample::Vector.new([1,2,1,3,3,1,10,1,1,2,15,0,0,2,1,2,18,2,1,1,1,8,18,13,7,1,1,0,26,0,2,2,0,0,25,7,0,0,21,0,0,1,5,0,3,0,0,1,0,0],:scale)
11
- intercept=Statsample::Vector.new([1]*50,:scale)
12
- @df=Statsample::Dataset.new({"i"=>intercept,"x1"=>x1,"x2"=>x2})
13
- @glm=Statsample::Regression.glm(@df,@y_log,:binomial)
14
- end
15
- should "report correct coefficientes as array" do
16
- assert_similar_vector(@glm.coefficients,[0.675603176233325,-0.312493754568903,2.28671333346264])
17
- end
18
- should "report correct coefficientes as hash" do
19
- assert_similar_hash(@glm.coefficients(:hash), {"i"=>0.675603176233325,"x1"=>-0.312493754568903,"x2"=>2.28671333346264})
20
- end
21
-
22
- end
23
- end
data/test/test_glm_poisson.rb DELETED
@@ -1,25 +0,0 @@
1
- require(File.expand_path(File.dirname(__FILE__)+'/helper.rb'))
2
-
3
- class StatsampleRegressionGlmPoisson < MiniTest::Unit::TestCase
4
-
5
- context("Example") do
6
- setup do
7
- x1=Statsample::Vector.new([0.537322309644812,-0.717124209978434,-0.519166718891331,0.434970973986765,-0.761822002215759,1.51170030921189,0.883854199811195,-0.908689798854196,1.70331977539793,-0.246971150634099,-1.59077593922623,-0.721548040910253,0.467025703920194,-0.510132788447137,0.430106510266798,-0.144353683251536,-1.54943800728303,0.849307651309298,-0.640304240933579,1.31462478279425,-0.399783455165345,0.0453055645017902,-2.58212161987746,-1.16484414309359,-1.08829266466281,-0.243893919684792,-1.96655661929441,0.301335373291024,-0.665832694463588,-0.0120650855753837,1.5116066367604,0.557300353673344,1.12829931872045,0.234443748015922,-2.03486690662651,0.275544751380246,-0.231465849558696,-0.356880153225012,-0.57746647541923,1.35758352580655,1.23971669378224,-0.662466275100489,0.313263561921793,-1.08783223256362,1.41964722846899,1.29325100940785,0.72153880625103,0.440580131022748,0.0351917814720056, -0.142353224879252],:scale)
8
- x2=Statsample::Vector.new([-0.866655707911859,-0.367820249977585,0.361486610435,0.857332626245179,0.133438466268095,0.716104533073575,1.77206093023382,-0.10136697295802,-0.777086491435508,-0.204573554913706,0.963353531412233,-1.10103024900542,-0.404372761837392,-0.230226345183469,0.0363730246866971,-0.838265540390497,1.12543549657924,-0.57929175648001,-0.747060244805248,0.58946979365152,-0.531952663697324,1.53338594419818,0.521992029051441,1.41631763288724,0.611402316795129,-0.518355638373296,-0.515192557101107,-0.672697937866108,1.84347042325327,-0.21195540664804,-0.269869371631611,0.296155694010096,-2.18097898069634,-1.21314663927206,1.49193669881581,1.38969280369493,-0.400680808117106,-1.87282814976479,1.82394870451051,0.637864732838274,-0.141155946382493,0.0699950644281617,1.32568550595165,-0.412599258349398,0.14436832227506,-1.16507785388489,-2.16782049922428,0.24318371493798,0.258954871320764,-0.151966534521183],:scale)
9
- @y_log=Statsample::Vector.new([0,0,1,0,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,1,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0,1,1],:scale)
10
- @y_pois=Statsample::Vector.new([1,2,1,3,3,1,10,1,1,2,15,0,0,2,1,2,18,2,1,1,1,8,18,13,7,1,1,0,26,0,2,2,0,0,25,7,0,0,21,0,0,1,5,0,3,0,0,1,0,0],:scale)
11
- intercept=Statsample::Vector.new([1]*50,:scale)
12
- @df=Statsample::Dataset.new({"i"=>intercept,"x1"=>x1,"x2"=>x2})
13
- @glm=Statsample::Regression.glm(@df,@y_pois,:poisson)
14
-
15
- end
16
- should "report correct coefficientes as array" do
17
- assert_similar_vector(@glm.coefficients,[0.32993246633711,-0.586359358356708,1.28511323439258])
18
- end # should
19
- should "report correct coefficientes as hash" do
20
- assert_similar_hash(@glm.coefficients(:hash), {"i"=>0.32993246633711,"x1"=>-0.586359358356708, "x2"=>1.28511323439258})
21
- end # should
22
-
23
- end # context
24
- end # class
25
-