statsample-glm 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/.gitignore +51 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +2 -9
  5. data/Gemfile +2 -20
  6. data/LICENSE.txt +1 -1
  7. data/README.rdoc +14 -11
  8. data/Rakefile +16 -24
  9. data/lib/statsample-glm.rb +1 -11
  10. data/lib/statsample-glm/{regression.rb → glm.rb} +13 -46
  11. data/lib/statsample-glm/glm/base.rb +99 -0
  12. data/lib/statsample-glm/glm/irls/base.rb +54 -0
  13. data/lib/statsample-glm/glm/irls/logistic.rb +46 -0
  14. data/lib/statsample-glm/glm/irls/poisson.rb +48 -0
  15. data/lib/statsample-glm/glm/logistic.rb +16 -0
  16. data/lib/statsample-glm/glm/mle/base.rb +157 -0
  17. data/lib/statsample-glm/glm/mle/logistic.rb +113 -0
  18. data/lib/statsample-glm/glm/mle/normal.rb +94 -0
  19. data/lib/statsample-glm/glm/mle/probit.rb +100 -0
  20. data/lib/statsample-glm/glm/normal.rb +17 -0
  21. data/lib/statsample-glm/glm/poisson.rb +17 -0
  22. data/lib/statsample-glm/glm/probit.rb +16 -0
  23. data/lib/statsample-glm/version.rb +5 -0
  24. data/spec/data/logistic.csv +51 -0
  25. data/spec/data/logistic_mle.csv +201 -0
  26. data/spec/data/normal.csv +30 -0
  27. data/spec/logistic_spec.rb +37 -0
  28. data/spec/normal_spec.rb +15 -0
  29. data/spec/poisson_spec.rb +32 -0
  30. data/spec/probit_spec.rb +19 -0
  31. data/spec/spec_helper.rb +50 -0
  32. data/statsample-glm.gemspec +35 -0
  33. metadata +71 -145
  34. data/VERSION +0 -1
  35. data/features/bio-statsample-glm.feature +0 -9
  36. data/features/step_definitions/bio-statsample-glm_steps.rb +0 -0
  37. data/features/support/env.rb +0 -15
  38. data/lib/statsample-glm/regression/logistic.rb +0 -108
  39. data/lib/statsample-glm/regression/poisson.rb +0 -90
  40. data/test/helper.rb +0 -87
  41. data/test/test_glm.rb +0 -4
  42. data/test/test_glm_logistic.rb +0 -23
  43. data/test/test_glm_poisson.rb +0 -25
require 'statsample-glm/glm/irls/base'

module Statsample
  module GLM
    module IRLS
      # Logistic regression fitted via Iteratively Reweighted Least Squares.
      #
      # Uses the logit link: mu = 1 / (1 + exp(-X*b)).
      class Logistic < Statsample::GLM::IRLS::Base
        # data_set  - predictor variables (matrix-convertible)
        # dependent - binary response variable
        # opts      - options hash forwarded to IRLS::Base
        def initialize data_set, dependent, opts={}
          super data_set, dependent, opts
        end

        def to_s
          "Statsample::GLM::Logistic"
        end

        protected

        # Mean response mu = g^-1(X*b) under the logit link.
        def measurement x, b
          (x * b).map { |xb| 1 / (1 + Math.exp(-xb)) }
        end

        # Diagonal weight matrix W with W[i,i] = mu_i * (1 - mu_i),
        # the Bernoulli variance function.
        #
        # Fixed: block parameters no longer shadow the method's `x`, and
        # Matrix.diagonal replaces the fragile O(n^2) trick of substituting
        # the 1s of an identity matrix via an enumerator.
        def weight x, b
          mus = measurement(x, b).column_vectors.map(&:to_a).flatten
          Matrix.diagonal(*mus.map { |mu| mu * (1 - mu) })
        end

        # Score vector X' * (y - mu).
        def jacobian x, b, y
          mus = measurement(x, b).column_vectors.map(&:to_a).flatten
          residuals = y.zip(mus).map { |yi, mui| yi - mui }

          x.transpose * Matrix.column_vector(residuals)
        end

        # Negative Fisher information -X' * W * X.
        def hessian x, b
          (x.transpose * weight(x, b) * x).map { |e| -e }
        end
      end
    end
  end
end
require 'statsample-glm/glm/irls/base'

module Statsample
  module GLM
    module IRLS
      # Poisson regression fitted via Iteratively Reweighted Least Squares.
      # Supported links (selected through opts[:link]): :log and :sqrt.
      class Poisson < Statsample::GLM::IRLS::Base
        # data_set  - predictor variables (matrix-convertible)
        # dependent - count response variable
        # opts      - options hash; :link selects the link function
        def initialize data_set, dependent, opts={}
          super data_set, dependent, opts
        end

        # Fixed: previously this `puts`-ed (and therefore returned nil from)
        # a "Logistic Regression" banner copied from the logistic class.
        def to_s
          "Statsample::GLM::Poisson"
        end

        protected

        # Mean response mu = g^-1(X*b) for the configured link.
        # Raises ArgumentError for an unknown link instead of silently
        # returning nil (which previously crashed later inside #weight).
        def measurement x, b
          case @opts[:link]
          when :log
            (x * b).map { |xb| Math.exp(xb) }
          when :sqrt
            (x * b).map { |xb| xb**2 }
          else
            raise ArgumentError, "Unsupported link: #{@opts[:link].inspect}"
          end
        end

        # Diagonal weight matrix W with W[i,i] = mu_i (Poisson variance).
        #
        # Fixed: Matrix.diagonal replaces the fragile O(n^2) trick of
        # substituting the 1s of an identity matrix via an enumerator.
        def weight x, b
          mus = measurement(x, b).column_vectors.map(&:to_a).flatten
          Matrix.diagonal(*mus)
        end

        # Negative Fisher information -X' * W * X.
        def hessian x, b
          (x.transpose * weight(x, b) * x).map { |e| -e }
        end

        # Score vector X' * (y - mu).
        def jacobian x, b, y
          mus = measurement(x, b).column_vectors.map(&:to_a).flatten
          residuals = y.zip(mus).map { |yi, mui| yi - mui }

          x.transpose * Matrix.column_vector(residuals)
        end
      end
    end
  end
end
require 'statsample-glm/glm/base'

module Statsample
  module GLM
    # Logistic regression model; estimation strategy (IRLS or MLE) is
    # dispatched by Statsample::GLM::Base from the options it receives.
    class Logistic < Statsample::GLM::Base

      # data_set  - predictor variables
      # dependent - binary response variable
      # opts      - options hash; now defaults to {} for consistency with
      #             the other GLM subclasses, so the two-argument call
      #             `Logistic.new(data_set, dependent)` also works.
      def initialize data_set, dependent, opts={}
        super data_set, dependent, opts
      end

      def to_s
        "Statsample::GLM::Logistic"
      end
    end
  end
end
module Statsample

  module GLM
    module MLE
      # Shared machinery for maximum-likelihood GLM estimation via
      # Newton-Raphson. Subclasses (Logistic, Probit, Normal) supply the
      # distribution-specific pieces: +measurement+, +likelihood_i+,
      # +log_likelihood_i+, +first_derivative+ and +second_derivative+.
      class Base
        attr_reader :coefficients, :iterations,
          :fitted_mean_values, :residuals, :degree_of_freedom,
          :log_likelihood

        # Relative per-parameter change below which Newton-Raphson is
        # considered converged (used with the :parameters stop criterion).
        MIN_DIFF_PARAMETERS=1e-2

        # Fits the model immediately on construction.
        #
        # data_set  - predictors convertible with #to_matrix (n x k)
        # dependent - response convertible with #to_matrix(:vertical) (n x 1)
        # opts      - hash; :iterations bounds the Newton steps, and
        #             :epsilon is read when the :mle stop criterion is used
        def initialize data_set, dependent, opts
          @opts = opts

          @data_set = data_set
          @dependent = dependent

          @stop_criteria = :parameters
          @var_cov_matrix = nil
          @iterations = nil
          @parameters = nil

          x = @data_set.to_matrix
          y = @dependent.to_matrix(:vertical)

          @coefficients = newton_raphson x, y
          @log_likelihood = _log_likelihood x, y, @coefficients
          @fitted_mean_values = create_vector measurement(x, @coefficients).to_a.flatten
          @residuals = @dependent - @fitted_mean_values
          @degree_of_freedom = @dependent.count - x.column_size

          # This jugad is done because the last vector index for Normal is sigma^2
          # which we dont want to return to the user.
          @coefficients = create_vector(self.is_a?(Statsample::GLM::MLE::Normal) ?
            @coefficients.to_a.flatten[0..-2] : @coefficients.to_a.flatten)
        end

        # Standard errors of the coefficients: square roots of the diagonal
        # of the variance-covariance matrix computed inside newton_raphson.
        def standard_error
          out = []

          @data_set.fields.each_index do |i|
            out << Math::sqrt(@var_cov_matrix[i,i])
          end

          out
        end

        # Newton Raphson with automatic stopping criteria.
        # Based on: Von Tessin, P. (2005). Maximum Likelihood Estimation With Java and Ruby
        #
        # <tt>x</tt>:: matrix of dependent variables. Should have nxk dimensions
        # <tt>y</tt>:: matrix of independent values. Should have nx1 dimensions
        # <tt>@m</tt>:: class for @ming. Could be Normal or Logistic
        # <tt>start_values</tt>:: matrix of coefficients. Should have 1xk dimensions
        def newton_raphson(x,y, start_values=nil)
          # deep copy?
          if start_values.nil?
            parameters = set_default_parameters(x)
          else
            parameters = start_values.dup
          end
          k = parameters.row_size

          raise "n on y != n on x" if x.row_size != y.row_size
          h = nil
          fd = nil

          if @stop_criteria == :mle
            old_likelihood = _log_likelihood(x, y, parameters)
          else
            old_parameters = parameters
          end

          @opts[:iterations].times do |i|
            @iterations = i + 1

            h = second_derivative(x,y,parameters)
            if h.singular?
              raise "Hessian is singular!"
            end
            fd = first_derivative(x,y,parameters)
            # Newton step: b_new = b - H^-1 * gradient
            parameters = parameters - (h.inverse * (fd))

            if @stop_criteria == :parameters
              # Converged when every parameter's relative change is small.
              flag = true
              k.times do |j|
                # NOTE(review): divides by parameters[j,0]; a parameter that
                # lands exactly on zero raises ZeroDivisionError — confirm
                # inputs make that impossible.
                diff = ( parameters[j,0] - old_parameters[j,0] ) / parameters[j,0]
                flag = false if diff.abs >= MIN_DIFF_PARAMETERS

              end

              if flag
                # Variance-covariance matrix = negative inverse Hessian.
                @var_cov_matrix = h.inverse*-1.0
                return parameters
              end
              old_parameters = parameters
            else
              begin
                new_likelihood = _log_likelihood(x,y,parameters)

                # Stop when the likelihood decreases or its relative
                # improvement falls below :epsilon.
                if(new_likelihood < old_likelihood) or ((new_likelihood - old_likelihood) / new_likelihood).abs < @opts[:epsilon]
                  @var_cov_matrix = h.inverse*-1.0
                  break;
                end
                old_likelihood = new_likelihood
              rescue =>e
                # NOTE(review): errors in the likelihood evaluation are only
                # printed and iteration continues — confirm this best-effort
                # behaviour is intended.
                puts "#{e}"
              end
            end
          end
          @parameters = parameters
          parameters
        end

        private
        # Calculate likelihood for matrices x and y, given b parameters
        def likelihood x,y,b
          prod = 1
          x.row_size.times{|i|
            xi=Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
            y_val=y[i,0].to_f
            #fbx=f(b,x)
            prod=prod*likelihood_i(xi, y_val ,b)
          }
          prod
        end

        # Calculate log likelihood for matrices x and y, given b parameters
        def _log_likelihood x,y,b
          sum = 0
          x.row_size.times{|i|
            xi = Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
            y_val = y[i,0].to_f
            sum += log_likelihood_i xi, y_val, b
          }

          sum
        end

        # Creates a zero matrix Mx1, with M=x.M
        def set_default_parameters x
          fd = [0.0] * x.column_size

          # Normal carries one extra trailing parameter for sigma^2,
          # started at 0.1 so the initial log-likelihood is finite.
          fd.push(0.1) if self.is_a? Statsample::GLM::MLE::Normal
          Matrix.columns([fd])
        end

        # Wraps a plain Array in a Statsample scale vector.
        def create_vector arr
          Statsample::Vector.new(arr, :scale)
        end
      end
    end
  end
end
# NOTE(review): these requires reference the old statsample MLE paths rather
# than statsample-glm/glm/mle/* — confirm they are intentional.
require 'statsample/mle/normal'
require 'statsample/mle/logit'
require 'statsample/mle/probit'
require 'statsample-glm/glm/mle/base'

module Statsample
  module GLM
    module MLE
      # Logistic MLE estimation.
      # See Statsample::Regression for methods to generate a logit regression.
      # Usage:
      #
      #   mle=Statsample::GLM::MLE::Logistic.new
      #   mle.newton_raphson(x,y)
      #   beta=mle.coefficients
      #   likelihood=mle.likelihood(x, y, beta)
      #   iterations=mle.iterations
      #
      class Logistic < Statsample::GLM::MLE::Base

        protected
        # F(B'Xi) — the logistic sigmoid of the linear predictor, clamped
        # to the open interval (0,1) so the log-likelihood never hits log(0).
        def f(b,xi)
          p_bx = (xi*b)[0,0]
          res = (1.0/(1.0+Math::exp(-p_bx)))
          if res == 0.0
            res = 1e-15
          elsif res == 1.0
            res = 0.999999999999999
          end

          res
        end

        # Likehood for x_i vector, y_i scalar and b parameters:
        # Bernoulli p^y * (1-p)^(1-y).
        def likelihood_i(xi,yi,b)
          (f(b,xi)**yi)*((1-f(b,xi))**(1-yi))
        end

        # Log Likehood for x_i vector, y_i scalar and b parameters:
        # y*log(p) + (1-y)*log(1-p).
        def log_likelihood_i(xi,yi,b)
          fbx = f(b,xi)
          (yi.to_f*Math::log(fbx))+((1.0-yi.to_f)*Math::log(1.0-fbx))
        end

        # First derivative of log-likelihood function
        # x: Matrix (NxM)
        # y: Matrix (Nx1)
        # p: Matrix (Mx1)
        #
        # Since p_plus = 1/(1+exp(xb)) = 1 - sigmoid(xb), value1 works out
        # to sigmoid - y, and the `-=` accumulation yields the usual score
        # sum_i (y_i - sigmoid_i) * x_i.
        def first_derivative(x,y,p)
          raise "x.rows != y.rows" if x.row_size != y.row_size
          raise "x.columns != p.rows" if x.column_size != p.row_size

          n = x.row_size
          k = x.column_size
          fd = Array.new(k)
          k.times {|i| fd[i] = [0.0]}

          n.times do |i|
            row = x.row(i).to_a
            value1 = (1 - y[i,0]) - p_plus(row,p)

            k.times do |j|
              fd[j][0] -= value1*row[j]
            end
          end
          Matrix.rows(fd, true)
        end
        # Second derivative of log-likelihood function
        # x: Matrix (NxM)
        # y: Matrix (Nx1)
        # p: Matrix (Mx1)
        #
        # Hessian entries: -sum_i p_i*(1-p_i) * x_ij * x_il, with
        # p_i the sigmoid of row i's linear predictor.
        def second_derivative(x,y,p2)
          raise "x.rows!=y.rows" if x.row_size!=y.row_size
          raise "x.columns!=p.rows" if x.column_size!=p2.row_size
          n = x.row_size
          k = x.column_size
          sd = Array.new(k)
          # Initialise a k x k matrix of zeros as nested arrays.
          k.times do |i|
            arr = Array.new(k)
            k.times{ |j| arr[j]=0.0}
            sd[i] = arr
          end
          n.times do |i|
            row = x.row(i).to_a
            p_m = p_minus(row,p2)
            k.times do |j|
              k.times do |l|
                sd[j][l] -= (p_m*(1-p_m)*row[j]*row[l])
              end
            end
          end
          Matrix.rows(sd, true)
        end

        # Mean response mu = sigmoid(X*b), used by Base for fitted values.
        def measurement x, b
          (x * b).map { |y| 1/(1 + Math.exp(-y)) }
        end

        private
        # sigmoid(x_row . p) = 1/(1+exp(-v))
        def p_minus(x_row,p)
          value = 0.0;
          x_row.each_index { |i| value += x_row[i]*p[i,0]}
          1/(1+Math.exp(-value))
        end

        # 1 - sigmoid(x_row . p) = 1/(1+exp(+v)); note the sign of the exponent.
        def p_plus(x_row,p)
          value = 0.0;
          x_row.each_index { |i| value += x_row[i]*p[i,0]}
          1/(1+Math.exp(value))
        end

      end # Logistic
    end # MLE
  end # GLM
end # Statsample
1
+ require 'statsample-glm/glm/mle/base'
2
+
3
+ module Statsample
4
+ module GLM
5
+ module MLE
6
+ # Normal Distribution MLE estimation.
7
+ # Usage:
8
+ # TODO : Document this properly
9
+ # mle=Statsample::MLE::Normal.new
10
+ # mle.newton_raphson(x,y)
11
+ # beta=mle.coefficients
12
+ # likelihood=mle.likelihood(x,y,beta)
13
+ # iterations=mle.iterations
14
+ class Normal < Statsample::GLM::MLE::Base
15
+
16
+ protected
17
+ def measurement data_set, coefficients
18
+ (data_set * coefficients[0..-2,0]).map { |xb| xb }
19
+ end
20
+ # Total MLE for given X, Y and B matrices (overridden over the Base version)
21
+ def _log_likelihood(x,y,b)
22
+ n = x.row_size.to_f
23
+ sigma2 = b[b.row_size-1,0]
24
+ betas = Matrix.columns([b.column(0). to_a[0...b.row_size-1]])
25
+ e = y - (x * betas)
26
+ last = (1 / (2*sigma2)) * e.t * e
27
+ (-(n / 2.0) * Math::log(2*Math::PI))-((n / 2.0)*Math::log(sigma2)) - last[0,0]
28
+ end
29
+ # First derivative for Normal Model.
30
+ # p should be [k+1,1], because the last parameter is sigma^2
31
+ def first_derivative(x,y,p)
32
+ raise "x.rows != y.rows" if x.row_size != y.row_size
33
+ raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size
34
+
35
+ n = x.row_size
36
+ k = x.column_size
37
+ b = Array.new(k)
38
+
39
+ k.times{|i| b[i]=[p[i,0]]}
40
+ beta = Matrix.rows(b)
41
+ sigma2 = p[k,0]
42
+ sigma4 = sigma2 * sigma2
43
+ e = y-(x * (beta))
44
+ xte = x.transpose*(e)
45
+ ete = e.transpose*(e)
46
+ #rows of the Jacobian
47
+ rows = Array.new(k+1)
48
+ k.times{|i| rows[i] = [xte[i,0] / sigma2]}
49
+ rows[k] = [ete[0,0] / (2*sigma4) - n / (2*sigma2)]
50
+ Matrix.rows(rows, true)
51
+ end
52
+
53
+ # second derivative for normal model
54
+ # p should be [k+1,1], because the last parameter is sigma^2
55
+ def second_derivative(x,y,p)
56
+ raise "x.rows != y.rows" if x.row_size != y.row_size
57
+ raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size
58
+ #n = x.row_size
59
+ k = x.column_size
60
+ b = Array.new(k)
61
+ k.times{|i| b[i] = [p[i,0]]}
62
+ beta = Matrix.rows(b)
63
+ sigma2 = p[k,0]
64
+ sigma4 = sigma2*sigma2
65
+ sigma6 = sigma2*sigma2*sigma2
66
+ e = y-(x*(beta))
67
+ xtx = x.transpose*(x)
68
+ xte = x.transpose*(e)
69
+ ete = e.transpose*(e)
70
+ #rows of the Hessian
71
+ rows = Array.new(k+1)
72
+
73
+ k.times do |i|
74
+ row = Array.new(k+1)
75
+ k.times do |j|
76
+ row[j] = -xtx[i,j] / sigma2
77
+ end
78
+ row[k] = -xte[i,0] / sigma4
79
+ rows[i] = row
80
+ end
81
+
82
+ last_row = Array.new(k+1)
83
+ k.times do |i|
84
+ last_row[i] = -xte[i,0] / sigma4
85
+ end
86
+
87
+ last_row[k] = 2*sigma4 - ete[0,0] / sigma6
88
+ rows[k] = last_row
89
+ Matrix.rows(rows, true)
90
+ end
91
+ end
92
+ end
93
+ end
94
+ end