statsample-glm 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +51 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +2 -9
  5. data/Gemfile +2 -20
  6. data/LICENSE.txt +1 -1
  7. data/README.rdoc +14 -11
  8. data/Rakefile +16 -24
  9. data/lib/statsample-glm.rb +1 -11
  10. data/lib/statsample-glm/{regression.rb → glm.rb} +13 -46
  11. data/lib/statsample-glm/glm/base.rb +99 -0
  12. data/lib/statsample-glm/glm/irls/base.rb +54 -0
  13. data/lib/statsample-glm/glm/irls/logistic.rb +46 -0
  14. data/lib/statsample-glm/glm/irls/poisson.rb +48 -0
  15. data/lib/statsample-glm/glm/logistic.rb +16 -0
  16. data/lib/statsample-glm/glm/mle/base.rb +157 -0
  17. data/lib/statsample-glm/glm/mle/logistic.rb +113 -0
  18. data/lib/statsample-glm/glm/mle/normal.rb +94 -0
  19. data/lib/statsample-glm/glm/mle/probit.rb +100 -0
  20. data/lib/statsample-glm/glm/normal.rb +17 -0
  21. data/lib/statsample-glm/glm/poisson.rb +17 -0
  22. data/lib/statsample-glm/glm/probit.rb +16 -0
  23. data/lib/statsample-glm/version.rb +5 -0
  24. data/spec/data/logistic.csv +51 -0
  25. data/spec/data/logistic_mle.csv +201 -0
  26. data/spec/data/normal.csv +30 -0
  27. data/spec/logistic_spec.rb +37 -0
  28. data/spec/normal_spec.rb +15 -0
  29. data/spec/poisson_spec.rb +32 -0
  30. data/spec/probit_spec.rb +19 -0
  31. data/spec/spec_helper.rb +50 -0
  32. data/statsample-glm.gemspec +35 -0
  33. metadata +71 -145
  34. data/VERSION +0 -1
  35. data/features/bio-statsample-glm.feature +0 -9
  36. data/features/step_definitions/bio-statsample-glm_steps.rb +0 -0
  37. data/features/support/env.rb +0 -15
  38. data/lib/statsample-glm/regression/logistic.rb +0 -108
  39. data/lib/statsample-glm/regression/poisson.rb +0 -90
  40. data/test/helper.rb +0 -87
  41. data/test/test_glm.rb +0 -4
  42. data/test/test_glm_logistic.rb +0 -23
  43. data/test/test_glm_poisson.rb +0 -25
@@ -0,0 +1,46 @@
1
require 'statsample-glm/glm/irls/base'

module Statsample
  module GLM
    module IRLS
      # Logistic regression fitted by Iteratively Reweighted Least Squares.
      # Link: logit. Supplies the mean, weight, jacobian and hessian
      # callbacks consumed by Statsample::GLM::IRLS::Base.
      class Logistic < Statsample::GLM::IRLS::Base
        def initialize data_set, dependent, opts={}
          super data_set, dependent, opts
        end

        def to_s
          "Statsample::GLM::Logistic"
        end

        protected

        # Mean response mu = 1 / (1 + e^(-X*b)) for each observation,
        # returned as a column matrix.
        def measurement x, b
          (x * b).map { |xb| 1/(1 + Math.exp(-xb)) }
        end

        # Diagonal weight matrix W whose entries are mu * (1 - mu),
        # the Bernoulli variance / first derivative of the logit mean.
        def weight x, b
          mus = measurement(x, b).column_vectors.map(&:to_a).flatten
          weights = mus.map { |mu| mu * (1 - mu) }

          # Matrix.diagonal builds the same matrix the old trick of mapping
          # an identity matrix through eql?(1) produced, without relying on
          # the off-diagonal zeros never comparing eql? to 1.
          Matrix.diagonal(*weights)
        end

        # Gradient (score) of the log-likelihood: X^T * (y - mu).
        def jacobian x, b, y
          mu_flat = measurement(x, b).column_vectors.map(&:to_a).flatten
          residuals = y.zip(mu_flat).map { |yi, mui| yi - mui }

          x.transpose * Matrix.column_vector(residuals)
        end

        # Hessian of the log-likelihood: -(X^T * W * X).
        def hessian x, b
          (x.transpose * weight(x, b) * x).map { |e| -e }
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
require 'statsample-glm/glm/irls/base'

module Statsample
  module GLM
    module IRLS
      # Poisson regression fitted by Iteratively Reweighted Least Squares.
      # Supports the :log (canonical) and :sqrt links, chosen via
      # opts[:link].
      class Poisson < Statsample::GLM::IRLS::Base
        def initialize data_set, dependent, opts={}
          super data_set, dependent, opts
        end

        # Returns the class description string, mirroring the other GLM
        # classes. (Previously this printed a copy-pasted *Logistic*
        # banner with puts and returned nil.)
        def to_s
          "Statsample::GLM::Poisson"
        end

        protected

        # Mean response for each observation under the configured link:
        #   :log  -> mu = exp(X*b)
        #   :sqrt -> mu = (X*b)^2
        def measurement x, b
          case @opts[:link]
          when :log
            (x * b).map { |xb| Math.exp(xb) }
          when :sqrt
            (x * b).map { |xb| xb**2 }
          else
            # Previously an unrecognised link silently returned nil and
            # produced a NoMethodError further downstream; fail fast with
            # a clear message instead.
            raise ArgumentError, "unsupported link #{@opts[:link].inspect}"
          end
        end

        # Diagonal weight matrix with the fitted means on the diagonal
        # (for the Poisson distribution, the variance equals the mean).
        def weight x, b
          m = measurement(x, b).column_vectors.map(&:to_a).flatten

          # Same matrix the old identity-matrix/eql?(1) substitution built.
          Matrix.diagonal(*m)
        end

        # Hessian of the log-likelihood: -(X^T * W * X).
        def hessian x, b
          (x.transpose * weight(x, b) * x).map { |e| -e }
        end

        # Gradient (score) of the log-likelihood: X^T * (y - mu).
        def jacobian x, b, y
          measurement_flat = measurement(x, b).column_vectors.map(&:to_a).flatten
          residuals = y.zip(measurement_flat).collect { |yi, mui| yi - mui }

          x.transpose * Matrix.columns([residuals])
        end
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
require 'statsample-glm/glm/base'

module Statsample
  module GLM
    # Logistic regression front-end. All of the fitting machinery lives in
    # Statsample::GLM::Base; this subclass exists so callers can pick the
    # logistic model by class and get a meaningful description string.
    class Logistic < Statsample::GLM::Base
      # Forwards straight through to the Base constructor.
      def initialize data_set, dependent, opts
        super data_set, dependent, opts
      end

      # Human-readable identification of the model class.
      def to_s
        "Statsample::GLM::Logistic"
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
module Statsample

  module GLM
    module MLE
      # Abstract base class for GLM fitting by maximum likelihood via
      # Newton-Raphson. Subclasses (Logistic, Probit, Normal) supply the
      # distribution-specific pieces: +measurement+, +likelihood_i+,
      # +log_likelihood_i+, +first_derivative+ and +second_derivative+.
      class Base
        attr_reader :coefficients, :iterations,
          :fitted_mean_values, :residuals, :degree_of_freedom,
          :log_likelihood

        # Relative per-parameter change below which the :parameters
        # stopping criterion declares convergence.
        MIN_DIFF_PARAMETERS=1e-2

        # data_set  :: predictors; must respond to #to_matrix (n x k).
        # dependent :: response; #to_matrix(:vertical) yields an n x 1 matrix.
        # opts      :: options hash; this class reads :iterations and :epsilon.
        def initialize data_set, dependent, opts
          @opts = opts

          @data_set = data_set
          @dependent = dependent

          @stop_criteria = :parameters
          @var_cov_matrix = nil
          @iterations = nil
          @parameters = nil

          x = @data_set.to_matrix
          y = @dependent.to_matrix(:vertical)

          @coefficients = newton_raphson x, y
          @log_likelihood = _log_likelihood x, y, @coefficients
          @fitted_mean_values = create_vector measurement(x, @coefficients).to_a.flatten
          @residuals = @dependent - @fitted_mean_values
          @degree_of_freedom = @dependent.count - x.column_size

          # This jugad is done because the last vector index for Normal is sigma^2
          # which we dont want to return to the user.
          @coefficients = create_vector(self.is_a?(Statsample::GLM::MLE::Normal) ?
            @coefficients.to_a.flatten[0..-2] : @coefficients.to_a.flatten)
        end

        # Standard errors of the coefficients: square roots of the diagonal
        # of the variance-covariance matrix computed during fitting.
        def standard_error
          out = []

          @data_set.fields.each_index do |i|
            out << Math::sqrt(@var_cov_matrix[i,i])
          end

          out
        end

        # Newton Raphson with automatic stopping criteria.
        # Based on: Von Tessin, P. (2005). Maximum Likelihood Estimation With Java and Ruby
        #
        # <tt>x</tt>:: matrix of predictors. Should have nxk dimensions
        # <tt>y</tt>:: matrix of response values. Should have nx1 dimensions
        # <tt>start_values</tt>:: optional matrix of starting coefficients
        #                         (kx1); defaults to zeros via
        #                         set_default_parameters.
        #
        # Iterates parameters := parameters - H^-1 * gradient until the
        # stopping criterion (@stop_criteria) is met or opts[:iterations]
        # passes have run. As a side effect sets @iterations, @parameters
        # and, on convergence, @var_cov_matrix = -H^-1.
        def newton_raphson(x,y, start_values=nil)
          # deep copy?
          if start_values.nil?
            parameters = set_default_parameters(x)
          else
            parameters = start_values.dup
          end
          k = parameters.row_size

          raise "n on y != n on x" if x.row_size != y.row_size
          h = nil
          fd = nil

          if @stop_criteria == :mle
            old_likelihood = _log_likelihood(x, y, parameters)
          else
            old_parameters = parameters
          end

          @opts[:iterations].times do |i|
            @iterations = i + 1

            h = second_derivative(x,y,parameters)
            if h.singular?
              raise "Hessian is singular!"
            end
            fd = first_derivative(x,y,parameters)
            parameters = parameters - (h.inverse * (fd))

            if @stop_criteria == :parameters
              # Converged when every parameter's relative change falls
              # below MIN_DIFF_PARAMETERS.
              # NOTE(review): diff divides by parameters[j,0] — a parameter
              # passing through zero would raise ZeroDivisionError/NaN here;
              # confirm inputs keep coefficients away from exact zero.
              flag = true
              k.times do |j|
                diff = ( parameters[j,0] - old_parameters[j,0] ) / parameters[j,0]
                flag = false if diff.abs >= MIN_DIFF_PARAMETERS

              end

              if flag
                # Variance-covariance matrix is the negative inverse Hessian.
                @var_cov_matrix = h.inverse*-1.0
                return parameters
              end
              old_parameters = parameters
            else
              begin
                new_likelihood = _log_likelihood(x,y,parameters)

                # Stop when the likelihood decreases or its relative
                # improvement drops below opts[:epsilon].
                if(new_likelihood < old_likelihood) or ((new_likelihood - old_likelihood) / new_likelihood).abs < @opts[:epsilon]
                  @var_cov_matrix = h.inverse*-1.0
                  break;
                end
                old_likelihood = new_likelihood
              rescue =>e
                # NOTE(review): a failed likelihood evaluation is only
                # printed and iteration continues with unchecked
                # parameters — confirm this best-effort behaviour is
                # intended rather than re-raising.
                puts "#{e}"
              end
            end
          end
          @parameters = parameters
          parameters
        end

        private
        # Calculate likelihood for matrices x and y, given b parameters:
        # the product over rows of the subclass-supplied likelihood_i.
        def likelihood x,y,b
          prod = 1
          x.row_size.times{|i|
            xi=Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
            y_val=y[i,0].to_f
            #fbx=f(b,x)
            prod=prod*likelihood_i(xi, y_val ,b)
          }
          prod
        end

        # Calculate log likelihood for matrices x and y, given b parameters:
        # the sum over rows of the subclass-supplied log_likelihood_i.
        def _log_likelihood x,y,b
          sum = 0
          x.row_size.times{|i|
            xi = Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
            y_val = y[i,0].to_f
            sum += log_likelihood_i xi, y_val, b
          }

          sum
        end

        # Creates a zero matrix Mx1, with M=x.M
        # (Normal gets one extra slot, initialised to 0.1, for sigma^2.)
        def set_default_parameters x
          fd = [0.0] * x.column_size

          fd.push(0.1) if self.is_a? Statsample::GLM::MLE::Normal
          Matrix.columns([fd])
        end

        # Wraps a plain Ruby array as a Statsample scale vector.
        def create_vector arr
          Statsample::Vector.new(arr, :scale)
        end
      end
    end
  end
end
# NOTE(review): these requires point at statsample's old MLE files rather
# than statsample-glm/glm/mle/* — confirm they are intentional.
require 'statsample/mle/normal'
require 'statsample/mle/logit'
require 'statsample/mle/probit'
@@ -0,0 +1,113 @@
1
require 'statsample-glm/glm/mle/base'

module Statsample
  module GLM
    module MLE
      # Logistic MLE estimation.
      # See Statsample::Regression for methods to generate a logit regression.
      # Usage:
      #
      #   mle=Statsample::GLM::MLE::Logistic.new
      #   mle.newton_raphson(x,y)
      #   beta=mle.coefficients
      #   likelihood=mle.likelihood(x, y, beta)
      #   iterations=mle.iterations
      #
      class Logistic < Statsample::GLM::MLE::Base

        protected
        # F(B'Xi): logistic CDF of the linear predictor for a single row.
        # The result is clamped away from exactly 0.0 and 1.0 so that
        # log_likelihood_i never evaluates Math::log(0).
        def f(b,xi)
          p_bx = (xi*b)[0,0]
          res = (1.0/(1.0+Math::exp(-p_bx)))
          if res == 0.0
            res = 1e-15
          elsif res == 1.0
            res = 0.999999999999999
          end

          res
        end

        # Likelihood for x_i vector, y_i scalar and b parameters:
        # the Bernoulli probability f^y * (1-f)^(1-y) of observing y_i.
        def likelihood_i(xi,yi,b)
          (f(b,xi)**yi)*((1-f(b,xi))**(1-yi))
        end

        # Log likelihood for x_i vector, y_i scalar and b parameters:
        # y*log(f) + (1-y)*log(1-f).
        def log_likelihood_i(xi,yi,b)
          fbx = f(b,xi)
          (yi.to_f*Math::log(fbx))+((1.0-yi.to_f)*Math::log(1.0-fbx))
        end

        # First derivative (score) of the log-likelihood function.
        # x: Matrix (NxM)
        # y: Matrix (Nx1)
        # p: Matrix (Mx1)
        # Note: p_plus(row,p) returns 1/(1+e^{x.b}) = 1 - F(b'x), so
        # value1 = (1 - y) - p_plus = F(b'x) - y, and the `-=` below
        # accumulates fd = sum_i (y_i - F(b'x_i)) * x_i, i.e. X^T (y - p).
        def first_derivative(x,y,p)
          raise "x.rows != y.rows" if x.row_size != y.row_size
          raise "x.columns != p.rows" if x.column_size != p.row_size

          n = x.row_size
          k = x.column_size
          fd = Array.new(k)
          k.times {|i| fd[i] = [0.0]}

          n.times do |i|
            row = x.row(i).to_a
            value1 = (1 - y[i,0]) - p_plus(row,p)

            k.times do |j|
              fd[j][0] -= value1*row[j]
            end
          end
          Matrix.rows(fd, true)
        end
        # Second derivative (Hessian) of the log-likelihood function:
        # -sum_i p_i (1 - p_i) x_i x_i^T, with p_i = F(b'x_i).
        # x: Matrix (NxM)
        # y: Matrix (Nx1)
        # p2: Matrix (Mx1)
        def second_derivative(x,y,p2)
          raise "x.rows!=y.rows" if x.row_size!=y.row_size
          raise "x.columns!=p.rows" if x.column_size!=p2.row_size
          n = x.row_size
          k = x.column_size
          sd = Array.new(k)
          # Initialise a k x k matrix of zeros.
          k.times do |i|
            arr = Array.new(k)
            k.times{ |j| arr[j]=0.0}
            sd[i] = arr
          end
          n.times do |i|
            row = x.row(i).to_a
            p_m = p_minus(row,p2)
            k.times do |j|
              k.times do |l|
                sd[j][l] -= (p_m*(1-p_m)*row[j]*row[l])
              end
            end
          end
          Matrix.rows(sd, true)
        end

        # Vector of fitted probabilities 1/(1+e^{-X*b}) for all rows.
        def measurement x, b
          (x * b).map { |y| 1/(1 + Math.exp(-y)) }
        end

        private
        # Logistic CDF of row.p — i.e. F(b'x) — despite the "minus" name.
        def p_minus(x_row,p)
          value = 0.0;
          x_row.each_index { |i| value += x_row[i]*p[i,0]}
          1/(1+Math.exp(-value))
        end

        # Complement 1 - F(b'x) = 1/(1+e^{row.p}) — despite the "plus" name.
        def p_plus(x_row,p)
          value = 0.0;
          x_row.each_index { |i| value += x_row[i]*p[i,0]}
          1/(1+Math.exp(value))
        end

      end # Logistic
    end # MLE
  end # GLM
end # Statsample
@@ -0,0 +1,94 @@
1
require 'statsample-glm/glm/mle/base'

module Statsample
  module GLM
    module MLE
      # Normal (Gaussian) linear model estimated by maximum likelihood.
      # The parameter matrix carries the k regression coefficients followed
      # by sigma^2 (the error variance) as its last entry.
      # Usage:
      #   mle=Statsample::MLE::Normal.new
      #   mle.newton_raphson(x,y)
      #   beta=mle.coefficients
      #   likelihood=mle.likelihood(x,y,beta)
      #   iterations=mle.iterations
      class Normal < Statsample::GLM::MLE::Base

        protected
        # Fitted mean X*beta; the trailing sigma^2 entry of +coefficients+
        # is excluded from the product.
        def measurement data_set, coefficients
          (data_set * coefficients[0..-2,0]).map { |xb| xb }
        end

        # Gaussian log-likelihood for X, Y and the stacked [beta; sigma^2]
        # parameter matrix b (overrides the per-row summation in Base):
        #   -(n/2) log(2*pi) - (n/2) log(sigma^2) - e'e / (2*sigma^2)
        def _log_likelihood(x,y,b)
          n = x.row_size.to_f
          sigma2 = b[b.row_size-1,0]
          betas = Matrix.columns([b.column(0).to_a[0...b.row_size-1]])
          e = y - (x * betas)
          last = (1 / (2*sigma2)) * e.t * e
          (-(n / 2.0) * Math::log(2*Math::PI))-((n / 2.0)*Math::log(sigma2)) - last[0,0]
        end

        # First derivative (score) for the Normal model.
        # p should be [k+1,1], because the last parameter is sigma^2.
        # Rows 0..k-1: X'e / sigma^2; row k: e'e/(2*sigma^4) - n/(2*sigma^2).
        def first_derivative(x,y,p)
          raise "x.rows != y.rows" if x.row_size != y.row_size
          raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size

          n = x.row_size
          k = x.column_size
          b = Array.new(k)

          k.times{|i| b[i]=[p[i,0]]}
          beta = Matrix.rows(b)
          sigma2 = p[k,0]
          sigma4 = sigma2 * sigma2
          e = y-(x * (beta))
          xte = x.transpose*(e)
          ete = e.transpose*(e)
          # rows of the Jacobian
          rows = Array.new(k+1)
          k.times{|i| rows[i] = [xte[i,0] / sigma2]}
          rows[k] = [ete[0,0] / (2*sigma4) - n / (2*sigma2)]
          Matrix.rows(rows, true)
        end

        # Second derivative (Hessian) for the Normal model.
        # p should be [k+1,1], because the last parameter is sigma^2.
        def second_derivative(x,y,p)
          raise "x.rows != y.rows" if x.row_size != y.row_size
          raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size
          n = x.row_size
          k = x.column_size
          b = Array.new(k)
          k.times{|i| b[i] = [p[i,0]]}
          beta = Matrix.rows(b)
          sigma2 = p[k,0]
          sigma4 = sigma2*sigma2
          sigma6 = sigma2*sigma2*sigma2
          e = y-(x*(beta))
          xtx = x.transpose*(x)
          xte = x.transpose*(e)
          ete = e.transpose*(e)
          # rows of the Hessian
          rows = Array.new(k+1)

          # d^2 L / (d beta_i d beta_j) = -X'X / sigma^2;
          # d^2 L / (d beta_i d sigma^2) = -X'e / sigma^4.
          k.times do |i|
            row = Array.new(k+1)
            k.times do |j|
              row[j] = -xtx[i,j] / sigma2
            end
            row[k] = -xte[i,0] / sigma4
            rows[i] = row
          end

          last_row = Array.new(k+1)
          k.times do |i|
            last_row[i] = -xte[i,0] / sigma4
          end

          # d^2 L / d(sigma^2)^2 = n/(2*sigma^4) - e'e/sigma^6.
          # BUG FIX: this entry previously read `2*sigma4 - ete[0,0]/sigma6`
          # (with n commented out), which is not the second derivative of
          # the Gaussian log-likelihood and corrupts the Newton step and
          # the variance-covariance matrix.
          last_row[k] = n / (2*sigma4) - ete[0,0] / sigma6
          rows[k] = last_row
          Matrix.rows(rows, true)
        end
      end
    end
  end
end