statsample-glm 0.0.1 → 0.1.0
- checksums.yaml +7 -0
- data/.gitignore +51 -0
- data/.rspec +1 -0
- data/.travis.yml +2 -9
- data/Gemfile +2 -20
- data/LICENSE.txt +1 -1
- data/README.rdoc +14 -11
- data/Rakefile +16 -24
- data/lib/statsample-glm.rb +1 -11
- data/lib/statsample-glm/{regression.rb → glm.rb} +13 -46
- data/lib/statsample-glm/glm/base.rb +99 -0
- data/lib/statsample-glm/glm/irls/base.rb +54 -0
- data/lib/statsample-glm/glm/irls/logistic.rb +46 -0
- data/lib/statsample-glm/glm/irls/poisson.rb +48 -0
- data/lib/statsample-glm/glm/logistic.rb +16 -0
- data/lib/statsample-glm/glm/mle/base.rb +157 -0
- data/lib/statsample-glm/glm/mle/logistic.rb +113 -0
- data/lib/statsample-glm/glm/mle/normal.rb +94 -0
- data/lib/statsample-glm/glm/mle/probit.rb +100 -0
- data/lib/statsample-glm/glm/normal.rb +17 -0
- data/lib/statsample-glm/glm/poisson.rb +17 -0
- data/lib/statsample-glm/glm/probit.rb +16 -0
- data/lib/statsample-glm/version.rb +5 -0
- data/spec/data/logistic.csv +51 -0
- data/spec/data/logistic_mle.csv +201 -0
- data/spec/data/normal.csv +30 -0
- data/spec/logistic_spec.rb +37 -0
- data/spec/normal_spec.rb +15 -0
- data/spec/poisson_spec.rb +32 -0
- data/spec/probit_spec.rb +19 -0
- data/spec/spec_helper.rb +50 -0
- data/statsample-glm.gemspec +35 -0
- metadata +71 -145
- data/VERSION +0 -1
- data/features/bio-statsample-glm.feature +0 -9
- data/features/step_definitions/bio-statsample-glm_steps.rb +0 -0
- data/features/support/env.rb +0 -15
- data/lib/statsample-glm/regression/logistic.rb +0 -108
- data/lib/statsample-glm/regression/poisson.rb +0 -90
- data/test/helper.rb +0 -87
- data/test/test_glm.rb +0 -4
- data/test/test_glm_logistic.rb +0 -23
- data/test/test_glm_poisson.rb +0 -25
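The diff below restructures the gem around a Statsample::GLM namespace with separate IRLS and MLE back ends (regression.rb becomes glm.rb). As rough orientation, here is a hypothetical usage sketch pieced together from the constructors shown in the hunks below; the data setup and the empty options hash are assumptions, since glm/base.rb and the public entry point in glm.rb are not among the hunks shown here:

    require 'statsample'
    require 'statsample-glm'

    # Toy data, using statsample's Array#to_scale and Hash#to_dataset conversions.
    x1 = [0.5, 1.2, -0.3, 2.1, 0.9].to_scale
    y  = [0, 1, 0, 1, 1].to_scale
    ds = {'x1' => x1}.to_dataset

    # Direct instantiation, matching the initialize signatures in this diff:
    # data_set, dependent, opts.
    glm = Statsample::GLM::Logistic.new ds, y, {}
    puts glm.to_s # => "Statsample::GLM::Logistic"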
data/lib/statsample-glm/glm/irls/logistic.rb
@@ -0,0 +1,46 @@
+require 'statsample-glm/glm/irls/base'
+
+module Statsample
+  module GLM
+    module IRLS
+      class Logistic < Statsample::GLM::IRLS::Base
+        def initialize data_set, dependent, opts={}
+          super data_set, dependent, opts
+        end
+
+        def to_s
+          "Statsample::GLM::Logistic"
+        end
+
+        protected
+
+        def measurement x, b
+          (x * b).map { |y| 1 / (1 + Math.exp(-y)) }
+        end
+
+        def weight x, b
+          mus = measurement(x, b).column_vectors.map(&:to_a).flatten
+          mus_intermediate = mus.map { |p| 1 - p }
+          weights = mus.zip(mus_intermediate).collect { |pair| pair.inject(:*) }
+
+          w_mat  = Matrix.I(weights.size)
+          w_enum = weights.to_enum
+          w_mat.map do |element|
+            # The diagonal consists of the first derivatives of the logit: mu * (1 - mu).
+            element.eql?(1) ? w_enum.next : element
+          end
+        end
+
+        def jacobian x, b, y
+          mu_flat = measurement(x, b).column_vectors.map(&:to_a).flatten
+          column_data = y.zip(mu_flat).map { |pair| pair.inject(:-) }
+
+          x.transpose * Matrix.column_vector(column_data)
+        end
+
+        def hessian x, b
+          (x.transpose * weight(x, b) * x).map { |element| -element }
+        end
+      end
+    end
+  end
+end
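A note on weight above: it builds the diagonal IRLS weight matrix W = diag(mu * (1 - mu)), the Bernoulli variance at each fitted mean. A self-contained sketch using only Ruby's stdlib Matrix class, with Matrix.diagonal in place of the identity-matrix/enumerator trick (equivalent here, and arguably simpler):

    require 'matrix'

    x = Matrix[[1.0, 0.5], [1.0, -1.2], [1.0, 2.0]] # 3 observations: intercept + 1 predictor
    b = Matrix.column_vector([0.1, 0.8])

    # measurement: elementwise logistic transform of x*b
    mu = (x * b).map { |v| 1 / (1 + Math.exp(-v)) }

    # weight: diagonal of mu * (1 - mu), the Bernoulli variance at each fitted mean
    w = Matrix.diagonal(*mu.column(0).map { |m| m * (1 - m) }.to_a)

    p w[0, 0] # first observation's weight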
data/lib/statsample-glm/glm/irls/poisson.rb
@@ -0,0 +1,48 @@
+require 'statsample-glm/glm/irls/base'
+
+module Statsample
+  module GLM
+    module IRLS
+      class Poisson < Statsample::GLM::IRLS::Base
+        def initialize data_set, dependent, opts={}
+          super data_set, dependent, opts
+        end
+
+        def to_s
+          "Statsample::GLM::Poisson"
+        end
+
+        protected
+
+        def measurement x, b
+          if @opts[:link] == :log
+            (x * b).map { |y| Math.exp(y) }
+          elsif @opts[:link] == :sqrt
+            (x * b).map { |y| y**2 }
+          end
+        end
+
+        def weight x, b
+          m = measurement(x, b).column_vectors.map(&:to_a).flatten
+
+          w_mat  = Matrix.I(m.size)
+          w_enum = m.to_enum
+
+          w_mat.map do |element|
+            # The diagonal consists of the fitted means, the Poisson variance function.
+            element.eql?(1) ? w_enum.next : element
+          end
+        end
+
+        def hessian x, b
+          (x.transpose * weight(x, b) * x).map { |element| -element }
+        end
+
+        def jacobian x, b, y
+          measurement_flat = measurement(x, b).column_vectors.map(&:to_a).flatten
+          column_data = y.zip(measurement_flat).collect { |pair| pair.inject(:-) }
+
+          x.transpose * Matrix.columns([column_data])
+        end
+      end
+    end
+  end
+end
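measurement, weight, jacobian and hessian combine into one Newton step per iteration, beta <- beta + (X'WX)^-1 X'(y - mu), since the Hessian is -X'WX. A runnable stdlib-only sketch of that loop for the log link, on made-up toy data (not from the gem's specs):

    require 'matrix'

    x = Matrix[[1.0, 0.0], [1.0, 1.0], [1.0, 2.0]] # intercept + one predictor
    y = Matrix.column_vector([1.0, 2.0, 6.0])
    b = Matrix.column_vector([0.0, 0.0])

    8.times do
      mu   = (x * b).map { |v| Math.exp(v) }        # log-link measurement
      w    = Matrix.diagonal(*mu.column(0).to_a)    # Poisson weights: the fitted means
      jac  = x.transpose * (y - mu)                 # score, as in jacobian above
      hess = (x.transpose * w * x).map { |v| -v }   # as in hessian above
      b    = b - hess.inverse * jac                 # Newton update
    end

    p b.column(0).to_a.map { |v| v.round(3) } # converges to the MLE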
data/lib/statsample-glm/glm/logistic.rb
@@ -0,0 +1,16 @@
+require 'statsample-glm/glm/base'
+
+module Statsample
+  module GLM
+    class Logistic < Statsample::GLM::Base
+
+      def initialize data_set, dependent, opts
+        super data_set, dependent, opts
+      end
+
+      def to_s
+        "Statsample::GLM::Logistic"
+      end
+    end
+  end
+end
data/lib/statsample-glm/glm/mle/base.rb
@@ -0,0 +1,157 @@
+module Statsample
+  module GLM
+    module MLE
+      class Base
+        attr_reader :coefficients, :iterations,
+          :fitted_mean_values, :residuals, :degree_of_freedom,
+          :log_likelihood
+
+        MIN_DIFF_PARAMETERS = 1e-2
+
+        def initialize data_set, dependent, opts
+          @opts = opts
+
+          @data_set  = data_set
+          @dependent = dependent
+
+          @stop_criteria  = :parameters
+          @var_cov_matrix = nil
+          @iterations     = nil
+          @parameters     = nil
+
+          x = @data_set.to_matrix
+          y = @dependent.to_matrix(:vertical)
+
+          @coefficients       = newton_raphson x, y
+          @log_likelihood     = _log_likelihood x, y, @coefficients
+          @fitted_mean_values = create_vector measurement(x, @coefficients).to_a.flatten
+          @residuals          = @dependent - @fitted_mean_values
+          @degree_of_freedom  = @dependent.count - x.column_size
+
+          # For Normal the last vector entry is sigma^2, which we
+          # don't want to return to the user.
+          @coefficients = create_vector(self.is_a?(Statsample::GLM::MLE::Normal) ?
+            @coefficients.to_a.flatten[0..-2] : @coefficients.to_a.flatten)
+        end
+
+        def standard_error
+          out = []
+
+          @data_set.fields.each_index do |i|
+            out << Math::sqrt(@var_cov_matrix[i, i])
+          end
+
+          out
+        end
+
+        # Newton-Raphson with automatic stopping criteria.
+        # Based on: Von Tessin, P. (2005). Maximum Likelihood Estimation With Java and Ruby.
+        #
+        # <tt>x</tt>:: matrix of independent (predictor) variables. Should have nxk dimensions
+        # <tt>y</tt>:: matrix of dependent values. Should have nx1 dimensions
+        # <tt>start_values</tt>:: matrix of starting coefficients. Should have kx1 dimensions
+        #
+        # The model itself (Normal, Logistic or Probit) is supplied by the
+        # derivative methods of the subclass.
+        def newton_raphson(x, y, start_values=nil)
+          if start_values.nil?
+            parameters = set_default_parameters(x)
+          else
+            parameters = start_values.dup
+          end
+          k = parameters.row_size
+
+          raise "n on y != n on x" if x.row_size != y.row_size
+          h  = nil
+          fd = nil
+
+          if @stop_criteria == :mle
+            old_likelihood = _log_likelihood(x, y, parameters)
+          else
+            old_parameters = parameters
+          end
+
+          @opts[:iterations].times do |i|
+            @iterations = i + 1
+
+            h = second_derivative(x, y, parameters)
+            raise "Hessian is singular!" if h.singular?
+
+            fd = first_derivative(x, y, parameters)
+            parameters = parameters - (h.inverse * fd)
+
+            if @stop_criteria == :parameters
+              flag = true
+              k.times do |j|
+                diff = (parameters[j, 0] - old_parameters[j, 0]) / parameters[j, 0]
+                flag = false if diff.abs >= MIN_DIFF_PARAMETERS
+              end
+
+              if flag
+                @var_cov_matrix = h.inverse * -1.0
+                return parameters
+              end
+              old_parameters = parameters
+            else
+              begin
+                new_likelihood = _log_likelihood(x, y, parameters)
+
+                if (new_likelihood < old_likelihood) or
+                    ((new_likelihood - old_likelihood) / new_likelihood).abs < @opts[:epsilon]
+                  @var_cov_matrix = h.inverse * -1.0
+                  break
+                end
+                old_likelihood = new_likelihood
+              rescue => e
+                puts "#{e}"
+              end
+            end
+          end
+          @parameters = parameters
+          parameters
+        end
+
+        private
+
+        # Calculate likelihood for matrices x and y, given b parameters
+        def likelihood x, y, b
+          prod = 1
+          x.row_size.times do |i|
+            xi    = Matrix.rows([x.row(i).to_a.collect(&:to_f)])
+            y_val = y[i, 0].to_f
+            prod  = prod * likelihood_i(xi, y_val, b)
+          end
+          prod
+        end
+
+        # Calculate log likelihood for matrices x and y, given b parameters
+        def _log_likelihood x, y, b
+          sum = 0
+          x.row_size.times do |i|
+            xi    = Matrix.rows([x.row(i).to_a.collect(&:to_f)])
+            y_val = y[i, 0].to_f
+            sum  += log_likelihood_i xi, y_val, b
+          end
+
+          sum
+        end
+
+        # Creates a kx1 zero matrix, where k = x.column_size
+        # (plus a starting sigma^2 of 0.1 for the Normal model).
+        def set_default_parameters x
+          fd = [0.0] * x.column_size
+
+          fd.push(0.1) if self.is_a? Statsample::GLM::MLE::Normal
+          Matrix.columns([fd])
+        end
+
+        def create_vector arr
+          Statsample::Vector.new(arr, :scale)
+        end
+      end
+    end
+  end
+end
+
+require 'statsample-glm/glm/mle/normal'
+require 'statsample-glm/glm/mle/logistic'
+require 'statsample-glm/glm/mle/probit'
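The stopping rule above declares convergence when every coefficient's relative change falls below MIN_DIFF_PARAMETERS (1e-2). A one-parameter sketch of the same Newton-Raphson-plus-stopping-rule idea, estimating a Poisson rate (where the MLE is known to be the sample mean):

    # Derivatives of the Poisson log likelihood in lambda:
    # dl/d(lambda) = sum(y)/lambda - n, d2l/d(lambda)2 = -sum(y)/lambda^2
    y  = [2, 3, 4]
    fd = ->(l) { y.sum / l - y.size }
    sd = ->(l) { -y.sum / l**2 }

    lam = 1.0
    100.times do
      new_lam = lam - fd.call(lam) / sd.call(lam) # Newton step
      diff    = (new_lam - lam) / new_lam
      lam     = new_lam
      break if diff.abs < 1e-2 # MIN_DIFF_PARAMETERS-style criterion
    end

    p lam # => ~3.0, the sample mean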
data/lib/statsample-glm/glm/mle/logistic.rb
@@ -0,0 +1,113 @@
+require 'statsample-glm/glm/mle/base'
+
+module Statsample
+  module GLM
+    module MLE
+      # Logistic MLE estimation.
+      # See Statsample::GLM for methods to generate a logit regression.
+      # Usage:
+      #
+      #   mle = Statsample::GLM::MLE::Logistic.new data_set, dependent, opts
+      #   beta       = mle.coefficients
+      #   likelihood = mle.log_likelihood
+      #   iterations = mle.iterations
+      #
+      class Logistic < Statsample::GLM::MLE::Base
+
+        protected
+
+        # F(B'Xi), the logistic function, clamped away from exactly
+        # 0 and 1 so the log likelihood stays finite.
+        def f(b, xi)
+          p_bx = (xi * b)[0, 0]
+          res  = 1.0 / (1.0 + Math::exp(-p_bx))
+          if res == 0.0
+            res = 1e-15
+          elsif res == 1.0
+            res = 0.999999999999999
+          end
+
+          res
+        end
+
+        # Likelihood for x_i vector, y_i scalar and b parameters
+        def likelihood_i(xi, yi, b)
+          (f(b, xi)**yi) * ((1 - f(b, xi))**(1 - yi))
+        end
+
+        # Log likelihood for x_i vector, y_i scalar and b parameters
+        def log_likelihood_i(xi, yi, b)
+          fbx = f(b, xi)
+          (yi.to_f * Math::log(fbx)) + ((1.0 - yi.to_f) * Math::log(1.0 - fbx))
+        end
+
+        # First derivative of the log-likelihood function
+        # x: Matrix (NxM)
+        # y: Matrix (Nx1)
+        # p: Matrix (Mx1)
+        def first_derivative(x, y, p)
+          raise "x.rows != y.rows" if x.row_size != y.row_size
+          raise "x.columns != p.rows" if x.column_size != p.row_size
+
+          n  = x.row_size
+          k  = x.column_size
+          fd = Array.new(k)
+          k.times { |i| fd[i] = [0.0] }
+
+          n.times do |i|
+            row = x.row(i).to_a
+            value1 = (1 - y[i, 0]) - p_plus(row, p)
+
+            k.times do |j|
+              fd[j][0] -= value1 * row[j]
+            end
+          end
+          Matrix.rows(fd, true)
+        end
+
+        # Second derivative of the log-likelihood function
+        # x: Matrix (NxM)
+        # y: Matrix (Nx1)
+        # p: Matrix (Mx1)
+        def second_derivative(x, y, p2)
+          raise "x.rows != y.rows" if x.row_size != y.row_size
+          raise "x.columns != p.rows" if x.column_size != p2.row_size
+
+          n  = x.row_size
+          k  = x.column_size
+          sd = Array.new(k)
+          k.times do |i|
+            arr = Array.new(k)
+            k.times { |j| arr[j] = 0.0 }
+            sd[i] = arr
+          end
+          n.times do |i|
+            row = x.row(i).to_a
+            p_m = p_minus(row, p2)
+            k.times do |j|
+              k.times do |l|
+                sd[j][l] -= p_m * (1 - p_m) * row[j] * row[l]
+              end
+            end
+          end
+          Matrix.rows(sd, true)
+        end
+
+        def measurement x, b
+          (x * b).map { |y| 1 / (1 + Math.exp(-y)) }
+        end
+
+        private
+
+        # sigmoid(x_row . p)
+        def p_minus(x_row, p)
+          value = 0.0
+          x_row.each_index { |i| value += x_row[i] * p[i, 0] }
+          1 / (1 + Math.exp(-value))
+        end
+
+        # 1 - sigmoid(x_row . p)
+        def p_plus(x_row, p)
+          value = 0.0
+          x_row.each_index { |i| value += x_row[i] * p[i, 0] }
+          1 / (1 + Math.exp(value))
+        end
+      end # Logistic
+    end # MLE
+  end # GLM
+end # Statsample
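The clamping in f above exists because log_likelihood_i takes Math.log of both f and 1 - f; a saturated sigmoid rounds to exactly 1.0 in floating point and would produce log(0) = -Infinity. A quick stdlib demonstration:

    v = 40.0 # a large linear predictor
    p 1.0 / (1.0 + Math.exp(-v))                # => 1.0 (saturated in Float)
    p Math.log(1 - 1.0 / (1.0 + Math.exp(-v)))  # => -Infinity without clamping
    p Math.log(1 - 0.999999999999999)           # => about -34.5 with the clamp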
data/lib/statsample-glm/glm/mle/normal.rb
@@ -0,0 +1,94 @@
+require 'statsample-glm/glm/mle/base'
+
+module Statsample
+  module GLM
+    module MLE
+      # Normal distribution MLE estimation.
+      # Usage:
+      #
+      #   mle = Statsample::GLM::MLE::Normal.new data_set, dependent, opts
+      #   beta       = mle.coefficients
+      #   likelihood = mle.log_likelihood
+      #   iterations = mle.iterations
+      class Normal < Statsample::GLM::MLE::Base
+
+        protected
+
+        # Fitted means: X*beta, dropping the trailing sigma^2 entry.
+        def measurement data_set, coefficients
+          data_set * coefficients[0..-2, 0]
+        end
+
+        # Total log likelihood for given X, Y and B matrices (overrides the Base version)
+        def _log_likelihood(x, y, b)
+          n      = x.row_size.to_f
+          sigma2 = b[b.row_size - 1, 0]
+          betas  = Matrix.columns([b.column(0).to_a[0...b.row_size - 1]])
+          e      = y - (x * betas)
+          last   = (1 / (2 * sigma2)) * e.t * e
+          (-(n / 2.0) * Math::log(2 * Math::PI)) - ((n / 2.0) * Math::log(sigma2)) - last[0, 0]
+        end
+
+        # First derivative for the normal model.
+        # p should be [k+1,1], because the last parameter is sigma^2
+        def first_derivative(x, y, p)
+          raise "x.rows != y.rows" if x.row_size != y.row_size
+          raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size
+
+          n = x.row_size
+          k = x.column_size
+          b = Array.new(k)
+
+          k.times { |i| b[i] = [p[i, 0]] }
+          beta   = Matrix.rows(b)
+          sigma2 = p[k, 0]
+          sigma4 = sigma2 * sigma2
+          e      = y - (x * beta)
+          xte    = x.transpose * e
+          ete    = e.transpose * e
+          # rows of the Jacobian
+          rows = Array.new(k + 1)
+          k.times { |i| rows[i] = [xte[i, 0] / sigma2] }
+          rows[k] = [ete[0, 0] / (2 * sigma4) - n / (2 * sigma2)]
+          Matrix.rows(rows, true)
+        end
+
+        # Second derivative for the normal model.
+        # p should be [k+1,1], because the last parameter is sigma^2
+        def second_derivative(x, y, p)
+          raise "x.rows != y.rows" if x.row_size != y.row_size
+          raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size
+
+          n = x.row_size
+          k = x.column_size
+          b = Array.new(k)
+          k.times { |i| b[i] = [p[i, 0]] }
+          beta   = Matrix.rows(b)
+          sigma2 = p[k, 0]
+          sigma4 = sigma2 * sigma2
+          sigma6 = sigma2 * sigma2 * sigma2
+          e      = y - (x * beta)
+          xtx    = x.transpose * x
+          xte    = x.transpose * e
+          ete    = e.transpose * e
+          # rows of the Hessian
+          rows = Array.new(k + 1)
+
+          k.times do |i|
+            row = Array.new(k + 1)
+            k.times do |j|
+              row[j] = -xtx[i, j] / sigma2
+            end
+            row[k]  = -xte[i, 0] / sigma4
+            rows[i] = row
+          end
+
+          last_row = Array.new(k + 1)
+          k.times do |i|
+            last_row[i] = -xte[i, 0] / sigma4
+          end
+
+          # d^2 l / d(sigma^2)^2 = n/(2*sigma^4) - e'e/sigma^6
+          last_row[k] = n / (2 * sigma4) - ete[0, 0] / sigma6
+          rows[k]     = last_row
+          Matrix.rows(rows, true)
+        end
+      end
+    end
+  end
+end
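For the normal model, the score equations solved iteratively above also have a closed form: setting the first derivative to zero gives the OLS estimates beta = (X'X)^-1 X'y and sigma^2 = e'e / n, which the Newton iteration should reproduce. A stdlib check on toy data:

    require 'matrix'

    x = Matrix[[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]]
    y = Matrix.column_vector([2.1, 3.9, 6.2, 7.8])

    beta   = (x.transpose * x).inverse * x.transpose * y
    e      = y - x * beta
    sigma2 = (e.transpose * e)[0, 0] / x.row_size

    p beta.column(0).to_a.map { |v| v.round(4) } # OLS coefficients
    p sigma2.round(4)                            # MLE variance (divides by n)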