statsample-glm 0.0.1 → 0.1.0
This diff shows the changes between publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +7 -0
- data/.gitignore +51 -0
- data/.rspec +1 -0
- data/.travis.yml +2 -9
- data/Gemfile +2 -20
- data/LICENSE.txt +1 -1
- data/README.rdoc +14 -11
- data/Rakefile +16 -24
- data/lib/statsample-glm.rb +1 -11
- data/lib/statsample-glm/{regression.rb → glm.rb} +13 -46
- data/lib/statsample-glm/glm/base.rb +99 -0
- data/lib/statsample-glm/glm/irls/base.rb +54 -0
- data/lib/statsample-glm/glm/irls/logistic.rb +46 -0
- data/lib/statsample-glm/glm/irls/poisson.rb +48 -0
- data/lib/statsample-glm/glm/logistic.rb +16 -0
- data/lib/statsample-glm/glm/mle/base.rb +157 -0
- data/lib/statsample-glm/glm/mle/logistic.rb +113 -0
- data/lib/statsample-glm/glm/mle/normal.rb +94 -0
- data/lib/statsample-glm/glm/mle/probit.rb +100 -0
- data/lib/statsample-glm/glm/normal.rb +17 -0
- data/lib/statsample-glm/glm/poisson.rb +17 -0
- data/lib/statsample-glm/glm/probit.rb +16 -0
- data/lib/statsample-glm/version.rb +5 -0
- data/spec/data/logistic.csv +51 -0
- data/spec/data/logistic_mle.csv +201 -0
- data/spec/data/normal.csv +30 -0
- data/spec/logistic_spec.rb +37 -0
- data/spec/normal_spec.rb +15 -0
- data/spec/poisson_spec.rb +32 -0
- data/spec/probit_spec.rb +19 -0
- data/spec/spec_helper.rb +50 -0
- data/statsample-glm.gemspec +35 -0
- metadata +71 -145
- data/VERSION +0 -1
- data/features/bio-statsample-glm.feature +0 -9
- data/features/step_definitions/bio-statsample-glm_steps.rb +0 -0
- data/features/support/env.rb +0 -15
- data/lib/statsample-glm/regression/logistic.rb +0 -108
- data/lib/statsample-glm/regression/poisson.rb +0 -90
- data/test/helper.rb +0 -87
- data/test/test_glm.rb +0 -4
- data/test/test_glm_logistic.rb +0 -23
- data/test/test_glm_poisson.rb +0 -25
data/lib/statsample-glm/glm/irls/logistic.rb
@@ -0,0 +1,46 @@
+require 'statsample-glm/glm/irls/base'
+
+module Statsample
+  module GLM
+    module IRLS
+      class Logistic < Statsample::GLM::IRLS::Base
+        def initialize data_set, dependent, opts={}
+          super data_set, dependent, opts
+        end
+
+        def to_s
+          "Statsample::GLM::Logistic"
+        end
+
+        protected
+
+        def measurement x, b
+          (x * b).map { |y| 1/(1 + Math.exp(-y)) }
+        end
+
+        def weight x, b
+          mus = measurement(x,b).column_vectors.map(&:to_a).flatten
+          mus_intermediate = mus.map { |p| 1 - p }
+          weights = mus.zip(mus_intermediate).collect { |x| x.inject(:*) }
+
+          w_mat = Matrix.I(weights.size)
+          w_enum = weights.to_enum
+          return w_mat.map do |x|
+            x.eql?(1) ? w_enum.next : x # diagonal consists of first derivatives of logit
+          end
+        end
+
+        def jacobian x, b, y
+          mu_flat = measurement(x,b).column_vectors.map(&:to_a).flatten
+          column_data = y.zip(mu_flat).map { |x| x.inject(:-) }
+
+          x.transpose * Matrix.column_vector(column_data)
+        end
+
+        def hessian x, b
+          (x.transpose * weight(x, b) * x).map { |x| -x }
+        end
+      end
+    end
+  end
+end
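The weight method above builds the diagonal IRLS weight matrix W = diag(mu_i * (1 - mu_i)) by mapping over an identity matrix and swapping each diagonal 1 for the next weight. Here is a minimal stand-alone sketch of the same computation using only Ruby's stdlib Matrix, with made-up numbers and Matrix.diagonal in place of the identity-map trick:

    require 'matrix'

    # Toy design matrix (3 observations, intercept + one predictor) and
    # coefficients; all values are illustrative only.
    x = Matrix[[1.0, 0.5], [1.0, -1.2], [1.0, 2.0]]
    b = Matrix.column_vector([0.1, 0.7])

    # Logistic means, as in measurement: mu_i = 1 / (1 + e^(-x_i.b))
    mus = (x * b).map { |eta| 1.0 / (1.0 + Math.exp(-eta)) }.column(0).to_a

    # IRLS weight matrix: mu_i * (1 - mu_i) on the diagonal, zeros elsewhere.
    w = Matrix.diagonal(*mus.map { |mu| mu * (1 - mu) })
    puts w # 3x3 diagonal matrix of the per-observation variances

Matrix.diagonal expresses the same result more directly; the identity-map version in the diff relies on every diagonal entry of Matrix.I being the Integer 1, which x.eql?(1) then detects.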
data/lib/statsample-glm/glm/irls/poisson.rb
@@ -0,0 +1,48 @@
+require 'statsample-glm/glm/irls/base'
+
+module Statsample
+  module GLM
+    module IRLS
+      class Poisson < Statsample::GLM::IRLS::Base
+        def initialize data_set, dependent, opts={}
+          super data_set, dependent, opts
+        end
+
+        def to_s
+          puts "Logistic Regression (Statsample::Regression::GLM::Logistic)"
+        end
+        protected
+
+        def measurement x, b
+          if @opts[:link] == :log
+            (x * b).map { |y| Math.exp(y) }
+          elsif @opts[:link] == :sqrt
+            (x * b).map { |y| y**2 }
+          end
+        end
+
+        def weight x, b
+          m = measurement(x,b).column_vectors.map(&:to_a).flatten
+
+          w_mat = Matrix.I(m.size)
+          w_enum = m.to_enum
+
+          return w_mat.map do |x|
+            x.eql?(1) ? w_enum.next : x # diagonal consists of first derivatives of logit
+          end
+        end
+
+        def hessian x, b
+          (x.transpose * weight(x, b) * x).map { |x| -x }
+        end
+
+        def jacobian x, b, y
+          measurement_flat = measurement(x,b).column_vectors.map(&:to_a).flatten
+          column_data = y.zip(measurement_flat).collect { |x| x.inject(:-) }
+
+          x.transpose * Matrix.columns([column_data])
+        end
+      end
+    end
+  end
+end
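measurement here switches on @opts[:link]: the Poisson mean is exp(eta) under the canonical :log link and eta^2 under :sqrt. A small stdlib-only sketch of both, with illustrative numbers:

    require 'matrix'

    # Toy linear predictor for two observations; values are made up.
    x   = Matrix[[1.0, 2.0], [1.0, 3.0]]
    b   = Matrix.column_vector([0.2, 0.4])
    eta = x * b # => Matrix[[1.0], [1.4]]

    # Poisson means under the two links handled by measurement:
    mu_log  = eta.map { |e| Math.exp(e) } # :log link,  mu = exp(eta)
    mu_sqrt = eta.map { |e| e**2 }        # :sqrt link, mu = eta^2

    puts mu_log  # => Matrix[[2.718...], [4.055...]]
    puts mu_sqrt # => Matrix[[1.0], [1.96]]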
data/lib/statsample-glm/glm/logistic.rb
@@ -0,0 +1,16 @@
+require 'statsample-glm/glm/base'
+
+module Statsample
+  module GLM
+    class Logistic < Statsample::GLM::Base
+
+      def initialize data_set, dependent, opts
+        super data_set, dependent, opts
+      end
+
+      def to_s
+        "Statsample::GLM::Logistic"
+      end
+    end
+  end
+end
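This public Statsample::GLM::Logistic class is a thin shell over Statsample::GLM::Base (the +99-line file in the list above), which holds the shared fitting logic and presumably routes to the IRLS or MLE implementations. A hedged usage sketch follows; to_scale and to_dataset are statsample helpers of this era, while the algorithm: :irls option key is only a guess at how Base selects an algorithm, not a confirmed part of the 0.1.0 API:

    require 'statsample-glm'

    # Hypothetical data; :algorithm below is an assumed, undocumented option.
    x1 = [0.5, 1.0, 1.5, 2.0, 2.5].to_scale
    y  = [0, 0, 1, 1, 1].to_scale
    ds = { 'x1' => x1 }.to_dataset

    glm = Statsample::GLM::Logistic.new ds, y, algorithm: :irls
    # glm.coefficients should then expose the fitted betas via Base.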
data/lib/statsample-glm/glm/mle/base.rb
@@ -0,0 +1,157 @@
+module Statsample
+
+  module GLM
+    module MLE
+      class Base
+        attr_reader :coefficients, :iterations,
+          :fitted_mean_values, :residuals, :degree_of_freedom,
+          :log_likelihood
+
+        MIN_DIFF_PARAMETERS=1e-2
+
+        def initialize data_set, dependent, opts
+          @opts = opts
+
+          @data_set = data_set
+          @dependent = dependent
+
+          @stop_criteria = :parameters
+          @var_cov_matrix = nil
+          @iterations = nil
+          @parameters = nil
+
+          x = @data_set.to_matrix
+          y = @dependent.to_matrix(:vertical)
+
+          @coefficients = newton_raphson x, y
+          @log_likelihood = _log_likelihood x, y, @coefficients
+          @fitted_mean_values = create_vector measurement(x, @coefficients).to_a.flatten
+          @residuals = @dependent - @fitted_mean_values
+          @degree_of_freedom = @dependent.count - x.column_size
+
+          # This jugad is done because the last vector index for Normal is sigma^2
+          # which we dont want to return to the user.
+          @coefficients = create_vector(self.is_a?(Statsample::GLM::MLE::Normal) ?
+            @coefficients.to_a.flatten[0..-2] : @coefficients.to_a.flatten)
+        end
+
+        def standard_error
+          out = []
+
+          @data_set.fields.each_index do |i|
+            out << Math::sqrt(@var_cov_matrix[i,i])
+          end
+
+          out
+        end
+
+        # Newton Raphson with automatic stopping criteria.
+        # Based on: Von Tessin, P. (2005). Maximum Likelihood Estimation With Java and Ruby
+        #
+        # <tt>x</tt>:: matrix of dependent variables. Should have nxk dimensions
+        # <tt>y</tt>:: matrix of independent values. Should have nx1 dimensions
+        # <tt>@m</tt>:: class for @ming. Could be Normal or Logistic
+        # <tt>start_values</tt>:: matrix of coefficients. Should have 1xk dimensions
+        def newton_raphson(x,y, start_values=nil)
+          # deep copy?
+          if start_values.nil?
+            parameters = set_default_parameters(x)
+          else
+            parameters = start_values.dup
+          end
+          k = parameters.row_size
+
+          raise "n on y != n on x" if x.row_size != y.row_size
+          h = nil
+          fd = nil
+
+          if @stop_criteria == :mle
+            old_likelihood = _log_likelihood(x, y, parameters)
+          else
+            old_parameters = parameters
+          end
+
+          @opts[:iterations].times do |i|
+            @iterations = i + 1
+
+            h = second_derivative(x,y,parameters)
+            if h.singular?
+              raise "Hessian is singular!"
+            end
+            fd = first_derivative(x,y,parameters)
+            parameters = parameters - (h.inverse * (fd))
+
+            if @stop_criteria == :parameters
+              flag = true
+              k.times do |j|
+                diff = ( parameters[j,0] - old_parameters[j,0] ) / parameters[j,0]
+                flag = false if diff.abs >= MIN_DIFF_PARAMETERS
+
+              end
+
+              if flag
+                @var_cov_matrix = h.inverse*-1.0
+                return parameters
+              end
+              old_parameters = parameters
+            else
+              begin
+                new_likelihood = _log_likelihood(x,y,parameters)
+
+                if(new_likelihood < old_likelihood) or ((new_likelihood - old_likelihood) / new_likelihood).abs < @opts[:epsilon]
+                  @var_cov_matrix = h.inverse*-1.0
+                  break;
+                end
+                old_likelihood = new_likelihood
+              rescue =>e
+                puts "#{e}"
+              end
+            end
+          end
+          @parameters = parameters
+          parameters
+        end
+
+        private
+        # Calculate likelihood for matrices x and y, given b parameters
+        def likelihood x,y,b
+          prod = 1
+          x.row_size.times{|i|
+            xi=Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
+            y_val=y[i,0].to_f
+            #fbx=f(b,x)
+            prod=prod*likelihood_i(xi, y_val ,b)
+          }
+          prod
+        end
+
+        # Calculate log likelihood for matrices x and y, given b parameters
+        def _log_likelihood x,y,b
+          sum = 0
+          x.row_size.times{|i|
+            xi = Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
+            y_val = y[i,0].to_f
+            sum += log_likelihood_i xi, y_val, b
+          }
+
+          sum
+        end
+
+        # Creates a zero matrix Mx1, with M=x.M
+        def set_default_parameters x
+          fd = [0.0] * x.column_size
+
+          fd.push(0.1) if self.is_a? Statsample::GLM::MLE::Normal
+          Matrix.columns([fd])
+        end
+
+        def create_vector arr
+          Statsample::Vector.new(arr, :scale)
+        end
+      end
+    end
+  end
+end
+require 'statsample/mle/normal'
+require 'statsample/mle/logit'
+require 'statsample/mle/probit'
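The core of newton_raphson above is the update parameters <- parameters - H^-1 * gradient, iterated until either the parameters stabilize (the default :parameters criterion) or, under :mle stopping, the log-likelihood stops improving. A self-contained sketch of that update on a toy one-parameter problem, maximizing log L(b) = -(b - 3)^2:

    require 'matrix'

    # Gradient and Hessian of the toy objective log L(b) = -(b - 3)^2,
    # whose maximum sits at b = 3.
    grad    = ->(b) { Matrix[[-2.0 * (b[0, 0] - 3.0)]] } # first derivative
    hessian = ->(b) { Matrix[[-2.0]] }                   # second derivative

    b = Matrix[[0.0]] # start value, like set_default_parameters
    10.times do
      h = hessian.call(b)
      raise "Hessian is singular!" if h.singular?
      b = b - h.inverse * grad.call(b) # the Newton-Raphson step
    end
    puts b[0, 0] # => 3.0

Because the toy objective is quadratic, a single Newton step lands exactly on the maximum; the real log-likelihoods in this gem need several iterations, capped by @opts[:iterations].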
data/lib/statsample-glm/glm/mle/logistic.rb
@@ -0,0 +1,113 @@
+require 'statsample-glm/glm/mle/base'
+
+module Statsample
+  module GLM
+    module MLE
+      # Logistic MLE estimation.
+      # See Statsample::Regression for methods to generate a logit regression.
+      # Usage:
+      #
+      # mle=Statsample::GLM::MLE::Logistic.new
+      # mle.newton_raphson(x,y)
+      # beta=mle.coefficients
+      # likelihood=mle.likelihood(x, y, beta)
+      # iterations=mle.iterations
+      #
+      class Logistic < Statsample::GLM::MLE::Base
+
+        protected
+        # F(B'Xi)
+        def f(b,xi)
+          p_bx = (xi*b)[0,0]
+          res = (1.0/(1.0+Math::exp(-p_bx)))
+          if res == 0.0
+            res = 1e-15
+          elsif res == 1.0
+            res = 0.999999999999999
+          end
+
+          res
+        end
+
+        # Likehood for x_i vector, y_i scalar and b parameters
+        def likelihood_i(xi,yi,b)
+          (f(b,xi)**yi)*((1-f(b,xi))**(1-yi))
+        end
+
+        # Log Likehood for x_i vector, y_i scalar and b parameters
+        def log_likelihood_i(xi,yi,b)
+          fbx = f(b,xi)
+          (yi.to_f*Math::log(fbx))+((1.0-yi.to_f)*Math::log(1.0-fbx))
+        end
+
+        # First derivative of log-likelihood function
+        # x: Matrix (NxM)
+        # y: Matrix (Nx1)
+        # p: Matrix (Mx1)
+        def first_derivative(x,y,p)
+          raise "x.rows != y.rows" if x.row_size != y.row_size
+          raise "x.columns != p.rows" if x.column_size != p.row_size
+
+          n = x.row_size
+          k = x.column_size
+          fd = Array.new(k)
+          k.times {|i| fd[i] = [0.0]}
+
+          n.times do |i|
+            row = x.row(i).to_a
+            value1 = (1 - y[i,0]) - p_plus(row,p)
+
+            k.times do |j|
+              fd[j][0] -= value1*row[j]
+            end
+          end
+          Matrix.rows(fd, true)
+        end
+        # Second derivative of log-likelihood function
+        # x: Matrix (NxM)
+        # y: Matrix (Nx1)
+        # p: Matrix (Mx1)
+        def second_derivative(x,y,p2)
+          raise "x.rows!=y.rows" if x.row_size!=y.row_size
+          raise "x.columns!=p.rows" if x.column_size!=p2.row_size
+          n = x.row_size
+          k = x.column_size
+          sd = Array.new(k)
+          k.times do |i|
+            arr = Array.new(k)
+            k.times{ |j| arr[j]=0.0}
+            sd[i] = arr
+          end
+          n.times do |i|
+            row = x.row(i).to_a
+            p_m = p_minus(row,p2)
+            k.times do |j|
+              k.times do |l|
+                sd[j][l] -= (p_m*(1-p_m)*row[j]*row[l])
+              end
+            end
+          end
+          Matrix.rows(sd, true)
+        end
+
+        def measurement x, b
+          (x * b).map { |y| 1/(1 + Math.exp(-y)) }
+        end
+
+        private
+        def p_minus(x_row,p)
+          value = 0.0;
+          x_row.each_index { |i| value += x_row[i]*p[i,0]}
+          1/(1+Math.exp(-value))
+        end
+
+        def p_plus(x_row,p)
+          value = 0.0;
+          x_row.each_index { |i| value += x_row[i]*p[i,0]}
+          1/(1+Math.exp(value))
+        end
+
+      end # Logistic
+    end # MLE
+  end # GLM
+end # Statsample
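For one observation, f computes the fitted probability F(B'Xi) = 1/(1 + e^(-x_i.b)), clamped away from exactly 0 and 1 so the logs stay finite, and log_likelihood_i evaluates y_i*log(f) + (1 - y_i)*log(1 - f). Worked through by hand for a single toy row:

    require 'matrix'

    # One toy row of the design matrix, its response, and coefficients.
    xi = Matrix[[1.0, 2.0]]
    b  = Matrix.column_vector([-1.0, 0.8])
    yi = 1.0

    # F(B'Xi): here x_i.b = -1.0 + 2.0*0.8 = 0.6
    f  = 1.0 / (1.0 + Math.exp(-(xi * b)[0, 0]))
    # Per-observation log-likelihood, as in log_likelihood_i:
    li = yi * Math.log(f) + (1.0 - yi) * Math.log(1.0 - f)

    puts f  # => 0.6456...
    puts li # => -0.4375... (log of the fitted probability, since y_i = 1)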
data/lib/statsample-glm/glm/mle/normal.rb
@@ -0,0 +1,94 @@
+require 'statsample-glm/glm/mle/base'
+
+module Statsample
+  module GLM
+    module MLE
+      # Normal Distribution MLE estimation.
+      # Usage:
+      # TODO : Document this properly
+      # mle=Statsample::MLE::Normal.new
+      # mle.newton_raphson(x,y)
+      # beta=mle.coefficients
+      # likelihood=mle.likelihood(x,y,beta)
+      # iterations=mle.iterations
+      class Normal < Statsample::GLM::MLE::Base
+
+        protected
+        def measurement data_set, coefficients
+          (data_set * coefficients[0..-2,0]).map { |xb| xb }
+        end
+        # Total MLE for given X, Y and B matrices (overridden over the Base version)
+        def _log_likelihood(x,y,b)
+          n = x.row_size.to_f
+          sigma2 = b[b.row_size-1,0]
+          betas = Matrix.columns([b.column(0).to_a[0...b.row_size-1]])
+          e = y - (x * betas)
+          last = (1 / (2*sigma2)) * e.t * e
+          (-(n / 2.0) * Math::log(2*Math::PI))-((n / 2.0)*Math::log(sigma2)) - last[0,0]
+        end
+        # First derivative for Normal Model.
+        # p should be [k+1,1], because the last parameter is sigma^2
+        def first_derivative(x,y,p)
+          raise "x.rows != y.rows" if x.row_size != y.row_size
+          raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size
+
+          n = x.row_size
+          k = x.column_size
+          b = Array.new(k)
+
+          k.times{|i| b[i]=[p[i,0]]}
+          beta = Matrix.rows(b)
+          sigma2 = p[k,0]
+          sigma4 = sigma2 * sigma2
+          e = y-(x * (beta))
+          xte = x.transpose*(e)
+          ete = e.transpose*(e)
+          #rows of the Jacobian
+          rows = Array.new(k+1)
+          k.times{|i| rows[i] = [xte[i,0] / sigma2]}
+          rows[k] = [ete[0,0] / (2*sigma4) - n / (2*sigma2)]
+          Matrix.rows(rows, true)
+        end
+
+        # second derivative for normal model
+        # p should be [k+1,1], because the last parameter is sigma^2
+        def second_derivative(x,y,p)
+          raise "x.rows != y.rows" if x.row_size != y.row_size
+          raise "x.columns + 1 != p.rows" if x.column_size + 1 != p.row_size
+          #n = x.row_size
+          k = x.column_size
+          b = Array.new(k)
+          k.times{|i| b[i] = [p[i,0]]}
+          beta = Matrix.rows(b)
+          sigma2 = p[k,0]
+          sigma4 = sigma2*sigma2
+          sigma6 = sigma2*sigma2*sigma2
+          e = y-(x*(beta))
+          xtx = x.transpose*(x)
+          xte = x.transpose*(e)
+          ete = e.transpose*(e)
+          #rows of the Hessian
+          rows = Array.new(k+1)
+
+          k.times do |i|
+            row = Array.new(k+1)
+            k.times do |j|
+              row[j] = -xtx[i,j] / sigma2
+            end
+            row[k] = -xte[i,0] / sigma4
+            rows[i] = row
+          end
+
+          last_row = Array.new(k+1)
+          k.times do |i|
+            last_row[i] = -xte[i,0] / sigma4
+          end
+
+          last_row[k] = 2*sigma4 - ete[0,0] / sigma6
+          rows[k] = last_row
+          Matrix.rows(rows, true)
+        end
+      end
+    end
+  end
+end
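In the Normal model the parameter vector carries sigma^2 as its last entry, so _log_likelihood splits b into coefficients and variance before evaluating -(n/2)*log(2*pi) - (n/2)*log(sigma^2) - e'e/(2*sigma^2). A stdlib-only sketch of that split and evaluation, with made-up data:

    require 'matrix'

    # Toy data: intercept + one predictor, three observations.
    x = Matrix[[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]]
    y = Matrix.column_vector([2.1, 2.9, 4.2])
    p = Matrix.column_vector([1.0, 1.0, 0.5]) # [beta_0, beta_1, sigma^2]

    # Split off the trailing sigma^2, as _log_likelihood does with b.
    k      = x.column_size
    beta   = Matrix.column_vector(p.column(0).to_a[0...k])
    sigma2 = p[k, 0]
    e      = y - x * beta # residuals
    n      = x.row_size.to_f

    ll = -(n / 2.0) * Math.log(2 * Math::PI) -
         (n / 2.0) * Math.log(sigma2) -
         (e.t * e)[0, 0] / (2 * sigma2)
    puts ll # => about -1.777 for these numbers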