statsample 0.3.4 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. data/History.txt +8 -0
  2. data/Manifest.txt +20 -2
  3. data/data/crime.txt +47 -0
  4. data/data/test_binomial.csv +201 -0
  5. data/demo/distribution_t.rb +2 -2
  6. data/demo/regression.rb +2 -1
  7. data/lib/distribution.rb +8 -0
  8. data/lib/distribution/chisquare.rb +24 -0
  9. data/lib/distribution/f.rb +25 -0
  10. data/lib/distribution/normal.rb +25 -0
  11. data/lib/distribution/t.rb +22 -0
  12. data/lib/matrix_extension.rb +78 -0
  13. data/lib/statistics2.rb +531 -0
  14. data/lib/statsample.rb +12 -9
  15. data/lib/statsample/anova.rb +1 -5
  16. data/lib/statsample/bivariate.rb +24 -20
  17. data/lib/statsample/combination.rb +14 -4
  18. data/lib/statsample/converters.rb +17 -1
  19. data/lib/statsample/dataset.rb +66 -10
  20. data/lib/statsample/dominanceanalysis/bootstrap.rb +1 -3
  21. data/lib/statsample/graph/gdchart.rb +2 -3
  22. data/lib/statsample/graph/svggraph.rb +8 -4
  23. data/lib/statsample/mle.rb +137 -0
  24. data/lib/statsample/mle/logit.rb +95 -0
  25. data/lib/statsample/mle/normal.rb +83 -0
  26. data/lib/statsample/mle/probit.rb +93 -0
  27. data/lib/statsample/regression.rb +3 -1
  28. data/lib/statsample/regression/binomial.rb +65 -0
  29. data/lib/statsample/regression/binomial/logit.rb +13 -0
  30. data/lib/statsample/regression/binomial/probit.rb +13 -0
  31. data/lib/statsample/regression/multiple.rb +61 -58
  32. data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
  33. data/lib/statsample/srs.rb +5 -5
  34. data/lib/statsample/vector.rb +129 -59
  35. data/test/test_anova.rb +0 -5
  36. data/test/test_dataset.rb +13 -1
  37. data/test/test_distribution.rb +57 -0
  38. data/test/test_gsl.rb +22 -0
  39. data/test/test_logit.rb +22 -0
  40. data/test/test_mle.rb +140 -0
  41. data/test/test_r.rb +9 -0
  42. data/test/test_regression.rb +12 -4
  43. data/test/test_srs.rb +0 -4
  44. data/test/test_stata.rb +11 -0
  45. data/test/test_statistics.rb +0 -15
  46. data/test/test_vector.rb +11 -0
  47. metadata +28 -4
  48. data/lib/statsample/chidistribution.rb +0 -39
  49. data/lib/statsample/regression/logit.rb +0 -35
module Statsample
  # Module for MLE (maximum likelihood) calculations.
  # Use a subclass of BaseMLE for specific MLE model estimation.
  # Usage:
  #
  #   mle=Statsample::MLE::Probit.new
  #   mle.newton_raphson(x,y)
  #   beta=mle.parameters
  #   likehood=mle.likehood(x,y,beta)
  #   iterations=mle.iterations
  #
  module MLE
    # Base class for MLE estimation engines.
    # Subclasses must implement likehood_i, log_likehood_i,
    # first_derivative and second_derivative.
    class BaseMLE
      # Print progress information to +output+ when true
      attr_accessor :verbose
      # IO stream for verbose output (STDOUT by default)
      attr_accessor :output
      # Stopping criteria for newton_raphson. Could be :parameters or :mle
      attr_accessor :stop_criteria
      # Variance - Covariance matrix
      attr_reader :var_cov_matrix
      # Iterations
      attr_reader :iterations
      # Parameters (beta coefficients)
      attr_reader :parameters
      # Maximum number of Newton-Raphson iterations
      ITERATIONS=100
      # Minimum relative change of log-likelihood to keep iterating (:mle)
      MIN_DIFF=1e-5
      # Minimum relative change of any parameter to keep iterating (:parameters)
      MIN_DIFF_PARAMETERS=1e-2
      # Model should be a MLE subclass
      def initialize()
        @verbose       = false
        @output        = STDOUT
        @stop_criteria = :parameters
        @var_cov_matrix = nil
        @iterations    = nil
        @parameters    = nil
      end
      # Calculate likelihood for matrices x and y, given b parameters:
      # the product of likehood_i over every observation.
      def likehood(x,y,b)
        prod=1
        x.row_size.times{|i|
          xi=Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
          y_val=y[i,0].to_f
          # NOTE: removed a dead `fbx=f(b,x)` here — it evaluated f over the
          # whole X matrix on every row and its result was never used.
          prod=prod*likehood_i(xi,y_val,b)
        }
        prod
      end
      # Calculate log likelihood for matrices x and y, given b parameters:
      # the sum of log_likehood_i over every observation.
      def log_likehood(x,y,b)
        sum=0
        x.row_size.times{|i|
          xi=Matrix.rows([x.row(i).to_a.collect{|v| v.to_f}])
          y_val=y[i,0].to_f
          sum+=log_likehood_i(xi,y_val,b)
        }
        sum
      end
      # Creates a zero matrix Mx1, with M=x.column_size, used as default
      # start values. The Normal model gets an extra 0.1 row for sigma^2.
      def set_default_parameters(x)
        fd=x.column_size.times.collect{|i| 0.0}
        fd.push(0.1) if self.is_a? Statsample::MLE::Normal
        Matrix.columns([fd])
      end

      # Newton Raphson with automatic stopping criteria.
      # Based on: Von Tessin, P. (2005). Maximum Likelihood Estimation With Java and Ruby
      #
      # <tt>x</tt>:: matrix of independent variables. Should have nxk dimensions
      # <tt>y</tt>:: matrix of dependent values. Should have nx1 dimensions
      # <tt>start_values</tt>:: matrix of initial coefficients. Should have kx1 dimensions
      def newton_raphson(x,y, start_values=nil)
        if start_values.nil?
          parameters=set_default_parameters(x)
        else
          parameters = start_values.dup
        end
        k=parameters.row_size
        raise "n on y != n on x" if x.row_size!=y.row_size
        h=nil
        fd=nil
        if @stop_criteria==:mle
          old_likehood=log_likehood(x, y, parameters)
        else
          old_parameters=parameters
        end
        ITERATIONS.times do |i|
          @iterations=i+1
          puts "Set #{i}" if @verbose
          h = second_derivative(x,y,parameters)
          raise "Hessian is singular!" if h.singular?
          fd = first_derivative(x,y,parameters)
          # Newton step: b_{t+1} = b_t - H^-1 * gradient
          parameters = parameters-(h.inverse*(fd))
          if @stop_criteria==:parameters
            flag=true
            k.times do |j|
              diff= ( parameters[j,0] - old_parameters[j,0] ) / parameters[j,0]
              flag=false if diff.abs >= MIN_DIFF_PARAMETERS
              @output.puts "Parameters #{j}: #{diff}" if @verbose
            end
            if flag
              @var_cov_matrix = h.inverse*-1.0
              # BUG FIX: @parameters was not assigned on this early-return
              # path, so the +parameters+ reader returned nil after a
              # successful :parameters-criteria convergence.
              @parameters=parameters
              return parameters
            end
            old_parameters=parameters
          else
            begin
              new_likehood = log_likehood(x,y,parameters)
              @output.puts "[#{i}]Log-MLE:#{new_likehood} (Diff:#{(new_likehood-old_likehood) / new_likehood})" if @verbose
              if(new_likehood < old_likehood) or ((new_likehood - old_likehood) / new_likehood).abs < MIN_DIFF
                @var_cov_matrix = h.inverse*-1.0
                break
              end
              old_likehood=new_likehood
            rescue =>e
              # best-effort: report and keep iterating (e.g. transient
              # numeric failure inside log_likehood)
              puts "#{e}"
            end
          end
        end
        @parameters=parameters
        parameters
      end
    end
  end
end
134
+
135
+ require 'statsample/mle/normal'
136
+ require 'statsample/mle/logit'
137
+ require 'statsample/mle/probit'
module Statsample
  module MLE
    # Logit MLE estimation.
    # Usage:
    #
    #   mle=Statsample::MLE::Logit.new
    #   mle.newton_raphson(x,y)
    #   beta=mle.parameters
    #   likehood=mle.likehood(x,y,beta)
    #   iterations=mle.iterations
    #
    class Logit < BaseMLE
      # F(B'Xi): logistic CDF of the linear predictor, clamped away from
      # exactly 0.0 and 1.0 so later logarithms stay finite.
      def f(b,xi)
        linear = (xi*b)[0,0]
        value = (1.0/(1.0+Math::exp(-linear)))
        if value==0.0
          1e-15
        elsif value==1.0
          0.999999999999999
        else
          value
        end
      end
      # Likelihood for x_i vector, y_i scalar and b parameters
      def likehood_i(xi,yi,b)
        (f(b,xi)**yi)*((1-f(b,xi))**(1-yi))
      end
      # Log likelihood for x_i vector, y_i scalar and b parameters
      def log_likehood_i(xi,yi,b)
        prob=f(b,xi)
        (yi.to_f*Math::log(prob))+((1.0-yi.to_f)*Math::log(1.0-prob))
      end

      # First derivative (gradient) of the log-likelihood function.
      # Note (1-y) - p_plus == p_minus - y, so the subtraction below
      # accumulates the usual (y - p) * x gradient.
      # x: Matrix (NxM)
      # y: Matrix (Nx1)
      # p: Matrix (Mx1)
      def first_derivative(x,y,p)
        raise "x.rows!=y.rows" if x.row_size!=y.row_size
        raise "x.columns!=p.rows" if x.column_size!=p.row_size
        cols = x.column_size
        gradient = Array.new(cols) { [0.0] }
        x.row_size.times do |obs|
          row = x.row(obs).to_a
          shared = (1-y[obs,0]) -p_plus(row,p)
          cols.times do |c|
            gradient[c][0] -= shared*row[c]
          end
        end
        Matrix.rows(gradient, true)
      end
      # Second derivative (Hessian) of the log-likelihood function.
      # x: Matrix (NxM)
      # y: Matrix (Nx1)
      # p: Matrix (Mx1)
      def second_derivative(x,y,p)
        raise "x.rows!=y.rows" if x.row_size!=y.row_size
        raise "x.columns!=p.rows" if x.column_size!=p.row_size
        cols = x.column_size
        hessian = Array.new(cols) { Array.new(cols, 0.0) }
        x.row_size.times do |obs|
          row = x.row(obs).to_a
          prob = p_minus(row,p)
          cols.times do |a|
            cols.times do |b|
              hessian[a][b] -= prob *(1-prob)*row[a]*row[b]
            end
          end
        end
        Matrix.rows(hessian, true)
      end
      private
      # Predicted probability 1/(1+e^-(x.p)) for a single row
      def p_minus(x_row,p)
        total = x_row.each_index.inject(0.0) { |acc, i| acc + x_row[i]*p[i,0] }
        1/(1+Math.exp(-total))
      end
      # Complement form 1/(1+e^(x.p)) for a single row
      def p_plus(x_row,p)
        total = x_row.each_index.inject(0.0) { |acc, i| acc + x_row[i]*p[i,0] }
        1/(1+Math.exp(total))
      end
    end # Logit
  end # MLE
end # Statsample
module Statsample
  module MLE
    # Normal Distribution MLE estimation.
    # Usage:
    #
    #   mle=Statsample::MLE::Normal.new
    #   mle.newton_raphson(x,y)
    #   beta=mle.parameters
    #   likehood=mle.likehood(x,y,beta)
    #   iterations=mle.iterations

    class Normal < BaseMLE
      # Total log-likelihood for given X, Y and B matrices.
      # The last row of b holds sigma^2; the rest are the beta coefficients.
      def log_likehood(x,y,b)
        n=x.row_size.to_f
        sigma2=b[b.row_size-1,0]
        betas=Matrix.columns([b.column(0).to_a[0...b.row_size-1]])
        e=y-(x*betas)
        last=(1 / (2*sigma2))*e.t*e
        (-(n / 2.0) * Math::log(2*Math::PI))-((n / 2.0)*Math::log(sigma2)) - last[0,0]
      end
      # First derivative (gradient) for the normal model.
      # p should be [k+1,1], because the last parameter is sigma^2
      def first_derivative(x,y,p)
        raise "x.rows!=y.rows" if x.row_size!=y.row_size
        raise "x.columns+1!=p.rows" if x.column_size+1!=p.row_size

        n = x.row_size
        k = x.column_size
        b = Array.new(k)
        k.times{|i| b[i]=[p[i,0]]}
        beta = Matrix.rows(b)
        sigma2 = p[k,0]
        sigma4=sigma2*sigma2
        e = y-(x*(beta))
        xte = x.transpose*(e)
        ete = e.transpose*(e)
        # rows of the Jacobian: X'e/sigma^2 for the betas and
        # e'e/(2*sigma^4) - n/(2*sigma^2) for sigma^2
        rows = Array.new(k+1)
        k.times{|i| rows[i] = [xte[i,0] / sigma2]}
        rows[k] = [ete[0,0] / (2*sigma4) - n / (2*sigma2)]
        Matrix.rows(rows, true)
      end

      # Second derivative (Hessian) for the normal model.
      # p should be [k+1,1], because the last parameter is sigma^2
      def second_derivative(x,y,p)
        raise "x.rows!=y.rows" if x.row_size!=y.row_size
        raise "x.columns+1!=p.rows" if x.column_size+1!=p.row_size

        n = x.row_size
        k = x.column_size
        b = Array.new(k)
        k.times{|i| b[i]=[p[i,0]]}
        beta = Matrix.rows(b)
        sigma2 = p[k,0]
        sigma4=sigma2*sigma2
        sigma6 = sigma2*sigma2*sigma2
        e = y-(x*(beta))
        xtx = x.transpose*(x)
        xte = x.transpose*(e)
        ete = e.transpose*(e)
        # rows of the Hessian
        rows = Array.new(k+1)
        k.times do |i|
          row = Array.new(k+1)
          k.times do |j|
            row[j] = -xtx[i,j] / sigma2
          end
          row[k] = -xte[i,0] / sigma4
          rows[i] = row
        end
        last_row = Array.new(k+1)
        k.times do |i|
          last_row[i] = -xte[i,0] / sigma4
        end
        # BUG FIX: d^2 l / d(sigma^2)^2 is n/(2*sigma^4) - e'e/sigma^6
        # (the derivative of e'e/(2*sigma^4) - n/(2*sigma^2) computed in
        # first_derivative). The original wrote `2*sigma4 - ete/sigma6`,
        # dropping the n/(2*...) factor.
        last_row[k] = n / (2*sigma4) - ete[0,0] / sigma6
        rows[k] = last_row
        Matrix.rows(rows, true)
      end
    end
  end
end
require 'matrix_extension'
module Statsample
  module MLE
    # Probit MLE estimation.
    # Usage:
    #
    #   mle=Statsample::MLE::Probit.new
    #   mle.newton_raphson(x,y)
    #   beta=mle.parameters
    #   likehood=mle.likehood(x,y,beta)
    #   iterations=mle.iterations
    class Probit < BaseMLE
      # F(B'Xi)
      # The CDF/PDF pair is defined once, at class-definition time,
      # depending on whether GSL bindings are available (HAS_GSL is a
      # project-level constant -- presumably set in statsample.rb; confirm).
      if HAS_GSL
        # F(B'Xi): standard normal CDF of the linear predictor (GSL)
        def f(b,x)
          p_bx=(x*b)[0,0]
          GSL::Cdf::ugaussian_P(p_bx)
        end
        # f(B'Xi): standard normal PDF of the linear predictor (GSL)
        def ff(b,x)
          p_bx=(x*b)[0,0]
          GSL::Ran::ugaussian_pdf(p_bx)
        end
      else
        # Pure-Ruby fallback using Distribution::Normal
        def f(b,x) #:nodoc:
          p_bx=(x*b)[0,0]
          Distribution::Normal.cdf(p_bx)
        end
        def ff(b,x) #:nodoc:
          p_bx=(x*b)[0,0]
          Distribution::Normal.pdf(p_bx)

        end
      end
      # Log likelihood for x_i vector, y_i scalar and b parameters
      def log_likehood_i(xi,yi,b)
        fbx=f(b,xi)
        (yi.to_f*Math::log(fbx))+((1.0-yi.to_f)*Math::log(1.0-fbx))
      end
      # First derivative (gradient) of the probit log-likelihood:
      # sum over observations of (y - F)/(F(1-F)) * f * x_i
      # x: Matrix (NxM)
      # y: Matrix (Nx1)
      # b: Matrix (Mx1)
      def first_derivative(x,y,b)
        raise "x.rows!=y.rows" if x.row_size!=y.row_size
        raise "x.columns!=p.rows" if x.column_size!=b.row_size
        n = x.row_size
        k = x.column_size
        fd = Array.new(k)
        k.times {|i| fd[i] = [0.0]}
        n.times do |i|
          xi = Matrix.rows([x.row(i).to_a])
          fbx=f(b,xi)
          value1 = (y[i,0]-fbx)/ ( fbx*(1-fbx))*ff(b,xi)
          k.times do |j|
            fd[j][0] += value1*xi[0,j]
          end
        end
        Matrix.rows(fd, true)
      end
      # Second derivative (negative expected Hessian) of the probit
      # log-likelihood: -sum of f^2/(F(1-F)) * x_i'x_i. Accumulates in a
      # GSL matrix when available, converting back to Matrix at the end.
      # x: Matrix (NxM)
      # y: Matrix (Nx1)
      # b: Matrix (Mx1)

      def second_derivative(x,y,b)
        raise "x.rows!=y.rows" if x.row_size!=y.row_size
        raise "x.columns!=p.rows" if x.column_size!=b.row_size
        n = x.row_size
        k = x.column_size
        if HAS_GSL
          sum=GSL::Matrix.zeros(k)
        else
          sum=Matrix.zero(k)
        end
        n.times do |i|
          xi=Matrix.rows([x.row(i).to_a])
          fbx=f(b,xi)
          val=((ff(b,xi)**2) / (fbx*(1.0-fbx)))*xi.t*xi
          if HAS_GSL
            val=val.to_gsl
          end
          sum-=val
        end
        if HAS_GSL
          sum=sum.to_matrix
        end
        sum
      end
    end # Probit
  end # MLE
end # Statsample
@@ -1,9 +1,11 @@
1
1
  require 'statsample/regression/simple'
2
- require 'statsample/regression/logit'
3
2
  require 'statsample/regression/multiple'
4
3
  require 'statsample/regression/multiple/alglibengine'
5
4
  require 'statsample/regression/multiple/rubyengine'
6
5
  require 'statsample/regression/multiple/gslengine'
6
+ require 'statsample/regression/binomial'
7
+ require 'statsample/regression/binomial/logit'
8
+ require 'statsample/regression/binomial/probit'
7
9
 
8
10
  module Statsample
9
11
  # Module for regression procedures

module Statsample
  module Regression
    # Module for binomial (logit / probit) regression.
    module Binomial
      # Build a Logit model for dataset +ds+ with dependent variable +y_var+
      def self.logit(ds,y_var)
        Logit.new(ds,y_var)
      end
      # Build a Probit model for dataset +ds+ with dependent variable +y_var+
      def self.probit(ds,y_var)
        Probit.new(ds,y_var)
      end

      # Base engine for binomial regression. Builds the design matrix
      # (appending a "_constant" column of 1.0), runs Newton-Raphson on the
      # supplied MLE model and stores coefficients, iterations, the
      # variance-covariance matrix and the final log-likelihood.
      class BaseEngine
        attr_reader :log_likehood, :iterations
        # ds:: dataset; y_var:: name of the dependent (0/1) field;
        # model:: a Statsample::MLE engine instance (Logit, Probit, ...)
        def initialize(ds,y_var,model)
          @ds=ds
          @y_var=y_var
          @dy=@ds[@y_var]
          @ds_indep=ds.dup(ds.fields-[y_var])
          constant=([1.0]*ds.cases).to_vector(:scale)
          @ds_indep.add_vector("_constant",constant)
          mat_x=@ds_indep.to_matrix
          mat_y=@dy.to_matrix(:vertical)
          @fields=@ds_indep.fields
          @model=model
          coeffs=model.newton_raphson(mat_x, mat_y)
          @coeffs=assign_names(coeffs.column(0).to_a)
          @iterations=model.iterations
          @var_cov_matrix=model.var_cov_matrix
          @log_likehood=model.log_likehood(mat_x, mat_y, coeffs)
        end # init
        # Coefficients standard errors, keyed by field name.
        # The constant is excluded; see #constant_se.
        def coeffs_se
          out={}
          @fields.each_index{|i|
            f=@fields[i]
            out[f]=Math::sqrt(@var_cov_matrix[i,i])
          }
          out.delete("_constant")
          out
        end
        # Estimated constant (intercept)
        def constant
          @coeffs['_constant']
        end
        # Estimated coefficients, without the constant
        def coeffs
          c=@coeffs.dup
          c.delete("_constant")
          c
        end
        # Constant standard error
        def constant_se
          # BUG FIX: fields are stored as strings ("_constant" was added via
          # add_vector), so index(:_constant) always returned nil and the
          # matrix lookup raised. Look up the string key instead.
          i=@fields.index("_constant")
          Math::sqrt(@var_cov_matrix[i,i])
        end
        # Map the coefficient array +c+ to a Hash keyed by field name
        def assign_names(c)
          a={}
          @fields.each_index {|i|
            a[@fields[i]]=c[i]
          }
          a
        end
      end # Base Engine

    end # Binomial
  end # Regression
end # Statsample