statsample 0.6.3 → 0.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,11 @@
1
1
  require 'statsample/regression/simple'
2
2
  require 'statsample/regression/multiple'
3
+
4
+ require 'statsample/regression/multiple/matrixengine'
3
5
  require 'statsample/regression/multiple/alglibengine'
4
6
  require 'statsample/regression/multiple/rubyengine'
5
7
  require 'statsample/regression/multiple/gslengine'
8
+
6
9
  require 'statsample/regression/binomial'
7
10
  require 'statsample/regression/binomial/logit'
8
11
  require 'statsample/regression/binomial/probit'
@@ -42,35 +42,77 @@ module Statsample
42
42
  # -----------------------------------------------
43
43
  #
44
44
  module Multiple
45
- # Creates an object for listwise regression.
46
- # Alglib is faster, so is prefered over GSL
47
- # lr=Statsample::Regression::Multiple.listwise(ds,'y')
48
- def self.listwise(ds,y_var)
49
- if HAS_ALGIB
50
- AlglibEngine.new(ds,y_var)
51
- elsif HAS_GSL
52
- GslEngine.new(ds,y_var)
53
- else
54
- ds2=ds.dup_only_valid
55
- RubyEngine.new(ds2,y_var)
56
- end
45
+ # Creates an object for listwise regression.
46
+ # Alglib is faster, so it is preferred over GSL
47
+ # lr=Statsample::Regression::Multiple.listwise(ds,'y')
48
+ def self.listwise(ds,y_var)
49
+ if HAS_ALGIB
50
+ AlglibEngine.new(ds,y_var)
51
+ elsif HAS_GSL
52
+ GslEngine.new(ds,y_var)
53
+ else
54
+ ds2=ds.dup_only_valid
55
+ RubyEngine.new(ds2,y_var)
56
+ end
57
+ end
58
+
59
+ # Creates an object for pairwise regression
60
+ # For now, always retrieves a RubyEngine
61
+ # lr=Statsample::Regression::Multiple.pairwise(ds,'y')
62
+ def self.pairwise(ds,y_var)
63
+ RubyEngine.new(ds,y_var)
64
+ end
65
+ def self.listwise_by_exp(ds,exp)
66
+ raise "Not implemented yet"
67
+ end
68
+ # Obtain r2 for regressors
69
+ def self.r2_from_matrices(rxx,rxy)
70
+ matrix=(rxy.transpose*rxx.inverse*rxy)
71
+ matrix[0,0]
72
+ end
73
+
74
+ class MultipleDependent
75
+ def significance
76
+ 0.0
77
+ end
78
+ def initialize(matrix,y_var, opts=Hash.new)
79
+ matrix.extend Statsample::CovariateMatrix
80
+ @matrix=matrix
81
+ @fields=matrix.fields-y_var
82
+ @y_var=y_var
83
+ @q=@y_var.size
84
+ @matrix_cor=matrix.correlation
85
+ @matrix_cor_xx = @matrix_cor.submatrix(@fields)
86
+ @matrix_cor_yy = @matrix_cor.submatrix(y_var, y_var)
87
+
88
+ @sxx = @matrix.submatrix(@fields)
89
+ @syy = @matrix.submatrix(y_var, y_var)
90
+ @sxy = @matrix.submatrix(@fields, y_var)
91
+ @syx = @sxy.t
92
+
93
+
57
94
  end
58
95
 
59
- # Creates an object for pairwise regression
60
- # For now, always retrieves a RubyEngine
61
- # lr=Statsample::Regression::Multiple.listwise(ds,'y')
62
- def self.pairwise(ds,y_var)
63
- RubyEngine.new(ds,y_var)
96
+ def r2yx
97
+ 1- (@matrix_cor.determinant.quo(@matrix_cor_yy.determinant * @matrix_cor_xx.determinant))
64
98
  end
65
- def self.listwise_by_exp(ds,exp)
66
- raise "Not implemented yet"
99
+ # Residual covariance of Y after accounting for its linear relation with X
100
+ def syyx
101
+ @syy-@syx*@sxx.inverse*@sxy
67
102
  end
68
- # Obtain r2 for regressors
69
- def self.r2_from_matrices(rxx,rxy)
70
- matrix=(rxy.transpose*rxx.inverse*rxy)
71
- matrix[0,0]
103
+ def r2yx_covariance
104
+ 1-(syyx.determinant.quo(@syy.determinant))
72
105
  end
73
106
 
107
+ def vxy
108
+ @q-(@syy.inverse*syyx).trace
109
+ end
110
+ def p2yx
111
+ vxy.quo(@q)
112
+ end
113
+ end
114
+
115
+
74
116
  end
75
117
  end
76
118
  end
@@ -3,12 +3,21 @@ module Statsample
3
3
  module Multiple
4
4
  # Base class for Multiple Regression Engines
5
5
  class BaseEngine
6
+
6
7
  include GetText
7
8
  bindtextdomain("statsample")
8
9
  # Name of analysis
9
10
  attr_accessor :name
11
+
12
+ def self.univariate?
13
+ true
14
+ end
15
+
16
+
17
+
10
18
  def initialize(ds, y_var, opts = Hash.new)
11
19
  @ds=ds
20
+ @cases=@ds.cases
12
21
  @y_var=y_var
13
22
  @r2=nil
14
23
  @name=_("Multiple Regression: %s over %s") % [ ds.fields.join(",") , @y_var]
@@ -92,7 +101,7 @@ module Statsample
92
101
  end
93
102
  # Significance of Fisher
94
103
  def significance
95
- 1.0-Distribution::F.cdf(f,df_r,df_e)
104
+ (1.0-Distribution::F.cdf(f, df_r, df_e)).abs
96
105
  end
97
106
  # Tolerance for a given variable
98
107
  # http://talkstats.com/showthread.php?t=5056
@@ -120,6 +129,11 @@ module Statsample
120
129
  }
121
130
  out
122
131
  end
132
+ # Standard error of R^2
133
+ def se_r2
134
+ Math::sqrt((4*r2*(1-r2)**2*(df_e)**2).quo((@cases**2-1)*(@cases+3)))
135
+ end
136
+
123
137
  # Estimated Variance-Covariance Matrix
124
138
  # Used for calculation of se of constant
125
139
  def estimated_variance_covariance_matrix
@@ -152,8 +166,8 @@ module Statsample
152
166
  c=coeffs
153
167
  generator.add_text(_("Engine: %s") % self.class)
154
168
  generator.add_text(_("Cases(listwise)=%d(%d)") % [@ds.cases, @ds_valid.cases])
155
- generator.add_text("r=#{sprintf('%0.3f',r)}")
156
- generator.add_text("r=#{sprintf('%0.3f',r2)}")
169
+ generator.add_text("R=#{sprintf('%0.3f',r)}")
170
+ generator.add_text("R^2=#{sprintf('%0.3f',r2)}")
157
171
 
158
172
  generator.add_text(_("Equation")+"="+ sprintf('%0.3f',constant) +" + "+ @fields.collect {|k| sprintf('%0.3f%s',c[k],k)}.join(' + ') )
159
173
 
@@ -167,7 +181,8 @@ module Statsample
167
181
  cse=coeffs_se
168
182
  t=ReportBuilder::Table.new(:name=>"Beta coefficients", :header=>%w{coeff b beta se t}.collect{|field| _(field)} )
169
183
 
170
- t.add_row([_("Constant"), sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
184
+ t.add_row([_("Constant"), sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
185
+
171
186
  @fields.each do |f|
172
187
  t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
173
188
  end
@@ -184,22 +199,6 @@ module Statsample
184
199
  a
185
200
  end
186
201
 
187
- # Deprecated
188
- # Sum of squares of error (manual calculation)
189
- # using the predicted value minus the y_i value
190
- def sse_manual
191
- pr=predicted
192
- cases=0
193
- sse=(0...@ds.cases).inject(0) {|a,i|
194
- if !@dy.data_with_nils[i].nil? and !pr[i].nil?
195
- cases+=1
196
- a+((pr[i]-@dy[i])**2)
197
- else
198
- a
199
- end
200
- }
201
- sse*(min_n_valid-1.0).quo(cases-1)
202
- end
203
202
  # Sum of squares of regression
204
203
  # using the predicted value minus y mean
205
204
  def ssr_direct
@@ -0,0 +1,187 @@
1
+ module Statsample
2
+ module Regression
3
+ module Multiple
4
+ # Pure Ruby Class for Multiple Regression Analysis, based on a covariance or correlation matrix.
5
+ # <b>Remember:</b> NEVER use a covariance matrix if you have missing data. Use only a correlation matrix in that case.
6
+ #
7
+ #
8
+ # Example:
9
+ #
10
+ # matrix=[[1.0, 0.5, 0.2], [0.5, 1.0, 0.7], [0.2, 0.7, 1.0]]
11
+ #
12
+ # lr=Statsample::Regression::Multiple::MatrixEngine.new(matrix,2)
13
+
14
+ class MatrixEngine < BaseEngine
15
+ # Hash of standard deviation of predictors.
16
+ # Only useful for a correlation matrix, because by default it is set to 1
17
+ attr_accessor :x_sd
18
+ # Standard deviation of criteria.
19
+ # Only useful for a correlation matrix, because by default it is set to 1
20
+
21
+ attr_accessor :y_sd
22
+ # Hash of mean for predictors. By default, set to 0
23
+ #
24
+ attr_accessor :x_mean
25
+
26
+ # Mean for criteria. By default, set to 0
27
+ #
28
+ attr_accessor :y_mean
29
+
30
+ # Number of cases
31
+ attr_writer :cases
32
+
33
+ # Create object
34
+ #
35
+ def initialize(matrix,y_var, opts=Hash.new)
36
+ matrix.extend Statsample::CovariateMatrix
37
+ raise "#{y_var} variable should be on data" unless matrix.fields.include? y_var
38
+
39
+ @matrix_cor=matrix.correlation
40
+
41
+ @y_var=y_var
42
+ @fields=matrix.fields-[y_var]
43
+ @n_predictors=@fields.size
44
+ @matrix=matrix
45
+ @matrix_x= matrix.submatrix(@fields)
46
+ @matrix_y = matrix.submatrix(@fields, [y_var])
47
+ @matrix_y_cor=@matrix_cor.submatrix(@fields, [y_var])
48
+ @result_matrix=@matrix_x.inverse * @matrix_y
49
+ @y_sd=Math::sqrt(@matrix.submatrix([y_var])[0,0])
50
+ @x_sd=@matrix_x.row_size.times.inject({}) {|ac,i|
51
+ ac[@matrix_x.fields[i]]=Math::sqrt(@matrix_x[i,i])
52
+ ac;
53
+ }
54
+ @cases=nil
55
+ @x_mean=@fields.inject({}) {|ac,f|
56
+ ac[f]=0.0
57
+ ac;
58
+ }
59
+
60
+ @y_mean=0.0
61
+ @name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]
62
+
63
+
64
+ opts.each{|k,v|
65
+ self.send("#{k}=",v) if self.respond_to? k
66
+ }
67
+ if matrix.type==:covariance
68
+ @coeffs=@result_matrix.column(0).to_a
69
+ @coeffs_stan=coeffs.collect {|k,v|
70
+ coeffs[k]*@x_sd[k].quo(@y_sd)
71
+ }
72
+ else
73
+ @coeffs_stan=@result_matrix.column(0).to_a
74
+
75
+ @coeffs=standarized_coeffs.collect {|k,v|
76
+ standarized_coeffs[k]*@y_sd.quo(@x_sd[k])
77
+ }
78
+ end
79
+
80
+ end
81
+ def cases
82
+ raise "You should define the number of valid cases first" if @cases.nil?
83
+ @cases
84
+ end
85
+ # Get R^2 for the regression
86
+ # Equal to
87
+ # * 1-(|R| / |R_x|) or
88
+ # * Sum(b_i*r_yi)
89
+ def r2
90
+ @n_predictors.times.inject(0) {|ac,i| ac+@coeffs_stan[i]* @matrix_y_cor[i,0]}
91
+ #1-(@matrix.correlation.determinant.quo(@matrix_x.correlation.determinant))
92
+ end
93
+ def r
94
+ Math::sqrt(r2)
95
+ end
96
+
97
+ def constant
98
+ c=coeffs
99
+ @y_mean - @fields.inject(0){|a,k| a + (c[k] * @x_mean[k])}
100
+ end
101
+ def coeffs
102
+ assign_names(@coeffs)
103
+ end
104
+ def standarized_coeffs
105
+ assign_names(@coeffs_stan)
106
+ end
107
+
108
+ def sst
109
+ @y_sd**2*(cases-1.0)
110
+ end
111
+
112
+ # Degrees of freedom for regression
113
+ def df_r
114
+ @n_predictors
115
+ end
116
+ # Degrees of freedom for error
117
+ def df_e
118
+ cases-@n_predictors-1
119
+ end
120
+
121
+ # Tolerance for a given variable
122
+ # defined as (1-r2) of regression of other independent variables
123
+ # over the selected
124
+ # http://talkstats.com/showthread.php?t=5056
125
+ def tolerance(var)
126
+ lr=Statsample::Regression::Multiple::MatrixEngine.new(@matrix_x, var)
127
+ 1-lr.r2
128
+ end
129
+ # Standard Error for coefficients.
130
+ # The standard error of a coefficient depends on
131
+ # * Tolerance of the coefficient: higher tolerance implies lower error
132
+ # * Higher r2 implies lower error
133
+
134
+ # Reference: Cohen et al. (2003). Applied Multiple Regression / Correlation Analysis for the Behavioral Sciences
135
+ #
136
+ def coeffs_se
137
+ out={}
138
+ mse=sse.quo(df_e)
139
+ coeffs.each {|k,v|
140
+ out[k]=@y_sd.quo(@x_sd[k])*Math::sqrt( 1.quo(tolerance(k)))*Math::sqrt((1-r2).quo(df_e))
141
+ }
142
+ out
143
+ end
144
+ # Standard error for constant
145
+ def constant_se
146
+ nil
147
+ end
148
+
149
+ def to_reportbuilder(generator)
150
+ anchor=generator.add_toc_entry(_("Multiple Regression: ")+@name)
151
+ generator.add_html "<div class='multiple-regression'>#{@name}<a name='#{anchor}'></a>"
152
+ c=coeffs
153
+ generator.add_text(_("Engine: %s") % self.class)
154
+ generator.add_text(_("Cases=%d") % [@cases])
155
+ generator.add_text("R=#{sprintf('%0.3f',r)}")
156
+ generator.add_text("R^2=#{sprintf('%0.3f',r2)}")
157
+
158
+ generator.add_text(_("Equation")+"="+ sprintf('%0.3f',constant) +" + "+ @fields.collect {|k| sprintf('%0.3f%s',c[k],k)}.join(' + ') )
159
+
160
+ t=ReportBuilder::Table.new(:name=>"ANOVA", :header=>%w{source ss df ms f s})
161
+ t.add_row([_("Regression"), sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
162
+ t.add_row([_("Error"), sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
163
+
164
+ t.add_row([_("Total"), sprintf("%0.3f",sst), df_r+df_e])
165
+ generator.parse_element(t)
166
+ sc=standarized_coeffs
167
+ cse=coeffs_se
168
+ t=ReportBuilder::Table.new(:name=>"Beta coefficients", :header=>%w{coeff b beta se t}.collect{|field| _(field)} )
169
+
170
+ if (constant_se.nil?)
171
+ t.add_row([_("Constant"), sprintf("%0.3f", constant),"--","?","?"])
172
+ else
173
+ t.add_row([_("Constant"), sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
174
+ end
175
+
176
+ @fields.each do |f|
177
+ t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
178
+ end
179
+ generator.parse_element(t)
180
+ generator.add_html("</div>")
181
+ end
182
+
183
+
184
+ end
185
+ end
186
+ end
187
+ end
@@ -15,119 +15,79 @@ module Multiple
15
15
  # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
16
16
  # lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
17
17
 
18
- class RubyEngine < BaseEngine
18
+ class RubyEngine < MatrixEngine
19
19
  def initialize(ds,y_var, opts=Hash.new)
20
- super
20
+ matrix=Statsample::Bivariate.correlation_matrix(ds)
21
+ fields_indep=ds.fields-[y_var]
22
+ default={
23
+ :y_mean=>ds[y_var].mean,
24
+ :x_mean=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].mean; ac},
25
+ :y_sd=>ds[y_var].sd,
26
+ :x_sd=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].sd; ac},
27
+ :cases=>Statsample::Bivariate.min_n_valid(ds)
28
+ }
29
+ opts=opts.merge(default)
30
+ super(matrix, y_var, opts)
31
+ @ds=ds
21
32
  @dy=ds[@y_var]
22
33
  @ds_valid=ds.dup_only_valid
23
- @ds_indep=ds.dup(ds.fields-[y_var])
24
- @fields=@ds_indep.fields
34
+ @ds_indep = ds.dup(ds.fields-[y_var])
35
+
36
+ # p obtain_predictor_matrix
37
+ # p @matrix_x.correlation
38
+
25
39
  set_dep_columns
26
- obtain_y_vector
27
- @matrix_x = Bivariate.correlation_matrix(@ds_indep)
28
- @coeffs_stan=(@matrix_x.inverse * @matrix_y).column(0).to_a
29
- @min_n_valid=nil
30
- end
31
- def min_n_valid
32
- if @min_n_valid.nil?
33
- min=@ds.cases
34
- m=Bivariate::n_valid_matrix(@ds)
35
- for x in 0...m.row_size
36
- for y in 0...m.column_size
37
- min=m[x,y] if m[x,y] < min
38
- end
39
- end
40
- @min_n_valid=min
41
- end
42
- @min_n_valid
43
40
  end
41
+
44
42
  def set_dep_columns
45
43
  @dep_columns=[]
46
44
  @ds_indep.each_vector{|k,v|
47
45
  @dep_columns.push(v.data_with_nils)
48
46
  }
49
47
  end
50
- # Sum of square total
51
- def sst
52
- #if @sst.nil?
53
- @sst=@dy.variance*(min_n_valid-1.0)
54
- #end
55
- @sst
56
- end
57
- def r2
58
- if @r2.nil?
59
- c=@matrix_y
60
- rxx=obtain_predictor_matrix
61
- matrix=(c.t*rxx.inverse*c)
62
- @r2=matrix[0,0]
63
- end
64
- @r2
65
- end
66
- def r
67
- Math::sqrt(r2)
68
- end
69
48
 
70
- def df_e
71
- min_n_valid-@dep_columns.size-1
72
- end
73
- def fix_with_mean
74
- i=0
75
- @ds_indep.each do |row|
76
- empty=[]
77
- row.each do |k,v|
78
- empty.push(k) if v.nil?
79
- end
80
- if empty.size==1
81
- @ds_indep[empty[0]][i]=@ds[empty[0]].mean
82
- end
83
- i+=1
49
+ def fix_with_mean
50
+ i=0
51
+ @ds_indep.each do |row|
52
+ empty=[]
53
+ row.each do |k,v|
54
+ empty.push(k) if v.nil?
84
55
  end
85
- @ds_indep.update_valid_data
86
- set_dep_columns
56
+ if empty.size==1
57
+ @ds_indep[empty[0]][i]=@ds[empty[0]].mean
58
+ end
59
+ i+=1
87
60
  end
88
- def fix_with_regression
89
- i=0
90
- @ds_indep.each{|row|
91
- empty=[]
92
- row.each{|k,v|
93
- empty.push(k) if v.nil?
94
- }
95
- if empty.size==1
96
- field=empty[0]
97
- lr=MultipleRegression.new(@ds_indep,field)
98
- fields=[]
99
- @ds_indep.fields.each{|f|
100
- fields.push(row[f]) unless f==field
101
- }
102
- @ds_indep[field][i]=lr.process(fields)
103
- end
104
- i+=1
61
+ @ds_indep.update_valid_data
62
+ set_dep_columns
63
+ end
64
+ def fix_with_regression
65
+ i=0
66
+ @ds_indep.each{|row|
67
+ empty=[]
68
+ row.each{|k,v|
69
+ empty.push(k) if v.nil?
70
+ }
71
+ if empty.size==1
72
+ field=empty[0]
73
+ lr=MultipleRegression.new(@ds_indep,field)
74
+ fields=[]
75
+ @ds_indep.fields.each{|f|
76
+ fields.push(row[f]) unless f==field
105
77
  }
106
- @ds_indep.update_valid_data
107
- set_dep_columns
108
- end
109
- def obtain_y_vector
110
- @matrix_y=Matrix.columns([@ds_indep.fields.collect{|f|
111
- Bivariate.pearson(@dy, @ds_indep[f])
112
- }])
113
- end
114
- def obtain_predictor_matrix
115
- Bivariate::correlation_matrix(@ds_indep)
116
- end
117
- def constant
118
- c=coeffs
119
- @dy.mean-@fields.inject(0){|a,k| a+(c[k] * @ds_indep[k].mean)}
120
- end
121
-
122
- def coeffs
123
- sc=standarized_coeffs
124
- assign_names(@fields.collect{|f|
125
- (sc[f]*@dy.sds).quo(@ds_indep[f].sds)
126
- })
127
- end
128
- def standarized_coeffs
129
- assign_names(@coeffs_stan)
130
- end
78
+ @ds_indep[field][i]=lr.process(fields)
79
+ end
80
+ i+=1
81
+ }
82
+ @ds_indep.update_valid_data
83
+ set_dep_columns
84
+ end
85
+
86
+
87
+ # Standard error for constant
88
+ def constant_se
89
+ estimated_variance_covariance_matrix[0,0]
90
+ end
131
91
  end
132
92
  end
133
93
  end