statsample 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,284 +1,56 @@
1
+ require 'statsample/regression/multiple/baseengine'
1
2
module Statsample
  module Regression
    # Module for Linear Multiple Regression Analysis.
    #
    # You can call Regression::Multiple.listwise or
    # Regression::Multiple.pairwise, or instantiate the engines directly.
    #
    # Example:
    #
    #   require 'statsample'
    #   @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
    #   @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
    #   @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
    #   @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
    #   ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
    #   lr=Statsample::Regression::Multiple.listwise(ds,'y')
    #   # => #<Statsample::Regression::Multiple::AlglibEngine:0x7f21912e4758 ...>
    #   #    (inspect output abridged; the class depends on which engine
    #   #    listwise selects)
    module Multiple
      # Creates an object for listwise regression.
      #
      # Engine selection, in order:
      # * AlglibEngine when HAS_ALGIB is true (Alglib is faster, so it is
      #   preferred over GSL),
      # * GslEngine when HAS_GSL is true,
      # * otherwise RubyEngine, which is handed ds.dup_only_valid instead of
      #   +ds+ itself.
      #
      #   lr=Statsample::Regression::Multiple.listwise(ds,'y')
      #
      # ds::    dataset holding the predictors and the dependent variable
      # y_var:: name of the dependent variable
      def self.listwise(ds,y_var)
        if HAS_ALGIB
          AlglibEngine.new(ds,y_var)
        elsif HAS_GSL
          GslEngine.new(ds,y_var)
        else
          RubyEngine.new(ds.dup_only_valid,y_var)
        end
      end

      # Creates an object for pairwise regression.
      # For now, always returns a RubyEngine built on +ds+ as given.
      #
      #   lr=Statsample::Regression::Multiple.pairwise(ds,'y')
      def self.pairwise(ds,y_var)
        RubyEngine.new(ds,y_var)
      end

      # Placeholder for listwise regression described by an expression.
      # Always raises RuntimeError ("Not implemented yet").
      def self.listwise_by_exp(ds,exp)
        raise "Not implemented yet"
      end

      # Intended to return a dataset and the name of the criterion using an
      # expression, with all nominal vectors replaced by dummy coding and
      # interactions calculated.
      #
      # Not implemented: always raises RuntimeError ("Not implemented").
      #
      # TODO: the stub that used to follow the raise (and could never run)
      # sketched the parsing as exp.split(/[\+=]/), taking the last part as
      # the dependent variable and iterating over the remaining parts.
      def self.ds_by_exp(ds,exp)
        raise "Not implemented"
      end
    end
  end
end
@@ -0,0 +1,235 @@
1
module Statsample
  module Regression
    module Multiple
      # Base class for Multiple Regression Engines.
      #
      # The constructor only stores the dataset and the name of the dependent
      # variable. The methods below also read @fields, @dep_columns,
      # @ds_valid and @dy, and call +coeffs+, +constant+, +r2+,
      # +standarized_coeffs+ and +min_n_valid+, none of which are defined
      # here -- presumably every concrete engine provides them (confirm
      # against the engine subclasses).
      class BaseEngine
        # ds::    dataset to analyze
        # y_var:: name of the dependent variable (looked up as @ds[y_var])
        def initialize(ds,y_var)
          @ds=ds
          @y_var=y_var
          @r2=nil
        end

        # Retrieves a vector with predicted values for y.
        # Cases where any predictor is nil yield nil.
        def predicted
          (0...@ds.cases).collect {|i|
            row=predictor_row(i)
            row.nil? ? nil : process(row)
          }.to_vector(:scale)
        end

        # Retrieves a vector with standarized predicted values for y
        def standarized_predicted
          predicted.standarized
        end

        # Retrieves a vector with residual values for y (observed minus
        # predicted). Cases where any predictor or the observed y is nil
        # yield nil.
        def residuals
          (0...@ds.cases).collect {|i|
            row=predictor_row(i)
            next nil if row.nil?
            observed=@ds[@y_var][i]
            observed.nil? ? nil : observed - process(row)
          }.to_vector(:scale)
        end

        # R Multiple. Must be overridden; raises RuntimeError here.
        def r
          raise "You should implement this"
        end

        # Sum of squares Total. Must be overridden; raises RuntimeError here.
        def sst
          raise "You should implement this"
        end

        # Sum of squares (regression)
        def ssr
          r2*sst
        end

        # Sum of squares (Error)
        def sse
          sst - ssr
        end

        # T values for coeffs: each coefficient divided by its standard
        # error. Returns a hash keyed like +coeffs+.
        def coeffs_t
          se=coeffs_se
          out={}
          coeffs.each {|name,value|
            out[name]=value / se[name]
          }
          out
        end

        # Mean square Regression
        def msr
          ssr.quo(df_r)
        end

        # Mean Square Error
        def mse
          sse.quo(df_e)
        end

        # Degrees of freedom for regression (number of predictor columns)
        def df_r
          @dep_columns.size
        end

        # Degrees of freedom for error
        def df_e
          @ds_valid.cases-@dep_columns.size-1
        end

        # Fisher for Anova (mean square regression over mean square error)
        def f
          msr.quo(mse)
        end

        # Significance of Fisher
        def significance
          1.0-Distribution::F.cdf(f,df_r,df_e)
        end

        # Tolerance for a given variable
        # http://talkstats.com/showthread.php?t=5056
        #
        # Builds a dataset from the predictor columns only, runs
        # Multiple.listwise on it with +var+ as the dependent variable and
        # returns 1 - r2 of that regression.
        def tolerance(var)
          vectors={}
          assign_names(@dep_columns).each {|name,column|
            vectors[name]=column.to_vector(:scale)
          }
          lr=Multiple.listwise(vectors.to_dataset,var)
          1-lr.r2
        end

        # Tolerances for each coefficient, as a hash keyed by field name
        def coeffs_tolerances
          @fields.inject({}) {|acc,field|
            acc[field]=tolerance(field)
            acc
          }
        end

        # Standard Error for coefficients, as a hash keyed like +coeffs+.
        # Note that this runs one auxiliary regression per coefficient
        # (through +tolerance+).
        def coeffs_se
          mean_square_error=sse.quo(df_e)
          out={}
          coeffs.each {|name,_value|
            out[name]=Math::sqrt(mean_square_error/(@ds[name].sum_of_squares*tolerance(name)))
          }
          out
        end

        # Estimated Variance-Covariance Matrix
        # Used for calculation of se of constant
        #
        # X is built from a leading column of 1.0 followed by every vector of
        # @ds_valid except the dependent variable; the method computes
        # (X'X)^-1 * mse and then replaces every entry by its square root.
        # Entries that are not positive become nil, so despite the name the
        # returned matrix holds square roots, not variances.
        # Relies on Matrix being loaded elsewhere.
        def estimated_variance_covariance_matrix
          columns=[]
          @ds_valid.each_vector {|name,vector|
            columns.push(vector.data) unless name==@y_var
          }
          columns.unshift([1.0]*@ds_valid.cases)
          x=Matrix.columns(columns)
          matrix=((x.t*x)).inverse * mse
          matrix.collect {|entry|
            Math::sqrt(entry) if entry>0
          }
        end

        # T for constant
        def constant_t
          constant.to_f/constant_se
        end

        # Standard error for constant
        def constant_se
          estimated_variance_covariance_matrix[0,0]
        end

        # Retrieves a summary for Regression.
        #
        # report_type:: module mixed into the output string; it must supply
        #               +add+, +add_line+ and +parse_table+.
        #
        # Returns the string extended with +report_type+.
        def summary(report_type=ConsoleSummary)
          c=coeffs
          out=""
          out.extend report_type
          # The heredoc uses a plain <<HEREDOC, so its body and terminator
          # have to start at column 0.
          out.add <<HEREDOC
Summary for regression of #{@fields.join(',')} over #{@y_var}
*************************************************************
Engine: #{self.class}
Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
r=#{sprintf("%0.3f",r)}
r2=#{sprintf("%0.3f",r2)}
Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
HEREDOC

          out.add_line
          out.add "ANOVA TABLE"

          t=Statsample::ReportTable.new(%w{source ss df ms f s})
          t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
          t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])

          t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])

          out.parse_table(t)

          begin
            out.add "Beta coefficientes"
            sc=standarized_coeffs
            cse=coeffs_se
            t=Statsample::ReportTable.new(%w{coeff b beta se t})
            t.add_row(["Constant", sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
            @fields.each {|field|
              t.add_row([field, sprintf("%0.3f", c[field]), sprintf("%0.3f", sc[field]), sprintf("%0.3f", cse[field]), sprintf("%0.3f", c[field].quo(cse[field]))])
            }
            out.parse_table(t)

          rescue
            # NOTE(review): any StandardError raised while building the
            # coefficient table is swallowed, so the summary can end right
            # after the heading above. Presumably a deliberate best-effort
            # for engines that cannot supply these values -- confirm.
          end
          out
        end

        # Pairs each field name in @fields with the element of +c+ at the
        # same index and returns the result as a hash.
        def assign_names(c)
          named={}
          @fields.each_index {|i|
            named[@fields[i]]=c[i]
          }
          named
        end


        # Deprecated
        # Sum of squares of error (manual calculation)
        # using the predicted value minus the y_i value
        #
        # Only cases where both the observed and the predicted value are
        # present are summed; the sum is then rescaled by
        # (min_n_valid-1.0)/(cases-1).
        # NOTE(review): +min_n_valid+ is not defined in this class.
        def sse_manual
          pr=predicted
          cases=0
          total=(0...@ds.cases).inject(0) {|acc,i|
            if !@dy.data_with_nils[i].nil? and !pr[i].nil?
              cases+=1
              acc+((pr[i]-@dy[i])**2)
            else
              acc
            end
          }
          total*(min_n_valid-1.0).quo(cases-1)
        end

        # Sum of squares of regression
        # using the predicted value minus y mean.
        # Cases where any predictor is nil are skipped.
        def ssr_direct
          mean=@dy.mean
          (0...@ds.cases).inject(0) {|acc,i|
            row=predictor_row(i)
            row.nil? ? acc : acc+((process(row)-mean)**2)
          }
        end

        # Sum of squares of error, computed as sst - ssr.
        def sse_direct
          sst-ssr
        end

        # Predicted y for one case.
        #
        # v:: predictor values, in the same order as @fields
        #
        # Returns constant + sum of coeffs[field] * value.
        def process(v)
          c=coeffs
          total=constant
          @fields.each_index {|i|
            total+=c[@fields[i]]*v[i]
          }
          total
        end

        private

        # Values of every predictor column at case +i+, or nil when any of
        # them is nil.
        def predictor_row(i)
          row=@dep_columns.collect {|column| column[i] }
          row.any? {|value| value.nil? } ? nil : row
        end
      end
    end
  end
end