statsample 0.4.1 → 0.5.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,284 +1,56 @@
1
+ require 'statsample/regression/multiple/baseengine'
1
2
  module Statsample
2
- module Regression
3
- # Module for Linear Multiple Regression Analysis
4
- # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise or instance directly the engines
5
- # Example.
6
- #
7
- # require 'statsample'
8
- # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
9
- # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
10
- # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
11
- # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
12
- # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
13
- # lr=Statsample::Regression::Multiple.listwise(ds,'y')
14
- # #<Statsample::Regression::Multiple::AlglibEngine:0x7f21912e4758 @ds_valid=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @lr=#<Alglib::LinearRegression:0x7f21912df118 @model=#<Alglib_ext::LinearModel:0x7f21912df708>, @ivars=3, @cases=10, @report=#<Alglib_ext::LrReport:0x7f21912df168>>, @y_var="y", @ds=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @fields=["a", "b", "c"], @lr_s=nil, @dep_columns=[[1, 3, 2, 4, 3, 5, 4, 6, 5, 7], [3, 3, 4, 4, 5, 5, 6, 6, 4, 4], [11, 22, 30, 40, 50, 65, 78, 79, 99, 100]], @ds_indep=#<Statsample::Dataset:69891073180060 @fields=[a,b,c] labels={"a"=>nil, "b"=>nil, "c"=>nil} cases=10, @dy=Vector(type:scale, n:10)[3,4,5,6,7,8,9,10,20,30]>
15
-
16
-
17
- module Multiple
18
- # Creates an object for listwise regression.
19
- # Alglib is faster, so is prefered over GSL
20
- # lr=Statsample::Regression::Multiple.listwise(ds,'y')
21
- def self.listwise(ds,y_var)
22
- if HAS_ALGIB
3
+ module Regression
4
+ # Module for Linear Multiple Regression Analysis
5
+ # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise or instance directly the engines
6
+ # Example.
7
+ #
8
+ # require 'statsample'
9
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
10
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
11
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
12
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
13
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
14
+ # lr=Statsample::Regression::Multiple.listwise(ds,'y')
15
+ # #<Statsample::Regression::Multiple::AlglibEngine:0x7f21912e4758 @ds_valid=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @lr=#<Alglib::LinearRegression:0x7f21912df118 @model=#<Alglib_ext::LinearModel:0x7f21912df708>, @ivars=3, @cases=10, @report=#<Alglib_ext::LrReport:0x7f21912df168>>, @y_var="y", @ds=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @fields=["a", "b", "c"], @lr_s=nil, @dep_columns=[[1, 3, 2, 4, 3, 5, 4, 6, 5, 7], [3, 3, 4, 4, 5, 5, 6, 6, 4, 4], [11, 22, 30, 40, 50, 65, 78, 79, 99, 100]], @ds_indep=#<Statsample::Dataset:69891073180060 @fields=[a,b,c] labels={"a"=>nil, "b"=>nil, "c"=>nil} cases=10, @dy=Vector(type:scale, n:10)[3,4,5,6,7,8,9,10,20,30]>
16
+ module Multiple
17
+ # Creates an object for listwise regression.
18
+ # Alglib is faster, so is prefered over GSL
19
+ # lr=Statsample::Regression::Multiple.listwise(ds,'y')
20
+ def self.listwise(ds,y_var)
21
+ if HAS_ALGIB
23
22
  AlglibEngine.new(ds,y_var)
24
- elsif HAS_GSL
23
+ elsif HAS_GSL
25
24
  GslEngine.new(ds,y_var)
26
- else
25
+ else
27
26
  ds2=ds.dup_only_valid
28
27
  RubyEngine.new(ds2,y_var)
28
+ end
29
29
  end
30
- end
31
-
32
- # Creates an object for pairwise regression
33
- # For now, always retrieves a RubyEngine
34
- # lr=Statsample::Regression::Multiple.listwise(ds,'y')
35
- def self.pairwise(ds,y_var)
36
- RubyEngine.new(ds,y_var)
37
- end
38
- def self.listwise_by_exp(ds,exp)
39
- end
40
- # Returns a dataset and name of criteria using a expression.
41
- # All nominal vectors are replaced by dummy coding
42
- # and interactions are calculated
43
-
44
- def self.ds_by_exp(ds,exp)
45
- raise "Not implemented"
46
- parts=exp.split(/[\+=]/)
47
- dependent=parts.pop
48
- ds_out=[]
49
- parts.each{|p|
50
-
51
- }
52
- end
53
- # Base class for Multiple Regression Engines
54
- class BaseEngine
55
- def initialize(ds,y_var)
56
- @ds=ds
57
- @y_var=y_var
58
- @r2=nil
59
- end
60
-
61
- # Retrieves a vector with predicted values for y
62
- def predicted
63
- (0...@ds.cases).collect { |i|
64
- invalid=false
65
- vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
66
- if invalid
67
- nil
68
- else
69
- process(vect)
70
- end
71
- }.to_vector(:scale)
72
- end
73
- # Retrieves a vector with standarized values for y
74
- def standarized_predicted
75
- predicted.standarized
76
- end
77
- # Retrieves a vector with residuals values for y
78
- def residuals
79
- (0...@ds.cases).collect{|i|
80
- invalid=false
81
- vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
82
- if invalid or @ds[@y_var][i].nil?
83
- nil
84
- else
85
- @ds[@y_var][i] - process(vect)
86
- end
87
- }.to_vector(:scale)
88
- end
89
- # R Multiple
90
- def r
91
- raise "You should implement this"
92
- end
93
- # Sum of squares Total
94
- def sst
95
- raise "You should implement this"
96
- end
97
- # Sum of squares (regression)
98
- def ssr
99
- r2*sst
100
- end
101
- # Sum of squares (Error)
102
- def sse
103
- sst - ssr
104
- end
105
- # T values for coeffs
106
- def coeffs_t
107
- out={}
108
- se=coeffs_se
109
- coeffs.each{|k,v|
110
- out[k]=v / se[k]
111
- }
112
- out
113
- end
114
- # Mean square Regression
115
- def msr
116
- ssr.quo(df_r)
117
- end
118
- # Mean Square Error
119
- def mse
120
- sse.quo(df_e)
121
- end
122
- # Degrees of freedom for regression
123
- def df_r
124
- @dep_columns.size
125
- end
126
- # Degrees of freedom for error
127
- def df_e
128
- @ds_valid.cases-@dep_columns.size-1
129
- end
130
- # Fisher for Anova
131
- def f
132
- (ssr.quo(df_r)).quo(sse.quo(df_e))
133
- end
134
- # Significance of Fisher
135
- def significance
136
- 1.0-Distribution::F.cdf(f,df_r,df_e)
137
- end
138
- # Tolerance for a given variable
139
- # http://talkstats.com/showthread.php?t=5056
140
- def tolerance(var)
141
- ds=assign_names(@dep_columns)
142
- ds.each{|k,v|
143
- ds[k]=v.to_vector(:scale)
144
- }
145
- lr=Multiple.listwise(ds.to_dataset,var)
146
- 1-lr.r2
147
- end
148
- # Tolerances for each coefficient
149
- def coeffs_tolerances
150
- @fields.inject({}) {|a,f|
151
- a[f]=tolerance(f);
152
- a
153
- }
154
- end
155
- # Standard Error for coefficients
156
- def coeffs_se
157
- out={}
158
- mse=sse.quo(df_e)
159
- coeffs.each {|k,v|
160
- out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares*tolerance(k)))
161
- }
162
- out
163
- end
164
- # Estimated Variance-Covariance Matrix
165
- # Used for calculation of se of constant
166
- def estimated_variance_covariance_matrix
167
- mse_p=mse
168
- columns=[]
169
- @ds_valid.each_vector{|k,v|
170
- columns.push(v.data) unless k==@y_var
171
- }
172
- columns.unshift([1.0]*@ds_valid.cases)
173
- x=Matrix.columns(columns)
174
- matrix=((x.t*x)).inverse * mse
175
- matrix.collect {|i|
176
- Math::sqrt(i) if i>0
177
- }
178
- end
179
- # T for constant
180
- def constant_t
181
- constant.to_f/constant_se
182
- end
183
- # Standard error for constant
184
- def constant_se
185
- estimated_variance_covariance_matrix[0,0]
186
- end
187
- # Retrieves a summary for Regression
188
- def summary(report_type=ConsoleSummary)
189
- c=coeffs
190
- out=""
191
- out.extend report_type
192
- out.add <<HEREDOC
193
- Summary for regression of #{@fields.join(',')} over #{@y_var}
194
- *************************************************************
195
- Engine: #{self.class}
196
- Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
197
- r=#{sprintf("%0.3f",r)}
198
- r2=#{sprintf("%0.3f",r2)}
199
- Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
200
- HEREDOC
201
-
202
- out.add_line
203
- out.add "ANOVA TABLE"
204
-
205
- t=Statsample::ReportTable.new(%w{source ss df ms f s})
206
- t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
207
- t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
208
-
209
- t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
210
30
 
211
- out.parse_table(t)
212
-
213
- begin
214
- out.add "Beta coefficientes"
215
- sc=standarized_coeffs
216
- cse=coeffs_se
217
- t=Statsample::ReportTable.new(%w{coeff b beta se t})
218
- t.add_row(["Constant", sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
219
- @fields.each{|f|
220
- t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
221
- }
222
- out.parse_table(t)
223
-
224
- rescue
31
+ # Creates an object for pairwise regression
32
+ # For now, always retrieves a RubyEngine
33
+ # lr=Statsample::Regression::Multiple.listwise(ds,'y')
34
+ def self.pairwise(ds,y_var)
35
+ RubyEngine.new(ds,y_var)
225
36
  end
226
- out
37
+ def self.listwise_by_exp(ds,exp)
38
+ raise "Not implemented yet"
227
39
  end
228
- def assign_names(c)
229
- a={}
230
- @fields.each_index {|i|
231
- a[@fields[i]]=c[i]
232
- }
233
- a
40
+ # Returns a dataset and name of criteria using a expression.
41
+ # All nominal vectors are replaced by dummy coding
42
+ # and interactions are calculated
43
+
44
+ def self.ds_by_exp(ds,exp)
45
+ raise "Not implemented"
46
+ parts=exp.split(/[\+=]/)
47
+ dependent=parts.pop
48
+ ds_out=[]
49
+ parts.each{|p|
50
+
51
+ }
234
52
  end
235
-
236
53
 
237
- # Deprecated
238
- # Sum of squares of error (manual calculation)
239
- # using the predicted value minus the y_i value
240
- def sse_manual
241
- pr=predicted
242
- cases=0
243
- sse=(0...@ds.cases).inject(0) {|a,i|
244
- if !@dy.data_with_nils[i].nil? and !pr[i].nil?
245
- cases+=1
246
- a+((pr[i]-@dy[i])**2)
247
- else
248
- a
249
- end
250
- }
251
- sse*(min_n_valid-1.0).quo(cases-1)
252
- end
253
- # Sum of squares of regression
254
- # using the predicted value minus y mean
255
- def ssr_direct
256
- mean=@dy.mean
257
- cases=0
258
- ssr=(0...@ds.cases).inject(0) {|a,i|
259
- invalid=false
260
- v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
261
- if !invalid
262
- cases+=1
263
- a+((process(v)-mean)**2)
264
- else
265
- a
266
- end
267
- }
268
- ssr
269
- end
270
- def sse_direct
271
- sst-ssr
272
54
  end
273
- def process(v)
274
- c=coeffs
275
- total=constant
276
- @fields.each_index{|i|
277
- total+=c[@fields[i]]*v[i]
278
- }
279
- total
280
- end
281
- end
282
- end
283
- end
55
+ end
284
56
  end
@@ -0,0 +1,235 @@
1
+ module Statsample
2
+ module Regression
3
+ module Multiple
4
+ # Base class for Multiple Regression Engines
5
+ class BaseEngine
6
+ def initialize(ds,y_var)
7
+ @ds=ds
8
+ @y_var=y_var
9
+ @r2=nil
10
+ end
11
+
12
+ # Retrieves a vector with predicted values for y
13
+ def predicted
14
+ (0...@ds.cases).collect { |i|
15
+ invalid=false
16
+ vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
17
+ if invalid
18
+ nil
19
+ else
20
+ process(vect)
21
+ end
22
+ }.to_vector(:scale)
23
+ end
24
+ # Retrieves a vector with standarized values for y
25
+ def standarized_predicted
26
+ predicted.standarized
27
+ end
28
+ # Retrieves a vector with residuals values for y
29
+ def residuals
30
+ (0...@ds.cases).collect{|i|
31
+ invalid=false
32
+ vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
33
+ if invalid or @ds[@y_var][i].nil?
34
+ nil
35
+ else
36
+ @ds[@y_var][i] - process(vect)
37
+ end
38
+ }.to_vector(:scale)
39
+ end
40
+ # R Multiple
41
+ def r
42
+ raise "You should implement this"
43
+ end
44
+ # Sum of squares Total
45
+ def sst
46
+ raise "You should implement this"
47
+ end
48
+ # Sum of squares (regression)
49
+ def ssr
50
+ r2*sst
51
+ end
52
+ # Sum of squares (Error)
53
+ def sse
54
+ sst - ssr
55
+ end
56
+ # T values for coeffs
57
+ def coeffs_t
58
+ out={}
59
+ se=coeffs_se
60
+ coeffs.each{|k,v|
61
+ out[k]=v / se[k]
62
+ }
63
+ out
64
+ end
65
+ # Mean square Regression
66
+ def msr
67
+ ssr.quo(df_r)
68
+ end
69
+ # Mean Square Error
70
+ def mse
71
+ sse.quo(df_e)
72
+ end
73
+ # Degrees of freedom for regression
74
+ def df_r
75
+ @dep_columns.size
76
+ end
77
+ # Degrees of freedom for error
78
+ def df_e
79
+ @ds_valid.cases-@dep_columns.size-1
80
+ end
81
+ # Fisher for Anova
82
+ def f
83
+ (ssr.quo(df_r)).quo(sse.quo(df_e))
84
+ end
85
+ # Significance of Fisher
86
+ def significance
87
+ 1.0-Distribution::F.cdf(f,df_r,df_e)
88
+ end
89
+ # Tolerance for a given variable
90
+ # http://talkstats.com/showthread.php?t=5056
91
+ def tolerance(var)
92
+ ds=assign_names(@dep_columns)
93
+ ds.each{|k,v|
94
+ ds[k]=v.to_vector(:scale)
95
+ }
96
+ lr=Multiple.listwise(ds.to_dataset,var)
97
+ 1-lr.r2
98
+ end
99
+ # Tolerances for each coefficient
100
+ def coeffs_tolerances
101
+ @fields.inject({}) {|a,f|
102
+ a[f]=tolerance(f);
103
+ a
104
+ }
105
+ end
106
+ # Standard Error for coefficients
107
+ def coeffs_se
108
+ out={}
109
+ mse=sse.quo(df_e)
110
+ coeffs.each {|k,v|
111
+ out[k]=Math::sqrt(mse/(@ds[k].sum_of_squares*tolerance(k)))
112
+ }
113
+ out
114
+ end
115
+ # Estimated Variance-Covariance Matrix
116
+ # Used for calculation of se of constant
117
+ def estimated_variance_covariance_matrix
118
+ mse_p=mse
119
+ columns=[]
120
+ @ds_valid.each_vector{|k,v|
121
+ columns.push(v.data) unless k==@y_var
122
+ }
123
+ columns.unshift([1.0]*@ds_valid.cases)
124
+ x=Matrix.columns(columns)
125
+ matrix=((x.t*x)).inverse * mse
126
+ matrix.collect {|i|
127
+ Math::sqrt(i) if i>0
128
+ }
129
+ end
130
+ # T for constant
131
+ def constant_t
132
+ constant.to_f/constant_se
133
+ end
134
+ # Standard error for constant
135
+ def constant_se
136
+ estimated_variance_covariance_matrix[0,0]
137
+ end
138
+ # Retrieves a summary for Regression
139
+ def summary(report_type=ConsoleSummary)
140
+ c=coeffs
141
+ out=""
142
+ out.extend report_type
143
+ out.add <<HEREDOC
144
+ Summary for regression of #{@fields.join(',')} over #{@y_var}
145
+ *************************************************************
146
+ Engine: #{self.class}
147
+ Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
148
+ r=#{sprintf("%0.3f",r)}
149
+ r2=#{sprintf("%0.3f",r2)}
150
+ Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
151
+ HEREDOC
152
+
153
+ out.add_line
154
+ out.add "ANOVA TABLE"
155
+
156
+ t=Statsample::ReportTable.new(%w{source ss df ms f s})
157
+ t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f", significance)])
158
+ t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
159
+
160
+ t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
161
+
162
+ out.parse_table(t)
163
+
164
+ begin
165
+ out.add "Beta coefficientes"
166
+ sc=standarized_coeffs
167
+ cse=coeffs_se
168
+ t=Statsample::ReportTable.new(%w{coeff b beta se t})
169
+ t.add_row(["Constant", sprintf("%0.3f", constant), "-", sprintf("%0.3f", constant_se), sprintf("%0.3f", constant_t)])
170
+ @fields.each{|f|
171
+ t.add_row([f, sprintf("%0.3f", c[f]), sprintf("%0.3f", sc[f]), sprintf("%0.3f", cse[f]), sprintf("%0.3f", c[f].quo(cse[f]))])
172
+ }
173
+ out.parse_table(t)
174
+
175
+ rescue
176
+ end
177
+ out
178
+ end
179
+ def assign_names(c)
180
+ a={}
181
+ @fields.each_index {|i|
182
+ a[@fields[i]]=c[i]
183
+ }
184
+ a
185
+ end
186
+
187
+
188
+ # Deprecated
189
+ # Sum of squares of error (manual calculation)
190
+ # using the predicted value minus the y_i value
191
+ def sse_manual
192
+ pr=predicted
193
+ cases=0
194
+ sse=(0...@ds.cases).inject(0) {|a,i|
195
+ if !@dy.data_with_nils[i].nil? and !pr[i].nil?
196
+ cases+=1
197
+ a+((pr[i]-@dy[i])**2)
198
+ else
199
+ a
200
+ end
201
+ }
202
+ sse*(min_n_valid-1.0).quo(cases-1)
203
+ end
204
+ # Sum of squares of regression
205
+ # using the predicted value minus y mean
206
+ def ssr_direct
207
+ mean=@dy.mean
208
+ cases=0
209
+ ssr=(0...@ds.cases).inject(0) {|a,i|
210
+ invalid=false
211
+ v=@dep_columns.collect{|c| invalid=true if c[i].nil?; c[i]}
212
+ if !invalid
213
+ cases+=1
214
+ a+((process(v)-mean)**2)
215
+ else
216
+ a
217
+ end
218
+ }
219
+ ssr
220
+ end
221
+ def sse_direct
222
+ sst-ssr
223
+ end
224
+ def process(v)
225
+ c=coeffs
226
+ total=constant
227
+ @fields.each_index{|i|
228
+ total+=c[@fields[i]]*v[i]
229
+ }
230
+ total
231
+ end
232
+ end
233
+ end
234
+ end
235
+ end