statsample 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,259 @@
1
+ module Statsample
2
+ module Regression
3
+ # Module for Multiple Regression Analysis
4
+ # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise, or instantiate the engines directly.
5
+ # Example.
6
+ #
7
+ # require 'statsample'
8
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
9
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
10
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
11
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
12
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
13
+ # lr=Statsample::Regression::Multiple.listwise(ds,'y')
14
+ # #<Statsample::Regression::Multiple::AlglibEngine:0x7f21912e4758 @ds_valid=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @lr=#<Alglib::LinearRegression:0x7f21912df118 @model=#<Alglib_ext::LinearModel:0x7f21912df708>, @ivars=3, @cases=10, @report=#<Alglib_ext::LrReport:0x7f21912df168>>, @y_var="y", @ds=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @fields=["a", "b", "c"], @lr_s=nil, @dep_columns=[[1, 3, 2, 4, 3, 5, 4, 6, 5, 7], [3, 3, 4, 4, 5, 5, 6, 6, 4, 4], [11, 22, 30, 40, 50, 65, 78, 79, 99, 100]], @ds_indep=#<Statsample::Dataset:69891073180060 @fields=[a,b,c] labels={"a"=>nil, "b"=>nil, "c"=>nil} cases=10, @dy=Vector(type:scale, n:10)[3,4,5,6,7,8,9,10,20,30]>
15
+
16
+
17
+ module Multiple
18
# Builds a regression engine that handles missing data listwise.
#
# When HAS_ALGIB is truthy an AlglibEngine is returned for +ds+ as given;
# otherwise a RubyEngine is built on a copy of +ds+ that keeps only the
# valid cases (<tt>ds.dup_only_valid</tt>).
#
# ds::    dataset holding the predictors and the criterion
# y_var:: name of the criterion field
#
#   lr=Statsample::Regression::Multiple.listwise(ds,'y')
def self.listwise(ds, y_var)
  return AlglibEngine.new(ds, y_var) if HAS_ALGIB

  only_valid = ds.dup_only_valid
  RubyEngine.new(only_valid, y_var)
end
29
+
30
# Creates an object for pairwise regression.
# For now, always retrieves a RubyEngine, built on +ds+ exactly as given
# (no cases are dropped here).
#
# ds::    dataset holding the predictors and the criterion
# y_var:: name of the criterion field
#
#   lr=Statsample::Regression::Multiple.pairwise(ds,'y')
def self.pairwise(ds, y_var)
  RubyEngine.new(ds, y_var)
end
36
+
37
# Base class for Multiple Regression Engines.
#
# Collects the statistics that are derived from a fitted model rather than
# from a particular fitting algorithm: sums of squares, mean squares, the
# ANOVA F test, standard errors and t values, and a printable summary.
#
# This class reads but never assigns <tt>@dep_columns</tt>, <tt>@fields</tt>,
# <tt>@ds_valid</tt> and <tt>@dy</tt>, and calls +r2+, +coeffs+, +constant+
# and +process+ without defining them; a concrete engine has to supply all
# of these (plus +r+ and +sst+, which raise here).
class BaseEngine
  # ds::    dataset holding the predictors and the criterion
  # y_var:: name of the criterion field inside +ds+
  def initialize(ds, y_var)
    @ds = ds
    @y_var = y_var
    @r2 = nil
  end

  # Retrieves a vector with predicted values for y.
  # A case with a nil on any predictor column gets nil.
  def predicted
    (0...@ds.cases).collect { |i|
      row = predictor_row(i)
      row.nil? ? nil : process(row)
    }.to_vector(:scale)
  end

  # Retrieves a vector with standarized values for y
  def standarized_predicted
    predicted.standarized
  end

  # Retrieves a vector with residuals values for y (observed - predicted).
  # A case gets nil when any predictor or the observed y is nil.
  def residuals
    (0...@ds.cases).collect { |i|
      row = predictor_row(i)
      next nil if row.nil?
      observed = @ds[@y_var][i]
      observed.nil? ? nil : observed - process(row)
    }.to_vector(:scale)
  end

  # R Multiple. Must be provided by the concrete engine.
  def r
    raise "You should implement this"
  end

  # Sum of squares Total. Must be provided by the concrete engine.
  def sst
    raise "You should implement this"
  end

  # Sum of squares (regression)
  def ssr
    r2 * sst
  end

  # Sum of squares (Error)
  def sse
    sst - ssr
  end

  # T values for coeffs: each coefficient divided by its standard error.
  # Returns a hash keyed like +coeffs+.
  def coeffs_t
    se = coeffs_se
    coeffs.inject({}) { |out, (name, value)|
      out[name] = value / se[name]
      out
    }
  end

  # Mean square Regression
  def msr
    ssr.quo(df_r)
  end

  # Mean Square Error
  def mse
    sse.quo(df_e)
  end

  # Degrees of freedom for regression (number of predictor columns)
  def df_r
    @dep_columns.size
  end

  # Degrees of freedom for error
  def df_e
    @ds_valid.cases - @dep_columns.size - 1
  end

  # Fisher for Anova (mean square regression over mean square error)
  def f
    msr.quo(mse)
  end

  # Significance of Fisher.
  # Raises a RuntimeError ("Need Ruby/GSL") when HAS_GSL is falsy.
  def significance
    raise "Need Ruby/GSL" unless HAS_GSL
    GSL::Cdf.fdist_Q(f, df_r, df_e)
  end

  # Tolerance for a given variable: 1 - r2 of a regression that uses +var+
  # as criterion and the remaining predictor columns as predictors.
  # http://talkstats.com/showthread.php?t=5056
  def tolerance(var)
    vectors = {}
    assign_names(@dep_columns).each { |name, column|
      vectors[name] = column.to_vector(:scale)
    }
    if HAS_ALGIB
      lr_class = AlglibEngine
      ds = vectors.to_dataset
    else
      lr_class = RubyEngine
      ds = vectors.to_dataset.dup_only_valid
    end
    1 - lr_class.new(ds, var).r2
  end

  # Tolerances for each coefficient, as a hash keyed by field name
  def coeffs_tolerances
    @fields.inject({}) { |out, field|
      out[field] = tolerance(field)
      out
    }
  end

  # Standard Error for coefficients:
  # sqrt(mse / (sum of squares of the predictor * its tolerance)).
  # Note that +tolerance+ fits one auxiliary regression per coefficient.
  def coeffs_se
    mean_square_error = mse
    out = {}
    coeffs.each { |name, _value|
      out[name] = Math::sqrt(mean_square_error / (@ds[name].sum_of_squares * tolerance(name)))
    }
    out
  end

  # Estimated Variance-Covariance Matrix
  # Used for calculation of se of constant.
  #
  # The returned matrix holds the square root of every positive cell of
  # (X'X)^-1 * mse, and nil in every cell that is zero or negative.
  def estimated_variance_covariance_matrix
    mean_square_error = mse
    columns = []
    @ds_valid.each_vector { |name, vector|
      columns.push(vector.data) unless name == @y_var
    }
    # Leading column of ones stands for the constant term.
    columns.unshift([1.0] * @ds_valid.cases)
    x = Matrix.columns(columns)
    scaled = (x.t * x).inverse * mean_square_error
    scaled.collect { |cell| Math::sqrt(cell) if cell > 0 }
  end

  # T for constant
  def constant_t
    constant.to_f / constant_se
  end

  # Standard error for constant (cell [0,0] of the matrix above)
  def constant_se
    estimated_variance_covariance_matrix[0, 0]
  end

  # Retrieves a summary for Regression.
  #
  # report_type:: module mixed into the output string; it has to provide
  #               +add+, +add_line+ and +parse_table+, which are called here.
  def summary(report_type = ConsoleSummary)
    c = coeffs
    out = ""
    out.extend report_type
    fmt = lambda { |number| sprintf("%0.3f", number) }
    header_lines = [
      "Summary for regression of #{@fields.join(',')} over #{@y_var}",
      "*************************************************************",
      "Engine: #{self.class}",
      "Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})",
      "r=#{fmt.call(r)}",
      "r2=#{fmt.call(r2)}",
      "Equation=#{fmt.call(constant)}+#{@fields.collect { |k| sprintf("%0.3f%s", c[k], k) }.join(' + ')}"
    ]
    out.add(header_lines.join("\n") + "\n")

    out.add_line
    out.add "ANOVA TABLE"

    t = Statsample::ReportTable.new(%w{source ss df ms f s})
    t.add_row(["Regression", fmt.call(ssr), df_r, fmt.call(msr), fmt.call(f), fmt.call(significance)])
    t.add_row(["Error", fmt.call(sse), df_e, fmt.call(mse)])
    t.add_row(["Total", fmt.call(sst), df_r + df_e])

    out.parse_table(t)
    out
  end

  # Builds a hash that maps the i-th field name to <tt>c[i]</tt>.
  # +c+ only needs to answer to integer indexing.
  def assign_names(c)
    names = {}
    @fields.each_index { |i| names[@fields[i]] = c[i] }
    names
  end

  # Deprecated
  # Sum of squares of error (manual calculation)
  # using the predicted value minus the y_i value.
  # Calls +min_n_valid+, which this class does not define.
  def sse_manual
    pr = predicted
    used = 0
    squared_error = 0
    (0...@ds.cases).each { |i|
      next if @dy.data_with_nils[i].nil? || pr[i].nil?
      used += 1
      squared_error += (pr[i] - @dy[i])**2
    }
    squared_error * (min_n_valid - 1.0).quo(used - 1)
  end

  # Sum of squares of regression
  # using the predicted value minus y mean.
  # Cases with a nil on any predictor are skipped.
  def ssr_direct
    mean = @dy.mean
    (0...@ds.cases).inject(0) { |sum, i|
      row = predictor_row(i)
      row.nil? ? sum : sum + ((process(row) - mean)**2)
    }
  end

  # Sum of squares of error obtained as sst - ssr
  def sse_direct
    sst - ssr
  end

  private

  # Values of every predictor column for case +i+, in column order,
  # or nil when at least one of them is nil.
  def predictor_row(i)
    row = @dep_columns.collect { |column| column[i] }
    row.any? { |value| value.nil? } ? nil : row
  end
end
257
+ end
258
+ end
259
+ end
@@ -0,0 +1,117 @@
1
+ if HAS_ALGIB
2
+ module Statsample
3
+ module Regression
4
+ module Multiple
5
# Class for Multiple Regression Analysis.
# Requires the Alglib gem and uses a listwise approach: the model is fitted
# on a copy of the dataset that keeps only the valid cases.
# If you need pairwise, use RubyEngine.
#
# Example:
#
#   @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
#   @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
#   @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
#   @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
#   ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
#   lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,'y')
#
class AlglibEngine < BaseEngine
  # ds::    dataset holding the predictors and the criterion
  # y_var:: name of the criterion field inside +ds+
  #
  # Does not call BaseEngine#initialize; <tt>@ds</tt> is the valid-cases copy,
  # not the dataset passed in.
  def initialize(ds, y_var)
    @ds = ds.dup_only_valid
    @ds_valid = @ds
    @y_var = y_var
    @dy = @ds[@y_var]
    # Built from the dataset as passed in, not from the valid-cases copy.
    @ds_indep = ds.dup(ds.fields - [y_var])
    # Create a custom matrix: one column per predictor, criterion last.
    @fields = @ds.fields.reject { |f| f == @y_var }
    @dep_columns = @fields.collect { |f| @ds[f].to_a }
    matrix = Matrix.columns(@dep_columns + [@ds[@y_var]])
    @lr_s = nil
    @lr = ::Alglib::LinearRegression.build_from_matrix(matrix)
  end

  # Marshal hook: only the dataset and the criterion name are serialized.
  def _dump(i)
    Marshal.dump({'ds' => @ds, 'y_var' => @y_var})
  end

  # Marshal hook: rebuilds the engine (and so refits the model) from the
  # dumped dataset and criterion name. As with any Marshal data, only load
  # input that comes from a trusted source.
  def self._load(data)
    h = Marshal.load(data)
    self.new(h['ds'], h['y_var'])
  end

  # Coefficients of the fitted model, as a hash keyed by field name
  def coeffs
    assign_names(@lr.coeffs)
  end

  # Coefficients using a constant
  # Based on http://www.xycoon.com/ols1.htm
  #
  # Returns the column matrix (X'X)^-1 X'y, where X has a leading column
  # of ones followed by the predictor columns converted to floats.
  def matrix_resolution
    columns = @dep_columns.map { |xi| xi.map { |i| i.to_f } }
    columns.unshift([1.0] * @ds.cases)
    y = Matrix.columns([@dy.data.map { |i| i.to_f }])
    x = Matrix.columns(columns)
    xt = x.t
    ((xt * x).inverse * xt) * y
  end

  # Squared multiple correlation
  def r2
    r**2
  end

  # Multiple correlation: Pearson correlation between y and predicted y
  def r
    Bivariate::pearson(@dy, predicted)
  end

  # Sum of squares Total
  def sst
    @dy.ss
  end

  # Constant of the fitted model
  def constant
    @lr.constant
  end

  # Coefficients of the model fitted on the standarized dataset,
  # as a hash keyed by field name
  def standarized_coeffs
    assign_names(lr_s.coeffs)
  end

  # Model fitted on the standarized dataset; built on first use.
  def lr_s
    build_standarized if @lr_s.nil?
    @lr_s
  end

  # Standarizes the dataset and fits a second model on it.
  def build_standarized
    @ds_s = @ds.standarize
    predictor_fields = @ds_s.fields.reject { |f| f == @y_var }
    @dep_columns_s = predictor_fields.collect { |f| @ds_s[f].to_a }
    matrix = Matrix.columns(@dep_columns_s + [@ds_s[@y_var]])
    @lr_s = ::Alglib::LinearRegression.build_from_matrix(matrix)
  end

  # Predicted y for one case; +v+ holds the predictor values in field order.
  def process(v)
    @lr.process(v)
  end

  # Same as +process+, using the model fitted on the standarized dataset.
  def process_s(v)
    lr_s.process(v)
  end

  # Residuals divided by their standard deviation.
  # ???? Not equal to SPSS output
  def standarized_residuals
    res = residuals
    # Reuse the vector computed above instead of predicting every case again.
    red_sd = res.sds
    res.collect { |v|
      v.quo(red_sd)
    }.to_vector(:scale)
  end
end
111
+ end
112
+ end
113
+ end # for Statsample
114
+ end # for if
115
+
116
+
117
+
@@ -0,0 +1,140 @@
1
+ module Statsample
2
+ module Regression
3
+ module Multiple
4
# Pure Ruby Class for Multiple Regression Analysis.
# Slower than AlglibEngine, but it is pure ruby and uses a pairwise approach
# for missing values: the model is derived from correlation matrices.
# If you need a listwise approach for missing values, use AlglibEngine,
# because it is faster.
#
# Example:
#
#   @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
#   @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
#   @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
#   @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
#   ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
#   lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
class RubyEngine < BaseEngine
  # ds::    dataset holding the predictors and the criterion
  # y_var:: name of the criterion field inside +ds+
  def initialize(ds, y_var)
    super
    @dy = ds[@y_var]
    @ds_valid = ds.dup_only_valid
    @ds_indep = ds.dup(ds.fields - [y_var])
    @fields = @ds_indep.fields
    set_dep_columns
    obtain_y_vector
    @matrix_x = Bivariate.correlation_matrix(@ds_indep)
    # Standarized coefficients: inverse of the predictor correlation matrix
    # times the predictor/criterion correlations.
    @coeffs_stan = (@matrix_x.inverse * @matrix_y).column(0).to_a
    @min_n_valid = nil
  end

  # Smallest cell of Bivariate::n_valid_matrix for the dataset, never larger
  # than the number of cases. Computed once and cached.
  def min_n_valid
    if @min_n_valid.nil?
      smallest = @ds.cases
      counts = Bivariate::n_valid_matrix(@ds)
      (0...counts.row_size).each { |row|
        (0...counts.column_size).each { |col|
          smallest = counts[row, col] if counts[row, col] < smallest
        }
      }
      @min_n_valid = smallest
    end
    @min_n_valid
  end

  # Refreshes @dep_columns from the predictor dataset, nils included.
  def set_dep_columns
    @dep_columns = []
    @ds_indep.each_vector { |_name, vector|
      @dep_columns.push(vector.data_with_nils)
    }
  end

  # Sum of square total. Recomputed on every call (not memoized).
  def sst
    @sst = @dy.variance * (min_n_valid - 1.0)
    @sst
  end

  # Squared multiple correlation, cached after the first call.
  def r2
    if @r2.nil?
      criterion_correlations = @matrix_y
      predictor_correlations = obtain_predictor_matrix
      product = (criterion_correlations.t * predictor_correlations.inverse * criterion_correlations)
      @r2 = product[0, 0]
    end
    @r2
  end

  # Multiple correlation
  def r
    Math::sqrt(r2)
  end

  # Degrees of freedom for error, based on +min_n_valid+
  def df_e
    min_n_valid - @dep_columns.size - 1
  end

  # For every case with exactly one missing predictor, replaces the missing
  # value with the mean of that field, then refreshes the predictor columns.
  def fix_with_mean
    position = 0
    @ds_indep.each { |row|
      missing = missing_fields(row)
      if missing.size == 1
        @ds_indep[missing[0]][position] = @ds[missing[0]].mean
      end
      position += 1
    }
    @ds_indep.update_valid_data
    set_dep_columns
  end

  # For every case with exactly one missing predictor, replaces the missing
  # value with the one predicted from the other predictors, then refreshes
  # the predictor columns.
  def fix_with_regression
    position = 0
    @ds_indep.each { |row|
      missing = missing_fields(row)
      if missing.size == 1
        field = missing[0]
        # NOTE(review): MultipleRegression is not defined in the code visible
        # here - confirm it still exists somewhere in the library.
        lr = MultipleRegression.new(@ds_indep, field)
        others = @ds_indep.fields.reject { |f| f == field }.collect { |f| row[f] }
        @ds_indep[field][position] = lr.process(others)
      end
      position += 1
    }
    @ds_indep.update_valid_data
    set_dep_columns
  end

  # Stores in @matrix_y a one-column matrix with the Pearson correlation
  # between the criterion and each predictor.
  def obtain_y_vector
    correlations = @ds_indep.fields.collect { |f|
      Bivariate.pearson(@dy, @ds_indep[f])
    }
    @matrix_y = Matrix.columns([correlations])
  end

  # Correlation matrix of the predictors
  def obtain_predictor_matrix
    Bivariate::correlation_matrix(@ds_indep)
  end

  # Constant of the equation: mean of y minus the sum of each coefficient
  # times the mean of its predictor.
  def constant
    c = coeffs
    weighted_means = @fields.inject(0) { |a, k| a + (c[k] * @ds_indep[k].mean) }
    @dy.mean - weighted_means
  end

  # Predicted y for one case; +v+ holds the predictor values in field order.
  def process(v)
    c = coeffs
    total = constant
    @fields.each_index { |i|
      total += c[@fields[i]] * v[i]
    }
    total
  end

  # Unstandarized coefficients, as a hash keyed by field name:
  # standarized coefficient * sd(y) / sd(predictor).
  def coeffs
    sc = standarized_coeffs
    raw = @fields.collect { |f|
      (sc[f] * @dy.sds).quo(@ds_indep[f].sds)
    }
    assign_names(raw)
  end

  # Standarized coefficients, as a hash keyed by field name
  def standarized_coeffs
    assign_names(@coeffs_stan)
  end

  private

  # Names of the fields whose value is nil in +row+.
  def missing_fields(row)
    names = []
    row.each { |name, value|
      names.push(name) if value.nil?
    }
    names
  end
end
138
+ end
139
+ end
140
+ end