statsample 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ module Statsample
2
+ module Regression
3
+ # Module for Multiple Regression Analysis
4
+ # You can call Regression::Multiple.listwise or Regression::Multiple.pairwise, or instantiate the engines directly.
5
+ # Example.
6
+ #
7
+ # require 'statsample'
8
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
9
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
10
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
11
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
12
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
13
+ # lr=Statsample::Regression::Multiple.listwise(ds,'y')
14
+ # #<Statsample::Regression::Multiple::AlglibEngine:0x7f21912e4758 @ds_valid=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @lr=#<Alglib::LinearRegression:0x7f21912df118 @model=#<Alglib_ext::LinearModel:0x7f21912df708>, @ivars=3, @cases=10, @report=#<Alglib_ext::LrReport:0x7f21912df168>>, @y_var="y", @ds=#<Statsample::Dataset:69891073182680 @fields=[a,b,c,y] labels={"a"=>nil, "b"=>nil, "y"=>nil, "c"=>nil} cases=10, @fields=["a", "b", "c"], @lr_s=nil, @dep_columns=[[1, 3, 2, 4, 3, 5, 4, 6, 5, 7], [3, 3, 4, 4, 5, 5, 6, 6, 4, 4], [11, 22, 30, 40, 50, 65, 78, 79, 99, 100]], @ds_indep=#<Statsample::Dataset:69891073180060 @fields=[a,b,c] labels={"a"=>nil, "b"=>nil, "c"=>nil} cases=10, @dy=Vector(type:scale, n:10)[3,4,5,6,7,8,9,10,20,30]>
15
+
16
+
17
+ module Multiple
18
# Creates a regression object for listwise analysis of +y_var+ over +ds+.
# When HAS_ALGIB is truthy an AlglibEngine is built directly on +ds+;
# otherwise a RubyEngine is built on the result of ds.dup_only_valid.
#   lr=Statsample::Regression::Multiple.listwise(ds,'y')
def self.listwise(ds, y_var)
  return AlglibEngine.new(ds, y_var) if HAS_ALGIB
  RubyEngine.new(ds.dup_only_valid, y_var)
end
29
+
30
# Creates a regression object for pairwise analysis of +y_var+ over +ds+.
# For now this always returns a RubyEngine.
#   lr=Statsample::Regression::Multiple.pairwise(ds,'y')
def self.pairwise(ds, y_var)
  RubyEngine.new(ds, y_var)
end
36
+
37
+ # Base class for Multiple Regression Engines
38
+ class BaseEngine
39
# Stores the dataset and the name of the dependent variable.
# The @r2 slot starts out as nil.
def initialize(ds, y_var)
  @ds, @y_var = ds, y_var
  @r2 = nil
end
44
+
45
# Retrieves a :scale vector with the predicted value of y for each case
# index in 0...@ds.cases. A case yields nil when any entry of
# @dep_columns is nil at that index; otherwise its values go to #process.
def predicted
  estimates = []
  @ds.cases.times do |idx|
    row = @dep_columns.collect { |column| column[idx] }
    estimates << (row.any? { |value| value.nil? } ? nil : process(row))
  end
  estimates.to_vector(:scale)
end
57
# Retrieves the predicted values of y after calling #standarized on them.
def standarized_predicted
  raw = predicted
  raw.standarized
end
61
# Retrieves a :scale vector with the residual (observed y minus the value
# returned by #process) for each case. A case yields nil when the observed
# y or any entry of @dep_columns is nil at that index.
def residuals
  diffs = (0...@ds.cases).map do |idx|
    row = @dep_columns.map { |column| column[idx] }
    actual = @ds[@y_var][idx]
    if actual.nil? || row.any? { |value| value.nil? }
      nil
    else
      actual - process(row)
    end
  end
  diffs.to_vector(:scale)
end
73
# R Multiple. Placeholder in the base class: always raises, so concrete
# engines are expected to provide their own version.
def r
  raise RuntimeError, "You should implement this"
end
77
# Sum of squares Total. Placeholder in the base class: always raises, so
# concrete engines are expected to provide their own version.
def sst
  raise RuntimeError, "You should implement this"
end
81
# Sum of squares (regression): r2 multiplied by the total sum of squares.
def ssr
  explained = r2
  explained * sst
end
85
# Sum of squares (error): total sum of squares minus the regression part.
def sse
  total = sst
  total - ssr
end
89
# T values for coeffs: each coefficient divided by its standard error.
# Returns a Hash keyed like #coeffs.
def coeffs_t
  errors = coeffs_se
  coeffs.inject({}) do |t_values, (name, value)|
    t_values[name] = value / errors[name]
    t_values
  end
end
98
# Mean square (regression): ssr divided by the regression degrees of freedom.
def msr
  regression_ss = ssr
  regression_ss.quo(df_r)
end
102
# Mean square (error): sse divided by the error degrees of freedom.
def mse
  error_ss = sse
  error_ss.quo(df_e)
end
106
# Degrees of freedom for regression: the number of entries in @dep_columns.
def df_r
  @dep_columns.length
end
110
# Degrees of freedom for error: cases in @ds_valid, minus the number of
# entries in @dep_columns, minus one.
def df_e
  @ds_valid.cases - (@dep_columns.length + 1)
end
114
# Fisher's F for the ANOVA: mean square (regression) over mean square (error).
def f
  msr.quo(mse)
end
118
# Significance of Fisher's F, taken from GSL::Cdf.fdist_Q(f, df_r, df_e).
# Raises a RuntimeError when HAS_GSL is falsy.
def significance
  raise "Need Ruby/GSL" unless HAS_GSL
  GSL::Cdf.fdist_Q(f, df_r, df_e)
end
126
# Tolerance for a given variable: 1 minus the r2 of a regression in which
# +var+ is the dependent variable of a dataset built from @dep_columns.
# http://talkstats.com/showthread.php?t=5056
# The engine follows HAS_ALGIB: AlglibEngine on the dataset as built, or
# RubyEngine on that dataset's dup_only_valid.
def tolerance(var)
  vectors = assign_names(@dep_columns)
  vectors.keys.each do |name|
    vectors[name] = vectors[name].to_vector(:scale)
  end
  engine = HAS_ALGIB ? AlglibEngine : RubyEngine
  dataset = vectors.to_dataset
  dataset = dataset.dup_only_valid unless HAS_ALGIB
  1 - engine.new(dataset, var).r2
end
143
# Tolerances for each coefficient: a Hash mapping every name in @fields
# to the value of #tolerance for it.
def coeffs_tolerances
  result = {}
  @fields.each { |field| result[field] = tolerance(field) }
  result
end
150
# Standard error for coefficients. For each key of #coeffs the value is
# sqrt(MSE / (sum of squares of that field in @ds * its tolerance)),
# where MSE is sse over df_e. Returns a Hash keyed like #coeffs.
def coeffs_se
  error_ms = sse.quo(df_e)
  coeffs.inject({}) do |errors, (name, _value)|
    denominator = @ds[name].sum_of_squares * tolerance(name)
    errors[name] = Math.sqrt(error_ms / denominator)
    errors
  end
end
159
# Estimated Variance-Covariance Matrix.
# Used for calculation of se of constant.
#
# Builds X from a leading column of 1.0 plus the data of every vector in
# @ds_valid except @y_var, then computes (X'X)^-1 * MSE. Note that what is
# returned is NOT the raw matrix: each positive entry is replaced by its
# square root and every non-positive entry becomes nil.
def estimated_variance_covariance_matrix
  # Compute MSE once; the earlier version stored it in an unused local and
  # then called #mse a second time for the multiplication.
  mse_p = mse
  columns = []
  @ds_valid.each_vector { |k, v| columns.push(v.data) unless k == @y_var }
  columns.unshift([1.0] * @ds_valid.cases)
  x = Matrix.columns(columns)
  matrix = (x.t * x).inverse * mse_p
  matrix.collect { |i| Math::sqrt(i) if i > 0 }
end
175
# T for constant: the constant (as a Float) divided by its standard error.
def constant_t
  numerator = constant.to_f
  numerator / constant_se
end
179
# Standard error for constant: the [0,0] entry of
# #estimated_variance_covariance_matrix.
def constant_se
  vcov = estimated_variance_covariance_matrix
  vcov[0, 0]
end
183
+ # Retrieves a summary for Regression
184
# Builds and returns a textual summary of the regression.
# report_type:: module the output String is extended with (default
#               ConsoleSummary); presumably it supplies #add, #add_line
#               and #parse_table used below -- TODO confirm.
# The header lists the fields, engine class, case counts, r, r2 and the
# fitted equation; it is followed by an ANOVA table.
+ def summary(report_type=ConsoleSummary)
184
+ c=coeffs
185
+ out=""
186
+ out.extend report_type
187
+ out.add <<HEREDOC
188
+ Summary for regression of #{@fields.join(',')} over #{@y_var}
189
+ *************************************************************
190
+ Engine: #{self.class}
191
+ Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
192
+ r=#{sprintf("%0.3f",r)}
193
+ r2=#{sprintf("%0.3f",r2)}
194
+ Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}
195
+ HEREDOC
196
+
197
+ out.add_line
198
+ out.add "ANOVA TABLE"
199
+
200
# ANOVA table: one row each for regression, error and total.
+ t=Statsample::ReportTable.new(%w{source ss df ms f s})
201
# The regression row calls #significance, which raises without Ruby/GSL.
+ t.add_row(["Regression", sprintf("%0.3f",ssr), df_r, sprintf("%0.3f",msr), sprintf("%0.3f",f), sprintf("%0.3f",significance)])
202
+
203
+ t.add_row(["Error", sprintf("%0.3f",sse), df_e, sprintf("%0.3f",mse)])
204
+
205
+ t.add_row(["Total", sprintf("%0.3f",sst), df_r+df_e])
206
+
207
+ out.parse_table(t)
208
+ out
209
+ end
211
# Pairs each name in @fields with the element of +c+ at the same index
# and returns the result as a Hash (name => value).
def assign_names(c)
  (0...@fields.size).inject({}) do |named, position|
    named[@fields[position]] = c[position]
    named
  end
end
218
+
219
+
220
# Deprecated.
# Sum of squares of error (manual calculation), using the predicted value
# minus the y_i value. Only cases where both the observed y and the
# prediction are non-nil are accumulated; the sum is then rescaled by
# (min_n_valid - 1) / (number of accumulated cases - 1).
def sse_manual
  fitted = predicted
  used = 0
  total = 0
  @ds.cases.times do |idx|
    next if @dy.data_with_nils[idx].nil? || fitted[idx].nil?
    used += 1
    total += (fitted[idx] - @dy[idx])**2
  end
  total * (min_n_valid - 1.0).quo(used - 1)
end
236
# Sum of squares of regression, using the predicted value minus y mean.
# Cases with a nil in any entry of @dep_columns are skipped.
# (The earlier version also kept a case counter and a temporary that were
# never read; both are gone.)
def ssr_direct
  mean = @dy.mean
  (0...@ds.cases).inject(0) do |sum, i|
    row = @dep_columns.collect { |column| column[i] }
    if row.any? { |value| value.nil? }
      sum
    else
      sum + (process(row) - mean)**2
    end
  end
end
253
# Error sum of squares obtained as sst minus ssr.
def sse_direct
  total = sst
  total - ssr
end
256
+ end
257
+ end
258
+ end
259
+ end
@@ -0,0 +1,117 @@
1
+ if HAS_ALGIB
2
+ module Statsample
3
+ module Regression
4
+ module Multiple
5
+ # Class for Multiple Regression Analysis
6
+ # Requires the Alglib gem and uses a listwise approach.
7
+ # If you need pairwise, use RubyEngine
8
+ # Example:
9
+ #
10
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
11
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
12
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
13
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
14
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
15
+ # lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,'y')
16
+ #
17
+ class AlglibEngine < BaseEngine
18
# Prepares an Alglib linear regression of +y_var+ on every other field.
# @ds and @ds_valid both hold ds.dup_only_valid; @ds_indep is built from
# the dataset as passed in. @fields and @dep_columns follow the field
# order of @ds, leaving out +y_var+. The standarized model (@lr_s) is
# left nil here and built on demand.
def initialize(ds, y_var)
  @y_var = y_var
  @ds = ds.dup_only_valid
  @ds_valid = @ds
  @dy = @ds[@y_var]
  @ds_indep = ds.dup(ds.fields - [y_var])
  @fields = []
  @dep_columns = []
  @ds.fields.each do |name|
    next if name == @y_var
    @dep_columns << @ds[name].to_a
    @fields << name
  end
  # Matrix passed to Alglib: predictor columns first, y as the last column.
  columns = @dep_columns.dup
  columns << @ds[@y_var]
  @lr_s = nil
  @lr = ::Alglib::LinearRegression.build_from_matrix(Matrix.columns(columns))
end
39
+
40
+ def _dump(i)
41
+ Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
42
+ end
43
# Marshal hook. Rebuilds an engine from the Hash written by #_dump by
# calling new with the stored dataset and variable name.
# NOTE(review): Marshal.load must only be fed trusted data.
def self._load(data)
  state = Marshal.load(data)
  new(state['ds'], state['y_var'])
end
47
+
48
# Coefficients reported by the Alglib model, keyed by field name.
def coeffs
  raw = @lr.coeffs
  assign_names(raw)
end
51
# Coefficients using a constant.
# Based on http://www.xycoon.com/ols1.htm
#
# Solves (X'X)^-1 X'y, where X is a leading column of 1.0 followed by
# @dep_columns converted to Float. Returns a one-column Matrix: constant
# first, then one coefficient per predictor.
# (The earlier version began with an unused `mse_p=mse`, which ran the
# whole ANOVA chain for nothing; it has been removed.)
def matrix_resolution
  columns = @dep_columns.map { |xi| xi.map { |i| i.to_f } }
  columns.unshift([1.0] * @ds.cases)
  y = Matrix.columns([@dy.data.map { |i| i.to_f }])
  x = Matrix.columns(columns)
  xt = x.t
  ((xt * x).inverse * xt) * y
end
63
# Coefficient of determination: the square of #r.
def r2
  coefficient = r
  coefficient**2
end
66
# Multiple R: Pearson correlation between the observed y vector and the
# predicted values.
def r
  Bivariate.pearson(@dy, predicted)
end
69
# Total sum of squares, as reported by the y vector's #ss.
def sst
  dependent = @dy
  dependent.ss
end
72
# Constant term reported by the Alglib model.
def constant
  model = @lr
  model.constant
end
75
# Coefficients of the standarized model, keyed by field name.
def standarized_coeffs
  assign_names(lr_s.coeffs)
end
79
# Standarized model, built on first use via #build_standarized.
def lr_s
  build_standarized if @lr_s.nil?
  @lr_s
end
85
# Builds the regression model over @ds.standarize. Stores the standarized
# dataset in @ds_s, its predictor columns in @dep_columns_s and the model
# in @lr_s. The matrix handed to Alglib has y as its last column.
def build_standarized
  @ds_s = @ds.standarize
  @dep_columns_s = []
  @ds_s.fields.each do |name|
    next if name == @y_var
    @dep_columns_s << @ds_s[name].to_a
  end
  columns = @dep_columns_s.dup
  columns << @ds_s[@y_var]
  @lr_s = Alglib::LinearRegression.build_from_matrix(Matrix.columns(columns))
end
96
# Passes the predictor values +v+ to the Alglib model's #process and
# returns its result.
def process(v)
  model = @lr
  model.process(v)
end
99
# Passes the values +v+ to the standarized model's #process and returns
# its result.
def process_s(v)
  model = lr_s
  model.process(v)
end
102
# Residuals divided by their standard deviation (the vector's #sds),
# returned as a :scale vector.
# NOTE: flagged by the original author as not equal to SPSS output.
def standarized_residuals
  # Compute the residuals once and reuse them for both the deviation and
  # the scaling; the earlier version called #residuals twice.
  res = residuals
  res_sd = res.sds
  res.collect { |v| v.quo(res_sd) }.to_vector(:scale)
end
110
+ end
111
+ end
112
+ end
113
+ end # for Statsample
114
+ end # for if
115
+
116
+
117
+
@@ -0,0 +1,140 @@
1
+ module Statsample
2
+ module Regression
3
+ module Multiple
4
+ # Pure Ruby Class for Multiple Regression Analysis.
5
+ # Slower than AlglibEngine, but it is pure Ruby and uses a pairwise approach for missing values.
6
+ # If you need a listwise approach for missing values, use AlglibEngine, because it is faster.
7
+ #
8
+ # Example:
9
+ #
10
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
11
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
12
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
13
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
14
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
15
+ # lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
16
+
17
+ class RubyEngine < BaseEngine
18
# Sets up a pure Ruby regression of +y_var+ on every other field of +ds+.
# After the base initializer runs, this keeps the y vector (@dy), the
# dup_only_valid copy (@ds_valid) and the predictors-only dataset
# (@ds_indep), then derives the standarized coefficients from the
# predictor correlation matrix and the predictor/y correlations.
def initialize(ds, y_var)
  super
  @min_n_valid = nil
  @dy = ds[@y_var]
  @ds_valid = ds.dup_only_valid
  @ds_indep = ds.dup(ds.fields - [y_var])
  @fields = @ds_indep.fields
  set_dep_columns
  obtain_y_vector
  @matrix_x = Bivariate.correlation_matrix(@ds_indep)
  standarized = @matrix_x.inverse * @matrix_y
  @coeffs_stan = standarized.column(0).to_a
end
30
# Smallest entry of Bivariate.n_valid_matrix(@ds), capped at @ds.cases.
# The value is computed on first use and cached in @min_n_valid.
def min_n_valid
  return @min_n_valid unless @min_n_valid.nil?
  counts = Bivariate.n_valid_matrix(@ds)
  lowest = @ds.cases
  (0...counts.row_size).each do |row|
    (0...counts.column_size).each do |col|
      lowest = counts[row, col] if counts[row, col] < lowest
    end
  end
  @min_n_valid = lowest
end
43
# Rebuilds @dep_columns with the data_with_nils of every vector in
# @ds_indep, in the order each_vector yields them.
def set_dep_columns
  @dep_columns = []
  @ds_indep.each_vector do |_name, vector|
    @dep_columns << vector.data_with_nils
  end
end
49
# Sum of squares total: variance of y times (min_n_valid - 1).
# The result is stored in @sst but is recomputed on every call.
def sst
  spread = @dy.variance
  @sst = spread * (min_n_valid - 1.0)
end
56
# Coefficient of determination, computed as c' * Rxx^-1 * c, where c is
# @matrix_y and Rxx is #obtain_predictor_matrix. Cached in @r2.
def r2
  return @r2 unless @r2.nil?
  correlations = @matrix_y
  predictors = obtain_predictor_matrix
  @r2 = (correlations.t * predictors.inverse * correlations)[0, 0]
end
65
# Multiple R: the square root of #r2.
def r
  Math.sqrt(r2)
end
68
+
69
# Degrees of freedom for error: min_n_valid, minus the number of entries
# in @dep_columns, minus one.
def df_e
  min_n_valid - (@dep_columns.length + 1)
end
72
# For every row of @ds_indep with exactly one nil value, writes the mean
# of that field (taken from @ds) into @ds_indep at the row's position.
# Rows with zero or several nils are left alone. Afterwards the valid
# data of @ds_indep and @dep_columns are refreshed.
def fix_with_mean
  position = 0
  @ds_indep.each do |row|
    missing = []
    row.each { |name, value| missing << name if value.nil? }
    if missing.size == 1
      field = missing[0]
      @ds_indep[field][position] = @ds[field].mean
    end
    position += 1
  end
  @ds_indep.update_valid_data
  set_dep_columns
end
87
# For every row of @ds_indep with exactly one nil value, replaces that
# value with the prediction of a regression of the missing field on the
# remaining fields of @ds_indep. Rows with zero or several nils are left
# alone. Afterwards the valid data of @ds_indep and @dep_columns are
# refreshed.
#
# NOTE(review): the earlier version instantiated `MultipleRegression`, a
# constant that is not defined in the visible sources (it looks like a
# stale pre-refactor name); RubyEngine is used here instead -- confirm
# that no compatibility alias was intended.
def fix_with_regression
  i = 0
  @ds_indep.each do |row|
    empty = []
    row.each { |k, v| empty.push(k) if v.nil? }
    if empty.size == 1
      field = empty[0]
      lr = RubyEngine.new(@ds_indep, field)
      # Predictor values for this row, in field order, without the missing one.
      fields = []
      @ds_indep.fields.each { |f| fields.push(row[f]) unless f == field }
      @ds_indep[field][i] = lr.process(fields)
    end
    i += 1
  end
  @ds_indep.update_valid_data
  set_dep_columns
end
108
# Stores in @matrix_y a one-column Matrix holding the Pearson correlation
# between y and each field of @ds_indep, in field order.
def obtain_y_vector
  correlations = @ds_indep.fields.collect do |name|
    Bivariate.pearson(@dy, @ds_indep[name])
  end
  @matrix_y = Matrix.columns([correlations])
end
113
# Correlation matrix of the predictors dataset (@ds_indep), as returned
# by Bivariate.correlation_matrix.
def obtain_predictor_matrix
  Bivariate.correlation_matrix(@ds_indep)
end
116
# Constant term: the mean of y minus the sum, over @fields, of each
# coefficient times the mean of its predictor.
def constant
  slopes = coeffs
  y_mean = @dy.mean
  weighted = 0
  @fields.each { |name| weighted += slopes[name] * @ds_indep[name].mean }
  y_mean - weighted
end
120
# Predicted y for the predictor values +v+ (given in @fields order):
# the constant plus each coefficient times the matching entry of +v+.
def process(v)
  slopes = coeffs
  base = constant
  (0...@fields.size).inject(base) do |sum, position|
    sum + slopes[@fields[position]] * v[position]
  end
end
128
# Unstandarized coefficients keyed by field name: each standarized
# coefficient times the sds of y, divided by the sds of its predictor.
def coeffs
  betas = standarized_coeffs
  raw = @fields.map do |name|
    numerator = betas[name] * @dy.sds
    numerator.quo(@ds_indep[name].sds)
  end
  assign_names(raw)
end
134
# Standarized coefficients (@coeffs_stan) keyed by field name.
def standarized_coeffs
  values = @coeffs_stan
  assign_names(values)
end
137
+ end
138
+ end
139
+ end
140
+ end