statsample 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,522 @@
1
+ module Statsample
2
+ # module for regression methods
3
+ module Regression
4
# Simple linear regression of one variable on another: y = a + b*x.
#
# +new+ is private; create instances with one of the factory methods:
# * <tt>SimpleRegression.new_from_vectors(vx,vy)</tt>
# * <tt>SimpleRegression.new_from_gsl(gsl)</tt>
#
# Objects created with +new_from_gsl+ only carry the fitted parameters,
# so the methods that need the original data (+sse+, +ssr+, +sst+, +r+,
# +r2+ and +standard_error+) raise an error for them.
class SimpleRegression
  attr_accessor :a, :b, :cov00, :cov01, :covx1, :chisq, :status
  private_class_method :new

  class << self
    # Creates a regression from an array with the seven values
    # (a, b, cov00, cov01, covx1, chisq, status) expected by +init_gsl+.
    def new_from_gsl(ar)
      new(:init_gsl, *ar)
    end

    # Creates a regression fitting +vy+ on +vx+ (see +init_vectors+).
    def new_from_vectors(vx, vy)
      new(:init_vectors, vx, vy)
    end
  end

  # Dispatches to the initializer named +init_method+ with the
  # remaining arguments. Only reachable through the factory methods.
  def initialize(init_method, *argv)
    send(init_method, *argv)
  end

  # Predicted value of y for a given x
  def y(val_x)
    @a + @b * val_x
  end

  # Value of x that predicts the given y (inverse of +y+)
  def x(val_y)
    (val_y - @a) / @b.to_f
  end

  # Sum of square error
  def sse
    ensure_vectors
    (0...@vx.size).inject(0) { |acum, i|
      acum + ((@vy[i] - y(@vx[i]))**2)
    }
  end

  # Square root of sse divided by (n - 2)
  def standard_error
    Math::sqrt(sse / (@vx.size - 2).to_f)
  end

  # Sum of square regression
  def ssr
    ensure_vectors
    vy_mean = @vy.mean
    (0...@vx.size).inject(0) { |acum, i|
      acum + ((y(@vx[i]) - vy_mean)**2)
    }
  end

  # Sum of square total
  def sst
    ensure_vectors
    @vy.sum_of_squared_deviation
  end

  # Value of r
  def r
    ensure_vectors
    @b * (@vx.sds / @vy.sds)
  end

  # Value of r^2
  def r2
    r**2
  end

  # Fits the regression by least squares:
  # b = sum((x-mx)*(y-my)) / sum((x-mx)^2), a = my - b*mx.
  # The vectors are first passed through Statsample.only_valid
  # (presumably to drop cases with missing values - confirm there).
  def init_vectors(vx, vy)
    @vx, @vy = Statsample.only_valid(vx, vy)
    x_m = @vx.mean
    y_m = @vy.mean
    num = den = 0
    (0...@vx.size).each { |i|
      num += (@vx[i] - x_m) * (@vy[i] - y_m)
      den += (@vx[i] - x_m)**2
    }
    @b = num.to_f / den
    @a = y_m - @b * x_m
  end

  # Stores already fitted parameters. No data vectors are kept.
  def init_gsl(a, b, cov00, cov01, covx1, chisq, status)
    @a = a
    @b = b
    @cov00 = cov00
    @cov01 = cov01
    @covx1 = covx1
    @chisq = chisq
    @status = status
  end

  private

  # Fails with a clear message when the data vectors are not available,
  # instead of a NoMethodError on nil further down.
  def ensure_vectors
    if @vx.nil? or @vy.nil?
      raise "Data vectors not available: create the object with new_from_vectors"
    end
  end
end
80
+
81
+
82
# Base class with the calculations shared by the multiple regression
# classes.
#
# The methods here rely on the subclass providing +r+, +r2+, +sst+,
# +coeffs+, +constant+ and +process+, and on the instance variables
# <tt>@fields</tt> (names of the predictors), <tt>@dep_columns</tt>
# (one array of values per predictor, in the order of <tt>@fields</tt>),
# <tt>@ds_valid</tt> and <tt>@dy</tt> being set by the subclass.
class MultipleRegressionBase
  # ds:: dataset with the predictors and the criterion
  # y_var:: name of the criterion field
  def initialize(ds, y_var)
    @ds = ds
    @y_var = y_var
    @r2 = nil
  end

  # Returns a hash that maps every field name in <tt>@fields</tt>
  # to the element of +c+ in the same position
  def assign_names(c)
    names = {}
    @fields.each_index { |i|
      names[@fields[i]] = c[i]
    }
    names
  end

  # Vector with the predicted value for every case of the dataset.
  # Cases with a missing value on any predictor get nil.
  def predicted
    (0...@ds.cases).collect { |i|
      row = predictors_for_case(i)
      row.nil? ? nil : process(row)
    }.to_vector(:scale)
  end

  def standarized_predicted
    predicted.standarized
  end

  # Vector with observed minus predicted value for every case.
  # Cases with a missing predictor or a missing criterion get nil.
  def residuals
    (0...@ds.cases).collect { |i|
      row = predictors_for_case(i)
      if row.nil? or @ds[@y_var][i].nil?
        nil
      else
        @ds[@y_var][i] - process(row)
      end
    }.to_vector(:scale)
  end

  # Multiple correlation. Must be implemented by the subclass
  def r
    raise "You should implement this"
  end

  # Sum of squares total. Must be implemented by the subclass
  def sst
    raise "You should implement this"
  end

  # Sum of squares of regression
  def ssr
    r2 * sst
  end

  # Sum of squares of error
  def sse
    sst - ssr
  end

  # Hash with the t value (coefficient / standard error) of every coefficient
  def coeffs_t
    out = {}
    se = coeffs_se
    coeffs.each { |k, v|
      out[k] = v / se[k]
    }
    out
  end

  # Mean square error
  def mse
    sse / df_e
  end

  # Degrees of freedom of the regression
  def df_r
    @dep_columns.size
  end

  # Degrees of freedom of the error
  def df_e
    @ds_valid.cases - @dep_columns.size - 1
  end

  # Value of F: (ssr / df_r) / (sse / df_e)
  def f
    (ssr.quo(df_r)).quo(sse.quo(df_e))
  end

  # Significance of Fisher
  def significance
    if HAS_GSL
      GSL::Cdf.fdist_Q(f, df_r, df_e)
    else
      raise "Need Ruby/GSL"
    end
  end

  # Tolerance for a given variable: 1 - r2 of the regression of
  # +var+ on the other predictors
  # http://talkstats.com/showthread.php?t=5056
  def tolerance(var)
    ds = assign_names(@dep_columns)
    ds.each { |k, v|
      ds[k] = v.to_vector(:scale)
    }
    if HAS_ALGIB
      lr_class = ::Statsample::Regression::MultipleRegressionAlglib
      ds = ds.to_dataset
    else
      lr_class = MultipleRegressionPairwise
      ds = ds.to_dataset.dup_only_valid
    end
    lr = lr_class.new(ds, var)
    1 - lr.r2
  end

  # Hash with the tolerance of every predictor
  def coeffs_tolerances
    @fields.inject({}) { |a, field|
      a[field] = tolerance(field)
      a
    }
  end

  # Hash with the standard error of every coefficient:
  # sqrt(mse / (sum of squares of the predictor * its tolerance))
  def coeffs_se
    out = {}
    mse_value = sse.quo(df_e)
    coeffs.each { |k, v|
      out[k] = Math::sqrt(mse_value / (@ds[k].sum_of_squares * tolerance(k)))
    }
    out
  end

  # Builds X (a column of ones followed by the predictors of the valid
  # dataset), computes (X'X)^-1 * mse and returns a matrix with the
  # square root of every positive element. Non positive elements
  # become nil.
  def estimated_variance_covariance_matrix
    mse_p = mse
    columns = []
    @ds_valid.each_vector { |k, v|
      columns.push(v.data) unless k == @y_var
    }
    columns.unshift([1.0] * @ds_valid.cases)
    x = Matrix.columns(columns)
    # mse is computed once above and reused here
    matrix = (x.t * x).inverse * mse_p
    matrix.collect { |i|
      Math::sqrt(i) if i > 0
    }
  end

  # t value of the constant
  def constant_t
    constant.to_f / constant_se
  end

  # Standard error of the constant: element [0,0] of
  # +estimated_variance_covariance_matrix+
  def constant_se
    estimated_variance_covariance_matrix[0, 0]
  end

  # Adds a text summary of the regression to an empty string extended
  # with +report_type+, and returns whatever its +add+ method returns.
  def summary(report_type = ConsoleSummary)
    c = coeffs
    out = ""
    out.extend report_type
    out.add <<HEREDOC
Summary for regression of #{@fields.join(',')} over #{@y_var}
*************************************************************
Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
r=#{sprintf("%0.3f",r)}
r2=#{sprintf("%0.3f",r2)}
ssr=#{sprintf("%0.3f",ssr)}
sse=#{sprintf("%0.3f",sse)}
sst=#{sprintf("%0.3f",sst)}
F#{sprintf("(%d,%d)=%0.3f, p=%0.3f",df_r,df_e,f,significance)}
Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}

HEREDOC
  end

  # Deprecated
  # Sum of squares of error (manual calculation)
  # using the predicted value minus the y_i value.
  # The result is rescaled by (min_n_valid - 1) / (cases used - 1).
  # Note: +min_n_valid+ is not defined in this class; in this file
  # only MultipleRegressionPairwise provides it.
  def sse_manual
    pr = predicted
    cases = 0
    sse = (0...@ds.cases).inject(0) { |a, i|
      if !@dy.data_with_nils[i].nil? and !pr[i].nil?
        cases += 1
        a + ((pr[i] - @dy[i])**2)
      else
        a
      end
    }
    sse * (min_n_valid - 1.0).quo(cases - 1)
  end

  # Sum of squares of regression
  # using the predicted value minus y mean.
  # Cases with a missing predictor are skipped.
  def ssr_direct
    mean = @dy.mean
    (0...@ds.cases).inject(0) { |a, i|
      row = predictors_for_case(i)
      if row.nil?
        a
      else
        a + ((process(row) - mean)**2)
      end
    }
  end

  # Sum of squares of error as sst - ssr
  def sse_direct
    sst - ssr
  end

  private

  # Values of every predictor column for case +i+,
  # or nil if any of them is missing
  def predictors_for_case(i)
    row = @dep_columns.collect { |column| column[i] }
    row.any? { |value| value.nil? } ? nil : row
  end
end
275
+
276
+
277
+
278
+
279
+
280
if HAS_ALGIB
  # Class for calculation of multiple regression.
  # Requires Alglib gem.
  # Only the cases valid on every field are used (see +initialize+).
  # To create a MultipleRegressionAlglib object:
  #   @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
  #   @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
  #   @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
  #   @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
  #   ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
  #   lr=Statsample::Regression::MultipleRegressionAlglib.new(ds,'y')
  #
  class MultipleRegressionAlglib < MultipleRegressionBase
    # ds:: dataset with the predictors and the criterion
    # y_var:: name of the criterion field
    def initialize(ds, y_var)
      @ds = ds.dup_only_valid
      @ds_valid = @ds
      @y_var = y_var
      @dy = @ds[@y_var]
      @ds_indep = ds.dup(ds.fields - [y_var])
      # Create a custom matrix: one column per predictor,
      # with the criterion as the last column
      columns = []
      @fields = []
      @ds.fields.each { |f|
        if f != @y_var
          columns.push(@ds[f].to_a)
          @fields.push(f)
        end
      }
      @dep_columns = columns.dup
      columns.push(@ds[@y_var])
      matrix = Matrix.columns(columns)
      @lr_s = nil
      @lr = ::Alglib::LinearRegression.build_from_matrix(matrix)
    end

    # Marshal support: only the dataset and the criterion name are
    # dumped; the regression is rebuilt on load.
    def _dump(i)
      Marshal.dump({'ds' => @ds, 'y_var' => @y_var})
    end

    # Marshal support: rebuilds the object from the dumped dataset
    # and criterion name, using the class being loaded.
    def self._load(data)
      h = Marshal.load(data)
      new(h['ds'], h['y_var'])
    end

    # Hash with the coefficient of every predictor
    def coeffs
      assign_names(@lr.coeffs)
    end

    # Coefficients using a constant
    # Based on http://www.xycoon.com/ols1.htm
    # Returns the column matrix (X'X)^-1 * X' * y, where X has
    # a first column of ones followed by the predictors.
    def matrix_resolution
      columns = @dep_columns.dup.map { |xi| xi.map { |i| i.to_f } }
      columns.unshift([1.0] * @ds.cases)
      y = Matrix.columns([@dy.data.map { |i| i.to_f }])
      x = Matrix.columns(columns)
      xt = x.t
      matrix = (xt * x).inverse * xt
      matrix * y
    end

    # Value of r^2
    def r2
      r**2
    end

    # Correlation between the criterion and the predicted values
    def r
      Bivariate::pearson(@dy, predicted)
    end

    # Sum of squares total
    def sst
      @dy.ss
    end

    def constant
      @lr.constant
    end

    # Hash with the coefficients of the regression
    # on the standarized dataset
    def standarized_coeffs
      l = lr_s
      assign_names(l.coeffs)
    end

    # Regression on the standarized dataset, built on first use
    def lr_s
      if @lr_s.nil?
        build_standarized
      end
      @lr_s
    end

    # Builds the regression on the standarized dataset
    def build_standarized
      @ds_s = @ds.standarize
      columns = []
      @ds_s.fields.each { |f|
        columns.push(@ds_s[f].to_a) unless f == @y_var
      }
      @dep_columns_s = columns.dup
      columns.push(@ds_s[@y_var])
      matrix = Matrix.columns(columns)
      @lr_s = ::Alglib::LinearRegression.build_from_matrix(matrix)
    end

    # Predicted value for an array with one value per predictor
    def process(v)
      @lr.process(v)
    end

    # Same as +process+, using the regression on the standarized dataset
    def process_s(v)
      lr_s.process(v)
    end

    # Residuals divided by their standard deviation
    # ???? Not equal to SPSS output
    def standarized_residuals
      res = residuals
      res_sd = res.sds
      res.collect { |v|
        v.quo(res_sd)
      }.to_vector(:scale)
    end
  end
end
386
+
387
+
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+
398
# Multiple regression calculated from correlations: the standarized
# coefficients are obtained as Rxx^-1 * rxy, where Rxx is the
# correlation matrix of the predictors and rxy the correlations of
# every predictor with the criterion. Does not require Alglib.
# NOTE(review): the name suggests that Bivariate handles missing data
# pairwise - confirm in Bivariate.
class MultipleRegressionPairwise < MultipleRegressionBase
  # ds:: dataset with the predictors and the criterion
  # y_var:: name of the criterion field
  def initialize(ds, y_var)
    super
    @dy = ds[@y_var]
    @ds_valid = ds.dup_only_valid
    @ds_indep = ds.dup(ds.fields - [y_var])
    @fields = @ds_indep.fields
    set_dep_columns
    obtain_y_vector
    @matrix_x = obtain_predictor_matrix
    @coeffs_stan = (@matrix_x.inverse * @matrix_y).column(0).to_a
    @min_n_valid = nil
  end

  # Smallest value among the number of cases of the dataset and
  # every element of Bivariate::n_valid_matrix. Calculated once.
  def min_n_valid
    if @min_n_valid.nil?
      counts = Bivariate::n_valid_matrix(@ds).to_a.flatten
      @min_n_valid = ([@ds.cases] + counts).min
    end
    @min_n_valid
  end

  # Stores in <tt>@dep_columns</tt> the values (with nils)
  # of every predictor
  def set_dep_columns
    @dep_columns = []
    @ds_indep.each_vector { |k, v|
      @dep_columns.push(v.data_with_nils)
    }
  end

  # Sum of square total
  def sst
    @sst = @dy.variance * (min_n_valid - 1.0)
    @sst
  end

  # Value of r^2: rxy' * Rxx^-1 * rxy. Calculated once.
  def r2
    if @r2.nil?
      c = @matrix_y
      rxx = obtain_predictor_matrix
      @r2 = (c.t * rxx.inverse * c)[0, 0]
    end
    @r2
  end

  # Value of r
  def r
    Math::sqrt(r2)
  end

  # Degrees of freedom of the error, based on +min_n_valid+
  def df_e
    min_n_valid - @dep_columns.size - 1
  end

  # For every case with exactly one missing predictor, replaces
  # the missing value with the mean of that field.
  # Note: only the data is updated; the correlations and
  # coefficients calculated on +initialize+ are not recalculated.
  def fix_with_mean
    i = 0
    @ds_indep.each { |row|
      empty = missing_fields(row)
      if empty.size == 1
        @ds_indep[empty[0]][i] = @ds[empty[0]].mean
      end
      i += 1
    }
    @ds_indep.update_valid_data
    set_dep_columns
  end

  # For every case with exactly one missing predictor, replaces
  # the missing value with the value predicted by the regression
  # of that field on the other predictors.
  # Note: only the data is updated; the correlations and
  # coefficients calculated on +initialize+ are not recalculated.
  def fix_with_regression
    # NOTE(review): the original code used a MultipleRegression class
    # that is not defined in this file. It is still preferred if it
    # exists; otherwise this class is used. Confirm the intended class.
    lr_class = defined?(MultipleRegression) ? MultipleRegression : MultipleRegressionPairwise
    i = 0
    @ds_indep.each { |row|
      empty = missing_fields(row)
      if empty.size == 1
        field = empty[0]
        lr = lr_class.new(@ds_indep, field)
        fields = []
        @ds_indep.fields.each { |f|
          fields.push(row[f]) unless f == field
        }
        @ds_indep[field][i] = lr.process(fields)
      end
      i += 1
    }
    @ds_indep.update_valid_data
    set_dep_columns
  end

  # Stores in <tt>@matrix_y</tt> a column matrix with the correlation
  # of every predictor with the criterion
  def obtain_y_vector
    @matrix_y = Matrix.columns([@ds_indep.fields.collect { |f|
      Bivariate.pearson(@dy, @ds_indep[f])
    }])
  end

  # Correlation matrix of the predictors
  def obtain_predictor_matrix
    Bivariate::correlation_matrix(@ds_indep)
  end

  # Constant of the equation: mean of the criterion minus the sum of
  # every coefficient multiplied by the mean of its predictor
  def constant
    c = coeffs
    @dy.mean - @fields.inject(0) { |a, k| a + (c[k] * @ds_indep[k].mean) }
  end

  # Predicted value for an array with one value per predictor,
  # in the order of <tt>@fields</tt>
  def process(v)
    c = coeffs
    total = constant
    @fields.each_index { |i|
      total += c[@fields[i]] * v[i]
    }
    total
  end

  # Hash with the unstandarized coefficients:
  # standarized coefficient * sd of criterion / sd of predictor
  def coeffs
    sc = standarized_coeffs
    assign_names(@fields.collect { |f|
      (sc[f] * @dy.sds).quo(@ds_indep[f].sds)
    })
  end

  # Hash with the standarized coefficients
  def standarized_coeffs
    assign_names(@coeffs_stan)
  end

  private

  # Names of the fields of +row+ with a nil value
  def missing_fields(row)
    empty = []
    row.each { |k, v|
      empty.push(k) if v.nil?
    }
    empty
  end
end
519
+
520
+
521
+ end
522
+ end