statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,522 @@
1
+ module Statsample
2
+ # module for regression methods
3
+ module Regression
4
+ # Class for calculation of linear regressions
5
+ # To create a SimpleRegression object:
6
+ # * <tt> SimpleRegression.new_from_vectors(vx,vy)</tt>
7
+ # * <tt> SimpleRegression.new_from_gsl(gsl) </tt>
8
+ #
9
class SimpleRegression
  attr_accessor :a, :b, :cov00, :cov01, :covx1, :chisq, :status
  private_class_method :new

  # Instances are only built through these factories; +new+ is private.
  class << self
    # Builds the object from an array whose elements are passed, in order,
    # to init_gsl: a, b, cov00, cov01, covx1, chisq, status.
    def new_from_gsl(ar)
      new(:init_gsl, *ar)
    end

    # Builds the object by fitting a line to the paired vectors vx and vy.
    def new_from_vectors(vx, vy)
      new(:init_vectors, vx, vy)
    end
  end

  # init_method:: name of the initializer to dispatch to
  # argv:: arguments forwarded to that initializer
  def initialize(init_method, *argv)
    send(init_method, *argv)
  end

  # Predicted y for a given x (a + b*x)
  def y(val_x)
    slope_part = @b * val_x
    @a + slope_part
  end

  # Value of x that yields the given y
  def x(val_y)
    slope = @b.to_f
    (val_y - @a) / slope
  end

  # Sum of square error: squared differences between each observed y
  # and the value predicted for its x
  def sse
    total = 0
    0.upto(@vx.size - 1) do |idx|
      residual = @vy[idx] - y(@vx[idx])
      total += residual**2
    end
    total
  end

  # Square root of sse divided by (n - 2)
  def standard_error
    degrees = @vx.size - 2
    Math.sqrt(sse / degrees.to_f)
  end

  # Sum of square regression: squared differences between each
  # predicted y and the mean of the observed y
  def ssr
    center = @vy.mean
    total = 0
    0.upto(@vx.size - 1) do |idx|
      deviation = y(@vx[idx]) - center
      total += deviation**2
    end
    total
  end

  # Sum of square total
  def sst
    @vy.sum_of_squared_deviation
  end

  # Value of r: slope scaled by the ratio of the sds of x and y
  def r
    ratio = @vx.sds / @vy.sds
    @b * ratio
  end

  # Value of r^2
  def r2
    coefficient = r
    coefficient**2
  end

  # Keeps the pairs returned by Statsample.only_valid and computes the
  # slope (@b) and intercept (@a) from the cross products of deviations.
  def init_vectors(vx, vy)
    @vx, @vy = Statsample.only_valid(vx, vy)
    mean_x = @vx.mean
    mean_y = @vy.mean
    cross = 0
    spread = 0
    0.upto(@vx.size - 1) do |idx|
      cross += (@vx[idx] - mean_x) * (@vy[idx] - mean_y)
      spread += (@vx[idx] - mean_x)**2
    end
    @b = cross.to_f / spread
    @a = mean_y - @b * mean_x
  end

  # Stores already computed fit values without touching any vector.
  def init_gsl(a, b, cov00, cov01, covx1, chisq, status)
    @a, @b = a, b
    @cov00, @cov01, @covx1 = cov00, cov01, covx1
    @chisq = chisq
    @status = status
  end
end
80
+
81
+
82
class MultipleRegressionBase
  # Stores the dataset and the name of the dependent variable.
  #
  # The methods below also read @fields, @dep_columns, @dy and @ds_valid and
  # call +r2+, +coeffs+, +constant+ and +process+, none of which are set or
  # defined here: subclasses have to provide them, together with real
  # implementations of +r+ and +sst+.
  #
  # ds:: dataset with the predictors and the dependent variable
  # y_var:: name of the dependent variable inside +ds+
  def initialize(ds, y_var)
    @ds = ds
    @y_var = y_var
    @r2 = nil
  end

  # Returns a hash that maps every name in @fields to the element of +c+
  # found at the same position.
  def assign_names(c)
    named = {}
    @fields.each_index { |i| named[@fields[i]] = c[i] }
    named
  end

  # Vector with the value returned by +process+ for every case,
  # or nil for cases where any predictor is nil.
  def predicted
    (0...@ds.cases).collect { |i|
      row = predictor_row(i)
      row.nil? ? nil : process(row)
    }.to_vector(:scale)
  end

  def standarized_predicted
    predicted.standarized
  end

  # Vector with observed minus predicted value for every case, or nil
  # for cases where any predictor or the dependent value is nil.
  def residuals
    (0...@ds.cases).collect { |i|
      row = predictor_row(i)
      if row.nil? || @ds[@y_var][i].nil?
        nil
      else
        @ds[@y_var][i] - process(row)
      end
    }.to_vector(:scale)
  end

  # Multiple correlation coefficient. To be implemented by subclasses.
  def r
    raise "You should implement this"
  end

  # Sum of squares total. To be implemented by subclasses.
  def sst
    raise "You should implement this"
  end

  # Sum of squares of regression
  def ssr
    r2 * sst
  end

  # Sum of squares of error
  def sse
    sst - ssr
  end

  # Hash with each coefficient divided by its standard error
  def coeffs_t
    out = {}
    se = coeffs_se
    coeffs.each { |k, v| out[k] = v / se[k] }
    out
  end

  # Mean square error. Uses +quo+, like +coeffs_se+ and +f+ do, so the
  # result is not truncated when sse and df_e are both Integers.
  def mse
    sse.quo(df_e)
  end

  # Degrees of freedom of the regression: number of predictors
  def df_r
    @dep_columns.size
  end

  # Degrees of freedom of the error: valid cases - predictors - 1
  def df_e
    @ds_valid.cases - @dep_columns.size - 1
  end

  # F statistic: (ssr/df_r) / (sse/df_e)
  def f
    (ssr.quo(df_r)).quo(sse.quo(df_e))
  end

  # Significance of Fisher
  # Computed with GSL::Cdf.fdist_Q; raises if Ruby/GSL is not available.
  def significance
    if HAS_GSL
      GSL::Cdf.fdist_Q(f, df_r, df_e)
    else
      raise "Need Ruby/GSL"
    end
  end

  # Tolerance for a given variable
  # http://talkstats.com/showthread.php?t=5056
  #
  # Builds a dataset with the predictors only, regresses +var+ on the
  # remaining ones and returns 1 - r2 of that regression.
  def tolerance(var)
    # A fresh hash is filled instead of reassigning values of the hash
    # that is being iterated.
    vectors = {}
    assign_names(@dep_columns).each { |name, column|
      vectors[name] = column.to_vector(:scale)
    }
    if HAS_ALGIB
      lr_class = ::Statsample::Regression::MultipleRegressionAlglib
      ds = vectors.to_dataset
    else
      lr_class = MultipleRegressionPairwise
      ds = vectors.to_dataset.dup_only_valid
    end
    lr = lr_class.new(ds, var)
    1 - lr.r2
  end

  # Hash with the tolerance of every field
  def coeffs_tolerances
    @fields.inject({}) { |acc, field|
      acc[field] = tolerance(field)
      acc
    }
  end

  # Hash with the standard error of every coefficient:
  # sqrt(mse / (sum of squares of the predictor * tolerance))
  def coeffs_se
    out = {}
    error_variance = sse.quo(df_e)
    coeffs.each { |k, _v|
      out[k] = Math.sqrt(error_variance / (@ds[k].sum_of_squares * tolerance(k)))
    }
    out
  end

  # Returns (X'X)^-1 * mse with the square root applied to every positive
  # entry; entries that are not positive become nil. X is built from the
  # predictors of the valid dataset plus a leading column of ones.
  def estimated_variance_covariance_matrix
    mse_p = mse # computed once; the original assigned it and then called mse again
    columns = []
    @ds_valid.each_vector { |k, v|
      columns.push(v.data) unless k == @y_var
    }
    columns.unshift([1.0] * @ds_valid.cases)
    x = Matrix.columns(columns)
    matrix = (x.t * x).inverse * mse_p
    matrix.collect { |i|
      Math.sqrt(i) if i > 0
    }
  end

  # Constant divided by its standard error
  def constant_t
    constant.to_f / constant_se
  end

  # Element [0,0] of estimated_variance_covariance_matrix
  def constant_se
    estimated_variance_covariance_matrix[0, 0]
  end

  # Text summary of the regression.
  # report_type:: module that extends the output string and must provide
  #               +add+; the value returned by +add+ is returned.
  def summary(report_type = ConsoleSummary)
    c = coeffs
    out = String.new
    out.extend report_type
    # The title line used to end with a stray, unmatched double quote.
    out.add <<HEREDOC
Summary for regression of #{@fields.join(',')} over #{@y_var}
*************************************************************
Cases(listwise)=#{@ds.cases}(#{@ds_valid.cases})
r=#{sprintf("%0.3f",r)}
r2=#{sprintf("%0.3f",r2)}
ssr=#{sprintf("%0.3f",ssr)}
sse=#{sprintf("%0.3f",sse)}
sst=#{sprintf("%0.3f",sst)}
F#{sprintf("(%d,%d)=%0.3f, p=%0.3f",df_r,df_e,f,significance)}
Equation=#{sprintf("%0.3f",constant)}+#{@fields.collect {|k| sprintf("%0.3f%s",c[k],k)}.join(' + ')}

HEREDOC
  end

  # Deprecated
  # Sum of squares of error (manual calculation)
  # using the predicted value minus the y_i value
  #
  # The raw sum is rescaled by (min_n_valid - 1) / (cases used - 1).
  # NOTE(review): min_n_valid is not defined in this class; among the
  # classes in this file only MultipleRegressionPairwise provides it.
  def sse_manual
    pr = predicted
    cases = 0
    raw = (0...@ds.cases).inject(0) { |acc, i|
      if !@dy.data_with_nils[i].nil? && !pr[i].nil?
        cases += 1
        acc + ((pr[i] - @dy[i])**2)
      else
        acc
      end
    }
    raw * (min_n_valid - 1.0).quo(cases - 1)
  end

  # Sum of squares of regression
  # using the predicted value minus y mean
  def ssr_direct
    mean = @dy.mean
    (0...@ds.cases).inject(0) { |acc, i|
      row = predictor_row(i)
      row.nil? ? acc : acc + ((process(row) - mean)**2)
    }
  end

  # Same computation as +sse+: sst - ssr
  def sse_direct
    sst - ssr
  end

  private

  # Values of every predictor column for case +i+, or nil when any
  # of them is nil.
  def predictor_row(i)
    row = @dep_columns.collect { |column| column[i] }
    row.any? { |value| value.nil? } ? nil : row
  end
end
275
+
276
+
277
+
278
+
279
+
280
+ if HAS_ALGIB
281
+ # Class for calculation of multiple regression.
282
+ # Requires Alglib gem.
283
+ # To create a MultipleRegressionAlglib object:
284
+ # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
285
+ # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
286
+ # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
287
+ # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
288
+ # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
289
+ # lr=Statsample::Regression::MultipleRegressionAlglib.new(ds,'y')
290
+ #
291
class MultipleRegressionAlglib < MultipleRegressionBase
  # Keeps only the valid cases of +ds+ (dup_only_valid) and builds the
  # Alglib linear regression from a matrix whose columns are the
  # predictors followed by the dependent variable.
  #
  # ds:: dataset with the predictors and the dependent variable
  # y_var:: name of the dependent variable inside +ds+
  def initialize(ds, y_var)
    @ds = ds.dup_only_valid
    @ds_valid = @ds
    @y_var = y_var
    @dy = @ds[@y_var]
    # NOTE(review): built from the unfiltered +ds+, unlike @ds above, and
    # not read anywhere in this class - confirm whether that is intended.
    @ds_indep = ds.dup(ds.fields - [y_var])
    @fields = @ds.fields.select { |f| f != @y_var }
    @dep_columns = @fields.collect { |f| @ds[f].to_a }
    # Create a custom matrix
    columns = @dep_columns.dup
    columns.push(@ds[@y_var])
    @lr_s = nil
    @lr = ::Alglib::LinearRegression.build_from_matrix(Matrix.columns(columns))
  end

  # Marshal support: only the dataset and the name of the dependent
  # variable are serialized.
  def _dump(i)
    Marshal.dump({ 'ds' => @ds, 'y_var' => @y_var })
  end

  # Marshal support: rebuilds the regression from the stored dataset.
  # Calls +new+ on the receiving class; the original referenced a
  # MultipleRegression constant that is not defined in this file.
  def self._load(data)
    h = Marshal.load(data)
    new(h['ds'], h['y_var'])
  end

  # Hash of field name => coefficient reported by Alglib
  def coeffs
    assign_names(@lr.coeffs)
  end

  # Coefficients using a constant
  # Based on http://www.xycoon.com/ols1.htm
  #
  # Returns the matrix (X'X)^-1 X' y, where X has a leading column of ones.
  def matrix_resolution
    columns = @dep_columns.map { |xi| xi.map { |i| i.to_f } }
    columns.unshift([1.0] * @ds.cases)
    y = Matrix.columns([@dy.data.map { |i| i.to_f }])
    x = Matrix.columns(columns)
    xt = x.t
    ((xt * x).inverse * xt) * y
  end

  def r2
    r**2
  end

  # Pearson correlation between the dependent variable and the
  # predicted values
  def r
    Bivariate::pearson(@dy, predicted)
  end

  # Sum of squares total
  def sst
    @dy.ss
  end

  def constant
    @lr.constant
  end

  # Hash of field name => coefficient of the regression built on the
  # standarized dataset
  def standarized_coeffs
    assign_names(lr_s.coeffs)
  end

  # Regression on the standarized dataset, built on first use
  def lr_s
    build_standarized if @lr_s.nil?
    @lr_s
  end

  # Standarizes the dataset and builds a second Alglib regression on it
  def build_standarized
    @ds_s = @ds.standarize
    columns = []
    @ds_s.fields.each { |f|
      columns.push(@ds_s[f].to_a) unless f == @y_var
    }
    @dep_columns_s = columns.dup
    columns.push(@ds_s[@y_var])
    @lr_s = ::Alglib::LinearRegression.build_from_matrix(Matrix.columns(columns))
  end

  # Value predicted for the array of predictor values +v+
  def process(v)
    @lr.process(v)
  end

  # Same as +process+, using the regression on the standarized dataset
  def process_s(v)
    lr_s.process(v)
  end

  # ???? Not equal to SPSS output
  # Residuals divided by their sds. The residuals are computed once;
  # the original computed them a second time just to obtain the sds.
  def standarized_residuals
    res = residuals
    res_sd = res.sds
    res.collect { |v|
      v.quo(res_sd)
    }.to_vector(:scale)
  end
end
385
+ end
386
+
387
+
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+
398
class MultipleRegressionPairwise < MultipleRegressionBase
  # Computes the standarized coefficients from correlation matrices:
  # inverse of the predictor correlation matrix times the vector of
  # correlations between each predictor and the dependent variable.
  #
  # ds:: dataset with the predictors and the dependent variable
  # y_var:: name of the dependent variable inside +ds+
  def initialize(ds, y_var)
    super
    @dy = ds[@y_var]
    @ds_valid = ds.dup_only_valid
    @ds_indep = ds.dup(ds.fields - [y_var])
    @fields = @ds_indep.fields
    set_dep_columns
    obtain_y_vector
    @matrix_x = Bivariate.correlation_matrix(@ds_indep)
    @coeffs_stan = (@matrix_x.inverse * @matrix_y).column(0).to_a
    @min_n_valid = nil
  end

  # Smallest entry of Bivariate.n_valid_matrix for the dataset, capped
  # at the number of cases. Computed on first use and then cached.
  def min_n_valid
    @min_n_valid ||= begin
      smallest = @ds.cases
      counts = Bivariate::n_valid_matrix(@ds)
      (0...counts.row_size).each { |row|
        (0...counts.column_size).each { |col|
          smallest = counts[row, col] if counts[row, col] < smallest
        }
      }
      smallest
    end
  end

  # Refreshes @dep_columns with the data (nils included) of every
  # predictor vector
  def set_dep_columns
    @dep_columns = []
    @ds_indep.each_vector { |_k, v|
      @dep_columns.push(v.data_with_nils)
    }
  end

  # Sum of square total
  # Variance of the dependent variable times (min_n_valid - 1);
  # recomputed on every call.
  def sst
    @sst = @dy.variance * (min_n_valid - 1.0)
  end

  # c' * Rxx^-1 * c, where c holds the correlations of each predictor with
  # the dependent variable and Rxx is the predictor correlation matrix.
  # Cached after the first call.
  def r2
    if @r2.nil?
      c = @matrix_y
      rxx = obtain_predictor_matrix
      @r2 = (c.t * rxx.inverse * c)[0, 0]
    end
    @r2
  end

  def r
    Math.sqrt(r2)
  end

  # Degrees of freedom of the error, based on min_n_valid
  def df_e
    min_n_valid - @dep_columns.size - 1
  end

  # For every row with exactly one missing predictor, replaces the
  # missing value with the mean of that field.
  def fix_with_mean
    each_row_with_single_missing { |_row, i, field|
      @ds_indep[field][i] = @ds[field].mean
    }
    @ds_indep.update_valid_data
    set_dep_columns
  end

  # For every row with exactly one missing predictor, replaces the
  # missing value with the prediction of a regression of that field on
  # the other predictors.
  def fix_with_regression
    each_row_with_single_missing { |row, i, field|
      # The original instantiated MultipleRegression, a constant that is
      # not defined in this file; the pairwise class is used instead.
      # NOTE(review): confirm no alias with that name exists elsewhere.
      lr = MultipleRegressionPairwise.new(@ds_indep, field)
      others = []
      @ds_indep.fields.each { |f|
        others.push(row[f]) unless f == field
      }
      @ds_indep[field][i] = lr.process(others)
    }
    @ds_indep.update_valid_data
    set_dep_columns
  end

  # Column matrix with the Pearson correlation between the dependent
  # variable and every predictor
  def obtain_y_vector
    @matrix_y = Matrix.columns([@ds_indep.fields.collect { |f|
      Bivariate.pearson(@dy, @ds_indep[f])
    }])
  end

  # Correlation matrix of the predictors
  def obtain_predictor_matrix
    Bivariate::correlation_matrix(@ds_indep)
  end

  # Mean of y minus the sum of each coefficient times the mean of
  # its predictor
  def constant
    c = coeffs
    @dy.mean - @fields.inject(0) { |acc, k| acc + (c[k] * @ds_indep[k].mean) }
  end

  # Value predicted for the array of predictor values +v+, given in
  # the order of @fields
  def process(v)
    c = coeffs
    total = constant
    @fields.each_index { |i|
      total += c[@fields[i]] * v[i]
    }
    total
  end

  # Hash of field name => coefficient: the standarized coefficient
  # times sd(y) / sd(predictor)
  def coeffs
    sc = standarized_coeffs
    assign_names(@fields.collect { |f|
      (sc[f] * @dy.sds).quo(@ds_indep[f].sds)
    })
  end

  # Hash of field name => standarized coefficient
  def standarized_coeffs
    assign_names(@coeffs_stan)
  end

  private

  # Walks the rows of @ds_indep and yields row, row index and field name
  # for every row that has exactly one nil value.
  def each_row_with_single_missing
    i = 0
    @ds_indep.each { |row|
      missing = []
      row.each { |k, v|
        missing.push(k) if v.nil?
      }
      yield(row, i, missing[0]) if missing.size == 1
      i += 1
    }
  end
end
519
+
520
+
521
+ end
522
+ end