statsample 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,526 @@
1
+ require 'statsample/vector'
2
+
3
class Hash
  # Convert this Hash of vectors into a Statsample::Dataset.
  # Extra arguments are forwarded to Dataset.new (field order, labels).
  def to_dataset(*args)
    Statsample::Dataset.new(self, *args)
  end
end
8
+
9
class Array
  # Returns a new array with +s+ prepended to the string form of each element.
  def prefix(s)
    collect { |item| s + item.to_s }
  end
  # Returns a new array with +s+ appended to the string form of each element.
  def suffix(s)
    collect { |item| item.to_s + s }
  end
end
21
+
22
+ module Statsample
23
# Wraps an error raised while iterating a Dataset, adding the dataset
# and (when available) the current row index to the message.
class DatasetException < RuntimeError
  attr_reader :ds, :exp
  # ds:: dataset being processed when the error occurred
  # e::  original exception
  def initialize(ds, e)
    @ds = ds
    @exp = e
    # Fix: @i was read in to_s but never assigned, so the "Row:" line
    # could never appear. Capture the dataset's current row index here
    # (Dataset#i is nil outside iteration).
    @i = ds.i if ds.respond_to?(:i)
  end
  def to_s
    m = "Error:" + @exp.message + @exp.backtrace.join("\n") + "\nOn Dataset:" + @ds.inspect
    m += "\nRow: #{@i}" unless @i.nil?
    m
  end
end
35
+ class Dataset
36
+ include Writable
37
+ attr_reader :vectors, :fields, :cases, :i
38
+ attr_accessor :labels
39
# To create a dataset:
# * Dataset.new()
# * Dataset.new(%w{v1 v2 v3})
# * Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
# * Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
def initialize(vectors={}, fields=[], labels={})
  if vectors.instance_of? Array
    # A bare list of names: create one empty vector per field.
    @fields = vectors.dup
    @vectors = {}
    @fields.each { |name| @vectors[name] = Statsample::Vector.new() }
  else
    @vectors = vectors
    @fields = fields
    check_order
    check_length
  end
  @i = nil
  @labels = labels
end
58
# Builds a GSL::Matrix of shape (cases x vectors) from the dataset rows.
# Requires the GSL bindings to be loaded.
def to_gsl_matrix
  matrix = GSL::Matrix.alloc(cases, @vectors.size)
  each_array do |row|
    row.each_index { |col| matrix.set(@i, col, row[col]) }
  end
  matrix
end
65
# Returns the label for field +v_id+, or the field name itself when no
# label has been set. Raises when the field does not exist.
def vector_label(v_id)
  # Fix: the message interpolated an undefined variable `v`, so a bad
  # field name raised NameError instead of the intended RuntimeError.
  raise "Vector #{v_id} doesn't exists" unless @fields.include? v_id
  @labels[v_id].nil? ? v_id : @labels[v_id]
end
69
# Creates a copy of the dataset, deleting every case that has missing
# data on any of the vectors.
def dup_only_valid
  return dup() unless @vectors.find { |_, vector| vector.has_missing_data? }
  ds = dup_empty
  each_array do |row|
    has_nil = @fields.find { |f| @vectors[f].data_with_nils[@i].nil? }
    ds.add_case_array(row) unless has_nil
  end
  ds.update_valid_data
  ds
end
83
# Returns the array of field names between +from+ and +to+ (inclusive),
# in dataset order. Raises ArgumentError for unknown fields.
def from_to(from, to)
  raise ArgumentError, "Field #{from} should be on dataset" unless @fields.include? from
  raise ArgumentError, "Field #{to} should be on dataset" unless @fields.include? to
  @fields[@fields.index(from)..@fields.index(to)]
end
89
# Returns a duplicate of the Dataset.
# When field names are given (as arguments or one Array), only those
# vectors are copied.
def dup(*fields_to_include)
  if fields_to_include.size == 1 and fields_to_include[0].is_a? Array
    fields_to_include = fields_to_include[0]
  end
  fields_to_include = @fields if fields_to_include.size == 0
  vectors = {}
  fields = []
  labels = {}
  fields_to_include.each do |f|
    raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
    vectors[f] = @vectors[f].dup
    labels[f] = @labels[f]
    fields.push(f)
  end
  Dataset.new(vectors, fields, labels)
end
107
# Creates a copy of the dataset with the same fields and labels but no
# data in any vector.
def dup_empty
  vectors = @vectors.inject({}) do |acc, (name, vector)|
    acc[name] = vector.dup_empty
    acc
  end
  Dataset.new(vectors, @fields.dup, @labels.dup)
end
115
# Returns a new dataset in which every vector has been replaced by its
# standardized (z-score) version.
def standarize
  ds = dup()
  ds.fields.each do |f|
    ds[f] = ds[f].vector_standarized
  end
  ds
end
123
# Generates a Matrix by yielding every (row_field, col_field) pair of
# the dataset's fields to the block.
def collect_matrix
  rows = @fields.collect do |row_field|
    @fields.collect { |col_field| yield row_field, col_field }
  end
  Matrix.rows(rows)
end
132
# Two datasets are equal when their vectors and field order match.
# (Labels are not compared.)
def ==(d2)
  @vectors == d2.vectors && @fields == d2.fields
end
136
# Returns the vector stored under key +c+; also available as #vector.
def col(c)
  @vectors[c]
end
alias_method :vector, :col
140
# Adds +vector+ under +name+; its size must equal the number of cases.
def add_vector(name, vector)
  raise ArgumentError, "Vector have different size" if vector.size != @cases
  @vectors[name] = vector
  check_order
end
145
# True when a vector named +v+ exists on the dataset.
def has_vector?(v)
  @vectors.has_key?(v)
end
148
# Creates a dataset of +n+ cases resampled with replacement from this
# dataset. When +n+ is not given, the original number of cases is used.
def bootstrap(n=nil)
  n ||= @cases
  ds_boot = dup_empty
  n.times do
    # Fix: sample indexes over the ORIGINAL case count (@cases), not n.
    # The old rand(n) read past the data when n > @cases and never
    # sampled the tail of the dataset when n < @cases.
    ds_boot.add_case_array(case_as_array(rand(@cases)))
  end
  ds_boot.update_valid_data
  ds_boot
end
159
# Fast version of add_case: appends one row (an Array in field order)
# with no validation.
# You SHOULD call update_valid_data at the end of the insertion cycle.
def add_case_array(v)
  v.each_index { |idx| @vectors[@fields[idx]].data.push(v[idx]) }
end
165
# Adds one or more cases.
# +v+ may be an Array in field order, an Array of such Arrays, or a
# Hash keyed by field name. When +uvd+ is false the per-vector valid
# data is not recomputed (call update_valid_data yourself afterwards).
def add_case(v, uvd=true)
  case v
  when Array
    if v[0].is_a? Array
      # A batch of rows: insert each one, deferring validation.
      v.each { |row| add_case(row, false) }
    else
      raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size != v.size
      v.each_index { |idx| @vectors[@fields[idx]].add(v[idx], false) }
    end
  when Hash
    raise ArgumentError, "Hash keys should be equal to fields" if @fields.sort != v.keys.sort
    @fields.each { |f| @vectors[f].add(v[f], false) }
  else
    raise TypeError, 'Value must be a Array or a Hash'
  end
  update_valid_data if uvd
end
184
# Recomputes each vector's valid data, then re-checks that all vectors
# share the same length (updating @cases).
def update_valid_data
  @fields.each { |f| @vectors[f].set_valid_data }
  check_length
end
188
# Removes vector +name+ from both the field list and the vector map.
def delete_vector(name)
  @fields.delete(name)
  @vectors.delete(name)
end
192
# Splits vector +name+ on separator +sep+ and adds one vector per
# distinct value, named "<name><join><index>" (1-based) and labelled
# "<name>:<value>".
def add_vectors_by_split_recode(name, join='-', sep=Statsample::SPLIT_TOKEN)
  split = @vectors[name].split_by_separator(sep)
  idx = 1
  split.each do |k, v|
    new_field = name + join + idx.to_s
    @labels[new_field] = name + ":" + k
    add_vector(new_field, v)
    idx += 1
  end
end
202
# Splits vector +name+ on separator +sep+ and adds one vector per
# distinct value, named "<name><join><value>".
def add_vectors_by_split(name, join='-', sep=Statsample::SPLIT_TOKEN)
  @vectors[name].split_by_separator(sep).each do |k, v|
    add_vector(name + join + k, v)
  end
end
208
# Returns a vector of +type+ built from the block's result for each
# row (passed as a Hash).
def vector_by_calculation(type=:scale)
  results = []
  each { |row| results.push(yield(row)) }
  results.to_vector(type)
end
215
# Returns a vector with the sum of +fields+ for each case.
# If +fields+ is nil, all fields are summed. A case with a missing
# value on any requested field yields nil.
def vector_sum(fields=nil)
  fields ||= @fields
  # Consistency with vector_missing_values / vector_mean: reject
  # unknown field names up-front instead of failing obscurely later.
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
  collect_with_index do |i, row|
    if fields.find { |f| !@vectors[f].data_with_nils[i] }
      nil
    else
      # (Removed a dead local `a=[]` that was never used.)
      fields.inject(0) { |acc, f| acc + row[f].to_f }
    end
  end
end
228
# Returns a vector with the number of missing values per case, counted
# over +fields+ (all fields when nil).
def vector_missing_values(fields=nil)
  fields ||= @fields
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
  collect_with_index do |i, row|
    fields.count { |f| @vectors[f].data_with_nils[i].nil? }
  end
end
240
# Returns a vector with the total number of characters per case over
# +fields+ (all fields when nil); missing values count as zero.
def vector_count_characters(fields=nil)
  fields ||= @fields
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
  collect_with_index do |i, row|
    fields.inject(0) do |total, f|
      @vectors[f].data_with_nils[i].nil? ? total : total + row[f].to_s.size
    end
  end
end
250
# Returns a :scale vector with the mean of +fields+ for each case
# (all fields when nil). A case whose number of missing fields exceeds
# +max_invalid+ yields nil; otherwise the mean is taken over the valid
# fields only.
def vector_mean(fields=nil, max_invalid=0)
  results = []
  fields ||= @fields
  size = fields.size
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
  each_with_index do |i, row|
    sum = 0
    invalids = 0
    fields.each do |f|
      if @vectors[f].data_with_nils[i].nil?
        invalids += 1
      else
        sum += row[f].to_f
      end
    end
    results.push(invalids > max_invalid ? nil : sum.quo(size - invalids))
  end
  results.to_vector(:scale)
end
278
# Verifies that every entry is a Statsample::Vector and that all
# vectors share the same size; stores that common size in @cases.
def check_length
  size = nil
  @vectors.each do |k, v|
    raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
    if size.nil?
      size = v.size
    elsif v.size != size
      # Fix: removed leftover debug output (`p v.to_a.size`) that
      # polluted stdout just before raising.
      raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
    end
  end
  @cases = size
end
293
# Yields each (field name, vector) pair in field order.
def each_vector
  @fields.each { |k| yield k, @vectors[k] }
end
298
# Pure-Ruby row accessors, defined only when the optimized
# implementation is not available (Statsample::OPTIMIZED is false).
if !Statsample::OPTIMIZED
  # Returns case +c+ as a Hash keyed by field name.
  def case_as_hash(c)
    @fields.inject({}) do |acc, f|
      acc[f] = @vectors[f][c]
      acc
    end
  end
  # Returns case +c+ as an Array in field order.
  def case_as_array(c)
    @fields.collect { |f| @vectors[f][c] }
  end
end
309
# Yields each case as a Hash. During iteration @i holds the current
# row index (nil afterwards); any error raised inside the block is
# re-raised wrapped in a DatasetException.
def each
  begin
    @i = 0
    @cases.times do |i|
      @i = i
      yield case_as_hash(i)
    end
    @i = nil
  rescue => e
    raise DatasetException.new(self, e)
  end
end
322
# Yields each (row index, case-as-Hash) pair; @i tracks the current
# row (nil afterwards). Errors are wrapped in a DatasetException.
def each_with_index
  begin
    @i = 0
    @cases.times do |i|
      @i = i
      yield i, case_as_hash(i)
    end
    @i = nil
  rescue => e
    raise DatasetException.new(self, e)
  end
end
335
# Yields each case as an Array in field order; @i tracks the current
# row and is reset to nil afterwards.
def each_array
  @cases.times do |i|
    @i = i
    yield case_as_array(i)
  end
  @i = nil
end
343
# Replaces the field order, then reconciles it against the vector keys.
def fields=(f)
  @fields = f
  check_order
end
347
# Ensures @fields contains exactly the vector keys: drops unknown
# names (preserving relative order) and appends missing keys sorted.
def check_order
  return if @vectors.keys.sort == @fields.sort
  @fields = @fields & @vectors.keys
  @fields += @vectors.keys.sort - @fields
end
353
# Returns the vector named +i+ (String), or a new Dataset holding the
# vectors in the field Range +i+.
def [](i)
  case i
  when String
    raise Exception, "Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
    @vectors[i]
  when Range
    fields = from_to(i.begin, i.end)
    vectors = fields.inject({}) { |a, f| a[f] = @vectors[f]; a }
    Dataset.new(vectors, fields)
  else
    raise ArgumentError, "You need a String or a Range"
  end
end
366
# Builds a new Vector of +type+ from the block's value for each row.
def collect(type=:scale)
  data = []
  each { |row| data.push(yield(row)) }
  Statsample::Vector.new(data, type)
end
373
# Builds a new Vector of +type+ from the block's value for each
# (index, row) pair.
def collect_with_index(type=:scale)
  data = []
  each_with_index { |i, row| data.push(yield(i, row)) }
  Statsample::Vector.new(data, type)
end
380
# Recodes vector +vector_name+ in place: each stored value is replaced
# by the block's result for that row (given as a Hash), then the
# vector's valid data is refreshed.
def recode!(vector_name)
  0.upto(@cases - 1) do |i|
    @vectors[vector_name].data[i] = yield case_as_hash(i)
  end
  @vectors[vector_name].set_valid_data
end
387
# Returns a Crosstab of the two named vectors.
def crosstab(v1, v2)
  Statsample::Crosstab.new(@vectors[v1], @vectors[v2])
end
390
# Stores vector +v+ under key +i+; only Statsample::Vector values are
# accepted.
def []=(i, v)
  unless v.instance_of? Statsample::Vector
    raise ArgumentError, "Should pass a Statsample::Vector"
  end
  @vectors[i] = v
  check_order
end
398
# Returns the dataset's cases as a Matrix (one row per case, columns
# in field order).
def to_matrix
  rows = []
  each_array { |row| rows << row }
  Matrix.rows(rows)
end
405
# Splits the dataset into a Multiset keyed by the values of one or
# more fields.
def to_multiset_by_split(*fields)
  require 'statsample/multiset'
  if fields.size == 1
    to_multiset_by_split_one_field(fields[0])
  else
    to_multiset_by_split_multiple_fields(*fields)
  end
end
413
# Creates a new dataset containing only the cases for which the block
# returns true.
def filter
  ds = dup_empty
  each { |c| ds.add_case(c, false) if yield c }
  ds.update_valid_data
  ds
end
422
# Creates a new vector with the values of +field+ for the cases where
# the block returns true; the source vector's type is preserved.
def filter_field(field)
  values = []
  each { |c| values.push(c[field]) if yield c }
  values.to_vector(@vectors[field].type)
end
430
# Splits the dataset into a Multiset with one dataset per distinct
# value (factor) of +field+; vector types are propagated to the
# resulting datasets.
def to_multiset_by_split_one_field(field)
  raise ArgumentError, "Should use a correct field name" unless @fields.include? field
  factors = @vectors[field].factors
  ms = Multiset.new_empty_vectors(@fields, factors)
  each { |c| ms[c[field]].add_case(c, false) }
  ms.datasets.each do |_, ds|
    ds.update_valid_data
    ds.vectors.each { |name, vec| vec.type = @vectors[name].type }
  end
  ms
end
447
# Splits the dataset into a Multiset keyed by every combination of the
# given fields' factors; vector types are propagated to the resulting
# datasets.
def to_multiset_by_split_multiple_fields(*fields)
  # Build the cartesian product of the factors of every split field.
  factors_total = nil
  fields.each do |f|
    factors = @vectors[f].factors
    if factors_total.nil?
      factors_total = factors.collect { |c| [c] }
    else
      combined = []
      factors_total.each do |prev|
        factors.each { |fact| combined.push(prev + [fact]) }
      end
      factors_total = combined
    end
  end
  ms = Multiset.new_empty_vectors(@fields, factors_total)
  # Fix: replaced a string-built, eval'd Proc with a plain block.
  # Same behavior (key is the array of this case's split-field values),
  # without the eval anti-pattern.
  each do |c|
    key = fields.collect { |f| c[f] }
    ms[key].add_case(c, false)
  end
  ms.datasets.each do |_, ds|
    ds.update_valid_data
    ds.vectors.each { |name, vec| vec.type = @vectors[name].type }
  end
  ms
end
480
# Tests each row with one or more tests.
# Each test is a triple [message, fields_of_interest, proc], where the
# proc receives the row hash, e.g.
#   ["age positive", ['age'], Proc.new {|row| row['age']>0}]
# (Fix: the original comment claimed each test is a bare Proc, but the
# code indexes test[0..2], so a triple is required.)
# An optional leading String argument names the id field used to tag
# failures; it defaults to the first field.
# Returns an array with all failure messages.
def verify(*tests)
  if tests[0].is_a? String
    id = tests.shift
  else
    id = @fields[0]
  end
  vr = []
  i = 0
  each do |row|
    i += 1
    tests.each do |test|
      next if test[2].call(row)
      values = ""
      if test[1].size > 0
        values = " (" + test[1].collect { |k| "#{k}=#{row[k]}" }.join(", ") + ")"
      end
      vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
    end
  end
  vr
end
507
+ def to_s
508
+ "#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] labels="+@labels.inspect+" cases="+@vectors[@fields[0]].size.to_s
509
+ end
510
# Use the compact to_s form for inspect as well.
def inspect
  to_s
end
513
# Returns a text summary of every vector in the dataset.
def summary
  out = ""
  out << "Summary for dataset\n"
  @vectors.each do |name, vector|
    out << "###############\n"
    out << "Vector #{name}:\n"
    out << vector.summary
    out << "###############\n"
  end
  out
end
525
+ end
526
+ end