statsample 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +79 -0
  2. data/Manifest.txt +56 -0
  3. data/README.txt +77 -0
  4. data/Rakefile +22 -0
  5. data/bin/statsample +2 -0
  6. data/demo/benchmark.rb +52 -0
  7. data/demo/chi-square.rb +44 -0
  8. data/demo/dice.rb +13 -0
  9. data/demo/distribution_t.rb +95 -0
  10. data/demo/graph.rb +9 -0
  11. data/demo/item_analysis.rb +30 -0
  12. data/demo/mean.rb +81 -0
  13. data/demo/proportion.rb +57 -0
  14. data/demo/sample_test.csv +113 -0
  15. data/demo/strata_proportion.rb +152 -0
  16. data/demo/stratum.rb +141 -0
  17. data/lib/spss.rb +131 -0
  18. data/lib/statsample.rb +216 -0
  19. data/lib/statsample/anova.rb +74 -0
  20. data/lib/statsample/bivariate.rb +255 -0
  21. data/lib/statsample/chidistribution.rb +39 -0
  22. data/lib/statsample/codification.rb +120 -0
  23. data/lib/statsample/converters.rb +338 -0
  24. data/lib/statsample/crosstab.rb +122 -0
  25. data/lib/statsample/dataset.rb +526 -0
  26. data/lib/statsample/dominanceanalysis.rb +259 -0
  27. data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
  28. data/lib/statsample/graph/gdchart.rb +45 -0
  29. data/lib/statsample/graph/svgboxplot.rb +108 -0
  30. data/lib/statsample/graph/svggraph.rb +181 -0
  31. data/lib/statsample/graph/svghistogram.rb +208 -0
  32. data/lib/statsample/graph/svgscatterplot.rb +111 -0
  33. data/lib/statsample/htmlreport.rb +232 -0
  34. data/lib/statsample/multiset.rb +281 -0
  35. data/lib/statsample/regression.rb +522 -0
  36. data/lib/statsample/reliability.rb +235 -0
  37. data/lib/statsample/resample.rb +20 -0
  38. data/lib/statsample/srs.rb +159 -0
  39. data/lib/statsample/test.rb +25 -0
  40. data/lib/statsample/vector.rb +759 -0
  41. data/test/_test_chart.rb +58 -0
  42. data/test/test_anova.rb +31 -0
  43. data/test/test_codification.rb +59 -0
  44. data/test/test_crosstab.rb +55 -0
  45. data/test/test_csv.csv +7 -0
  46. data/test/test_csv.rb +27 -0
  47. data/test/test_dataset.rb +293 -0
  48. data/test/test_ggobi.rb +42 -0
  49. data/test/test_multiset.rb +98 -0
  50. data/test/test_regression.rb +108 -0
  51. data/test/test_reliability.rb +32 -0
  52. data/test/test_resample.rb +23 -0
  53. data/test/test_srs.rb +14 -0
  54. data/test/test_statistics.rb +152 -0
  55. data/test/test_stratified.rb +19 -0
  56. data/test/test_svg_graph.rb +63 -0
  57. data/test/test_vector.rb +265 -0
  58. data/test/test_xls.rb +32 -0
  59. metadata +158 -0
@@ -0,0 +1,526 @@
1
+ require 'statsample/vector'
2
+
3
# Statsample convenience extension: build a Dataset straight from a
# Hash of field name => vector.
class Hash
  # Forwards +args+ (field order, labels) to Statsample::Dataset.new.
  def to_dataset(*args)
    Statsample::Dataset.new(self,*args)
  end
end
8
+
9
# Statsample convenience extensions for Array.
class Array
  # Returns a new array with +s+ prepended to the string form of
  # every element.
  def prefix(s)
    map { |elem| s + elem.to_s }
  end
  # Returns a new array with +s+ appended to the string form of
  # every element.
  def suffix(s)
    map { |elem| elem.to_s + s }
  end
end
21
+
22
+ module Statsample
23
# Wraps an exception raised while iterating a Dataset, adding the
# dataset (and, when available, the row index) to the error report.
class DatasetException < RuntimeError
  # ds  :: the Dataset being processed when the error occurred
  # exp :: the original exception
  attr_reader :ds, :exp
  def initialize(ds, e)
    @ds = ds
    @exp = e
    # Capture the row index at the moment of failure. The previous
    # version read @i in #to_s without ever assigning it, so the
    # "Row:" hint could never be printed.
    @i = ds.respond_to?(:i) ? ds.i : nil
  end
  def to_s
    m = "Error:" + @exp.message + @exp.backtrace.join("\n") + "\nOn Dataset:" + @ds.inspect
    m += "\nRow: #{@i}" unless @i.nil?
    m
  end
end
35
+ class Dataset
36
include Writable
# vectors :: Hash of field name => Statsample::Vector
# fields  :: Array giving the column order of the vectors
# cases   :: number of cases (rows), set by check_length
# i       :: index of the current row while iterating, nil otherwise
attr_reader :vectors, :fields, :cases, :i
# Optional Hash of field name => human-readable label.
attr_accessor :labels
# To create a dataset
# * Dataset.new()
# * Dataset.new(%w{v1 v2 v3})
# * Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
# * Dataset.new({'v2'=>v2,'v1'=>v1},['v1','v2'])
#
def initialize(vectors={}, fields=[], labels={})
  if vectors.instance_of? Array
    # An array of names creates one empty vector per field.
    @fields=vectors.dup
    @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
  else
    # Hash form: keep the given vectors, then make sure the field
    # order covers every key and that all vectors share one length.
    @vectors=vectors
    @fields=fields
    check_order
    check_length
  end
  @i=nil
  @labels=labels
end
58
# Converts the dataset to a GSL::Matrix with one row per case and
# one column per vector (requires the GSL bindings to be loaded).
def to_gsl_matrix
  matrix=GSL::Matrix.alloc(cases,@vectors.size)
  each_array do |row|
    # @i is maintained by each_array and names the current case.
    row.each_index{|y| matrix.set(@i,y,row[y]) }
  end
  matrix
end
65
# Returns the human-readable label for the field +v_id+, or the
# field name itself when no label was registered.
# Raises RuntimeError when the field does not exist.
def vector_label(v_id)
  # Fixed: the error message interpolated an undefined local `v`,
  # which turned the intended error into a NameError.
  raise "Vector #{v_id} doesn't exists" unless @fields.include? v_id
  @labels[v_id].nil? ? v_id : @labels[v_id]
end
69
# Creates a copy of the given dataset, deleting all the cases with
# missing data on one of the vectors
def dup_only_valid
  if @vectors.find{|field,vector| vector.has_missing_data?}
    ds=dup_empty
    each_array { |c|
      # Skip the case when any field is nil at the current row (@i
      # is maintained by each_array).
      ds.add_case_array(c) unless @fields.find{|f| @vectors[f].data_with_nils[@i].nil? }
    }
    ds.update_valid_data
  else
    # No missing data anywhere: a plain copy is enough.
    ds=dup()
  end
  ds
end
83
# Returns the list of field names running from +from+ to +to+
# (inclusive), following the dataset's field order.
# Raises ArgumentError when either endpoint is not a field.
def from_to(from, to)
  [from, to].each do |endpoint|
    raise ArgumentError, "Field #{endpoint} should be on dataset" unless @fields.include? endpoint
  end
  @fields[@fields.index(from)..@fields.index(to)]
end
89
# Returns a duplicate of the Database
# If fields given, only include those vectors
def dup(*fields_to_include)
  # Accept either a list of names or a single array of names.
  if fields_to_include.size==1 and fields_to_include[0].is_a? Array
    fields_to_include=fields_to_include[0]
  end
  fields_to_include=@fields if fields_to_include.size==0
  vectors={}
  fields=[]
  labels={}
  fields_to_include.each{|f|
    raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
    # Vectors are copied; labels are carried over as-is.
    vectors[f]=@vectors[f].dup
    labels[f]=@labels[f]
    fields.push(f)
  }
  Dataset.new(vectors,fields,labels)
end
# Creates a copy of the given dataset, without data on vectors
def dup_empty
  vectors=@vectors.inject({}) {|a,v|
    a[v[0]]=v[1].dup_empty
    a
  }
  Dataset.new(vectors,@fields.dup,@labels.dup)
end
115
# Returns a dataset with standarized data
# (each vector replaced by its standardized version; see
# Vector#vector_standarized). NOTE(review): the method name keeps
# the historical spelling because it is part of the public API.
def standarize
  ds=dup()
  ds.fields.each {|f|
    ds[f]=ds[f].vector_standarized
  }
  ds
end
123
# Yields every (row-field, column-field) pair and assembles the
# block's results into a Matrix whose rows and columns follow
# @fields order.
def collect_matrix
  cells = @fields.map do |row_field|
    @fields.map { |col_field| yield row_field, col_field }
  end
  Matrix.rows(cells)
end
132
# Two datasets are equal when they hold equal vectors in the same
# field order. NOTE(review): despite the historical comment, labels
# are NOT part of the comparison.
def ==(d2)
  @vectors==d2.vectors and @fields==d2.fields
end
# Returns the vector named +c+.
def col(c)
  @vectors[c]
end
alias_method :vector, :col
140
# Adds +vector+ under the name +name+ and re-checks the field order.
# Raises ArgumentError when its size differs from the dataset's
# number of cases.
def add_vector(name,vector)
  raise ArgumentError, "Vector have different size" if vector.size!=@cases
  @vectors[name]=vector
  check_order
end
145
# True when a vector named +v+ exists on the dataset.
def has_vector?(v)
  @vectors.key?(v)
end
148
# Returns a new dataset built by sampling +n+ cases, with
# replacement, from this dataset (bootstrap resampling).
# If +n+ is not given, the original number of cases is used.
def bootstrap(n=nil)
  n||=@cases
  ds_boot=dup_empty
  n.times do
    # Fixed: cases must be drawn from the full original dataset
    # (rand(@cases)), not from the first n cases (rand(n)), which
    # skewed the resample whenever n != @cases and could index
    # beyond the data when n > @cases.
    ds_boot.add_case_array(case_as_array(rand(@cases)))
  end
  ds_boot.update_valid_data
  ds_boot
end
159
# Fast, unchecked append of one case: pushes each value onto the
# data array of the vector at the same position in @fields.
# Call update_valid_data once the insertion cycle is finished.
def add_case_array(v)
  v.each_with_index do |value, idx|
    @vectors[@fields[idx]].data.push(value)
  end
end
165
# Adds one or several cases.
# +v+ may be:
# * an Array of values ordered like @fields,
# * an Array of such arrays (several cases at once), or
# * a Hash keyed by field name.
# When +uvd+ is true the vectors' valid-data caches are rebuilt
# afterwards; pass false inside bulk insertions and call
# update_valid_data once at the end.
def add_case(v,uvd=true)
  case v
  when Array
    if (v[0].is_a? Array)
      # Array of arrays: recurse per case, deferring validation.
      v.each{|subv| add_case(subv,false)}
    else
      raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
      v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
    end
  when Hash
    raise ArgumentError, "Hash keys should be equal to fields" if @fields.sort!=v.keys.sort
    @fields.each{|f| @vectors[f].add(v[f],false)}
  else
    raise TypeError, 'Value must be a Array or a Hash'
  end
  if uvd
    update_valid_data
  end
end
# Rebuilds every vector's valid-data cache and re-checks that all
# vectors still share the same length (updates @cases).
def update_valid_data
  @fields.each{|f| @vectors[f].set_valid_data}
  check_length
end
188
# Removes the vector +name+ from the field ordering and from the
# vector map; returns the removed vector (nil when absent).
def delete_vector(name)
  @fields.delete(name)
  @vectors.delete(name)
end
192
# Splits the vector +name+ by +sep+ and adds one dummy vector per
# distinct token, named "<name><join><index>"; also records a label
# "<name>:<token>" for each new vector.
def add_vectors_by_split_recode(name,join='-',sep=Statsample::SPLIT_TOKEN)
  split=@vectors[name].split_by_separator(sep)
  i=1
  split.each{|k,v|
    new_field=name+join+i.to_s
    @labels[new_field]=name+":"+k
    add_vector(new_field,v)
    i+=1
  }
end
# Splits the vector +name+ by +sep+ and adds one dummy vector per
# distinct token, named "<name><join><token>".
def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
  split=@vectors[name].split_by_separator(sep)
  split.each{|k,v|
    add_vector(name+join+k,v)
  }
end
208
# Builds a new vector (of the given +type+) by yielding every row
# (as a Hash) and collecting the block's results.
def vector_by_calculation(type=:scale)
  a=[]
  each {|row|
    a.push(yield(row))
  }
  a.to_vector(type)
end
215
# Returns a vector with, for each case, the sum of the given fields
# (all fields when +fields+ is nil). A case with a missing value on
# any of the summed fields yields nil.
def vector_sum(fields=nil)
  fields||=@fields
  # (Removed an unused local accumulator left over from an older
  # implementation; collect_with_index builds the vector itself.)
  collect_with_index do |i,row|
    if(fields.find{|f| !@vectors[f].data_with_nils[i]})
      nil
    else
      fields.inject(0) {|ac,v| ac + row[v].to_f}
    end
  end
end
228
# Returns a vector with the numbers of missing values for a case
def vector_missing_values(fields=nil)
  fields||=@fields
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
  collect_with_index do |i,row|
    # Count one for every nil at row i among the requested fields.
    fields.inject(0){|a,v|
      a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
    }
  end
end
# Returns a vector with, for each case, the total number of
# characters of the (non-missing) values on the given fields.
def vector_count_characters(fields=nil)
  fields||=@fields
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
  collect_with_index do |i,row|
    fields.inject(0){|a,v|
      a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
    }
  end
end
250
# Returns a vector with the mean for a set of fields
# if fields parameter is empty, return the mean for all fields
# if max invalid parameter > 0, returns the mean for all tuples
# with 0 to max_invalid invalid fields
def vector_mean(fields=nil,max_invalid=0)
  a=[]
  fields||=@fields
  size=fields.size
  raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
  each_with_index do |i, row|
    # Tally the sum of valid values and the number of missing ones.
    sum=0
    invalids=0
    fields.each{|f|
      if !@vectors[f].data_with_nils[i].nil?
        sum+=row[f].to_f
      else
        invalids+=1
      end
    }
    if(invalids>max_invalid)
      # Too many missing values: the mean is undefined for this case.
      a.push(nil)
    else
      # Average over the valid fields only.
      a.push(sum.quo(size-invalids))
    end
  end
  a.to_vector(:scale)
end
278
# Verifies that every entry in @vectors is a Statsample::Vector and
# that all vectors have the same number of cases; stores that
# number in @cases. Raises Exception on any mismatch.
def check_length
  size=nil
  @vectors.each{|k,v|
    raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
    if size.nil?
      size=v.size
    else
      if v.size!=size
        # Fixed: removed a stray debugging `p v.to_a.size` that
        # printed to stdout on every length mismatch.
        raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
      end
    end
  }
  @cases=size
end
293
# Yields each field name together with its vector, following the
# dataset's field order.
def each_vector
  @fields.each { |field| yield field, @vectors[field] }
end
298
# Pure-Ruby fallbacks, only defined when the optimized extension is
# not loaded (see Statsample::OPTIMIZED).
if !Statsample::OPTIMIZED
  # Returns case number +c+ as a Hash of field name => value.
  def case_as_hash(c)
    @fields.inject({}) {|a,x|
      a[x]=@vectors[x][c]
      a
    }
  end
  # Returns case number +c+ as an Array ordered like @fields.
  def case_as_array(c)
    @fields.collect {|x| @vectors[x][c]}
  end
end
309
# Yields every case as a Hash (field name => value), keeping @i
# pointed at the current row so helpers can use it. Any error is
# re-raised wrapped in a DatasetException carrying this dataset.
def each
  begin
    @i=0
    @cases.times {|i|
      @i=i
      row=case_as_hash(i)
      yield row
    }
    @i=nil
  rescue =>e
    raise DatasetException.new(self,e)
  end
end
# Like #each, but yields the case index along with the row Hash.
def each_with_index
  begin
    @i=0
    @cases.times{|i|
      @i=i
      row=case_as_hash(i)
      yield i,row
    }
    @i=nil
  rescue =>e
    raise DatasetException.new(self,e)
  end
end
# Yields every case as an Array ordered like @fields.
# NOTE(review): unlike #each, errors here are not wrapped in a
# DatasetException.
def each_array
  @cases.times {|i|
    @i=i
    row=case_as_array(i)
    yield row
  }
  @i=nil
end
343
# Replaces the field ordering, then reconciles it with the actual
# vector keys.
def fields=(f)
  @fields=f
  check_order
end
# Makes @fields consistent with @vectors: keeps the given order for
# known keys and appends any vector keys missing from @fields in
# sorted order.
def check_order
  if(@vectors.keys.sort!=@fields.sort)
    @fields=@fields&@vectors.keys
    @fields+=@vectors.keys.sort-@fields
  end
end
353
# Returns the vector named +i+ (String) or, given a Range of field
# names, a new Dataset restricted to the fields between the range's
# endpoints (inclusive).
def [](i)
  if i.is_a? String
    raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
    @vectors[i]
  elsif i.is_a? Range
    fields=from_to(i.begin,i.end)
    vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
    # (Removed a useless assignment to a local that was never read.)
    Dataset.new(vectors,fields)
  else
    raise ArgumentError, "You need a String or a Range"
  end
end
366
# Builds a new Vector (of the given +type+) from the block's result
# for every row.
def collect(type=:scale)
  data=[]
  each {|row|
    data.push(yield(row))
  }
  Statsample::Vector.new(data,type)
end
# Like #collect, but the block also receives the case index.
def collect_with_index(type=:scale)
  data=[]
  each_with_index {|i,row|
    data.push(yield(i,row))
  }
  Statsample::Vector.new(data,type)
end
380
# Recode a vector based on a block
# Replaces each value of +vector_name+ in place with the block's
# result for the corresponding row (passed as a Hash), then
# rebuilds the vector's valid-data cache.
def recode!(vector_name)
  0.upto(@cases-1) {|i|
    @vectors[vector_name].data[i]=yield case_as_hash(i)
  }
  @vectors[vector_name].set_valid_data
end
# Returns a Crosstab of the two named vectors.
def crosstab(v1,v2)
  Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
end
390
# Assigns vector +v+ to the field +i+; only Statsample::Vector
# values are accepted. The field order is reconciled afterwards.
def[]=(i,v)
  if v.instance_of? Statsample::Vector
    @vectors[i]=v
    check_order
  else
    raise ArgumentError,"Should pass a Statsample::Vector"
  end
end
398
# Returns the dataset as a Matrix with one row per case.
def to_matrix
  rows=[]
  self.each_array{|c|
    rows.push(c)
  }
  Matrix.rows(rows)
end
# Splits the dataset into a Multiset on one or more fields,
# delegating to the single- or multiple-field implementation.
def to_multiset_by_split(*fields)
  require 'statsample/multiset'
  if fields.size==1
    to_multiset_by_split_one_field(fields[0])
  else
    to_multiset_by_split_multiple_fields(*fields)
  end
end
413
# Creates a new dataset containing every case for which the block
# returns true (the case is yielded as a Hash).
def filter
  ds=self.dup_empty
  each {|c|
    ds.add_case(c,false) if yield c
  }
  ds.update_valid_data
  ds
end
# Creates a new vector with the data of the given field for every
# case for which the block returns true; the vector keeps the
# original field's type.
def filter_field(field)
  a=[]
  each {|c|
    a.push(c[field]) if yield c
  }
  a.to_vector(@vectors[field].type)
end
430
# Splits the dataset into a Multiset with one sub-dataset per
# factor of the given field.
def to_multiset_by_split_one_field(field)
  raise ArgumentError,"Should use a correct field name" if !@fields.include? field
  factors=@vectors[field].factors
  ms=Multiset.new_empty_vectors(@fields,factors)
  # Route each case to the sub-dataset keyed by its value on +field+.
  each {|c|
    ms[c[field]].add_case(c,false)
  }
  ms.datasets.each {|k,ds|
    ds.update_valid_data
    ds.vectors.each{|k1,v1|
      # Restore the original vector type on each sub-dataset.
      v1.type=@vectors[k1].type
    }
  }
  ms
end
447
# Splits the dataset into a Multiset keyed by every combination of
# factors of the given fields.
def to_multiset_by_split_multiple_fields(*fields)
  # Build the cartesian product of the factors of every field, as an
  # array of key tuples.
  factors_total=nil
  fields.each{|f|
    if factors_total.nil?
      factors_total=@vectors[f].factors.collect{|c|
        [c]
      }
    else
      suma=[]
      factors=@vectors[f].factors
      factors_total.each{|f1|
        factors.each{|f2|
          suma.push(f1+[f2])
        }
      }
      factors_total=suma
    end
  }
  ms=Multiset.new_empty_vectors(@fields,factors_total)
  # Fixed: route each case through a plain block instead of a Proc
  # built with eval, which was fragile (it broke on field names
  # containing quotes) and needlessly slow.
  each {|c|
    ms[fields.collect{|f| c[f]}].add_case(c,false)
  }
  ms.datasets.each {|k,ds|
    ds.update_valid_data
    ds.vectors.each{|k1,v1|
      # Restore each vector's type on the sub-datasets (presumably
      # new_empty_vectors does not preserve it — TODO confirm).
      v1.type=@vectors[k1].type
    }
  }
  ms
end
480
# Test each row with one or more tests
# each test is an Array: [description, fields_to_report, proc],
# where the proc receives the row Hash and returns true when the
# row is valid, e.g.
#   ["age>0", ['age'], Proc.new {|row| row['age']>0}]
# An optional leading String argument names the field used to
# identify rows in the report (defaults to the first field).
# The function returns an array with all errors
def verify(*tests)
  if(tests[0].is_a? String)
    id=tests[0]
    tests.shift
  else
    id=@fields[0]
  end
  vr=[]
  i=0
  each do |row|
    # Rows are reported 1-based.
    i+=1
    tests.each{|test|
      if ! test[2].call(row)
        values=""
        if test[1].size>0
          values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
        end
        vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
      end
    }
  end
  vr
end
507
# Compact, inspect-style description of the dataset.
def to_s
  # Fixed: the description opened with "#<" but never closed the ">".
  "#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] labels="+@labels.inspect+" cases="+@vectors[@fields[0]].size.to_s+">"
end
510
# Use the same compact representation as #to_s.
def inspect
  to_s
end
513
# Builds a plain-text summary of the dataset: a header line followed
# by each vector's own summary wrapped in separator rules.
def summary
  text = "Summary for dataset\n"
  @vectors.each do |name, vector|
    text << "###############\n"
    text << "Vector #{name}:\n"
    text << vector.summary
    text << "###############\n"
  end
  text
end
525
+ end
526
+ end