statsample 0.5.1 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
@@ -23,8 +23,8 @@ module Statsample
23
23
  @exp=e
24
24
  end
25
25
  def to_s
26
- m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
27
- m+="\nRow: #{@i}" unless @i.nil?
26
+ m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
27
+ m+="\nRow: #{@ds.i}" unless @ds.i.nil?
28
28
  m
29
29
  end
30
30
  end
@@ -120,7 +120,7 @@ module Statsample
120
120
  end
121
121
  matrix
122
122
  end
123
- def vector_label(v_id)
123
+ def label(v_id)
124
124
  raise "Vector #{v} doesn't exists" unless @fields.include? v_id
125
125
  @labels[v_id].nil? ? v_id : @labels[v_id]
126
126
  end
@@ -334,7 +334,7 @@ module Statsample
334
334
  a=[]
335
335
  fields=check_fields(fields)
336
336
  size=fields.size
337
- each_with_index do |i, row|
337
+ each_with_index do |row, i |
338
338
  # number of invalid values
339
339
  sum=0
340
340
  invalids=0
@@ -407,21 +407,21 @@ module Statsample
407
407
  }
408
408
  @i=nil
409
409
  rescue =>e
410
- raise DatasetException.new(self,e)
410
+ raise DatasetException.new(self, e)
411
411
  end
412
412
  end
413
- # Returns each case as index and hash
413
+ # Returns each case as hash and index
414
414
  def each_with_index
415
415
  begin
416
416
  @i=0
417
417
  @cases.times{|i|
418
418
  @i=i
419
419
  row=case_as_hash(i)
420
- yield i,row
420
+ yield row, i
421
421
  }
422
422
  @i=nil
423
423
  rescue =>e
424
- raise DatasetException.new(self,e)
424
+ raise DatasetException.new(self, e)
425
425
  end
426
426
  end
427
427
  # Returns each case as an array, coding missing values as nils
@@ -473,26 +473,28 @@ module Statsample
473
473
  def collect(type=:scale)
474
474
  data=[]
475
475
  each {|row|
476
- data.push(yield(row))
476
+ data.push yield(row)
477
477
  }
478
478
  Statsample::Vector.new(data,type)
479
479
  end
480
480
  def collect_with_index(type=:scale)
481
481
  data=[]
482
- each_with_index {|i,row|
482
+ each_with_index {|row, i|
483
483
  data.push(yield(i,row))
484
484
  }
485
485
  Statsample::Vector.new(data,type)
486
486
  end
487
487
  # Recode a vector based on a block
488
488
  def recode!(vector_name)
489
+
489
490
  0.upto(@cases-1) {|i|
490
491
  @vectors[vector_name].data[i]=yield case_as_hash(i)
491
492
  }
492
493
  @vectors[vector_name].set_valid_data
493
494
  end
494
- def crosstab(v1,v2)
495
- Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
495
+
496
+ def crosstab(v1,v2,opts={})
497
+ Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
496
498
  end
497
499
  def[]=(i,v)
498
500
  if v.instance_of? Statsample::Vector
@@ -508,6 +510,15 @@ module Statsample
508
510
  rows.push(c)
509
511
  }
510
512
  Matrix.rows(rows)
513
+ end
514
+ if HAS_GSL
515
+ def to_matrix_gsl
516
+ rows=[]
517
+ self.each_array{|c|
518
+ rows.push(c)
519
+ }
520
+ GSL::Matrix.alloc(*rows)
521
+ end
511
522
  end
512
523
  def to_multiset_by_split(*fields)
513
524
  require 'statsample/multiset'
@@ -640,6 +651,77 @@ module Statsample
640
651
  def inspect
641
652
  self.to_s
642
653
  end
654
+ # Creates a new dataset for one to many relations
655
+ # on a dataset, based on pattern of field names.
656
+ # for example, you have a survey for number of children
657
+ # with this structure:
658
+ # id, name, child_name_1, child_age_1, child_name_2, child_age_2
659
+ # with
660
+ # ds.one_to_many(%w{id}, "child_%v_%n")
661
+ # the fields named in the first parameter will be copied verbatim
662
+ # to the new dataset, and fields which match the second
663
+ # pattern will be added as one case for each different %n.
664
+ # For example
665
+ # cases=[
666
+ # ['1','george','red',10,'blue',20,nil,nil],
667
+ # ['2','fred','green',15,'orange',30,'white',20],
668
+ # ['3','alfred',nil,nil,nil,nil,nil,nil]
669
+ # ]
670
+ # ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
671
+ # cases.each {|c| ds.add_case_array c }
672
+ # ds.one_to_many(['id'],'car_%v%n').to_matrix
673
+ # => Matrix[
674
+ # ["red", "1", 10],
675
+ # ["blue", "1", 20],
676
+ # ["green", "2", 15],
677
+ # ["orange", "2", 30],
678
+ # ["white", "2", 20]
679
+ # ]
680
+ #
681
+ def one_to_many(parent_fields, pattern)
682
+ base_pattern=pattern.gsub(/%v|%n/,"")
683
+ re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
684
+ ds_vars=parent_fields
685
+ vars=[]
686
+ max_n=0
687
+ h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
688
+ # Adding _col_id
689
+ h['_col_id']=[].to_scale
690
+ ds_vars.push("_col_id")
691
+ @fields.each do |f|
692
+ if f=~re
693
+ if !vars.include? $1
694
+ vars.push($1)
695
+ h[$1]=Statsample::Vector.new([], @vectors[f].type)
696
+ end
697
+ max_n=$2.to_i if max_n < $2.to_i
698
+ end
699
+ end
700
+ ds=Dataset.new(h,ds_vars+vars)
701
+ each do |row|
702
+ row_out={}
703
+ parent_fields.each do |f|
704
+ row_out[f]=row[f]
705
+ end
706
+ max_n.times do |n1|
707
+ n=n1+1
708
+ any_data=false
709
+ vars.each do |v|
710
+ data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
711
+ row_out[v]=data
712
+ any_data=true if !data.nil?
713
+ end
714
+ if any_data
715
+ row_out["_col_id"]=n
716
+ ds.add_case(row_out,false)
717
+ end
718
+
719
+ end
720
+ end
721
+ ds.update_valid_data
722
+ ds
723
+ end
724
+
643
725
  def summary
644
726
  out=""
645
727
  out << "Summary for dataset\n"
@@ -38,15 +38,16 @@ module Statsample
38
38
  return 0.5 if dm==0.5
39
39
  dominances=[dm]
40
40
  @models_data.each{|k,m|
41
- if !m.contributions[i].nil? and !m.contributions[j].nil?
42
- if m.contributions[i]>m.contributions[j]
43
- dominances.push(1)
44
- elsif m.contributions[i]<m.contributions[j]
45
- dominances.push(0)
46
- else
47
- dominances.push(0.5)
48
- end
41
+ if !m.contributions[i].nil? and !m.contributions[j].nil?
42
+ if m.contributions[i]>m.contributions[j]
43
+ dominances.push(1)
44
+ elsif m.contributions[i]<m.contributions[j]
45
+ dominances.push(0)
46
+ else
47
+ return 0.5
48
+ #dominances.push(0.5)
49
49
  end
50
+ end
50
51
  }
51
52
  final=dominances.uniq
52
53
  final.size>1 ? 0.5 : final[0]
@@ -64,7 +65,8 @@ module Statsample
64
65
  elsif a[i]<a[j]
65
66
  dominances.push(0)
66
67
  else
67
- a(0.5)
68
+ return 0.5
69
+ dominances.push(0.5)
68
70
  end
69
71
  end
70
72
  final=dominances.uniq
@@ -72,34 +74,34 @@ module Statsample
72
74
  end
73
75
  # Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined
74
76
  def general_dominance_pairwise(i,j)
75
- ga=general_averages
76
- if ga[i]>ga[j]
77
- 1
78
- elsif ga[i]<ga[j]
79
- 0
80
- else
81
- 0.5
82
- end
77
+ ga=general_averages
78
+ if ga[i]>ga[j]
79
+ 1
80
+ elsif ga[i]<ga[j]
81
+ 0
82
+ else
83
+ 0.5
84
+ end
83
85
  end
84
86
  def pairs
85
- @models.find_all{|m| m.size==2}
87
+ @models.find_all{|m| m.size==2}
86
88
  end
87
89
  def total_dominance
88
90
  pairs.inject({}){|a,pair|
89
- a[pair]=total_dominance_pairwise(pair[0], pair[1])
90
- a
91
+ a[pair]=total_dominance_pairwise(pair[0], pair[1])
92
+ a
91
93
  }
92
94
  end
93
95
  def conditional_dominance
94
96
  pairs.inject({}){|a,pair|
95
- a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
96
- a
97
+ a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
98
+ a
97
99
  }
98
100
  end
99
101
  def general_dominance
100
102
  pairs.inject({}){|a,pair|
101
- a[pair]=general_dominance_pairwise(pair[0], pair[1])
102
- a
103
+ a[pair]=general_dominance_pairwise(pair[0], pair[1])
104
+ a
103
105
  }
104
106
  end
105
107
 
@@ -108,56 +110,61 @@ module Statsample
108
110
  end
109
111
  # Get all models of size k
110
112
  def md_k(k)
111
- out=[]
112
- models=@models.each{|m|
113
- out.push(md(m)) if m.size==k
114
- }
115
- out
113
+ out=[]
114
+ models=@models.each{|m| out.push(md(m)) if m.size==k }
115
+ out
116
116
  end
117
+
118
+ # For a hash with arrays of numbers as values
119
+ # Returns a hash with same keys and
120
+ # value as the mean of values of original hash
121
+
117
122
  def get_averages(averages)
118
123
  out={}
119
124
  averages.each{|key,val| out[key]=val.to_vector(:scale).mean }
120
125
  out
121
126
  end
127
+ # Hash with average for each k size
128
+ # model
122
129
  def average_k(k)
123
- return nil if k==@fields.size
124
- models=md_k(k)
125
- averages=@fields.inject({}) {|a,v| a[v]=[];a}
126
- models.each{|m|
127
- @fields.each{|f|
128
- averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
129
- }
130
- }
131
- get_averages(averages)
130
+ return nil if k==@fields.size
131
+ models=md_k(k)
132
+ averages=@fields.inject({}) {|a,v| a[v]=[];a}
133
+ models.each do |m|
134
+ @fields.each do |f|
135
+ averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
136
+ end
137
+ end
138
+ get_averages(averages)
132
139
  end
133
140
  def general_averages
134
- if @general_averages.nil?
135
- averages=@fields.inject({}) {|a,v| a[v]=[md(v).r2];a}
136
- for k in 1...@fields.size
137
- ak=average_k(k)
138
- @fields.each{|f|
139
- averages[f].push(ak[f])
140
- }
141
- end
142
- @general_averages=get_averages(averages)
141
+ if @general_averages.nil?
142
+ averages=@fields.inject({}) {|a,v| a[v]=[md(v).r2];a}
143
+ for k in 1...@fields.size
144
+ ak=average_k(k)
145
+ @fields.each{|f|
146
+ averages[f].push(ak[f])
147
+ }
143
148
  end
144
- @general_averages
149
+ @general_averages=get_averages(averages)
150
+ end
151
+ @general_averages
145
152
  end
146
153
  def create_models
147
- @models=[]
148
- @models_data={}
149
- for i in 1..@fields.size
150
- c=Statsample::Combination.new(i,@fields.size)
151
- c.each{|data|
152
- convert=data.collect {|i1|
153
- @fields[i1]
154
- }
155
- @models.push(convert)
156
- ds_prev=@ds.dup(convert+[@y_var])
157
- modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
158
- @models_data[convert.sort]=modeldata
159
- }
160
- end
154
+ @models=[]
155
+ @models_data={}
156
+ for i in 1..@fields.size
157
+ c=Statsample::Combination.new(i,@fields.size)
158
+ c.each{|data|
159
+ convert=data.collect {|i1|
160
+ @fields[i1]
161
+ }
162
+ @models.push(convert)
163
+ ds_prev=@ds.dup(convert+[@y_var])
164
+ modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
165
+ @models_data[convert.sort]=modeldata
166
+ }
167
+ end
161
168
  end
162
169
  def summary(report_type=ConsoleSummary)
163
170
  out=""
@@ -26,25 +26,29 @@ class DominanceAnalysis
26
26
  end
27
27
  @da
28
28
  end
29
+ # Creates re-samples from original dataset.
30
+ # * number_samples: Number of new samples to add
31
+ # * n: size of each new sample. If nil, equal to original sample size
32
+ # * report: if true, echo number of current resample and total
29
33
  def bootstrap(number_samples,n=nil,report=false)
30
- number_samples.times{ |t|
31
- @n_samples+=1
32
- puts _("Bootstrap %d of %d") % [t+1, number_samples] if report
33
- ds_boot=@ds.bootstrap(n)
34
- da_1=DominanceAnalysis.new(ds_boot,@y_var,@lr_class)
35
- da_1.total_dominance.each{|k,v|
36
- @samples_td[k].push(v)
37
- }
38
- da_1.conditional_dominance.each{|k,v|
39
- @samples_cd[k].push(v)
40
- }
41
- da_1.general_dominance.each{|k,v|
42
- @samples_gd[k].push(v)
43
- }
44
- da_1.general_averages.each{|k,v|
45
- @samples_ga[k].push(v)
46
- }
47
- }
34
+ number_samples.times{ |t|
35
+ @n_samples+=1
36
+ puts _("Bootstrap %d of %d") % [t+1, number_samples] if report
37
+ ds_boot=@ds.bootstrap(n)
38
+ da_1=DominanceAnalysis.new(ds_boot,@y_var,@lr_class)
39
+ da_1.total_dominance.each{|k,v|
40
+ @samples_td[k].push(v)
41
+ }
42
+ da_1.conditional_dominance.each{|k,v|
43
+ @samples_cd[k].push(v)
44
+ }
45
+ da_1.general_dominance.each{|k,v|
46
+ @samples_gd[k].push(v)
47
+ }
48
+ da_1.general_averages.each{|k,v|
49
+ @samples_ga[k].push(v)
50
+ }
51
+ }
48
52
  end
49
53
  def create_samples_pairs
50
54
  @samples_td={}
@@ -106,7 +110,7 @@ class DominanceAnalysis
106
110
  table.header=[_("var"),_("mean"),_("se"),_("p.5"),_("p.95")]
107
111
  @fields.each{|f|
108
112
  v=@samples_ga[f].to_vector(:scale)
109
- row=[@ds.vector_label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
113
+ row=[@ds.label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
110
114
  table.add_row(row)
111
115
 
112
116
  }
@@ -115,10 +119,10 @@ class DominanceAnalysis
115
119
  end
116
120
  def summary_pairs(pair,std,ttd)
117
121
  freqs=std.proportions
118
- [0,0.5,1].each{|n|
122
+ [0, 0.5, 1].each{|n|
119
123
  freqs[n]=0 if freqs[n].nil?
120
124
  }
121
- name=@ds.vector_label(pair[0])+" - "+@ds.vector_label(pair[1])
125
+ name=@ds.label(pair[0])+" - "+@ds.label(pair[1])
122
126
  [name,f(ttd,1),f(std.mean,4),f(std.sd),f(freqs[1]), f(freqs[0]), f(freqs[0.5]), f(freqs[ttd])]
123
127
  end
124
128
  def f(v,n=3)
@@ -0,0 +1,18 @@
1
+ require 'statsample/factor/pca'
2
+ require 'statsample/factor/principalaxis'
3
+ require 'statsample/factor/rotation'
4
+
5
+ module Statsample
6
+ # = Factor Analysis toolbox
7
+ # * Classes for Extraction of factors:
8
+ # * Statsample::Factor::PCA
9
+ # * Statsample::Factor::PrincipalAxis
10
+ # * Classes for Rotation of factors:
11
+ # * Statsample::Factor::Varimax
12
+ # * Statsample::Factor::Equimax
13
+ # * Statsample::Factor::Quartimax
14
+ #
15
+ # See documentation of each class to use it
16
+ module Factor
17
+ end
18
+ end