statsample 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
@@ -23,8 +23,8 @@ module Statsample
23
23
  @exp=e
24
24
  end
25
25
  def to_s
26
- m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")+"\nOn Dataset:"+@ds.inspect
27
- m+="\nRow: #{@i}" unless @i.nil?
26
+ m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
27
+ m+="\nRow: #{@ds.i}" unless @ds.i.nil?
28
28
  m
29
29
  end
30
30
  end
@@ -120,7 +120,7 @@ module Statsample
120
120
  end
121
121
  matrix
122
122
  end
123
- def vector_label(v_id)
123
+ def label(v_id)
124
124
  raise "Vector #{v} doesn't exists" unless @fields.include? v_id
125
125
  @labels[v_id].nil? ? v_id : @labels[v_id]
126
126
  end
@@ -334,7 +334,7 @@ module Statsample
334
334
  a=[]
335
335
  fields=check_fields(fields)
336
336
  size=fields.size
337
- each_with_index do |i, row|
337
+ each_with_index do |row, i |
338
338
  # numero de invalidos
339
339
  sum=0
340
340
  invalids=0
@@ -407,21 +407,21 @@ module Statsample
407
407
  }
408
408
  @i=nil
409
409
  rescue =>e
410
- raise DatasetException.new(self,e)
410
+ raise DatasetException.new(self, e)
411
411
  end
412
412
  end
413
- # Returns each case as index and hash
413
+ # Returns each case as hash and index
414
414
  def each_with_index
415
415
  begin
416
416
  @i=0
417
417
  @cases.times{|i|
418
418
  @i=i
419
419
  row=case_as_hash(i)
420
- yield i,row
420
+ yield row, i
421
421
  }
422
422
  @i=nil
423
423
  rescue =>e
424
- raise DatasetException.new(self,e)
424
+ raise DatasetException.new(self, e)
425
425
  end
426
426
  end
427
427
  # Returns each case as an array, coding missing values as nils
@@ -473,26 +473,28 @@ module Statsample
473
473
  def collect(type=:scale)
474
474
  data=[]
475
475
  each {|row|
476
- data.push(yield(row))
476
+ data.push yield(row)
477
477
  }
478
478
  Statsample::Vector.new(data,type)
479
479
  end
480
480
  def collect_with_index(type=:scale)
481
481
  data=[]
482
- each_with_index {|i,row|
482
+ each_with_index {|row, i|
483
483
  data.push(yield(i,row))
484
484
  }
485
485
  Statsample::Vector.new(data,type)
486
486
  end
487
487
  # Recode a vector based on a block
488
488
  def recode!(vector_name)
489
+
489
490
  0.upto(@cases-1) {|i|
490
491
  @vectors[vector_name].data[i]=yield case_as_hash(i)
491
492
  }
492
493
  @vectors[vector_name].set_valid_data
493
494
  end
494
- def crosstab(v1,v2)
495
- Statsample::Crosstab.new(@vectors[v1],@vectors[v2])
495
+
496
+ def crosstab(v1,v2,opts={})
497
+ Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
496
498
  end
497
499
  def[]=(i,v)
498
500
  if v.instance_of? Statsample::Vector
@@ -508,6 +510,15 @@ module Statsample
508
510
  rows.push(c)
509
511
  }
510
512
  Matrix.rows(rows)
513
+ end
514
+ if HAS_GSL
515
+ def to_matrix_gsl
516
+ rows=[]
517
+ self.each_array{|c|
518
+ rows.push(c)
519
+ }
520
+ GSL::Matrix.alloc(*rows)
521
+ end
511
522
  end
512
523
  def to_multiset_by_split(*fields)
513
524
  require 'statsample/multiset'
@@ -640,6 +651,77 @@ module Statsample
640
651
  def inspect
641
652
  self.to_s
642
653
  end
654
+ # Creates a new dataset for one to many relations
655
+ # on a dataset, based on pattern of field names.
656
+ # for example, you have a survey for number of children
657
+ # with this structure:
658
+ # id, name, child_name_1, child_age_1, child_name_2, child_age_2
659
+ # with
660
+ # ds.one_to_many(%w{id}, "child_%v_%n")
661
+ # the fields named in the first parameter will be copied verbatim
662
+ # to the new dataset, and for the fields which match the second
663
+ # pattern, one case will be added for each different %n.
664
+ # For example
665
+ # cases=[
666
+ # ['1','george','red',10,'blue',20,nil,nil],
667
+ # ['2','fred','green',15,'orange',30,'white',20],
668
+ # ['3','alfred',nil,nil,nil,nil,nil,nil]
669
+ # ]
670
+ # ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
671
+ # cases.each {|c| ds.add_case_array c }
672
+ # ds.one_to_many(['id'],'car_%v%n').to_matrix
673
+ # => Matrix[
674
+ # ["red", "1", 10],
675
+ # ["blue", "1", 20],
676
+ # ["green", "2", 15],
677
+ # ["orange", "2", 30],
678
+ # ["white", "2", 20]
679
+ # ]
680
+ #
681
+ def one_to_many(parent_fields, pattern)
682
+ base_pattern=pattern.gsub(/%v|%n/,"")
683
+ re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
684
+ ds_vars=parent_fields
685
+ vars=[]
686
+ max_n=0
687
+ h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
688
+ # Adding _col_id
689
+ h['_col_id']=[].to_scale
690
+ ds_vars.push("_col_id")
691
+ @fields.each do |f|
692
+ if f=~re
693
+ if !vars.include? $1
694
+ vars.push($1)
695
+ h[$1]=Statsample::Vector.new([], @vectors[f].type)
696
+ end
697
+ max_n=$2.to_i if max_n < $2.to_i
698
+ end
699
+ end
700
+ ds=Dataset.new(h,ds_vars+vars)
701
+ each do |row|
702
+ row_out={}
703
+ parent_fields.each do |f|
704
+ row_out[f]=row[f]
705
+ end
706
+ max_n.times do |n1|
707
+ n=n1+1
708
+ any_data=false
709
+ vars.each do |v|
710
+ data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
711
+ row_out[v]=data
712
+ any_data=true if !data.nil?
713
+ end
714
+ if any_data
715
+ row_out["_col_id"]=n
716
+ ds.add_case(row_out,false)
717
+ end
718
+
719
+ end
720
+ end
721
+ ds.update_valid_data
722
+ ds
723
+ end
724
+
643
725
  def summary
644
726
  out=""
645
727
  out << "Summary for dataset\n"
@@ -38,15 +38,16 @@ module Statsample
38
38
  return 0.5 if dm==0.5
39
39
  dominances=[dm]
40
40
  @models_data.each{|k,m|
41
- if !m.contributions[i].nil? and !m.contributions[j].nil?
42
- if m.contributions[i]>m.contributions[j]
43
- dominances.push(1)
44
- elsif m.contributions[i]<m.contributions[j]
45
- dominances.push(0)
46
- else
47
- dominances.push(0.5)
48
- end
41
+ if !m.contributions[i].nil? and !m.contributions[j].nil?
42
+ if m.contributions[i]>m.contributions[j]
43
+ dominances.push(1)
44
+ elsif m.contributions[i]<m.contributions[j]
45
+ dominances.push(0)
46
+ else
47
+ return 0.5
48
+ #dominances.push(0.5)
49
49
  end
50
+ end
50
51
  }
51
52
  final=dominances.uniq
52
53
  final.size>1 ? 0.5 : final[0]
@@ -64,7 +65,8 @@ module Statsample
64
65
  elsif a[i]<a[j]
65
66
  dominances.push(0)
66
67
  else
67
- a(0.5)
68
+ return 0.5
69
+ dominances.push(0.5)
68
70
  end
69
71
  end
70
72
  final=dominances.uniq
@@ -72,34 +74,34 @@ module Statsample
72
74
  end
73
75
  # Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined
74
76
  def general_dominance_pairwise(i,j)
75
- ga=general_averages
76
- if ga[i]>ga[j]
77
- 1
78
- elsif ga[i]<ga[j]
79
- 0
80
- else
81
- 0.5
82
- end
77
+ ga=general_averages
78
+ if ga[i]>ga[j]
79
+ 1
80
+ elsif ga[i]<ga[j]
81
+ 0
82
+ else
83
+ 0.5
84
+ end
83
85
  end
84
86
  def pairs
85
- @models.find_all{|m| m.size==2}
87
+ @models.find_all{|m| m.size==2}
86
88
  end
87
89
  def total_dominance
88
90
  pairs.inject({}){|a,pair|
89
- a[pair]=total_dominance_pairwise(pair[0], pair[1])
90
- a
91
+ a[pair]=total_dominance_pairwise(pair[0], pair[1])
92
+ a
91
93
  }
92
94
  end
93
95
  def conditional_dominance
94
96
  pairs.inject({}){|a,pair|
95
- a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
96
- a
97
+ a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
98
+ a
97
99
  }
98
100
  end
99
101
  def general_dominance
100
102
  pairs.inject({}){|a,pair|
101
- a[pair]=general_dominance_pairwise(pair[0], pair[1])
102
- a
103
+ a[pair]=general_dominance_pairwise(pair[0], pair[1])
104
+ a
103
105
  }
104
106
  end
105
107
 
@@ -108,56 +110,61 @@ module Statsample
108
110
  end
109
111
  # Get all models of size k
110
112
  def md_k(k)
111
- out=[]
112
- models=@models.each{|m|
113
- out.push(md(m)) if m.size==k
114
- }
115
- out
113
+ out=[]
114
+ models=@models.each{|m| out.push(md(m)) if m.size==k }
115
+ out
116
116
  end
117
+
118
+ # For a hash with arrays of numbers as values
119
+ # Returns a hash with same keys and
120
+ # value as the mean of values of original hash
121
+
117
122
  def get_averages(averages)
118
123
  out={}
119
124
  averages.each{|key,val| out[key]=val.to_vector(:scale).mean }
120
125
  out
121
126
  end
127
+ # Hash with average for each k size
128
+ # model
122
129
  def average_k(k)
123
- return nil if k==@fields.size
124
- models=md_k(k)
125
- averages=@fields.inject({}) {|a,v| a[v]=[];a}
126
- models.each{|m|
127
- @fields.each{|f|
128
- averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
129
- }
130
- }
131
- get_averages(averages)
130
+ return nil if k==@fields.size
131
+ models=md_k(k)
132
+ averages=@fields.inject({}) {|a,v| a[v]=[];a}
133
+ models.each do |m|
134
+ @fields.each do |f|
135
+ averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
136
+ end
137
+ end
138
+ get_averages(averages)
132
139
  end
133
140
  def general_averages
134
- if @general_averages.nil?
135
- averages=@fields.inject({}) {|a,v| a[v]=[md(v).r2];a}
136
- for k in 1...@fields.size
137
- ak=average_k(k)
138
- @fields.each{|f|
139
- averages[f].push(ak[f])
140
- }
141
- end
142
- @general_averages=get_averages(averages)
141
+ if @general_averages.nil?
142
+ averages=@fields.inject({}) {|a,v| a[v]=[md(v).r2];a}
143
+ for k in 1...@fields.size
144
+ ak=average_k(k)
145
+ @fields.each{|f|
146
+ averages[f].push(ak[f])
147
+ }
143
148
  end
144
- @general_averages
149
+ @general_averages=get_averages(averages)
150
+ end
151
+ @general_averages
145
152
  end
146
153
  def create_models
147
- @models=[]
148
- @models_data={}
149
- for i in 1..@fields.size
150
- c=Statsample::Combination.new(i,@fields.size)
151
- c.each{|data|
152
- convert=data.collect {|i1|
153
- @fields[i1]
154
- }
155
- @models.push(convert)
156
- ds_prev=@ds.dup(convert+[@y_var])
157
- modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
158
- @models_data[convert.sort]=modeldata
159
- }
160
- end
154
+ @models=[]
155
+ @models_data={}
156
+ for i in 1..@fields.size
157
+ c=Statsample::Combination.new(i,@fields.size)
158
+ c.each{|data|
159
+ convert=data.collect {|i1|
160
+ @fields[i1]
161
+ }
162
+ @models.push(convert)
163
+ ds_prev=@ds.dup(convert+[@y_var])
164
+ modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
165
+ @models_data[convert.sort]=modeldata
166
+ }
167
+ end
161
168
  end
162
169
  def summary(report_type=ConsoleSummary)
163
170
  out=""
@@ -26,25 +26,29 @@ class DominanceAnalysis
26
26
  end
27
27
  @da
28
28
  end
29
+ # Creates re-samples from original dataset.
30
+ # * number_samples: Number of new samples to add
31
+ # * n: size of each new sample. If nil, equal to original sample size
32
+ # * report: if true, echo number of current resample and total
29
33
  def bootstrap(number_samples,n=nil,report=false)
30
- number_samples.times{ |t|
31
- @n_samples+=1
32
- puts _("Bootstrap %d of %d") % [t+1, number_samples] if report
33
- ds_boot=@ds.bootstrap(n)
34
- da_1=DominanceAnalysis.new(ds_boot,@y_var,@lr_class)
35
- da_1.total_dominance.each{|k,v|
36
- @samples_td[k].push(v)
37
- }
38
- da_1.conditional_dominance.each{|k,v|
39
- @samples_cd[k].push(v)
40
- }
41
- da_1.general_dominance.each{|k,v|
42
- @samples_gd[k].push(v)
43
- }
44
- da_1.general_averages.each{|k,v|
45
- @samples_ga[k].push(v)
46
- }
47
- }
34
+ number_samples.times{ |t|
35
+ @n_samples+=1
36
+ puts _("Bootstrap %d of %d") % [t+1, number_samples] if report
37
+ ds_boot=@ds.bootstrap(n)
38
+ da_1=DominanceAnalysis.new(ds_boot,@y_var,@lr_class)
39
+ da_1.total_dominance.each{|k,v|
40
+ @samples_td[k].push(v)
41
+ }
42
+ da_1.conditional_dominance.each{|k,v|
43
+ @samples_cd[k].push(v)
44
+ }
45
+ da_1.general_dominance.each{|k,v|
46
+ @samples_gd[k].push(v)
47
+ }
48
+ da_1.general_averages.each{|k,v|
49
+ @samples_ga[k].push(v)
50
+ }
51
+ }
48
52
  end
49
53
  def create_samples_pairs
50
54
  @samples_td={}
@@ -106,7 +110,7 @@ class DominanceAnalysis
106
110
  table.header=[_("var"),_("mean"),_("se"),_("p.5"),_("p.95")]
107
111
  @fields.each{|f|
108
112
  v=@samples_ga[f].to_vector(:scale)
109
- row=[@ds.vector_label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
113
+ row=[@ds.label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
110
114
  table.add_row(row)
111
115
 
112
116
  }
@@ -115,10 +119,10 @@ class DominanceAnalysis
115
119
  end
116
120
  def summary_pairs(pair,std,ttd)
117
121
  freqs=std.proportions
118
- [0,0.5,1].each{|n|
122
+ [0, 0.5, 1].each{|n|
119
123
  freqs[n]=0 if freqs[n].nil?
120
124
  }
121
- name=@ds.vector_label(pair[0])+" - "+@ds.vector_label(pair[1])
125
+ name=@ds.label(pair[0])+" - "+@ds.label(pair[1])
122
126
  [name,f(ttd,1),f(std.mean,4),f(std.sd),f(freqs[1]), f(freqs[0]), f(freqs[0.5]), f(freqs[ttd])]
123
127
  end
124
128
  def f(v,n=3)
@@ -0,0 +1,18 @@
1
+ require 'statsample/factor/pca'
2
+ require 'statsample/factor/principalaxis'
3
+ require 'statsample/factor/rotation'
4
+
5
+ module Statsample
6
+ # = Factor Analysis toolbox
7
+ # * Classes for Extraction of factors:
8
+ # * Statsample::Factor::PCA
9
+ # * Statsample::Factor::PrincipalAxis
10
+ # * Classes for Rotation of factors:
11
+ # * Statsample::Factor::Varimax
12
+ # * Statsample::Factor::Equimax
13
+ # * Statsample::Factor::Quartimax
14
+ #
15
+ # See documentation of each class to use it
16
+ module Factor
17
+ end
18
+ end