statsample 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/lib/statsample/dataset.rb
CHANGED
@@ -23,8 +23,8 @@ module Statsample
|
|
23
23
|
@exp=e
|
24
24
|
end
|
25
25
|
def to_s
|
26
|
-
m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")
|
27
|
-
m+="\nRow: #{@i}" unless @i.nil?
|
26
|
+
m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
|
27
|
+
m+="\nRow: #{@ds.i}" unless @ds.i.nil?
|
28
28
|
m
|
29
29
|
end
|
30
30
|
end
|
@@ -120,7 +120,7 @@ module Statsample
|
|
120
120
|
end
|
121
121
|
matrix
|
122
122
|
end
|
123
|
-
def
|
123
|
+
def label(v_id)
|
124
124
|
raise "Vector #{v} doesn't exists" unless @fields.include? v_id
|
125
125
|
@labels[v_id].nil? ? v_id : @labels[v_id]
|
126
126
|
end
|
@@ -334,7 +334,7 @@ module Statsample
|
|
334
334
|
a=[]
|
335
335
|
fields=check_fields(fields)
|
336
336
|
size=fields.size
|
337
|
-
each_with_index do |
|
337
|
+
each_with_index do |row, i |
|
338
338
|
# number of invalid values
|
339
339
|
sum=0
|
340
340
|
invalids=0
|
@@ -407,21 +407,21 @@ module Statsample
|
|
407
407
|
}
|
408
408
|
@i=nil
|
409
409
|
rescue =>e
|
410
|
-
raise DatasetException.new(self,e)
|
410
|
+
raise DatasetException.new(self, e)
|
411
411
|
end
|
412
412
|
end
|
413
|
-
# Returns each case as
|
413
|
+
# Returns each case as hash and index
|
414
414
|
def each_with_index
|
415
415
|
begin
|
416
416
|
@i=0
|
417
417
|
@cases.times{|i|
|
418
418
|
@i=i
|
419
419
|
row=case_as_hash(i)
|
420
|
-
yield i
|
420
|
+
yield row, i
|
421
421
|
}
|
422
422
|
@i=nil
|
423
423
|
rescue =>e
|
424
|
-
raise DatasetException.new(self,e)
|
424
|
+
raise DatasetException.new(self, e)
|
425
425
|
end
|
426
426
|
end
|
427
427
|
# Returns each case as an array, coding missing values as nils
|
@@ -473,26 +473,28 @@ module Statsample
|
|
473
473
|
def collect(type=:scale)
|
474
474
|
data=[]
|
475
475
|
each {|row|
|
476
|
-
data.push
|
476
|
+
data.push yield(row)
|
477
477
|
}
|
478
478
|
Statsample::Vector.new(data,type)
|
479
479
|
end
|
480
480
|
def collect_with_index(type=:scale)
|
481
481
|
data=[]
|
482
|
-
each_with_index {|i
|
482
|
+
each_with_index {|row, i|
|
483
483
|
data.push(yield(i,row))
|
484
484
|
}
|
485
485
|
Statsample::Vector.new(data,type)
|
486
486
|
end
|
487
487
|
# Recode a vector based on a block
|
488
488
|
def recode!(vector_name)
|
489
|
+
|
489
490
|
0.upto(@cases-1) {|i|
|
490
491
|
@vectors[vector_name].data[i]=yield case_as_hash(i)
|
491
492
|
}
|
492
493
|
@vectors[vector_name].set_valid_data
|
493
494
|
end
|
494
|
-
|
495
|
-
|
495
|
+
|
496
|
+
def crosstab(v1,v2,opts={})
|
497
|
+
Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
|
496
498
|
end
|
497
499
|
def[]=(i,v)
|
498
500
|
if v.instance_of? Statsample::Vector
|
@@ -508,6 +510,15 @@ module Statsample
|
|
508
510
|
rows.push(c)
|
509
511
|
}
|
510
512
|
Matrix.rows(rows)
|
513
|
+
end
|
514
|
+
if HAS_GSL
|
515
|
+
def to_matrix_gsl
|
516
|
+
rows=[]
|
517
|
+
self.each_array{|c|
|
518
|
+
rows.push(c)
|
519
|
+
}
|
520
|
+
GSL::Matrix.alloc(*rows)
|
521
|
+
end
|
511
522
|
end
|
512
523
|
def to_multiset_by_split(*fields)
|
513
524
|
require 'statsample/multiset'
|
@@ -640,6 +651,77 @@ module Statsample
|
|
640
651
|
def inspect
|
641
652
|
self.to_s
|
642
653
|
end
|
654
|
+
# Creates a new dataset for one to many relations
|
655
|
+
# on a dataset, based on pattern of field names.
|
656
|
+
# for example, you have a survey for number of children
|
657
|
+
# with this structure:
|
658
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
659
|
+
# with
|
660
|
+
# ds.one_to_many(%w{id}, "child_%v_%n")
|
661
|
+
# the fields of the first parameter will be copied verbatim
|
662
|
+
# to the new dataset, and fields which match the second
|
663
|
+
# pattern will be added as one case for each different %n.
|
664
|
+
# For example
|
665
|
+
# cases=[
|
666
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
667
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
668
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
669
|
+
# ]
|
670
|
+
# ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
|
671
|
+
# cases.each {|c| ds.add_case_array c }
|
672
|
+
# ds.one_to_many(['id'],'car_%v%n').to_matrix
|
673
|
+
# => Matrix[
|
674
|
+
# ["red", "1", 10],
|
675
|
+
# ["blue", "1", 20],
|
676
|
+
# ["green", "2", 15],
|
677
|
+
# ["orange", "2", 30],
|
678
|
+
# ["white", "2", 20]
|
679
|
+
# ]
|
680
|
+
#
|
681
|
+
def one_to_many(parent_fields, pattern)
|
682
|
+
base_pattern=pattern.gsub(/%v|%n/,"")
|
683
|
+
re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
|
684
|
+
ds_vars=parent_fields
|
685
|
+
vars=[]
|
686
|
+
max_n=0
|
687
|
+
h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
|
688
|
+
# Adding _col_id
|
689
|
+
h['_col_id']=[].to_scale
|
690
|
+
ds_vars.push("_col_id")
|
691
|
+
@fields.each do |f|
|
692
|
+
if f=~re
|
693
|
+
if !vars.include? $1
|
694
|
+
vars.push($1)
|
695
|
+
h[$1]=Statsample::Vector.new([], @vectors[f].type)
|
696
|
+
end
|
697
|
+
max_n=$2.to_i if max_n < $2.to_i
|
698
|
+
end
|
699
|
+
end
|
700
|
+
ds=Dataset.new(h,ds_vars+vars)
|
701
|
+
each do |row|
|
702
|
+
row_out={}
|
703
|
+
parent_fields.each do |f|
|
704
|
+
row_out[f]=row[f]
|
705
|
+
end
|
706
|
+
max_n.times do |n1|
|
707
|
+
n=n1+1
|
708
|
+
any_data=false
|
709
|
+
vars.each do |v|
|
710
|
+
data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
|
711
|
+
row_out[v]=data
|
712
|
+
any_data=true if !data.nil?
|
713
|
+
end
|
714
|
+
if any_data
|
715
|
+
row_out["_col_id"]=n
|
716
|
+
ds.add_case(row_out,false)
|
717
|
+
end
|
718
|
+
|
719
|
+
end
|
720
|
+
end
|
721
|
+
ds.update_valid_data
|
722
|
+
ds
|
723
|
+
end
|
724
|
+
|
643
725
|
def summary
|
644
726
|
out=""
|
645
727
|
out << "Summary for dataset\n"
|
@@ -38,15 +38,16 @@ module Statsample
|
|
38
38
|
return 0.5 if dm==0.5
|
39
39
|
dominances=[dm]
|
40
40
|
@models_data.each{|k,m|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
41
|
+
if !m.contributions[i].nil? and !m.contributions[j].nil?
|
42
|
+
if m.contributions[i]>m.contributions[j]
|
43
|
+
dominances.push(1)
|
44
|
+
elsif m.contributions[i]<m.contributions[j]
|
45
|
+
dominances.push(0)
|
46
|
+
else
|
47
|
+
return 0.5
|
48
|
+
#dominances.push(0.5)
|
49
49
|
end
|
50
|
+
end
|
50
51
|
}
|
51
52
|
final=dominances.uniq
|
52
53
|
final.size>1 ? 0.5 : final[0]
|
@@ -64,7 +65,8 @@ module Statsample
|
|
64
65
|
elsif a[i]<a[j]
|
65
66
|
dominances.push(0)
|
66
67
|
else
|
67
|
-
|
68
|
+
return 0.5
|
69
|
+
dominances.push(0.5)
|
68
70
|
end
|
69
71
|
end
|
70
72
|
final=dominances.uniq
|
@@ -72,34 +74,34 @@ module Statsample
|
|
72
74
|
end
|
73
75
|
# Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined
|
74
76
|
def general_dominance_pairwise(i,j)
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
77
|
+
ga=general_averages
|
78
|
+
if ga[i]>ga[j]
|
79
|
+
1
|
80
|
+
elsif ga[i]<ga[j]
|
81
|
+
0
|
82
|
+
else
|
83
|
+
0.5
|
84
|
+
end
|
83
85
|
end
|
84
86
|
def pairs
|
85
|
-
|
87
|
+
@models.find_all{|m| m.size==2}
|
86
88
|
end
|
87
89
|
def total_dominance
|
88
90
|
pairs.inject({}){|a,pair|
|
89
|
-
|
90
|
-
|
91
|
+
a[pair]=total_dominance_pairwise(pair[0], pair[1])
|
92
|
+
a
|
91
93
|
}
|
92
94
|
end
|
93
95
|
def conditional_dominance
|
94
96
|
pairs.inject({}){|a,pair|
|
95
|
-
|
96
|
-
|
97
|
+
a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
|
98
|
+
a
|
97
99
|
}
|
98
100
|
end
|
99
101
|
def general_dominance
|
100
102
|
pairs.inject({}){|a,pair|
|
101
|
-
|
102
|
-
|
103
|
+
a[pair]=general_dominance_pairwise(pair[0], pair[1])
|
104
|
+
a
|
103
105
|
}
|
104
106
|
end
|
105
107
|
|
@@ -108,56 +110,61 @@ module Statsample
|
|
108
110
|
end
|
109
111
|
# Get all models of size k
|
110
112
|
def md_k(k)
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
}
|
115
|
-
out
|
113
|
+
out=[]
|
114
|
+
models=@models.each{|m| out.push(md(m)) if m.size==k }
|
115
|
+
out
|
116
116
|
end
|
117
|
+
|
118
|
+
# For a hash with arrays of numbers as values
|
119
|
+
# Returns a hash with same keys and
|
120
|
+
# value as the mean of values of original hash
|
121
|
+
|
117
122
|
def get_averages(averages)
|
118
123
|
out={}
|
119
124
|
averages.each{|key,val| out[key]=val.to_vector(:scale).mean }
|
120
125
|
out
|
121
126
|
end
|
127
|
+
# Hash with average for each k size
|
128
|
+
# model
|
122
129
|
def average_k(k)
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
130
|
+
return nil if k==@fields.size
|
131
|
+
models=md_k(k)
|
132
|
+
averages=@fields.inject({}) {|a,v| a[v]=[];a}
|
133
|
+
models.each do |m|
|
134
|
+
@fields.each do |f|
|
135
|
+
averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
|
136
|
+
end
|
137
|
+
end
|
138
|
+
get_averages(averages)
|
132
139
|
end
|
133
140
|
def general_averages
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
end
|
142
|
-
@general_averages=get_averages(averages)
|
141
|
+
if @general_averages.nil?
|
142
|
+
averages=@fields.inject({}) {|a,v| a[v]=[md(v).r2];a}
|
143
|
+
for k in 1...@fields.size
|
144
|
+
ak=average_k(k)
|
145
|
+
@fields.each{|f|
|
146
|
+
averages[f].push(ak[f])
|
147
|
+
}
|
143
148
|
end
|
144
|
-
@general_averages
|
149
|
+
@general_averages=get_averages(averages)
|
150
|
+
end
|
151
|
+
@general_averages
|
145
152
|
end
|
146
153
|
def create_models
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
154
|
+
@models=[]
|
155
|
+
@models_data={}
|
156
|
+
for i in 1..@fields.size
|
157
|
+
c=Statsample::Combination.new(i,@fields.size)
|
158
|
+
c.each{|data|
|
159
|
+
convert=data.collect {|i1|
|
160
|
+
@fields[i1]
|
161
|
+
}
|
162
|
+
@models.push(convert)
|
163
|
+
ds_prev=@ds.dup(convert+[@y_var])
|
164
|
+
modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
|
165
|
+
@models_data[convert.sort]=modeldata
|
166
|
+
}
|
167
|
+
end
|
161
168
|
end
|
162
169
|
def summary(report_type=ConsoleSummary)
|
163
170
|
out=""
|
@@ -26,25 +26,29 @@ class DominanceAnalysis
|
|
26
26
|
end
|
27
27
|
@da
|
28
28
|
end
|
29
|
+
# Creates re-samples from original dataset.
|
30
|
+
# * number_samples: Number of new samples to add
|
31
|
+
# * n: size of each new sample. If nil, equal to original sample size
|
32
|
+
# * report: if true, echo number of current resample and total
|
29
33
|
def bootstrap(number_samples,n=nil,report=false)
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
34
|
+
number_samples.times{ |t|
|
35
|
+
@n_samples+=1
|
36
|
+
puts _("Bootstrap %d of %d") % [t+1, number_samples] if report
|
37
|
+
ds_boot=@ds.bootstrap(n)
|
38
|
+
da_1=DominanceAnalysis.new(ds_boot,@y_var,@lr_class)
|
39
|
+
da_1.total_dominance.each{|k,v|
|
40
|
+
@samples_td[k].push(v)
|
41
|
+
}
|
42
|
+
da_1.conditional_dominance.each{|k,v|
|
43
|
+
@samples_cd[k].push(v)
|
44
|
+
}
|
45
|
+
da_1.general_dominance.each{|k,v|
|
46
|
+
@samples_gd[k].push(v)
|
47
|
+
}
|
48
|
+
da_1.general_averages.each{|k,v|
|
49
|
+
@samples_ga[k].push(v)
|
50
|
+
}
|
51
|
+
}
|
48
52
|
end
|
49
53
|
def create_samples_pairs
|
50
54
|
@samples_td={}
|
@@ -106,7 +110,7 @@ class DominanceAnalysis
|
|
106
110
|
table.header=[_("var"),_("mean"),_("se"),_("p.5"),_("p.95")]
|
107
111
|
@fields.each{|f|
|
108
112
|
v=@samples_ga[f].to_vector(:scale)
|
109
|
-
row=[@ds.
|
113
|
+
row=[@ds.label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
|
110
114
|
table.add_row(row)
|
111
115
|
|
112
116
|
}
|
@@ -115,10 +119,10 @@ class DominanceAnalysis
|
|
115
119
|
end
|
116
120
|
def summary_pairs(pair,std,ttd)
|
117
121
|
freqs=std.proportions
|
118
|
-
[0,0.5,1].each{|n|
|
122
|
+
[0, 0.5, 1].each{|n|
|
119
123
|
freqs[n]=0 if freqs[n].nil?
|
120
124
|
}
|
121
|
-
name=@ds.
|
125
|
+
name=@ds.label(pair[0])+" - "+@ds.label(pair[1])
|
122
126
|
[name,f(ttd,1),f(std.mean,4),f(std.sd),f(freqs[1]), f(freqs[0]), f(freqs[0.5]), f(freqs[ttd])]
|
123
127
|
end
|
124
128
|
def f(v,n=3)
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'statsample/factor/pca'
|
2
|
+
require 'statsample/factor/principalaxis'
|
3
|
+
require 'statsample/factor/rotation'
|
4
|
+
|
5
|
+
module Statsample
|
6
|
+
# = Factor Analysis toolbox
|
7
|
+
# * Classes for Extraction of factors:
|
8
|
+
# * Statsample::Factor::PCA
|
9
|
+
# * Statsample::Factor::PrincipalAxis
|
10
|
+
# * Classes for Rotation of factors:
|
11
|
+
# * Statsample::Factor::Varimax
|
12
|
+
# * Statsample::Factor::Equimax
|
13
|
+
# * Statsample::Factor::Quartimax
|
14
|
+
#
|
15
|
+
# See documentation of each class to use it
|
16
|
+
module Factor
|
17
|
+
end
|
18
|
+
end
|