statsample 0.5.1 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/lib/statsample/dataset.rb
CHANGED
@@ -23,8 +23,8 @@ module Statsample
|
|
23
23
|
@exp=e
|
24
24
|
end
|
25
25
|
def to_s
|
26
|
-
m="Error: "+@exp.message+"\n"+@exp.backtrace.join("\n")
|
27
|
-
m+="\nRow: #{@i}" unless @i.nil?
|
26
|
+
m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
|
27
|
+
m+="\nRow: #{@ds.i}" unless @ds.i.nil?
|
28
28
|
m
|
29
29
|
end
|
30
30
|
end
|
@@ -120,7 +120,7 @@ module Statsample
|
|
120
120
|
end
|
121
121
|
matrix
|
122
122
|
end
|
123
|
-
def
|
123
|
+
def label(v_id)
|
124
124
|
raise "Vector #{v} doesn't exists" unless @fields.include? v_id
|
125
125
|
@labels[v_id].nil? ? v_id : @labels[v_id]
|
126
126
|
end
|
@@ -334,7 +334,7 @@ module Statsample
|
|
334
334
|
a=[]
|
335
335
|
fields=check_fields(fields)
|
336
336
|
size=fields.size
|
337
|
-
each_with_index do |
|
337
|
+
each_with_index do |row, i |
|
338
338
|
# number of invalid values
|
339
339
|
sum=0
|
340
340
|
invalids=0
|
@@ -407,21 +407,21 @@ module Statsample
|
|
407
407
|
}
|
408
408
|
@i=nil
|
409
409
|
rescue =>e
|
410
|
-
raise DatasetException.new(self,e)
|
410
|
+
raise DatasetException.new(self, e)
|
411
411
|
end
|
412
412
|
end
|
413
|
-
# Returns each case as
|
413
|
+
# Returns each case as hash and index
|
414
414
|
def each_with_index
|
415
415
|
begin
|
416
416
|
@i=0
|
417
417
|
@cases.times{|i|
|
418
418
|
@i=i
|
419
419
|
row=case_as_hash(i)
|
420
|
-
yield i
|
420
|
+
yield row, i
|
421
421
|
}
|
422
422
|
@i=nil
|
423
423
|
rescue =>e
|
424
|
-
raise DatasetException.new(self,e)
|
424
|
+
raise DatasetException.new(self, e)
|
425
425
|
end
|
426
426
|
end
|
427
427
|
# Returns each case as an array, coding missing values as nils
|
@@ -473,26 +473,28 @@ module Statsample
|
|
473
473
|
def collect(type=:scale)
|
474
474
|
data=[]
|
475
475
|
each {|row|
|
476
|
-
data.push
|
476
|
+
data.push yield(row)
|
477
477
|
}
|
478
478
|
Statsample::Vector.new(data,type)
|
479
479
|
end
|
480
480
|
def collect_with_index(type=:scale)
|
481
481
|
data=[]
|
482
|
-
each_with_index {|i
|
482
|
+
each_with_index {|row, i|
|
483
483
|
data.push(yield(i,row))
|
484
484
|
}
|
485
485
|
Statsample::Vector.new(data,type)
|
486
486
|
end
|
487
487
|
# Recode a vector based on a block
|
488
488
|
def recode!(vector_name)
|
489
|
+
|
489
490
|
0.upto(@cases-1) {|i|
|
490
491
|
@vectors[vector_name].data[i]=yield case_as_hash(i)
|
491
492
|
}
|
492
493
|
@vectors[vector_name].set_valid_data
|
493
494
|
end
|
494
|
-
|
495
|
-
|
495
|
+
|
496
|
+
def crosstab(v1,v2,opts={})
|
497
|
+
Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
|
496
498
|
end
|
497
499
|
def[]=(i,v)
|
498
500
|
if v.instance_of? Statsample::Vector
|
@@ -508,6 +510,15 @@ module Statsample
|
|
508
510
|
rows.push(c)
|
509
511
|
}
|
510
512
|
Matrix.rows(rows)
|
513
|
+
end
|
514
|
+
if HAS_GSL
|
515
|
+
def to_matrix_gsl
|
516
|
+
rows=[]
|
517
|
+
self.each_array{|c|
|
518
|
+
rows.push(c)
|
519
|
+
}
|
520
|
+
GSL::Matrix.alloc(*rows)
|
521
|
+
end
|
511
522
|
end
|
512
523
|
def to_multiset_by_split(*fields)
|
513
524
|
require 'statsample/multiset'
|
@@ -640,6 +651,77 @@ module Statsample
|
|
640
651
|
def inspect
|
641
652
|
self.to_s
|
642
653
|
end
|
654
|
+
# Creates a new dataset for one to many relations
|
655
|
+
# on a dataset, based on pattern of field names.
|
656
|
+
# for example, you have a survey for number of children
|
657
|
+
# with this structure:
|
658
|
+
# id, name, child_name_1, child_age_1, child_name_2, child_age_2
|
659
|
+
# with
|
660
|
+
# ds.one_to_many(%w{id}, "child_%v_%n")
|
661
|
+
# the fields named in the first parameter will be copied verbatim
|
662
|
+
# to the new dataset, and the fields which match the second
|
663
|
+
# pattern will be added as one case for each different %n.
|
664
|
+
# For example
|
665
|
+
# cases=[
|
666
|
+
# ['1','george','red',10,'blue',20,nil,nil],
|
667
|
+
# ['2','fred','green',15,'orange',30,'white',20],
|
668
|
+
# ['3','alfred',nil,nil,nil,nil,nil,nil]
|
669
|
+
# ]
|
670
|
+
# ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
|
671
|
+
# cases.each {|c| ds.add_case_array c }
|
672
|
+
# ds.one_to_many(['id'],'car_%v%n').to_matrix
|
673
|
+
# => Matrix[
|
674
|
+
# ["red", "1", 10],
|
675
|
+
# ["blue", "1", 20],
|
676
|
+
# ["green", "2", 15],
|
677
|
+
# ["orange", "2", 30],
|
678
|
+
# ["white", "2", 20]
|
679
|
+
# ]
|
680
|
+
#
|
681
|
+
def one_to_many(parent_fields, pattern)
|
682
|
+
base_pattern=pattern.gsub(/%v|%n/,"")
|
683
|
+
re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
|
684
|
+
ds_vars=parent_fields
|
685
|
+
vars=[]
|
686
|
+
max_n=0
|
687
|
+
h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
|
688
|
+
# Adding _col_id
|
689
|
+
h['_col_id']=[].to_scale
|
690
|
+
ds_vars.push("_col_id")
|
691
|
+
@fields.each do |f|
|
692
|
+
if f=~re
|
693
|
+
if !vars.include? $1
|
694
|
+
vars.push($1)
|
695
|
+
h[$1]=Statsample::Vector.new([], @vectors[f].type)
|
696
|
+
end
|
697
|
+
max_n=$2.to_i if max_n < $2.to_i
|
698
|
+
end
|
699
|
+
end
|
700
|
+
ds=Dataset.new(h,ds_vars+vars)
|
701
|
+
each do |row|
|
702
|
+
row_out={}
|
703
|
+
parent_fields.each do |f|
|
704
|
+
row_out[f]=row[f]
|
705
|
+
end
|
706
|
+
max_n.times do |n1|
|
707
|
+
n=n1+1
|
708
|
+
any_data=false
|
709
|
+
vars.each do |v|
|
710
|
+
data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
|
711
|
+
row_out[v]=data
|
712
|
+
any_data=true if !data.nil?
|
713
|
+
end
|
714
|
+
if any_data
|
715
|
+
row_out["_col_id"]=n
|
716
|
+
ds.add_case(row_out,false)
|
717
|
+
end
|
718
|
+
|
719
|
+
end
|
720
|
+
end
|
721
|
+
ds.update_valid_data
|
722
|
+
ds
|
723
|
+
end
|
724
|
+
|
643
725
|
def summary
|
644
726
|
out=""
|
645
727
|
out << "Summary for dataset\n"
|
@@ -38,15 +38,16 @@ module Statsample
|
|
38
38
|
return 0.5 if dm==0.5
|
39
39
|
dominances=[dm]
|
40
40
|
@models_data.each{|k,m|
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
41
|
+
if !m.contributions[i].nil? and !m.contributions[j].nil?
|
42
|
+
if m.contributions[i]>m.contributions[j]
|
43
|
+
dominances.push(1)
|
44
|
+
elsif m.contributions[i]<m.contributions[j]
|
45
|
+
dominances.push(0)
|
46
|
+
else
|
47
|
+
return 0.5
|
48
|
+
#dominances.push(0.5)
|
49
49
|
end
|
50
|
+
end
|
50
51
|
}
|
51
52
|
final=dominances.uniq
|
52
53
|
final.size>1 ? 0.5 : final[0]
|
@@ -64,7 +65,8 @@ module Statsample
|
|
64
65
|
elsif a[i]<a[j]
|
65
66
|
dominances.push(0)
|
66
67
|
else
|
67
|
-
|
68
|
+
return 0.5
|
69
|
+
dominances.push(0.5)
|
68
70
|
end
|
69
71
|
end
|
70
72
|
final=dominances.uniq
|
@@ -72,34 +74,34 @@ module Statsample
|
|
72
74
|
end
|
73
75
|
# Returns 1 if i gD j, 0 if j gD i and 0.5 if undetermined
|
74
76
|
def general_dominance_pairwise(i,j)
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
77
|
+
ga=general_averages
|
78
|
+
if ga[i]>ga[j]
|
79
|
+
1
|
80
|
+
elsif ga[i]<ga[j]
|
81
|
+
0
|
82
|
+
else
|
83
|
+
0.5
|
84
|
+
end
|
83
85
|
end
|
84
86
|
def pairs
|
85
|
-
|
87
|
+
@models.find_all{|m| m.size==2}
|
86
88
|
end
|
87
89
|
def total_dominance
|
88
90
|
pairs.inject({}){|a,pair|
|
89
|
-
|
90
|
-
|
91
|
+
a[pair]=total_dominance_pairwise(pair[0], pair[1])
|
92
|
+
a
|
91
93
|
}
|
92
94
|
end
|
93
95
|
def conditional_dominance
|
94
96
|
pairs.inject({}){|a,pair|
|
95
|
-
|
96
|
-
|
97
|
+
a[pair]=conditional_dominance_pairwise(pair[0], pair[1])
|
98
|
+
a
|
97
99
|
}
|
98
100
|
end
|
99
101
|
def general_dominance
|
100
102
|
pairs.inject({}){|a,pair|
|
101
|
-
|
102
|
-
|
103
|
+
a[pair]=general_dominance_pairwise(pair[0], pair[1])
|
104
|
+
a
|
103
105
|
}
|
104
106
|
end
|
105
107
|
|
@@ -108,56 +110,61 @@ module Statsample
|
|
108
110
|
end
|
109
111
|
# Get all models of size k
|
110
112
|
def md_k(k)
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
}
|
115
|
-
out
|
113
|
+
out=[]
|
114
|
+
models=@models.each{|m| out.push(md(m)) if m.size==k }
|
115
|
+
out
|
116
116
|
end
|
117
|
+
|
118
|
+
# For a hash with arrays of numbers as values
|
119
|
+
# Returns a hash with same keys and
|
120
|
+
# value as the mean of values of original hash
|
121
|
+
|
117
122
|
def get_averages(averages)
|
118
123
|
out={}
|
119
124
|
averages.each{|key,val| out[key]=val.to_vector(:scale).mean }
|
120
125
|
out
|
121
126
|
end
|
127
|
+
# Hash with average for each k size
|
128
|
+
# model
|
122
129
|
def average_k(k)
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
130
|
+
return nil if k==@fields.size
|
131
|
+
models=md_k(k)
|
132
|
+
averages=@fields.inject({}) {|a,v| a[v]=[];a}
|
133
|
+
models.each do |m|
|
134
|
+
@fields.each do |f|
|
135
|
+
averages[f].push(m.contributions[f]) unless m.contributions[f].nil?
|
136
|
+
end
|
137
|
+
end
|
138
|
+
get_averages(averages)
|
132
139
|
end
|
133
140
|
def general_averages
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
end
|
142
|
-
@general_averages=get_averages(averages)
|
141
|
+
if @general_averages.nil?
|
142
|
+
averages=@fields.inject({}) {|a,v| a[v]=[md(v).r2];a}
|
143
|
+
for k in 1...@fields.size
|
144
|
+
ak=average_k(k)
|
145
|
+
@fields.each{|f|
|
146
|
+
averages[f].push(ak[f])
|
147
|
+
}
|
143
148
|
end
|
144
|
-
@general_averages
|
149
|
+
@general_averages=get_averages(averages)
|
150
|
+
end
|
151
|
+
@general_averages
|
145
152
|
end
|
146
153
|
def create_models
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
154
|
+
@models=[]
|
155
|
+
@models_data={}
|
156
|
+
for i in 1..@fields.size
|
157
|
+
c=Statsample::Combination.new(i,@fields.size)
|
158
|
+
c.each{|data|
|
159
|
+
convert=data.collect {|i1|
|
160
|
+
@fields[i1]
|
161
|
+
}
|
162
|
+
@models.push(convert)
|
163
|
+
ds_prev=@ds.dup(convert+[@y_var])
|
164
|
+
modeldata=ModelData.new(convert,ds_prev, @y_var, @fields, @r_class)
|
165
|
+
@models_data[convert.sort]=modeldata
|
166
|
+
}
|
167
|
+
end
|
161
168
|
end
|
162
169
|
def summary(report_type=ConsoleSummary)
|
163
170
|
out=""
|
@@ -26,25 +26,29 @@ class DominanceAnalysis
|
|
26
26
|
end
|
27
27
|
@da
|
28
28
|
end
|
29
|
+
# Creates re-samples from original dataset.
|
30
|
+
# * number_samples: Number of new samples to add
|
31
|
+
# * n: size of each new sample. If nil, equal to original sample size
|
32
|
+
# * report: if true, echo number of current resample and total
|
29
33
|
def bootstrap(number_samples,n=nil,report=false)
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
34
|
+
number_samples.times{ |t|
|
35
|
+
@n_samples+=1
|
36
|
+
puts _("Bootstrap %d of %d") % [t+1, number_samples] if report
|
37
|
+
ds_boot=@ds.bootstrap(n)
|
38
|
+
da_1=DominanceAnalysis.new(ds_boot,@y_var,@lr_class)
|
39
|
+
da_1.total_dominance.each{|k,v|
|
40
|
+
@samples_td[k].push(v)
|
41
|
+
}
|
42
|
+
da_1.conditional_dominance.each{|k,v|
|
43
|
+
@samples_cd[k].push(v)
|
44
|
+
}
|
45
|
+
da_1.general_dominance.each{|k,v|
|
46
|
+
@samples_gd[k].push(v)
|
47
|
+
}
|
48
|
+
da_1.general_averages.each{|k,v|
|
49
|
+
@samples_ga[k].push(v)
|
50
|
+
}
|
51
|
+
}
|
48
52
|
end
|
49
53
|
def create_samples_pairs
|
50
54
|
@samples_td={}
|
@@ -106,7 +110,7 @@ class DominanceAnalysis
|
|
106
110
|
table.header=[_("var"),_("mean"),_("se"),_("p.5"),_("p.95")]
|
107
111
|
@fields.each{|f|
|
108
112
|
v=@samples_ga[f].to_vector(:scale)
|
109
|
-
row=[@ds.
|
113
|
+
row=[@ds.label(f), sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
|
110
114
|
table.add_row(row)
|
111
115
|
|
112
116
|
}
|
@@ -115,10 +119,10 @@ class DominanceAnalysis
|
|
115
119
|
end
|
116
120
|
def summary_pairs(pair,std,ttd)
|
117
121
|
freqs=std.proportions
|
118
|
-
[0,0.5,1].each{|n|
|
122
|
+
[0, 0.5, 1].each{|n|
|
119
123
|
freqs[n]=0 if freqs[n].nil?
|
120
124
|
}
|
121
|
-
name=@ds.
|
125
|
+
name=@ds.label(pair[0])+" - "+@ds.label(pair[1])
|
122
126
|
[name,f(ttd,1),f(std.mean,4),f(std.sd),f(freqs[1]), f(freqs[0]), f(freqs[0.5]), f(freqs[ttd])]
|
123
127
|
end
|
124
128
|
def f(v,n=3)
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'statsample/factor/pca'
|
2
|
+
require 'statsample/factor/principalaxis'
|
3
|
+
require 'statsample/factor/rotation'
|
4
|
+
|
5
|
+
module Statsample
|
6
|
+
# = Factor Analysis toolbox
|
7
|
+
# * Classes for Extraction of factors:
|
8
|
+
# * Statsample::Factor::PCA
|
9
|
+
# * Statsample::Factor::PrincipalAxis
|
10
|
+
# * Classes for Rotation of factors:
|
11
|
+
# * Statsample::Factor::Varimax
|
12
|
+
# * Statsample::Factor::Equimax
|
13
|
+
# * Statsample::Factor::Quartimax
|
14
|
+
#
|
15
|
+
# See documentation of each class to use it
|
16
|
+
module Factor
|
17
|
+
end
|
18
|
+
end
|