statsample 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +79 -0
- data/Manifest.txt +56 -0
- data/README.txt +77 -0
- data/Rakefile +22 -0
- data/bin/statsample +2 -0
- data/demo/benchmark.rb +52 -0
- data/demo/chi-square.rb +44 -0
- data/demo/dice.rb +13 -0
- data/demo/distribution_t.rb +95 -0
- data/demo/graph.rb +9 -0
- data/demo/item_analysis.rb +30 -0
- data/demo/mean.rb +81 -0
- data/demo/proportion.rb +57 -0
- data/demo/sample_test.csv +113 -0
- data/demo/strata_proportion.rb +152 -0
- data/demo/stratum.rb +141 -0
- data/lib/spss.rb +131 -0
- data/lib/statsample.rb +216 -0
- data/lib/statsample/anova.rb +74 -0
- data/lib/statsample/bivariate.rb +255 -0
- data/lib/statsample/chidistribution.rb +39 -0
- data/lib/statsample/codification.rb +120 -0
- data/lib/statsample/converters.rb +338 -0
- data/lib/statsample/crosstab.rb +122 -0
- data/lib/statsample/dataset.rb +526 -0
- data/lib/statsample/dominanceanalysis.rb +259 -0
- data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
- data/lib/statsample/graph/gdchart.rb +45 -0
- data/lib/statsample/graph/svgboxplot.rb +108 -0
- data/lib/statsample/graph/svggraph.rb +181 -0
- data/lib/statsample/graph/svghistogram.rb +208 -0
- data/lib/statsample/graph/svgscatterplot.rb +111 -0
- data/lib/statsample/htmlreport.rb +232 -0
- data/lib/statsample/multiset.rb +281 -0
- data/lib/statsample/regression.rb +522 -0
- data/lib/statsample/reliability.rb +235 -0
- data/lib/statsample/resample.rb +20 -0
- data/lib/statsample/srs.rb +159 -0
- data/lib/statsample/test.rb +25 -0
- data/lib/statsample/vector.rb +759 -0
- data/test/_test_chart.rb +58 -0
- data/test/test_anova.rb +31 -0
- data/test/test_codification.rb +59 -0
- data/test/test_crosstab.rb +55 -0
- data/test/test_csv.csv +7 -0
- data/test/test_csv.rb +27 -0
- data/test/test_dataset.rb +293 -0
- data/test/test_ggobi.rb +42 -0
- data/test/test_multiset.rb +98 -0
- data/test/test_regression.rb +108 -0
- data/test/test_reliability.rb +32 -0
- data/test/test_resample.rb +23 -0
- data/test/test_srs.rb +14 -0
- data/test/test_statistics.rb +152 -0
- data/test/test_stratified.rb +19 -0
- data/test/test_svg_graph.rb +63 -0
- data/test/test_vector.rb +265 -0
- data/test/test_xls.rb +32 -0
- metadata +158 -0
module Statsample
  module Anova
    # One-way analysis of variance (ANOVA).
    #
    # Example:
    #   v1=[2,3,4,5,6].to_vector(:scale)
    #   v2=[3,3,4,5,6].to_vector(:scale)
    #   v3=[5,3,1,5,6].to_vector(:scale)
    #   anova=Statsample::Anova::OneWay.new([v1,v2,v3])
    #   puts anova.f
    #   puts anova.significance
    class OneWay
      # +vectors+: array of group vectors; each element must respond to
      # +sum+, +mean+, +size+ and +sum_of_squares+.
      def initialize(vectors)
        @vectors = vectors
      end

      # Grand total: sum of every case across all groups.
      def sum
        @vectors.inject(0) { |acc, vec| acc + vec.sum }
      end

      # Grand mean over all cases (exact rational where possible, via quo).
      def mean
        sum.quo(n)
      end

      # Total sum of squares around the grand mean.
      def sst
        grand_mean = mean.to_f
        @vectors.inject(0) do |acc, vec|
          acc + vec.sum_of_squares(grand_mean)
        end
      end

      # Sum of squares within groups (each group around its own mean).
      def sswg
        @vectors.inject(0) do |acc, vec|
          acc + vec.sum_of_squares
        end
      end

      # Sum of squares between groups (group means around the grand mean,
      # weighted by group size).
      def ssbg
        grand_mean = mean
        @vectors.inject(0) do |acc, vec|
          acc + (vec.mean - grand_mean).square * vec.size
        end
      end

      # Degrees of freedom within groups: sum of (group size - 1).
      def df_wg
        @vectors.inject(0) { |acc, vec| acc + (vec.size - 1) }
      end

      # Degrees of freedom between groups: number of groups - 1.
      def df_bg
        @vectors.size - 1
      end

      # Total degrees of freedom: total cases - 1.
      def df_total
        n - 1
      end

      # Total number of cases across all groups.
      def n
        @vectors.inject(0) { |acc, vec| acc + vec.size }
      end

      # Fisher's F statistic: (ssbg/df_bg) / (sswg/df_wg),
      # written as ssbg*(n-k) / (sswg*(k-1)).
      def f
        groups = @vectors.size
        (ssbg * (n - groups)) / (sswg * (groups - 1))
      end

      # Right-tail probability of F under the null hypothesis.
      # Requires Ruby/GSL.
      def significance
        raise "Need Ruby/GSL" unless HAS_GSL
        GSL::Cdf.fdist_Q(f, df_bg, df_wg)
      end
    end
  end
end
module Statsample
  # Diverse bivariate correlation and association methods.
  module Bivariate
    class << self
      # Covariance between two vectors, using only pairwise-valid cases.
      # Returns nil if no valid pairs remain. Delegates to GSL when available.
      def covariance(v1, v2)
        v1a, v2a = Statsample.only_valid(v1, v2)
        return nil if v1a.size == 0
        if HAS_GSL
          GSL::Stats::covariance(v1a.gsl, v2a.gsl)
        else
          covariance_slow(v1a, v2a)
        end
      end

      # Pure-Ruby sample covariance (denominator is n-1).
      # Expects already-cleaned vectors (no missing data).
      def covariance_slow(v1a, v2a)
        t = 0
        m1 = v1a.mean
        # BUG FIX: was `m2 = v1a.mean` (copy-paste error), which used the
        # wrong vector's mean and biased the result whenever means differ.
        m2 = v2a.mean
        (0...v1a.size).each do |i|
          t += (v1a[i] - m1) * (v2a[i] - m2)
        end
        t.to_f / (v1a.size - 1)
      end

      # Pearson product-moment correlation between two vectors,
      # using only pairwise-valid cases. Returns nil on empty input.
      def pearson(v1, v2)
        v1a, v2a = Statsample.only_valid(v1, v2)
        return nil if v1a.size == 0
        if HAS_GSL
          GSL::Stats::correlation(v1a.gsl, v2a.gsl)
        else
          pearson_slow(v1a, v2a)
        end
      end

      # Pure-Ruby Pearson r: mean of products of population z-scores.
      #:nodoc:
      def pearson_slow(v1a, v2a)
        v1s, v2s = v1a.vector_standarized_pop, v2a.vector_standarized_pop
        t = 0
        (0...v1s.size).each { |i| t += (v1s[i] * v2s[i]) }
        t.to_f / v2s.size
      end

      # t statistic to test the null hypothesis r == 0 for the Pearson
      # correlation of two vectors. Returns 0 for a perfect correlation,
      # where the usual formula would divide by zero.
      def t_pearson(v1, v2)
        v1a, v2a = Statsample.only_valid(v1, v2)
        r = pearson(v1a, v2a)
        if r == 1.0
          0
        else
          t_r(r, v1a.size)
        end
      end

      # t statistic for a given correlation +r+ and sample +size+:
      # t = r * sqrt((n - 2) / (1 - r^2)).
      def t_r(r, size)
        r * Math::sqrt((size - 2).to_f / (1 - r**2))
      end

      # Probability value (a la SPSS) for a given t, sample size and
      # number of tails. Requires Ruby/GSL.
      def prop_pearson(t, size, tails = 2)
        raise "Needs ruby-gsl" unless HAS_GSL
        t = -t if t > 0
        cdf = GSL::Cdf::tdist_P(t, size - 2)
        cdf * tails
      end

      # Residual scores of +from+ after removing the variance shared
      # with +del+ (both standardized). Keeps nil where either is nil.
      def residuals(from, del)
        r = Statsample::Bivariate.pearson(from, del)
        froms, dels = from.vector_standarized, del.vector_standarized
        nv = []
        froms.data_with_nils.each_index do |i|
          if froms[i].nil? || dels[i].nil?
            nv.push(nil)
          else
            nv.push(froms[i] - r * dels[i])
          end
        end
        nv.to_vector(:scale)
      end

      # First-order partial correlation of v1 and v2,
      # controlling for +control+.
      def partial_correlation(v1, v2, control)
        v1a, v2a, cona = Statsample.only_valid(v1, v2, control)
        rv1v2 = pearson(v1a, v2a)
        rv1con = pearson(v1a, cona)
        rv2con = pearson(v2a, cona)
        (rv1v2 - (rv1con * rv2con)).quo(Math::sqrt(1 - rv1con**2) * Math::sqrt(1 - rv2con**2))
      end

      # Covariance matrix for all fields of a dataset; nil for any pair
      # involving a non-scale vector.
      def covariance_matrix(ds)
        ds.collect_matrix do |row, col|
          if ds[row].type != :scale || ds[col].type != :scale
            nil
          else
            covariance(ds[row], ds[col])
          end
        end
      end

      # Classic correlation matrix for all fields of a dataset:
      # 1.0 on the diagonal, Pearson r elsewhere, nil for non-scale pairs.
      def correlation_matrix(ds)
        ds.collect_matrix do |row, col|
          if row == col
            1.0
          elsif ds[row].type != :scale || ds[col].type != :scale
            nil
          else
            pearson(ds[row], ds[col])
          end
        end
      end

      # Matrix with the number of valid (pairwise-complete) cases
      # for every pair of fields.
      def n_valid_matrix(ds)
        ds.collect_matrix do |row, col|
          if row == col
            ds[row].valid_data.size
          else
            rowa, rowb = Statsample.only_valid(ds[row], ds[col])
            rowa.size
          end
        end
      end

      # Matrix of two-tailed probabilities for each Pearson correlation;
      # nil on the diagonal and for non-scale pairs.
      def correlation_probability_matrix(ds)
        rows = ds.fields.collect do |row|
          ds.fields.collect do |col|
            v1a, v2a = Statsample.only_valid(ds[row], ds[col])
            (row == col || ds[row].type != :scale || ds[col].type != :scale) ? nil : prop_pearson(t_pearson(ds[row], ds[col]), v1a.size)
          end
        end
        Matrix.rows(rows)
      end

      # Spearman rank correlation coefficient: Pearson r of the
      # ranked vectors.
      def spearman(v1, v2)
        v1a, v2a = Statsample.only_valid(v1, v2)
        v1r, v2r = v1a.ranked(:scale), v2a.ranked(:scale)
        pearson(v1r, v2r)
      end

      # Point-biserial correlation: equal to the Pearson correlation with
      # one dichotomous value replaced by "0" and the other by "1".
      def point_biserial(dichotomous, continous)
        ds = { 'd' => dichotomous, 'c' => continous }.to_dataset.dup_only_valid
        raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size != 2
        raise(TypeError, "Second vector should be continous") if ds['c'].type != :scale
        f0 = ds['d'].factors.sort[0]
        m0 = ds.filter_field('c') { |c| c['d'] == f0 }
        m1 = ds.filter_field('c') { |c| c['d'] != f0 }
        ((m1.mean - m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size * m1.size.to_f / ds.cases**2)
      end

      # Kendall's tau-a rank correlation coefficient.
      # Based on Herve Abdi's article.
      def tau_a(v1, v2)
        v1a, v2a = Statsample.only_valid(v1, v2)
        # BUG FIX: n must count only pairwise-valid cases; the raw v1.size
        # inflated the denominator when missing data was present.
        n = v1a.size
        v1r, v2r = v1a.ranked(:scale), v2a.ranked(:scale)
        o1 = ordered_pairs(v1r)
        o2 = ordered_pairs(v2r)
        delta = o1.size * 2 - (o2 & o1).size * 2
        1 - (delta * 2 / (n * (n - 1)).to_f)
      end

      # Kendall's tau-b for an ordered contingency matrix.
      #
      # Tau-b defines perfect association as strict monotonicity.
      # Although it requires strict monotonicity to reach 1.0,
      # it does not penalize ties as much as some other measures.
      #
      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
      def tau_b(matrix)
        v = pairs(matrix)
        (v['P'] - v['Q']).to_f / Math::sqrt((v['P'] + v['Q'] + v['Y']) * (v['P'] + v['Q'] + v['X'])).to_f
      end

      # Goodman and Kruskal's gamma: surplus of concordant over discordant
      # pairs, as a percentage of all pairs ignoring ties.
      #
      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
      def gamma(matrix)
        v = pairs(matrix)
        (v['P'] - v['Q']).to_f / (v['P'] + v['Q']).to_f
      end

      # Concordant ('P'), discordant ('Q') and tied ('X' on rows, 'Y' on
      # columns) pair counts for an ordered contingency matrix.
      # Rows and columns must already be ordered.
      def pairs(matrix)
        rs = matrix.row_size
        cs = matrix.column_size
        conc = disc = ties_x = ties_y = 0
        # concordant pairs: both row and column indexes increase
        (0...(rs - 1)).each do |x|
          (0...(cs - 1)).each do |y|
            ((x + 1)...rs).each do |x2|
              ((y + 1)...cs).each do |y2|
                conc += matrix[x, y] * matrix[x2, y2]
              end
            end
          end
        end
        # discordant pairs: row index increases while column decreases
        (0...(rs - 1)).each do |x|
          (1...cs).each do |y|
            ((x + 1)...rs).each do |x2|
              (0...y).each do |y2|
                disc += matrix[x, y] * matrix[x2, y2]
              end
            end
          end
        end
        # ties on the row variable (same column, different rows)
        (0...(rs - 1)).each do |x|
          (0...cs).each do |y|
            ((x + 1)...rs).each do |x2|
              ties_x += matrix[x, y] * matrix[x2, y]
            end
          end
        end
        # ties on the column variable (same row, different columns)
        (0...rs).each do |x|
          (0...(cs - 1)).each do |y|
            ((y + 1)...cs).each do |y2|
              ties_y += matrix[x, y] * matrix[x, y2]
            end
          end
        end
        { 'P' => conc, 'Q' => disc, 'Y' => ties_y, 'X' => ties_x }
      end

      # All ordered pairs [d[i], d[j]] with i < j from a vector's data.
      def ordered_pairs(vector)
        d = vector.data
        a = []
        (0...(d.size - 1)).each do |i|
          ((i + 1)...d.size).each do |j|
            a.push([d[i], d[j]])
          end
        end
        a
      end

      # Sum of co-deviations: sum(x*y) - sum(x)*sum(y)/n over
      # pairwise-valid cases.
      def sum_of_codeviated(v1, v2)
        v1a, v2a = Statsample.only_valid(v1, v2)
        sum = 0
        (0...v1a.size).each { |i| sum += v1a[i] * v2a[i] }
        sum - ((v1a.sum * v2a.sum) / v1a.size.to_f)
      end
    end
  end
end
module Statsample
  # Chi-square distribution helpers.
  #
  # Based on Babatunde, Iyiola & Eni:
  # "A Numerical Procedure for Computing Chi-Square Percentage Points"
  module ChiDistribution
    class << self
      # Number of whole steps of width +itv+ between +av+ and +bv+.
      def steps(av, bv, itv)
        ((bv.to_f - av.to_f) / itv.to_f).to_i
      end

      # Log-gamma approximation via a Lanczos-style series.
      # NOTE: the series is summed in a fixed order; keep it that way
      # so results stay numerically reproducible.
      def loggamma(k)
        c1 = 76.18009173
        c2 = -86.50532033
        c3 = 24.01409822
        c4 = -1.231739516
        c5 = 0.00120858
        c6 = -0.000005364
        c7 = 2.506628275
        x = k - 1
        acc = x + 5.5
        acc = (x + 0.5) * Math.log(acc) - acc
        series = 1 + c1 / (x + 1) + c2 / (x + 2) + c3 / (x + 3) + c4 / (x + 4) + c5 / (x + 5) + c6 / (x + 6)
        acc + Math.log(c7 * series)
      end

      # Chi-square density (computed in log space), used as the
      # integrand by the quadrature in #cdf.
      def f(x, k)
        Math.exp(0.5 * k * Math.log(0.5 * x) - Math.log(x) - loggamma(0.5 * k) - 0.5 * x)
      end

      # CDF of the chi-square distribution with +k+ degrees of freedom
      # evaluated at +b+. Uses the closed form for k == 2; otherwise a
      # Boole-type composite quadrature over [0.001, b] with 28 panels.
      def cdf(b, k)
        a = 0.001
        b = b.to_f
        return 1 - Math.exp(-b.to_f / 2) if k == 2
        w = (b - a) / 28.to_f
        # Composite quadrature; term order preserved from the reference
        # implementation for bit-for-bit reproducibility.
        2 * w / 45 * (
          7 * (f(a, k) + f(a + 28 * w, k)) +
          12 * (f(a + 2 * w, k) + f(a + 6 * w, k) + f(a + 10 * w, k) + f(a + 14 * w, k) + f(a + 18 * w, k) + f(a + 22 * w, k) + f(a + 26 * w, k)) +
          14 * (f(a + 4 * w, k) + f(a + 8 * w, k) + f(a + 12 * w, k) + f(a + 16 * w, k) + f(a + 20 * w, k) + f(a + 24 * w, k)) +
          32 * (f(a + w, k) + f(a + 3 * w, k) + f(a + 5 * w, k) + f(a + 7 * w, k) + f(a + 9 * w, k) + f(a + 11 * w, k) + f(a + 13 * w, k) + f(a + 15 * w, k) + f(a + 17 * w, k) + f(a + 19 * w, k) + f(a + 21 * w, k) + f(a + 23 * w, k) + f(a + 25 * w, k) + f(a + 27 * w, k))
        )
      end
    end
  end
end
require 'yaml'

module Statsample
  # Codification
  #
  # A tool to help code open-ended questions:
  # * Dump one or more vectors to a YAML file of values. Values containing
  #   Statsample::SPLIT_TOKEN are split into two or more values.
  # * Edit the YAML, replacing each value with your code(s). Use the
  #   separator (default Statsample::SPLIT_TOKEN) for multiple codes.
  # * Recode the vectors by loading the YAML file back:
  #   * New vectors keep the original name plus "_recoded", or
  #   * With the split variant, one new vector is created per code.
  #
  # Usage:
  #   recode_file="recodification.yaml"
  #   phase=:first # flag
  #   if phase==:first
  #     File.open(recode_file,"w") {|fp|
  #       Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
  #     } # Edit the file recodification.yaml
  #   elsif phase==:second
  #     File.open(recode_file,"r") {|fp|
  #       Statsample::Codification.verify(fp,['vector1'])
  #     }
  #   elsif phase==:third
  #     File.open(recode_file,"r") {|fp|
  #       Statsample::Codification.recode_dataset_split!(ds,fp,"*")
  #     }
  #   end
  #
  module Codification
    class << self
      # Dump a YAML recodification template for the given +vectors+ of
      # +dataset+: keys are vector names, values are identity hashes
      # (value => value) ready for manual recoding.
      #
      #   v1=%w{a,b b,c d}.to_vector
      #   ds={"v1"=>v1}.to_dataset
      #   Statsample::Codification.create_yaml(ds,['v1'])
      #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
      def create_yaml(dataset, vectors, sep = Statsample::SPLIT_TOKEN, io = nil)
        raise ArgumentError, "Array should't be empty" if vectors.size == 0
        pro_hash = vectors.each_with_object({}) do |v_name, acc|
          unless dataset.fields.include? v_name
            raise Exception, "Vector #{v_name} doesn't exists on Dataset"
          end
          tokens = dataset[v_name].splitted(sep)
          identity = tokens.flatten.uniq.compact.sort.each_with_object({}) do |val, map|
            map[val] = val
          end
          acc[v_name] = identity
        end
        YAML.dump(pro_hash, io)
      end

      # Invert a value=>codes hash: returns code => [values...],
      # splitting each codes string on +sep+.
      def inverse_hash(h, sep = Statsample::SPLIT_TOKEN)
        h.each_with_object({}) do |(value, codes), acc|
          codes.split(sep).each do |code|
            (acc[code] ||= []).push(value)
          end
        end
      end

      # Build a value => [codes...] dictionary by splitting each codes
      # string on +sep+.
      def dictionary(h, sep = Statsample::SPLIT_TOKEN)
        h.each_with_object({}) do |(value, codes), acc|
          acc[value] = codes.split(sep)
        end
      end

      # Recode one vector using hash +h+: each split value is mapped
      # through the dictionary; nil entries stay nil.
      def recode_vector(v, h, sep = Statsample::SPLIT_TOKEN)
        dict = dictionary(h, sep)
        v.splitted(sep).collect do |values|
          values.collect { |value| dict[value] }.flatten.uniq unless values.nil?
        end
      end

      # Recode vectors in place, adding one "<name>_recoded" vector each.
      def recode_dataset_simple!(dataset, yaml, sep = Statsample::SPLIT_TOKEN)
        _recode_dataset(dataset, yaml, sep, false)
      end

      # Recode vectors in place, adding one vector per resulting code.
      def recode_dataset_split!(dataset, yaml, sep = Statsample::SPLIT_TOKEN)
        _recode_dataset(dataset, yaml, sep, true)
      end

      # Shared implementation for the two recode_dataset_* entry points.
      def _recode_dataset(dataset, yaml, sep = Statsample::SPLIT_TOKEN, split = false)
        h = YAML::load(yaml)
        h.keys.each do |v_name|
          unless dataset.fields.include? v_name
            raise Exception, "Vector #{v_name} doesn't exists on Dataset"
          end
          recoded = recode_vector(dataset[v_name], h[v_name], sep).collect do |codes|
            codes.nil? ? nil : codes.join(sep)
          end.to_vector
          if split
            recoded.split_by_separator(sep).each do |suffix, vec|
              dataset[v_name + "_" + suffix] = vec
            end
          else
            dataset[v_name + "_recoded"] = recoded
          end
        end
      end

      # Print each vector's inverse codification (code => values) to +io+
      # so the recodification can be reviewed.
      def verify(yaml, v_names = nil, sep = Statsample::SPLIT_TOKEN, io = $>)
        require 'pp'
        h = YAML::load(yaml)
        (v_names || h.keys).each do |v_name|
          inverse = inverse_hash(h[v_name], sep)
          io.puts "Vector: #{v_name}"
          YAML.dump(inverse.sort, io)
        end
      end
    end
  end
end