statsample 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +79 -0
- data/Manifest.txt +56 -0
- data/README.txt +77 -0
- data/Rakefile +22 -0
- data/bin/statsample +2 -0
- data/demo/benchmark.rb +52 -0
- data/demo/chi-square.rb +44 -0
- data/demo/dice.rb +13 -0
- data/demo/distribution_t.rb +95 -0
- data/demo/graph.rb +9 -0
- data/demo/item_analysis.rb +30 -0
- data/demo/mean.rb +81 -0
- data/demo/proportion.rb +57 -0
- data/demo/sample_test.csv +113 -0
- data/demo/strata_proportion.rb +152 -0
- data/demo/stratum.rb +141 -0
- data/lib/spss.rb +131 -0
- data/lib/statsample.rb +216 -0
- data/lib/statsample/anova.rb +74 -0
- data/lib/statsample/bivariate.rb +255 -0
- data/lib/statsample/chidistribution.rb +39 -0
- data/lib/statsample/codification.rb +120 -0
- data/lib/statsample/converters.rb +338 -0
- data/lib/statsample/crosstab.rb +122 -0
- data/lib/statsample/dataset.rb +526 -0
- data/lib/statsample/dominanceanalysis.rb +259 -0
- data/lib/statsample/dominanceanalysis/bootstrap.rb +126 -0
- data/lib/statsample/graph/gdchart.rb +45 -0
- data/lib/statsample/graph/svgboxplot.rb +108 -0
- data/lib/statsample/graph/svggraph.rb +181 -0
- data/lib/statsample/graph/svghistogram.rb +208 -0
- data/lib/statsample/graph/svgscatterplot.rb +111 -0
- data/lib/statsample/htmlreport.rb +232 -0
- data/lib/statsample/multiset.rb +281 -0
- data/lib/statsample/regression.rb +522 -0
- data/lib/statsample/reliability.rb +235 -0
- data/lib/statsample/resample.rb +20 -0
- data/lib/statsample/srs.rb +159 -0
- data/lib/statsample/test.rb +25 -0
- data/lib/statsample/vector.rb +759 -0
- data/test/_test_chart.rb +58 -0
- data/test/test_anova.rb +31 -0
- data/test/test_codification.rb +59 -0
- data/test/test_crosstab.rb +55 -0
- data/test/test_csv.csv +7 -0
- data/test/test_csv.rb +27 -0
- data/test/test_dataset.rb +293 -0
- data/test/test_ggobi.rb +42 -0
- data/test/test_multiset.rb +98 -0
- data/test/test_regression.rb +108 -0
- data/test/test_reliability.rb +32 -0
- data/test/test_resample.rb +23 -0
- data/test/test_srs.rb +14 -0
- data/test/test_statistics.rb +152 -0
- data/test/test_stratified.rb +19 -0
- data/test/test_svg_graph.rb +63 -0
- data/test/test_vector.rb +265 -0
- data/test/test_xls.rb +32 -0
- metadata +158 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
module Statsample
  module Anova
    # One-way analysis of variance (ANOVA).
    #
    # Example:
    #   v1=[2,3,4,5,6].to_vector(:scale)
    #   v2=[3,3,4,5,6].to_vector(:scale)
    #   v3=[5,3,1,5,6].to_vector(:scale)
    #   anova=Statsample::Anova::OneWay.new([v1,v2,v3])
    #   puts anova.f
    #   puts anova.significance
    class OneWay
      # +vectors+: array of group vectors. Each group must respond to
      # +sum+, +mean+, +size+ and +sum_of_squares+.
      def initialize(vectors)
        @vectors = vectors
      end

      # Grand sum of every case across all groups.
      def sum
        @vectors.map(&:sum).reduce(0, :+)
      end

      # Grand mean: total sum divided by total number of cases.
      def mean
        sum.quo(n)
      end

      # Total sum of squares: deviation of every case from the grand mean.
      def sst
        grand_mean = mean.to_f
        @vectors.reduce(0) do |acc, group|
          acc + group.sum_of_squares(grand_mean)
        end
      end

      # Sum of squares within groups (each case against its own group mean).
      def sswg
        @vectors.reduce(0) { |acc, group| acc + group.sum_of_squares }
      end

      # Sum of squares between groups (each group mean against the grand mean,
      # weighted by group size).
      def ssbg
        grand_mean = mean
        @vectors.reduce(0) do |acc, group|
          acc + (group.mean - grand_mean).square * group.size
        end
      end

      # Degrees of freedom within groups: sum of (group size - 1).
      def df_wg
        @vectors.reduce(0) { |acc, group| acc + (group.size - 1) }
      end

      # Degrees of freedom between groups: number of groups - 1.
      def df_bg
        @vectors.size - 1
      end

      # Total degrees of freedom: total cases - 1.
      def df_total
        n - 1
      end

      # Total number of cases across all groups.
      def n
        @vectors.reduce(0) { |acc, group| acc + group.size }
      end

      # Fisher's F statistic, written as the algebraically equivalent
      # one-expression form of (ssbg/df_bg) / (sswg/df_wg).
      def f
        groups = @vectors.size
        (ssbg * (n - groups)) / (sswg * (groups - 1))
      end

      # Right-tail probability of F under the null hypothesis.
      # Requires Ruby/GSL.
      def significance
        raise "Need Ruby/GSL" unless HAS_GSL
        GSL::Cdf.fdist_Q(f, df_bg, df_wg)
      end
    end
  end
end
|
@@ -0,0 +1,255 @@
|
|
1
|
+
module Statsample
  # Diverse correlation methods
  module Bivariate
    class << self
      # Covariance between two vectors.
      # nil-paired cases are removed first; returns nil when no valid
      # pairs remain. Delegates to GSL when available.
      def covariance(v1,v2)
        v1a,v2a=Statsample.only_valid(v1,v2)
        return nil if v1a.size==0
        if HAS_GSL
          GSL::Stats::covariance(v1a.gsl, v2a.gsl)
        else
          covariance_slow(v1a,v2a)
        end
      end
      # Pure-ruby covariance. The denominator is n-1 (sample covariance).
      # Expects already-cleaned vectors (no nils).
      def covariance_slow(v1a,v2a)
        t=0
        m1=v1a.mean
        # FIX: was v1a.mean — each vector's deviations must use its OWN mean.
        # (The old code gave the same numeric result only because the
        # deviations of v1a sum to zero, but it was wrong in intent.)
        m2=v2a.mean
        (0...v1a.size).each {|i|
          t+=((v1a[i]-m1)*(v2a[i]-m2))
        }
        t.to_f / (v1a.size-1)
      end
      # Calculate Pearson correlation coefficient between 2 vectors.
      # Returns nil when no valid pairs remain.
      def pearson(v1,v2)
        v1a,v2a=Statsample.only_valid(v1,v2)
        return nil if v1a.size ==0
        if HAS_GSL
          GSL::Stats::correlation(v1a.gsl, v2a.gsl)
        else
          pearson_slow(v1a,v2a)
        end
      end
      # Pure-ruby Pearson r: mean product of the population-standardized
      # scores. Expects already-cleaned vectors. #:nodoc:
      def pearson_slow(v1a,v2a)
        v1s,v2s=v1a.vector_standarized_pop,v2a.vector_standarized_pop
        # (removed unused local `siz`)
        t=0
        (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
        t.to_f/v2s.size
      end
      # Retrieves the value for t test for a pearson correlation
      # between two vectors to test the null hipothesis of r=0
      def t_pearson(v1,v2)
        v1a,v2a=Statsample.only_valid(v1,v2)
        r=pearson(v1a,v2a)
        if(r==1.0)
          0
        else
          t_r(r,v1a.size)
        end
      end
      # Retrieves the value for t test for a pearson correlation
      # giving r and vector size: t = r * sqrt((n-2)/(1-r^2))
      def t_r(r,size)
        r*Math::sqrt(((size)-2).to_f / (1 - r**2))
      end
      # Retrieves the probability value (a la SPSS)
      # for a given t, size and number of tails.
      # Requires Ruby/GSL.
      def prop_pearson(t,size, tails=2)
        if HAS_GSL
          t=-t if t>0 # use the lower tail, then scale by number of tails
          cdf=GSL::Cdf::tdist_P(t,(size)-2)
          cdf*tails
        else
          raise "Needs ruby-gsl"
        end
      end
      # Returns residual score after delete variance
      # from another variable.
      # Both vectors are standardized; nil on either side yields nil.
      def residuals(from,del)
        r=Statsample::Bivariate.pearson(from,del)
        froms, dels = from.vector_standarized, del.vector_standarized
        nv=[]
        froms.data_with_nils.each_index{|i|
          if froms[i].nil? or dels[i].nil?
            nv.push(nil)
          else
            nv.push(froms[i]-r*dels[i])
          end
        }
        nv.to_vector(:scale)
      end
      # First-order partial correlation of v1 and v2, controlling
      # for +control+.
      def partial_correlation(v1,v2,control)
        v1a,v2a,cona=Statsample.only_valid(v1,v2,control)
        rv1v2=pearson(v1a,v2a)
        rv1con=pearson(v1a,cona)
        rv2con=pearson(v2a,cona)
        (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
      end
      # Covariance matrix for every pair of fields of a dataset.
      # Non-scale fields produce nil cells.
      def covariance_matrix(ds)
        ds.collect_matrix do |row,col|
          if (ds[row].type!=:scale or ds[col].type!=:scale)
            nil
          else
            covariance(ds[row],ds[col])
          end
        end
      end

      # The classic correlation matrix for all fields of a dataset.
      # Diagonal is 1.0; non-scale fields produce nil cells.
      def correlation_matrix(ds)
        ds.collect_matrix {|row,col|
          if row==col
            1.0
          elsif (ds[row].type!=:scale or ds[col].type!=:scale)
            nil
          else
            pearson(ds[row],ds[col])
          end
        }
      end
      # Retrieves the n valid pairwise for every pair of fields.
      def n_valid_matrix(ds)
        ds.collect_matrix {|row,col|
          if row==col
            ds[row].valid_data.size
          else
            rowa,rowb=Statsample.only_valid(ds[row],ds[col])
            rowa.size
          end
        }
      end
      # Matrix of two-tailed probabilities for each pairwise Pearson
      # correlation of the dataset's scale fields.
      def correlation_probability_matrix(ds)
        rows=ds.fields.collect{|row|
          ds.fields.collect{|col|
            v1a,v2a=Statsample.only_valid(ds[row],ds[col])
            (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size)
          }
        }
        Matrix.rows(rows)
      end
      # Calculate Spearman correlation coefficient between 2 vectors:
      # Pearson correlation of the rank-transformed data.
      def spearman(v1,v2)
        v1a,v2a=Statsample.only_valid(v1,v2)
        v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
        pearson(v1r,v2r)
      end
      # Calculate Point biserial correlation.
      # Equal to Pearson correlation, with one dichotomous value replaced
      # by "0" and the other by "1"
      def point_biserial(dichotomous,continous)
        ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
        raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
        raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
        f0=ds['d'].factors.sort[0]
        m0=ds.filter_field('c') {|c| c['d']==f0}
        m1=ds.filter_field('c') {|c| c['d']!=f0}
        ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
      end
      # Kendall Rank Correlation Coefficient (tau a).
      #
      # Based on Hervé Abdi article
      def tau_a(v1,v2)
        v1a,v2a=Statsample.only_valid(v1,v2)
        # FIX: was v1.size — the denominator must count only the valid
        # pairs that were actually ranked, not the raw (possibly
        # nil-containing) input.
        n=v1a.size
        v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
        o1=ordered_pairs(v1r)
        o2=ordered_pairs(v2r)
        delta= o1.size*2-(o2 & o1).size*2
        1-(delta * 2 / (n*(n-1)).to_f)
      end
      # Calculates Tau b correlation.
      #
      # Tau-b defines perfect association as strict monotonicity.
      # Although it requires strict monotonicity to reach 1.0,
      # it does not penalize ties as much as some other measures.
      #
      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
      def tau_b(matrix)
        v=pairs(matrix)
        ((v['P']-v['Q']).to_f / Math::sqrt((v['P']+v['Q']+v['Y'])*(v['P']+v['Q']+v['X'])).to_f)
      end
      # Calculates Goodman and Kruskal's gamma.
      #
      # Gamma is the surplus of concordant pairs over discordant pairs,
      # as a percentage of all pairs ignoring ties.
      #
      # Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
      def gamma(matrix)
        v=pairs(matrix)
        (v['P']-v['Q']).to_f / (v['P']+v['Q']).to_f
      end
      # Calculate pair counts for a contingency matrix whose rows and
      # cols are ordered. Returns a hash with:
      # 'P' => concordant pairs, 'Q' => discordant pairs,
      # 'X' => pairs tied on the row variable, 'Y' => tied on the column variable.
      def pairs(matrix)
        rs=matrix.row_size
        cs=matrix.column_size
        conc=disc=ties_x=ties_y=0
        # concordant: second cell strictly below-right of the first
        (0...(rs-1)).each {|x|
          (0...(cs-1)).each{|y|
            ((x+1)...rs).each{|x2|
              ((y+1)...cs).each{|y2|
                conc+=matrix[x,y]*matrix[x2,y2]
              }
            }
          }
        }
        # discordant: second cell strictly below-left of the first
        (0...(rs-1)).each {|x|
          (1...(cs)).each{|y|
            ((x+1)...rs).each{|x2|
              (0...y).each{|y2|
                disc+=matrix[x,y]*matrix[x2,y2]
              }
            }
          }
        }
        # ties on X: same column, different rows
        (0...(rs-1)).each {|x|
          (0...(cs)).each{|y|
            ((x+1)...(rs)).each{|x2|
              ties_x+=matrix[x,y]*matrix[x2,y]
            }
          }
        }
        # ties on Y: same row, different columns
        (0...rs).each {|x|
          (0...(cs-1)).each{|y|
            ((y+1)...(cs)).each{|y2|
              ties_y+=matrix[x,y]*matrix[x,y2]
            }
          }
        }
        {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
      end
      # All ordered pairs [d[i], d[j]] with i < j from a vector's data.
      def ordered_pairs(vector)
        d=vector.data
        a=[]
        (0...(d.size-1)).each{|i|
          ((i+1)...(d.size)).each {|j|
            a.push([d[i],d[j]])
          }
        }
        a
      end
      # Sum of cross-products of deviations:
      # sum(x*y) - sum(x)*sum(y)/n over the valid pairs.
      def sum_of_codeviated(v1,v2)
        v1a,v2a=Statsample.only_valid(v1,v2)
        sum=0
        (0...v1a.size).each{|i|
          sum+=v1a[i]*v2a[i]
        }
        sum-((v1a.sum*v2a.sum) / v1a.size.to_f)
      end
    end
  end
end
|
255
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Statsample
  # Based on Babatunde, Iyiola & Eni () :
  # "A Numerical Procedure for Computing Chi-Square Percentage Points"
  #
  module ChiDistribution
    class << self
      # Number of whole steps of width +itv+ between +av+ and +bv+.
      def steps(av, bv, itv)
        ((bv.to_f - av.to_f) / itv.to_f).to_i
      end

      # Natural logarithm of the gamma function, via a Lanczos-style
      # series approximation.
      def loggamma(k)
        series = [76.18009173, -86.50532033, 24.01409822,
                  -1.231739516, 0.00120858, -0.000005364]
        shifted = k - 1
        base = shifted + 5.5
        base = (shifted + 0.5) * Math::log(base) - base
        s = 1
        series.each_with_index { |coeff, i| s += coeff / (shifted + i + 1) }
        base + Math::log(2.506628275 * s)
      end

      # Chi-square probability density with +k+ degrees of freedom at +x+,
      # evaluated in log space for numerical stability.
      def f(x, k)
        half_k = 0.5 * k
        Math::exp(half_k * Math::log(0.5 * x) - Math::log(x) - loggamma(half_k) - 0.5 * x)
      end

      # Lower-tail cumulative distribution for chi-square with +k+
      # degrees of freedom, integrating the density from ~0 (0.001) to +b+.
      # k == 2 uses the closed form 1 - e^(-b/2); otherwise a 28-interval
      # composite Boole-type quadrature is applied.
      def cdf(b, k)
        a = 0.001
        b = b.to_f
        return 1 - Math::exp(-b / 2) if k == 2
        w = (b - a) / 28.to_f
        # Composite Boole weights: 7 at the endpoints, 32 at odd nodes,
        # 12 at interior nodes congruent 2 (mod 4), 14 at the rest.
        weight = lambda do |i|
          next 7 if i == 0 || i == 28
          next 32 if i.odd?
          i % 4 == 2 ? 12 : 14
        end
        total = (0..28).inject(0.0) { |acc, i| acc + weight.call(i) * f(a + i * w, k) }
        2 * w / 45 * total
      end
    end
  end
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
|
3
|
+
module Statsample
  # Codification
  #
  # This tool aids to code open questions
  # * Load one or more vectors on the workflow, to create a file on yaml of values. If data have Statsample::SEPARATOR_TOKEN, the value will be splitted on two or more values
  # * Edit the yaml and replace the values with your codes. If you need to create two or mores codes for an answer, use the separator (default Statsample::SEPARATOR_TOKEN)
  # * Recode the vectors, loading the yaml file:
  # * The new vectors have the same name of the original plus "_recoded"
  # * Instead of load new recoded vectors, create many vectors as values, as add_vectors_by_split
  #
  # Usage:
  #   recode_file="recodification.yaml"
  #   phase=:first # flag
  #   if phase==:first
  #     File.open(recode_file,"w") {|fp|
  #       Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
  #     } # Edit the file recodification.yaml
  #   elsif phase==:second
  #     File.open(recode_file,"r") {|fp|
  #       Statsample::Codification.verify(fp,['vector1'])
  #     }
  #   elsif phase==:third
  #     File.open(recode_file,"r") {|fp|
  #       Statsample::Codification.recode_dataset_split!(ds,fp,"*")
  #     }
  #   end
  #
  module Codification
    class << self
      # Create a yaml dump for a hash, based on vectors.
      # The keys will be vectors name on dataset and the values
      # will be hashes, with keys = values, for recodification.
      #
      #   v1=%w{a,b b,c d}.to_vector
      #   ds={"v1"=>v1}.to_dataset
      #   Statsample::Codification.create_yaml(ds,['v1'])
      #   => "--- \nv1: \n  a: a\n  b: b\n  c: c\n  d: d\n"
      def create_yaml(dataset, vectors, sep = Statsample::SPLIT_TOKEN, io = nil)
        raise ArgumentError, "Array should't be empty" if vectors.size == 0
        pro_hash = vectors.each_with_object({}) do |v_name, acc|
          raise Exception, "Vector #{v_name} doesn't exists on Dataset" unless dataset.fields.include? v_name
          split_values = dataset[v_name].splitted(sep)
          # identity mapping value => value, ready for hand editing
          acc[v_name] = split_values.flatten.uniq.compact.sort.each_with_object({}) do |val, codes|
            codes[val] = val
          end
        end
        YAML.dump(pro_hash, io)
      end

      # Invert a code hash: map every coded value back to the array of
      # original keys that produce it.
      def inverse_hash(h, sep = Statsample::SPLIT_TOKEN)
        h.each_with_object({}) do |(key, coded), inv|
          coded.split(sep).each do |val|
            (inv[val] ||= []).push(key)
          end
        end
      end

      # Build a lookup dictionary: original value => array of codes
      # obtained by splitting its coded string on +sep+.
      def dictionary(h, sep = Statsample::SPLIT_TOKEN)
        h.each_with_object({}) { |(key, coded), dict| dict[key] = coded.split(sep) }
      end

      # Recode one vector using the code hash +h+. Returns an array
      # whose entries are arrays of unique codes (or nil for nil cases).
      def recode_vector(v, h, sep = Statsample::SPLIT_TOKEN)
        dict = dictionary(h, sep)
        v.splitted(sep).collect do |values|
          values.nil? ? nil : values.collect { |value| dict[value] }.flatten.uniq
        end
      end

      # Recode in place, adding one "<name>_recoded" vector per key in +yaml+.
      def recode_dataset_simple!(dataset, yaml, sep = Statsample::SPLIT_TOKEN)
        _recode_dataset(dataset, yaml, sep, false)
      end

      # Recode in place, splitting each recoded vector into one
      # "<name>_<code>" vector per distinct code.
      def recode_dataset_split!(dataset, yaml, sep = Statsample::SPLIT_TOKEN)
        _recode_dataset(dataset, yaml, sep, true)
      end

      # Shared implementation for the two public recode entry points.
      def _recode_dataset(dataset, yaml, sep = Statsample::SPLIT_TOKEN, split = false)
        h = YAML::load(yaml)
        v_names = h.keys
        v_names.each do |v_name|
          raise Exception, "Vector #{v_name} doesn't exists on Dataset" unless dataset.fields.include? v_name
          recoded = recode_vector(dataset[v_name], h[v_name], sep).collect do |codes|
            codes.nil? ? nil : codes.join(sep)
          end.to_vector
          if split
            recoded.split_by_separator(sep).each do |suffix, v|
              dataset[v_name + "_" + suffix] = v
            end
          else
            dataset[v_name + "_recoded"] = recoded
          end
        end
      end

      # Print, for each vector in the yaml codification, the inverse
      # mapping (code => original values) so the coder can check it.
      def verify(yaml, v_names = nil, sep = Statsample::SPLIT_TOKEN, io = $>)
        require 'pp'
        h = YAML::load(yaml)
        v_names ||= h.keys
        v_names.each do |v_name|
          io.puts "Vector: #{v_name}"
          YAML.dump(inverse_hash(h[v_name], sep).sort, io)
        end
      end
    end
  end
end
|