statsample 0.5.1 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
=== 0.6.0 / 2010-02-05
|
2
|
+
* New Statsample::Factor module. Includes classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and for rotating the component matrix (Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations are supported
|
3
|
+
* New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
|
4
|
+
* New class Statsample::Permutation to produce permutations of a given array
|
5
|
+
* New class Statsample::Histogram, with same interface as GSL one
|
6
|
+
* New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Provides both z-based and exact probability calculations
|
7
|
+
* Improved support for ReportBuilder
|
8
|
+
* Statsample::Codification module reworked
|
9
|
+
* Fixed bugs on Dominance Analysis classes
|
10
|
+
* Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
|
11
|
+
|
1
12
|
=== 0.5.1 / 2009-10-06
|
2
13
|
|
3
14
|
* New class Statsample::Bivariate::Tetrachoric, for calculation of tetrachoric correlations. See http://www.john-uebersax.com/stat/tetra.htm for information.
|
@@ -18,6 +29,7 @@
|
|
18
29
|
* Logit tests
|
19
30
|
* Bug fix: rescue for requires doesn't specify LoadError
|
20
31
|
* Binomial::BaseEngine new methods: coeffs_se, coeffs, constant and constant_se
|
32
|
+
|
21
33
|
=== 0.4.0 / 2009-09-10
|
22
34
|
* New Distribution module, based on statistics2.rb by Shin-ichiro HARA. Replaces all GSL-based pdf and cdf distribution calculations with native calculations.
|
23
35
|
* New Maximum Likelihood Estimation for Logit, Probit and Normal Distribution using Von Tessin (2005) algorithm. See MLE class and subclasses for more information.
|
data/Manifest.txt
CHANGED
@@ -18,6 +18,7 @@ demo/graph.rb
|
|
18
18
|
demo/item_analysis.rb
|
19
19
|
demo/mean.rb
|
20
20
|
demo/nunnally_6.rb
|
21
|
+
demo/pca.rb
|
21
22
|
demo/proportion.rb
|
22
23
|
demo/regression.rb
|
23
24
|
demo/sample_test.csv
|
@@ -25,6 +26,7 @@ demo/spss_matrix.rb
|
|
25
26
|
demo/strata_proportion.rb
|
26
27
|
demo/stratum.rb
|
27
28
|
demo/t-student.rb
|
29
|
+
demo/umann.rb
|
28
30
|
lib/distribution.rb
|
29
31
|
lib/distribution/chisquare.rb
|
30
32
|
lib/distribution/f.rb
|
@@ -47,17 +49,23 @@ lib/statsample/crosstab.rb
|
|
47
49
|
lib/statsample/dataset.rb
|
48
50
|
lib/statsample/dominanceanalysis.rb
|
49
51
|
lib/statsample/dominanceanalysis/bootstrap.rb
|
52
|
+
lib/statsample/factor.rb
|
53
|
+
lib/statsample/factor/pca.rb
|
54
|
+
lib/statsample/factor/principalaxis.rb
|
55
|
+
lib/statsample/factor/rotation.rb
|
50
56
|
lib/statsample/graph/gdchart.rb
|
51
57
|
lib/statsample/graph/svgboxplot.rb
|
52
58
|
lib/statsample/graph/svggraph.rb
|
53
59
|
lib/statsample/graph/svghistogram.rb
|
54
60
|
lib/statsample/graph/svgscatterplot.rb
|
61
|
+
lib/statsample/histogram.rb
|
55
62
|
lib/statsample/htmlreport.rb
|
56
63
|
lib/statsample/mle.rb
|
57
64
|
lib/statsample/mle/logit.rb
|
58
65
|
lib/statsample/mle/normal.rb
|
59
66
|
lib/statsample/mle/probit.rb
|
60
67
|
lib/statsample/multiset.rb
|
68
|
+
lib/statsample/permutation.rb
|
61
69
|
lib/statsample/regression.rb
|
62
70
|
lib/statsample/regression/binomial.rb
|
63
71
|
lib/statsample/regression/binomial/logit.rb
|
@@ -72,6 +80,7 @@ lib/statsample/reliability.rb
|
|
72
80
|
lib/statsample/resample.rb
|
73
81
|
lib/statsample/srs.rb
|
74
82
|
lib/statsample/test.rb
|
83
|
+
lib/statsample/test/umannwhitney.rb
|
75
84
|
lib/statsample/vector.rb
|
76
85
|
po/es/statsample.po
|
77
86
|
po/statsample.pot
|
@@ -85,11 +94,14 @@ test/test_csv.csv
|
|
85
94
|
test/test_csv.rb
|
86
95
|
test/test_dataset.rb
|
87
96
|
test/test_distribution.rb
|
97
|
+
test/test_factor.rb
|
88
98
|
test/test_ggobi.rb
|
89
99
|
test/test_gsl.rb
|
100
|
+
test/test_histogram.rb
|
90
101
|
test/test_logit.rb
|
91
102
|
test/test_mle.rb
|
92
103
|
test/test_multiset.rb
|
104
|
+
test/test_permutation.rb
|
93
105
|
test/test_regression.rb
|
94
106
|
test/test_reliability.rb
|
95
107
|
test/test_resample.rb
|
@@ -97,6 +109,7 @@ test/test_srs.rb
|
|
97
109
|
test/test_statistics.rb
|
98
110
|
test/test_stratified.rb
|
99
111
|
test/test_svg_graph.rb
|
112
|
+
test/test_umannwhitney.rb
|
100
113
|
test/test_vector.rb
|
101
114
|
test/test_xls.rb
|
102
115
|
test/test_xls.xls
|
data/README.txt
CHANGED
@@ -5,10 +5,11 @@ http://ruby-statsample.rubyforge.org/
|
|
5
5
|
|
6
6
|
== DESCRIPTION:
|
7
7
|
|
8
|
-
A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, dominance analysis, scale's reliability analysis, bivariate statistics and others procedures.
|
8
|
+
A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, factorial analysis, dominance analysis, scale's reliability analysis, bivariate statistics and other procedures.
|
9
9
|
|
10
10
|
== FEATURES:
|
11
11
|
|
12
|
+
* Factorial Analysis. Principal Component Analysis and Principal Axis extraction, with orthogonal rotations (Varimax, Equimax, Quartimax)
|
12
13
|
* Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby and reports same values as SPSS
|
13
14
|
* Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
14
15
|
* Classes for Vector, Datasets (set of Vectors) and Multisets (multiple datasets with same fields and type of vectors), and multiple methods to manipulate them
|
data/demo/pca.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.dirname(__FILE__)+"/../lib/statsample"
|
2
|
+
require 'matrix_extension'
|
3
|
+
require 'reportbuilder'
|
4
|
+
require 'gsl'
|
5
|
+
ds=Statsample.load("/home/cdx/trabajo/sepade/pdie/2008_ntic/analisis_c1/tesis.ds")
|
6
|
+
ds2=ds['ac_gen'..'ac_db'].dup_only_valid
|
7
|
+
|
8
|
+
cm=Statsample::Bivariate.correlation_matrix(ds2)
|
9
|
+
|
10
|
+
pca=Statsample::Factor::PCA.new(cm, :m=>2)
|
11
|
+
rb=ReportBuilder.new()
|
12
|
+
rb.add(pca)
|
13
|
+
|
14
|
+
varimax=Statsample::Factor::Quartimax.new(pca.component_matrix.to_matrix)
|
15
|
+
varimax.iterate
|
16
|
+
rb.add(varimax.rotated)
|
17
|
+
rb.add(varimax.iterations)
|
18
|
+
rb.add(varimax.component_transformation_matrix)
|
19
|
+
rb.add(varimax.h2)
|
20
|
+
=begin
|
21
|
+
fa=Statsample::Factor::PrincipalAxis.new(cm, :m=>1)
|
22
|
+
rb=ReportBuilder.new()
|
23
|
+
rb.add(fa)
|
24
|
+
|
25
|
+
=end
|
26
|
+
puts rb.to_text
|
27
|
+
|
28
|
+
|
29
|
+
|
data/demo/umann.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../lib/statsample'
|
2
|
+
v1=[1,2,3,4,7,8,9,10,14,15].to_scale
|
3
|
+
v2=[5,6,11,12,13,16,17,18,19].to_scale
|
4
|
+
u=Statsample::Test::UMannWhitney.new(v1,v2)
|
5
|
+
|
6
|
+
puts u.summary
|
7
|
+
|
8
|
+
#p Statsample::Test::UMannWhitney.exact_probability_as62(100,100)
|
data/lib/distribution.rb
CHANGED
data/lib/matrix_extension.rb
CHANGED
@@ -1,4 +1,18 @@
|
|
1
1
|
require 'matrix'
|
2
|
+
|
3
|
+
if RUBY_VERSION<="1.9.0"
|
4
|
+
class Vector
|
5
|
+
alias_method :old_coerce, :coerce
|
6
|
+
def coerce(other)
|
7
|
+
case other
|
8
|
+
when Numeric
|
9
|
+
return Matrix::Scalar.new(other), self
|
10
|
+
else
|
11
|
+
raise TypeError, "#{self.class} can't be coerced into #{other.class}"
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
2
16
|
class Matrix
|
3
17
|
def rows_sum
|
4
18
|
(0...row_size).collect {|i|
|
@@ -37,31 +51,31 @@ class Matrix
|
|
37
51
|
end
|
38
52
|
# Test if a Matrix is an identity matrix
|
39
53
|
def identity?
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
54
|
+
if regular?
|
55
|
+
rows=(0...row_size).each{|i|
|
56
|
+
(0...column_size).each {|j|
|
57
|
+
v = self[i,j]
|
58
|
+
return false if (i==j and v!=1) or (i!=j and v!=0)
|
59
|
+
}
|
60
|
+
}
|
61
|
+
true
|
62
|
+
else
|
63
|
+
false
|
64
|
+
end
|
51
65
|
end
|
52
66
|
def to_gsl
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
67
|
+
out=[]
|
68
|
+
self.row_size.times{|i|
|
69
|
+
out[i]=self.row(i).to_a
|
70
|
+
}
|
71
|
+
GSL::Matrix[*out]
|
58
72
|
end
|
59
73
|
def orthogonal?
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
74
|
+
if regular?
|
75
|
+
(self * self.t).identity?
|
76
|
+
else
|
77
|
+
false
|
78
|
+
end
|
65
79
|
end
|
66
80
|
end
|
67
81
|
|
data/lib/statsample.rb
CHANGED
@@ -108,16 +108,18 @@ end
|
|
108
108
|
# * Dataset: A union of vectors.
|
109
109
|
#
|
110
110
|
module Statsample
|
111
|
-
VERSION = '0.
|
111
|
+
VERSION = '0.6.0'
|
112
112
|
SPLIT_TOKEN = ","
|
113
113
|
autoload(:Database, 'statsample/converters')
|
114
114
|
autoload(:Anova, 'statsample/anova')
|
115
115
|
autoload(:Combination, 'statsample/combination')
|
116
|
+
autoload(:Permutation, 'statsample/permutation')
|
116
117
|
autoload(:CSV, 'statsample/converters')
|
117
118
|
autoload(:PlainText, 'statsample/converters')
|
118
119
|
autoload(:Excel, 'statsample/converters')
|
119
120
|
autoload(:GGobi, 'statsample/converters')
|
120
121
|
autoload(:SPSS, 'statsample/converter/spss')
|
122
|
+
autoload(:Histogram, 'statsample/histogram')
|
121
123
|
autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
|
122
124
|
autoload(:HtmlReport, 'statsample/htmlreport')
|
123
125
|
autoload(:Mx, 'statsample/converters')
|
@@ -132,6 +134,7 @@ module Statsample
|
|
132
134
|
autoload(:MLE, 'statsample/mle')
|
133
135
|
autoload(:Regression, 'statsample/regression')
|
134
136
|
autoload(:Test, 'statsample/test')
|
137
|
+
autoload(:Factor, 'statsample/factor')
|
135
138
|
def self.load(filename)
|
136
139
|
if File.exists? filename
|
137
140
|
o=false
|
@@ -157,38 +160,38 @@ module Statsample
|
|
157
160
|
end
|
158
161
|
module Writable
|
159
162
|
def save(filename)
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
+
fp=File.open(filename,"w")
|
164
|
+
Marshal.dump(self,fp)
|
165
|
+
fp.close
|
163
166
|
end
|
164
167
|
end
|
165
168
|
module HtmlSummary
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
169
|
+
def add_line(n=nil)
|
170
|
+
self << "<hr />"
|
171
|
+
end
|
172
|
+
def nl
|
173
|
+
self << "<br />"
|
174
|
+
end
|
175
|
+
def add(text)
|
176
|
+
self << ("<p>"+text.gsub("\n","<br />")+"</p>")
|
177
|
+
end
|
178
|
+
def parse_table(table)
|
179
|
+
self << table.parse_html
|
180
|
+
end
|
178
181
|
end
|
179
182
|
module ConsoleSummary
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
183
|
+
def add_line(n=80)
|
184
|
+
self << "-"*n+"\n"
|
185
|
+
end
|
186
|
+
def nl
|
187
|
+
self << "\n"
|
188
|
+
end
|
189
|
+
def add(text)
|
190
|
+
self << text
|
191
|
+
end
|
192
|
+
def parse_table(table)
|
193
|
+
self << table.parse_console
|
194
|
+
end
|
192
195
|
end
|
193
196
|
class ReportTable
|
194
197
|
attr_reader :header
|
data/lib/statsample/anova.rb
CHANGED
@@ -6,8 +6,13 @@ module Statsample
|
|
6
6
|
# v2=[3,3,4,5,6].to_scale
|
7
7
|
# v3=[5,3,1,5,6].to_scale
|
8
8
|
# anova=Statsample::Anova::OneWay.new([v1,v2,v3])
|
9
|
-
#
|
10
|
-
#
|
9
|
+
# anova.f
|
10
|
+
# => 0.0243902439024391
|
11
|
+
# anova.significance
|
12
|
+
# => 0.975953044203438
|
13
|
+
# anova.sst
|
14
|
+
# => 32.9333333333333
|
15
|
+
#
|
11
16
|
class OneWay
|
12
17
|
def initialize(vectors)
|
13
18
|
@vectors=vectors
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -13,6 +13,7 @@ module Statsample
|
|
13
13
|
covariance_slow(v1a,v2a)
|
14
14
|
end
|
15
15
|
end
|
16
|
+
# Estimate the ML between two dichotomic vectors
|
16
17
|
def maximum_likehood_dichotomic(pred,real)
|
17
18
|
preda,reala=Statsample.only_valid(pred,real)
|
18
19
|
sum=0
|
@@ -59,13 +60,14 @@ module Statsample
|
|
59
60
|
end
|
60
61
|
# Retrieves the value for t test for a pearson correlation
|
61
62
|
# giving r and vector size
|
63
|
+
# Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
|
62
64
|
def t_r(r,size)
|
63
65
|
r * Math::sqrt(((size)-2).to_f / (1 - r**2))
|
64
66
|
end
|
65
67
|
# Retrieves the probability value (a la SPSS)
|
66
68
|
# for a given t, size and number of tails.
|
67
69
|
# Uses a second parameter
|
68
|
-
# * :both or 2 : for r!=0
|
70
|
+
# * :both or 2 : for r!=0 (default)
|
69
71
|
# * :right, :positive or 1 : for r > 0
|
70
72
|
# * :left, :negative : for r < 0
|
71
73
|
|
@@ -112,6 +114,7 @@ module Statsample
|
|
112
114
|
(rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
|
113
115
|
|
114
116
|
end
|
117
|
+
|
115
118
|
# Covariance matrix.
|
116
119
|
# Order of rows and columns depends on Dataset#fields order
|
117
120
|
|
@@ -139,7 +142,8 @@ module Statsample
|
|
139
142
|
end
|
140
143
|
end
|
141
144
|
end
|
142
|
-
|
145
|
+
|
146
|
+
# Retrieves the number of valid cases (n) for each pair of vectors.
|
143
147
|
def n_valid_matrix(ds)
|
144
148
|
ds.collect_matrix do |row,col|
|
145
149
|
if row==col
|
@@ -150,7 +154,8 @@ module Statsample
|
|
150
154
|
end
|
151
155
|
end
|
152
156
|
end
|
153
|
-
|
157
|
+
|
158
|
+
# Matrix of correlation probabilities.
|
154
159
|
# Order of rows and columns depends on Dataset#fields order
|
155
160
|
|
156
161
|
def correlation_probability_matrix(ds, tails=:both)
|
@@ -162,6 +167,7 @@ module Statsample
|
|
162
167
|
end
|
163
168
|
Matrix.rows(rows)
|
164
169
|
end
|
170
|
+
|
165
171
|
# Spearman ranked correlation coefficient between 2 vectors
|
166
172
|
def spearman(v1,v2)
|
167
173
|
v1a,v2a=Statsample.only_valid(v1,v2)
|
@@ -218,16 +224,16 @@ module Statsample
|
|
218
224
|
rs=matrix.row_size
|
219
225
|
cs=matrix.column_size
|
220
226
|
conc=disc=ties_x=ties_y=0
|
221
|
-
(0...(rs-1)).each
|
222
|
-
(0...(cs-1)).each
|
223
|
-
((x+1)...rs).each
|
224
|
-
((y+1)...cs).each
|
227
|
+
(0...(rs-1)).each do |x|
|
228
|
+
(0...(cs-1)).each do |y|
|
229
|
+
((x+1)...rs).each do |x2|
|
230
|
+
((y+1)...cs).each do |y2|
|
225
231
|
# #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
|
226
232
|
conc+=matrix[x,y]*matrix[x2,y2]
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
233
|
+
end
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
231
237
|
(0...(rs-1)).each {|x|
|
232
238
|
(1...(cs)).each{|y|
|
233
239
|
((x+1)...rs).each{|x2|
|
@@ -27,94 +27,143 @@ module Statsample
|
|
27
27
|
# }
|
28
28
|
# end
|
29
29
|
#
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
30
|
+
module Codification
|
31
|
+
class << self
|
32
|
+
# Create a hash, based on vectors, to create the dictionary.
|
33
|
+
# The keys will be the vector names on the dataset and the values
|
34
|
+
# will be hashes, with keys = values, for recodification
|
35
|
+
def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
|
36
|
+
raise ArgumentError,"Array should't be empty" if vectors.size==0
|
37
|
+
pro_hash=vectors.inject({}){|h,v_name|
|
38
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
|
39
|
+
v=dataset[v_name]
|
40
|
+
split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
|
41
|
+
|
42
|
+
factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
|
43
|
+
h[v_name]=factors
|
44
|
+
h
|
45
|
+
}
|
46
|
+
pro_hash
|
47
|
+
end
|
48
|
+
# Create a YAML document to build a dictionary, based on vectors
|
49
|
+
# The keys will be the vector names on the dataset and the values
|
50
|
+
# will be hashes, with keys = values, for recodification
|
51
|
+
#
|
52
|
+
# v1=%w{a,b b,c d}.to_vector
|
53
|
+
# ds={"v1"=>v1}.to_dataset
|
54
|
+
# Statsample::Codification.create_yaml(ds,['v1'])
|
55
|
+
# => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
|
56
|
+
def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
|
57
|
+
pro_hash=create_hash(dataset, vectors, sep)
|
58
|
+
YAML.dump(pro_hash,io)
|
59
|
+
end
|
60
|
+
# Create an Excel file to build a dictionary, based on vectors.
|
61
|
+
# Raises an error if filename exists
|
62
|
+
# The rows will be:
|
63
|
+
# * field: name of vector
|
64
|
+
# * original: original name
|
65
|
+
# * recoded: new code
|
66
|
+
|
67
|
+
def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
|
68
|
+
require 'spreadsheet'
|
69
|
+
if File.exists?(filename)
|
70
|
+
raise "Exists a file named #{filename}. Delete ir before overwrite."
|
71
|
+
end
|
72
|
+
book = Spreadsheet::Workbook.new
|
73
|
+
sheet = book.create_worksheet
|
74
|
+
sheet.row(0).concat(%w{field original recoded})
|
75
|
+
i=1
|
76
|
+
create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
|
77
|
+
inner_hash.sort.each do |k,v|
|
78
|
+
sheet.row(i).concat([field.dup,k.dup,v.dup])
|
79
|
+
i+=1
|
80
|
+
end
|
81
|
+
end
|
82
|
+
book.write(filename)
|
83
|
+
end
|
84
|
+
# From an Excel file, generates a dictionary hash
|
85
|
+
# to use on recode_dataset_simple!() or recode_dataset_split!().
|
86
|
+
#
|
87
|
+
def excel_to_recoded_hash(filename)
|
88
|
+
require 'spreadsheet'
|
89
|
+
h={}
|
90
|
+
book = Spreadsheet.open filename
|
91
|
+
sheet= book.worksheet 0
|
92
|
+
row_i=0
|
93
|
+
sheet.each do |row|
|
94
|
+
row_i+=1
|
95
|
+
next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
|
96
|
+
h[row[0]]={} if h[row[0]].nil?
|
97
|
+
h[row[0]][row[1]]=row[2]
|
98
|
+
end
|
99
|
+
h
|
100
|
+
end
|
101
|
+
|
102
|
+
def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
|
103
|
+
h.inject({}) do |a,v|
|
104
|
+
v[1].split(sep).each do |val|
|
105
|
+
a[val]||=[]
|
106
|
+
a[val].push(v[0])
|
107
|
+
end
|
108
|
+
a
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
def dictionary(h, sep=Statsample::SPLIT_TOKEN)
|
113
|
+
h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
|
114
|
+
end
|
115
|
+
|
116
|
+
def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
|
117
|
+
dict=dictionary(h,sep)
|
118
|
+
new_data=v.splitted(sep)
|
119
|
+
recoded=new_data.collect do |c|
|
120
|
+
if c.nil?
|
121
|
+
nil
|
122
|
+
else
|
123
|
+
c.collect{|value| dict[value] }.flatten.uniq
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
|
128
|
+
_recode_dataset(dataset,dictionary_hash ,sep,false)
|
129
|
+
end
|
130
|
+
def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
|
131
|
+
_recode_dataset(dataset, dictionary_hash, sep,true)
|
132
|
+
end
|
133
|
+
|
134
|
+
def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
|
135
|
+
v_names||=h.keys
|
136
|
+
v_names.each do |v_name|
|
137
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
|
138
|
+
recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
|
139
|
+
if c.nil?
|
140
|
+
nil
|
141
|
+
else
|
142
|
+
c.join(sep)
|
143
|
+
end
|
144
|
+
}.to_vector
|
145
|
+
if(split)
|
146
|
+
recoded.split_by_separator(sep).each {|k,v|
|
147
|
+
dataset[v_name+"_"+k]=v
|
148
|
+
}
|
149
|
+
else
|
150
|
+
dataset[v_name+"_recoded"]=recoded
|
117
151
|
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
|
156
|
+
def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
|
157
|
+
require 'pp'
|
158
|
+
v_names||=h.keys
|
159
|
+
v_names.each{|v_name|
|
160
|
+
inverse=inverse_hash(h[v_name],sep)
|
161
|
+
io.puts "- Field: #{v_name}"
|
162
|
+
inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
|
163
|
+
io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'"
|
164
|
+
}
|
165
|
+
}
|
118
166
|
end
|
119
167
|
end
|
168
|
+
end
|
120
169
|
end
|