statsample 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +12 -0
- data/Manifest.txt +13 -0
- data/README.txt +2 -1
- data/demo/pca.rb +29 -0
- data/demo/umann.rb +8 -0
- data/lib/distribution.rb +0 -1
- data/lib/matrix_extension.rb +35 -21
- data/lib/statsample.rb +31 -28
- data/lib/statsample/anova.rb +7 -2
- data/lib/statsample/bivariate.rb +17 -11
- data/lib/statsample/codification.rb +136 -87
- data/lib/statsample/combination.rb +0 -2
- data/lib/statsample/converter/csv18.rb +1 -1
- data/lib/statsample/converter/csv19.rb +1 -1
- data/lib/statsample/converters.rb +176 -171
- data/lib/statsample/crosstab.rb +227 -154
- data/lib/statsample/dataset.rb +94 -12
- data/lib/statsample/dominanceanalysis.rb +69 -62
- data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
- data/lib/statsample/factor.rb +18 -0
- data/lib/statsample/factor/pca.rb +128 -0
- data/lib/statsample/factor/principalaxis.rb +133 -0
- data/lib/statsample/factor/rotation.rb +125 -0
- data/lib/statsample/histogram.rb +99 -0
- data/lib/statsample/mle.rb +125 -126
- data/lib/statsample/mle/logit.rb +91 -91
- data/lib/statsample/mle/probit.rb +84 -85
- data/lib/statsample/multiset.rb +1 -1
- data/lib/statsample/permutation.rb +96 -0
- data/lib/statsample/regression.rb +1 -1
- data/lib/statsample/regression/binomial.rb +89 -89
- data/lib/statsample/regression/binomial/logit.rb +9 -9
- data/lib/statsample/regression/binomial/probit.rb +9 -9
- data/lib/statsample/regression/multiple.rb +8 -14
- data/lib/statsample/regression/multiple/gslengine.rb +1 -1
- data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
- data/lib/statsample/resample.rb +12 -17
- data/lib/statsample/srs.rb +4 -1
- data/lib/statsample/test.rb +23 -22
- data/lib/statsample/test/umannwhitney.rb +182 -0
- data/lib/statsample/vector.rb +854 -815
- data/test/test_bivariate.rb +132 -132
- data/test/test_codification.rb +71 -50
- data/test/test_dataset.rb +19 -1
- data/test/test_factor.rb +44 -0
- data/test/test_histogram.rb +26 -0
- data/test/test_permutation.rb +37 -0
- data/test/test_statistics.rb +74 -63
- data/test/test_umannwhitney.rb +17 -0
- data/test/test_vector.rb +46 -30
- metadata +31 -4
data/History.txt
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
=== 0.6.0 / 2010-02-05
|
2
|
+
* New Statsample::Factor module. Includes classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and for rotating the component matrix (Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations are supported
|
3
|
+
* New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
|
4
|
+
* New class Statsample::Permutation to produce permutations of a given array
|
5
|
+
* New class Statsample::Histogram, with same interface as GSL one
|
6
|
+
* New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Provides both z-based and exact calculation of the probability
|
7
|
+
* Improved support for ReportBuilder
|
8
|
+
* Statsample::Codification module reworked
|
9
|
+
* Fixed bugs on Dominance Analysis classes
|
10
|
+
* Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
|
11
|
+
|
1
12
|
=== 0.5.1 / 2009-10-06
|
2
13
|
|
3
14
|
* New class Statsample::Bivariate::Tetrachoric, for calculation of tetrachoric correlations. See http://www.john-uebersax.com/stat/tetra.htm for information.
|
@@ -18,6 +29,7 @@
|
|
18
29
|
* Logit tests
|
19
30
|
* Bug fix: rescue for requires doesn't specify LoadError
|
20
31
|
* Binomial::BaseEngine new methods: coeffs_se, coeffs, constant and constant_se
|
32
|
+
|
21
33
|
=== 0.4.0 / 2009-09-10
|
22
34
|
* New Distribution module, based on statistics2.rb by Shin-ichiro HARA. Replaces all instances of GSL distributions pdf and cdf calculations for native calculation.
|
23
35
|
* New Maximum Likelihood Estimation for Logit, Probit and Normal Distribution using the Von Tessin (2005) algorithm. See MLE class and subclasses for more information.
|
data/Manifest.txt
CHANGED
@@ -18,6 +18,7 @@ demo/graph.rb
|
|
18
18
|
demo/item_analysis.rb
|
19
19
|
demo/mean.rb
|
20
20
|
demo/nunnally_6.rb
|
21
|
+
demo/pca.rb
|
21
22
|
demo/proportion.rb
|
22
23
|
demo/regression.rb
|
23
24
|
demo/sample_test.csv
|
@@ -25,6 +26,7 @@ demo/spss_matrix.rb
|
|
25
26
|
demo/strata_proportion.rb
|
26
27
|
demo/stratum.rb
|
27
28
|
demo/t-student.rb
|
29
|
+
demo/umann.rb
|
28
30
|
lib/distribution.rb
|
29
31
|
lib/distribution/chisquare.rb
|
30
32
|
lib/distribution/f.rb
|
@@ -47,17 +49,23 @@ lib/statsample/crosstab.rb
|
|
47
49
|
lib/statsample/dataset.rb
|
48
50
|
lib/statsample/dominanceanalysis.rb
|
49
51
|
lib/statsample/dominanceanalysis/bootstrap.rb
|
52
|
+
lib/statsample/factor.rb
|
53
|
+
lib/statsample/factor/pca.rb
|
54
|
+
lib/statsample/factor/principalaxis.rb
|
55
|
+
lib/statsample/factor/rotation.rb
|
50
56
|
lib/statsample/graph/gdchart.rb
|
51
57
|
lib/statsample/graph/svgboxplot.rb
|
52
58
|
lib/statsample/graph/svggraph.rb
|
53
59
|
lib/statsample/graph/svghistogram.rb
|
54
60
|
lib/statsample/graph/svgscatterplot.rb
|
61
|
+
lib/statsample/histogram.rb
|
55
62
|
lib/statsample/htmlreport.rb
|
56
63
|
lib/statsample/mle.rb
|
57
64
|
lib/statsample/mle/logit.rb
|
58
65
|
lib/statsample/mle/normal.rb
|
59
66
|
lib/statsample/mle/probit.rb
|
60
67
|
lib/statsample/multiset.rb
|
68
|
+
lib/statsample/permutation.rb
|
61
69
|
lib/statsample/regression.rb
|
62
70
|
lib/statsample/regression/binomial.rb
|
63
71
|
lib/statsample/regression/binomial/logit.rb
|
@@ -72,6 +80,7 @@ lib/statsample/reliability.rb
|
|
72
80
|
lib/statsample/resample.rb
|
73
81
|
lib/statsample/srs.rb
|
74
82
|
lib/statsample/test.rb
|
83
|
+
lib/statsample/test/umannwhitney.rb
|
75
84
|
lib/statsample/vector.rb
|
76
85
|
po/es/statsample.po
|
77
86
|
po/statsample.pot
|
@@ -85,11 +94,14 @@ test/test_csv.csv
|
|
85
94
|
test/test_csv.rb
|
86
95
|
test/test_dataset.rb
|
87
96
|
test/test_distribution.rb
|
97
|
+
test/test_factor.rb
|
88
98
|
test/test_ggobi.rb
|
89
99
|
test/test_gsl.rb
|
100
|
+
test/test_histogram.rb
|
90
101
|
test/test_logit.rb
|
91
102
|
test/test_mle.rb
|
92
103
|
test/test_multiset.rb
|
104
|
+
test/test_permutation.rb
|
93
105
|
test/test_regression.rb
|
94
106
|
test/test_reliability.rb
|
95
107
|
test/test_resample.rb
|
@@ -97,6 +109,7 @@ test/test_srs.rb
|
|
97
109
|
test/test_statistics.rb
|
98
110
|
test/test_stratified.rb
|
99
111
|
test/test_svg_graph.rb
|
112
|
+
test/test_umannwhitney.rb
|
100
113
|
test/test_vector.rb
|
101
114
|
test/test_xls.rb
|
102
115
|
test/test_xls.xls
|
data/README.txt
CHANGED
@@ -5,10 +5,11 @@ http://ruby-statsample.rubyforge.org/
|
|
5
5
|
|
6
6
|
== DESCRIPTION:
|
7
7
|
|
8
|
-
A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, dominance analysis, scale's reliability analysis, bivariate statistics and others procedures.
|
8
|
+
A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, factorial analysis, dominance analysis, scale's reliability analysis, bivariate statistics and other procedures.
|
9
9
|
|
10
10
|
== FEATURES:
|
11
11
|
|
12
|
+
* Factorial Analysis. Principal Component Analysis and Principal Axis extraction, with orthogonal rotations (Varimax, Equimax, Quartimax)
|
12
13
|
* Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby and reports same values as SPSS
|
13
14
|
* Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
14
15
|
* Classes for Vector, Datasets (set of Vectors) and Multisets (multiple datasets with same fields and type of vectors), and multiple methods to manipulate them
|
data/demo/pca.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.dirname(__FILE__)+"/../lib/statsample"
|
2
|
+
require 'matrix_extension'
|
3
|
+
require 'reportbuilder'
|
4
|
+
require 'gsl'
|
5
|
+
ds=Statsample.load("/home/cdx/trabajo/sepade/pdie/2008_ntic/analisis_c1/tesis.ds")
|
6
|
+
ds2=ds['ac_gen'..'ac_db'].dup_only_valid
|
7
|
+
|
8
|
+
cm=Statsample::Bivariate.correlation_matrix(ds2)
|
9
|
+
|
10
|
+
pca=Statsample::Factor::PCA.new(cm, :m=>2)
|
11
|
+
rb=ReportBuilder.new()
|
12
|
+
rb.add(pca)
|
13
|
+
|
14
|
+
varimax=Statsample::Factor::Quartimax.new(pca.component_matrix.to_matrix)
|
15
|
+
varimax.iterate
|
16
|
+
rb.add(varimax.rotated)
|
17
|
+
rb.add(varimax.iterations)
|
18
|
+
rb.add(varimax.component_transformation_matrix)
|
19
|
+
rb.add(varimax.h2)
|
20
|
+
=begin
|
21
|
+
fa=Statsample::Factor::PrincipalAxis.new(cm, :m=>1)
|
22
|
+
rb=ReportBuilder.new()
|
23
|
+
rb.add(fa)
|
24
|
+
|
25
|
+
=end
|
26
|
+
puts rb.to_text
|
27
|
+
|
28
|
+
|
29
|
+
|
data/demo/umann.rb
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../lib/statsample'
|
2
|
+
v1=[1,2,3,4,7,8,9,10,14,15].to_scale
|
3
|
+
v2=[5,6,11,12,13,16,17,18,19].to_scale
|
4
|
+
u=Statsample::Test::UMannWhitney.new(v1,v2)
|
5
|
+
|
6
|
+
puts u.summary
|
7
|
+
|
8
|
+
#p Statsample::Test::UMannWhitney.exact_probability_as62(100,100)
|
data/lib/distribution.rb
CHANGED
data/lib/matrix_extension.rb
CHANGED
@@ -1,4 +1,18 @@
|
|
1
1
|
require 'matrix'
|
2
|
+
|
3
|
+
if RUBY_VERSION<="1.9.0"
|
4
|
+
class Vector
|
5
|
+
alias_method :old_coerce, :coerce
|
6
|
+
def coerce(other)
|
7
|
+
case other
|
8
|
+
when Numeric
|
9
|
+
return Matrix::Scalar.new(other), self
|
10
|
+
else
|
11
|
+
raise TypeError, "#{self.class} can't be coerced into #{other.class}"
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
2
16
|
class Matrix
|
3
17
|
def rows_sum
|
4
18
|
(0...row_size).collect {|i|
|
@@ -37,31 +51,31 @@ class Matrix
|
|
37
51
|
end
|
38
52
|
# Test whether a Matrix is an identity matrix
|
39
53
|
def identity?
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
54
|
+
if regular?
|
55
|
+
rows=(0...row_size).each{|i|
|
56
|
+
(0...column_size).each {|j|
|
57
|
+
v = self[i,j]
|
58
|
+
return false if (i==j and v!=1) or (i!=j and v!=0)
|
59
|
+
}
|
60
|
+
}
|
61
|
+
true
|
62
|
+
else
|
63
|
+
false
|
64
|
+
end
|
51
65
|
end
|
52
66
|
def to_gsl
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
67
|
+
out=[]
|
68
|
+
self.row_size.times{|i|
|
69
|
+
out[i]=self.row(i).to_a
|
70
|
+
}
|
71
|
+
GSL::Matrix[*out]
|
58
72
|
end
|
59
73
|
def orthogonal?
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
74
|
+
if regular?
|
75
|
+
(self * self.t).identity?
|
76
|
+
else
|
77
|
+
false
|
78
|
+
end
|
65
79
|
end
|
66
80
|
end
|
67
81
|
|
data/lib/statsample.rb
CHANGED
@@ -108,16 +108,18 @@ end
|
|
108
108
|
# * Dataset: A union of vectors.
|
109
109
|
#
|
110
110
|
module Statsample
|
111
|
-
VERSION = '0.5.1'
|
111
|
+
VERSION = '0.6.0'
|
112
112
|
SPLIT_TOKEN = ","
|
113
113
|
autoload(:Database, 'statsample/converters')
|
114
114
|
autoload(:Anova, 'statsample/anova')
|
115
115
|
autoload(:Combination, 'statsample/combination')
|
116
|
+
autoload(:Permutation, 'statsample/permutation')
|
116
117
|
autoload(:CSV, 'statsample/converters')
|
117
118
|
autoload(:PlainText, 'statsample/converters')
|
118
119
|
autoload(:Excel, 'statsample/converters')
|
119
120
|
autoload(:GGobi, 'statsample/converters')
|
120
121
|
autoload(:SPSS, 'statsample/converter/spss')
|
122
|
+
autoload(:Histogram, 'statsample/histogram')
|
121
123
|
autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
|
122
124
|
autoload(:HtmlReport, 'statsample/htmlreport')
|
123
125
|
autoload(:Mx, 'statsample/converters')
|
@@ -132,6 +134,7 @@ module Statsample
|
|
132
134
|
autoload(:MLE, 'statsample/mle')
|
133
135
|
autoload(:Regression, 'statsample/regression')
|
134
136
|
autoload(:Test, 'statsample/test')
|
137
|
+
autoload(:Factor, 'statsample/factor')
|
135
138
|
def self.load(filename)
|
136
139
|
if File.exists? filename
|
137
140
|
o=false
|
@@ -157,38 +160,38 @@ module Statsample
|
|
157
160
|
end
|
158
161
|
module Writable
|
159
162
|
def save(filename)
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
+
fp=File.open(filename,"w")
|
164
|
+
Marshal.dump(self,fp)
|
165
|
+
fp.close
|
163
166
|
end
|
164
167
|
end
|
165
168
|
module HtmlSummary
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
169
|
+
def add_line(n=nil)
|
170
|
+
self << "<hr />"
|
171
|
+
end
|
172
|
+
def nl
|
173
|
+
self << "<br />"
|
174
|
+
end
|
175
|
+
def add(text)
|
176
|
+
self << ("<p>"+text.gsub("\n","<br />")+"</p>")
|
177
|
+
end
|
178
|
+
def parse_table(table)
|
179
|
+
self << table.parse_html
|
180
|
+
end
|
178
181
|
end
|
179
182
|
module ConsoleSummary
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
183
|
+
def add_line(n=80)
|
184
|
+
self << "-"*n+"\n"
|
185
|
+
end
|
186
|
+
def nl
|
187
|
+
self << "\n"
|
188
|
+
end
|
189
|
+
def add(text)
|
190
|
+
self << text
|
191
|
+
end
|
192
|
+
def parse_table(table)
|
193
|
+
self << table.parse_console
|
194
|
+
end
|
192
195
|
end
|
193
196
|
class ReportTable
|
194
197
|
attr_reader :header
|
data/lib/statsample/anova.rb
CHANGED
@@ -6,8 +6,13 @@ module Statsample
|
|
6
6
|
# v2=[3,3,4,5,6].to_scale
|
7
7
|
# v3=[5,3,1,5,6].to_scale
|
8
8
|
# anova=Statsample::Anova::OneWay.new([v1,v2,v3])
|
9
|
-
#
|
10
|
-
#
|
9
|
+
# anova.f
|
10
|
+
# => 0.0243902439024391
|
11
|
+
# anova.significance
|
12
|
+
# => 0.975953044203438
|
13
|
+
# anova.sst
|
14
|
+
# => 32.9333333333333
|
15
|
+
#
|
11
16
|
class OneWay
|
12
17
|
def initialize(vectors)
|
13
18
|
@vectors=vectors
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -13,6 +13,7 @@ module Statsample
|
|
13
13
|
covariance_slow(v1a,v2a)
|
14
14
|
end
|
15
15
|
end
|
16
|
+
# Estimate the ML between two dichotomic vectors
|
16
17
|
def maximum_likehood_dichotomic(pred,real)
|
17
18
|
preda,reala=Statsample.only_valid(pred,real)
|
18
19
|
sum=0
|
@@ -59,13 +60,14 @@ module Statsample
|
|
59
60
|
end
|
60
61
|
# Retrieves the value for t test for a pearson correlation
|
61
62
|
# giving r and vector size
|
63
|
+
# Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
|
62
64
|
def t_r(r,size)
|
63
65
|
r * Math::sqrt(((size)-2).to_f / (1 - r**2))
|
64
66
|
end
|
65
67
|
# Retrieves the probability value (a la SPSS)
|
66
68
|
# for a given t, size and number of tails.
|
67
69
|
# Uses a second parameter
|
68
|
-
# * :both or 2 : for r!=0
|
70
|
+
# * :both or 2 : for r!=0 (default)
|
69
71
|
# * :right, :positive or 1 : for r > 0
|
70
72
|
# * :left, :negative : for r < 0
|
71
73
|
|
@@ -112,6 +114,7 @@ module Statsample
|
|
112
114
|
(rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
|
113
115
|
|
114
116
|
end
|
117
|
+
|
115
118
|
# Covariance matrix.
|
116
119
|
# Order of rows and columns depends on Dataset#fields order
|
117
120
|
|
@@ -139,7 +142,8 @@ module Statsample
|
|
139
142
|
end
|
140
143
|
end
|
141
144
|
end
|
142
|
-
|
145
|
+
|
146
|
+
# Retrieves the n valid pairwise.
|
143
147
|
def n_valid_matrix(ds)
|
144
148
|
ds.collect_matrix do |row,col|
|
145
149
|
if row==col
|
@@ -150,7 +154,8 @@ module Statsample
|
|
150
154
|
end
|
151
155
|
end
|
152
156
|
end
|
153
|
-
|
157
|
+
|
158
|
+
# Matrix of correlation probabilities.
|
154
159
|
# Order of rows and columns depends on Dataset#fields order
|
155
160
|
|
156
161
|
def correlation_probability_matrix(ds, tails=:both)
|
@@ -162,6 +167,7 @@ module Statsample
|
|
162
167
|
end
|
163
168
|
Matrix.rows(rows)
|
164
169
|
end
|
170
|
+
|
165
171
|
# Spearman ranked correlation coefficient between 2 vectors
|
166
172
|
def spearman(v1,v2)
|
167
173
|
v1a,v2a=Statsample.only_valid(v1,v2)
|
@@ -218,16 +224,16 @@ module Statsample
|
|
218
224
|
rs=matrix.row_size
|
219
225
|
cs=matrix.column_size
|
220
226
|
conc=disc=ties_x=ties_y=0
|
221
|
-
(0...(rs-1)).each
|
222
|
-
(0...(cs-1)).each
|
223
|
-
((x+1)...rs).each
|
224
|
-
((y+1)...cs).each
|
227
|
+
(0...(rs-1)).each do |x|
|
228
|
+
(0...(cs-1)).each do |y|
|
229
|
+
((x+1)...rs).each do |x2|
|
230
|
+
((y+1)...cs).each do |y2|
|
225
231
|
# #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
|
226
232
|
conc+=matrix[x,y]*matrix[x2,y2]
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
233
|
+
end
|
234
|
+
end
|
235
|
+
end
|
236
|
+
end
|
231
237
|
(0...(rs-1)).each {|x|
|
232
238
|
(1...(cs)).each{|y|
|
233
239
|
((x+1)...rs).each{|x2|
|
@@ -27,94 +27,143 @@ module Statsample
|
|
27
27
|
# }
|
28
28
|
# end
|
29
29
|
#
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
30
|
+
module Codification
|
31
|
+
class << self
|
32
|
+
# Create a hash, based on vectors, to create the dictionary.
|
33
|
+
# The keys will be vectors name on dataset and the values
|
34
|
+
# will be hashes, with keys = values, for recodification
|
35
|
+
def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
|
36
|
+
raise ArgumentError,"Array should't be empty" if vectors.size==0
|
37
|
+
pro_hash=vectors.inject({}){|h,v_name|
|
38
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
|
39
|
+
v=dataset[v_name]
|
40
|
+
split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
|
41
|
+
|
42
|
+
factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
|
43
|
+
h[v_name]=factors
|
44
|
+
h
|
45
|
+
}
|
46
|
+
pro_hash
|
47
|
+
end
|
48
|
+
# Create a yaml to create a dictionary, based on vectors
|
49
|
+
# The keys will be vectors name on dataset and the values
|
50
|
+
# will be hashes, with keys = values, for recodification
|
51
|
+
#
|
52
|
+
# v1=%w{a,b b,c d}.to_vector
|
53
|
+
# ds={"v1"=>v1}.to_dataset
|
54
|
+
# Statsample::Codification.create_yaml(ds,['v1'])
|
55
|
+
# => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
|
56
|
+
def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
|
57
|
+
pro_hash=create_hash(dataset, vectors, sep)
|
58
|
+
YAML.dump(pro_hash,io)
|
59
|
+
end
|
60
|
+
# Create an Excel file to build a dictionary, based on vectors.
|
61
|
+
# Raises an error if filename exists
|
62
|
+
# The rows will be:
|
63
|
+
# * field: name of vector
|
64
|
+
# * original: original name
|
65
|
+
# * recoded: new code
|
66
|
+
|
67
|
+
def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
|
68
|
+
require 'spreadsheet'
|
69
|
+
if File.exists?(filename)
|
70
|
+
raise "Exists a file named #{filename}. Delete ir before overwrite."
|
71
|
+
end
|
72
|
+
book = Spreadsheet::Workbook.new
|
73
|
+
sheet = book.create_worksheet
|
74
|
+
sheet.row(0).concat(%w{field original recoded})
|
75
|
+
i=1
|
76
|
+
create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
|
77
|
+
inner_hash.sort.each do |k,v|
|
78
|
+
sheet.row(i).concat([field.dup,k.dup,v.dup])
|
79
|
+
i+=1
|
80
|
+
end
|
81
|
+
end
|
82
|
+
book.write(filename)
|
83
|
+
end
|
84
|
+
# Generates, from an Excel file, a dictionary hash
|
85
|
+
# to use on recode_dataset_simple!() or recode_dataset_split!().
|
86
|
+
#
|
87
|
+
def excel_to_recoded_hash(filename)
|
88
|
+
require 'spreadsheet'
|
89
|
+
h={}
|
90
|
+
book = Spreadsheet.open filename
|
91
|
+
sheet= book.worksheet 0
|
92
|
+
row_i=0
|
93
|
+
sheet.each do |row|
|
94
|
+
row_i+=1
|
95
|
+
next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
|
96
|
+
h[row[0]]={} if h[row[0]].nil?
|
97
|
+
h[row[0]][row[1]]=row[2]
|
98
|
+
end
|
99
|
+
h
|
100
|
+
end
|
101
|
+
|
102
|
+
def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
|
103
|
+
h.inject({}) do |a,v|
|
104
|
+
v[1].split(sep).each do |val|
|
105
|
+
a[val]||=[]
|
106
|
+
a[val].push(v[0])
|
107
|
+
end
|
108
|
+
a
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
def dictionary(h, sep=Statsample::SPLIT_TOKEN)
|
113
|
+
h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
|
114
|
+
end
|
115
|
+
|
116
|
+
def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
|
117
|
+
dict=dictionary(h,sep)
|
118
|
+
new_data=v.splitted(sep)
|
119
|
+
recoded=new_data.collect do |c|
|
120
|
+
if c.nil?
|
121
|
+
nil
|
122
|
+
else
|
123
|
+
c.collect{|value| dict[value] }.flatten.uniq
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
|
128
|
+
_recode_dataset(dataset,dictionary_hash ,sep,false)
|
129
|
+
end
|
130
|
+
def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
|
131
|
+
_recode_dataset(dataset, dictionary_hash, sep,true)
|
132
|
+
end
|
133
|
+
|
134
|
+
def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
|
135
|
+
v_names||=h.keys
|
136
|
+
v_names.each do |v_name|
|
137
|
+
raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
|
138
|
+
recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
|
139
|
+
if c.nil?
|
140
|
+
nil
|
141
|
+
else
|
142
|
+
c.join(sep)
|
143
|
+
end
|
144
|
+
}.to_vector
|
145
|
+
if(split)
|
146
|
+
recoded.split_by_separator(sep).each {|k,v|
|
147
|
+
dataset[v_name+"_"+k]=v
|
148
|
+
}
|
149
|
+
else
|
150
|
+
dataset[v_name+"_recoded"]=recoded
|
117
151
|
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
|
156
|
+
def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
|
157
|
+
require 'pp'
|
158
|
+
v_names||=h.keys
|
159
|
+
v_names.each{|v_name|
|
160
|
+
inverse=inverse_hash(h[v_name],sep)
|
161
|
+
io.puts "- Field: #{v_name}"
|
162
|
+
inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
|
163
|
+
io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'"
|
164
|
+
}
|
165
|
+
}
|
118
166
|
end
|
119
167
|
end
|
168
|
+
end
|
120
169
|
end
|