statsample 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +20 -1
- data/Manifest.txt +8 -1
- data/README.txt +11 -7
- data/Rakefile +2 -2
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/examples/dataset.rb +8 -0
- data/examples/multiple_regression.rb +1 -1
- data/examples/parallel_analysis.rb +29 -0
- data/examples/parallel_analysis_tetrachoric.rb +30 -0
- data/examples/vector.rb +6 -0
- data/lib/distribution.rb +16 -6
- data/lib/distribution/normal.rb +27 -20
- data/lib/distribution/normalbivariate.rb +1 -1
- data/lib/statsample.rb +19 -2
- data/lib/statsample/anova.rb +118 -16
- data/lib/statsample/bivariate.rb +27 -13
- data/lib/statsample/bivariate/polychoric.rb +18 -5
- data/lib/statsample/crosstab.rb +66 -74
- data/lib/statsample/dataset.rb +52 -45
- data/lib/statsample/dominanceanalysis.rb +2 -5
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/factor/parallelanalysis.rb +122 -0
- data/lib/statsample/factor/pca.rb +23 -28
- data/lib/statsample/factor/principalaxis.rb +8 -3
- data/lib/statsample/matrix.rb +27 -24
- data/lib/statsample/mle.rb +11 -11
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression.rb +10 -8
- data/lib/statsample/regression/multiple/baseengine.rb +36 -25
- data/lib/statsample/regression/multiple/gslengine.rb +14 -0
- data/lib/statsample/regression/multiple/matrixengine.rb +4 -32
- data/lib/statsample/regression/multiple/rubyengine.rb +2 -6
- data/lib/statsample/regression/simple.rb +1 -1
- data/lib/statsample/reliability.rb +42 -54
- data/lib/statsample/test.rb +10 -6
- data/lib/statsample/test/f.rb +16 -26
- data/lib/statsample/test/levene.rb +4 -8
- data/lib/statsample/test/t.rb +30 -24
- data/lib/statsample/test/umannwhitney.rb +13 -6
- data/lib/statsample/vector.rb +86 -76
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +127 -94
- data/po/statsample.pot +114 -79
- data/test/test_anovaoneway.rb +27 -0
- data/test/test_anovawithvectors.rb +97 -0
- data/test/test_bivariate.rb +6 -57
- data/test/test_bivariate_polychoric.rb +65 -0
- data/test/test_crosstab.rb +6 -0
- data/test/test_dataset.rb +29 -1
- data/test/test_distribution.rb +6 -13
- data/test/test_dominance_analysis.rb +1 -1
- data/test/test_factor.rb +3 -3
- data/test/test_helpers.rb +18 -18
- data/test/test_matrix.rb +33 -20
- data/test/test_permutation.rb +36 -30
- data/test/test_regression.rb +26 -8
- data/test/test_reliability.rb +104 -14
- data/test/test_test_f.rb +11 -14
- data/test/test_test_t.rb +42 -35
- data/test/test_umannwhitney.rb +22 -10
- data/test/test_vector.rb +204 -102
- metadata +57 -81
- metadata.gz.sig +0 -0
- data/test/test_anova.rb +0 -24
data/lib/statsample/bivariate.rb
CHANGED
@@ -6,7 +6,7 @@ module Statsample
|
|
6
6
|
class << self
|
7
7
|
# Covariance between two vectors
|
8
8
|
def covariance(v1,v2)
|
9
|
-
v1a,v2a=Statsample.
|
9
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
10
10
|
return nil if v1a.size==0
|
11
11
|
if Statsample.has_gsl?
|
12
12
|
GSL::Stats::covariance(v1a.gsl, v2a.gsl)
|
@@ -16,7 +16,7 @@ module Statsample
|
|
16
16
|
end
|
17
17
|
# Estimate the ML between two dichotomic vectors
|
18
18
|
def maximum_likehood_dichotomic(pred,real)
|
19
|
-
preda,reala=Statsample.
|
19
|
+
preda,reala=Statsample.only_valid_clone(pred,real)
|
20
20
|
sum=0
|
21
21
|
pred.each_index{|i|
|
22
22
|
sum+=(real[i]*Math::log(pred[i])) + ((1-real[i])*Math::log(1-pred[i]))
|
@@ -29,14 +29,14 @@ module Statsample
|
|
29
29
|
sum_of_squares(v1a,v2a) / (v1a.size-1)
|
30
30
|
end
|
31
31
|
def sum_of_squares(v1,v2)
|
32
|
-
v1a,v2a=Statsample.
|
32
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
33
33
|
m1=v1a.mean
|
34
34
|
m2=v2a.mean
|
35
35
|
(v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
|
36
36
|
end
|
37
37
|
# Calculate Pearson correlation coefficient (r) between 2 vectors
|
38
38
|
def pearson(v1,v2)
|
39
|
-
v1a,v2a=Statsample.
|
39
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
40
40
|
return nil if v1a.size ==0
|
41
41
|
if Statsample.has_gsl?
|
42
42
|
GSL::Stats::correlation(v1a.gsl, v2a.gsl)
|
@@ -45,7 +45,7 @@ module Statsample
|
|
45
45
|
end
|
46
46
|
end
|
47
47
|
def pearson_slow(v1,v2) # :nodoc:
|
48
|
-
v1a,v2a=Statsample.
|
48
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
49
49
|
# Calculate sum of squares
|
50
50
|
ss=sum_of_squares(v1a,v2a)
|
51
51
|
ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
|
@@ -60,7 +60,7 @@ module Statsample
|
|
60
60
|
# Retrieves the value for t test for a pearson correlation
|
61
61
|
# between two vectors to test the null hypothesis of r=0
|
62
62
|
def t_pearson(v1,v2)
|
63
|
-
v1a,v2a=Statsample.
|
63
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
64
64
|
r=pearson(v1a,v2a)
|
65
65
|
if(r==1.0)
|
66
66
|
0
|
@@ -117,7 +117,7 @@ module Statsample
|
|
117
117
|
# Correlation between v1 and v2, controlling the effect of
|
118
118
|
# control on both.
|
119
119
|
def partial_correlation(v1,v2,control)
|
120
|
-
v1a,v2a,cona=Statsample.
|
120
|
+
v1a,v2a,cona=Statsample.only_valid_clone(v1,v2,control)
|
121
121
|
rv1v2=pearson(v1a,v2a)
|
122
122
|
rv1con=pearson(v1a,cona)
|
123
123
|
rv2con=pearson(v2a,cona)
|
@@ -129,13 +129,20 @@ module Statsample
|
|
129
129
|
# Order of rows and columns depends on Dataset#fields order
|
130
130
|
|
131
131
|
def covariance_matrix(ds)
|
132
|
+
cache={}
|
132
133
|
matrix=ds.collect_matrix do |row,col|
|
133
134
|
if (ds[row].type!=:scale or ds[col].type!=:scale)
|
134
135
|
nil
|
135
136
|
elsif row==col
|
136
137
|
ds[row].variance
|
137
138
|
else
|
138
|
-
|
139
|
+
if cache[[col,row]].nil?
|
140
|
+
cov=covariance(ds[row],ds[col])
|
141
|
+
cache[[row,col]]=cov
|
142
|
+
cov
|
143
|
+
else
|
144
|
+
cache[[col,row]]
|
145
|
+
end
|
139
146
|
end
|
140
147
|
end
|
141
148
|
matrix.extend CovariateMatrix
|
@@ -147,13 +154,20 @@ module Statsample
|
|
147
154
|
# Order of rows and columns depends on Dataset#fields order
|
148
155
|
|
149
156
|
def correlation_matrix(ds)
|
157
|
+
cache={}
|
150
158
|
cm=ds.collect_matrix do |row,col|
|
151
159
|
if row==col
|
152
160
|
1.0
|
153
161
|
elsif (ds[row].type!=:scale or ds[col].type!=:scale)
|
154
162
|
nil
|
155
163
|
else
|
156
|
-
|
164
|
+
if cache[[col,row]].nil?
|
165
|
+
r=pearson(ds[row],ds[col])
|
166
|
+
cache[[row,col]]=r
|
167
|
+
r
|
168
|
+
else
|
169
|
+
cache[[col,row]]
|
170
|
+
end
|
157
171
|
end
|
158
172
|
end
|
159
173
|
cm.extend(Statsample::CovariateMatrix)
|
@@ -167,7 +181,7 @@ module Statsample
|
|
167
181
|
if row==col
|
168
182
|
ds[row].valid_data.size
|
169
183
|
else
|
170
|
-
rowa,rowb=Statsample.
|
184
|
+
rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
|
171
185
|
rowa.size
|
172
186
|
end
|
173
187
|
end
|
@@ -179,7 +193,7 @@ module Statsample
|
|
179
193
|
def correlation_probability_matrix(ds, tails=:both)
|
180
194
|
rows=ds.fields.collect do |row|
|
181
195
|
ds.fields.collect do |col|
|
182
|
-
v1a,v2a=Statsample.
|
196
|
+
v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col])
|
183
197
|
(row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
|
184
198
|
end
|
185
199
|
end
|
@@ -188,7 +202,7 @@ module Statsample
|
|
188
202
|
|
189
203
|
# Spearman ranked correlation coefficient (rho) between 2 vectors
|
190
204
|
def spearman(v1,v2)
|
191
|
-
v1a,v2a=Statsample.
|
205
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
192
206
|
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
|
193
207
|
pearson(v1r,v2r)
|
194
208
|
end
|
@@ -206,7 +220,7 @@ module Statsample
|
|
206
220
|
# Kendall Rank Correlation Coefficient.
|
207
221
|
# Based on Hervé Abdi's article
|
208
222
|
def tau_a(v1,v2)
|
209
|
-
v1a,v2a=Statsample.
|
223
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
210
224
|
n=v1.size
|
211
225
|
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
|
212
226
|
o1=ordered_pairs(v1r)
|
@@ -10,18 +10,29 @@ module Statsample
|
|
10
10
|
# Polychoric correlation matrix.
|
11
11
|
# Order of rows and columns depends on Dataset#fields order
|
12
12
|
def self.polychoric_correlation_matrix(ds)
|
13
|
-
|
13
|
+
cache={}
|
14
|
+
matrix=ds.collect_matrix do |row,col|
|
14
15
|
if row==col
|
15
16
|
1.0
|
16
17
|
else
|
17
18
|
begin
|
18
|
-
|
19
|
+
if cache[[col,row]].nil?
|
20
|
+
poly=polychoric(ds[row],ds[col])
|
21
|
+
cache[[row,col]]=poly
|
22
|
+
poly
|
23
|
+
else
|
24
|
+
cache[[col,row]]
|
25
|
+
end
|
19
26
|
rescue RuntimeError
|
20
27
|
nil
|
21
28
|
end
|
22
29
|
end
|
23
30
|
end
|
31
|
+
matrix.extend CovariateMatrix
|
32
|
+
matrix.fields=ds.fields
|
33
|
+
matrix
|
24
34
|
end
|
35
|
+
|
25
36
|
# = Polychoric correlation.
|
26
37
|
#
|
27
38
|
# The <em>polychoric</em> correlation is a measure of
|
@@ -83,6 +94,7 @@ module Statsample
|
|
83
94
|
|
84
95
|
|
85
96
|
# Method of calculation of polychoric series.
|
97
|
+
# <tt>:two_step</tt> used by default.
|
86
98
|
#
|
87
99
|
# :two_step:: two-step ML, based on code by Gegenfurtner(1992).
|
88
100
|
# :polychoric_series:: polychoric series estimate, using
|
@@ -107,7 +119,7 @@ module Statsample
|
|
107
119
|
EPSILON=1e-6
|
108
120
|
MINIMIZER_TYPE_TWO_STEP="brent"
|
109
121
|
MINIMIZER_TYPE_JOINT="nmsimplex"
|
110
|
-
def new_with_vectors(v1,v2)
|
122
|
+
def self.new_with_vectors(v1,v2)
|
111
123
|
Polychoric.new(Crosstab.new(v1,v2).to_matrix)
|
112
124
|
end
|
113
125
|
# Params:
|
@@ -249,6 +261,7 @@ module Statsample
|
|
249
261
|
b=(j==@nc-1) ? 100: beta[j]
|
250
262
|
#puts "a:#{a} b:#{b}"
|
251
263
|
pd[i][j]=Distribution::NormalBivariate.cdf(a, b, rho)
|
264
|
+
|
252
265
|
end
|
253
266
|
pc[i][j] = pd[i][j]
|
254
267
|
pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
|
@@ -256,7 +269,7 @@ module Statsample
|
|
256
269
|
pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
|
257
270
|
res= pd[i][j]
|
258
271
|
#puts "i:#{i} | j:#{j} | ac: #{sprintf("%0.4f", pc[i][j])} | pd: #{sprintf("%0.4f", pd[i][j])} | res:#{sprintf("%0.4f", res)}"
|
259
|
-
if (res
|
272
|
+
if (res<=0)
|
260
273
|
# puts "Correccion"
|
261
274
|
res=1e-16
|
262
275
|
end
|
@@ -328,7 +341,7 @@ module Statsample
|
|
328
341
|
min.epsilon=@epsilon
|
329
342
|
min.expected=0
|
330
343
|
min.iterate
|
331
|
-
@log+=min.log
|
344
|
+
@log+=min.log.to_table.to_s
|
332
345
|
@r=min.x_minimum
|
333
346
|
@loglike_model=-min.f_minimum
|
334
347
|
puts @log if @debug
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -4,50 +4,44 @@ module Statsample
|
|
4
4
|
# The first vector will be at rows and the second will be the columns
|
5
5
|
#
|
6
6
|
class Crosstab
|
7
|
-
include
|
8
|
-
bindtextdomain("statsample")
|
7
|
+
include Summarizable
|
9
8
|
attr_reader :v_rows, :v_cols
|
10
9
|
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
|
11
10
|
def initialize(v1, v2, opts=Hash.new)
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
if (!@row_label.nil? and !@column_label.nil?)
|
25
|
-
@name=_("Crosstab %s - %s") % [@row_label, @column_label]
|
26
|
-
else
|
27
|
-
@name=_("Crosstab")
|
28
|
-
end
|
29
|
-
end
|
11
|
+
raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
|
12
|
+
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
|
13
|
+
@v_rows, @v_cols=Statsample.only_valid_clone(v1,v2)
|
14
|
+
@cases=@v_rows.size
|
15
|
+
@row_label=v1.name
|
16
|
+
@column_label=v2.name
|
17
|
+
@name=nil
|
18
|
+
@percentage_row=@percentage_column=@percentage_total=false
|
19
|
+
opts.each{|k,v|
|
20
|
+
self.send("#{k}=",v) if self.respond_to? k
|
21
|
+
}
|
22
|
+
@name||=_("Crosstab %s - %s") % [@row_label, @column_label]
|
30
23
|
end
|
31
24
|
def rows_names
|
32
|
-
|
25
|
+
@v_rows.factors.sort
|
33
26
|
end
|
34
27
|
def cols_names
|
35
|
-
|
28
|
+
@v_cols.factors.sort
|
36
29
|
end
|
37
30
|
def rows_total
|
38
|
-
|
31
|
+
@v_rows.frequencies
|
39
32
|
end
|
40
33
|
def cols_total
|
41
|
-
|
34
|
+
@v_cols.frequencies
|
42
35
|
end
|
36
|
+
|
43
37
|
def frequencies
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
38
|
+
base=rows_names.inject([]){|s,row|
|
39
|
+
s+=cols_names.collect{|col| [row,col]}
|
40
|
+
}.inject({}) {|s,par|
|
41
|
+
s[par]=0
|
42
|
+
s
|
43
|
+
}
|
44
|
+
base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
|
51
45
|
end
|
52
46
|
def to_matrix
|
53
47
|
f=frequencies
|
@@ -93,52 +87,50 @@ module Statsample
|
|
93
87
|
def cols_empty_hash
|
94
88
|
cols_names.inject({}) {|a,x| a[x]=0;a}
|
95
89
|
end
|
96
|
-
def report_building(
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
90
|
+
def report_building(builder)
|
91
|
+
builder.section(:name=>@name) do |generator|
|
92
|
+
fq=frequencies
|
93
|
+
rn=rows_names
|
94
|
+
cn=cols_names
|
95
|
+
total=0
|
96
|
+
total_cols=cols_empty_hash
|
97
|
+
generator.text "Chi Square: #{chi_square}"
|
98
|
+
generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
|
99
|
+
generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
|
100
|
+
|
101
|
+
t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
|
102
|
+
rn.each do |row|
|
103
|
+
total_row=0
|
104
|
+
t_row=[@v_rows.labeling(row)]
|
105
|
+
cn.each do |col|
|
106
|
+
data=fq[[row,col]]
|
107
|
+
total_row+=fq[[row,col]]
|
108
|
+
total+=fq[[row,col]]
|
109
|
+
total_cols[col]+=fq[[row,col]]
|
110
|
+
t_row.push(data)
|
111
|
+
end
|
112
|
+
t_row.push(total_row)
|
113
|
+
t.row(t_row)
|
114
|
+
end
|
115
|
+
t.hr
|
116
|
+
t_row=[_("Total")]
|
117
|
+
cn.each do |v|
|
118
|
+
t_row.push(total_cols[v])
|
118
119
|
end
|
119
|
-
t_row.push(
|
120
|
+
t_row.push(total)
|
120
121
|
t.row(t_row)
|
122
|
+
generator.parse_element(t)
|
123
|
+
|
124
|
+
if(@percentage_row)
|
125
|
+
table_percentage(generator,:row)
|
126
|
+
end
|
127
|
+
if(@percentage_column)
|
128
|
+
table_percentage(generator,:column)
|
129
|
+
end
|
130
|
+
if(@percentage_total)
|
131
|
+
table_percentage(generator,:total)
|
132
|
+
end
|
121
133
|
end
|
122
|
-
t.hr
|
123
|
-
t_row=[_("Total")]
|
124
|
-
cn.each do |v|
|
125
|
-
t_row.push(total_cols[v])
|
126
|
-
end
|
127
|
-
t_row.push(total)
|
128
|
-
t.row(t_row)
|
129
|
-
generator.parse_element(t)
|
130
|
-
|
131
|
-
if(@percentage_row)
|
132
|
-
table_percentage(generator,:row)
|
133
|
-
end
|
134
|
-
if(@percentage_column)
|
135
|
-
table_percentage(generator,:column)
|
136
|
-
end
|
137
|
-
if(@percentage_total)
|
138
|
-
table_percentage(generator,:total)
|
139
|
-
end
|
140
|
-
|
141
|
-
generator.html("</div>")
|
142
134
|
end
|
143
135
|
|
144
136
|
|
data/lib/statsample/dataset.rb
CHANGED
@@ -56,16 +56,17 @@ module Statsample
|
|
56
56
|
|
57
57
|
class Dataset
|
58
58
|
include Writable
|
59
|
+
include Summarizable
|
59
60
|
# Hash of Statsample::Vector
|
60
61
|
attr_reader :vectors
|
61
62
|
# Ordered names of vectors
|
62
63
|
attr_reader :fields
|
64
|
+
# Name of dataset
|
65
|
+
attr_accessor:name
|
63
66
|
# Number of cases
|
64
67
|
attr_reader :cases
|
65
68
|
# Location of pointer on enumerations methods (like #each)
|
66
69
|
attr_reader :i
|
67
|
-
# Deprecated: Label of vectors
|
68
|
-
attr_accessor :labels
|
69
70
|
|
70
71
|
# Generates a new dataset, using three vectors
|
71
72
|
# - Rows
|
@@ -122,10 +123,12 @@ module Statsample
|
|
122
123
|
# [fields] Array of names for vectors. It is only used to set the
|
123
124
|
# order of variables. If empty, vector keys in alphabetical order are
|
124
125
|
# used as fields
|
125
|
-
# [labels] Hash to set names for fields.
|
126
126
|
|
127
127
|
#
|
128
|
-
def initialize(vectors={}, fields=[]
|
128
|
+
def initialize(vectors={}, fields=[])
|
129
|
+
@@n_dataset||=0
|
130
|
+
@@n_dataset+=1
|
131
|
+
@name=_("Dataset %d") % @@n_dataset
|
129
132
|
if vectors.instance_of? Array
|
130
133
|
@fields=vectors.dup
|
131
134
|
@vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
|
@@ -137,7 +140,6 @@ module Statsample
|
|
137
140
|
check_length
|
138
141
|
end
|
139
142
|
@i=nil
|
140
|
-
@labels=labels
|
141
143
|
end
|
142
144
|
def to_gsl_matrix
|
143
145
|
matrix=GSL::Matrix.alloc(cases,@vectors.size)
|
@@ -146,11 +148,7 @@ module Statsample
|
|
146
148
|
end
|
147
149
|
matrix
|
148
150
|
end
|
149
|
-
|
150
|
-
def label(v_id)
|
151
|
-
raise "Vector #{v} doesn't exists" unless @fields.include? v_id
|
152
|
-
@labels[v_id].nil? ? v_id : @labels[v_id]
|
153
|
-
end
|
151
|
+
|
154
152
|
# Creates a copy of the given dataset, deleting all the cases with
|
155
153
|
# missing data on one of the vectors
|
156
154
|
def dup_only_valid
|
@@ -172,7 +170,8 @@ module Statsample
|
|
172
170
|
@fields.slice(@fields.index(from)..@fields.index(to))
|
173
171
|
end
|
174
172
|
# Returns a duplicate of the Database
|
175
|
-
# If fields given, only include those vectors
|
173
|
+
# If fields given, only include those vectors.
|
174
|
+
# Every vector will be dup
|
176
175
|
def dup(*fields_to_include)
|
177
176
|
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
178
177
|
fields_to_include=fields_to_include[0]
|
@@ -180,14 +179,27 @@ module Statsample
|
|
180
179
|
fields_to_include=@fields if fields_to_include.size==0
|
181
180
|
vectors={}
|
182
181
|
fields=[]
|
183
|
-
new_labels={}
|
184
182
|
fields_to_include.each{|f|
|
185
183
|
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
186
184
|
vectors[f]=@vectors[f].dup
|
187
|
-
new_labels[f]=@labels[f]
|
188
185
|
fields.push(f)
|
189
186
|
}
|
190
|
-
Dataset.new(vectors,fields
|
187
|
+
Dataset.new(vectors,fields)
|
188
|
+
end
|
189
|
+
# Returns a shallow copy of Dataset.
|
190
|
+
# Object id will be distinct, but @vectors will be the same.
|
191
|
+
def clone(*fields_to_include)
|
192
|
+
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
193
|
+
fields_to_include=fields_to_include[0]
|
194
|
+
end
|
195
|
+
fields_to_include=@fields.dup if fields_to_include.size==0
|
196
|
+
ds=Dataset.new
|
197
|
+
fields_to_include.each{|f|
|
198
|
+
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
199
|
+
ds[f]=@vectors[f]
|
200
|
+
}
|
201
|
+
ds.fields=fields_to_include
|
202
|
+
ds
|
191
203
|
end
|
192
204
|
# Creates a copy of the given dataset, without data on vectors
|
193
205
|
def dup_empty
|
@@ -195,7 +207,7 @@ module Statsample
|
|
195
207
|
a[v[0]]=v[1].dup_empty
|
196
208
|
a
|
197
209
|
}
|
198
|
-
Dataset.new(vectors,@fields.dup
|
210
|
+
Dataset.new(vectors,@fields.dup)
|
199
211
|
end
|
200
212
|
# Merge vectors from two datasets
|
201
213
|
# In case of name collision, the vector names are changed to
|
@@ -216,14 +228,14 @@ module Statsample
|
|
216
228
|
ds_new.update_valid_data
|
217
229
|
ds_new
|
218
230
|
end
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
231
|
+
# Returns a dataset with standarized data
|
232
|
+
def standarize
|
233
|
+
ds=dup()
|
234
|
+
ds.fields.each do |f|
|
235
|
+
ds[f]=ds[f].vector_standarized
|
236
|
+
end
|
237
|
+
ds
|
238
|
+
end
|
227
239
|
# Generate a matrix, based on fields of dataset
|
228
240
|
def collect_matrix
|
229
241
|
rows=@fields.collect{|row|
|
@@ -233,7 +245,7 @@ module Statsample
|
|
233
245
|
}
|
234
246
|
Matrix.rows(rows)
|
235
247
|
end
|
236
|
-
# We have the same datasets if
|
248
|
+
# We have the same datasets if vectors and fields are the same
|
237
249
|
def ==(d2)
|
238
250
|
@vectors==d2.vectors and @fields==d2.fields
|
239
251
|
end
|
@@ -305,12 +317,12 @@ module Statsample
|
|
305
317
|
@vectors.delete(name)
|
306
318
|
end
|
307
319
|
|
308
|
-
def add_vectors_by_split_recode(
|
309
|
-
split=@vectors[
|
320
|
+
def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
|
321
|
+
split=@vectors[name_].split_by_separator(sep)
|
310
322
|
i=1
|
311
323
|
split.each{|k,v|
|
312
|
-
new_field=
|
313
|
-
|
324
|
+
new_field=name_+join+i.to_s
|
325
|
+
v.name=name_+":"+k
|
314
326
|
add_vector(new_field,v)
|
315
327
|
i+=1
|
316
328
|
}
|
@@ -505,15 +517,13 @@ module Statsample
|
|
505
517
|
end
|
506
518
|
# Returns the vector named i
|
507
519
|
def[](i)
|
508
|
-
if i.is_a?
|
509
|
-
raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
|
510
|
-
@vectors[i]
|
511
|
-
elsif i.is_a? Range
|
520
|
+
if i.is_a? Range
|
512
521
|
fields=from_to(i.begin,i.end)
|
513
522
|
vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
|
514
523
|
ds=Dataset.new(vectors,fields)
|
515
524
|
else
|
516
|
-
raise
|
525
|
+
raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
|
526
|
+
@vectors[i]
|
517
527
|
end
|
518
528
|
end
|
519
529
|
# Retrieves a Statsample::Vector, based on the result
|
@@ -702,7 +712,7 @@ module Statsample
|
|
702
712
|
vr
|
703
713
|
end
|
704
714
|
def to_s
|
705
|
-
"#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"]
|
715
|
+
"#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
|
706
716
|
end
|
707
717
|
def inspect
|
708
718
|
self.to_s
|
@@ -779,17 +789,14 @@ module Statsample
|
|
779
789
|
ds
|
780
790
|
end
|
781
791
|
|
782
|
-
def
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
}
|
792
|
-
out
|
792
|
+
def report_building(b)
|
793
|
+
b.section(:name=>@name) do |g|
|
794
|
+
g.text _"Cases: %d" % cases
|
795
|
+
|
796
|
+
@fields.each do |f|
|
797
|
+
g.parse_element(@vectors[f])
|
798
|
+
end
|
799
|
+
end
|
793
800
|
end
|
794
801
|
def as_r
|
795
802
|
require 'rsruby/dataframe'
|