statsample 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +20 -1
- data/Manifest.txt +8 -1
- data/README.txt +11 -7
- data/Rakefile +2 -2
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/examples/dataset.rb +8 -0
- data/examples/multiple_regression.rb +1 -1
- data/examples/parallel_analysis.rb +29 -0
- data/examples/parallel_analysis_tetrachoric.rb +30 -0
- data/examples/vector.rb +6 -0
- data/lib/distribution.rb +16 -6
- data/lib/distribution/normal.rb +27 -20
- data/lib/distribution/normalbivariate.rb +1 -1
- data/lib/statsample.rb +19 -2
- data/lib/statsample/anova.rb +118 -16
- data/lib/statsample/bivariate.rb +27 -13
- data/lib/statsample/bivariate/polychoric.rb +18 -5
- data/lib/statsample/crosstab.rb +66 -74
- data/lib/statsample/dataset.rb +52 -45
- data/lib/statsample/dominanceanalysis.rb +2 -5
- data/lib/statsample/factor.rb +1 -1
- data/lib/statsample/factor/parallelanalysis.rb +122 -0
- data/lib/statsample/factor/pca.rb +23 -28
- data/lib/statsample/factor/principalaxis.rb +8 -3
- data/lib/statsample/matrix.rb +27 -24
- data/lib/statsample/mle.rb +11 -11
- data/lib/statsample/permutation.rb +2 -1
- data/lib/statsample/regression.rb +10 -8
- data/lib/statsample/regression/multiple/baseengine.rb +36 -25
- data/lib/statsample/regression/multiple/gslengine.rb +14 -0
- data/lib/statsample/regression/multiple/matrixengine.rb +4 -32
- data/lib/statsample/regression/multiple/rubyengine.rb +2 -6
- data/lib/statsample/regression/simple.rb +1 -1
- data/lib/statsample/reliability.rb +42 -54
- data/lib/statsample/test.rb +10 -6
- data/lib/statsample/test/f.rb +16 -26
- data/lib/statsample/test/levene.rb +4 -8
- data/lib/statsample/test/t.rb +30 -24
- data/lib/statsample/test/umannwhitney.rb +13 -6
- data/lib/statsample/vector.rb +86 -76
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +127 -94
- data/po/statsample.pot +114 -79
- data/test/test_anovaoneway.rb +27 -0
- data/test/test_anovawithvectors.rb +97 -0
- data/test/test_bivariate.rb +6 -57
- data/test/test_bivariate_polychoric.rb +65 -0
- data/test/test_crosstab.rb +6 -0
- data/test/test_dataset.rb +29 -1
- data/test/test_distribution.rb +6 -13
- data/test/test_dominance_analysis.rb +1 -1
- data/test/test_factor.rb +3 -3
- data/test/test_helpers.rb +18 -18
- data/test/test_matrix.rb +33 -20
- data/test/test_permutation.rb +36 -30
- data/test/test_regression.rb +26 -8
- data/test/test_reliability.rb +104 -14
- data/test/test_test_f.rb +11 -14
- data/test/test_test_t.rb +42 -35
- data/test/test_umannwhitney.rb +22 -10
- data/test/test_vector.rb +204 -102
- metadata +57 -81
- metadata.gz.sig +0 -0
- data/test/test_anova.rb +0 -24
data/lib/statsample/bivariate.rb
CHANGED
@@ -6,7 +6,7 @@ module Statsample
|
|
6
6
|
class << self
|
7
7
|
# Covariance between two vectors
|
8
8
|
def covariance(v1,v2)
|
9
|
-
v1a,v2a=Statsample.
|
9
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
10
10
|
return nil if v1a.size==0
|
11
11
|
if Statsample.has_gsl?
|
12
12
|
GSL::Stats::covariance(v1a.gsl, v2a.gsl)
|
@@ -16,7 +16,7 @@ module Statsample
|
|
16
16
|
end
|
17
17
|
# Estimate the ML between two dichotomic vectors
|
18
18
|
def maximum_likehood_dichotomic(pred,real)
|
19
|
-
preda,reala=Statsample.
|
19
|
+
preda,reala=Statsample.only_valid_clone(pred,real)
|
20
20
|
sum=0
|
21
21
|
pred.each_index{|i|
|
22
22
|
sum+=(real[i]*Math::log(pred[i])) + ((1-real[i])*Math::log(1-pred[i]))
|
@@ -29,14 +29,14 @@ module Statsample
|
|
29
29
|
sum_of_squares(v1a,v2a) / (v1a.size-1)
|
30
30
|
end
|
31
31
|
def sum_of_squares(v1,v2)
|
32
|
-
v1a,v2a=Statsample.
|
32
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
33
33
|
m1=v1a.mean
|
34
34
|
m2=v2a.mean
|
35
35
|
(v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
|
36
36
|
end
|
37
37
|
# Calculate Pearson correlation coefficient (r) between 2 vectors
|
38
38
|
def pearson(v1,v2)
|
39
|
-
v1a,v2a=Statsample.
|
39
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
40
40
|
return nil if v1a.size ==0
|
41
41
|
if Statsample.has_gsl?
|
42
42
|
GSL::Stats::correlation(v1a.gsl, v2a.gsl)
|
@@ -45,7 +45,7 @@ module Statsample
|
|
45
45
|
end
|
46
46
|
end
|
47
47
|
def pearson_slow(v1,v2) # :nodoc:
|
48
|
-
v1a,v2a=Statsample.
|
48
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
49
49
|
# Calculate sum of squares
|
50
50
|
ss=sum_of_squares(v1a,v2a)
|
51
51
|
ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
|
@@ -60,7 +60,7 @@ module Statsample
|
|
60
60
|
# Retrieves the value for t test for a pearson correlation
|
61
61
|
# between two vectors to test the null hipothesis of r=0
|
62
62
|
def t_pearson(v1,v2)
|
63
|
-
v1a,v2a=Statsample.
|
63
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
64
64
|
r=pearson(v1a,v2a)
|
65
65
|
if(r==1.0)
|
66
66
|
0
|
@@ -117,7 +117,7 @@ module Statsample
|
|
117
117
|
# Correlation between v1 and v2, controling the effect of
|
118
118
|
# control on both.
|
119
119
|
def partial_correlation(v1,v2,control)
|
120
|
-
v1a,v2a,cona=Statsample.
|
120
|
+
v1a,v2a,cona=Statsample.only_valid_clone(v1,v2,control)
|
121
121
|
rv1v2=pearson(v1a,v2a)
|
122
122
|
rv1con=pearson(v1a,cona)
|
123
123
|
rv2con=pearson(v2a,cona)
|
@@ -129,13 +129,20 @@ module Statsample
|
|
129
129
|
# Order of rows and columns depends on Dataset#fields order
|
130
130
|
|
131
131
|
def covariance_matrix(ds)
|
132
|
+
cache={}
|
132
133
|
matrix=ds.collect_matrix do |row,col|
|
133
134
|
if (ds[row].type!=:scale or ds[col].type!=:scale)
|
134
135
|
nil
|
135
136
|
elsif row==col
|
136
137
|
ds[row].variance
|
137
138
|
else
|
138
|
-
|
139
|
+
if cache[[col,row]].nil?
|
140
|
+
cov=covariance(ds[row],ds[col])
|
141
|
+
cache[[row,col]]=cov
|
142
|
+
cov
|
143
|
+
else
|
144
|
+
cache[[col,row]]
|
145
|
+
end
|
139
146
|
end
|
140
147
|
end
|
141
148
|
matrix.extend CovariateMatrix
|
@@ -147,13 +154,20 @@ module Statsample
|
|
147
154
|
# Order of rows and columns depends on Dataset#fields order
|
148
155
|
|
149
156
|
def correlation_matrix(ds)
|
157
|
+
cache={}
|
150
158
|
cm=ds.collect_matrix do |row,col|
|
151
159
|
if row==col
|
152
160
|
1.0
|
153
161
|
elsif (ds[row].type!=:scale or ds[col].type!=:scale)
|
154
162
|
nil
|
155
163
|
else
|
156
|
-
|
164
|
+
if cache[[col,row]].nil?
|
165
|
+
r=pearson(ds[row],ds[col])
|
166
|
+
cache[[row,col]]=r
|
167
|
+
r
|
168
|
+
else
|
169
|
+
cache[[col,row]]
|
170
|
+
end
|
157
171
|
end
|
158
172
|
end
|
159
173
|
cm.extend(Statsample::CovariateMatrix)
|
@@ -167,7 +181,7 @@ module Statsample
|
|
167
181
|
if row==col
|
168
182
|
ds[row].valid_data.size
|
169
183
|
else
|
170
|
-
rowa,rowb=Statsample.
|
184
|
+
rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
|
171
185
|
rowa.size
|
172
186
|
end
|
173
187
|
end
|
@@ -179,7 +193,7 @@ module Statsample
|
|
179
193
|
def correlation_probability_matrix(ds, tails=:both)
|
180
194
|
rows=ds.fields.collect do |row|
|
181
195
|
ds.fields.collect do |col|
|
182
|
-
v1a,v2a=Statsample.
|
196
|
+
v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col])
|
183
197
|
(row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
|
184
198
|
end
|
185
199
|
end
|
@@ -188,7 +202,7 @@ module Statsample
|
|
188
202
|
|
189
203
|
# Spearman ranked correlation coefficient (rho) between 2 vectors
|
190
204
|
def spearman(v1,v2)
|
191
|
-
v1a,v2a=Statsample.
|
205
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
192
206
|
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
|
193
207
|
pearson(v1r,v2r)
|
194
208
|
end
|
@@ -206,7 +220,7 @@ module Statsample
|
|
206
220
|
# Kendall Rank Correlation Coefficient.
|
207
221
|
# Based on Hervé Adbi article
|
208
222
|
def tau_a(v1,v2)
|
209
|
-
v1a,v2a=Statsample.
|
223
|
+
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
210
224
|
n=v1.size
|
211
225
|
v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
|
212
226
|
o1=ordered_pairs(v1r)
|
@@ -10,18 +10,29 @@ module Statsample
|
|
10
10
|
# Polychoric correlation matrix.
|
11
11
|
# Order of rows and columns depends on Dataset#fields order
|
12
12
|
def self.polychoric_correlation_matrix(ds)
|
13
|
-
|
13
|
+
cache={}
|
14
|
+
matrix=ds.collect_matrix do |row,col|
|
14
15
|
if row==col
|
15
16
|
1.0
|
16
17
|
else
|
17
18
|
begin
|
18
|
-
|
19
|
+
if cache[[col,row]].nil?
|
20
|
+
poly=polychoric(ds[row],ds[col])
|
21
|
+
cache[[row,col]]=poly
|
22
|
+
poly
|
23
|
+
else
|
24
|
+
cache[[col,row]]
|
25
|
+
end
|
19
26
|
rescue RuntimeError
|
20
27
|
nil
|
21
28
|
end
|
22
29
|
end
|
23
30
|
end
|
31
|
+
matrix.extend CovariateMatrix
|
32
|
+
matrix.fields=ds.fields
|
33
|
+
matrix
|
24
34
|
end
|
35
|
+
|
25
36
|
# = Polychoric correlation.
|
26
37
|
#
|
27
38
|
# The <em>polychoric</em> correlation is a measure of
|
@@ -83,6 +94,7 @@ module Statsample
|
|
83
94
|
|
84
95
|
|
85
96
|
# Method of calculation of polychoric series.
|
97
|
+
# <tt>:two_step</tt> used by default.
|
86
98
|
#
|
87
99
|
# :two_step:: two-step ML, based on code by Gegenfurtner(1992).
|
88
100
|
# :polychoric_series:: polychoric series estimate, using
|
@@ -107,7 +119,7 @@ module Statsample
|
|
107
119
|
EPSILON=1e-6
|
108
120
|
MINIMIZER_TYPE_TWO_STEP="brent"
|
109
121
|
MINIMIZER_TYPE_JOINT="nmsimplex"
|
110
|
-
def new_with_vectors(v1,v2)
|
122
|
+
def self.new_with_vectors(v1,v2)
|
111
123
|
Polychoric.new(Crosstab.new(v1,v2).to_matrix)
|
112
124
|
end
|
113
125
|
# Params:
|
@@ -249,6 +261,7 @@ module Statsample
|
|
249
261
|
b=(j==@nc-1) ? 100: beta[j]
|
250
262
|
#puts "a:#{a} b:#{b}"
|
251
263
|
pd[i][j]=Distribution::NormalBivariate.cdf(a, b, rho)
|
264
|
+
|
252
265
|
end
|
253
266
|
pc[i][j] = pd[i][j]
|
254
267
|
pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
|
@@ -256,7 +269,7 @@ module Statsample
|
|
256
269
|
pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
|
257
270
|
res= pd[i][j]
|
258
271
|
#puts "i:#{i} | j:#{j} | ac: #{sprintf("%0.4f", pc[i][j])} | pd: #{sprintf("%0.4f", pd[i][j])} | res:#{sprintf("%0.4f", res)}"
|
259
|
-
if (res
|
272
|
+
if (res<=0)
|
260
273
|
# puts "Correccion"
|
261
274
|
res=1e-16
|
262
275
|
end
|
@@ -328,7 +341,7 @@ module Statsample
|
|
328
341
|
min.epsilon=@epsilon
|
329
342
|
min.expected=0
|
330
343
|
min.iterate
|
331
|
-
@log+=min.log
|
344
|
+
@log+=min.log.to_table.to_s
|
332
345
|
@r=min.x_minimum
|
333
346
|
@loglike_model=-min.f_minimum
|
334
347
|
puts @log if @debug
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -4,50 +4,44 @@ module Statsample
|
|
4
4
|
# The first vector will be at rows and the second will the the columns
|
5
5
|
#
|
6
6
|
class Crosstab
|
7
|
-
include
|
8
|
-
bindtextdomain("statsample")
|
7
|
+
include Summarizable
|
9
8
|
attr_reader :v_rows, :v_cols
|
10
9
|
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
|
11
10
|
def initialize(v1, v2, opts=Hash.new)
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
if (!@row_label.nil? and !@column_label.nil?)
|
25
|
-
@name=_("Crosstab %s - %s") % [@row_label, @column_label]
|
26
|
-
else
|
27
|
-
@name=_("Crosstab")
|
28
|
-
end
|
29
|
-
end
|
11
|
+
raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
|
12
|
+
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
|
13
|
+
@v_rows, @v_cols=Statsample.only_valid_clone(v1,v2)
|
14
|
+
@cases=@v_rows.size
|
15
|
+
@row_label=v1.name
|
16
|
+
@column_label=v2.name
|
17
|
+
@name=nil
|
18
|
+
@percentage_row=@percentage_column=@percentage_total=false
|
19
|
+
opts.each{|k,v|
|
20
|
+
self.send("#{k}=",v) if self.respond_to? k
|
21
|
+
}
|
22
|
+
@name||=_("Crosstab %s - %s") % [@row_label, @column_label]
|
30
23
|
end
|
31
24
|
def rows_names
|
32
|
-
|
25
|
+
@v_rows.factors.sort
|
33
26
|
end
|
34
27
|
def cols_names
|
35
|
-
|
28
|
+
@v_cols.factors.sort
|
36
29
|
end
|
37
30
|
def rows_total
|
38
|
-
|
31
|
+
@v_rows.frequencies
|
39
32
|
end
|
40
33
|
def cols_total
|
41
|
-
|
34
|
+
@v_cols.frequencies
|
42
35
|
end
|
36
|
+
|
43
37
|
def frequencies
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
38
|
+
base=rows_names.inject([]){|s,row|
|
39
|
+
s+=cols_names.collect{|col| [row,col]}
|
40
|
+
}.inject({}) {|s,par|
|
41
|
+
s[par]=0
|
42
|
+
s
|
43
|
+
}
|
44
|
+
base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
|
51
45
|
end
|
52
46
|
def to_matrix
|
53
47
|
f=frequencies
|
@@ -93,52 +87,50 @@ module Statsample
|
|
93
87
|
def cols_empty_hash
|
94
88
|
cols_names.inject({}) {|a,x| a[x]=0;a}
|
95
89
|
end
|
96
|
-
def report_building(
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
90
|
+
def report_building(builder)
|
91
|
+
builder.section(:name=>@name) do |generator|
|
92
|
+
fq=frequencies
|
93
|
+
rn=rows_names
|
94
|
+
cn=cols_names
|
95
|
+
total=0
|
96
|
+
total_cols=cols_empty_hash
|
97
|
+
generator.text "Chi Square: #{chi_square}"
|
98
|
+
generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
|
99
|
+
generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
|
100
|
+
|
101
|
+
t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
|
102
|
+
rn.each do |row|
|
103
|
+
total_row=0
|
104
|
+
t_row=[@v_rows.labeling(row)]
|
105
|
+
cn.each do |col|
|
106
|
+
data=fq[[row,col]]
|
107
|
+
total_row+=fq[[row,col]]
|
108
|
+
total+=fq[[row,col]]
|
109
|
+
total_cols[col]+=fq[[row,col]]
|
110
|
+
t_row.push(data)
|
111
|
+
end
|
112
|
+
t_row.push(total_row)
|
113
|
+
t.row(t_row)
|
114
|
+
end
|
115
|
+
t.hr
|
116
|
+
t_row=[_("Total")]
|
117
|
+
cn.each do |v|
|
118
|
+
t_row.push(total_cols[v])
|
118
119
|
end
|
119
|
-
t_row.push(
|
120
|
+
t_row.push(total)
|
120
121
|
t.row(t_row)
|
122
|
+
generator.parse_element(t)
|
123
|
+
|
124
|
+
if(@percentage_row)
|
125
|
+
table_percentage(generator,:row)
|
126
|
+
end
|
127
|
+
if(@percentage_column)
|
128
|
+
table_percentage(generator,:column)
|
129
|
+
end
|
130
|
+
if(@percentage_total)
|
131
|
+
table_percentage(generator,:total)
|
132
|
+
end
|
121
133
|
end
|
122
|
-
t.hr
|
123
|
-
t_row=[_("Total")]
|
124
|
-
cn.each do |v|
|
125
|
-
t_row.push(total_cols[v])
|
126
|
-
end
|
127
|
-
t_row.push(total)
|
128
|
-
t.row(t_row)
|
129
|
-
generator.parse_element(t)
|
130
|
-
|
131
|
-
if(@percentage_row)
|
132
|
-
table_percentage(generator,:row)
|
133
|
-
end
|
134
|
-
if(@percentage_column)
|
135
|
-
table_percentage(generator,:column)
|
136
|
-
end
|
137
|
-
if(@percentage_total)
|
138
|
-
table_percentage(generator,:total)
|
139
|
-
end
|
140
|
-
|
141
|
-
generator.html("</div>")
|
142
134
|
end
|
143
135
|
|
144
136
|
|
data/lib/statsample/dataset.rb
CHANGED
@@ -56,16 +56,17 @@ module Statsample
|
|
56
56
|
|
57
57
|
class Dataset
|
58
58
|
include Writable
|
59
|
+
include Summarizable
|
59
60
|
# Hash of Statsample::Vector
|
60
61
|
attr_reader :vectors
|
61
62
|
# Ordered names of vectors
|
62
63
|
attr_reader :fields
|
64
|
+
# Name of dataset
|
65
|
+
attr_accessor:name
|
63
66
|
# Number of cases
|
64
67
|
attr_reader :cases
|
65
68
|
# Location of pointer on enumerations methods (like #each)
|
66
69
|
attr_reader :i
|
67
|
-
# Deprecated: Label of vectors
|
68
|
-
attr_accessor :labels
|
69
70
|
|
70
71
|
# Generates a new dataset, using three vectors
|
71
72
|
# - Rows
|
@@ -122,10 +123,12 @@ module Statsample
|
|
122
123
|
# [fields] Array of names for vectors. Is only used for set the
|
123
124
|
# order of variables. If empty, vectors keys on alfabethic order as
|
124
125
|
# used as fields
|
125
|
-
# [labels] Hash to set names for fields.
|
126
126
|
|
127
127
|
#
|
128
|
-
def initialize(vectors={}, fields=[]
|
128
|
+
def initialize(vectors={}, fields=[])
|
129
|
+
@@n_dataset||=0
|
130
|
+
@@n_dataset+=1
|
131
|
+
@name=_("Dataset %d") % @@n_dataset
|
129
132
|
if vectors.instance_of? Array
|
130
133
|
@fields=vectors.dup
|
131
134
|
@vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
|
@@ -137,7 +140,6 @@ module Statsample
|
|
137
140
|
check_length
|
138
141
|
end
|
139
142
|
@i=nil
|
140
|
-
@labels=labels
|
141
143
|
end
|
142
144
|
def to_gsl_matrix
|
143
145
|
matrix=GSL::Matrix.alloc(cases,@vectors.size)
|
@@ -146,11 +148,7 @@ module Statsample
|
|
146
148
|
end
|
147
149
|
matrix
|
148
150
|
end
|
149
|
-
|
150
|
-
def label(v_id)
|
151
|
-
raise "Vector #{v} doesn't exists" unless @fields.include? v_id
|
152
|
-
@labels[v_id].nil? ? v_id : @labels[v_id]
|
153
|
-
end
|
151
|
+
|
154
152
|
# Creates a copy of the given dataset, deleting all the cases with
|
155
153
|
# missing data on one of the vectors
|
156
154
|
def dup_only_valid
|
@@ -172,7 +170,8 @@ module Statsample
|
|
172
170
|
@fields.slice(@fields.index(from)..@fields.index(to))
|
173
171
|
end
|
174
172
|
# Returns a duplicate of the Database
|
175
|
-
# If fields given, only include those vectors
|
173
|
+
# If fields given, only include those vectors.
|
174
|
+
# Every vector will be dup
|
176
175
|
def dup(*fields_to_include)
|
177
176
|
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
178
177
|
fields_to_include=fields_to_include[0]
|
@@ -180,14 +179,27 @@ module Statsample
|
|
180
179
|
fields_to_include=@fields if fields_to_include.size==0
|
181
180
|
vectors={}
|
182
181
|
fields=[]
|
183
|
-
new_labels={}
|
184
182
|
fields_to_include.each{|f|
|
185
183
|
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
186
184
|
vectors[f]=@vectors[f].dup
|
187
|
-
new_labels[f]=@labels[f]
|
188
185
|
fields.push(f)
|
189
186
|
}
|
190
|
-
Dataset.new(vectors,fields
|
187
|
+
Dataset.new(vectors,fields)
|
188
|
+
end
|
189
|
+
# Returns a shallow copy of Dataset.
|
190
|
+
# Object id will be distinct, but @vectors will be the same.
|
191
|
+
def clone(*fields_to_include)
|
192
|
+
if fields_to_include.size==1 and fields_to_include[0].is_a? Array
|
193
|
+
fields_to_include=fields_to_include[0]
|
194
|
+
end
|
195
|
+
fields_to_include=@fields.dup if fields_to_include.size==0
|
196
|
+
ds=Dataset.new
|
197
|
+
fields_to_include.each{|f|
|
198
|
+
raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
|
199
|
+
ds[f]=@vectors[f]
|
200
|
+
}
|
201
|
+
ds.fields=fields_to_include
|
202
|
+
ds
|
191
203
|
end
|
192
204
|
# Creates a copy of the given dataset, without data on vectors
|
193
205
|
def dup_empty
|
@@ -195,7 +207,7 @@ module Statsample
|
|
195
207
|
a[v[0]]=v[1].dup_empty
|
196
208
|
a
|
197
209
|
}
|
198
|
-
Dataset.new(vectors,@fields.dup
|
210
|
+
Dataset.new(vectors,@fields.dup)
|
199
211
|
end
|
200
212
|
# Merge vectors from two datasets
|
201
213
|
# In case of name collition, the vectors names are changed to
|
@@ -216,14 +228,14 @@ module Statsample
|
|
216
228
|
ds_new.update_valid_data
|
217
229
|
ds_new
|
218
230
|
end
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
231
|
+
# Returns a dataset with standarized data
|
232
|
+
def standarize
|
233
|
+
ds=dup()
|
234
|
+
ds.fields.each do |f|
|
235
|
+
ds[f]=ds[f].vector_standarized
|
236
|
+
end
|
237
|
+
ds
|
238
|
+
end
|
227
239
|
# Generate a matrix, based on fields of dataset
|
228
240
|
def collect_matrix
|
229
241
|
rows=@fields.collect{|row|
|
@@ -233,7 +245,7 @@ module Statsample
|
|
233
245
|
}
|
234
246
|
Matrix.rows(rows)
|
235
247
|
end
|
236
|
-
# We have the same datasets if
|
248
|
+
# We have the same datasets if vectors and fields are the same
|
237
249
|
def ==(d2)
|
238
250
|
@vectors==d2.vectors and @fields==d2.fields
|
239
251
|
end
|
@@ -305,12 +317,12 @@ module Statsample
|
|
305
317
|
@vectors.delete(name)
|
306
318
|
end
|
307
319
|
|
308
|
-
def add_vectors_by_split_recode(
|
309
|
-
split=@vectors[
|
320
|
+
def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
|
321
|
+
split=@vectors[name_].split_by_separator(sep)
|
310
322
|
i=1
|
311
323
|
split.each{|k,v|
|
312
|
-
new_field=
|
313
|
-
|
324
|
+
new_field=name_+join+i.to_s
|
325
|
+
v.name=name_+":"+k
|
314
326
|
add_vector(new_field,v)
|
315
327
|
i+=1
|
316
328
|
}
|
@@ -505,15 +517,13 @@ module Statsample
|
|
505
517
|
end
|
506
518
|
# Returns the vector named i
|
507
519
|
def[](i)
|
508
|
-
if i.is_a?
|
509
|
-
raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
|
510
|
-
@vectors[i]
|
511
|
-
elsif i.is_a? Range
|
520
|
+
if i.is_a? Range
|
512
521
|
fields=from_to(i.begin,i.end)
|
513
522
|
vectors=fields.inject({}) {|a,v| a[v]=@vectors[v];a}
|
514
523
|
ds=Dataset.new(vectors,fields)
|
515
524
|
else
|
516
|
-
raise
|
525
|
+
raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
|
526
|
+
@vectors[i]
|
517
527
|
end
|
518
528
|
end
|
519
529
|
# Retrieves a Statsample::Vector, based on the result
|
@@ -702,7 +712,7 @@ module Statsample
|
|
702
712
|
vr
|
703
713
|
end
|
704
714
|
def to_s
|
705
|
-
"#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"]
|
715
|
+
"#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
|
706
716
|
end
|
707
717
|
def inspect
|
708
718
|
self.to_s
|
@@ -779,17 +789,14 @@ module Statsample
|
|
779
789
|
ds
|
780
790
|
end
|
781
791
|
|
782
|
-
def
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
}
|
792
|
-
out
|
792
|
+
def report_building(b)
|
793
|
+
b.section(:name=>@name) do |g|
|
794
|
+
g.text _"Cases: %d" % cases
|
795
|
+
|
796
|
+
@fields.each do |f|
|
797
|
+
g.parse_element(@vectors[f])
|
798
|
+
end
|
799
|
+
end
|
793
800
|
end
|
794
801
|
def as_r
|
795
802
|
require 'rsruby/dataframe'
|