statsample 0.18.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +23 -0
- data/Manifest.txt +28 -17
- data/Rakefile +3 -2
- data/benchmarks/correlation_matrix_15_variables.rb +31 -0
- data/benchmarks/correlation_matrix_5_variables.rb +32 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/examples/boxplot.rb +13 -14
- data/examples/correlation_matrix.rb +16 -8
- data/examples/dataset.rb +13 -4
- data/examples/dominance_analysis.rb +23 -17
- data/examples/dominance_analysis_bootstrap.rb +28 -22
- data/examples/histogram.rb +8 -9
- data/examples/icc.rb +20 -21
- data/examples/levene.rb +10 -4
- data/examples/multiple_regression.rb +9 -28
- data/examples/multivariate_correlation.rb +9 -3
- data/examples/parallel_analysis.rb +20 -16
- data/examples/polychoric.rb +15 -9
- data/examples/principal_axis.rb +18 -6
- data/examples/reliability.rb +26 -13
- data/examples/scatterplot.rb +10 -6
- data/examples/t_test.rb +15 -6
- data/examples/tetrachoric.rb +9 -2
- data/examples/u_test.rb +12 -4
- data/examples/vector.rb +13 -2
- data/examples/velicer_map_test.rb +33 -26
- data/lib/statsample.rb +32 -12
- data/lib/statsample/analysis.rb +79 -0
- data/lib/statsample/analysis/suite.rb +72 -0
- data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
- data/lib/statsample/bivariate.rb +70 -16
- data/lib/statsample/dataset.rb +25 -19
- data/lib/statsample/dominanceanalysis.rb +2 -2
- data/lib/statsample/factor.rb +2 -0
- data/lib/statsample/factor/map.rb +16 -10
- data/lib/statsample/factor/parallelanalysis.rb +9 -3
- data/lib/statsample/factor/pca.rb +28 -32
- data/lib/statsample/factor/rotation.rb +15 -8
- data/lib/statsample/graph/boxplot.rb +3 -4
- data/lib/statsample/graph/histogram.rb +2 -1
- data/lib/statsample/graph/scatterplot.rb +1 -0
- data/lib/statsample/matrix.rb +106 -16
- data/lib/statsample/regression.rb +4 -1
- data/lib/statsample/regression/binomial.rb +1 -1
- data/lib/statsample/regression/multiple/baseengine.rb +19 -9
- data/lib/statsample/regression/multiple/gslengine.rb +127 -126
- data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
- data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
- data/lib/statsample/regression/simple.rb +31 -6
- data/lib/statsample/reliability.rb +11 -3
- data/lib/statsample/reliability/scaleanalysis.rb +4 -4
- data/lib/statsample/shorthand.rb +81 -0
- data/lib/statsample/test/chisquare.rb +1 -1
- data/lib/statsample/vector.rb +163 -163
- data/lib/statsample/vector/gsl.rb +106 -0
- data/references.txt +2 -2
- data/{data → test/fixtures}/crime.txt +0 -0
- data/{data → test/fixtures}/hartman_23.matrix +0 -0
- data/{data → test/fixtures}/repeated_fields.csv +0 -0
- data/{data → test/fixtures}/test_binomial.csv +0 -0
- data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
- data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
- data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
- data/{data → test/fixtures}/tetmat_test.txt +0 -0
- data/test/helpers_tests.rb +18 -2
- data/test/test_analysis.rb +118 -0
- data/test/test_anovatwoway.rb +1 -1
- data/test/test_anovatwowaywithdataset.rb +1 -1
- data/test/test_anovawithvectors.rb +1 -2
- data/test/test_bartlettsphericity.rb +1 -2
- data/test/test_bivariate.rb +64 -22
- data/test/test_codification.rb +1 -2
- data/test/test_crosstab.rb +1 -2
- data/test/test_csv.rb +3 -4
- data/test/test_dataset.rb +24 -3
- data/test/test_dominance_analysis.rb +1 -2
- data/test/test_factor.rb +8 -69
- data/test/test_factor_map.rb +43 -0
- data/test/test_factor_pa.rb +54 -0
- data/test/test_ggobi.rb +1 -1
- data/test/test_gsl.rb +12 -18
- data/test/test_histogram.rb +1 -2
- data/test/test_logit.rb +62 -18
- data/test/test_matrix.rb +4 -5
- data/test/test_mle.rb +3 -4
- data/test/test_regression.rb +21 -2
- data/test/test_reliability.rb +3 -3
- data/test/test_reliability_icc.rb +1 -1
- data/test/test_reliability_skillscale.rb +20 -4
- data/test/test_resample.rb +1 -2
- data/test/test_rserve_extension.rb +1 -2
- data/test/test_srs.rb +1 -2
- data/test/test_statistics.rb +1 -2
- data/test/test_stest.rb +1 -2
- data/test/test_stratified.rb +1 -2
- data/test/test_test_f.rb +1 -2
- data/test/test_test_t.rb +1 -2
- data/test/test_umannwhitney.rb +1 -2
- data/test/test_vector.rb +117 -18
- data/test/test_xls.rb +2 -3
- data/web/Rakefile +39 -0
- metadata +109 -29
- metadata.gz.sig +0 -0
- data/examples/parallel_analysis_tetrachoric.rb +0 -31
- data/lib/distribution.rb +0 -25
- data/lib/distribution/chisquare.rb +0 -23
- data/lib/distribution/f.rb +0 -35
- data/lib/distribution/normal.rb +0 -60
- data/lib/distribution/normalbivariate.rb +0 -284
- data/lib/distribution/normalmultivariate.rb +0 -73
- data/lib/distribution/t.rb +0 -55
- data/test/test_distribution.rb +0 -73
data/lib/statsample/bivariate.rb
CHANGED
@@ -1,7 +1,4 @@
|
|
1
1
|
require 'statsample/bivariate/pearson'
|
2
|
-
|
3
|
-
|
4
|
-
|
5
2
|
module Statsample
|
6
3
|
# Diverse methods and classes to calculate bivariate relations
|
7
4
|
# Specific classes:
|
@@ -11,7 +8,6 @@ module Statsample
|
|
11
8
|
module Bivariate
|
12
9
|
autoload(:Polychoric, 'statsample/bivariate/polychoric')
|
13
10
|
autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
|
14
|
-
|
15
11
|
class << self
|
16
12
|
# Covariance between two vectors
|
17
13
|
def covariance(v1,v2)
|
@@ -27,8 +23,8 @@ module Statsample
|
|
27
23
|
def maximum_likehood_dichotomic(pred,real)
|
28
24
|
preda,reala=Statsample.only_valid_clone(pred,real)
|
29
25
|
sum=0
|
30
|
-
|
31
|
-
sum+=(
|
26
|
+
preda.each_index{|i|
|
27
|
+
sum+=(reala[i]*Math::log(preda[i])) + ((1-reala[i])*Math::log(1-preda[i]))
|
32
28
|
}
|
33
29
|
sum
|
34
30
|
end
|
@@ -101,6 +97,20 @@ module Statsample
|
|
101
97
|
cdf*n_tails
|
102
98
|
end
|
103
99
|
end
|
100
|
+
|
101
|
+
|
102
|
+
# Predicted time for pairwise correlation matrix, in milliseconds
|
103
|
+
# See benchmarks/correlation_matrix_methods/correlation_matrix.rb for how this estimate was derived
|
104
|
+
|
105
|
+
def prediction_pairwise(vars,cases)
|
106
|
+
((-0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars)**2) / 100
|
107
|
+
end
|
108
|
+
# Predicted time for optimized correlation matrix, in milliseconds
|
109
|
+
# See benchmarks/correlation_matrix_methods/correlation_matrix.rb for how this estimate was derived
|
110
|
+
|
111
|
+
def prediction_optimized(vars,cases)
|
112
|
+
((4+0.018128*cases+0.246871*vars+0.001169*vars*cases)**2) / 100
|
113
|
+
end
|
104
114
|
# Returns residual score after delete variance
|
105
115
|
# from another variable
|
106
116
|
#
|
@@ -128,10 +138,35 @@ module Statsample
|
|
128
138
|
|
129
139
|
end
|
130
140
|
|
141
|
+
def covariance_matrix_optimized(ds)
|
142
|
+
x=ds.to_gsl
|
143
|
+
n=x.row_size
|
144
|
+
m=x.column_size
|
145
|
+
means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
|
146
|
+
centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
|
147
|
+
ss=centered.transpose*centered
|
148
|
+
s=((1/(n-1).to_f))*ss
|
149
|
+
s
|
150
|
+
end
|
151
|
+
|
131
152
|
# Covariance matrix.
|
132
153
|
# Order of rows and columns depends on Dataset#fields order
|
133
154
|
|
134
155
|
def covariance_matrix(ds)
|
156
|
+
vars,cases=ds.fields.size,ds.cases
|
157
|
+
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
158
|
+
cm=covariance_matrix_optimized(ds)
|
159
|
+
else
|
160
|
+
cm=covariance_matrix_pairwise(ds)
|
161
|
+
|
162
|
+
end
|
163
|
+
cm.extend(Statsample::CovariateMatrix)
|
164
|
+
cm.fields=ds.fields
|
165
|
+
cm
|
166
|
+
end
|
167
|
+
|
168
|
+
|
169
|
+
def covariance_matrix_pairwise(ds)
|
135
170
|
cache={}
|
136
171
|
matrix=ds.collect_matrix do |row,col|
|
137
172
|
if (ds[row].type!=:scale or ds[col].type!=:scale)
|
@@ -148,15 +183,34 @@ module Statsample
|
|
148
183
|
end
|
149
184
|
end
|
150
185
|
end
|
151
|
-
matrix.extend CovariateMatrix
|
152
|
-
matrix.fields=ds.fields
|
153
186
|
matrix
|
154
187
|
end
|
155
188
|
|
156
189
|
# Correlation matrix.
|
157
190
|
# Order of rows and columns depends on Dataset#fields order
|
158
|
-
|
159
191
|
def correlation_matrix(ds)
|
192
|
+
vars,cases=ds.fields.size,ds.cases
|
193
|
+
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
194
|
+
cm=correlation_matrix_optimized(ds)
|
195
|
+
else
|
196
|
+
cm=correlation_matrix_pairwise(ds)
|
197
|
+
end
|
198
|
+
cm.extend(Statsample::CovariateMatrix)
|
199
|
+
cm.fields=ds.fields
|
200
|
+
cm
|
201
|
+
end
|
202
|
+
|
203
|
+
def correlation_matrix_optimized(ds)
|
204
|
+
s=covariance_matrix_optimized(ds)
|
205
|
+
sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
|
206
|
+
cm=sds*s*sds
|
207
|
+
# Fix diagonal
|
208
|
+
s.row_size.times {|i|
|
209
|
+
cm[i,i]=1.0
|
210
|
+
}
|
211
|
+
cm
|
212
|
+
end
|
213
|
+
def correlation_matrix_pairwise(ds)
|
160
214
|
cache={}
|
161
215
|
cm=ds.collect_matrix do |row,col|
|
162
216
|
if row==col
|
@@ -173,9 +227,6 @@ module Statsample
|
|
173
227
|
end
|
174
228
|
end
|
175
229
|
end
|
176
|
-
cm.extend(Statsample::CovariateMatrix)
|
177
|
-
cm.fields=ds.fields
|
178
|
-
cm
|
179
230
|
end
|
180
231
|
|
181
232
|
# Retrieves the n valid pairwise.
|
@@ -220,7 +271,7 @@ module Statsample
|
|
220
271
|
m1=ds.filter_field('c') {|c| c['d']!=f0}
|
221
272
|
((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
|
222
273
|
end
|
223
|
-
# Kendall Rank Correlation Coefficient
|
274
|
+
# Kendall Rank Correlation Coefficient (Tau a)
|
224
275
|
# Based on Hervé Abdi's article
|
225
276
|
def tau_a(v1,v2)
|
226
277
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
@@ -231,12 +282,15 @@ module Statsample
|
|
231
282
|
delta= o1.size*2-(o2 & o1).size*2
|
232
283
|
1-(delta * 2 / (n*(n-1)).to_f)
|
233
284
|
end
|
234
|
-
# Calculates Tau b correlation.
|
235
|
-
#
|
285
|
+
# Calculates Goodman and Kruskal’s Tau b correlation.
|
286
|
+
# Tb is an asymmetric P-R-E measure of association for nominal scales
|
287
|
+
# (Mielke, X)
|
288
|
+
#
|
236
289
|
# Tau-b defines perfect association as strict monotonicity. Although it
|
237
290
|
# requires strict monotonicity to reach 1.0, it does not penalize ties as
|
238
291
|
# much as some other measures.
|
239
|
-
#
|
292
|
+
# == Reference
|
293
|
+
# Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA.
|
240
294
|
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
|
241
295
|
def tau_b(matrix)
|
242
296
|
v=pairs(matrix)
|
data/lib/statsample/dataset.rb
CHANGED
@@ -115,6 +115,10 @@ module Statsample
|
|
115
115
|
ds.update_valid_data
|
116
116
|
ds
|
117
117
|
end
|
118
|
+
# Return true if any vector has missing data
|
119
|
+
def has_missing_data?
|
120
|
+
@vectors.any? {|k,v| v.has_missing_data?}
|
121
|
+
end
|
118
122
|
# Creates a new dataset. A dataset is a set of ordered named vectors
|
119
123
|
# of the same size.
|
120
124
|
#
|
@@ -128,6 +132,10 @@ module Statsample
|
|
128
132
|
@@n_dataset||=0
|
129
133
|
@@n_dataset+=1
|
130
134
|
@name=_("Dataset %d") % @@n_dataset
|
135
|
+
@cases=0
|
136
|
+
@gsl=nil
|
137
|
+
@i=nil
|
138
|
+
|
131
139
|
if vectors.instance_of? Array
|
132
140
|
@fields=vectors.dup
|
133
141
|
@vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
|
@@ -138,17 +146,6 @@ module Statsample
|
|
138
146
|
check_order
|
139
147
|
check_length
|
140
148
|
end
|
141
|
-
@i=nil
|
142
|
-
end
|
143
|
-
#
|
144
|
-
# Returns a GSL::matrix
|
145
|
-
#
|
146
|
-
def to_gsl_matrix
|
147
|
-
matrix=GSL::Matrix.alloc(cases,@vectors.size)
|
148
|
-
each_array do |row|
|
149
|
-
row.each_index{|y| matrix.set(@i,y,row[y]) }
|
150
|
-
end
|
151
|
-
matrix
|
152
149
|
end
|
153
150
|
#
|
154
151
|
# Creates a copy of the given dataset, deleting all the cases with
|
@@ -375,6 +372,7 @@ module Statsample
|
|
375
372
|
# Check vectors and fields after inserting data. Use only
|
376
373
|
# after #add_case_array or #add_case with the second parameter set to false
|
377
374
|
def update_valid_data
|
375
|
+
@gsl=nil
|
378
376
|
@fields.each{|f| @vectors[f].set_valid_data}
|
379
377
|
check_length
|
380
378
|
end
|
@@ -491,7 +489,6 @@ module Statsample
|
|
491
489
|
size=v.size
|
492
490
|
else
|
493
491
|
if v.size!=size
|
494
|
-
p v.to_a.size
|
495
492
|
raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
|
496
493
|
end
|
497
494
|
end
|
@@ -629,7 +626,6 @@ module Statsample
|
|
629
626
|
end
|
630
627
|
# Recode a vector based on a block
|
631
628
|
def recode!(vector_name)
|
632
|
-
|
633
629
|
0.upto(@cases-1) {|i|
|
634
630
|
@vectors[vector_name].data[i]=yield case_as_hash(i)
|
635
631
|
}
|
@@ -658,13 +654,23 @@ module Statsample
|
|
658
654
|
end
|
659
655
|
|
660
656
|
if Statsample.has_gsl?
|
661
|
-
def
|
662
|
-
|
663
|
-
self.each_array{|c|
|
664
|
-
rows.push(c)
|
665
|
-
}
|
666
|
-
GSL::Matrix.alloc(*rows)
|
657
|
+
def clear_gsl
|
658
|
+
@gsl=nil
|
667
659
|
end
|
660
|
+
|
661
|
+
def to_gsl
|
662
|
+
if @gsl.nil?
|
663
|
+
if cases.nil?
|
664
|
+
update_valid_data
|
665
|
+
end
|
666
|
+
@gsl=GSL::Matrix.alloc(cases,fields.size)
|
667
|
+
self.each_array{|c|
|
668
|
+
@gsl.set_row(@i,c)
|
669
|
+
}
|
670
|
+
end
|
671
|
+
@gsl
|
672
|
+
end
|
673
|
+
|
668
674
|
end
|
669
675
|
|
670
676
|
# Return a correlation matrix for fields included as parameters.
|
@@ -107,8 +107,8 @@ module Statsample
|
|
107
107
|
else
|
108
108
|
@regression_class= UNIVARIATE_REGRESSION_CLASS
|
109
109
|
@method_association=:r2
|
110
|
-
|
111
110
|
end
|
111
|
+
|
112
112
|
@name=nil
|
113
113
|
opts.each{|k,v|
|
114
114
|
self.send("#{k}=",v) if self.respond_to? k
|
@@ -117,7 +117,7 @@ module Statsample
|
|
117
117
|
@dependent=[@dependent] unless @dependent.is_a? Array
|
118
118
|
|
119
119
|
@predictors ||= input.fields-@dependent
|
120
|
-
|
120
|
+
|
121
121
|
@name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
|
122
122
|
|
123
123
|
if input.is_a? Statsample::Dataset
|
data/lib/statsample/factor.rb
CHANGED
@@ -41,8 +41,10 @@ module Statsample
|
|
41
41
|
aicm
|
42
42
|
end
|
43
43
|
def self.anti_image_correlation_matrix(matrix)
|
44
|
+
matrix=matrix.to_matrix
|
44
45
|
s=Matrix.diag(*(matrix.inverse.diagonal)).sqrt.inverse
|
45
46
|
aicm=s*matrix.inverse*s
|
47
|
+
|
46
48
|
aicm.extend(Statsample::CovariateMatrix)
|
47
49
|
aicm.fields=matrix.fields if matrix.respond_to? :fields
|
48
50
|
aicm
|
@@ -48,32 +48,37 @@ module Statsample
|
|
48
48
|
attr_reader :fm
|
49
49
|
# Smallest average squared correlation
|
50
50
|
attr_reader :minfm
|
51
|
+
|
52
|
+
attr_accessor :use_gsl
|
51
53
|
def self.with_dataset(ds,opts=Hash.new)
|
52
54
|
new(ds.correlation_matrix,opts)
|
53
55
|
end
|
54
56
|
def initialize(matrix, opts=Hash.new)
|
55
57
|
@matrix=matrix
|
56
58
|
opts_default={
|
59
|
+
:use_gsl=>true,
|
57
60
|
:name=>_("Velicer's MAP")
|
58
61
|
}
|
59
62
|
@opts=opts_default.merge(opts)
|
60
63
|
opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
|
61
64
|
end
|
62
65
|
def compute
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
+
gsl_m=(use_gsl and Statsample.has_gsl?) ? @matrix.to_gsl : @matrix
|
67
|
+
klass_m=gsl_m.class
|
68
|
+
eigvect,@eigenvalues=gsl_m.eigenvectors_matrix, gsl_m.eigenvalues
|
69
|
+
eigenvalues_sqrt=@eigenvalues.collect {|v| Math.sqrt(v)}
|
70
|
+
loadings=eigvect*(klass_m.diagonal(*eigenvalues_sqrt))
|
66
71
|
fm=Array.new(@matrix.row_size)
|
67
72
|
ncol=@matrix.column_size
|
68
|
-
|
73
|
+
|
74
|
+
fm[0]=(gsl_m.mssq - ncol).quo(ncol*(ncol-1))
|
75
|
+
|
69
76
|
(ncol-1).times do |m|
|
70
77
|
puts "MAP:Eigenvalue #{m+1}" if $DEBUG
|
71
78
|
a=loadings[0..(loadings.row_size-1),0..m]
|
72
|
-
partcov=
|
73
|
-
|
74
|
-
|
75
|
-
}
|
76
|
-
d=Matrix.diag(*pc_prediag)
|
79
|
+
partcov= gsl_m - (a*a.transpose)
|
80
|
+
|
81
|
+
d=klass_m.diagonal(*(partcov.diagonal.collect {|v| Math::sqrt(1/v)}))
|
77
82
|
pr=d*partcov*d
|
78
83
|
fm[m+1]=(pr.mssq-ncol).quo(ncol*(ncol-1))
|
79
84
|
end
|
@@ -81,7 +86,7 @@ module Statsample
|
|
81
86
|
nfactors=0
|
82
87
|
@errors=[]
|
83
88
|
fm.each_with_index do |v,s|
|
84
|
-
if v.is_a? Complex
|
89
|
+
if defined?(Complex) and v.is_a? ::Complex
|
85
90
|
@errors.push(s)
|
86
91
|
else
|
87
92
|
if v < minfm
|
@@ -93,6 +98,7 @@ module Statsample
|
|
93
98
|
@number_of_factors=nfactors
|
94
99
|
@fm=fm
|
95
100
|
@minfm=minfm
|
101
|
+
|
96
102
|
end
|
97
103
|
def report_building(g) #:nodoc:
|
98
104
|
g.section(:name=>@name) do |s|
|
@@ -58,7 +58,7 @@ module Statsample
|
|
58
58
|
attr_accessor :no_data
|
59
59
|
# Show extra information if true
|
60
60
|
attr_accessor :debug
|
61
|
-
|
61
|
+
attr_accessor :use_gsl
|
62
62
|
def initialize(ds, opts=Hash.new)
|
63
63
|
@ds=ds
|
64
64
|
@fields=@ds.fields
|
@@ -74,6 +74,7 @@ module Statsample
|
|
74
74
|
:no_data=>false,
|
75
75
|
:matrix_method=>:correlation_matrix
|
76
76
|
}
|
77
|
+
@use_gsl=Statsample.has_gsl?
|
77
78
|
@opts=opts_default.merge(opts)
|
78
79
|
@opts[:matrix_method]==:correlation_matrix if @opts[:bootstrap_method]==:parameters
|
79
80
|
opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
|
@@ -120,11 +121,12 @@ module Statsample
|
|
120
121
|
# Perform calculation. Shouldn't be called directly by the user
|
121
122
|
def compute
|
122
123
|
|
124
|
+
|
123
125
|
@original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data
|
124
126
|
@ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v})
|
125
127
|
@ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale}
|
126
128
|
if bootstrap_method==:parameter or bootstrap_method==:random
|
127
|
-
rng = Distribution::Normal.
|
129
|
+
rng = Distribution::Normal.rng
|
128
130
|
end
|
129
131
|
|
130
132
|
@iterations.times do |i|
|
@@ -132,16 +134,20 @@ module Statsample
|
|
132
134
|
puts "#{@name}: Iteration #{i}" if $DEBUG or debug
|
133
135
|
# Create a dataset of dummy values
|
134
136
|
ds_bootstrap=Statsample::Dataset.new(@ds.fields)
|
137
|
+
|
135
138
|
@fields.each do |f|
|
136
139
|
if bootstrap_method==:random
|
137
140
|
ds_bootstrap[f]=@n_cases.times.map {|c| rng.call}.to_scale
|
138
141
|
elsif bootstrap_method==:data
|
139
|
-
ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases)
|
142
|
+
ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases)
|
140
143
|
else
|
141
144
|
raise "bootstrap_method doesn't recogniced"
|
142
145
|
end
|
143
146
|
end
|
147
|
+
ds_bootstrap.update_valid_data
|
148
|
+
|
144
149
|
matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
|
150
|
+
matrix=matrix.to_gsl if @use_gsl
|
145
151
|
if smc
|
146
152
|
smc_v=matrix.inverse.diagonal.map{|ii| 1-(1.quo(ii))}
|
147
153
|
smc_v.each_with_index do |v,ii|
|
@@ -50,7 +50,7 @@ module Factor
|
|
50
50
|
attr_accessor :summary_parallel_analysis
|
51
51
|
# Type of rotation. By default, Statsample::Factor::Rotation::Varimax
|
52
52
|
attr_accessor :rotation_type
|
53
|
-
attr_accessor :
|
53
|
+
attr_accessor :matrix_type
|
54
54
|
def initialize(matrix, opts=Hash.new)
|
55
55
|
@use_gsl=nil
|
56
56
|
@name=_("Principal Component Analysis")
|
@@ -58,7 +58,7 @@ module Factor
|
|
58
58
|
@n_variables=@matrix.column_size
|
59
59
|
@variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)}
|
60
60
|
|
61
|
-
@
|
61
|
+
@matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation
|
62
62
|
|
63
63
|
@m=nil
|
64
64
|
|
@@ -103,30 +103,45 @@ module Factor
|
|
103
103
|
# So, i=variable, j=component
|
104
104
|
def feature_matrix(m=nil)
|
105
105
|
m||=@m
|
106
|
-
|
107
|
-
|
108
|
-
|
106
|
+
if @use_gsl
|
107
|
+
omega_m=GSL::Matrix.zeros(@n_variables,m)
|
108
|
+
ev=eigenvectors
|
109
|
+
m.times do |i|
|
110
|
+
omega_m.set_column(i,ev[i])
|
111
|
+
end
|
112
|
+
omega_m
|
113
|
+
else
|
114
|
+
omega_m=::Matrix.build(@n_variables, m) {0}
|
115
|
+
m.times do |i|
|
116
|
+
omega_m.column= i, @eigenpairs[i][1]
|
117
|
+
end
|
118
|
+
omega_m
|
109
119
|
end
|
110
|
-
omega_m
|
111
120
|
end
|
112
121
|
# Returns Principal Components for +input+ matrix or dataset
|
113
122
|
# The number of PC to return is equal to parameter +m+.
|
114
|
-
# If +m+ isn't set, m set to number of PCs selected at object creation.
|
123
|
+
# If +m+ isn't set, it defaults to the number of PCs selected at object creation.
|
124
|
+
# Use covariance matrix
|
125
|
+
|
115
126
|
def principal_components(input, m=nil)
|
116
|
-
|
117
|
-
|
127
|
+
if @use_gsl
|
128
|
+
data_matrix=input.to_gsl
|
129
|
+
else
|
130
|
+
data_matrix=input.to_matrix
|
131
|
+
end
|
118
132
|
m||=@m
|
119
133
|
|
120
134
|
raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables
|
121
135
|
|
122
136
|
fv=feature_matrix(m)
|
123
137
|
pcs=(fv.transpose*data_matrix.transpose).transpose
|
138
|
+
|
124
139
|
pcs.extend Statsample::NamedMatrix
|
125
140
|
pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)}
|
126
141
|
pcs.to_dataset
|
127
142
|
end
|
128
143
|
def component_matrix(m=nil)
|
129
|
-
var="component_matrix_#{
|
144
|
+
var="component_matrix_#{matrix_type}"
|
130
145
|
send(var,m)
|
131
146
|
end
|
132
147
|
# Matrix with correlations between components and
|
@@ -141,7 +156,7 @@ module Factor
|
|
141
156
|
cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i])
|
142
157
|
}
|
143
158
|
}
|
144
|
-
cm.extend
|
159
|
+
cm.extend NamedMatrix
|
145
160
|
cm.name=_("Component matrix (from covariance)")
|
146
161
|
cm.fields_x = @variables_names
|
147
162
|
cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
|
@@ -187,32 +202,13 @@ module Factor
|
|
187
202
|
end
|
188
203
|
def eigenvectors
|
189
204
|
@eigenpairs.collect {|c|
|
190
|
-
c[1].
|
205
|
+
@use_gsl ? c[1].to_gsl : c[1].to_vector
|
191
206
|
}
|
192
207
|
end
|
193
208
|
def calculate_eigenpairs
|
194
|
-
|
195
|
-
calculate_eigenpairs_gsl
|
196
|
-
else
|
197
|
-
calculate_eigenpairs_ruby
|
198
|
-
end
|
209
|
+
@eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby
|
199
210
|
end
|
200
211
|
|
201
|
-
def calculate_eigenpairs_ruby #:nodoc:
|
202
|
-
@eigenpairs = @matrix.eigenpairs_ruby
|
203
|
-
end
|
204
|
-
# Eigenvectors calculated with gsl
|
205
|
-
# Note: The signs of some vectors could be different of
|
206
|
-
# ruby generated
|
207
|
-
def calculate_eigenpairs_gsl #:nodoc:
|
208
|
-
eigval, eigvec= GSL::Eigen.symmv(@matrix.to_gsl)
|
209
|
-
#puts "***"
|
210
|
-
ep=eigval.size.times.map {|i|
|
211
|
-
ev=eigvec.get_col(i)
|
212
|
-
[eigval[i], ev]
|
213
|
-
}
|
214
|
-
@eigenpairs=ep.sort{|a,b| a[0]<=>b[0]}.reverse
|
215
|
-
end
|
216
212
|
|
217
213
|
def report_building(builder) # :nodoc:
|
218
214
|
builder.section(:name=>@name) do |generator|
|