statsample 0.18.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +23 -0
- data/Manifest.txt +28 -17
- data/Rakefile +3 -2
- data/benchmarks/correlation_matrix_15_variables.rb +31 -0
- data/benchmarks/correlation_matrix_5_variables.rb +32 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +75 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/examples/boxplot.rb +13 -14
- data/examples/correlation_matrix.rb +16 -8
- data/examples/dataset.rb +13 -4
- data/examples/dominance_analysis.rb +23 -17
- data/examples/dominance_analysis_bootstrap.rb +28 -22
- data/examples/histogram.rb +8 -9
- data/examples/icc.rb +20 -21
- data/examples/levene.rb +10 -4
- data/examples/multiple_regression.rb +9 -28
- data/examples/multivariate_correlation.rb +9 -3
- data/examples/parallel_analysis.rb +20 -16
- data/examples/polychoric.rb +15 -9
- data/examples/principal_axis.rb +18 -6
- data/examples/reliability.rb +26 -13
- data/examples/scatterplot.rb +10 -6
- data/examples/t_test.rb +15 -6
- data/examples/tetrachoric.rb +9 -2
- data/examples/u_test.rb +12 -4
- data/examples/vector.rb +13 -2
- data/examples/velicer_map_test.rb +33 -26
- data/lib/statsample.rb +32 -12
- data/lib/statsample/analysis.rb +79 -0
- data/lib/statsample/analysis/suite.rb +72 -0
- data/lib/statsample/analysis/suitereportbuilder.rb +38 -0
- data/lib/statsample/bivariate.rb +70 -16
- data/lib/statsample/dataset.rb +25 -19
- data/lib/statsample/dominanceanalysis.rb +2 -2
- data/lib/statsample/factor.rb +2 -0
- data/lib/statsample/factor/map.rb +16 -10
- data/lib/statsample/factor/parallelanalysis.rb +9 -3
- data/lib/statsample/factor/pca.rb +28 -32
- data/lib/statsample/factor/rotation.rb +15 -8
- data/lib/statsample/graph/boxplot.rb +3 -4
- data/lib/statsample/graph/histogram.rb +2 -1
- data/lib/statsample/graph/scatterplot.rb +1 -0
- data/lib/statsample/matrix.rb +106 -16
- data/lib/statsample/regression.rb +4 -1
- data/lib/statsample/regression/binomial.rb +1 -1
- data/lib/statsample/regression/multiple/baseengine.rb +19 -9
- data/lib/statsample/regression/multiple/gslengine.rb +127 -126
- data/lib/statsample/regression/multiple/matrixengine.rb +8 -5
- data/lib/statsample/regression/multiple/rubyengine.rb +1 -1
- data/lib/statsample/regression/simple.rb +31 -6
- data/lib/statsample/reliability.rb +11 -3
- data/lib/statsample/reliability/scaleanalysis.rb +4 -4
- data/lib/statsample/shorthand.rb +81 -0
- data/lib/statsample/test/chisquare.rb +1 -1
- data/lib/statsample/vector.rb +163 -163
- data/lib/statsample/vector/gsl.rb +106 -0
- data/references.txt +2 -2
- data/{data → test/fixtures}/crime.txt +0 -0
- data/{data → test/fixtures}/hartman_23.matrix +0 -0
- data/{data → test/fixtures}/repeated_fields.csv +0 -0
- data/{data → test/fixtures}/test_binomial.csv +0 -0
- data/test/{test_csv.csv → fixtures/test_csv.csv} +0 -0
- data/test/{test_xls.xls → fixtures/test_xls.xls} +0 -0
- data/{data → test/fixtures}/tetmat_matrix.txt +0 -0
- data/{data → test/fixtures}/tetmat_test.txt +0 -0
- data/test/helpers_tests.rb +18 -2
- data/test/test_analysis.rb +118 -0
- data/test/test_anovatwoway.rb +1 -1
- data/test/test_anovatwowaywithdataset.rb +1 -1
- data/test/test_anovawithvectors.rb +1 -2
- data/test/test_bartlettsphericity.rb +1 -2
- data/test/test_bivariate.rb +64 -22
- data/test/test_codification.rb +1 -2
- data/test/test_crosstab.rb +1 -2
- data/test/test_csv.rb +3 -4
- data/test/test_dataset.rb +24 -3
- data/test/test_dominance_analysis.rb +1 -2
- data/test/test_factor.rb +8 -69
- data/test/test_factor_map.rb +43 -0
- data/test/test_factor_pa.rb +54 -0
- data/test/test_ggobi.rb +1 -1
- data/test/test_gsl.rb +12 -18
- data/test/test_histogram.rb +1 -2
- data/test/test_logit.rb +62 -18
- data/test/test_matrix.rb +4 -5
- data/test/test_mle.rb +3 -4
- data/test/test_regression.rb +21 -2
- data/test/test_reliability.rb +3 -3
- data/test/test_reliability_icc.rb +1 -1
- data/test/test_reliability_skillscale.rb +20 -4
- data/test/test_resample.rb +1 -2
- data/test/test_rserve_extension.rb +1 -2
- data/test/test_srs.rb +1 -2
- data/test/test_statistics.rb +1 -2
- data/test/test_stest.rb +1 -2
- data/test/test_stratified.rb +1 -2
- data/test/test_test_f.rb +1 -2
- data/test/test_test_t.rb +1 -2
- data/test/test_umannwhitney.rb +1 -2
- data/test/test_vector.rb +117 -18
- data/test/test_xls.rb +2 -3
- data/web/Rakefile +39 -0
- metadata +109 -29
- metadata.gz.sig +0 -0
- data/examples/parallel_analysis_tetrachoric.rb +0 -31
- data/lib/distribution.rb +0 -25
- data/lib/distribution/chisquare.rb +0 -23
- data/lib/distribution/f.rb +0 -35
- data/lib/distribution/normal.rb +0 -60
- data/lib/distribution/normalbivariate.rb +0 -284
- data/lib/distribution/normalmultivariate.rb +0 -73
- data/lib/distribution/t.rb +0 -55
- data/test/test_distribution.rb +0 -73
data/lib/statsample/bivariate.rb
CHANGED
|
@@ -1,7 +1,4 @@
|
|
|
1
1
|
require 'statsample/bivariate/pearson'
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
2
|
module Statsample
|
|
6
3
|
# Diverse methods and classes to calculate bivariate relations
|
|
7
4
|
# Specific classes:
|
|
@@ -11,7 +8,6 @@ module Statsample
|
|
|
11
8
|
module Bivariate
|
|
12
9
|
autoload(:Polychoric, 'statsample/bivariate/polychoric')
|
|
13
10
|
autoload(:Tetrachoric, 'statsample/bivariate/tetrachoric')
|
|
14
|
-
|
|
15
11
|
class << self
|
|
16
12
|
# Covariance between two vectors
|
|
17
13
|
def covariance(v1,v2)
|
|
@@ -27,8 +23,8 @@ module Statsample
|
|
|
27
23
|
def maximum_likehood_dichotomic(pred,real)
|
|
28
24
|
preda,reala=Statsample.only_valid_clone(pred,real)
|
|
29
25
|
sum=0
|
|
30
|
-
|
|
31
|
-
sum+=(
|
|
26
|
+
preda.each_index{|i|
|
|
27
|
+
sum+=(reala[i]*Math::log(preda[i])) + ((1-reala[i])*Math::log(1-preda[i]))
|
|
32
28
|
}
|
|
33
29
|
sum
|
|
34
30
|
end
|
|
@@ -101,6 +97,20 @@ module Statsample
|
|
|
101
97
|
cdf*n_tails
|
|
102
98
|
end
|
|
103
99
|
end
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Predicted time for pairwise correlation matrix, in milliseconds
|
|
103
|
+
# See benchmarks/correlation_matrix_methods/correlation_matrix.rb for the mode of calculation
|
|
104
|
+
|
|
105
|
+
def prediction_pairwise(vars,cases)
|
|
106
|
+
((-0.518111-0.000746*cases+1.235608*vars+0.000740*cases*vars)**2) / 100
|
|
107
|
+
end
|
|
108
|
+
# Predicted time for optimized correlation matrix, in milliseconds
|
|
109
|
+
# See benchmarks/correlation_matrix_methods/correlation_matrix.rb for the mode of calculation
|
|
110
|
+
|
|
111
|
+
def prediction_optimized(vars,cases)
|
|
112
|
+
((4+0.018128*cases+0.246871*vars+0.001169*vars*cases)**2) / 100
|
|
113
|
+
end
|
|
104
114
|
# Returns residual score after delete variance
|
|
105
115
|
# from another variable
|
|
106
116
|
#
|
|
@@ -128,10 +138,35 @@ module Statsample
|
|
|
128
138
|
|
|
129
139
|
end
|
|
130
140
|
|
|
141
|
+
def covariance_matrix_optimized(ds)
|
|
142
|
+
x=ds.to_gsl
|
|
143
|
+
n=x.row_size
|
|
144
|
+
m=x.column_size
|
|
145
|
+
means=((1/n.to_f)*GSL::Matrix.ones(1,n)*x).row(0)
|
|
146
|
+
centered=x-(GSL::Matrix.ones(n,m)*GSL::Matrix.diag(means))
|
|
147
|
+
ss=centered.transpose*centered
|
|
148
|
+
s=((1/(n-1).to_f))*ss
|
|
149
|
+
s
|
|
150
|
+
end
|
|
151
|
+
|
|
131
152
|
# Covariance matrix.
|
|
132
153
|
# Order of rows and columns depends on Dataset#fields order
|
|
133
154
|
|
|
134
155
|
def covariance_matrix(ds)
|
|
156
|
+
vars,cases=ds.fields.size,ds.cases
|
|
157
|
+
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
|
158
|
+
cm=covariance_matrix_optimized(ds)
|
|
159
|
+
else
|
|
160
|
+
cm=covariance_matrix_pairwise(ds)
|
|
161
|
+
|
|
162
|
+
end
|
|
163
|
+
cm.extend(Statsample::CovariateMatrix)
|
|
164
|
+
cm.fields=ds.fields
|
|
165
|
+
cm
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def covariance_matrix_pairwise(ds)
|
|
135
170
|
cache={}
|
|
136
171
|
matrix=ds.collect_matrix do |row,col|
|
|
137
172
|
if (ds[row].type!=:scale or ds[col].type!=:scale)
|
|
@@ -148,15 +183,34 @@ module Statsample
|
|
|
148
183
|
end
|
|
149
184
|
end
|
|
150
185
|
end
|
|
151
|
-
matrix.extend CovariateMatrix
|
|
152
|
-
matrix.fields=ds.fields
|
|
153
186
|
matrix
|
|
154
187
|
end
|
|
155
188
|
|
|
156
189
|
# Correlation matrix.
|
|
157
190
|
# Order of rows and columns depends on Dataset#fields order
|
|
158
|
-
|
|
159
191
|
def correlation_matrix(ds)
|
|
192
|
+
vars,cases=ds.fields.size,ds.cases
|
|
193
|
+
if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
|
|
194
|
+
cm=correlation_matrix_optimized(ds)
|
|
195
|
+
else
|
|
196
|
+
cm=correlation_matrix_pairwise(ds)
|
|
197
|
+
end
|
|
198
|
+
cm.extend(Statsample::CovariateMatrix)
|
|
199
|
+
cm.fields=ds.fields
|
|
200
|
+
cm
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
def correlation_matrix_optimized(ds)
|
|
204
|
+
s=covariance_matrix_optimized(ds)
|
|
205
|
+
sds=GSL::Matrix.diagonal(s.diagonal.sqrt.pow(-1))
|
|
206
|
+
cm=sds*s*sds
|
|
207
|
+
# Fix diagonal
|
|
208
|
+
s.row_size.times {|i|
|
|
209
|
+
cm[i,i]=1.0
|
|
210
|
+
}
|
|
211
|
+
cm
|
|
212
|
+
end
|
|
213
|
+
def correlation_matrix_pairwise(ds)
|
|
160
214
|
cache={}
|
|
161
215
|
cm=ds.collect_matrix do |row,col|
|
|
162
216
|
if row==col
|
|
@@ -173,9 +227,6 @@ module Statsample
|
|
|
173
227
|
end
|
|
174
228
|
end
|
|
175
229
|
end
|
|
176
|
-
cm.extend(Statsample::CovariateMatrix)
|
|
177
|
-
cm.fields=ds.fields
|
|
178
|
-
cm
|
|
179
230
|
end
|
|
180
231
|
|
|
181
232
|
# Retrieves the n valid pairwise.
|
|
@@ -220,7 +271,7 @@ module Statsample
|
|
|
220
271
|
m1=ds.filter_field('c') {|c| c['d']!=f0}
|
|
221
272
|
((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
|
|
222
273
|
end
|
|
223
|
-
# Kendall Rank Correlation Coefficient
|
|
274
|
+
# Kendall Rank Correlation Coefficient (Tau a)
|
|
224
275
|
# Based on Hervé Abdi's article
|
|
225
276
|
def tau_a(v1,v2)
|
|
226
277
|
v1a,v2a=Statsample.only_valid_clone(v1,v2)
|
|
@@ -231,12 +282,15 @@ module Statsample
|
|
|
231
282
|
delta= o1.size*2-(o2 & o1).size*2
|
|
232
283
|
1-(delta * 2 / (n*(n-1)).to_f)
|
|
233
284
|
end
|
|
234
|
-
# Calculates Tau b correlation.
|
|
235
|
-
#
|
|
285
|
+
# Calculates Goodman and Kruskal’s Tau b correlation.
|
|
286
|
+
# Tb is an asymmetric P-R-E measure of association for nominal scales
|
|
287
|
+
# (Mielke, X)
|
|
288
|
+
#
|
|
236
289
|
# Tau-b defines perfect association as strict monotonicity. Although it
|
|
237
290
|
# requires strict monotonicity to reach 1.0, it does not penalize ties as
|
|
238
291
|
# much as some other measures.
|
|
239
|
-
#
|
|
292
|
+
# == Reference
|
|
293
|
+
# Mielke, P. GOODMAN–KRUSKAL TAU AND GAMMA.
|
|
240
294
|
# Source: http://faculty.chass.ncsu.edu/garson/PA765/assocordinal.htm
|
|
241
295
|
def tau_b(matrix)
|
|
242
296
|
v=pairs(matrix)
|
data/lib/statsample/dataset.rb
CHANGED
|
@@ -115,6 +115,10 @@ module Statsample
|
|
|
115
115
|
ds.update_valid_data
|
|
116
116
|
ds
|
|
117
117
|
end
|
|
118
|
+
# Return true if any vector has missing data
|
|
119
|
+
def has_missing_data?
|
|
120
|
+
@vectors.any? {|k,v| v.has_missing_data?}
|
|
121
|
+
end
|
|
118
122
|
# Creates a new dataset. A dataset is a set of ordered named vectors
|
|
119
123
|
# of the same size.
|
|
120
124
|
#
|
|
@@ -128,6 +132,10 @@ module Statsample
|
|
|
128
132
|
@@n_dataset||=0
|
|
129
133
|
@@n_dataset+=1
|
|
130
134
|
@name=_("Dataset %d") % @@n_dataset
|
|
135
|
+
@cases=0
|
|
136
|
+
@gsl=nil
|
|
137
|
+
@i=nil
|
|
138
|
+
|
|
131
139
|
if vectors.instance_of? Array
|
|
132
140
|
@fields=vectors.dup
|
|
133
141
|
@vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
|
|
@@ -138,17 +146,6 @@ module Statsample
|
|
|
138
146
|
check_order
|
|
139
147
|
check_length
|
|
140
148
|
end
|
|
141
|
-
@i=nil
|
|
142
|
-
end
|
|
143
|
-
#
|
|
144
|
-
# Returns a GSL::matrix
|
|
145
|
-
#
|
|
146
|
-
def to_gsl_matrix
|
|
147
|
-
matrix=GSL::Matrix.alloc(cases,@vectors.size)
|
|
148
|
-
each_array do |row|
|
|
149
|
-
row.each_index{|y| matrix.set(@i,y,row[y]) }
|
|
150
|
-
end
|
|
151
|
-
matrix
|
|
152
149
|
end
|
|
153
150
|
#
|
|
154
151
|
# Creates a copy of the given dataset, deleting all the cases with
|
|
@@ -375,6 +372,7 @@ module Statsample
|
|
|
375
372
|
# Check vectors and fields after inserting data. Use only
|
|
376
373
|
# after #add_case_array or #add_case with second parameter to false
|
|
377
374
|
def update_valid_data
|
|
375
|
+
@gsl=nil
|
|
378
376
|
@fields.each{|f| @vectors[f].set_valid_data}
|
|
379
377
|
check_length
|
|
380
378
|
end
|
|
@@ -491,7 +489,6 @@ module Statsample
|
|
|
491
489
|
size=v.size
|
|
492
490
|
else
|
|
493
491
|
if v.size!=size
|
|
494
|
-
p v.to_a.size
|
|
495
492
|
raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
|
|
496
493
|
end
|
|
497
494
|
end
|
|
@@ -629,7 +626,6 @@ module Statsample
|
|
|
629
626
|
end
|
|
630
627
|
# Recode a vector based on a block
|
|
631
628
|
def recode!(vector_name)
|
|
632
|
-
|
|
633
629
|
0.upto(@cases-1) {|i|
|
|
634
630
|
@vectors[vector_name].data[i]=yield case_as_hash(i)
|
|
635
631
|
}
|
|
@@ -658,13 +654,23 @@ module Statsample
|
|
|
658
654
|
end
|
|
659
655
|
|
|
660
656
|
if Statsample.has_gsl?
|
|
661
|
-
def
|
|
662
|
-
|
|
663
|
-
self.each_array{|c|
|
|
664
|
-
rows.push(c)
|
|
665
|
-
}
|
|
666
|
-
GSL::Matrix.alloc(*rows)
|
|
657
|
+
def clear_gsl
|
|
658
|
+
@gsl=nil
|
|
667
659
|
end
|
|
660
|
+
|
|
661
|
+
def to_gsl
|
|
662
|
+
if @gsl.nil?
|
|
663
|
+
if cases.nil?
|
|
664
|
+
update_valid_data
|
|
665
|
+
end
|
|
666
|
+
@gsl=GSL::Matrix.alloc(cases,fields.size)
|
|
667
|
+
self.each_array{|c|
|
|
668
|
+
@gsl.set_row(@i,c)
|
|
669
|
+
}
|
|
670
|
+
end
|
|
671
|
+
@gsl
|
|
672
|
+
end
|
|
673
|
+
|
|
668
674
|
end
|
|
669
675
|
|
|
670
676
|
# Return a correlation matrix for fields included as parameters.
|
|
@@ -107,8 +107,8 @@ module Statsample
|
|
|
107
107
|
else
|
|
108
108
|
@regression_class= UNIVARIATE_REGRESSION_CLASS
|
|
109
109
|
@method_association=:r2
|
|
110
|
-
|
|
111
110
|
end
|
|
111
|
+
|
|
112
112
|
@name=nil
|
|
113
113
|
opts.each{|k,v|
|
|
114
114
|
self.send("#{k}=",v) if self.respond_to? k
|
|
@@ -117,7 +117,7 @@ module Statsample
|
|
|
117
117
|
@dependent=[@dependent] unless @dependent.is_a? Array
|
|
118
118
|
|
|
119
119
|
@predictors ||= input.fields-@dependent
|
|
120
|
-
|
|
120
|
+
|
|
121
121
|
@name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
|
|
122
122
|
|
|
123
123
|
if input.is_a? Statsample::Dataset
|
data/lib/statsample/factor.rb
CHANGED
|
@@ -41,8 +41,10 @@ module Statsample
|
|
|
41
41
|
aicm
|
|
42
42
|
end
|
|
43
43
|
def self.anti_image_correlation_matrix(matrix)
|
|
44
|
+
matrix=matrix.to_matrix
|
|
44
45
|
s=Matrix.diag(*(matrix.inverse.diagonal)).sqrt.inverse
|
|
45
46
|
aicm=s*matrix.inverse*s
|
|
47
|
+
|
|
46
48
|
aicm.extend(Statsample::CovariateMatrix)
|
|
47
49
|
aicm.fields=matrix.fields if matrix.respond_to? :fields
|
|
48
50
|
aicm
|
|
@@ -48,32 +48,37 @@ module Statsample
|
|
|
48
48
|
attr_reader :fm
|
|
49
49
|
# Smallest average squared correlation
|
|
50
50
|
attr_reader :minfm
|
|
51
|
+
|
|
52
|
+
attr_accessor :use_gsl
|
|
51
53
|
def self.with_dataset(ds,opts=Hash.new)
|
|
52
54
|
new(ds.correlation_matrix,opts)
|
|
53
55
|
end
|
|
54
56
|
def initialize(matrix, opts=Hash.new)
|
|
55
57
|
@matrix=matrix
|
|
56
58
|
opts_default={
|
|
59
|
+
:use_gsl=>true,
|
|
57
60
|
:name=>_("Velicer's MAP")
|
|
58
61
|
}
|
|
59
62
|
@opts=opts_default.merge(opts)
|
|
60
63
|
opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
|
|
61
64
|
end
|
|
62
65
|
def compute
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
+
gsl_m=(use_gsl and Statsample.has_gsl?) ? @matrix.to_gsl : @matrix
|
|
67
|
+
klass_m=gsl_m.class
|
|
68
|
+
eigvect,@eigenvalues=gsl_m.eigenvectors_matrix, gsl_m.eigenvalues
|
|
69
|
+
eigenvalues_sqrt=@eigenvalues.collect {|v| Math.sqrt(v)}
|
|
70
|
+
loadings=eigvect*(klass_m.diagonal(*eigenvalues_sqrt))
|
|
66
71
|
fm=Array.new(@matrix.row_size)
|
|
67
72
|
ncol=@matrix.column_size
|
|
68
|
-
|
|
73
|
+
|
|
74
|
+
fm[0]=(gsl_m.mssq - ncol).quo(ncol*(ncol-1))
|
|
75
|
+
|
|
69
76
|
(ncol-1).times do |m|
|
|
70
77
|
puts "MAP:Eigenvalue #{m+1}" if $DEBUG
|
|
71
78
|
a=loadings[0..(loadings.row_size-1),0..m]
|
|
72
|
-
partcov=
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
}
|
|
76
|
-
d=Matrix.diag(*pc_prediag)
|
|
79
|
+
partcov= gsl_m - (a*a.transpose)
|
|
80
|
+
|
|
81
|
+
d=klass_m.diagonal(*(partcov.diagonal.collect {|v| Math::sqrt(1/v)}))
|
|
77
82
|
pr=d*partcov*d
|
|
78
83
|
fm[m+1]=(pr.mssq-ncol).quo(ncol*(ncol-1))
|
|
79
84
|
end
|
|
@@ -81,7 +86,7 @@ module Statsample
|
|
|
81
86
|
nfactors=0
|
|
82
87
|
@errors=[]
|
|
83
88
|
fm.each_with_index do |v,s|
|
|
84
|
-
if v.is_a? Complex
|
|
89
|
+
if defined?(Complex) and v.is_a? ::Complex
|
|
85
90
|
@errors.push(s)
|
|
86
91
|
else
|
|
87
92
|
if v < minfm
|
|
@@ -93,6 +98,7 @@ module Statsample
|
|
|
93
98
|
@number_of_factors=nfactors
|
|
94
99
|
@fm=fm
|
|
95
100
|
@minfm=minfm
|
|
101
|
+
|
|
96
102
|
end
|
|
97
103
|
def report_building(g) #:nodoc:
|
|
98
104
|
g.section(:name=>@name) do |s|
|
|
@@ -58,7 +58,7 @@ module Statsample
|
|
|
58
58
|
attr_accessor :no_data
|
|
59
59
|
# Show extra information if true
|
|
60
60
|
attr_accessor :debug
|
|
61
|
-
|
|
61
|
+
attr_accessor :use_gsl
|
|
62
62
|
def initialize(ds, opts=Hash.new)
|
|
63
63
|
@ds=ds
|
|
64
64
|
@fields=@ds.fields
|
|
@@ -74,6 +74,7 @@ module Statsample
|
|
|
74
74
|
:no_data=>false,
|
|
75
75
|
:matrix_method=>:correlation_matrix
|
|
76
76
|
}
|
|
77
|
+
@use_gsl=Statsample.has_gsl?
|
|
77
78
|
@opts=opts_default.merge(opts)
|
|
78
79
|
@opts[:matrix_method]==:correlation_matrix if @opts[:bootstrap_method]==:parameters
|
|
79
80
|
opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
|
|
@@ -120,11 +121,12 @@ module Statsample
|
|
|
120
121
|
# Perform calculation. Shouldn't be called directly by the user
|
|
121
122
|
def compute
|
|
122
123
|
|
|
124
|
+
|
|
123
125
|
@original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data
|
|
124
126
|
@ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v})
|
|
125
127
|
@ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale}
|
|
126
128
|
if bootstrap_method==:parameter or bootstrap_method==:random
|
|
127
|
-
rng = Distribution::Normal.
|
|
129
|
+
rng = Distribution::Normal.rng
|
|
128
130
|
end
|
|
129
131
|
|
|
130
132
|
@iterations.times do |i|
|
|
@@ -132,16 +134,20 @@ module Statsample
|
|
|
132
134
|
puts "#{@name}: Iteration #{i}" if $DEBUG or debug
|
|
133
135
|
# Create a dataset of dummy values
|
|
134
136
|
ds_bootstrap=Statsample::Dataset.new(@ds.fields)
|
|
137
|
+
|
|
135
138
|
@fields.each do |f|
|
|
136
139
|
if bootstrap_method==:random
|
|
137
140
|
ds_bootstrap[f]=@n_cases.times.map {|c| rng.call}.to_scale
|
|
138
141
|
elsif bootstrap_method==:data
|
|
139
|
-
ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases)
|
|
142
|
+
ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases)
|
|
140
143
|
else
|
|
141
144
|
raise "bootstrap_method doesn't recogniced"
|
|
142
145
|
end
|
|
143
146
|
end
|
|
147
|
+
ds_bootstrap.update_valid_data
|
|
148
|
+
|
|
144
149
|
matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
|
|
150
|
+
matrix=matrix.to_gsl if @use_gsl
|
|
145
151
|
if smc
|
|
146
152
|
smc_v=matrix.inverse.diagonal.map{|ii| 1-(1.quo(ii))}
|
|
147
153
|
smc_v.each_with_index do |v,ii|
|
|
@@ -50,7 +50,7 @@ module Factor
|
|
|
50
50
|
attr_accessor :summary_parallel_analysis
|
|
51
51
|
# Type of rotation. By default, Statsample::Factor::Rotation::Varimax
|
|
52
52
|
attr_accessor :rotation_type
|
|
53
|
-
attr_accessor :
|
|
53
|
+
attr_accessor :matrix_type
|
|
54
54
|
def initialize(matrix, opts=Hash.new)
|
|
55
55
|
@use_gsl=nil
|
|
56
56
|
@name=_("Principal Component Analysis")
|
|
@@ -58,7 +58,7 @@ module Factor
|
|
|
58
58
|
@n_variables=@matrix.column_size
|
|
59
59
|
@variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)}
|
|
60
60
|
|
|
61
|
-
@
|
|
61
|
+
@matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation
|
|
62
62
|
|
|
63
63
|
@m=nil
|
|
64
64
|
|
|
@@ -103,30 +103,45 @@ module Factor
|
|
|
103
103
|
# So, i=variable, j=component
|
|
104
104
|
def feature_matrix(m=nil)
|
|
105
105
|
m||=@m
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
106
|
+
if @use_gsl
|
|
107
|
+
omega_m=GSL::Matrix.zeros(@n_variables,m)
|
|
108
|
+
ev=eigenvectors
|
|
109
|
+
m.times do |i|
|
|
110
|
+
omega_m.set_column(i,ev[i])
|
|
111
|
+
end
|
|
112
|
+
omega_m
|
|
113
|
+
else
|
|
114
|
+
omega_m=::Matrix.build(@n_variables, m) {0}
|
|
115
|
+
m.times do |i|
|
|
116
|
+
omega_m.column= i, @eigenpairs[i][1]
|
|
117
|
+
end
|
|
118
|
+
omega_m
|
|
109
119
|
end
|
|
110
|
-
omega_m
|
|
111
120
|
end
|
|
112
121
|
# Returns Principal Components for +input+ matrix or dataset
|
|
113
122
|
# The number of PC to return is equal to parameter +m+.
|
|
114
|
-
# If +m+ isn't set, m set to number of PCs selected at object creation.
|
|
123
|
+
# If +m+ isn't set, m set to number of PCs selected at object creation.
|
|
124
|
+
# Use covariance matrix
|
|
125
|
+
|
|
115
126
|
def principal_components(input, m=nil)
|
|
116
|
-
|
|
117
|
-
|
|
127
|
+
if @use_gsl
|
|
128
|
+
data_matrix=input.to_gsl
|
|
129
|
+
else
|
|
130
|
+
data_matrix=input.to_matrix
|
|
131
|
+
end
|
|
118
132
|
m||=@m
|
|
119
133
|
|
|
120
134
|
raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables
|
|
121
135
|
|
|
122
136
|
fv=feature_matrix(m)
|
|
123
137
|
pcs=(fv.transpose*data_matrix.transpose).transpose
|
|
138
|
+
|
|
124
139
|
pcs.extend Statsample::NamedMatrix
|
|
125
140
|
pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)}
|
|
126
141
|
pcs.to_dataset
|
|
127
142
|
end
|
|
128
143
|
def component_matrix(m=nil)
|
|
129
|
-
var="component_matrix_#{
|
|
144
|
+
var="component_matrix_#{matrix_type}"
|
|
130
145
|
send(var,m)
|
|
131
146
|
end
|
|
132
147
|
# Matrix with correlations between components and
|
|
@@ -141,7 +156,7 @@ module Factor
|
|
|
141
156
|
cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i])
|
|
142
157
|
}
|
|
143
158
|
}
|
|
144
|
-
cm.extend
|
|
159
|
+
cm.extend NamedMatrix
|
|
145
160
|
cm.name=_("Component matrix (from covariance)")
|
|
146
161
|
cm.fields_x = @variables_names
|
|
147
162
|
cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
|
|
@@ -187,32 +202,13 @@ module Factor
|
|
|
187
202
|
end
|
|
188
203
|
def eigenvectors
|
|
189
204
|
@eigenpairs.collect {|c|
|
|
190
|
-
c[1].
|
|
205
|
+
@use_gsl ? c[1].to_gsl : c[1].to_vector
|
|
191
206
|
}
|
|
192
207
|
end
|
|
193
208
|
def calculate_eigenpairs
|
|
194
|
-
|
|
195
|
-
calculate_eigenpairs_gsl
|
|
196
|
-
else
|
|
197
|
-
calculate_eigenpairs_ruby
|
|
198
|
-
end
|
|
209
|
+
@eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby
|
|
199
210
|
end
|
|
200
211
|
|
|
201
|
-
def calculate_eigenpairs_ruby #:nodoc:
|
|
202
|
-
@eigenpairs = @matrix.eigenpairs_ruby
|
|
203
|
-
end
|
|
204
|
-
# Eigenvectors calculated with gsl
|
|
205
|
-
# Note: The signs of some vectors could be different of
|
|
206
|
-
# ruby generated
|
|
207
|
-
def calculate_eigenpairs_gsl #:nodoc:
|
|
208
|
-
eigval, eigvec= GSL::Eigen.symmv(@matrix.to_gsl)
|
|
209
|
-
#puts "***"
|
|
210
|
-
ep=eigval.size.times.map {|i|
|
|
211
|
-
ev=eigvec.get_col(i)
|
|
212
|
-
[eigval[i], ev]
|
|
213
|
-
}
|
|
214
|
-
@eigenpairs=ep.sort{|a,b| a[0]<=>b[0]}.reverse
|
|
215
|
-
end
|
|
216
212
|
|
|
217
213
|
def report_building(builder) # :nodoc:
|
|
218
214
|
builder.section(:name=>@name) do |generator|
|