statsample 0.13.1 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +15 -0
- data/Manifest.txt +4 -0
- data/README.txt +11 -3
- data/Rakefile +2 -2
- data/data/hartman_23.matrix +9 -0
- data/examples/correlation_matrix.rb +1 -1
- data/examples/velicer_map_test.rb +35 -0
- data/lib/distribution/chisquare.rb +2 -2
- data/lib/statsample.rb +1 -1
- data/lib/statsample/bivariate/pearson.rb +0 -1
- data/lib/statsample/converters.rb +2 -2
- data/lib/statsample/crosstab.rb +1 -1
- data/lib/statsample/factor.rb +3 -1
- data/lib/statsample/factor/map.rb +102 -0
- data/lib/statsample/factor/parallelanalysis.rb +54 -24
- data/lib/statsample/factor/pca.rb +46 -28
- data/lib/statsample/factor/principalaxis.rb +54 -22
- data/lib/statsample/factor/rotation.rb +51 -4
- data/lib/statsample/matrix.rb +14 -14
- data/lib/statsample/reliability.rb +1 -0
- data/lib/statsample/reliability/multiscaleanalysis.rb +35 -10
- data/lib/statsample/reliability/scaleanalysis.rb +10 -9
- data/lib/statsample/test.rb +12 -11
- data/lib/statsample/test/chisquare.rb +43 -0
- data/lib/statsample/vector.rb +18 -11
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +151 -85
- data/po/statsample.pot +126 -53
- data/test/test_factor.rb +29 -3
- data/test/test_matrix.rb +2 -0
- data/test/test_reliability.rb +46 -46
- data/test/test_rserve_extension.rb +2 -2
- data/test/test_stest.rb +16 -2
- data/test/test_vector.rb +10 -1
- metadata +14 -9
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
|
data/History.txt
CHANGED
@@ -1,3 +1,18 @@
|
|
1
|
+
=== 0.14.0 / 2010-08-16
|
2
|
+
* Added Statsample::Factor::MAP, to execute Velicer's (1976) MAP to determine the number of factors to retain on EFA
|
3
|
+
* Bug fix on test suite on Ruby 1.8.7
|
4
|
+
* Horn's Parallel Analysis operational and tested for pure random data
|
5
|
+
* Fixed bug on Excel writer on Ruby1.9 (frozen string on header raises an error).
|
6
|
+
* Extra information on Factorial Analysis on summaries
|
7
|
+
* Fixed bug on Factor::Rotation when used ::Matrix without field method.
|
8
|
+
* Added Vector#vector_percentil method
|
9
|
+
* Summaries for PCA, Rotation, MultiScale and ScaleAnalysis created or improved.
|
10
|
+
* Factor::PCA could have rotation and parallel analysis on summary.
|
11
|
+
* Cronbach's alpha from covariance matrix raises an error on size<2
|
12
|
+
* MultiScaleAnalysis could have Parallel Analysis on summary.
|
13
|
+
* Added Chi Square test
|
14
|
+
* Added new information on README.txt
|
15
|
+
|
1
16
|
=== 0.13.1 / 2010-07-03
|
2
17
|
|
3
18
|
* Rserve extensions for dataset and vector operational
|
data/Manifest.txt
CHANGED
@@ -5,6 +5,7 @@ README.txt
|
|
5
5
|
Rakefile
|
6
6
|
bin/statsample
|
7
7
|
data/crime.txt
|
8
|
+
data/hartman_23.matrix
|
8
9
|
data/locale/es/LC_MESSAGES/statsample.mo
|
9
10
|
data/repeated_fields.csv
|
10
11
|
data/test_binomial.csv
|
@@ -27,6 +28,7 @@ examples/t_test.rb
|
|
27
28
|
examples/tetrachoric.rb
|
28
29
|
examples/u_test.rb
|
29
30
|
examples/vector.rb
|
31
|
+
examples/velicer_map_test.rb
|
30
32
|
lib/distribution.rb
|
31
33
|
lib/distribution/chisquare.rb
|
32
34
|
lib/distribution/f.rb
|
@@ -51,6 +53,7 @@ lib/statsample/dataset.rb
|
|
51
53
|
lib/statsample/dominanceanalysis.rb
|
52
54
|
lib/statsample/dominanceanalysis/bootstrap.rb
|
53
55
|
lib/statsample/factor.rb
|
56
|
+
lib/statsample/factor/map.rb
|
54
57
|
lib/statsample/factor/parallelanalysis.rb
|
55
58
|
lib/statsample/factor/pca.rb
|
56
59
|
lib/statsample/factor/principalaxis.rb
|
@@ -87,6 +90,7 @@ lib/statsample/resample.rb
|
|
87
90
|
lib/statsample/rserve_extension.rb
|
88
91
|
lib/statsample/srs.rb
|
89
92
|
lib/statsample/test.rb
|
93
|
+
lib/statsample/test/chisquare.rb
|
90
94
|
lib/statsample/test/f.rb
|
91
95
|
lib/statsample/test/levene.rb
|
92
96
|
lib/statsample/test/t.rb
|
data/README.txt
CHANGED
@@ -10,14 +10,15 @@ A suite for basic and advanced statistics on Ruby. Tested on Ruby 1.8.7, 1.9.1,
|
|
10
10
|
Include:
|
11
11
|
* Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
|
12
12
|
* Imports and exports datasets from and to Excel, CSV and plain text files.
|
13
|
-
* Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b
|
13
|
+
* Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlations are provided by the +statsample-bivariate-extension+ gem.
|
14
14
|
* Anova: generic and vector-based One-way ANOVA and Two-way ANOVA
|
15
15
|
* Tests: F, T, Levene, U-Mannwhitney.
|
16
16
|
* Regression: Simple, Multiple (OLS), Probit and Logit
|
17
|
-
* Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis, for estimation of number of factors.
|
17
|
+
* Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax), and Parallel Analysis and Velicer's MAP test for estimating the number of factors.
|
18
18
|
* Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
|
19
19
|
* Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
|
20
20
|
* Sample calculation related formulas
|
21
|
+
* Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+
|
21
22
|
* Creates reports on text, html and rtf, using ReportBuilder gem
|
22
23
|
|
23
24
|
== FEATURES:
|
@@ -41,7 +42,9 @@ Include:
|
|
41
42
|
* Statsample::Factor::Varimax
|
42
43
|
* Statsample::Factor::Equimax
|
43
44
|
* Statsample::Factor::Quartimax
|
44
|
-
*
|
45
|
+
* Classes for calculation of factors to retain
|
46
|
+
* Statsample::Factor::ParallelAnalysis performs Horn's 'parallel analysis' to a principal components analysis to adjust for sample bias in the retention of components.
|
47
|
+
* Statsample::Factor::MAP performs Velicer's Minimum Average Partial (MAP) test, which retains components as long as the variance in the correlation matrix represents systematic variance.
|
45
48
|
* Dominance Analysis. Based on Budescu and Azen papers, dominance analysis is a method to analyze the relative importance of one predictor relative to another on multiple regression
|
46
49
|
* Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
|
47
50
|
* Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
|
@@ -62,6 +65,7 @@ Include:
|
|
62
65
|
* Statsample::Test::UMannWhitney
|
63
66
|
* Statsample::Test::T
|
64
67
|
* Statsample::Test::F
|
68
|
+
* Gem +statsample-sem+ provides a DSL to R libraries +sem+ and +OpenMx+
|
65
69
|
* Interfaces to gdchart, gnuplot and SVG::Graph (experimental)
|
66
70
|
* Close integration with gem <tt>reportbuilder</tt>, to easily create reports on text, html and rtf formats.
|
67
71
|
|
@@ -109,6 +113,10 @@ If you use Ruby 1.8, you should compile statsample-optimization, usign parameter
|
|
109
113
|
|
110
114
|
$ sudo gem install statsample-optimization --platform ruby
|
111
115
|
|
116
|
+
If you need to work on Structural Equation Modeling, see +statsample-sem+. You need R with the +sem+ or +OpenMx+ [http://openmx.psyc.virginia.edu/] libraries installed.
|
117
|
+
|
118
|
+
$ sudo gem install statsample-sem
|
119
|
+
|
112
120
|
Available setup.rb file
|
113
121
|
|
114
122
|
sudo gem ruby setup.rb
|
data/Rakefile
CHANGED
@@ -4,9 +4,9 @@
|
|
4
4
|
$:.unshift(File.dirname(__FILE__)+'/lib/')
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
|
-
require 'hoe'
|
8
7
|
require 'statsample'
|
9
8
|
|
9
|
+
require 'hoe'
|
10
10
|
Hoe.plugin :git
|
11
11
|
|
12
12
|
desc "Ruby Lint"
|
@@ -40,7 +40,7 @@ h=Hoe.spec('statsample') do
|
|
40
40
|
#self.testlib=:minitest
|
41
41
|
self.rubyforge_name = "ruby-statsample"
|
42
42
|
self.developer('Claudio Bustos', 'clbustos@gmail.com')
|
43
|
-
self.extra_deps << ["spreadsheet","~>0.6.0"] << ["svg-graph", "~>1.0"] << ["reportbuilder", "~>1.0"] << ["minimization", "~>0.2.0"] << ["fastercsv"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.
|
43
|
+
self.extra_deps << ["spreadsheet","~>0.6.0"] << ["svg-graph", "~>1.0"] << ["reportbuilder", "~>1.0"] << ["minimization", "~>0.2.0"] << ["fastercsv"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", "~>0.13.0"]
|
44
44
|
|
45
45
|
self.extra_dev_deps << ["shoulda"]
|
46
46
|
self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
|
@@ -0,0 +1,9 @@
|
|
1
|
+
"height" "arm.span" "forearm" "lower.leg" "weight" "bitro.diameter" "chest.girth" "chest.width"
|
2
|
+
"height" 1 0.846 0.805 0.859 0.473 0.398 0.301 0.382
|
3
|
+
"arm.span" 0.846 1 0.881 0.826 0.376 0.326 0.277 0.415
|
4
|
+
"forearm" 0.805 0.881 1 0.801 0.38 0.319 0.237 0.345
|
5
|
+
"lower.leg" 0.859 0.826 0.801 1 0.436 0.329 0.327 0.365
|
6
|
+
"weight" 0.473 0.376 0.38 0.436 1 0.762 0.73 0.629
|
7
|
+
"bitro.diameter" 0.398 0.326 0.319 0.329 0.762 1 0.583 0.577
|
8
|
+
"chest.girth" 0.301 0.277 0.237 0.327 0.73 0.583 1 0.539
|
9
|
+
"chest.width" 0.382 0.415 0.345 0.365 0.629 0.577 0.539 1
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
|
+
|
4
|
+
require 'statsample'
|
5
|
+
samples=100
|
6
|
+
variables=10
|
7
|
+
rng = GSL::Rng.alloc()
|
8
|
+
f1=samples.times.collect {rng.ugaussian()}.to_scale
|
9
|
+
f2=samples.times.collect {rng.ugaussian()}.to_scale
|
10
|
+
|
11
|
+
vectors={}
|
12
|
+
|
13
|
+
variables.times do |i|
|
14
|
+
vectors["v#{i}"]=samples.times.collect {|nv|
|
15
|
+
if i<5
|
16
|
+
f1[nv]*5 + f2[nv] *2 +rng.ugaussian()
|
17
|
+
else
|
18
|
+
f1[nv]*2 + f2[nv] *3 +rng.ugaussian()
|
19
|
+
end
|
20
|
+
}.to_scale
|
21
|
+
end
|
22
|
+
ds=vectors.to_dataset
|
23
|
+
cor=Statsample::Bivariate.correlation_matrix(ds)
|
24
|
+
map=Statsample::Factor::MAP.new(cor)
|
25
|
+
pca=Statsample::Factor::PCA.new(cor)
|
26
|
+
|
27
|
+
rb=ReportBuilder.new(:name=>"Velicer's MAP test") do |g|
|
28
|
+
g.text("There are 2 real factors on data")
|
29
|
+
g.parse_element(pca)
|
30
|
+
g.text("Traditional Kaiser criterion (k>1) returns #{pca.m} factors")
|
31
|
+
g.parse_element(map)
|
32
|
+
g.text("Velicer's MAP Test returns #{map.number_of_factors} factors to preserve")
|
33
|
+
end
|
34
|
+
|
35
|
+
puts rb.to_text
|
@@ -8,7 +8,7 @@ module Distribution
|
|
8
8
|
# Return the P-value of the corresponding integral with
|
9
9
|
# k degrees of freedom
|
10
10
|
def p_value(pr,k)
|
11
|
-
Statistics2.pchi2X_(k, pr)
|
11
|
+
Statistics2.pchi2X_(k.to_i, pr)
|
12
12
|
end
|
13
13
|
# Chi-square cumulative distribution function (cdf).
|
14
14
|
#
|
@@ -16,7 +16,7 @@ module Distribution
|
|
16
16
|
# with k degrees of freedom over [0, x]
|
17
17
|
#
|
18
18
|
def cdf(x,k)
|
19
|
-
Statistics2.chi2dist(k,x)
|
19
|
+
Statistics2.chi2dist(k.to_i,x)
|
20
20
|
end
|
21
21
|
end
|
22
22
|
end
|
data/lib/statsample.rb
CHANGED
@@ -90,7 +90,7 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
90
90
|
fields=row.to_a.collect{|c| c.downcase}
|
91
91
|
fields.recode_repeated
|
92
92
|
end
|
93
|
-
|
93
|
+
|
94
94
|
def process_row(row,empty)
|
95
95
|
row.to_a.collect do |c|
|
96
96
|
if empty.include?(c)
|
@@ -146,7 +146,7 @@ raise "Should'nt be empty headers: [#{row.to_a.join(",")}]" if row.to_a.find_all
|
|
146
146
|
sheet = book.create_worksheet
|
147
147
|
format = Spreadsheet::Format.new :color => :blue,
|
148
148
|
:weight => :bold
|
149
|
-
sheet.row(0).concat(dataset.fields)
|
149
|
+
sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings
|
150
150
|
sheet.row(0).default_format = format
|
151
151
|
i=1
|
152
152
|
dataset.each_array{|row|
|
data/lib/statsample/crosstab.rb
CHANGED
@@ -15,7 +15,7 @@ module Statsample
|
|
15
15
|
@row_label=v1.name
|
16
16
|
@column_label=v2.name
|
17
17
|
@name=nil
|
18
|
-
@percentage_row
|
18
|
+
@percentage_row = @percentage_column = @percentage_total=false
|
19
19
|
opts.each{|k,v|
|
20
20
|
self.send("#{k}=",v) if self.respond_to? k
|
21
21
|
}
|
data/lib/statsample/factor.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'statsample/factor/rotation'
|
1
2
|
require 'statsample/factor/pca'
|
2
3
|
require 'statsample/factor/principalaxis'
|
3
|
-
require 'statsample/factor/rotation'
|
4
4
|
require 'statsample/factor/parallelanalysis'
|
5
|
+
require 'statsample/factor/map'
|
6
|
+
|
5
7
|
module Statsample
|
6
8
|
# Factor Analysis toolbox.
|
7
9
|
# * Classes for Extraction of factors:
|
@@ -0,0 +1,102 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Factor
|
3
|
+
# = Velicer's Minimum Average Partial
|
4
|
+
#
|
5
|
+
# "Velicer’s (1976) MAP test involves a complete princi-
|
6
|
+
# pal components analysis followed by the examination of
|
7
|
+
# a series of matrices of partial correlations. Specifically,
|
8
|
+
# on the first step, the first principal component is par-
|
9
|
+
# tialed out of the correlations between the variables of in-
|
10
|
+
# terest, and the average squared coefficient in the off-
|
11
|
+
# diagonals of the resulting partial correlation matrix is
|
12
|
+
# computed. On the second step, the first two principal
|
13
|
+
# components are partialed out of the original correlation
|
14
|
+
# matrix and the average squared partial correlation is
|
15
|
+
# again computed. These computations are conducted for k
|
16
|
+
# (the number of variables) minus one steps. The average
|
17
|
+
# squared partial correlations from these steps are then
|
18
|
+
# lined up, and the number of components is determined by
|
19
|
+
# the step number in the analyses that resulted in the lowest
|
20
|
+
# average squared partial correlation. The average squared
|
21
|
+
# coefficient in the original correlation matrix is also com-
|
22
|
+
# puted, and if this coefficient happens to be lower than
|
23
|
+
# the lowest average squared partial correlation, then no
|
24
|
+
# components should be extracted from the correlation ma-
|
25
|
+
# trix. Statistically, components are retained as long as the
|
26
|
+
# variance in the correlation matrix represents systematic
|
27
|
+
# variance. Components are no longer retained when there
|
28
|
+
# is proportionately more unsystematic variance than sys-
|
29
|
+
# tematic variance." (O'Connor, 2000, p.397).
|
30
|
+
#
|
31
|
+
# Current algorithm is loosely based on SPSS O'Connor algorithm
|
32
|
+
|
33
|
+
class MAP
|
34
|
+
include Summarizable
|
35
|
+
include DirtyMemoize
|
36
|
+
# Name of analysis
|
37
|
+
attr_accessor :name
|
38
|
+
attr_reader :eigenvalues
|
39
|
+
# Number of factors to retain
|
40
|
+
attr_reader :number_of_factors
|
41
|
+
# Average squared correlations
|
42
|
+
attr_reader :fm
|
43
|
+
# Smallest average squared correlation
|
44
|
+
attr_reader :minfm
|
45
|
+
def initialize(matrix, opts=Hash.new)
|
46
|
+
@matrix=matrix
|
47
|
+
opts_default={
|
48
|
+
:name=>_("Velicer's MAP")
|
49
|
+
}
|
50
|
+
@opts=opts_default.merge(opts)
|
51
|
+
opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
|
52
|
+
end
|
53
|
+
def compute
|
54
|
+
eigen=@matrix.eigen
|
55
|
+
eigvect,@eigenvalues=eigen[:eigenvectors],eigen[:eigenvalues]
|
56
|
+
loadings=eigvect*(Matrix.diag(*@eigenvalues).sqrt)
|
57
|
+
fm=Array.new(@matrix.row_size)
|
58
|
+
ncol=@matrix.column_size
|
59
|
+
fm[0]=(@matrix.mssq - ncol).quo(ncol*(ncol-1))
|
60
|
+
(ncol-1).times do |m|
|
61
|
+
a=loadings[0..(loadings.row_size-1),0..m]
|
62
|
+
partcov= @matrix - (a*a.t)
|
63
|
+
pc_prediag=partcov.row_size.times.map{|i|
|
64
|
+
1.quo(Math::sqrt(partcov[i,i]))
|
65
|
+
}
|
66
|
+
d=Matrix.diag(*pc_prediag)
|
67
|
+
pr=d*partcov*d
|
68
|
+
fm[m+1]=(pr.mssq-ncol).quo(ncol*(ncol-1))
|
69
|
+
end
|
70
|
+
minfm=fm[0]
|
71
|
+
nfactors=0
|
72
|
+
fm.each_with_index do |v,s|
|
73
|
+
if v < minfm
|
74
|
+
minfm=v
|
75
|
+
nfactors=s
|
76
|
+
end
|
77
|
+
end
|
78
|
+
@number_of_factors=nfactors
|
79
|
+
@fm=fm
|
80
|
+
@minfm=minfm
|
81
|
+
end
|
82
|
+
def report_building(g) #:nodoc:
|
83
|
+
g.section(:name=>@name) do |s|
|
84
|
+
s.table(:name=>_("Eigenvalues"),:header=>[_("Value")]) do |t|
|
85
|
+
eigenvalues.each do |e|
|
86
|
+
t.row(["%0.6f" % e])
|
87
|
+
end
|
88
|
+
end
|
89
|
+
s.table(:name=>_("Velicer's Average Squared Correlations"), :header=>[_("number of components"),_("average square correlation")]) do |t|
|
90
|
+
fm.each_with_index do |v,i|
|
91
|
+
t.row(["%d" % i, "%0.6f" % v])
|
92
|
+
end
|
93
|
+
end
|
94
|
+
s.text(_("The smallest average squared correlation is : %0.6f" % minfm))
|
95
|
+
s.text(_("The number of components is : %d" % number_of_factors))
|
96
|
+
end
|
97
|
+
end
|
98
|
+
dirty_memoize :number_of_factors, :fm, :minfm, :eigenvalues
|
99
|
+
|
100
|
+
end
|
101
|
+
end
|
102
|
+
end
|
@@ -2,19 +2,30 @@ module Statsample
|
|
2
2
|
module Factor
|
3
3
|
# Performs Horn's 'parallel analysis' to a principal components analysis
|
4
4
|
# to adjust for sample bias in the retention of components.
|
5
|
-
# Can create the bootstrap samples using
|
6
|
-
#
|
5
|
+
# Can create the bootstrap samples using random data, using number
|
6
|
+
# of cases and variables, parameters for actual data (mean and standard
|
7
|
+
# deviation of each variable) or bootstrap sampling for actual data.
|
7
8
|
# == Description
|
8
9
|
# "PA involves the construction of a number of correlation matrices of random variables based on the same sample size and number of variables in the real data set. The average eigenvalues from the random correlation matrices are then compared to the eigenvalues from the real data correlation matrix, such that the first observed eigenvalue is compared to the first random eigenvalue, the second observed eigenvalue is compared to the second random eigenvalue, and so on." (Hayton, Allen & Scarpello, 2004, p.194)
|
9
10
|
# == Usage
|
11
|
+
# *With real dataset*
|
10
12
|
# # ds should be any valid dataset
|
11
13
|
# pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>100, :bootstrap_method=>:raw_data)
|
12
14
|
#
|
15
|
+
# *With number of cases and variables*
|
16
|
+
# pa=Statsample::Factor::ParallelAnalysis.with_random_data(100,8)
|
17
|
+
#
|
13
18
|
# == References:
|
14
19
|
# * Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. <i>Organizational Research Methods, 7</i> (2), 191-205.
|
15
|
-
# *
|
20
|
+
# * O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer’s MAP test. Behavior Research Methods, Instruments, & Computers, 32 (3), 396-402
|
16
21
|
class ParallelAnalysis
|
17
|
-
|
22
|
+
def self.with_random_data(cases,vars,iterations=100,percentil=95)
|
23
|
+
require 'ostruct'
|
24
|
+
ds=OpenStruct.new
|
25
|
+
ds.fields=vars.times.map {|i| "v#{i+1}"}
|
26
|
+
ds.cases=cases
|
27
|
+
pa=new(ds,{:bootstrap_method=>:random, :no_data=>true, :iterations=>iterations,:percentil=>percentil})
|
28
|
+
end
|
18
29
|
include DirtyMemoize
|
19
30
|
include Summarizable
|
20
31
|
# Number of random sets to produce. 100 by default
|
@@ -23,25 +34,31 @@ module Statsample
|
|
23
34
|
attr_accessor :name
|
24
35
|
# Dataset. You could use mock vectors when use bootstrap method
|
25
36
|
attr_reader :ds
|
26
|
-
# Bootstrap method. <tt>:
|
27
|
-
# * <tt>:
|
28
|
-
# * <tt>:raw_data</tt> : sample with replacement from actual data.
|
29
|
-
#
|
37
|
+
# Bootstrap method. <tt>:random</tt> used by default
|
38
|
+
# * <tt>:random</tt>: uses number of variables and cases for the dataset
|
39
|
+
# * <tt>:raw_data</tt> : sample with replacement from actual data.
|
40
|
+
# * <tt>:parameter</tt>: uses number of variables and cases, uses mean and standard deviation of each variable
|
41
|
+
|
30
42
|
attr_accessor :bootstrap_method
|
31
43
|
# Factor method.
|
32
44
|
# Could be Statsample::Factor::PCA or Statsample::Factor::PrincipalAxis.
|
33
45
|
# PCA used by default.
|
46
|
+
# Remember to set n_variables when using Principal Axis Analysis.
|
34
47
|
attr_accessor :factor_class
|
35
48
|
# Percentil over bootstrap eigenvalue should be accepted. 95 by default
|
36
49
|
attr_accessor :percentil
|
37
50
|
# Correlation matrix used with :raw_data . <tt>:correlation_matrix</tt> used by default
|
38
|
-
attr_accessor :matrix_method
|
51
|
+
attr_accessor :matrix_method
|
52
|
+
# Number of eigenvalues to calculate. Should be set for
|
53
|
+
# Principal Axis Analysis.
|
54
|
+
attr_accessor :n_variables
|
39
55
|
# Dataset with bootstrapped eigenvalues
|
40
56
|
attr_reader :ds_eigenvalues
|
57
|
+
# Perform analysis without actual data.
|
58
|
+
attr_accessor :no_data
|
41
59
|
# Show extra information if true
|
42
60
|
attr_accessor :debug
|
43
|
-
|
44
|
-
|
61
|
+
|
45
62
|
def initialize(ds, opts=Hash.new)
|
46
63
|
@ds=ds
|
47
64
|
@fields=@ds.fields
|
@@ -49,11 +66,12 @@ module Statsample
|
|
49
66
|
@n_cases=ds.cases
|
50
67
|
opts_default={
|
51
68
|
:name=>_("Parallel Analysis"),
|
52
|
-
:iterations=>
|
53
|
-
:bootstrap_method => :
|
69
|
+
:iterations=>100,
|
70
|
+
:bootstrap_method => :random,
|
54
71
|
:factor_class => Statsample::Factor::PCA,
|
55
72
|
:percentil=>95,
|
56
73
|
:debug=>false,
|
74
|
+
:no_data=>false,
|
57
75
|
:matrix_method=>:correlation_matrix
|
58
76
|
}
|
59
77
|
@opts=opts_default.merge(opts)
|
@@ -75,11 +93,20 @@ module Statsample
|
|
75
93
|
s.text _("Number of variables: %d") % @n_variables
|
76
94
|
s.text _("Number of cases: %d") % @n_cases
|
77
95
|
s.text _("Number of iterations: %d") % @iterations
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
96
|
+
if @no_data
|
97
|
+
s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("generated eigenvalue"), "p.#{percentil}"]) do |t|
|
98
|
+
ds_eigenvalues.fields.each_with_index do |f,i|
|
99
|
+
v=ds_eigenvalues[f]
|
100
|
+
t.row [i+1, "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), ]
|
101
|
+
end
|
102
|
+
end
|
103
|
+
else
|
104
|
+
s.text _("Number or factors to preserve: %d") % number_of_factors
|
105
|
+
s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("data eigenvalue"), _("generated eigenvalue"),"p.#{percentil}",_("preserve?")]) do |t|
|
106
|
+
ds_eigenvalues.fields.each_with_index do |f,i|
|
107
|
+
v=ds_eigenvalues[f]
|
108
|
+
t.row [i+1, "%0.4f" % @original[i], "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), (v.percentil(percentil)>0 and @original[i] > v.percentil(percentil)) ? "Yes":""]
|
109
|
+
end
|
83
110
|
end
|
84
111
|
end
|
85
112
|
|
@@ -87,26 +114,29 @@ module Statsample
|
|
87
114
|
end
|
88
115
|
# Perform calculation. Shouldn't be called directly by the user
|
89
116
|
def compute
|
90
|
-
@original=factor_class.new(Statsample::Bivariate.correlation_matrix(@ds), :m=>@n_variables).eigenvalues.sort.reverse
|
117
|
+
@original=factor_class.new(Statsample::Bivariate.correlation_matrix(@ds), :m=>@n_variables).eigenvalues.sort.reverse unless no_data
|
118
|
+
|
91
119
|
@ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v})
|
92
120
|
@ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale}
|
121
|
+
if bootstrap_method==:parameter or bootstrap_method==:random
|
122
|
+
rng = GSL::Rng.alloc(GSL::Rng::MT19937, rand(32000))
|
123
|
+
end
|
93
124
|
|
94
125
|
@iterations.times do |i|
|
95
126
|
# Create a dataset of dummy values
|
96
127
|
ds_bootstrap=Statsample::Dataset.new(@ds.fields)
|
97
|
-
if bootstrap_method==:parameter
|
98
|
-
rng = GSL::Rng.alloc()
|
99
|
-
end
|
100
128
|
@fields.each do |f|
|
101
|
-
|
102
129
|
if bootstrap_method==:parameter
|
103
130
|
sd=@ds[f].sd
|
104
131
|
mean=@ds[f].mean
|
105
|
-
ds_bootstrap[f]=@n_cases.times.map {|c| rng.gaussian(sd)+mean}.to_scale
|
132
|
+
ds_bootstrap[f]=@n_cases.times.map {|c| rng.gaussian(sd) + mean }.to_scale
|
133
|
+
elsif bootstrap_method==:random
|
134
|
+
ds_bootstrap[f]=@n_cases.times.map {|c| rng.ugaussian()}.to_scale
|
106
135
|
elsif bootstrap_method==:raw_data
|
107
136
|
ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases).to_scale
|
108
137
|
end
|
109
138
|
end
|
139
|
+
#pp Statsample::Bivariate.correlation_matrix(ds_bootstrap)
|
110
140
|
fa=factor_class.new(Statsample::Bivariate.send(matrix_method, ds_bootstrap), :m=>@n_variables)
|
111
141
|
ev=fa.eigenvalues.sort.reverse
|
112
142
|
@ds_eigenvalues.add_case_array(ev)
|