statsample-ekatena 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.travis.yml +23 -0
- data/CONTRIBUTING.md +17 -0
- data/Gemfile +2 -0
- data/History.txt +457 -0
- data/LICENSE.txt +12 -0
- data/README.md +175 -0
- data/Rakefile +44 -0
- data/benchmarks/correlation_matrix_15_variables.rb +32 -0
- data/benchmarks/correlation_matrix_5_variables.rb +33 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
- data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
- data/benchmarks/correlation_matrix_methods/results.ds +0 -0
- data/benchmarks/factor_map.rb +37 -0
- data/benchmarks/helpers_benchmark.rb +5 -0
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/doc_latex/manual/equations.tex +78 -0
- data/examples/boxplot.rb +28 -0
- data/examples/chisquare_test.rb +23 -0
- data/examples/correlation_matrix.rb +32 -0
- data/examples/dataset.rb +30 -0
- data/examples/dominance_analysis.rb +33 -0
- data/examples/dominance_analysis_bootstrap.rb +32 -0
- data/examples/histogram.rb +26 -0
- data/examples/icc.rb +24 -0
- data/examples/levene.rb +29 -0
- data/examples/multiple_regression.rb +20 -0
- data/examples/multivariate_correlation.rb +33 -0
- data/examples/parallel_analysis.rb +40 -0
- data/examples/polychoric.rb +40 -0
- data/examples/principal_axis.rb +26 -0
- data/examples/reliability.rb +31 -0
- data/examples/scatterplot.rb +25 -0
- data/examples/t_test.rb +27 -0
- data/examples/tetrachoric.rb +17 -0
- data/examples/u_test.rb +24 -0
- data/examples/vector.rb +20 -0
- data/examples/velicer_map_test.rb +46 -0
- data/grab_references.rb +29 -0
- data/lib/spss.rb +134 -0
- data/lib/statsample-ekatena/analysis.rb +100 -0
- data/lib/statsample-ekatena/analysis/suite.rb +89 -0
- data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
- data/lib/statsample-ekatena/anova.rb +24 -0
- data/lib/statsample-ekatena/anova/contrast.rb +79 -0
- data/lib/statsample-ekatena/anova/oneway.rb +187 -0
- data/lib/statsample-ekatena/anova/twoway.rb +207 -0
- data/lib/statsample-ekatena/bivariate.rb +406 -0
- data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
- data/lib/statsample-ekatena/codification.rb +182 -0
- data/lib/statsample-ekatena/converter/csv.rb +28 -0
- data/lib/statsample-ekatena/converter/spss.rb +48 -0
- data/lib/statsample-ekatena/converters.rb +211 -0
- data/lib/statsample-ekatena/crosstab.rb +188 -0
- data/lib/statsample-ekatena/daru.rb +115 -0
- data/lib/statsample-ekatena/dataset.rb +10 -0
- data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
- data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
- data/lib/statsample-ekatena/factor.rb +104 -0
- data/lib/statsample-ekatena/factor/map.rb +124 -0
- data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
- data/lib/statsample-ekatena/factor/pca.rb +242 -0
- data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
- data/lib/statsample-ekatena/factor/rotation.rb +198 -0
- data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
- data/lib/statsample-ekatena/formula/formula.rb +306 -0
- data/lib/statsample-ekatena/graph.rb +11 -0
- data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
- data/lib/statsample-ekatena/graph/histogram.rb +198 -0
- data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
- data/lib/statsample-ekatena/histogram.rb +180 -0
- data/lib/statsample-ekatena/matrix.rb +329 -0
- data/lib/statsample-ekatena/multiset.rb +310 -0
- data/lib/statsample-ekatena/regression.rb +65 -0
- data/lib/statsample-ekatena/regression/multiple.rb +89 -0
- data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
- data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
- data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
- data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
- data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
- data/lib/statsample-ekatena/regression/simple.rb +121 -0
- data/lib/statsample-ekatena/reliability.rb +150 -0
- data/lib/statsample-ekatena/reliability/icc.rb +415 -0
- data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
- data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
- data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
- data/lib/statsample-ekatena/resample.rb +15 -0
- data/lib/statsample-ekatena/shorthand.rb +125 -0
- data/lib/statsample-ekatena/srs.rb +169 -0
- data/lib/statsample-ekatena/test.rb +82 -0
- data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
- data/lib/statsample-ekatena/test/chisquare.rb +73 -0
- data/lib/statsample-ekatena/test/f.rb +52 -0
- data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
- data/lib/statsample-ekatena/test/levene.rb +88 -0
- data/lib/statsample-ekatena/test/t.rb +309 -0
- data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
- data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
- data/lib/statsample-ekatena/vector.rb +19 -0
- data/lib/statsample-ekatena/version.rb +3 -0
- data/lib/statsample.rb +282 -0
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +959 -0
- data/po/statsample.pot +947 -0
- data/references.txt +24 -0
- data/statsample-ekatena.gemspec +49 -0
- data/test/fixtures/bank2.dat +200 -0
- data/test/fixtures/correlation_matrix.rb +17 -0
- data/test/fixtures/df.csv +15 -0
- data/test/fixtures/hartman_23.matrix +9 -0
- data/test/fixtures/stock_data.csv +500 -0
- data/test/fixtures/tetmat_matrix.txt +5 -0
- data/test/fixtures/tetmat_test.txt +1001 -0
- data/test/helpers_tests.rb +83 -0
- data/test/test_analysis.rb +176 -0
- data/test/test_anova_contrast.rb +36 -0
- data/test/test_anovaoneway.rb +26 -0
- data/test/test_anovatwoway.rb +37 -0
- data/test/test_anovatwowaywithdataset.rb +47 -0
- data/test/test_anovawithvectors.rb +102 -0
- data/test/test_awesome_print_bug.rb +16 -0
- data/test/test_bartlettsphericity.rb +25 -0
- data/test/test_bivariate.rb +164 -0
- data/test/test_codification.rb +78 -0
- data/test/test_crosstab.rb +67 -0
- data/test/test_dominance_analysis.rb +39 -0
- data/test/test_factor.rb +228 -0
- data/test/test_factor_map.rb +38 -0
- data/test/test_factor_pa.rb +56 -0
- data/test/test_fit_model.rb +88 -0
- data/test/test_ggobi.rb +35 -0
- data/test/test_gsl.rb +15 -0
- data/test/test_histogram.rb +109 -0
- data/test/test_matrix.rb +48 -0
- data/test/test_multiset.rb +176 -0
- data/test/test_regression.rb +231 -0
- data/test/test_reliability.rb +223 -0
- data/test/test_reliability_icc.rb +198 -0
- data/test/test_reliability_skillscale.rb +57 -0
- data/test/test_resample.rb +24 -0
- data/test/test_srs.rb +9 -0
- data/test/test_statistics.rb +69 -0
- data/test/test_stest.rb +69 -0
- data/test/test_stratified.rb +17 -0
- data/test/test_test_f.rb +33 -0
- data/test/test_test_kolmogorovsmirnov.rb +34 -0
- data/test/test_test_t.rb +62 -0
- data/test/test_umannwhitney.rb +27 -0
- data/test/test_vector.rb +12 -0
- data/test/test_wilcoxonsignedrank.rb +64 -0
- metadata +570 -0
@@ -0,0 +1,166 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Factor
|
3
|
+
# Performs Horn's 'parallel analysis' to a principal components analysis
|
4
|
+
# to adjust for sample bias in the retention of components.
|
5
|
+
# Can create the bootstrap samples using random data, using number
|
6
|
+
# of cases and variables, parameters for actual data (mean and standard
|
7
|
+
# deviation of each variable) or bootstrap sampling for actual data.
|
8
|
+
# == Description
|
9
|
+
# "PA involves the construction of a number of correlation matrices of random variables based on the same sample size and number of variables in the real data set. The average eigenvalues from the random correlation matrices are then compared to the eigenvalues from the real data correlation matrix, such that the first observed eigenvalue is compared to the first random eigenvalue, the second observed eigenvalue is compared to the second random eigenvalue, and so on." (Hayton, Allen & Scarpello, 2004, p.194)
|
10
|
+
# == Usage
|
11
|
+
# *With real dataset*
|
12
|
+
# # ds should be any valid dataset
|
13
|
+
# pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>100, :bootstrap_method=>:data)
|
14
|
+
#
|
15
|
+
# *With number of cases and variables*
|
16
|
+
# pa=Statsample::Factor::ParallelAnalysis.with_random_data(100,8)
|
17
|
+
#
|
18
|
+
# == Reference
|
19
|
+
# * Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. <i>Organizational Research Methods, 7</i> (2), 191-205.
|
20
|
+
# * O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402.
|
21
|
+
# * Liu, O., & Rijmen, F. (2008). A modified procedure for parallel analysis of ordered categorical data. Behavior Research Methods, 40(2), 556-562.
|
22
|
+
|
23
|
+
class ParallelAnalysis
|
24
|
+
# Builds an analysis that runs on purely random data.
#
# An empty placeholder data frame is created with +vars+ vectors named
# :v1 .. :vN and +cases+ passed as its index. Whatever +opts+ contains,
# :bootstrap_method is forced to :random and :no_data to true.
def self.with_random_data(cases, vars, opts = Hash.new)
  field_names = (1..vars).map { |n| "v#{n}".to_sym }
  placeholder = Daru::DataFrame.new({}, order: field_names, index: cases)
  new(placeholder, opts.merge(:bootstrap_method => :random, :no_data => true))
end
|
31
|
+
|
32
|
+
include DirtyMemoize
|
33
|
+
include Summarizable
|
34
|
+
# Number of random sets to produce. 50 by default
|
35
|
+
attr_accessor :iterations
|
36
|
+
# Name of analysis
|
37
|
+
attr_accessor :name
|
38
|
+
# Dataset. You could use mock vectors when use bootstrap method
|
39
|
+
attr_reader :ds
|
40
|
+
# Bootstrap method. <tt>:random</tt> used by default
|
41
|
+
# * <tt>:random</tt>: uses number of variables and cases for the dataset
|
42
|
+
# * <tt>:data</tt> : sample with replacement from actual data.
|
43
|
+
attr_accessor :bootstrap_method
|
44
|
+
# Uses smc on diagonal of matrixes, to perform simulation
|
45
|
+
# of a Principal Axis analysis.
|
46
|
+
# By default, false.
|
47
|
+
attr_accessor :smc
|
48
|
+
# Percentile of the bootstrapped eigenvalues that an observed eigenvalue must exceed to be retained. 95 by default
|
49
|
+
attr_accessor :percentil
|
50
|
+
# Correlation matrix used with :raw_data . <tt>:correlation_matrix</tt> used by default
|
51
|
+
attr_accessor :matrix_method
|
52
|
+
# Number of eigenvalues to calculate. Should be set for
|
53
|
+
# Principal Axis Analysis.
|
54
|
+
attr_accessor :n_variables
|
55
|
+
# Dataset with bootstrapped eigenvalues
|
56
|
+
attr_reader :ds_eigenvalues
|
57
|
+
# Perform analysis without actual data.
|
58
|
+
attr_accessor :no_data
|
59
|
+
# Show extra information if true
|
60
|
+
attr_accessor :debug
|
61
|
+
attr_accessor :use_gsl
|
62
|
+
# Sets up a parallel analysis over +ds+.
#
# ds::   dataset; its vectors give the variable names and count, and
#        +nrows+ gives the number of cases.
# opts:: overrides for the defaults listed in +opts_default+ below.
#        Only those keys are applied (through their writers).
def initialize(ds, opts = Hash.new)
  @ds = ds
  @fields = @ds.vectors.to_a
  @n_variables = @fields.size
  @n_cases = ds.nrows
  opts_default = {
    :name => _("Parallel Analysis"),
    :iterations => 50, # See Liu and Rijmen (2008)
    :bootstrap_method => :random,
    :smc => false,
    :percentil => 95,
    :debug => false,
    :no_data => false,
    :matrix_method => :correlation_matrix
  }
  @use_gsl = Statsample.has_gsl?
  @opts = opts_default.merge(opts)
  # This used to be `==`, a comparison whose result was discarded, so the
  # matrix method was never actually forced for the :parameters method.
  @opts[:matrix_method] = :correlation_matrix if @opts[:bootstrap_method] == :parameters
  opts_default.each_key { |k| send("#{k}=", @opts[k]) }
end
|
82
|
+
# Number of factors to retain.
#
# Walks the eigenvalues in order and counts the leading run for which the
# observed eigenvalue is positive and larger than the +percentil+-th
# percentile of the bootstrapped eigenvalues. Counting stops at the first
# eigenvalue that fails the test.
def number_of_factors
  retained = ds_eigenvalues.vectors.to_a.each_with_index.take_while do |field, idx|
    observed = @original[idx]
    observed > 0 && observed > ds_eigenvalues[field].percentil(percentil)
  end
  retained.size
end
|
94
|
+
# Writes the analysis summary into report generator +g+: the settings,
# then a table of bootstrapped eigenvalues. When observed data is
# available the table also shows the observed eigenvalues and a
# "preserve?" column.
#
# NOTE(review): the "preserve?" column tests that the percentile is
# positive, while number_of_factors tests that the observed eigenvalue is
# positive and stops at the first failure. The two can disagree; confirm
# which rule is intended.
def report_building(g) #:nodoc:
  g.section(:name => @name) do |s|
    s.text(_("Bootstrap Method: %s") % bootstrap_method)
    s.text(_("Uses SMC: %s") % (smc ? _("Yes") : _("No")))
    s.text(_("Correlation Matrix type : %s") % matrix_method)
    s.text(_("Number of variables: %d") % @n_variables)
    s.text(_("Number of cases: %d") % @n_cases)
    s.text(_("Number of iterations: %d") % @iterations)

    if @no_data
      header = [_("n"), _("generated eigenvalue"), "p.#{percentil}"]
      s.table(:name => _("Eigenvalues"), :header => header) do |t|
        ds_eigenvalues.vectors.to_a.each_with_index do |field, idx|
          simulated = ds_eigenvalues[field]
          t.row [idx + 1, "%0.4f" % simulated.mean, "%0.4f" % simulated.percentil(percentil)]
        end
      end
    else
      s.text(_("Number or factors to preserve: %d") % number_of_factors)
      header = [_("n"), _("data eigenvalue"), _("generated eigenvalue"), "p.#{percentil}", _("preserve?")]
      s.table(:name => _("Eigenvalues"), :header => header) do |t|
        ds_eigenvalues.vectors.to_a.each_with_index do |field, idx|
          simulated = ds_eigenvalues[field]
          cutoff = simulated.percentil(percentil)
          keep = (cutoff > 0 && @original[idx] > cutoff) ? "Yes" : ""
          t.row [idx + 1, "%0.4f" % @original[idx], "%0.4f" % simulated.mean, "%0.4f" % cutoff, keep]
        end
      end
    end
  end
end
|
121
|
+
# Perform calculation. Shouldn't be called directly by the user.
#
# Unless +no_data+ is set, stores the eigenvalues of the observed matrix
# in @original. Then, for each iteration, builds a bootstrap dataset
# (normal random values for :random, resampling with replacement for
# :data), computes its matrix with +matrix_method+, optionally replaces
# the diagonal with SMCs, and appends the eigenvalues as a row of
# @ds_eigenvalues.
#
# Raises RuntimeError when +bootstrap_method+ is neither :random nor :data.
def compute
  # Fail fast: only :random and :data are implemented below.
  unless [:random, :data].include?(bootstrap_method)
    raise "bootstrap_method not recognized: #{bootstrap_method.inspect}"
  end

  @original = Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data
  @ds_eigenvalues = Daru::DataFrame.new({}, order: (1..@n_variables).map { |v| ("ev_%05d" % v).to_sym })

  # The generator is only needed by the :random method.
  rng = Distribution::Normal.rng if bootstrap_method == :random

  @iterations.times do |i|
    begin
      puts "#{@name}: Iteration #{i}" if $DEBUG or debug
      # Create a dataset of dummy values
      ds_bootstrap = Daru::DataFrame.new({}, order: @ds.vectors, index: @n_cases)

      @fields.each do |f|
        ds_bootstrap[f] =
          if bootstrap_method == :random
            Daru::Vector.new(@n_cases.times.map { rng.call })
          else
            ds[f].sample_with_replacement(@n_cases)
          end
      end

      matrix = Statsample::Bivariate.send(matrix_method, ds_bootstrap)
      matrix = matrix.to_gsl if @use_gsl
      if smc
        smc_v = matrix.inverse.diagonal.map { |ii| 1 - (1.quo(ii)) }
        smc_v.each_with_index { |v, ii| matrix[ii, ii] = v }
      end
      @ds_eigenvalues.add_row(matrix.eigenvalues)
    rescue Statsample::Bivariate::Tetrachoric::RequerimentNotMeet => e
      # The same iteration is run again with a fresh sample.
      # NOTE(review): the number of retries is unbounded.
      puts "Error: #{e}" if $DEBUG or debug
      redo
    end
  end
end
|
162
|
+
dirty_memoize :number_of_factors, :ds_eigenvalues
|
163
|
+
dirty_writer :iterations, :bootstrap_method, :percentil, :smc
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
@@ -0,0 +1,242 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Statsample
|
3
|
+
module Factor
|
4
|
+
# Principal Component Analysis (PCA) of a covariance or
|
5
|
+
# correlation matrix.
|
6
|
+
#
|
7
|
+
# NOTE: Sign of second and later eigenvectors could be different
|
8
|
+
# using Ruby or GSL, so values for PCs and component matrix
|
9
|
+
# should differ, because extendmatrix and gsl's methods to calculate
|
10
|
+
# eigenvectors are different. Using R is worse, cause first
|
11
|
+
# eigenvector could have negative values!
|
12
|
+
# For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
|
13
|
+
#
|
14
|
+
# == Usage:
|
15
|
+
# require 'statsample'
|
16
|
+
# a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
|
17
|
+
# b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
|
18
|
+
# ds = Daru::DataFrame.new({:a => a,:b => b})
|
19
|
+
# cor_matrix = Statsample::Bivariate.correlation_matrix(ds)
|
20
|
+
# pca= Statsample::Factor::PCA.new(cor_matrix)
|
21
|
+
# pca.m
|
22
|
+
# => 1
|
23
|
+
# pca.eigenvalues
|
24
|
+
# => [1.92592927269225, 0.0740707273077545]
|
25
|
+
# pca.component_matrix
|
26
|
+
# => GSL::Matrix
|
27
|
+
# [ 9.813e-01
|
28
|
+
# 9.813e-01 ]
|
29
|
+
# pca.communalities
|
30
|
+
# => [0.962964636346122, 0.962964636346122]
|
31
|
+
#
|
32
|
+
# == References:
|
33
|
+
# * SPSS Manual
|
34
|
+
# * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
|
35
|
+
# * Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer
|
36
|
+
#
|
37
|
+
class PCA
|
38
|
+
include Summarizable
|
39
|
+
# Name of analysis
|
40
|
+
attr_accessor :name
|
41
|
+
|
42
|
+
# Number of factors. Set by default to the number of factors
|
43
|
+
# with eigenvalues >= 1
|
44
|
+
attr_accessor :m
|
45
|
+
# Use GSL if available
|
46
|
+
attr_accessor :use_gsl
|
47
|
+
# Add to the summary a rotation report
|
48
|
+
attr_accessor :summary_rotation
|
49
|
+
# Add to the summary a parallel analysis report
|
50
|
+
attr_accessor :summary_parallel_analysis
|
51
|
+
# Type of rotation. By default, Statsample::Factor::Rotation::Varimax
|
52
|
+
attr_accessor :rotation_type
|
53
|
+
attr_accessor :matrix_type
|
54
|
+
# Creates a PCA over +matrix+ (a covariance or correlation matrix).
#
# matrix:: square matrix; if it responds to +fields+ those are used as
#          variable names, and if it responds to +_type+ that sets
#          +matrix_type+ (otherwise :correlation).
# opts::   attribute overrides; every key with a matching writer
#          (e.g. :m, :use_gsl, :name, :rotation_type) is applied, other
#          keys are ignored. The hash is not modified.
#
# When +m+ is not supplied it defaults to the number of eigenvalues
# that are >= 1.0.
def initialize(matrix, opts = Hash.new)
  @name = _("Principal Component Analysis")
  @matrix = matrix
  @n_variables = @matrix.column_size
  @matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation
  @m = nil
  @use_gsl = nil
  @rotation_type = Statsample::Factor::Varimax

  # Check for the writer that is actually sent, so a key naming a
  # read-only method (e.g. :eigenvalues) is skipped instead of raising.
  opts.each do |k, v|
    setter = "#{k}="
    send(setter, v) if respond_to?(setter)
  end

  @use_gsl = Statsample.has_gsl? if @use_gsl.nil?

  @variables_names =
    if @matrix.respond_to? :fields
      @matrix.fields
    else
      @n_variables.times.map { |i| "V#{i+1}".to_sym }
    end

  calculate_eigenpairs

  # Default number of factors: eigenvalues >= 1.0
  @m = @eigenpairs.find_all { |ev, _ec| ev >= 1.0 }.size if @m.nil?
end
|
88
|
+
# Builds a rotation object of class +rotation_type+ from the component
# matrix.
def rotation
  loadings = component_matrix
  @rotation_type.new(loadings)
end
|
91
|
+
# Sum of all eigenvalues.
def total_eigenvalues
  eigenvalues.reduce(0) { |total, ev| total + ev }
end
|
94
|
+
# Stores in @ds a data frame holding every vector of @original_ds with
# its mean subtracted.
#
# NOTE(review): @original_ds is not assigned anywhere in the visible
# code; confirm that this private method is still reachable.
def create_centered_ds
  centered = @original_ds.factors.each_with_object({}) do |field, acc|
    mean = @original_ds[field].mean
    acc[field] = @original_ds[field].recode { |value| value - mean }
  end
  @ds = Daru::DataFrame.new(centered)
end
|
102
|
+
|
103
|
+
# Feature matrix for +m+ factors (defaults to the +m+ of the object).
# The first +m+ eigenvectors are placed as columns, so rows index
# variables and columns index components. The result is a GSL::Matrix
# when GSL is in use, a ::Matrix otherwise.
def feature_matrix(m=nil)
  m ||= @m
  unless @use_gsl
    omega = ::Matrix.build(@n_variables, m) { 0 }
    m.times { |j| omega.column = j, @eigenpairs[j][1] }
    return omega
  end

  omega = GSL::Matrix.zeros(@n_variables, m)
  vectors = eigenvectors
  m.times { |j| omega.set_column(j, vectors[j]) }
  omega
end
|
123
|
+
# Returns the principal components for +input+ (a matrix or dataset) as
# a data frame with columns :PC_1 .. :PC_m.
# +m+ is the number of components to return; when omitted, the number
# selected at object creation is used.
#
# Raises RuntimeError when +input+ does not have as many columns as the
# matrix the PCA was built from.
def principal_components(input, m=nil)
  data_matrix = @use_gsl ? input.to_gsl : input.to_matrix
  m ||= @m

  if data_matrix.column_size != @n_variables
    raise "data matrix variables<>pca variables"
  end

  features = feature_matrix(m)
  scores = (features.transpose * data_matrix.transpose).transpose

  scores.extend Statsample::NamedMatrix
  scores.fields_y = Array.new(m) { |i| "PC_#{i+1}".to_sym }
  scores.to_dataframe
end
|
145
|
+
# Component matrix for +m+ components. Dispatches on +matrix_type+ to
# component_matrix_correlation or component_matrix_covariance.
def component_matrix(m=nil)
  send("component_matrix_#{matrix_type}", m)
end
|
149
|
+
# Matrix with correlations between components and
# variables. Based on Härdle & Simar (2003, p.243).
#
# Cell (i, j) is the feature-matrix entry scaled by
# sqrt(eigenvalue_j / matrix[i, i]). Raises RuntimeError when m < 1.
def component_matrix_covariance(m=nil)
  m ||= @m
  raise "m should be > 0" if m < 1

  ff = feature_matrix(m)
  evs = eigenvalues
  cm = ::Matrix.build(@n_variables, m) do |i, j|
    ff[i, j] * Math.sqrt(evs[j] / @matrix[i, i])
  end

  cm.extend NamedMatrix
  cm.name = _("Component matrix (from covariance)")
  cm.fields_x = @variables_names
  cm.fields_y = Array.new(m) { |i| "PC_#{i+1}".to_sym }
  cm
end
|
168
|
+
# Matrix with correlations between components and variables.
#
# Multiplies the matrix of the first +m+ eigenvectors (as columns) by the
# diagonal matrix of the square roots of their eigenvalues.
# Raises RuntimeError when m < 1.
def component_matrix_correlation(m=nil)
  m ||= @m
  raise "m should be > 0" if m < 1

  omega_m = ::Matrix.build(@n_variables, m) { 0 }
  m.times { |i| omega_m.column = i, @eigenpairs[i][1] }
  gammas = Array.new(m) { |i| Math.sqrt(@eigenpairs[i][0]) }

  cm = (omega_m * ::Matrix.diagonal(*gammas)).to_matrix

  cm.extend CovariateMatrix
  cm.name = _("Component matrix")
  cm.fields_x = @variables_names
  cm.fields_y = Array.new(m) { |i| "PC_#{i+1}".to_sym }
  cm
end
|
188
|
+
# Communality of every variable given +m+ components (defaults to the
# +m+ of the object): for variable i, the sum over the first +m+
# eigenpairs of |eigenvalue| * eigenvector[i]**2.
def communalities(m=nil)
  m ||= @m
  Array.new(@n_variables) do |i|
    (0...m).inject(0) do |acc, j|
      acc + (@eigenpairs[j][0].abs * @eigenpairs[j][1][i]**2)
    end
  end
end
|
200
|
+
# Array with the eigenvalues (first element of every eigenpair).
def eigenvalues
  @eigenpairs.map { |pair| pair[0] }
end
|
204
|
+
# Array with the eigenvectors (second element of every eigenpair),
# converted with +to_gsl+ when GSL is in use and wrapped in a
# Daru::Vector otherwise.
def eigenvectors
  @eigenpairs.map do |pair|
    vector = pair[1]
    @use_gsl ? vector.to_gsl : Daru::Vector.new(vector)
  end
end
|
209
|
+
# Stores the eigenpairs of the matrix in @eigenpairs, computed with GSL
# when it is in use and with the pure Ruby implementation otherwise.
def calculate_eigenpairs
  @eigenpairs =
    if @use_gsl
      @matrix.to_gsl.eigenpairs
    else
      @matrix.to_matrix.eigenpairs_ruby
    end
end
|
212
|
+
|
213
|
+
|
214
|
+
# Writes the PCA summary into +builder+: number of factors, a table of
# communalities, a table of explained variance, the component matrix
# and, when +summary_rotation+ is set, the rotation.
def report_building(builder) # :nodoc:
  builder.section(:name => @name) do |generator|
    generator.text(_("Number of factors: %d") % m)

    comm_header = [_("Variable"), _("Initial"), _("Extraction"), _("%")]
    generator.table(:name => _("Communalities"), :header => comm_header) do |t|
      communalities(m).each_with_index do |com, i|
        initial = @matrix[i, i]
        perc = com * 100.quo(initial)
        t.row([@variables_names[i], "%0.3f" % initial, "%0.3f" % com, "%0.3f" % perc])
      end
    end

    te = total_eigenvalues
    var_header = [_("Component"), _("E.Total"), _("%"), _("Cum. %")]
    generator.table(:name => _("Total Variance Explained"), :header => var_header) do |t|
      ac_eigen = 0
      eigenvalues.each_with_index do |eigenvalue, i|
        ac_eigen += eigenvalue
        t.row([
          _("Component %d") % (i + 1),
          sprintf("%0.3f", eigenvalue),
          sprintf("%0.3f%%", eigenvalue * 100.quo(te)),
          sprintf("%0.3f", ac_eigen * 100.quo(te))
        ])
      end
    end

    generator.parse_element(component_matrix(m))
    generator.parse_element(rotation) if summary_rotation
  end
end
|
239
|
+
private :calculate_eigenpairs, :create_centered_ds
|
240
|
+
end
|
241
|
+
end
|
242
|
+
end
|
@@ -0,0 +1,243 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Factor
|
3
|
+
# Principal Axis Analysis for a covariance or correlation matrix.
|
4
|
+
#
|
5
|
+
# For PCA, use Statsample::Factor::PCA
|
6
|
+
#
|
7
|
+
# == Usage:
|
8
|
+
# require 'statsample'
|
9
|
+
# a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
|
10
|
+
# b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
|
11
|
+
# ds= Daru::DataFrame.new({:a => a,:b => b})
|
12
|
+
# cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
|
13
|
+
# pa=Statsample::Factor::PrincipalAxis.new(cor_matrix)
|
14
|
+
# pa.iterate(1)
|
15
|
+
# pa.m
|
16
|
+
# => 1
|
17
|
+
# pa.component_matrix
|
18
|
+
# => GSL::Matrix
|
19
|
+
# [ 9.622e-01
|
20
|
+
# 9.622e-01 ]
|
21
|
+
# pa.communalities
|
22
|
+
# => [0.962964636346122, 0.962964636346122]
|
23
|
+
#
|
24
|
+
# == References:
|
25
|
+
# * SPSS Manual
|
26
|
+
# * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
|
27
|
+
#
|
28
|
+
class PrincipalAxis
|
29
|
+
include DirtyMemoize
|
30
|
+
include Summarizable
|
31
|
+
# Name of analysis
|
32
|
+
attr_accessor :name
|
33
|
+
|
34
|
+
# Number of factors. Set by default to the number of factors
|
35
|
+
# with eigenvalues > 1 (Kaiser criterion).
|
36
|
+
#
|
37
|
+
# _Warning:_ Kaiser criterion overfactors! Give yourself some time
|
38
|
+
# and use Horn's Parallel Analysis.
|
39
|
+
#
|
40
|
+
attr_accessor :m
|
41
|
+
|
42
|
+
# Number of iterations required to converge
|
43
|
+
attr_reader :iterations
|
44
|
+
|
45
|
+
# Initial eigenvalues
|
46
|
+
attr_reader :initial_eigenvalues
|
47
|
+
|
48
|
+
# Tolerance for iterations
|
49
|
+
attr_accessor :epsilon
|
50
|
+
|
51
|
+
# Use SMC(squared multiple correlations) as diagonal. If false, use 1
|
52
|
+
attr_accessor :smc
|
53
|
+
|
54
|
+
# Maximum number of iterations
|
55
|
+
attr_accessor :max_iterations
|
56
|
+
|
57
|
+
# Eigenvalues of factor analysis
|
58
|
+
attr_reader :eigenvalues
|
59
|
+
|
60
|
+
# Minimum difference between successive iterations on sum of communalities
|
61
|
+
DELTA=1e-3
|
62
|
+
# Maximum number of iterations
|
63
|
+
MAX_ITERATIONS=25
|
64
|
+
|
65
|
+
# Creates a principal axis analysis over +matrix+ (a covariance or
# correlation matrix).
#
# matrix:: square matrix; if it responds to +fields+ those are used as
#          variable names.
# opts::   attribute overrides; every key with a matching writer
#          (e.g. :m, :smc, :max_iterations, :epsilon, :name) is applied,
#          other keys are ignored.
#
# When +m+ is not supplied it is taken from a PCA of the same matrix.
def initialize(matrix, opts = Hash.new)
  @matrix = matrix
  @n_variables = @matrix.row_size
  if @matrix.respond_to? :fields
    @fields = @matrix.fields
    @variables_names = @matrix.fields
  else
    @fields = @n_variables.times.map { |i| _("Variable %d") % (i + 1) }
    @variables_names = @n_variables.times.map { |i| "V#{i+1}" }
  end
  @name = ""
  @m = nil
  @initial_eigenvalues = nil
  @initial_communalities = nil
  @component_matrix = nil
  @delta = DELTA
  @smc = true
  @max_iterations = MAX_ITERATIONS

  # Check for the writer that is actually sent: a key naming a read-only
  # attribute (e.g. :iterations) is skipped instead of raising
  # NoMethodError.
  opts.each do |k, v|
    setter = "#{k}="
    send(setter, v) if respond_to?(setter)
  end

  @m = PCA.new(::Matrix.rows(@matrix.to_a)).m if @m.nil?

  @clean = true
end
|
96
|
+
# Communality for all variables given +m+ factors.
#
# Runs the iteration first when nothing has been computed yet or when
# +m+ differs from the current number of factors. Because +m+ defaults
# to nil, a call without arguments always re-runs the iteration.
# Raises RuntimeError if the iteration left no communalities.
def communalities(m=nil)
  if @clean or m != @m
    iterate(m)
    raise "Can't calculate comunality" if @communalities.nil?
  end
  @communalities
end
|
104
|
+
# Component matrix for +m+ factors.
#
# Runs the iteration first when nothing has been computed yet or when
# +m+ differs from the current number of factors (which includes every
# call without arguments, since +m+ defaults to nil).
def component_matrix(m=nil)
  iterate(m) if @clean or m != @m
  @component_matrix
end
|
111
|
+
# Iterate to find the factors.
#
# Starting from the initial communalities, repeatedly places the current
# communalities on the diagonal of the matrix, runs a PCA on it and takes
# the communalities for +m+ factors, until the sum of communalities
# changes by less than the tolerance or +max_iterations+ is reached.
# The tolerance is +epsilon+ when it has been set, DELTA otherwise.
#
# m:: number of factors; defaults to, and then replaces, the +m+ of
#     the object.
#
# Sets @communalities, @eigenvalues, @initial_eigenvalues, @iterations
# and @component_matrix. Raises RuntimeError if any communality exceeds 1
# on an iteration that has not converged.
def iterate(m=nil)
  @clean = false
  m ||= @m
  @m = m
  t = @max_iterations
  work_matrix = @matrix.to_a

  prev_com = initial_communalities

  pca = PCA.new(::Matrix.rows(work_matrix))
  @initial_eigenvalues = pca.eigenvalues
  prev_sum = prev_com.inject(0) { |ac, v| ac + v }
  # Honour the public +epsilon+ accessor; it was previously never read.
  tolerance = @epsilon || @delta
  @iterations = 0
  t.times do |i|
    # The original built this string without printing it.
    puts "#{@name}: Iteration #{i}" if $DEBUG
    @iterations += 1
    prev_com.each_with_index { |v, it| work_matrix[it][it] = v }
    pca = PCA.new(::Matrix.rows(work_matrix))
    @communalities = pca.communalities(m)
    @eigenvalues = pca.eigenvalues
    com_sum = @communalities.inject(0) { |ac, v| ac + v }

    break if (com_sum - prev_sum).abs < tolerance
    @communalities.each_with_index do |v2, i2|
      raise "Variable #{i2} with communality > 1" if v2 > 1.0
    end
    prev_sum = com_sum
    prev_com = @communalities
  end
  @component_matrix = pca.component_matrix(m)
  @component_matrix.extend CovariateMatrix
  @component_matrix.name = _("Factor Matrix")
  @component_matrix.fields_x = @variables_names
  @component_matrix.fields_y = m.times.map { |i| "factor_#{i+1}" }
end
|
152
|
+
alias :compute :iterate
|
153
|
+
|
154
|
+
# Initial communalities, computed once and cached.
#
# With +smc+ enabled they are 1 - 1/d for every element d of the
# diagonal of the inverse matrix (based on O'Connor, 2000); otherwise
# every communality is 1.0.
#
# An alternative formulation was left disabled in the original source:
#   @matrix.column_size.times.collect {|i|
#     rxx , rxy = PrincipalAxis.separate_matrices(@matrix,i)
#     matrix=(rxy.t*rxx.inverse*rxy)
#     matrix[0,0]
#   }
def initial_communalities
  return @initial_communalities unless @initial_communalities.nil?

  @initial_communalities =
    if @smc
      @matrix.inverse.diagonal.map { |i| 1 - (1.quo(i)) }
    else
      [1.0] * @matrix.column_size
    end
end
|
173
|
+
|
174
|
+
|
175
|
+
# Splits a correlation matrix around criterion variable +y+.
#
# matrix:: square matrix indexable as matrix[i, j].
# y::      index of the criterion variable.
#
# Returns [rxx, rxy]: +rxx+ is +matrix+ without row and column +y+
# (correlations among the regressors) and +rxy+ is a single-column
# matrix holding row +y+ without its own entry (correlations between
# each regressor and the criterion).
def self.separate_matrices(matrix, y)
  # Columns are counted with column_size; the original used row_size for
  # the inner loop, which is only equivalent for square input.
  regressor_cols = (0...matrix.column_size).reject { |j| j == y }
  regressor_rows = (0...matrix.row_size).reject { |i| i == y }

  rxy = ::Matrix.columns([regressor_cols.map { |j| matrix[y, j] }])
  rxx = ::Matrix.rows(regressor_rows.map { |i| regressor_cols.map { |j| matrix[i, j] } })
  [rxx, rxy]
end
|
197
|
+
# Writes the analysis summary into +generator+: number of factors and
# iterations, a table of initial and extracted communalities, a table of
# variance by factor (initial eigenvalues for every factor, extraction
# values only for the first +m+), and the factor matrix. Runs the
# iteration first when nothing has been computed yet.
def report_building(generator)
  iterate if @clean
  generator.section(:name => @name) do |s|
    s.text(_("Number of factors: %d") % m)
    s.text(_("Iterations: %d") % @iterations)

    comm_header = [_("Variable"), _("Initial"), _("Extraction")]
    s.table(:name => _("Communalities"), :header => comm_header) do |t|
      communalities(m).each_with_index do |com, i|
        t.row([@fields[i], sprintf("%0.4f", initial_communalities[i]), sprintf("%0.3f", com)])
      end
    end

    var_header = [
      _("Factor"), _("I.E.Total"), _("I.E. %"), _("I.E.Cum. %"),
      _("S.L.Total"), _("S.L. %"), _("S.L.Cum. %")
    ]
    s.table(:name => _("Total Variance"), :header => var_header) do |t|
      ac_eigen, ac_i_eigen = 0, 0
      @initial_eigenvalues.each_with_index do |eigenvalue, i|
        ac_i_eigen += eigenvalue
        ac_eigen += @eigenvalues[i]
        initial_cells = [
          _("Factor %d") % (i + 1),
          sprintf("%0.3f", eigenvalue),
          sprintf("%0.3f%%", eigenvalue * 100.quo(@n_variables)),
          sprintf("%0.3f", ac_i_eigen * 100.quo(@n_variables))
        ]
        extraction_cells =
          if i < @m
            [
              sprintf("%0.3f", @eigenvalues[i]),
              sprintf("%0.3f%%", @eigenvalues[i] * 100.quo(@n_variables)),
              sprintf("%0.3f", ac_eigen * 100.quo(@n_variables))
            ]
          else
            ["", "", ""]
          end

        t.row(initial_cells + extraction_cells)
      end
    end
    s.parse_element(component_matrix)
  end
end
|
236
|
+
|
237
|
+
dirty_writer :max_iterations, :epsilon, :smc
|
238
|
+
dirty_memoize :eigenvalues, :iterations, :initial_eigenvalues
|
239
|
+
|
240
|
+
end
|
241
|
+
|
242
|
+
end
|
243
|
+
end
|