statsample-ekatena 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (156) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/.travis.yml +23 -0
  4. data/CONTRIBUTING.md +17 -0
  5. data/Gemfile +2 -0
  6. data/History.txt +457 -0
  7. data/LICENSE.txt +12 -0
  8. data/README.md +175 -0
  9. data/Rakefile +44 -0
  10. data/benchmarks/correlation_matrix_15_variables.rb +32 -0
  11. data/benchmarks/correlation_matrix_5_variables.rb +33 -0
  12. data/benchmarks/correlation_matrix_methods/correlation_matrix.ds +0 -0
  13. data/benchmarks/correlation_matrix_methods/correlation_matrix.html +93 -0
  14. data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +71 -0
  15. data/benchmarks/correlation_matrix_methods/correlation_matrix.xls +0 -0
  16. data/benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods +0 -0
  17. data/benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods +0 -0
  18. data/benchmarks/correlation_matrix_methods/results.ds +0 -0
  19. data/benchmarks/factor_map.rb +37 -0
  20. data/benchmarks/helpers_benchmark.rb +5 -0
  21. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  22. data/doc_latex/manual/equations.tex +78 -0
  23. data/examples/boxplot.rb +28 -0
  24. data/examples/chisquare_test.rb +23 -0
  25. data/examples/correlation_matrix.rb +32 -0
  26. data/examples/dataset.rb +30 -0
  27. data/examples/dominance_analysis.rb +33 -0
  28. data/examples/dominance_analysis_bootstrap.rb +32 -0
  29. data/examples/histogram.rb +26 -0
  30. data/examples/icc.rb +24 -0
  31. data/examples/levene.rb +29 -0
  32. data/examples/multiple_regression.rb +20 -0
  33. data/examples/multivariate_correlation.rb +33 -0
  34. data/examples/parallel_analysis.rb +40 -0
  35. data/examples/polychoric.rb +40 -0
  36. data/examples/principal_axis.rb +26 -0
  37. data/examples/reliability.rb +31 -0
  38. data/examples/scatterplot.rb +25 -0
  39. data/examples/t_test.rb +27 -0
  40. data/examples/tetrachoric.rb +17 -0
  41. data/examples/u_test.rb +24 -0
  42. data/examples/vector.rb +20 -0
  43. data/examples/velicer_map_test.rb +46 -0
  44. data/grab_references.rb +29 -0
  45. data/lib/spss.rb +134 -0
  46. data/lib/statsample-ekatena/analysis.rb +100 -0
  47. data/lib/statsample-ekatena/analysis/suite.rb +89 -0
  48. data/lib/statsample-ekatena/analysis/suitereportbuilder.rb +44 -0
  49. data/lib/statsample-ekatena/anova.rb +24 -0
  50. data/lib/statsample-ekatena/anova/contrast.rb +79 -0
  51. data/lib/statsample-ekatena/anova/oneway.rb +187 -0
  52. data/lib/statsample-ekatena/anova/twoway.rb +207 -0
  53. data/lib/statsample-ekatena/bivariate.rb +406 -0
  54. data/lib/statsample-ekatena/bivariate/pearson.rb +54 -0
  55. data/lib/statsample-ekatena/codification.rb +182 -0
  56. data/lib/statsample-ekatena/converter/csv.rb +28 -0
  57. data/lib/statsample-ekatena/converter/spss.rb +48 -0
  58. data/lib/statsample-ekatena/converters.rb +211 -0
  59. data/lib/statsample-ekatena/crosstab.rb +188 -0
  60. data/lib/statsample-ekatena/daru.rb +115 -0
  61. data/lib/statsample-ekatena/dataset.rb +10 -0
  62. data/lib/statsample-ekatena/dominanceanalysis.rb +425 -0
  63. data/lib/statsample-ekatena/dominanceanalysis/bootstrap.rb +232 -0
  64. data/lib/statsample-ekatena/factor.rb +104 -0
  65. data/lib/statsample-ekatena/factor/map.rb +124 -0
  66. data/lib/statsample-ekatena/factor/parallelanalysis.rb +166 -0
  67. data/lib/statsample-ekatena/factor/pca.rb +242 -0
  68. data/lib/statsample-ekatena/factor/principalaxis.rb +243 -0
  69. data/lib/statsample-ekatena/factor/rotation.rb +198 -0
  70. data/lib/statsample-ekatena/formula/fit_model.rb +46 -0
  71. data/lib/statsample-ekatena/formula/formula.rb +306 -0
  72. data/lib/statsample-ekatena/graph.rb +11 -0
  73. data/lib/statsample-ekatena/graph/boxplot.rb +236 -0
  74. data/lib/statsample-ekatena/graph/histogram.rb +198 -0
  75. data/lib/statsample-ekatena/graph/scatterplot.rb +213 -0
  76. data/lib/statsample-ekatena/histogram.rb +180 -0
  77. data/lib/statsample-ekatena/matrix.rb +329 -0
  78. data/lib/statsample-ekatena/multiset.rb +310 -0
  79. data/lib/statsample-ekatena/regression.rb +65 -0
  80. data/lib/statsample-ekatena/regression/multiple.rb +89 -0
  81. data/lib/statsample-ekatena/regression/multiple/alglibengine.rb +128 -0
  82. data/lib/statsample-ekatena/regression/multiple/baseengine.rb +251 -0
  83. data/lib/statsample-ekatena/regression/multiple/gslengine.rb +129 -0
  84. data/lib/statsample-ekatena/regression/multiple/matrixengine.rb +205 -0
  85. data/lib/statsample-ekatena/regression/multiple/rubyengine.rb +86 -0
  86. data/lib/statsample-ekatena/regression/simple.rb +121 -0
  87. data/lib/statsample-ekatena/reliability.rb +150 -0
  88. data/lib/statsample-ekatena/reliability/icc.rb +415 -0
  89. data/lib/statsample-ekatena/reliability/multiscaleanalysis.rb +181 -0
  90. data/lib/statsample-ekatena/reliability/scaleanalysis.rb +233 -0
  91. data/lib/statsample-ekatena/reliability/skillscaleanalysis.rb +114 -0
  92. data/lib/statsample-ekatena/resample.rb +15 -0
  93. data/lib/statsample-ekatena/shorthand.rb +125 -0
  94. data/lib/statsample-ekatena/srs.rb +169 -0
  95. data/lib/statsample-ekatena/test.rb +82 -0
  96. data/lib/statsample-ekatena/test/bartlettsphericity.rb +45 -0
  97. data/lib/statsample-ekatena/test/chisquare.rb +73 -0
  98. data/lib/statsample-ekatena/test/f.rb +52 -0
  99. data/lib/statsample-ekatena/test/kolmogorovsmirnov.rb +63 -0
  100. data/lib/statsample-ekatena/test/levene.rb +88 -0
  101. data/lib/statsample-ekatena/test/t.rb +309 -0
  102. data/lib/statsample-ekatena/test/umannwhitney.rb +208 -0
  103. data/lib/statsample-ekatena/test/wilcoxonsignedrank.rb +90 -0
  104. data/lib/statsample-ekatena/vector.rb +19 -0
  105. data/lib/statsample-ekatena/version.rb +3 -0
  106. data/lib/statsample.rb +282 -0
  107. data/po/es/statsample.mo +0 -0
  108. data/po/es/statsample.po +959 -0
  109. data/po/statsample.pot +947 -0
  110. data/references.txt +24 -0
  111. data/statsample-ekatena.gemspec +49 -0
  112. data/test/fixtures/bank2.dat +200 -0
  113. data/test/fixtures/correlation_matrix.rb +17 -0
  114. data/test/fixtures/df.csv +15 -0
  115. data/test/fixtures/hartman_23.matrix +9 -0
  116. data/test/fixtures/stock_data.csv +500 -0
  117. data/test/fixtures/tetmat_matrix.txt +5 -0
  118. data/test/fixtures/tetmat_test.txt +1001 -0
  119. data/test/helpers_tests.rb +83 -0
  120. data/test/test_analysis.rb +176 -0
  121. data/test/test_anova_contrast.rb +36 -0
  122. data/test/test_anovaoneway.rb +26 -0
  123. data/test/test_anovatwoway.rb +37 -0
  124. data/test/test_anovatwowaywithdataset.rb +47 -0
  125. data/test/test_anovawithvectors.rb +102 -0
  126. data/test/test_awesome_print_bug.rb +16 -0
  127. data/test/test_bartlettsphericity.rb +25 -0
  128. data/test/test_bivariate.rb +164 -0
  129. data/test/test_codification.rb +78 -0
  130. data/test/test_crosstab.rb +67 -0
  131. data/test/test_dominance_analysis.rb +39 -0
  132. data/test/test_factor.rb +228 -0
  133. data/test/test_factor_map.rb +38 -0
  134. data/test/test_factor_pa.rb +56 -0
  135. data/test/test_fit_model.rb +88 -0
  136. data/test/test_ggobi.rb +35 -0
  137. data/test/test_gsl.rb +15 -0
  138. data/test/test_histogram.rb +109 -0
  139. data/test/test_matrix.rb +48 -0
  140. data/test/test_multiset.rb +176 -0
  141. data/test/test_regression.rb +231 -0
  142. data/test/test_reliability.rb +223 -0
  143. data/test/test_reliability_icc.rb +198 -0
  144. data/test/test_reliability_skillscale.rb +57 -0
  145. data/test/test_resample.rb +24 -0
  146. data/test/test_srs.rb +9 -0
  147. data/test/test_statistics.rb +69 -0
  148. data/test/test_stest.rb +69 -0
  149. data/test/test_stratified.rb +17 -0
  150. data/test/test_test_f.rb +33 -0
  151. data/test/test_test_kolmogorovsmirnov.rb +34 -0
  152. data/test/test_test_t.rb +62 -0
  153. data/test/test_umannwhitney.rb +27 -0
  154. data/test/test_vector.rb +12 -0
  155. data/test/test_wilcoxonsignedrank.rb +64 -0
  156. metadata +570 -0
@@ -0,0 +1,166 @@
1
+ module Statsample
2
+ module Factor
3
+ # Performs Horn's 'parallel analysis' to a principal components analysis
4
+ # to adjust for sample bias in the retention of components.
5
+ # Can create the bootstrap samples using random data, using number
6
+ # of cases and variables, parameters for actual data (mean and standard
7
+ # deviation of each variable) or bootstrap sampling for actual data.
8
+ # == Description
9
+ # "PA involves the construction of a number of correlation matrices of random variables based on the same sample size and number of variables in the real data set. The average eigenvalues from the random correlation matrices are then compared to the eigenvalues from the real data correlation matrix, such that the first observed eigenvalue is compared to the first random eigenvalue, the second observed eigenvalue is compared to the second random eigenvalue, and so on." (Hayton, Allen & Scarpello, 2004, p.194)
10
+ # == Usage
11
+ # *With real dataset*
12
+ # # ds should be any valid dataset
13
+ # pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>100, :bootstrap_method=>:data)
14
+ #
15
+ # *With number of cases and variables*
16
+ # pa=Statsample::Factor::ParallelAnalysis.with_random_data(100,8)
17
+ #
18
+ # == Reference
19
+ # * Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. <i>Organizational Research Methods, 7</i> (2), 191-205.
20
+ # * O'Connor, B. (2000). SPSS and SAS programs for determining the number of components using parallel analysis and Velicer's MAP test. Behavior Research Methods, Instruments, & Computers, 32(3), 396-402.
21
+ # * Liu, O., & Rijmen, F. (2008). A modified procedure for parallel analysis of ordered categorical data. Behavior Research Methods, 40(2), 556-562.
22
+
23
# Horn's parallel analysis: compares the eigenvalues of the real data with
# eigenvalues obtained from bootstrapped/random datasets of the same size.
class ParallelAnalysis
  include DirtyMemoize
  include Summarizable

  # Builds an analysis that only uses the number of cases and variables.
  #
  # cases:: value passed as +index+ to the mock Daru::DataFrame
  # vars::  number of variables; columns are named :v1, :v2, ...
  # opts::  extra options for #initialize. :bootstrap_method and :no_data
  #         are always overridden with :random and true.
  def self.with_random_data(cases, vars, opts = Hash.new)
    ds = Daru::DataFrame.new({},
                             order: vars.times.map { |i| "v#{i + 1}".to_sym },
                             index: cases)
    new(ds, opts.merge(:bootstrap_method => :random, :no_data => true))
  end

  # Number of random sets to produce. 50 by default
  attr_accessor :iterations
  # Name of analysis
  attr_accessor :name
  # Dataset. Mock vectors can be used with the :random bootstrap method
  attr_reader :ds
  # Bootstrap method. <tt>:random</tt> used by default
  # * <tt>:random</tt>: uses number of variables and cases for the dataset
  # * <tt>:data</tt>  : sample with replacement from actual data.
  attr_accessor :bootstrap_method
  # Uses SMC on the diagonal of the matrices, to simulate
  # a Principal Axis analysis.
  # By default, false.
  attr_accessor :smc
  # Percentile of the bootstrapped eigenvalues that a data eigenvalue
  # must exceed to be retained. 95 by default
  attr_accessor :percentil
  # Name of the Statsample::Bivariate method used to build the matrix.
  # <tt>:correlation_matrix</tt> used by default
  attr_accessor :matrix_method
  # Number of eigenvalues to calculate. Should be set for
  # Principal Axis Analysis.
  attr_accessor :n_variables
  # Dataset with bootstrapped eigenvalues
  attr_reader :ds_eigenvalues
  # Perform analysis without actual data.
  attr_accessor :no_data
  # Show extra information if true
  attr_accessor :debug
  # Convert each bootstrapped matrix to GSL before extracting eigenvalues
  attr_accessor :use_gsl

  # ds::   dataset; must respond to +vectors+ and +nrows+
  # opts:: any of :name, :iterations, :bootstrap_method, :smc, :percentil,
  #        :debug, :no_data, :matrix_method. Other keys are ignored.
  def initialize(ds, opts = Hash.new)
    @ds = ds
    @fields = @ds.vectors.to_a
    @n_variables = @fields.size
    @n_cases = ds.nrows
    opts_default = {
      :name => _("Parallel Analysis"),
      :iterations => 50, # See Liu and Rijmen (2008)
      :bootstrap_method => :random,
      :smc => false,
      :percentil => 95,
      :debug => false,
      :no_data => false,
      :matrix_method => :correlation_matrix
    }
    @use_gsl = Statsample.has_gsl?
    @opts = opts_default.merge(opts)
    # The original line used `==` here, which compared and discarded the
    # result; the assignment is what forces the matrix method.
    @opts[:matrix_method] = :correlation_matrix if @opts[:bootstrap_method] == :parameters
    # Go through the writers so dirty_writer can flag the cache as stale.
    opts_default.each_key { |k| send("#{k}=", @opts[k]) }
  end

  # Number of factors to retain: counts leading data eigenvalues that are
  # positive and greater than the chosen percentile of the bootstrapped
  # eigenvalues, stopping at the first one that is not.
  def number_of_factors
    total = 0
    ds_eigenvalues.vectors.to_a.each_with_index do |f, i|
      break unless @original[i] > 0 && @original[i] > ds_eigenvalues[f].percentil(percentil)
      total += 1
    end
    total
  end

  def report_building(g) #:nodoc:
    g.section(:name => @name) do |s|
      s.text(_("Bootstrap Method: %s") % bootstrap_method)
      s.text(_("Uses SMC: %s") % (smc ? _("Yes") : _("No")))
      s.text(_("Correlation Matrix type : %s") % matrix_method)
      s.text(_("Number of variables: %d") % @n_variables)
      s.text(_("Number of cases: %d") % @n_cases)
      s.text(_("Number of iterations: %d") % @iterations)
      if @no_data
        s.table(:name => _("Eigenvalues"), :header => [_("n"), _("generated eigenvalue"), "p.#{percentil}"]) do |t|
          ds_eigenvalues.vectors.to_a.each_with_index do |f, i|
            v = ds_eigenvalues[f]
            t.row [i + 1, "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil)]
          end
        end
      else
        s.text(_("Number or factors to preserve: %d") % number_of_factors)
        s.table(:name => _("Eigenvalues"), :header => [_("n"), _("data eigenvalue"), _("generated eigenvalue"), "p.#{percentil}", _("preserve?")]) do |t|
          ds_eigenvalues.vectors.to_a.each_with_index do |f, i|
            v = ds_eigenvalues[f]
            cutoff = v.percentil(percentil)
            # NOTE(review): this per-row test differs from #number_of_factors
            # (which tests the data eigenvalue for positivity and stops at
            # the first failure) - confirm which rule is intended.
            keep = (cutoff > 0 && @original[i] > cutoff) ? "Yes" : ""
            t.row [i + 1, "%0.4f" % @original[i], "%0.4f" % v.mean, "%0.4f" % cutoff, keep]
          end
        end
      end
    end
  end

  # Perform calculation. Shouldn't be called directly by the user.
  #
  # Fills @original (unless +no_data+) and @ds_eigenvalues, adding one row
  # of eigenvalues per iteration.
  # Raises RuntimeError when +bootstrap_method+ is neither :random nor :data.
  def compute
    @original = Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data
    @ds_eigenvalues = Daru::DataFrame.new({}, order: (1..@n_variables).map { |v| ("ev_%05d" % v).to_sym })

    # The original tested :parameter (singular), which no other code uses.
    # The generation loop below still has no :parameters branch, so that
    # method ends in the RuntimeError.
    rng = nil
    if bootstrap_method == :parameters || bootstrap_method == :random
      rng = Distribution::Normal.rng
    end

    @iterations.times do |i|
      begin
        puts "#{@name}: Iteration #{i}" if $DEBUG || debug
        # Create a dataset of dummy values
        ds_bootstrap = Daru::DataFrame.new({}, order: @ds.vectors, index: @n_cases)

        @fields.each do |f|
          if bootstrap_method == :random
            ds_bootstrap[f] = Daru::Vector.new(@n_cases.times.map { |_c| rng.call })
          elsif bootstrap_method == :data
            ds_bootstrap[f] = ds[f].sample_with_replacement(@n_cases)
          else
            raise "bootstrap_method doesn't recogniced"
          end
        end

        matrix = Statsample::Bivariate.send(matrix_method, ds_bootstrap)
        matrix = matrix.to_gsl if @use_gsl
        if smc
          # Replace the diagonal with squared multiple correlations.
          smc_v = matrix.inverse.diagonal.map { |ii| 1 - (1.quo(ii)) }
          smc_v.each_with_index do |v, ii|
            matrix[ii, ii] = v
          end
        end
        @ds_eigenvalues.add_row(matrix.eigenvalues)
      rescue Statsample::Bivariate::Tetrachoric::RequerimentNotMeet => e
        puts "Error: #{e}" if $DEBUG
        # Repeats this iteration with a fresh sample; there is no retry limit.
        redo
      end
    end
  end

  dirty_memoize :number_of_factors, :ds_eigenvalues
  dirty_writer :iterations, :bootstrap_method, :percentil, :smc
end
165
+ end
166
+ end
@@ -0,0 +1,242 @@
1
+ # encoding: UTF-8
2
+ module Statsample
3
+ module Factor
4
+ # Principal Component Analysis (PCA) of a covariance or
5
+ # correlation matrix.
6
+ #
7
+ # NOTE: Sign of second and later eigenvectors could be different
8
+ # using Ruby or GSL, so values for PCs and component matrix
9
+ # should differ, because extendmatrix and gsl's methods to calculate
10
+ # eigenvectors are different. Using R is worse, cause first
11
+ # eigenvector could have negative values!
12
+ # For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
13
+ #
14
+ # == Usage:
15
+ # require 'statsample'
16
+ # a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
17
+ # b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
18
+ # ds = Daru::DataFrame.new({:a => a,:b => b})
19
+ # cor_matrix = Statsample::Bivariate.correlation_matrix(ds)
20
+ # pca= Statsample::Factor::PCA.new(cor_matrix)
21
+ # pca.m
22
+ # => 1
23
+ # pca.eigenvalues
24
+ # => [1.92592927269225, 0.0740707273077545]
25
+ # pca.component_matrix
26
+ # => GSL::Matrix
27
+ # [ 9.813e-01
28
+ # 9.813e-01 ]
29
+ # pca.communalities
30
+ # => [0.962964636346122, 0.962964636346122]
31
+ #
32
+ # == References:
33
+ # * SPSS Manual
34
+ # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
35
+ # * Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer
36
+ #
37
+ class PCA
38
+ include Summarizable
39
+ # Name of analysis
40
+ attr_accessor :name
41
+
42
+ # Number of factors. Set by default to the number of factors
43
+ # with eigen values > 1
44
+ attr_accessor :m
45
+ # Use GSL if available
46
+ attr_accessor :use_gsl
47
+ # Add to the summary a rotation report
48
+ attr_accessor :summary_rotation
49
+ # Add to the summary a parallel analysis report
50
+ attr_accessor :summary_parallel_analysis
51
+ # Type of rotation. By default, Statsample::Factor::Rotation::Varimax
52
+ attr_accessor :rotation_type
53
+ attr_accessor :matrix_type
54
+ def initialize(matrix, opts=Hash.new)
55
+ @use_gsl = opts[:use_gsl]
56
+ opts.delete :use_gsl
57
+
58
+ @name=_("Principal Component Analysis")
59
+ @matrix=matrix
60
+ @n_variables=@matrix.column_size
61
+ @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| "VAR_#{i+1}".to_sym }
62
+
63
+ @matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation
64
+
65
+ @m=nil
66
+
67
+ @rotation_type=Statsample::Factor::Varimax
68
+
69
+ opts.each{|k,v|
70
+ self.send("#{k}=",v) if self.respond_to? k
71
+ }
72
+
73
+ if @use_gsl.nil?
74
+ @use_gsl=Statsample.has_gsl?
75
+ end
76
+ if @matrix.respond_to? :fields
77
+ @variables_names=@matrix.fields
78
+ else
79
+ @variables_names=@n_variables.times.map {|i| "V#{i+1}".to_sym}
80
+ end
81
+ calculate_eigenpairs
82
+
83
+ if @m.nil?
84
+ # Set number of factors with eigenvalues > 1
85
+ @m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size
86
+ end
87
+ end
88
+ def rotation
89
+ @rotation_type.new(component_matrix)
90
+ end
91
+ def total_eigenvalues
92
+ eigenvalues.inject(0) {|ac,v| ac+v}
93
+ end
94
+ def create_centered_ds
95
+ h={}
96
+ @original_ds.factors.each {|f|
97
+ mean = @original_ds[f].mean
98
+ h[f] = @original_ds[f].recode {|c| c-mean}
99
+ }
100
+ @ds = Daru::DataFrame.new(h)
101
+ end
102
+
103
+ # Feature matrix for +m+ factors
104
+ # Returns +m+ eigenvectors as columns.
105
+ # So, i=variable, j=component
106
+ def feature_matrix(m=nil)
107
+ m||=@m
108
+ if @use_gsl
109
+ omega_m=GSL::Matrix.zeros(@n_variables,m)
110
+ ev=eigenvectors
111
+ m.times do |i|
112
+ omega_m.set_column(i,ev[i])
113
+ end
114
+ omega_m
115
+ else
116
+ omega_m=::Matrix.build(@n_variables, m) {0}
117
+ m.times do |i|
118
+ omega_m.column= i, @eigenpairs[i][1]
119
+ end
120
+ omega_m
121
+ end
122
+ end
123
+ # Returns Principal Components for +input+ matrix or dataset
124
+ # The number of PC to return is equal to parameter +m+.
125
+ # If +m+ isn't set, m set to number of PCs selected at object creation.
126
+ # Use covariance matrix
127
+
128
+ def principal_components(input, m=nil)
129
+ if @use_gsl
130
+ data_matrix=input.to_gsl
131
+ else
132
+ data_matrix=input.to_matrix
133
+ end
134
+ m||=@m
135
+
136
+ raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables
137
+
138
+ fv=feature_matrix(m)
139
+ pcs=(fv.transpose*data_matrix.transpose).transpose
140
+
141
+ pcs.extend Statsample::NamedMatrix
142
+ pcs.fields_y = m.times.map { |i| "PC_#{i+1}".to_sym }
143
+ pcs.to_dataframe
144
+ end
145
+ def component_matrix(m=nil)
146
+ var="component_matrix_#{matrix_type}"
147
+ send(var,m)
148
+ end
149
+ # Matrix with correlations between components and
150
+ # variables. Based on Härdle & Simar (2003, p.243)
151
+ def component_matrix_covariance(m=nil)
152
+ m||=@m
153
+ raise "m should be > 0" if m<1
154
+ ff=feature_matrix(m)
155
+ cm=::Matrix.build(@n_variables, m) {0}
156
+ @n_variables.times {|i|
157
+ m.times {|j|
158
+ cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i])
159
+ }
160
+ }
161
+ cm.extend NamedMatrix
162
+ cm.name=_("Component matrix (from covariance)")
163
+ cm.fields_x = @variables_names
164
+ cm.fields_y = m.times.map {|i| "PC_#{i+1}".to_sym }
165
+
166
+ cm
167
+ end
168
+ # Matrix with correlations between components and
169
+ # variables
170
+ def component_matrix_correlation(m=nil)
171
+ m||=@m
172
+ raise "m should be > 0" if m<1
173
+ omega_m=::Matrix.build(@n_variables, m) {0}
174
+ gammas=[]
175
+ m.times {|i|
176
+ omega_m.column=i, @eigenpairs[i][1]
177
+ gammas.push(Math::sqrt(@eigenpairs[i][0]))
178
+ }
179
+ gamma_m=::Matrix.diagonal(*gammas)
180
+ cm=(omega_m*(gamma_m)).to_matrix
181
+
182
+ cm.extend CovariateMatrix
183
+ cm.name=_("Component matrix")
184
+ cm.fields_x = @variables_names
185
+ cm.fields_y = m.times.map { |i| "PC_#{i+1}".to_sym }
186
+ cm
187
+ end
188
+ def communalities(m=nil)
189
+ m||=@m
190
+ h=[]
191
+ @n_variables.times do |i|
192
+ sum=0
193
+ m.times do |j|
194
+ sum += (@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2)
195
+ end
196
+ h.push(sum)
197
+ end
198
+ h
199
+ end
200
+ # Array with eigenvalues
201
+ def eigenvalues
202
+ @eigenpairs.collect {|c| c[0] }
203
+ end
204
+ def eigenvectors
205
+ @eigenpairs.collect {|c|
206
+ @use_gsl ? c[1].to_gsl : Daru::Vector.new(c[1])
207
+ }
208
+ end
209
+ def calculate_eigenpairs
210
+ @eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby
211
+ end
212
+
213
+
214
+ def report_building(builder) # :nodoc:
215
+ builder.section(:name=>@name) do |generator|
216
+ generator.text _("Number of factors: %d") % m
217
+ generator.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction"), _("%")]) do |t|
218
+ communalities(m).each_with_index {|com, i|
219
+ perc=com*100.quo(@matrix[i,i])
220
+ t.row([@variables_names[i], "%0.3f" % @matrix[i,i] , "%0.3f" % com, "%0.3f" % perc])
221
+ }
222
+ end
223
+ te=total_eigenvalues
224
+ generator.table(:name=>_("Total Variance Explained"), :header=>[_("Component"), _("E.Total"), _("%"), _("Cum. %")]) do |t|
225
+ ac_eigen=0
226
+ eigenvalues.each_with_index {|eigenvalue,i|
227
+ ac_eigen+=eigenvalue
228
+ t.row([_("Component %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(te)), sprintf("%0.3f",ac_eigen*100.quo(te))])
229
+ }
230
+ end
231
+
232
+ generator.parse_element(component_matrix(m))
233
+
234
+ if (summary_rotation)
235
+ generator.parse_element(rotation)
236
+ end
237
+ end
238
+ end
239
+ private :calculate_eigenpairs, :create_centered_ds
240
+ end
241
+ end
242
+ end
@@ -0,0 +1,243 @@
1
+ module Statsample
2
+ module Factor
3
+ # Principal Axis Analysis for a covariance or correlation matrix.
4
+ #
5
+ # For PCA, use Statsample::Factor::PCA
6
+ #
7
+ # == Usage:
8
+ # require 'statsample'
9
+ # a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
10
+ # b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
11
+ # ds= Daru::DataFrame.new({:a => a,:b => b})
12
+ # cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
13
+ # pa=Statsample::Factor::PrincipalAxis.new(cor_matrix)
14
+ # pa.iterate(1)
15
+ # pa.m
16
+ # => 1
17
+ # pa.component_matrix
18
+ # => GSL::Matrix
19
+ # [ 9.622e-01
20
+ # 9.622e-01 ]
21
+ # pa.communalities
22
+ # => [0.962964636346122, 0.962964636346122]
23
+ #
24
+ # == References:
25
+ # * SPSS Manual
26
+ # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
27
+ #
28
# Principal Axis factoring: repeatedly runs a PCA on the matrix with its
# diagonal replaced by the current communality estimates.
class PrincipalAxis
  include DirtyMemoize
  include Summarizable

  # Name of analysis
  attr_accessor :name

  # Number of factors. Set by default to the number of factors
  # with eigenvalues > 1 (Kaiser criterion).
  #
  # _Warning:_ Kaiser criterion overfactors! Give yourself some time
  # and use Horn's Parallel Analysis.
  #
  attr_accessor :m

  # Number of iterations performed by the last call to #iterate
  attr_reader :iterations

  # Initial eigenvalues
  attr_reader :initial_eigenvalues

  # Tolerance for iterations. When nil (the default), DELTA is used.
  attr_accessor :epsilon

  # Use SMC(squared multiple correlations) as diagonal. If false, use 1
  attr_accessor :smc

  # Maximum number of iterations
  attr_accessor :max_iterations

  # Eigenvalues of factor analysis
  attr_reader :eigenvalues

  # Minimum difference between successive iterations on sum of communalities
  DELTA = 1e-3
  # Maximum number of iterations
  MAX_ITERATIONS = 25

  # matrix:: square matrix; must respond to +row_size+, +column_size+,
  #          +to_a+ and +inverse+. +fields+ is used when available.
  # opts::   any attribute of this class, e.g. :m, :smc, :epsilon,
  #          :max_iterations. Keys without a matching reader are ignored.
  def initialize(matrix, opts = Hash.new)
    @matrix = matrix
    has_fields = @matrix.respond_to?(:fields)
    @fields =
      if has_fields
        @matrix.fields
      else
        @matrix.row_size.times.map { |i| _("Variable %d") % (i + 1) }
      end
    @n_variables = @matrix.row_size
    @name = ""
    @m = nil
    @initial_eigenvalues = nil
    @initial_communalities = nil
    @component_matrix = nil
    @delta = DELTA
    @epsilon = nil
    @smc = true
    @max_iterations = MAX_ITERATIONS
    opts.each do |k, v|
      send("#{k}=", v) if respond_to?(k)
    end
    @variables_names =
      if @matrix.respond_to?(:fields)
        @matrix.fields
      else
        @n_variables.times.map { |i| "V#{i + 1}" }
      end
    # Without an explicit :m, take the number of factors a PCA would keep.
    @m = PCA.new(::Matrix.rows(@matrix.to_a)).m if @m.nil?

    @clean = true
  end

  # Communality for all variables given m factors.
  # Re-runs #iterate whenever +m+ differs from the stored number of
  # factors, which includes every call that omits +m+.
  def communalities(m = nil)
    if m != @m || @clean
      iterate(m)
      raise "Can't calculate comunality" if @communalities.nil?
    end
    @communalities
  end

  # Component matrix for m factors.
  # Re-runs #iterate under the same conditions as #communalities.
  def component_matrix(m = nil)
    iterate(m) if m != @m || @clean
    @component_matrix
  end

  # Iterate to find the factors.
  #
  # m:: number of factors; defaults to, and is stored as, +m+.
  #
  # Stops after +max_iterations+ rounds, or earlier when the sum of
  # communalities changes by less than the tolerance (+epsilon+ when set,
  # otherwise DELTA).
  # Raises RuntimeError when a non-converged round yields a communality > 1.
  def iterate(m = nil)
    @clean = false
    m ||= @m
    @m = m
    # Previously +epsilon+ could be set but was never read.
    tolerance = @epsilon || @delta
    work_matrix = @matrix.to_a

    prev_com = initial_communalities

    pca = PCA.new(::Matrix.rows(work_matrix))
    @initial_eigenvalues = pca.eigenvalues
    prev_sum = prev_com.inject(0) { |ac, v| ac + v }
    @iterations = 0
    @max_iterations.times do |i|
      # The original built this string but never printed it.
      puts "#{@name}: Iteration #{i}" if $DEBUG
      @iterations += 1
      # Put the current communality estimates on the diagonal.
      prev_com.each_with_index { |v, it| work_matrix[it][it] = v }
      pca = PCA.new(::Matrix.rows(work_matrix))
      @communalities = pca.communalities(m)
      @eigenvalues = pca.eigenvalues
      com_sum = @communalities.inject(0) { |ac, v| ac + v }

      break if (com_sum - prev_sum).abs < tolerance
      @communalities.each_with_index do |v2, i2|
        raise "Variable #{i2} with communality > 1" if v2 > 1.0
      end
      prev_sum = com_sum
      prev_com = @communalities
    end
    @component_matrix = pca.component_matrix(m)
    @component_matrix.extend CovariateMatrix
    @component_matrix.name = _("Factor Matrix")
    @component_matrix.fields_x = @variables_names
    @component_matrix.fields_y = m.times.map { |i| "factor_#{i + 1}" }
  end
  alias :compute :iterate

  # Starting communalities, computed once and cached: squared multiple
  # correlations when +smc+ is true, otherwise 1.0 for every variable.
  def initial_communalities
    if @initial_communalities.nil?
      if @smc
        # Based on O'Connors(2000)
        @initial_communalities = @matrix.inverse.diagonal.map { |i| 1 - (1.quo(i)) }
        # Alternative formulation, kept for reference:
        #   @initial_communalities=@matrix.column_size.times.collect {|i|
        #     rxx , rxy = PrincipalAxis.separate_matrices(@matrix,i)
        #     matrix=(rxy.t*rxx.inverse*rxy)
        #     matrix[0,0]
        #   }
      else
        @initial_communalities = [1.0] * @matrix.column_size
      end
    end
    @initial_communalities
  end

  # Returns two matrices from a correlation matrix: the regressors'
  # correlation matrix (rxx, without row and column +y+) and the
  # criteria xy matrix (rxy, row +y+ without its own entry, as a column).
  def self.separate_matrices(matrix, y)
    column_idx = (0...matrix.column_size).reject { |i| i == y }
    rxy = Matrix.columns([column_idx.map { |i| matrix[y, i] }])

    row_idx = (0...matrix.row_size).reject { |i| i == y }
    rxx = Matrix.rows(row_idx.map { |i| row_idx.map { |j| matrix[i, j] } })

    [rxx, rxy]
  end

  def report_building(generator) # :nodoc:
    iterate if @clean
    generator.section(:name => @name) do |s|
      s.text(_("Number of factors: %d") % m)
      s.text(_("Iterations: %d") % @iterations)
      s.table(:name => _("Communalities"), :header => [_("Variable"), _("Initial"), _("Extraction")]) do |t|
        communalities(m).each_with_index do |com, i|
          t.row([@fields[i], sprintf("%0.4f", initial_communalities[i]), sprintf("%0.3f", com)])
        end
      end
      s.table(:name => _("Total Variance"), :header => [_("Factor"), _("I.E.Total"), _("I.E. %"), _("I.E.Cum. %"),
                                                        _("S.L.Total"), _("S.L. %"), _("S.L.Cum. %")]) do |t|
        ac_eigen, ac_i_eigen = 0, 0
        @initial_eigenvalues.each_with_index do |eigenvalue, i|
          ac_i_eigen += eigenvalue
          ac_eigen += @eigenvalues[i]
          new_row = [
            _("Factor %d") % (i + 1),
            sprintf("%0.3f", eigenvalue),
            sprintf("%0.3f%%", eigenvalue * 100.quo(@n_variables)),
            sprintf("%0.3f", ac_i_eigen * 100.quo(@n_variables))
          ]
          # Extraction columns are only filled for the retained factors.
          if i < @m
            new_row.concat [
              sprintf("%0.3f", @eigenvalues[i]),
              sprintf("%0.3f%%", @eigenvalues[i] * 100.quo(@n_variables)),
              sprintf("%0.3f", ac_eigen * 100.quo(@n_variables))
            ]
          else
            new_row.concat ["", "", ""]
          end

          t.row new_row
        end
      end
      s.parse_element(component_matrix)
    end
  end

  dirty_writer :max_iterations, :epsilon, :smc
  dirty_memoize :eigenvalues, :iterations, :initial_eigenvalues
end
241
+
242
+ end
243
+ end