statsample 0.17.0 → 0.18.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +27 -0
- data/Manifest.txt +1 -0
- data/Rakefile +2 -3
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/examples/histogram.rb +6 -7
- data/lib/statsample.rb +25 -1
- data/lib/statsample/bivariate.rb +0 -7
- data/lib/statsample/converters.rb +3 -1
- data/lib/statsample/dataset.rb +44 -10
- data/lib/statsample/factor.rb +12 -1
- data/lib/statsample/factor/map.rb +14 -6
- data/lib/statsample/factor/parallelanalysis.rb +1 -4
- data/lib/statsample/factor/pca.rb +86 -25
- data/lib/statsample/graph.rb +4 -0
- data/lib/statsample/graph/boxplot.rb +39 -28
- data/lib/statsample/graph/histogram.rb +78 -14
- data/lib/statsample/graph/scatterplot.rb +61 -11
- data/lib/statsample/histogram.rb +37 -1
- data/lib/statsample/matrix.rb +74 -31
- data/lib/statsample/multiset.rb +36 -0
- data/lib/statsample/reliability/multiscaleanalysis.rb +24 -5
- data/lib/statsample/reliability/scaleanalysis.rb +9 -5
- data/lib/statsample/reliability/skillscaleanalysis.rb +20 -4
- data/lib/statsample/vector.rb +65 -49
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +255 -108
- data/po/statsample.pot +245 -98
- data/test/fixtures/bank2.dat +200 -0
- data/test/helpers_tests.rb +22 -3
- data/test/test_factor.rb +115 -17
- data/test/test_histogram.rb +25 -1
- data/test/test_matrix.rb +17 -1
- data/test/test_multiset.rb +66 -2
- data/test/test_vector.rb +21 -2
- metadata +15 -32
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
|
data/History.txt
CHANGED
@@ -1,3 +1,30 @@
|
|
1
|
+
=== 0.18.0 / 2011-01-07
|
2
|
+
* New Statsample.load_excel
|
3
|
+
* New Statsample.load_csv
|
4
|
+
* Statsample::Dataset#[] accepts an array of fields and uses clone
|
5
|
+
* New Dataset#correlation_matrix and Statsample::Dataset#covariance_matrix
|
6
|
+
* Statsample::Dataset.filter add labels to vectors
|
7
|
+
* Principal Components generation complete on PCA (covariance matrix preferred)
|
8
|
+
* Added note on Statsample::Factor::PCA about erratic signs on eigenvalues,
|
9
|
+
* Statsample::Factor::PCA.component_matrix calculated different for covariance matrix
|
10
|
+
* Improved summary for PCA using covariance matrix
|
11
|
+
* New attribute :label_angle for Statsample::Graph::Boxplot
|
12
|
+
* Fixed Scatterplots scaling problems
|
13
|
+
* New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x,
|
14
|
+
* New Statsample::Multiset#union allows to create a new dataset based on a m
|
15
|
+
* New Statsample::Multiset#each to traverse through datasets
|
16
|
+
* Bug fix: Vector#standarized and Vector#percentile crash on nil data
|
17
|
+
* Bug fix: Vector#mean and Vector#sd crash on data without valid values
|
18
|
+
* Modified method names on Statsample::Factor::PCA : feature_vector to feature_matrix, data_transformation to principal_components
|
19
|
+
* Added Statsample::Vector.vector_centered
|
20
|
+
* Factor::MAP.with_dataset() implemented
|
21
|
+
* Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes * Added documentation for Graph::Histogram
|
22
|
+
* Added MPA to Reliability::MultiScaleAnalysis
|
23
|
+
* Added custom names for returned vectors and datasets
|
24
|
+
* Updated Spanish translation
|
25
|
+
* Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing
|
26
|
+
* Updated Histogram class, with several new methods compatible with GSL::Histogram
|
27
|
+
|
1
28
|
=== 0.17.0 / 2010-12-09
|
2
29
|
* Added Statsample::Graph::Histogram and Statsample::Graph::Boxplot
|
3
30
|
* Added Statsample::Reliability::SkillScaleAnalysis for analysis of skill based scales.
|
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -5,7 +5,6 @@ $:.unshift(File.dirname(__FILE__)+'/lib/')
|
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
7
|
require 'statsample'
|
8
|
-
|
9
8
|
require 'hoe'
|
10
9
|
Hoe.plugin :git
|
11
10
|
|
@@ -41,9 +40,9 @@ h=Hoe.spec('statsample') do
|
|
41
40
|
#self.testlib=:minitest
|
42
41
|
self.rubyforge_name = "ruby-statsample"
|
43
42
|
self.developer('Claudio Bustos', 'clbustos@gmail.com')
|
44
|
-
self.extra_deps << ["spreadsheet","~>0.6.
|
43
|
+
self.extra_deps << ["spreadsheet","~>0.6.5"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client", "~>0.2.5"] << ["rubyvis", "~>0.4.0"]
|
45
44
|
|
46
|
-
self.extra_dev_deps << ["shoulda"] << ["minitest", "~>2.0"]
|
45
|
+
self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>0"] << ["minitest", "~>2.0"]
|
47
46
|
self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
|
48
47
|
self.post_install_message = <<-EOF
|
49
48
|
***************************************************
|
Binary file
|
data/examples/histogram.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
#!/usr/bin/ruby
|
2
2
|
$:.unshift(File.dirname(__FILE__)+'/../lib/')
|
3
3
|
$:.unshift('/home/cdx/dev/reportbuilder/lib/')
|
4
|
-
|
5
|
-
require 'benchmark'
|
6
4
|
require 'statsample'
|
7
|
-
n=
|
8
|
-
|
9
|
-
|
5
|
+
n=3000
|
6
|
+
rng=Distribution::Normal.rng_ugaussian
|
7
|
+
a=n.times.map {|i| rng.call()*20}.to_scale
|
8
|
+
hg=Statsample::Graph::Histogram.new(a, :bins=>20, :line_normal_distribution=>true )
|
10
9
|
|
11
10
|
rb=ReportBuilder.new
|
12
|
-
rb.add(a.histogram)
|
11
|
+
#rb.add(a.histogram)
|
13
12
|
rb.add(hg)
|
14
|
-
|
13
|
+
rb.save_html('histogram.html')
|
data/lib/statsample.rb
CHANGED
@@ -118,7 +118,7 @@ module Statsample
|
|
118
118
|
@@has_gsl
|
119
119
|
end
|
120
120
|
|
121
|
-
VERSION = '0.
|
121
|
+
VERSION = '0.18.0'
|
122
122
|
SPLIT_TOKEN = ","
|
123
123
|
autoload(:Database, 'statsample/converters')
|
124
124
|
autoload(:Anova, 'statsample/anova')
|
@@ -157,6 +157,30 @@ module Statsample
|
|
157
157
|
false
|
158
158
|
end
|
159
159
|
end
|
160
|
+
# Import an Excel file. Cache result by default
|
161
|
+
def load_excel(filename, opts=Hash.new, cache=true)
|
162
|
+
file_ds=filename+".ds"
|
163
|
+
if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
|
164
|
+
ds=Statsample.load(file_ds)
|
165
|
+
else
|
166
|
+
ds=Statsample::Excel.read(filename)
|
167
|
+
ds.save(file_ds) if cache
|
168
|
+
end
|
169
|
+
ds
|
170
|
+
end
|
171
|
+
|
172
|
+
# Import a CSV file. Cache result by default
|
173
|
+
def load_csv(filename, opts=Hash.new, cache=true)
|
174
|
+
file_ds=filename+".ds"
|
175
|
+
if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
|
176
|
+
ds=Statsample.load(file_ds)
|
177
|
+
else
|
178
|
+
ds=Statsample::CSV.read(filename,opts)
|
179
|
+
ds.save(file_ds) if cache
|
180
|
+
end
|
181
|
+
ds
|
182
|
+
end
|
183
|
+
|
160
184
|
|
161
185
|
# Create a matrix using vectors as columns.
|
162
186
|
# Use:
|
data/lib/statsample/bivariate.rb
CHANGED
@@ -58,13 +58,6 @@ module Statsample
|
|
58
58
|
# Calculate sum of squares
|
59
59
|
ss=sum_of_squares(v1a,v2a)
|
60
60
|
ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
|
61
|
-
=begin
|
62
|
-
v1s,v2s=v1a.vector_standarized,v2a.vector_standarized
|
63
|
-
t=0
|
64
|
-
siz=v1s.size
|
65
|
-
(0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
|
66
|
-
t.quo(v2s.size-1)
|
67
|
-
=end
|
68
61
|
end
|
69
62
|
alias :correlation :pearson
|
70
63
|
# Retrieves the value for t test for a pearson correlation
|
@@ -17,6 +17,7 @@ module Statsample
|
|
17
17
|
fields=[]
|
18
18
|
sth.column_info.each {|c|
|
19
19
|
vectors[c['name']]=Statsample::Vector.new([])
|
20
|
+
vectors[c['name']].name=c['name']
|
20
21
|
vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
|
21
22
|
fields.push(c['name'])
|
22
23
|
}
|
@@ -35,7 +36,7 @@ module Statsample
|
|
35
36
|
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
36
37
|
# Statsample::Database.insert(ds,dbh,"test")
|
37
38
|
#
|
38
|
-
def insert(ds, dbh,table)
|
39
|
+
def insert(ds, dbh, table)
|
39
40
|
require 'dbi'
|
40
41
|
query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
|
41
42
|
sth=dbh.prepare(query)
|
@@ -235,6 +236,7 @@ module Statsample
|
|
235
236
|
fields.each {|f|
|
236
237
|
ds[f].name=f
|
237
238
|
}
|
239
|
+
ds.name=filename
|
238
240
|
ds
|
239
241
|
end
|
240
242
|
end
|
data/lib/statsample/dataset.rb
CHANGED
@@ -25,7 +25,7 @@ module Statsample
|
|
25
25
|
end
|
26
26
|
def to_s
|
27
27
|
m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
|
28
|
-
m+="\nRow
|
28
|
+
m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
|
29
29
|
m
|
30
30
|
end
|
31
31
|
end
|
@@ -140,7 +140,6 @@ module Statsample
|
|
140
140
|
end
|
141
141
|
@i=nil
|
142
142
|
end
|
143
|
-
|
144
143
|
#
|
145
144
|
# Returns a GSL::matrix
|
146
145
|
#
|
@@ -239,6 +238,7 @@ module Statsample
|
|
239
238
|
ds[f]=@vectors[f]
|
240
239
|
}
|
241
240
|
ds.fields=fields_to_include
|
241
|
+
ds.name=@name
|
242
242
|
ds.update_valid_data
|
243
243
|
ds
|
244
244
|
end
|
@@ -419,13 +419,15 @@ module Statsample
|
|
419
419
|
# if fields parameter is empty, sum all fields
|
420
420
|
def vector_sum(fields=nil)
|
421
421
|
fields||=@fields
|
422
|
-
collect_with_index do |row, i|
|
422
|
+
vector=collect_with_index do |row, i|
|
423
423
|
if(fields.find{|f| !@vectors[f].data_with_nils[i]})
|
424
424
|
nil
|
425
425
|
else
|
426
426
|
fields.inject(0) {|ac,v| ac + row[v].to_f}
|
427
427
|
end
|
428
428
|
end
|
429
|
+
vector.name=_("Sum from %s") % @name
|
430
|
+
vector
|
429
431
|
end
|
430
432
|
# Check if #fields attribute is correct, after inserting or deleting vectors
|
431
433
|
def check_fields(fields)
|
@@ -476,7 +478,9 @@ module Statsample
|
|
476
478
|
a.push(sum.quo(size-invalids))
|
477
479
|
end
|
478
480
|
end
|
479
|
-
a.to_vector(:scale)
|
481
|
+
a=a.to_vector(:scale)
|
482
|
+
a.name=_("Means from %s") % @name
|
483
|
+
a
|
480
484
|
end
|
481
485
|
# Check vectors for type and size.
|
482
486
|
def check_length # :nodoc:
|
@@ -598,8 +602,9 @@ module Statsample
|
|
598
602
|
def[](i)
|
599
603
|
if i.is_a? Range
|
600
604
|
fields=from_to(i.begin,i.end)
|
601
|
-
|
602
|
-
|
605
|
+
clone(*fields)
|
606
|
+
elsif i.is_a? Array
|
607
|
+
clone(i)
|
603
608
|
else
|
604
609
|
raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
|
605
610
|
@vectors[i]
|
@@ -661,16 +666,36 @@ module Statsample
|
|
661
666
|
GSL::Matrix.alloc(*rows)
|
662
667
|
end
|
663
668
|
end
|
664
|
-
|
665
|
-
|
669
|
+
|
670
|
+
# Return a correlation matrix for fields included as parameters.
|
671
|
+
# By default, uses all fields of dataset
|
672
|
+
def correlation_matrix(fields=nil)
|
673
|
+
if fields
|
674
|
+
ds=clone(fields)
|
675
|
+
else
|
676
|
+
ds=self
|
677
|
+
end
|
678
|
+
Statsample::Bivariate.correlation_matrix(ds)
|
679
|
+
end
|
680
|
+
# Return a covariance matrix for fields included as parameters.
|
681
|
+
# By default, uses all fields of dataset
|
682
|
+
def covariance_matrix(fields=nil)
|
683
|
+
if fields
|
684
|
+
ds=clone(fields)
|
685
|
+
else
|
686
|
+
ds=self
|
687
|
+
end
|
688
|
+
Statsample::Bivariate.covariance_matrix(ds)
|
689
|
+
end
|
666
690
|
|
667
691
|
# Create a new dataset with all cases which the block returns true
|
668
692
|
def filter
|
669
693
|
ds=self.dup_empty
|
670
694
|
each {|c|
|
671
|
-
ds.add_case(c,false) if yield c
|
695
|
+
ds.add_case(c, false) if yield c
|
672
696
|
}
|
673
697
|
ds.update_valid_data
|
698
|
+
ds.name=_("%s(filtered)") % @name
|
674
699
|
ds
|
675
700
|
end
|
676
701
|
|
@@ -712,6 +737,8 @@ module Statsample
|
|
712
737
|
# puts "Vector #{k1}:"+v1.to_s
|
713
738
|
v1.type=@vectors[k1].type
|
714
739
|
v1.name=@vectors[k1].name
|
740
|
+
v1.labels=@vectors[k1].labels
|
741
|
+
|
715
742
|
}
|
716
743
|
}
|
717
744
|
ms
|
@@ -737,9 +764,16 @@ module Statsample
|
|
737
764
|
|
738
765
|
ms.datasets.each do |k,ds|
|
739
766
|
ds.update_valid_data
|
767
|
+
ds.name=fields.size.times.map {|i|
|
768
|
+
f=fields[i]
|
769
|
+
sk=k[i]
|
770
|
+
@vectors[f].labeling(sk)
|
771
|
+
}.join("-")
|
740
772
|
ds.vectors.each{|k1,v1|
|
741
773
|
v1.type=@vectors[k1].type
|
742
774
|
v1.name=@vectors[k1].name
|
775
|
+
v1.labels=@vectors[k1].labels
|
776
|
+
|
743
777
|
}
|
744
778
|
end
|
745
779
|
ms
|
@@ -805,7 +839,7 @@ module Statsample
|
|
805
839
|
vr
|
806
840
|
end
|
807
841
|
def to_s
|
808
|
-
"#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
|
842
|
+
"#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
|
809
843
|
end
|
810
844
|
def inspect
|
811
845
|
self.to_s
|
data/lib/statsample/factor.rb
CHANGED
@@ -13,7 +13,18 @@ module Statsample
|
|
13
13
|
# * Statsample::Factor::Varimax
|
14
14
|
# * Statsample::Factor::Equimax
|
15
15
|
# * Statsample::Factor::Quartimax
|
16
|
-
#
|
16
|
+
# * Classes for determining the number of components
|
17
|
+
# * Statsample::Factor::MAP
|
18
|
+
# * Statsample::Factor::ParallelAnalysis
|
19
|
+
#
|
20
|
+
# About number of components, O'Connor(2000) said:
|
21
|
+
# The two procedures [PA and MAP ] complement each other nicely,
|
22
|
+
# in that the MAP tends to err (when it does err) in the direction
|
23
|
+
# of underextraction, whereas parallel analysis tends to err
|
24
|
+
# (when it does err) in the direction of overextraction.
|
25
|
+
# Optimal decisions are thus likely to be made after considering
|
26
|
+
# the results of both analytic procedures. (p.10)
|
27
|
+
|
17
28
|
module Factor
|
18
29
|
# Anti-image covariance matrix.
|
19
30
|
# Useful for inspection of desirability of data for factor analysis.
|
@@ -48,6 +48,9 @@ module Statsample
|
|
48
48
|
attr_reader :fm
|
49
49
|
# Smallest average squared correlation
|
50
50
|
attr_reader :minfm
|
51
|
+
def self.with_dataset(ds,opts=Hash.new)
|
52
|
+
new(ds.correlation_matrix,opts)
|
53
|
+
end
|
51
54
|
def initialize(matrix, opts=Hash.new)
|
52
55
|
@matrix=matrix
|
53
56
|
opts_default={
|
@@ -76,10 +79,15 @@ module Statsample
|
|
76
79
|
end
|
77
80
|
minfm=fm[0]
|
78
81
|
nfactors=0
|
82
|
+
@errors=[]
|
79
83
|
fm.each_with_index do |v,s|
|
80
|
-
if v
|
81
|
-
|
82
|
-
|
84
|
+
if v.is_a? Complex
|
85
|
+
@errors.push(s)
|
86
|
+
else
|
87
|
+
if v < minfm
|
88
|
+
minfm=v
|
89
|
+
nfactors=s
|
90
|
+
end
|
83
91
|
end
|
84
92
|
end
|
85
93
|
@number_of_factors=nfactors
|
@@ -89,13 +97,13 @@ module Statsample
|
|
89
97
|
def report_building(g) #:nodoc:
|
90
98
|
g.section(:name=>@name) do |s|
|
91
99
|
s.table(:name=>_("Eigenvalues"),:header=>[_("Value")]) do |t|
|
92
|
-
eigenvalues.
|
93
|
-
|
100
|
+
eigenvalues.each_with_index do |e,i|
|
101
|
+
t.row([@errors.include?(i) ? "*" : "%0.6f" % e])
|
94
102
|
end
|
95
103
|
end
|
96
104
|
s.table(:name=>_("Velicer's Average Squared Correlations"), :header=>[_("number of components"),_("average square correlation")]) do |t|
|
97
105
|
fm.each_with_index do |v,i|
|
98
|
-
t.row(["%d" % i, "%0.6f" % v])
|
106
|
+
t.row(["%d" % i, @errors.include?(i) ? "*" : "%0.6f" % v])
|
99
107
|
end
|
100
108
|
end
|
101
109
|
s.text(_("The smallest average squared correlation is : %0.6f" % minfm))
|
@@ -39,15 +39,12 @@ module Statsample
|
|
39
39
|
attr_reader :ds
|
40
40
|
# Bootstrap method. <tt>:random</tt> used by default
|
41
41
|
# * <tt>:random</tt>: uses number of variables and cases for the dataset
|
42
|
-
# * <tt>:data</tt> : sample with replacement from actual data.
|
43
|
-
|
42
|
+
# * <tt>:data</tt> : sample with replacement from actual data.
|
44
43
|
attr_accessor :bootstrap_method
|
45
44
|
# Uses smc on diagonal of matrices, to perform simulation
|
46
45
|
# of a Principal Axis analysis.
|
47
46
|
# By default, false.
|
48
|
-
|
49
47
|
attr_accessor :smc
|
50
|
-
|
51
48
|
# Percentil over bootstrap eigenvalue should be accepted. 95 by default
|
52
49
|
attr_accessor :percentil
|
53
50
|
# Correlation matrix used with :raw_data . <tt>:correlation_matrix</tt> used by default
|
@@ -1,8 +1,14 @@
|
|
1
|
+
# encoding: UTF-8
|
1
2
|
module Statsample
|
2
3
|
module Factor
|
3
|
-
# Principal Component Analysis (PCA) of a
|
4
|
-
#
|
4
|
+
# Principal Component Analysis (PCA) of a covariance or
|
5
|
+
# correlation matrix.
|
5
6
|
#
|
7
|
+
# NOTE: Sign of second and later eigenvectors could be different
|
8
|
+
# using Ruby or GSL, so values for PCs and component matrix
|
9
|
+
# should differ, because extendmatrix and gsl's methods to calculate
|
10
|
+
# eigenvectors are different. Using R is worse, cause first
|
11
|
+
# eigenvector could have negative values!
|
6
12
|
# For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
|
7
13
|
#
|
8
14
|
# == Usage:
|
@@ -26,6 +32,7 @@ module Factor
|
|
26
32
|
# == References:
|
27
33
|
# * SPSS Manual
|
28
34
|
# * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
|
35
|
+
# * Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer
|
29
36
|
#
|
30
37
|
class PCA
|
31
38
|
include Summarizable
|
@@ -43,12 +50,16 @@ module Factor
|
|
43
50
|
attr_accessor :summary_parallel_analysis
|
44
51
|
# Type of rotation. By default, Statsample::Factor::Rotation::Varimax
|
45
52
|
attr_accessor :rotation_type
|
46
|
-
|
53
|
+
attr_accessor :type
|
47
54
|
def initialize(matrix, opts=Hash.new)
|
48
55
|
@use_gsl=nil
|
49
56
|
@name=_("Principal Component Analysis")
|
50
57
|
@matrix=matrix
|
51
|
-
@n_variables=@matrix.column_size
|
58
|
+
@n_variables=@matrix.column_size
|
59
|
+
@variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)}
|
60
|
+
|
61
|
+
@type = @matrix.respond_to?(:type) ? @matrix.type : :correlation
|
62
|
+
|
52
63
|
@m=nil
|
53
64
|
|
54
65
|
@rotation_type=Statsample::Factor::Varimax
|
@@ -65,15 +76,19 @@ module Factor
|
|
65
76
|
@variables_names=@n_variables.times.map {|i| "V#{i+1}"}
|
66
77
|
end
|
67
78
|
calculate_eigenpairs
|
79
|
+
|
68
80
|
if @m.nil?
|
69
81
|
# Set number of factors with eigenvalues > 1
|
70
82
|
@m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size
|
71
83
|
end
|
72
|
-
|
84
|
+
|
73
85
|
end
|
74
86
|
def rotation
|
75
87
|
@rotation_type.new(component_matrix)
|
76
88
|
end
|
89
|
+
def total_eigenvalues
|
90
|
+
eigenvalues.inject(0) {|ac,v| ac+v}
|
91
|
+
end
|
77
92
|
def create_centered_ds
|
78
93
|
h={}
|
79
94
|
@original_ds.factors.each {|f|
|
@@ -83,8 +98,10 @@ module Factor
|
|
83
98
|
@ds=h.to_dataset
|
84
99
|
end
|
85
100
|
|
86
|
-
# Feature
|
87
|
-
|
101
|
+
# Feature matrix for +m+ factors
|
102
|
+
# Returns +m+ eigenvectors as columns.
|
103
|
+
# So, i=variable, j=component
|
104
|
+
def feature_matrix(m=nil)
|
88
105
|
m||=@m
|
89
106
|
omega_m=::Matrix.build(@n_variables, m) {0}
|
90
107
|
m.times do |i|
|
@@ -92,15 +109,48 @@ module Factor
|
|
92
109
|
end
|
93
110
|
omega_m
|
94
111
|
end
|
95
|
-
#
|
96
|
-
|
112
|
+
# Returns Principal Components for +input+ matrix or dataset
|
113
|
+
# The number of PC to return is equal to parameter +m+.
|
114
|
+
# If +m+ isn't set, m set to number of PCs selected at object creation.
|
115
|
+
def principal_components(input, m=nil)
|
116
|
+
data_matrix=input.to_matrix
|
117
|
+
var_names=(data_matrix.respond_to? :fields_y) ? data_matrix.fields_y : data_matrix.column_size.times.map {|i| "VAR_%d" % (i+1)}
|
97
118
|
m||=@m
|
98
|
-
|
99
|
-
|
100
|
-
|
119
|
+
|
120
|
+
raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables
|
121
|
+
|
122
|
+
fv=feature_matrix(m)
|
123
|
+
pcs=(fv.transpose*data_matrix.transpose).transpose
|
124
|
+
pcs.extend Statsample::NamedMatrix
|
125
|
+
pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)}
|
126
|
+
pcs.to_dataset
|
101
127
|
end
|
102
|
-
# Component matrix for m factors
|
103
128
|
def component_matrix(m=nil)
|
129
|
+
var="component_matrix_#{type}"
|
130
|
+
send(var,m)
|
131
|
+
end
|
132
|
+
# Matrix with correlations between components and
|
133
|
+
# variables. Based on Härdle & Simar (2003, p.243)
|
134
|
+
def component_matrix_covariance(m=nil)
|
135
|
+
m||=@m
|
136
|
+
raise "m should be > 0" if m<1
|
137
|
+
ff=feature_matrix(m)
|
138
|
+
cm=::Matrix.build(@n_variables, m) {0}
|
139
|
+
@n_variables.times {|i|
|
140
|
+
m.times {|j|
|
141
|
+
cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i])
|
142
|
+
}
|
143
|
+
}
|
144
|
+
cm.extend CovariateMatrix
|
145
|
+
cm.name=_("Component matrix (from covariance)")
|
146
|
+
cm.fields_x = @variables_names
|
147
|
+
cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
|
148
|
+
|
149
|
+
cm
|
150
|
+
end
|
151
|
+
# Matrix with correlations between components and
|
152
|
+
# variables
|
153
|
+
def component_matrix_correlation(m=nil)
|
104
154
|
m||=@m
|
105
155
|
raise "m should be > 0" if m<1
|
106
156
|
omega_m=::Matrix.build(@n_variables, m) {0}
|
@@ -115,17 +165,17 @@ module Factor
|
|
115
165
|
cm.extend CovariateMatrix
|
116
166
|
cm.name=_("Component matrix")
|
117
167
|
cm.fields_x = @variables_names
|
118
|
-
cm.fields_y = m.times.map {|i| "
|
168
|
+
cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
|
119
169
|
cm
|
120
170
|
end
|
121
|
-
# Communalities for all variables given m factors
|
122
171
|
def communalities(m=nil)
|
172
|
+
|
123
173
|
m||=@m
|
124
174
|
h=[]
|
125
175
|
@n_variables.times do |i|
|
126
176
|
sum=0
|
127
177
|
m.times do |j|
|
128
|
-
sum
|
178
|
+
sum+=(@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2)
|
129
179
|
end
|
130
180
|
h.push(sum)
|
131
181
|
end
|
@@ -135,7 +185,11 @@ module Factor
|
|
135
185
|
def eigenvalues
|
136
186
|
@eigenpairs.collect {|c| c[0] }
|
137
187
|
end
|
138
|
-
|
188
|
+
def eigenvectors
|
189
|
+
@eigenpairs.collect {|c|
|
190
|
+
c[1].to_matrix
|
191
|
+
}
|
192
|
+
end
|
139
193
|
def calculate_eigenpairs
|
140
194
|
if @use_gsl
|
141
195
|
calculate_eigenpairs_gsl
|
@@ -144,14 +198,18 @@ module Factor
|
|
144
198
|
end
|
145
199
|
end
|
146
200
|
|
147
|
-
def calculate_eigenpairs_ruby
|
201
|
+
def calculate_eigenpairs_ruby #:nodoc:
|
148
202
|
@eigenpairs = @matrix.eigenpairs_ruby
|
149
203
|
end
|
150
|
-
|
204
|
+
# Eigenvectors calculated with gsl
|
205
|
+
# Note: The signs of some vectors could be different from
|
206
|
+
# ruby generated
|
207
|
+
def calculate_eigenpairs_gsl #:nodoc:
|
151
208
|
eigval, eigvec= GSL::Eigen.symmv(@matrix.to_gsl)
|
152
|
-
|
209
|
+
#puts "***"
|
153
210
|
ep=eigval.size.times.map {|i|
|
154
|
-
|
211
|
+
ev=eigvec.get_col(i)
|
212
|
+
[eigval[i], ev]
|
155
213
|
}
|
156
214
|
@eigenpairs=ep.sort{|a,b| a[0]<=>b[0]}.reverse
|
157
215
|
end
|
@@ -159,20 +217,23 @@ module Factor
|
|
159
217
|
def report_building(builder) # :nodoc:
|
160
218
|
builder.section(:name=>@name) do |generator|
|
161
219
|
generator.text _("Number of factors: %d") % m
|
162
|
-
generator.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction")]) do |t|
|
220
|
+
generator.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction"), _("%")]) do |t|
|
163
221
|
communalities(m).each_with_index {|com, i|
|
164
|
-
|
222
|
+
perc=com*100.quo(@matrix[i,i])
|
223
|
+
t.row([@variables_names[i], "%0.3f" % @matrix[i,i] , "%0.3f" % com, "%0.3f" % perc])
|
165
224
|
}
|
166
225
|
end
|
167
|
-
|
226
|
+
te=total_eigenvalues
|
168
227
|
generator.table(:name=>_("Total Variance Explained"), :header=>[_("Component"), _("E.Total"), _("%"), _("Cum. %")]) do |t|
|
169
228
|
ac_eigen=0
|
170
229
|
eigenvalues.each_with_index {|eigenvalue,i|
|
171
230
|
ac_eigen+=eigenvalue
|
172
|
-
t.row([_("Component %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(
|
231
|
+
t.row([_("Component %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(te)), sprintf("%0.3f",ac_eigen*100.quo(te))])
|
173
232
|
}
|
174
233
|
end
|
234
|
+
|
175
235
|
generator.parse_element(component_matrix(m))
|
236
|
+
|
176
237
|
if (summary_rotation)
|
177
238
|
generator.parse_element(rotation)
|
178
239
|
end
|