statsample 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +27 -0
- data/Manifest.txt +1 -0
- data/Rakefile +2 -3
- data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
- data/examples/histogram.rb +6 -7
- data/lib/statsample.rb +25 -1
- data/lib/statsample/bivariate.rb +0 -7
- data/lib/statsample/converters.rb +3 -1
- data/lib/statsample/dataset.rb +44 -10
- data/lib/statsample/factor.rb +12 -1
- data/lib/statsample/factor/map.rb +14 -6
- data/lib/statsample/factor/parallelanalysis.rb +1 -4
- data/lib/statsample/factor/pca.rb +86 -25
- data/lib/statsample/graph.rb +4 -0
- data/lib/statsample/graph/boxplot.rb +39 -28
- data/lib/statsample/graph/histogram.rb +78 -14
- data/lib/statsample/graph/scatterplot.rb +61 -11
- data/lib/statsample/histogram.rb +37 -1
- data/lib/statsample/matrix.rb +74 -31
- data/lib/statsample/multiset.rb +36 -0
- data/lib/statsample/reliability/multiscaleanalysis.rb +24 -5
- data/lib/statsample/reliability/scaleanalysis.rb +9 -5
- data/lib/statsample/reliability/skillscaleanalysis.rb +20 -4
- data/lib/statsample/vector.rb +65 -49
- data/po/es/statsample.mo +0 -0
- data/po/es/statsample.po +255 -108
- data/po/statsample.pot +245 -98
- data/test/fixtures/bank2.dat +200 -0
- data/test/helpers_tests.rb +22 -3
- data/test/test_factor.rb +115 -17
- data/test/test_histogram.rb +25 -1
- data/test/test_matrix.rb +17 -1
- data/test/test_multiset.rb +66 -2
- data/test/test_vector.rb +21 -2
- metadata +15 -32
- metadata.gz.sig +0 -0
data.tar.gz.sig
CHANGED
Binary file
data/History.txt
CHANGED
@@ -1,3 +1,30 @@
+=== 0.18.0 / 2011-01-07
+* New Statsample.load_excel
+* New Statsample.load_csv
+* Statsample::Dataset#[] accepts an array of fields and uses clone
+* New Dataset#correlation_matrix and Statsample::Dataset#covariance_matrix
+* Statsample::Dataset.filter add labels to vectors
+* Principal Components generation complete on PCA (covariance matrix prefered)
+* Added note on Statsample::Factor::PCA about erratic signs on eigenvalues,
+* Statsample::Factor::PCA.component_matrix calculated different for covariance matrix
+* Improved summary for PCA using covariance matrix
+* New attribute :label_angle for Statsample::Graph::Boxplot
+* Fixed Scatterplots scaling problems
+* New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x,
+* New Statsample::Multiset#union allows to create a new dataset based on a m
+* New Statsample::Multiset#each to traverse through datasets
+* Bug fix: Vector#standarized and Vector#percentile crash on nil data
+* Bug fix: Vector#mean and Vector#sd crash on data without valid values
+* Modified methods names on Statsample::Factor::PCA : feature_vector to feature_matrix, data_transformation to principal_components
+* Added Statsample::Vector.vector_centered
+* Factor::MAP.with_dataset() implemented
+* Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes * Added documentation for Graph::Histogram
+* Added MPA to Reliability::MultiScaleAnalysis
+* Added custom names for returned vectors and datasets
+* Updated spanish traslation
+* Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing
+* Updated Histogram class, with several new methods compatibles with GSL::Histogram
+
 === 0.17.0 / 2010-12-09
 * Added Statsample::Graph::Histogram and Statsample::Graph::Boxplot
 * Added Statsample::Reliability::SkillScaleAnalysis for analysis of skill based scales.
data/Manifest.txt
CHANGED
data/Rakefile
CHANGED
@@ -5,7 +5,6 @@ $:.unshift(File.dirname(__FILE__)+'/lib/')
 
 require 'rubygems'
 require 'statsample'
-
 require 'hoe'
 Hoe.plugin :git
 
@@ -41,9 +40,9 @@ h=Hoe.spec('statsample') do
   #self.testlib=:minitest
   self.rubyforge_name = "ruby-statsample"
   self.developer('Claudio Bustos', 'clbustos@gmail.com')
-  self.extra_deps << ["spreadsheet","~>0.6.
+  self.extra_deps << ["spreadsheet","~>0.6.5"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client", "~>0.2.5"] << ["rubyvis", "~>0.4.0"]
 
-  self.extra_dev_deps << ["shoulda"] << ["minitest", "~>2.0"]
+  self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>0"] << ["minitest", "~>2.0"]
   self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
   self.post_install_message = <<-EOF
 ***************************************************

data/data/locale/es/LC_MESSAGES/statsample.mo
CHANGED
Binary file
data/examples/histogram.rb
CHANGED
@@ -1,14 +1,13 @@
 #!/usr/bin/ruby
 $:.unshift(File.dirname(__FILE__)+'/../lib/')
 $:.unshift('/home/cdx/dev/reportbuilder/lib/')
-
-require 'benchmark'
 require 'statsample'
-n=
-
-
+n=3000
+rng=Distribution::Normal.rng_ugaussian
+a=n.times.map {|i| rng.call()*20}.to_scale
+hg=Statsample::Graph::Histogram.new(a, :bins=>20, :line_normal_distribution=>true )
 
 rb=ReportBuilder.new
-rb.add(a.histogram)
+#rb.add(a.histogram)
 rb.add(hg)
-
+rb.save_html('histogram.html')
data/lib/statsample.rb
CHANGED
@@ -118,7 +118,7 @@ module Statsample
     @@has_gsl
   end
 
-  VERSION = '0.
+  VERSION = '0.18.0'
   SPLIT_TOKEN = ","
   autoload(:Database, 'statsample/converters')
   autoload(:Anova, 'statsample/anova')
@@ -157,6 +157,30 @@ module Statsample
       false
     end
   end
+  # Import an Excel file. Cache result by default
+  def load_excel(filename, opts=Hash.new, cache=true)
+    file_ds=filename+".ds"
+    if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
+      ds=Statsample.load(file_ds)
+    else
+      ds=Statsample::Excel.read(filename)
+      ds.save(file_ds) if cache
+    end
+    ds
+  end
+
+  # Import an Excel file. Cache result by default
+  def load_csv(filename, opts=Hash.new, cache=true)
+    file_ds=filename+".ds"
+    if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
+      ds=Statsample.load(file_ds)
+    else
+      ds=Statsample::CSV.read(filename,opts)
+      ds.save(file_ds) if cache
+    end
+    ds
+  end
+
 
   # Create a matrix using vectors as columns.
   # Use:
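Both loaders added in this hunk share the same caching rule: the parsed dataset is saved next to the source file as <filename>.ds and reused as long as that cache file is newer than the source. A minimal usage sketch (the file names are made up for illustration):

  require 'statsample'

  # First call parses data.csv and writes data.csv.ds;
  # later calls reuse the cached .ds until data.csv changes again.
  ds  = Statsample.load_csv('data.csv')
  xls = Statsample.load_excel('results.xls')

  # Pass cache=false to force a fresh parse on every call.
  fresh = Statsample.load_csv('data.csv', Hash.new, false)
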
data/lib/statsample/bivariate.rb
CHANGED
@@ -58,13 +58,6 @@ module Statsample
       # Calculate sum of squares
       ss=sum_of_squares(v1a,v2a)
       ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
-=begin
-      v1s,v2s=v1a.vector_standarized,v2a.vector_standarized
-      t=0
-      siz=v1s.size
-      (0...v1s.size).each {|i| t+=(v1s[i]*v2s[i]) }
-      t.quo(v2s.size-1)
-=end
     end
     alias :correlation :pearson
     # Retrieves the value for t test for a pearson correlation

data/lib/statsample/converters.rb
CHANGED
@@ -17,6 +17,7 @@ module Statsample
       fields=[]
       sth.column_info.each {|c|
         vectors[c['name']]=Statsample::Vector.new([])
+        vectors[c['name']].name=c['name']
         vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
         fields.push(c['name'])
       }
@@ -35,7 +36,7 @@ module Statsample
       # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
       # Statsample::Database.insert(ds,dbh,"test")
       #
-      def insert(ds, dbh,table)
+      def insert(ds, dbh, table)
        require 'dbi'
        query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
        sth=dbh.prepare(query)
@@ -235,6 +236,7 @@ module Statsample
       fields.each {|f|
         ds[f].name=f
       }
+      ds.name=filename
       ds
     end
   end
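The usage comment preserved in the insert hunk above already shows the intended call sequence; a slightly expanded sketch follows (the connection string, credentials and the table name "test" are placeholders, and the table must already exist with columns matching the dataset fields):

  require 'statsample'
  require 'dbi'

  ds = { 'id'   => [1, 2, 3].to_vector(:scale),
         'name' => %w{a b c}.to_vector(:nominal) }.to_dataset
  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
  # One INSERT per case; the dataset fields become the column list.
  Statsample::Database.insert(ds, dbh, "test")
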
data/lib/statsample/dataset.rb
CHANGED
@@ -25,7 +25,7 @@ module Statsample
     end
     def to_s
       m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
-      m+="\nRow
+      m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
       m
     end
   end
@@ -140,7 +140,6 @@ module Statsample
       end
       @i=nil
     end
-
     #
     # Returns a GSL::matrix
     #
@@ -239,6 +238,7 @@ module Statsample
        ds[f]=@vectors[f]
      }
      ds.fields=fields_to_include
+     ds.name=@name
      ds.update_valid_data
      ds
    end
@@ -419,13 +419,15 @@ module Statsample
    # if fields parameter is empty, sum all fields
    def vector_sum(fields=nil)
      fields||=@fields
-     collect_with_index do |row, i|
+     vector=collect_with_index do |row, i|
        if(fields.find{|f| !@vectors[f].data_with_nils[i]})
          nil
        else
          fields.inject(0) {|ac,v| ac + row[v].to_f}
        end
      end
+     vector.name=_("Sum from %s") % @name
+     vector
    end
    # Check if #fields attribute is correct, after inserting or deleting vectors
    def check_fields(fields)
@@ -476,7 +478,9 @@ module Statsample
          a.push(sum.quo(size-invalids))
        end
      end
-     a.to_vector(:scale)
+     a=a.to_vector(:scale)
+     a.name=_("Means from %s") % @name
+     a
    end
    # Check vectors for type and size.
    def check_length # :nodoc:
@@ -598,8 +602,9 @@ module Statsample
    def[](i)
      if i.is_a? Range
        fields=from_to(i.begin,i.end)
-
-
+       clone(*fields)
+     elsif i.is_a? Array
+       clone(i)
      else
        raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
        @vectors[i]
@@ -661,16 +666,36 @@ module Statsample
        GSL::Matrix.alloc(*rows)
      end
    end
-
-
+
+   # Return a correlation matrix for fields included as parameters.
+   # By default, uses all fields of dataset
+   def correlation_matrix(fields=nil)
+     if fields
+       ds=clone(fields)
+     else
+       ds=self
+     end
+     Statsample::Bivariate.correlation_matrix(ds)
+   end
+   # Return a correlation matrix for fields included as parameters.
+   # By default, uses all fields of dataset
+   def covariance_matrix(fields=nil)
+     if fields
+       ds=clone(fields)
+     else
+       ds=self
+     end
+     Statsample::Bivariate.covariance_matrix(ds)
+   end
 
    # Create a new dataset with all cases which the block returns true
    def filter
      ds=self.dup_empty
      each {|c|
-       ds.add_case(c,false) if yield c
+       ds.add_case(c, false) if yield c
      }
      ds.update_valid_data
+     ds.name=_("%s(filtered)") % @name
      ds
    end
 
@@ -712,6 +737,8 @@ module Statsample
        # puts "Vector #{k1}:"+v1.to_s
        v1.type=@vectors[k1].type
        v1.name=@vectors[k1].name
+       v1.labels=@vectors[k1].labels
+
      }
    }
    ms
@@ -737,9 +764,16 @@ module Statsample
 
    ms.datasets.each do |k,ds|
      ds.update_valid_data
+     ds.name=fields.size.times.map {|i|
+       f=fields[i]
+       sk=k[i]
+       @vectors[f].labeling(sk)
+     }.join("-")
      ds.vectors.each{|k1,v1|
        v1.type=@vectors[k1].type
        v1.name=@vectors[k1].name
+       v1.labels=@vectors[k1].labels
+
      }
    end
    ms
@@ -805,7 +839,7 @@ module Statsample
      vr
    end
    def to_s
-     "#<"+self.class.to_s+":"+self.object_id.to_s+" @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
+     "#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
    end
    def inspect
      self.to_s
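Taken together, the Array form of Dataset#[] and the matrix helpers defined above make subsetting and correlation one-liners. A short sketch with made-up scale vectors ('a', 'b', 'c' are illustrative field names only):

  require 'statsample'

  ds = { 'a' => [1, 2, 3, 4, 5].to_scale,
         'b' => [2, 4, 5, 8, 9].to_scale,
         'c' => [3, 1, 4, 1, 5].to_scale }.to_dataset

  sub = ds[%w{a b}]                    # Array argument: clones only fields a and b
  puts ds.correlation_matrix.to_s      # Pearson correlations over every field
  puts sub.covariance_matrix.to_s      # covariances restricted to the cloned subset
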
data/lib/statsample/factor.rb
CHANGED
@@ -13,7 +13,18 @@ module Statsample
 # * Statsample::Factor::Varimax
 # * Statsample::Factor::Equimax
 # * Statsample::Factor::Quartimax
-#
+# * Classes for determining the number of components
+# * Statsample::Factor::MAP
+# * Statsample::Factor::ParallelAnalysis
+#
+# About number of components, O'Connor(2000) said:
+# The two procedures [PA and MAP ] complement each other nicely,
+# in that the MAP tends to err (when it does err) in the direction
+# of underextraction, whereas parallel analysis tends to err
+# (when it does err) in the direction of overextraction.
+# Optimal decisions are thus likely to be made after considering
+# the results of both analytic procedures. (p.10)
+
 module Factor
   # Anti-image covariance matrix.
   # Useful for inspection of desireability of data for factor analysis.

data/lib/statsample/factor/map.rb
CHANGED
@@ -48,6 +48,9 @@ module Statsample
     attr_reader :fm
     # Smallest average squared correlation
     attr_reader :minfm
+    def self.with_dataset(ds,opts=Hash.new)
+      new(ds.correlation_matrix,opts)
+    end
     def initialize(matrix, opts=Hash.new)
       @matrix=matrix
       opts_default={
@@ -76,10 +79,15 @@ module Statsample
       end
       minfm=fm[0]
       nfactors=0
+      @errors=[]
       fm.each_with_index do |v,s|
-        if v
-
-
+        if v.is_a? Complex
+          @errors.push(s)
+        else
+          if v < minfm
+            minfm=v
+            nfactors=s
+          end
         end
       end
       @number_of_factors=nfactors
@@ -89,13 +97,13 @@
     def report_building(g) #:nodoc:
       g.section(:name=>@name) do |s|
         s.table(:name=>_("Eigenvalues"),:header=>[_("Value")]) do |t|
-          eigenvalues.
-
+          eigenvalues.each_with_index do |e,i|
+            t.row([@errors.include?(i) ? "*" : "%0.6f" % e])
          end
        end
        s.table(:name=>_("Velicer's Average Squared Correlations"), :header=>[_("number of components"),_("average square correlation")]) do |t|
          fm.each_with_index do |v,i|
-           t.row(["%d" % i, "%0.6f" % v])
+           t.row(["%d" % i, @errors.include?(i) ? "*" : "%0.6f" % v])
          end
        end
        s.text(_("The smallest average squared correlation is : %0.6f" % minfm))
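With the new constructor, Velicer's MAP can be run straight from a dataset of scale vectors. A hedged sketch (ds is assumed to come from one of the loaders above, and number_of_factors is the reader backed by the @number_of_factors assignment shown in this hunk):

  require 'statsample'

  ds  = Statsample.load_csv('items.csv')          # hypothetical item data
  map = Statsample::Factor::MAP.with_dataset(ds)  # same as MAP.new(ds.correlation_matrix)
  puts map.number_of_factors                      # components at the smallest average squared correlation
  puts map.summary                                # non-real eigenvalues are flagged with "*"
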
data/lib/statsample/factor/parallelanalysis.rb
CHANGED
@@ -39,15 +39,12 @@
     attr_reader :ds
     # Bootstrap method. <tt>:random</tt> used by default
     # * <tt>:random</tt>: uses number of variables and cases for the dataset
-    # * <tt>:data</tt> : sample with replacement from actual data.
-
+    # * <tt>:data</tt> : sample with replacement from actual data.
     attr_accessor :bootstrap_method
     # Uses smc on diagonal of matrixes, to perform simulation
     # of a Principal Axis analysis.
     # By default, false.
-
     attr_accessor :smc
-
     # Percentil over bootstrap eigenvalue should be accepted. 95 by default
     attr_accessor :percentil
     # Correlation matrix used with :raw_data . <tt>:correlation_matrix</tt> used by default
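Per the O'Connor(2000) note added to factor.rb, MAP and parallel analysis are meant to be read side by side. A sketch of the companion check (only :bootstrap_method, :smc and :percentil appear in this hunk; the constructor form and :iterations are assumptions):

  require 'statsample'

  ds = Statsample.load_csv('items.csv')           # hypothetical item data
  pa = Statsample::Factor::ParallelAnalysis.new(ds,
        :iterations=>500, :bootstrap_method=>:data, :percentil=>95)
  puts pa.summary
  # Compare the number of components suggested here with the MAP result
  # above before fixing m for the PCA below.
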
data/lib/statsample/factor/pca.rb
CHANGED
@@ -1,8 +1,14 @@
+# encoding: UTF-8
 module Statsample
 module Factor
-  # Principal Component Analysis (PCA) of a
-  #
+  # Principal Component Analysis (PCA) of a covariance or
+  # correlation matrix..
   #
+  # NOTE: Sign of second and later eigenvalues could be different
+  # using Ruby or GSL, so values for PCs and component matrix
+  # should differ, because extendmatrix and gsl's methods to calculate
+  # eigenvectors are different. Using R is worse, cause first
+  # eigenvector could have negative values!
   # For Principal Axis Analysis, use Statsample::Factor::PrincipalAxis
   #
   # == Usage:
@@ -26,6 +32,7 @@ module Factor
   # == References:
   # * SPSS Manual
   # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
+  # * Härdle, W. & Simar, L. (2003). Applied Multivariate Statistical Analysis. Springer
   #
   class PCA
     include Summarizable
@@ -43,12 +50,16 @@ module Factor
     attr_accessor :summary_parallel_analysis
     # Type of rotation. By default, Statsample::Factor::Rotation::Varimax
     attr_accessor :rotation_type
-
+    attr_accessor :type
     def initialize(matrix, opts=Hash.new)
       @use_gsl=nil
       @name=_("Principal Component Analysis")
       @matrix=matrix
-      @n_variables=@matrix.column_size
+      @n_variables=@matrix.column_size
+      @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)}
+
+      @type = @matrix.respond_to?(:type) ? @matrix.type : :correlation
+
       @m=nil
 
       @rotation_type=Statsample::Factor::Varimax
@@ -65,15 +76,19 @@ module Factor
         @variables_names=@n_variables.times.map {|i| "V#{i+1}"}
       end
       calculate_eigenpairs
+
       if @m.nil?
         # Set number of factors with eigenvalues > 1
         @m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size
       end
-
+
     end
     def rotation
       @rotation_type.new(component_matrix)
     end
+    def total_eigenvalues
+      eigenvalues.inject(0) {|ac,v| ac+v}
+    end
     def create_centered_ds
       h={}
       @original_ds.factors.each {|f|
@@ -83,8 +98,10 @@ module Factor
       @ds=h.to_dataset
     end
 
-    # Feature
-
+    # Feature matrix for +m+ factors
+    # Returns +m+ eigenvectors as columns.
+    # So, i=variable, j=component
+    def feature_matrix(m=nil)
       m||=@m
       omega_m=::Matrix.build(@n_variables, m) {0}
       m.times do |i|
@@ -92,15 +109,48 @@ module Factor
       end
       omega_m
     end
-    #
-
+    # Returns Principal Components for +input+ matrix or dataset
+    # The number of PC to return is equal to parameter +m+.
+    # If +m+ isn't set, m set to number of PCs selected at object creation.
+    def principal_components(input, m=nil)
+      data_matrix=input.to_matrix
+      var_names=(data_matrix.respond_to? :fields_y) ? data_matrix.fields_y : data_matrix.column_size.times.map {|i| "VAR_%d" % (i+1)}
       m||=@m
-
-
-
+
+      raise "data matrix variables<>pca variables" if data_matrix.column_size!=@n_variables
+
+      fv=feature_matrix(m)
+      pcs=(fv.transpose*data_matrix.transpose).transpose
+      pcs.extend Statsample::NamedMatrix
+      pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)}
+      pcs.to_dataset
     end
-    # Component matrix for m factors
     def component_matrix(m=nil)
+      var="component_matrix_#{type}"
+      send(var,m)
+    end
+    # Matrix with correlations between components and
+    # variables. Based on Härdle & Simar (2003, p.243)
+    def component_matrix_covariance(m=nil)
+      m||=@m
+      raise "m should be > 0" if m<1
+      ff=feature_matrix(m)
+      cm=::Matrix.build(@n_variables, m) {0}
+      @n_variables.times {|i|
+        m.times {|j|
+          cm[i,j]=ff[i,j] * Math.sqrt(eigenvalues[j] / @matrix[i,i])
+        }
+      }
+      cm.extend CovariateMatrix
+      cm.name=_("Component matrix (from covariance)")
+      cm.fields_x = @variables_names
+      cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
+
+      cm
+    end
+    # Matrix with correlations between components and
+    # variables
+    def component_matrix_correlation(m=nil)
       m||=@m
       raise "m should be > 0" if m<1
       omega_m=::Matrix.build(@n_variables, m) {0}
@@ -115,17 +165,17 @@ module Factor
       cm.extend CovariateMatrix
       cm.name=_("Component matrix")
       cm.fields_x = @variables_names
-      cm.fields_y = m.times.map {|i| "
+      cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
       cm
     end
-    # Communalities for all variables given m factors
     def communalities(m=nil)
+
       m||=@m
       h=[]
       @n_variables.times do |i|
        sum=0
        m.times do |j|
-         sum
+         sum+=(@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2)
        end
        h.push(sum)
      end
@@ -135,7 +185,11 @@ module Factor
    def eigenvalues
      @eigenpairs.collect {|c| c[0] }
    end
-
+    def eigenvectors
+      @eigenpairs.collect {|c|
+        c[1].to_matrix
+      }
+    end
    def calculate_eigenpairs
      if @use_gsl
        calculate_eigenpairs_gsl
@@ -144,14 +198,18 @@ module Factor
      end
    end
 
-    def calculate_eigenpairs_ruby
+    def calculate_eigenpairs_ruby #:nodoc:
      @eigenpairs = @matrix.eigenpairs_ruby
    end
-
+    # Eigenvectors calculated with gsl
+    # Note: The signs of some vectors could be different of
+    # ruby generated
+    def calculate_eigenpairs_gsl #:nodoc:
      eigval, eigvec= GSL::Eigen.symmv(@matrix.to_gsl)
-
+      #puts "***"
      ep=eigval.size.times.map {|i|
-
+        ev=eigvec.get_col(i)
+        [eigval[i], ev]
      }
      @eigenpairs=ep.sort{|a,b| a[0]<=>b[0]}.reverse
    end
@@ -159,20 +217,23 @@ module Factor
    def report_building(builder) # :nodoc:
      builder.section(:name=>@name) do |generator|
        generator.text _("Number of factors: %d") % m
-       generator.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction")]) do |t|
+       generator.table(:name=>_("Communalities"), :header=>[_("Variable"),_("Initial"),_("Extraction"), _("%")]) do |t|
          communalities(m).each_with_index {|com, i|
-
+           perc=com*100.quo(@matrix[i,i])
+           t.row([@variables_names[i], "%0.3f" % @matrix[i,i] , "%0.3f" % com, "%0.3f" % perc])
          }
        end
-
+       te=total_eigenvalues
        generator.table(:name=>_("Total Variance Explained"), :header=>[_("Component"), _("E.Total"), _("%"), _("Cum. %")]) do |t|
          ac_eigen=0
          eigenvalues.each_with_index {|eigenvalue,i|
            ac_eigen+=eigenvalue
-           t.row([_("Component %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(
+           t.row([_("Component %d") % (i+1), sprintf("%0.3f",eigenvalue), sprintf("%0.3f%%", eigenvalue*100.quo(te)), sprintf("%0.3f",ac_eigen*100.quo(te))])
          }
        end
+
        generator.parse_element(component_matrix(m))
+
        if (summary_rotation)
          generator.parse_element(rotation)
        end
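A closing sketch that ties the renamed PCA API together: build the analysis from a covariance matrix, inspect the eigenstructure, and project data onto the retained components. Data and field names are illustrative, and passing :m at construction to fix the number of components is an assumption (otherwise components with eigenvalue >= 1 are kept, as shown above):

  require 'statsample'

  ds  = { 'x1' => [2.5, 0.5, 2.2, 1.9, 3.1].to_scale,
          'x2' => [2.4, 0.7, 2.9, 2.2, 3.0].to_scale }.to_dataset
  cov = ds.covariance_matrix

  pca = Statsample::Factor::PCA.new(cov, :m=>1)
  puts pca.eigenvalues.inspect        # variance carried by each component
  pcs = pca.principal_components(ds)  # dataset with fields PC_1 .. PC_m; signs may differ between Ruby and GSL (see NOTE)
  puts pca.summary                    # communalities and total variance explained, per report_building above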