statsample 0.5.1 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
@@ -0,0 +1,128 @@
1
+ module Statsample
2
+ module Factor
3
+ # Principal Component Analysis of a given covariance or correlation matrix.
4
+ # For factorial Analysis, use Statsample::Factor::PrincipalAxis
5
+ # Reference: SPSS manual
6
+ # Use:
7
+ # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
8
+ # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
9
+ # ds={'a'=>a,'b'=>b}.to_dataset
10
+ # cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
11
+ # pca=Statsample::Factor::PCA.new(cor_matrix)
12
+ # p pca.component_matrix
13
+ class PCA
14
+ attr_accessor :name, :m
15
+ include GetText
16
+ bindtextdomain("statsample")
17
+
18
+
19
# Builds the analysis from a covariance or correlation matrix.
#
# matrix:: a ::Matrix (converted with #to_gsl) or an object that already
#          answers #size1.
# opts::   each key the object responds to is assigned through the
#          matching "key=" writer (e.g. :name, :m).
#
# When no number of factors (m) was supplied, m defaults to the count of
# eigenvalues that are >= 1.0.
def initialize(matrix, opts = Hash.new)
  if matrix.is_a?(::Matrix)
    require 'matrix_extension'
    matrix = matrix.to_gsl
  end
  @name = ""
  @matrix = matrix
  @n_variables = @matrix.size1
  @m = nil
  opts.each do |key, value|
    send("#{key}=", value) if respond_to?(key)
  end
  calculate_eigenpairs
  # Default number of factors: eigenvalues >= 1
  @m = @eigenpairs.find_all { |pair| pair[0] >= 1.0 }.size if @m.nil?
end
38
# Builds @ds: a dataset where every field of @original_ds has its mean
# subtracted from each value.
# NOTE(review): @original_ds is not assigned anywhere in the visible class;
# confirm where it is expected to be set before relying on this method.
def create_centered_ds
  centered = {}
  @original_ds.factors.each do |field|
    field_mean = @original_ds[field].mean
    centered[field] = @original_ds[field].recode { |value| value - field_mean }
  end
  @ds = centered.to_dataset
end
46
# Feature vector for m factors: a matrix of @n_variables rows whose
# columns are the eigenvectors of the first m eigenpairs.
# m defaults to the number of factors stored in @m.
def feature_vector(m = nil)
  m ||= @m
  features = GSL::Matrix.zeros(@n_variables, m)
  (0...m).each { |column| features.set_col(column, @eigenpairs[column][1]) }
  features
end
55
# Projects a data matrix onto the first m components.
#
# data_matrix:: matrix with one column per original variable
#               (its #size2 must equal the number of variables).
# m::           number of components; defaults to @m, like the other
#               accessors of this class.
#
# Returns (feature_vector(m)' * data_matrix')'.
# Raises RuntimeError when the column count does not match.
def data_transformation(data_matrix, m = nil)
  m ||= @m
  raise "Data variables number should be equal to original variable number" if data_matrix.size2 != @n_variables
  fv = feature_vector(m)
  (fv.transpose * data_matrix.transpose).transpose
end
62
# Component matrix for m factors: the first m eigenvectors (as columns)
# multiplied by a diagonal matrix holding the square roots of their
# eigenvalues. m defaults to @m.
# Raises RuntimeError when m is lower than 1.
def component_matrix(m = nil)
  m ||= @m
  raise "m should be > 0" if m < 1
  omega_m = GSL::Matrix.zeros(@n_variables, m)
  gammas = (0...m).map do |index|
    omega_m.set_col(index, @eigenpairs[index][1])
    Math.sqrt(@eigenpairs[index][0])
  end
  omega_m * GSL::Matrix.diagonal(gammas)
end
75
# Communality of every variable given m factors (m defaults to @m).
# For variable i the value is the sum, over the first m eigenpairs, of
# |eigenvalue| * eigenvector[i]**2.
# Returns an Array with one entry per variable.
def communality(m = nil)
  m ||= @m
  Array.new(@n_variables) do |variable|
    (0...m).inject(0) do |total, factor|
      total + @eigenpairs[factor][0].abs * @eigenpairs[factor][1][variable]**2
    end
  end
end
88
# Eigenvalues in the order they are stored in @eigenpairs
# (first element of every [eigenvalue, eigenvector] pair).
def eigenvalues
  @eigenpairs.map(&:first)
end
91
# Computes the eigen decomposition of @matrix and stores it in @eigenpairs
# as an Array of [eigenvalue, eigenvector] pairs, sorted by eigenvalue in
# descending order.
#
# Pairs are collected in an Array rather than a Hash keyed by eigenvalue:
# with a Hash, repeated eigenvalues overwrite each other and components
# are silently lost. Ties keep the order returned by the solver.
def calculate_eigenpairs
  eigval, eigvec = GSL::Eigen.symmv(@matrix)
  pairs = []
  eigval.each_index { |i| pairs << [eigval[i], eigvec.get_col(i)] }
  @eigenpairs = pairs.sort_by.with_index { |(value, _vector), i| [-value, i] }
end
99
# Writes the analysis to a ReportBuilder generator: a TOC entry, an HTML
# wrapper, the number of factors and three tables (communalities,
# eigenvalues and component matrix), all for the current number of
# factors m.
def to_reportbuilder(generator)
  anchor = generator.add_toc_entry(_("PCA: ") + name)
  generator.add_html "<div class='pca'>" + _("PCA") + " #{@name}<a name='#{anchor}'></a>"

  generator.add_text "Number of factors: #{m}"

  communalities = ReportBuilder::Table.new(:name => _("Communalities"), :header => ["Variable", "Initial", "Extraction"])
  communality(m).each_with_index do |value, index|
    communalities.add_row([index, 1.0, sprintf("%0.3f", value)])
  end
  generator.parse_element(communalities)

  eigen_table = ReportBuilder::Table.new(:name => _("Eigenvalues"), :header => ["Variable", "Value"])
  eigenvalues.each_with_index do |value, index|
    eigen_table.add_row([index, sprintf("%0.3f", value)])
  end
  generator.parse_element(eigen_table)

  loadings = ReportBuilder::Table.new(:name => _("Component Matrix"), :header => ["Variable"] + (1..m).to_a)
  component_matrix(m).to_a.each_with_index do |row, index|
    loadings.add_row([index] + row.collect { |c| sprintf("%0.3f", c) })
  end
  generator.parse_element(loadings)
  generator.add_html("</div>")
end
126
+ end
127
+ end
128
+ end
@@ -0,0 +1,133 @@
1
+ module Statsample
2
+ module Factor
3
+ class PrincipalAxis
4
+ MIN_CHANGE_ESTIMATE=0.0001
5
+ include GetText
6
+ bindtextdomain("statsample")
7
+ attr_accessor :m, :name
8
+
9
+ attr_reader :iterations, :initial_eigenvalues
10
# Stores the matrix to analyse and applies the given options.
#
# matrix:: matrix to analyse (kept as-is in @matrix).
# opts::   each key the object responds to is assigned through the
#          matching "key=" writer (e.g. :m, :name).
#
# @clean starts as true, which forces the first call to #communality or
# #component_matrix to run #iterate.
def initialize(matrix, opts = Hash.new)
  @matrix = matrix
  @name = ""
  @m = nil
  opts.each do |option, value|
    send("#{option}=", value) if respond_to?(option)
  end
  @clean = true
end
19
# Communalities for m factors.
# Runs #iterate first when nothing has been computed yet (@clean) or when
# m differs from the number of factors used in the last run.
# Raises RuntimeError if the iteration left no communalities.
def communality(m)
  stale = (m != @m) || @clean
  if stale
    iterate(m)
    raise "Can't calculate comunnality" if @communality.nil?
  end
  @communality
end
26
# Component matrix for m factors.
# Runs #iterate first when nothing has been computed yet (@clean) or when
# m differs from the number of factors used in the last run.
def component_matrix(m)
  stale = (m != @m) || @clean
  iterate(m) if stale
  @component_matrix
end
32
+
33
# Iterative principal axis extraction for m factors, with at most t rounds.
#
# Starting from the initial communalities, each round places the previous
# communality estimates on the diagonal of a working copy of the matrix,
# runs a PCA on it and reads the new communalities. Iteration stops early
# when every communality changed by less than MIN_CHANGE_ESTIMATE.
#
# Sets @m, @iterations, @initial_eigenvalues, @communality and
# @component_matrix, and returns the component matrix.
# Raises RuntimeError when a communality above 1.0 is found.
def iterate(m, t = 25)
  @clean = false
  @m = m
  work_matrix = @matrix.to_a
  prev_com = initial_communalities
  pca = PCA.new(::Matrix.rows(work_matrix))
  @initial_eigenvalues = pca.eigenvalues
  @iterations = 0
  t.times do
    @iterations += 1
    prev_com.each_with_index { |estimate, var| work_matrix[var][var] = estimate }
    # PCA lives in Statsample::Factor, so it is resolved lexically here
    # (Statsample::PCA is not defined).
    pca = PCA.new(::Matrix.rows(work_matrix))

    @communality = pca.communality(m)
    converged = true
    @communality.each_with_index do |value, var|
      raise "Variable #{var} with communality > 1" if value > 1.0
      converged = false if (value - prev_com[var]).abs >= MIN_CHANGE_ESTIMATE
    end
    break if converged
    prev_com = @communality
  end
  @component_matrix = pca.component_matrix(m)
end
60
+
61
+
62
# Initial communality estimate for every variable, memoized in
# @initial_communalities.
# For variable i the matrix is split with PrincipalAxis.separate_matrices
# into rxx (remaining variables) and rxy (variable i against the rest);
# the estimate is the single cell of rxy' * rxx^-1 * rxy.
def initial_communalities
  @initial_communalities ||= @matrix.column_size.times.collect do |i|
    # The helper is a class method of PrincipalAxis; no FactorialAnalysis
    # constant is defined in the code in view.
    rxx, rxy = PrincipalAxis.separate_matrices(@matrix, i)
    (rxy.t * rxx.inverse * rxy)[0, 0]
  end
end
72
# Splits a correlation matrix around variable y.
# Returns [rxx, rxy] where:
# * rxx is the matrix without row y and column y (regressors), and
# * rxy is a one-column matrix with row y's values for every other column
#   (regressors against the criterion).
def self.separate_matrices(matrix, y)
  other_columns = (0...matrix.column_size).reject { |i| i == y }
  rxy = Matrix.columns([other_columns.map { |i| matrix[y, i] }])
  # Both dimensions of rxx are walked with row_size, as the input is
  # treated as a square matrix.
  others = (0...matrix.row_size).reject { |i| i == y }
  rxx = Matrix.rows(others.map { |i| others.map { |j| matrix[i, j] } })
  [rxx, rxy]
end
94
+
95
+
96
# Writes the analysis to a ReportBuilder generator: a TOC entry, an HTML
# wrapper, the number of factors and three tables (communalities, initial
# eigenvalues and component matrix).
#
# When no number of factors was set (@m is nil), the count of eigenvalues
# >= 1.0 of a PCA over the matrix is used. This class never sets
# @eigenpairs, so the eigenvalues are taken from PCA instead.
def to_reportbuilder(generator)
  anchor = generator.add_toc_entry(_("Factor Analysis: ") + name)
  generator.add_html "<div class='pca'>" + _("Factor Analysis") + " #{@name}<a name='#{anchor}'></a>"
  if @m.nil?
    # Default number of factors: eigenvalues >= 1
    m = PCA.new(@matrix).eigenvalues.find_all { |value| value >= 1.0 }.size
  else
    m = @m
  end
  generator.add_text "Number of factors: #{m}"

  t = ReportBuilder::Table.new(:name => _("Communalities"), :header => ["Variable", "Initial", "Extraction"])
  # communality(m) runs the iteration when needed, which also fills
  # @initial_eigenvalues for the next table.
  communality(m).each_with_index do |com, i|
    t.add_row([i, sprintf("%0.3f", initial_communalities[i]), sprintf("%0.3f", com)])
  end
  generator.parse_element(t)

  t = ReportBuilder::Table.new(:name => _("Eigenvalues"), :header => ["Variable", "Value"])
  @initial_eigenvalues.each_with_index do |eigenvalue, i|
    t.add_row([i, sprintf("%0.3f", eigenvalue)])
  end
  generator.parse_element(t)

  t = ReportBuilder::Table.new(:name => _("Component Matrix"), :header => ["Variable"] + m.times.collect { |c| c + 1 })
  component_matrix(m).to_a.each_with_index do |row, i|
    t.add_row([i] + row.collect { |c| sprintf("%0.3f", c) })
  end
  generator.parse_element(t)
  generator.add_html("</div>")
end
128
+
129
+
130
+ end
131
+
132
+ end
133
+ end
@@ -0,0 +1,125 @@
1
+ module Statsample
2
+ module Factor
3
+ # Base class for rotating matrices
4
+ # References:
5
+ # * SPSS Manual
6
+ # * Johnny Lin code for IDL: http://www.johnny-lin.com/idl_code/varimax_k58.pro
7
+ # Use Varimax, Equimax or Quartimax for desired type of rotation
8
+ # Use:
9
+ # a = Matrix[ [ 0.4320, 0.8129, 0.3872]
10
+ # , [ 0.7950, -0.5416, 0.2565]
11
+ # , [ 0.5944, 0.7234, -0.3441]
12
+ # , [ 0.8945, -0.3921, -0.1863] ]
13
+ # rotation = Statsample::Factor::Varimax.new(a)
14
+ # rotation.iterate
15
+ # p rotation.rotated
16
+ # p rotation.component_transformation_matrix
17
+ #
18
+ class Rotation
19
+ MAX_PRECISION=1e-15
20
+ attr_reader :iterations, :rotated, :component_transformation_matrix, :h2
21
# Stores the matrix to rotate and its dimensions.
#
# matrix:: loading matrix, one row per variable and one column per factor.
# opts::   accepted but not read by this method.
#
# @h2 holds, for every row, the sum of its squared entries (computed as
# the element-wise squared matrix times a column of ones).
def initialize(matrix, opts = Hash.new)
  @matrix = matrix
  @n = @matrix.row_size    # Variables, p on original
  @m = @matrix.column_size # Factors, r on original
  @component_transformation_matrix = nil
  squared = @matrix.collect { |loading| loading**2 }
  ones = Matrix.column_vector([1] * @m)
  @h2 = (squared * ones).column(0).to_a
end
28
+ alias_method :communalities, :h2
29
+ alias_method :rotated_component_matrix, :rotated
30
# Runs the rotation.
#
# Rows of the matrix are first divided by the square root of their
# communality (@h2). Every sweep then visits each pair of factor columns
# (i, j), computes the angle phi from the subclass hooks #x and #y, and
# rotates both the normalized loadings and the transformation matrix by
# phi. A pair whose sin(|phi|) falls below MAX_PRECISION is left alone;
# when no pair was rotated in a sweep the process has converged. Sweeps
# also stop once the iteration counter exceeds max_i.
#
# Sets @iterations, @rotated and @component_transformation_matrix, and
# returns the rotated matrix.
def iterate(max_i = 25)
  t = Matrix.identity(@m)
  h = Matrix.diagonal(*@h2).collect { |c| Math.sqrt(c) }
  h_inverse = h.collect { |c| c != 0 ? 1 / c : 0 }
  bh = h_inverse * @matrix.dup
  @not_converged = true
  @iterations = 0
  while @not_converged
    break if iterations > max_i
    @iterations += 1
    # Pairs still to be found unrotated before declaring convergence
    num_pairs = @m * (@m - 1).quo(2)
    (0..(@m - 2)).each do |i|
      ((i + 1)..(@m - 1)).each do |j|
        xx = bh.column(i)
        yy = bh.column(j)

        uu = (0...@n).map { |r| xx[r]**2 - yy[r]**2 }
        vv = (0...@n).map { |r| 2 * xx[r] * yy[r] }

        a = uu.inject(0) { |acc, u| acc + u }
        b = vv.inject(0) { |acc, v| acc + v }
        c = (0...@n).inject(0) { |acc, r| acc + (uu[r]**2 - vv[r]**2) }
        d = (0...@n).inject(0) { |acc, r| acc + (2 * uu[r] * vv[r]) }
        phi = Math.atan2(x(a, b, c, d), y(a, b, c, d)) / 4.0

        unless Math.sin(phi.abs) >= MAX_PRECISION
          # Negligible angle: this pair needs no rotation
          num_pairs -= 1
          @not_converged = false if num_pairs == 0
          next
        end

        tx = t.column(i)
        ty = t.column(j)
        cos_phi = Math.cos(phi)
        sin_phi = Math.sin(phi)

        xx_rot = (cos_phi * xx) + (sin_phi * yy)
        yy_rot = ((-sin_phi) * xx) + (cos_phi * yy)
        tx_rot = (cos_phi * tx) + (sin_phi * ty)
        ty_rot = ((-sin_phi) * tx) + (cos_phi * ty)

        # Write the rotated columns back into both matrices
        bh_rows = bh.to_a
        @n.times do |row|
          bh_rows[row][i] = xx_rot[row]
          bh_rows[row][j] = yy_rot[row]
        end
        t_rows = t.to_a
        @m.times do |row|
          t_rows[row][i] = tx_rot[row]
          t_rows[row][j] = ty_rot[row]
        end

        bh = Matrix.rows(bh_rows)
        t = Matrix.rows(t_rows)
      end
    end
  end
  @rotated = h * bh
  @component_transformation_matrix = t
  @rotated
end
98
+
99
+ end
100
+ class Varimax < Rotation
101
# Numerator passed to atan2 for the rotation angle:
# d minus 2*a*b divided by the number of variables (@n).
def x(a, b, c, d)
  correction = 2 * a * b / @n.to_f
  d - correction
end
104
# Denominator passed to atan2 for the rotation angle:
# c minus (a^2 - b^2) divided by the number of variables (@n).
def y(a, b, c, d)
  correction = (a**2 - b**2) / @n.to_f
  c - correction
end
107
+ end
108
+ class Equimax < Rotation
109
# Numerator passed to atan2 for the rotation angle:
# d minus @m*a*b divided by the number of variables (@n),
# where @m is the number of factors.
def x(a, b, c, d)
  correction = @m * a * b / @n.to_f
  d - correction
end
112
# Denominator passed to atan2 for the rotation angle:
# c minus @m times (a^2 - b^2) / (2 * @n),
# where @m is the number of factors and @n the number of variables.
def y(a, b, c, d)
  scaled = (a**2 - b**2) / (2 * @n.to_f)
  c - @m * scaled
end
115
+ end
116
+ class Quartimax < Rotation
117
# Numerator passed to atan2 for the rotation angle: d, unchanged.
# a, b and c are accepted only to match the signature used by #iterate.
def x(a, b, c, d)
  d
end
120
# Denominator passed to atan2 for the rotation angle: c, unchanged.
# a, b and d are accepted only to match the signature used by #iterate.
def y(a, b, c, d)
  c
end
123
+ end
124
+ end
125
+ end
@@ -0,0 +1,99 @@
1
+ module Statsample
2
+ # A histogram consists of a set of bins which count the
3
+ # number of events falling into a given range of a continuous variable x.
4
+ #
5
+ # This implementations follows convention of GSL
6
+ # for specification.
7
+ #
8
+ # * Verbatim: *
9
+ #
10
+ # The range for bin[i] is given by range[i] to range[i+1].
11
+ # For n bins there are n+1 entries in the array range.
12
+ # Each bin is inclusive at the lower end and exclusive at the upper end.
13
+ # Mathematically this means that the bins are defined
14
+ # by the following inequality,
15
+ #
16
+ # bin[i] corresponds to range[i] <= x < range[i+1]
17
+ #
18
+ # Here is a diagram of the correspondence between ranges and bins
19
+ # on the number-line for x,
20
+ #
21
+ #
22
+ # [ bin[0] )[ bin[1] )[ bin[2] )[ bin[3] )[ bin[4] )
23
+ # ---|---------|---------|---------|---------|---------|--- x
24
+ # r[0] r[1] r[2] r[3] r[4] r[5]
25
+ #
26
+ #
27
+ # In this picture the values of the range array are denoted by r.
28
+ # On the left-hand side of each bin the square bracket ‘[’ denotes
29
+ # an inclusive lower bound ( r <= x), and the round parentheses ‘)’
30
+ # on the right-hand side denote an exclusive upper bound (x < r).
31
+ # Thus any samples which fall on the upper end of the histogram are
32
+ # excluded.
33
+ # If you want to include this value for the last bin you will need to
34
+ # add an extra bin to your histogram.
35
+ #
36
+ #
37
+ # Reference:
38
+ # http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html
39
+
40
+ class Histogram
41
+ class << self
42
# GSL-style constructor: builds a Histogram from a number of bins (or an
# Array of ranges) and an optional [min, max] pair.
# opts is forwarded to the constructor; previously it was accepted here
# but dropped, so options such as :name had no effect.
def alloc(n_bins, range = nil, opts = Hash.new)
  Histogram.new(n_bins, range, opts)
end
46
+ end
47
+ attr_accessor :name
48
+ attr_reader :bin
49
+ attr_reader :range
50
+ include GetText
51
+ bindtextdomain("statsample")
52
# Creates the histogram.
#
# p1::      an Array of range limits (giving size - 1 bins) or an Integer
#           number of bins.
# min_max:: optional [min, max] pair; when given, the range limits are
#           spread evenly between min and max over the bins and replace
#           any limits taken from p1.
# opts::    each key the object responds to is assigned through the
#           matching "key=" writer (e.g. :name).
#
# All bins start at 0.0. Without limits from p1 or min_max, every range
# limit is 0.0.
def initialize(p1, min_max = false, opts = Hash.new)
  case p1
  when Array
    range = p1
    n_bins = p1.size - 1
  when Integer
    n_bins = p1
  end

  @bin = [0.0] * n_bins
  if min_max
    min = min_max[0]
    max = min_max[1]
    range = Array.new(n_bins + 1) { |i| min + (i * (max - min).quo(n_bins)) }
  end
  set_ranges(range || [0.0] * (n_bins + 1))
  @name = ""
  opts.each do |key, value|
    send("#{key}=", value) if respond_to?(key)
  end
end
74
+
75
# Adds weight w to the bin whose range contains x
# (range[i] <= x < range[i+1]). Values outside every range are ignored.
# When x is an Array, every element is added with the same weight.
# Values that are neither Array nor Numeric are ignored.
def increment(x, w = 1)
  case x
  when Array
    x.each { |value| increment(value, w) }
  when Numeric
    (range.size - 1).times do |slot|
      next unless x >= range[slot] && x < range[slot + 1]
      @bin[slot] += w
      break
    end
  end
end
87
# Replaces the range limits.
# Raises RuntimeError unless range has exactly one more entry than there
# are bins.
def set_ranges(range)
  unless range.size == @bin.size + 1
    raise "Range size should be bin+1"
  end
  @range = range
end
91
# Text report: registers a TOC entry named after the histogram and writes
# one "lower limit : count" line per bin. The last range limit has no bin
# and is skipped.
def to_reportbuilder_text(generator)
  # The returned anchor is not used by the text output
  generator.add_toc_entry(_("Histogram %s") % [@name])
  range.each_with_index do |lower, index|
    generator.add_text(sprintf("%4.2f : %d", lower, @bin[index])) unless index == @bin.size
  end
end
98
+ end
99
+ end