statsample 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
@@ -0,0 +1,128 @@
1
module Statsample
  module Factor
    # Principal Component Analysis of a given covariance or correlation matrix.
    # For factor analysis, use Statsample::Factor::PrincipalAxis.
    #
    # Reference: SPSS manual
    #
    # Use:
    #   a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
    #   b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
    #   ds={'a'=>a,'b'=>b}.to_dataset
    #   cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
    #   pca=Statsample::Factor::PCA.new(cor_matrix)
    #   p pca.component_matrix
    class PCA
      attr_accessor :name, :m
      include GetText
      bindtextdomain("statsample")

      # matrix:: matrix to analyze. A ::Matrix is converted with +to_gsl+;
      #          any other object is handed to GSL unchanged.
      # opts::   hash of attribute values; every key that has a matching
      #          writer (<tt>:name</tt>, <tt>:m</tt>) is assigned, other
      #          keys are ignored.
      #
      # When +m+ is not supplied it defaults to the number of eigenvalues
      # that are >= 1.0.
      def initialize(matrix, opts=Hash.new)
        if matrix.is_a? ::Matrix
          # Loaded lazily: only needed for the ::Matrix -> GSL conversion.
          require 'matrix_extension'
          matrix = matrix.to_gsl
        end
        @name = ""
        @matrix = matrix
        @n_variables = @matrix.size1
        @m = nil
        opts.each do |k, v|
          # Test for the writer we are about to call, not for the reader.
          writer = "#{k}="
          send(writer, v) if respond_to?(writer)
        end
        calculate_eigenpairs
        # Default number of factors: eigenvalues >= 1
        @m = @eigenpairs.find_all { |pair| pair[0] >= 1.0 }.size if @m.nil?
      end

      # Builds @ds with every field of @original_ds centered on its mean.
      #
      # NOTE(review): nothing in this class assigns @original_ds, so as
      # written this raises NoMethodError on nil -- confirm where the
      # dataset is expected to come from.
      def create_centered_ds
        h = {}
        @original_ds.factors.each do |f|
          mean = @original_ds[f].mean
          h[f] = @original_ds[f].recode { |c| c - mean }
        end
        @ds = h.to_dataset
      end

      # Feature vector for m factors: a (variables x m) GSL matrix whose
      # columns are the eigenvectors of the m largest eigenvalues.
      def feature_vector(m=nil)
        m ||= @m
        omega_m = GSL::Matrix.zeros(@n_variables, m)
        m.times do |i|
          omega_m.set_col(i, @eigenpairs[i][1])
        end
        omega_m
      end

      # Projects data_matrix (one column per original variable) on the
      # first m components. +m+ defaults to the configured number of
      # factors.
      #
      # Raises RuntimeError if the number of columns of data_matrix differs
      # from the number of variables of the analyzed matrix.
      def data_transformation(data_matrix, m=nil)
        m ||= @m
        raise "Data variables number should be equal to original variable number" if data_matrix.size2 != @n_variables
        fv = feature_vector(m)
        (fv.transpose * data_matrix.transpose).transpose
      end

      # Component matrix for m factors: each eigenvector column scaled by
      # the square root of its eigenvalue.
      #
      # Raises RuntimeError if m < 1.
      def component_matrix(m=nil)
        m ||= @m
        raise "m should be > 0" if m < 1
        omega_m = feature_vector(m)
        gammas = (0...m).collect { |i| Math::sqrt(@eigenpairs[i][0]) }
        gamma_m = GSL::Matrix.diagonal(gammas)
        omega_m * (gamma_m)
      end

      # Communality for all variables given m factors.
      # Returns an Array with one value per variable.
      def communality(m=nil)
        m ||= @m
        (0...@n_variables).collect do |i|
          (0...m).inject(0) do |sum, j|
            sum + @eigenpairs[j][0].abs * @eigenpairs[j][1][i]**2
          end
        end
      end

      # Eigenvalues, largest first.
      def eigenvalues
        @eigenpairs.collect { |c| c[0] }
      end

      # Computes @eigenpairs: an Array of [eigenvalue, eigenvector] pairs
      # ordered by descending eigenvalue.
      #
      # The pairs are collected in an Array (not a Hash keyed by the
      # eigenvalue) so that repeated eigenvalues keep all of their
      # eigenvectors.
      def calculate_eigenpairs
        eigval, eigvec = GSL::Eigen.symmv(@matrix)
        pairs = []
        eigval.each_index do |i|
          pairs.push([eigval[i], eigvec.get_col(i)])
        end
        @eigenpairs = pairs.sort_by { |pair| -pair[0] }
      end

      # Writes the report (communalities, eigenvalues and component
      # matrix tables) to the given ReportBuilder generator.
      def to_reportbuilder(generator)
        anchor = generator.add_toc_entry(_("PCA: ") + name)
        generator.add_html "<div class='pca'>" + _("PCA") + " #{@name}<a name='#{anchor}'></a>"

        generator.add_text "Number of factors: #{m}"
        t = ReportBuilder::Table.new(:name=>_("Communalities"), :header=>["Variable","Initial","Extraction"])
        communality(m).each_with_index do |com, i|
          t.add_row([i, 1.0, sprintf("%0.3f", com)])
        end
        generator.parse_element(t)

        t = ReportBuilder::Table.new(:name=>_("Eigenvalues"), :header=>["Variable","Value"])
        eigenvalues.each_with_index do |eigenvalue, i|
          t.add_row([i, sprintf("%0.3f", eigenvalue)])
        end
        generator.parse_element(t)

        t = ReportBuilder::Table.new(:name=>_("Component Matrix"), :header=>["Variable"] + m.times.collect { |c| c + 1 })
        component_matrix(m).to_a.each_with_index do |row, i|
          t.add_row([i] + row.collect { |c| sprintf("%0.3f", c) })
        end
        generator.parse_element(t)
        generator.add_html("</div>")
      end
    end
  end
end
@@ -0,0 +1,133 @@
1
module Statsample
  module Factor
    # Iterative factor extraction over a correlation matrix.
    #
    # Starting from the initial communalities, the diagonal of a working
    # copy of the matrix is replaced by the current communality estimates
    # and a PCA is run on it; this is repeated until no communality
    # changes by MIN_CHANGE_ESTIMATE or more, or the iteration limit is
    # reached.
    class PrincipalAxis
      # Iteration stops once every communality changes by less than this.
      MIN_CHANGE_ESTIMATE=0.0001
      include GetText
      bindtextdomain("statsample")
      attr_accessor :m, :name

      attr_reader :iterations, :initial_eigenvalues

      # matrix:: matrix to analyze (must answer +to_a+, +column_size+,
      #          +row_size+ and <tt>[i,j]</tt>).
      # opts::   hash of attribute values; every key that has a matching
      #          writer (<tt>:name</tt>, <tt>:m</tt>) is assigned, other
      #          keys are ignored.
      def initialize(matrix, opts=Hash.new)
        @matrix = matrix
        @name = ""
        @m = nil
        opts.each do |k, v|
          # Test for the writer we are about to call, not for the reader.
          writer = "#{k}="
          send(writer, v) if respond_to?(writer)
        end
        @clean = true
      end

      # Communalities for m factors, iterating first when no solution for
      # that m is available. +m+ defaults to the configured number of
      # factors, or to the PCA default when none was configured.
      #
      # Raises RuntimeError when the iteration leaves no communality.
      def communality(m=nil)
        m = default_m(m)
        if stale?(m)
          iterate(m)
          raise "Can't calculate comunnality" if @communality.nil?
        end
        @communality
      end

      # Component matrix for m factors, iterating first when no solution
      # for that m is available. +m+ defaults as in #communality.
      def component_matrix(m=nil)
        m = default_m(m)
        iterate(m) if stale?(m)
        @component_matrix
      end

      # Runs at most t iterations for m factors and stores the resulting
      # communalities, component matrix, initial eigenvalues and number of
      # iterations. Returns the component matrix.
      #
      # Raises RuntimeError if any communality estimate exceeds 1.
      def iterate(m, t=25)
        @clean = false
        @m = m
        work_matrix = @matrix.to_a
        prev_com = initial_communalities
        pca = PCA.new(::Matrix.rows(work_matrix))
        @initial_eigenvalues = pca.eigenvalues
        @iterations = 0
        t.times do
          @iterations += 1
          # Put the current estimates on the diagonal of the working matrix.
          prev_com.each_with_index do |v, diag|
            work_matrix[diag][diag] = v
          end
          pca = PCA.new(::Matrix.rows(work_matrix))

          @communality = pca.communality(m)
          converged = true
          @communality.each_with_index do |v2, i2|
            raise "Variable #{i2} with communality > 1" if v2 > 1.0
            converged = false if (v2 - prev_com[i2]).abs >= MIN_CHANGE_ESTIMATE
          end
          break if converged
          prev_com = @communality
        end
        @component_matrix = pca.component_matrix(m)
        # Remember which m the stored solution belongs to, so a later
        # change of +m+ through the accessor is noticed.
        @solved_m = m
        @component_matrix
      end

      # Initial communality of every variable: rxy' * rxx^-1 * rxy, where
      # rxx / rxy come from ::separate_matrices. Computed once and cached.
      def initial_communalities
        @initial_communalities ||= @matrix.column_size.times.collect do |i|
          rxx, rxy = PrincipalAxis.separate_matrices(@matrix, i)
          (rxy.t * rxx.inverse * rxy)[0, 0]
        end
      end

      # Returns two matrixes from a correlation matrix
      # with regressors correlation matrix and criteria xy
      # matrix.
      #
      # matrix:: square matrix
      # y::      index of the criterion variable
      #
      # Returns [rxx, rxy]: rxx is +matrix+ without row and column y,
      # rxy is a one-column matrix with row y of +matrix+ minus element y.
      def self.separate_matrices(matrix, y)
        others = (0...matrix.column_size).reject { |i| i == y }
        rxy = Matrix.columns([others.collect { |i| matrix[y, i] }])
        rxx = Matrix.rows(others.collect { |i| others.collect { |j| matrix[i, j] } })
        [rxx, rxy]
      end

      # Writes the report (communalities, initial eigenvalues and
      # component matrix tables) to the given ReportBuilder generator.
      def to_reportbuilder(generator)
        anchor = generator.add_toc_entry(_("Factor Analysis: ") + name)
        generator.add_html "<div class='pca'>" + _("Factor Analysis") + " #{@name}<a name='#{anchor}'></a>"
        # Without a configured m, fall back to the PCA default.
        m = default_m(@m)
        generator.add_text "Number of factors: #{m}"
        t = ReportBuilder::Table.new(:name=>_("Communalities"), :header=>["Variable","Initial","Extraction"])
        communality(m).each_with_index do |com, i|
          t.add_row([i, sprintf("%0.3f", initial_communalities[i]), sprintf("%0.3f", com)])
        end
        generator.parse_element(t)

        t = ReportBuilder::Table.new(:name=>_("Eigenvalues"), :header=>["Variable","Value"])
        @initial_eigenvalues.each_with_index do |eigenvalue, i|
          t.add_row([i, sprintf("%0.3f", eigenvalue)])
        end
        generator.parse_element(t)

        t = ReportBuilder::Table.new(:name=>_("Component Matrix"), :header=>["Variable"] + m.times.collect { |c| c + 1 })
        component_matrix(m).to_a.each_with_index do |row, i|
          t.add_row([i] + row.collect { |c| sprintf("%0.3f", c) })
        end
        generator.parse_element(t)
        generator.add_html("</div>")
      end

      private

      # Number of factors to use: the explicit value, else the configured
      # one, else the default chosen by PCA for the analyzed matrix.
      def default_m(m)
        m || @m || PCA.new(::Matrix.rows(@matrix.to_a)).m
      end

      # True when there is no stored solution for m factors.
      def stale?(m)
        @clean || m != @solved_m
      end
    end
  end
end
@@ -0,0 +1,125 @@
1
module Statsample
  module Factor
    # Base class for rotate matrixes
    # References:
    # * SPSS Manual
    # * Johnny Lin code for IDL: http://www.johnny-lin.com/idl_code/varimax_k58.pro
    # Use Varimax, Equimax or Quartimax for desired type of rotation
    # Use:
    #   a = Matrix[ [ 0.4320,  0.8129,  0.3872]
    #             , [ 0.7950, -0.5416,  0.2565]
    #             , [ 0.5944,  0.7234, -0.3441]
    #             , [ 0.8945, -0.3921, -0.1863] ]
    #   rotation = Statsample::Factor::Varimax.new(a)
    #   rotation.iterate
    #   p rotation.rotated
    #   p rotation.component_transformation_matrix
    #
    # Subclasses supply the rotation criterion through two methods,
    # <tt>x(a,b,c,d)</tt> and <tt>y(a,b,c,d)</tt>, whose results are the
    # two arguments of the atan2 that gives the rotation angle of a pair
    # of factors. This base class does not define them.
    class Rotation
      # A pair of factors is considered stable when the sine of its
      # rotation angle falls below this value.
      MAX_PRECISION=1e-15
      attr_reader :iterations, :rotated, :component_transformation_matrix, :h2

      # matrix:: loading matrix, one row per variable and one column per
      #          factor (a ::Matrix)
      # opts::   accepted but currently not used
      def initialize(matrix, opts=Hash.new)
        @matrix = matrix
        @n = @matrix.row_size # Variables, p on original
        @m = @matrix.column_size # Factors, r on original
        @component_transformation_matrix = nil
        # Row sums of squared loadings
        @h2 = (@matrix.collect { |c| c**2 } * Matrix.column_vector([1] * @m)).column(0).to_a
      end
      alias_method :communalities, :h2
      alias_method :rotated_component_matrix, :rotated

      # Rotates the matrix, sweeping over every pair of factors until no
      # pair needs rotating or max_i sweeps have been done.
      #
      # Sets +iterations+, +rotated+ and +component_transformation_matrix+
      # and returns the rotated matrix.
      def iterate(max_i=25)
        t = Matrix.identity(@m)
        h = Matrix.diagonal(*@h2).collect { |c| Math::sqrt(c) }
        h_inverse = h.collect { |c| c != 0 ? 1 / c : 0 }
        # Rows normalized by the square root of their communality
        bh = h_inverse * @matrix
        total_pairs = @m * (@m - 1) / 2
        # With fewer than two factors there is nothing to rotate.
        @not_converged = total_pairs > 0
        @iterations = 0
        while @not_converged
          break if @iterations >= max_i
          @iterations += 1
          stable_pairs = 0
          (0..(@m - 2)).each do |i|
            ((i + 1)..(@m - 1)).each do |j|
              phi = rotation_angle(bh.column(i), bh.column(j))
              if Math::sin(phi.abs) >= MAX_PRECISION
                bh = rotate_columns(bh, i, j, phi)
                t = rotate_columns(t, i, j, phi)
              else
                stable_pairs += 1
                # Converged once a sweep leaves every pair untouched
                @not_converged = false if stable_pairs == total_pairs
              end
            end
          end
        end
        @rotated = h * bh
        @component_transformation_matrix = t
        @rotated
      end

      private

      # Rotation angle for the pair of columns xx, yy according to the
      # criterion supplied by the subclass through x and y.
      def rotation_angle(xx, yy)
        uu = @n.times.collect { |k| xx[k]**2 - yy[k]**2 }
        vv = @n.times.collect { |k| 2 * xx[k] * yy[k] }

        a = uu.inject(0) { |ac, u| ac + u }
        b = vv.inject(0) { |ac, v| ac + v }
        c = @n.times.inject(0) { |ac, k| ac + (uu[k]**2 - vv[k]**2) }
        d = @n.times.inject(0) { |ac, k| ac + (2 * uu[k] * vv[k]) }
        Math::atan2(x(a, b, c, d), y(a, b, c, d)) / 4.0
      end

      # Returns a copy of matrix with columns i and j rotated by phi.
      def rotate_columns(matrix, i, j, phi)
        cos = Math::cos(phi)
        sin = Math::sin(phi)
        col_i = matrix.column(i)
        col_j = matrix.column(j)
        new_i = (cos * col_i) + (sin * col_j)
        new_j = ((-sin) * col_i) + (cos * col_j)
        rows = matrix.to_a
        rows.each_with_index do |row, r|
          row[i] = new_i[r]
          row[j] = new_j[r]
        end
        Matrix.rows(rows)
      end
    end

    # Rotation using the Varimax criterion.
    class Varimax < Rotation
      def x(a,b,c,d)
        d - (2 * a * b / @n.to_f)
      end

      def y(a,b,c,d)
        c - ((a**2 - b**2) / @n.to_f)
      end
    end

    # Rotation using the Equimax criterion.
    class Equimax < Rotation
      def x(a,b,c,d)
        d - (@m * a * b / @n.to_f)
      end

      def y(a,b,c,d)
        c - @m * ((a**2 - b**2) / (2 * @n.to_f))
      end
    end

    # Rotation using the Quartimax criterion.
    class Quartimax < Rotation
      def x(a,b,c,d)
        d
      end

      def y(a,b,c,d)
        c
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
module Statsample
  # A histogram consists of a set of bins which count the
  # number of events falling into a given range of a continuous variable x.
  #
  # This implementation follows the convention of GSL
  # for specification.
  #
  # * Verbatim: *
  #
  # The range for bin[i] is given by range[i] to range[i+1].
  # For n bins there are n+1 entries in the array range.
  # Each bin is inclusive at the lower end and exclusive at the upper end.
  # Mathematically this means that the bins are defined
  # by the following inequality,
  #
  #   bin[i] corresponds to range[i] <= x < range[i+1]
  #
  # Here is a diagram of the correspondence between ranges and bins
  # on the number-line for x,
  #
  #
  #      [ bin[0] )[ bin[1] )[ bin[2] )[ bin[3] )[ bin[4] )
  #   ---|---------|---------|---------|---------|---------|---  x
  #    r[0]      r[1]      r[2]      r[3]      r[4]      r[5]
  #
  #
  # In this picture the values of the range array are denoted by r.
  # On the left-hand side of each bin the square bracket '[' denotes
  # an inclusive lower bound ( r <= x), and the round parentheses ')'
  # on the right-hand side denote an exclusive upper bound (x < r).
  # Thus any samples which fall on the upper end of the histogram are
  # excluded.
  # If you want to include this value for the last bin you will need to
  # add an extra bin to your histogram.
  #
  #
  # Reference:
  # http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html
  class Histogram
    class << self
      # Builds a histogram; same arguments as Histogram.new.
      # +opts+ is forwarded to the constructor.
      def alloc(n_bins, range=nil, opts=Hash.new)
        Histogram.new(n_bins, range, opts)
      end
    end
    attr_accessor :name
    attr_reader :bin
    attr_reader :range
    include GetText
    bindtextdomain("statsample")

    # p1::      an Array with the bin limits (n+1 entries for n bins) or
    #           an Integer with the number of bins
    # min_max:: optional [min, max] pair; when given, the limits are
    #           n_bins equal-width intervals between min and max and
    #           replace any limits given in p1
    # opts::    hash of attribute values; every key that has a matching
    #           writer (<tt>:name</tt>) is assigned, other keys are ignored
    #
    # Without limits (Integer p1 and no min_max) every limit is 0.0.
    #
    # Raises ArgumentError if p1 is neither an Array nor an Integer.
    def initialize(p1, min_max=false, opts=Hash.new)
      if p1.is_a? Array
        range = p1
        n_bins = p1.size - 1
      elsif p1.is_a? Integer
        n_bins = p1
      else
        raise ArgumentError, "p1 should be an Array of ranges or an Integer number of bins"
      end

      @bin = [0.0] * (n_bins)
      if min_max
        min, max = min_max[0], min_max[1]
        width = (max - min).quo(n_bins)
        range = Array.new(n_bins + 1) { |i| min + (i * width) }
      end
      range ||= [0.0] * (n_bins + 1)
      set_ranges(range)
      @name = ""
      opts.each do |k, v|
        # Test for the writer we are about to call, not for the reader.
        writer = "#{k}="
        send(writer, v) if respond_to?(writer)
      end
    end

    # Adds w to the bin that contains x (range[i] <= x < range[i+1]).
    # An Array is incremented element by element. Values outside every
    # bin, and values that are neither Array nor Numeric, are ignored.
    def increment(x, w=1)
      if x.is_a? Array
        x.each { |y| increment(y, w) }
      elsif x.is_a? Numeric
        (range.size - 1).times do |i|
          if x >= range[i] and x < range[i + 1]
            @bin[i] += w
            break
          end
        end
      end
    end

    # Replaces the bin limits.
    # Raises RuntimeError unless range has one entry more than bins exist.
    def set_ranges(range)
      raise "Range size should be bin+1" if range.size != @bin.size + 1
      @range = range
    end

    # Writes one text line per bin (lower limit and count) to the given
    # ReportBuilder generator.
    def to_reportbuilder_text(generator)
      generator.add_toc_entry(_("Histogram %s") % [@name])
      range.each_with_index do |r, i|
        # The last limit only closes the last bin
        next if i == @bin.size
        generator.add_text(sprintf("%4.2f : %d", r, @bin[i]))
      end
    end
  end
end