statsample 0.9.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data.tar.gz.sig +0 -0
  2. data/History.txt +20 -1
  3. data/Manifest.txt +8 -1
  4. data/README.txt +11 -7
  5. data/Rakefile +2 -2
  6. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  7. data/examples/dataset.rb +8 -0
  8. data/examples/multiple_regression.rb +1 -1
  9. data/examples/parallel_analysis.rb +29 -0
  10. data/examples/parallel_analysis_tetrachoric.rb +30 -0
  11. data/examples/vector.rb +6 -0
  12. data/lib/distribution.rb +16 -6
  13. data/lib/distribution/normal.rb +27 -20
  14. data/lib/distribution/normalbivariate.rb +1 -1
  15. data/lib/statsample.rb +19 -2
  16. data/lib/statsample/anova.rb +118 -16
  17. data/lib/statsample/bivariate.rb +27 -13
  18. data/lib/statsample/bivariate/polychoric.rb +18 -5
  19. data/lib/statsample/crosstab.rb +66 -74
  20. data/lib/statsample/dataset.rb +52 -45
  21. data/lib/statsample/dominanceanalysis.rb +2 -5
  22. data/lib/statsample/factor.rb +1 -1
  23. data/lib/statsample/factor/parallelanalysis.rb +122 -0
  24. data/lib/statsample/factor/pca.rb +23 -28
  25. data/lib/statsample/factor/principalaxis.rb +8 -3
  26. data/lib/statsample/matrix.rb +27 -24
  27. data/lib/statsample/mle.rb +11 -11
  28. data/lib/statsample/permutation.rb +2 -1
  29. data/lib/statsample/regression.rb +10 -8
  30. data/lib/statsample/regression/multiple/baseengine.rb +36 -25
  31. data/lib/statsample/regression/multiple/gslengine.rb +14 -0
  32. data/lib/statsample/regression/multiple/matrixengine.rb +4 -32
  33. data/lib/statsample/regression/multiple/rubyengine.rb +2 -6
  34. data/lib/statsample/regression/simple.rb +1 -1
  35. data/lib/statsample/reliability.rb +42 -54
  36. data/lib/statsample/test.rb +10 -6
  37. data/lib/statsample/test/f.rb +16 -26
  38. data/lib/statsample/test/levene.rb +4 -8
  39. data/lib/statsample/test/t.rb +30 -24
  40. data/lib/statsample/test/umannwhitney.rb +13 -6
  41. data/lib/statsample/vector.rb +86 -76
  42. data/po/es/statsample.mo +0 -0
  43. data/po/es/statsample.po +127 -94
  44. data/po/statsample.pot +114 -79
  45. data/test/test_anovaoneway.rb +27 -0
  46. data/test/test_anovawithvectors.rb +97 -0
  47. data/test/test_bivariate.rb +6 -57
  48. data/test/test_bivariate_polychoric.rb +65 -0
  49. data/test/test_crosstab.rb +6 -0
  50. data/test/test_dataset.rb +29 -1
  51. data/test/test_distribution.rb +6 -13
  52. data/test/test_dominance_analysis.rb +1 -1
  53. data/test/test_factor.rb +3 -3
  54. data/test/test_helpers.rb +18 -18
  55. data/test/test_matrix.rb +33 -20
  56. data/test/test_permutation.rb +36 -30
  57. data/test/test_regression.rb +26 -8
  58. data/test/test_reliability.rb +104 -14
  59. data/test/test_test_f.rb +11 -14
  60. data/test/test_test_t.rb +42 -35
  61. data/test/test_umannwhitney.rb +22 -10
  62. data/test/test_vector.rb +204 -102
  63. metadata +57 -81
  64. metadata.gz.sig +0 -0
  65. data/test/test_anova.rb +0 -24
@@ -134,6 +134,7 @@ module Statsample
134
134
  end
135
135
  @models=nil
136
136
  @models_data=nil
137
+ @general_averages=nil
137
138
  end
138
139
  # Compute models.
139
140
  def compute
@@ -308,11 +309,7 @@ module Statsample
308
309
  @general_averages
309
310
  end
310
311
 
311
- def summary
312
- rp=ReportBuilder.new()
313
- rp.add(self)
314
- rp.to_text
315
- end
312
+
316
313
  def report_building(g)
317
314
  compute if @models.nil?
318
315
 
@@ -1,7 +1,7 @@
1
1
  require 'statsample/factor/pca'
2
2
  require 'statsample/factor/principalaxis'
3
3
  require 'statsample/factor/rotation'
4
-
4
+ require 'statsample/factor/parallelanalysis'
5
5
  module Statsample
6
6
  # Factor Analysis toolbox.
7
7
  # * Classes for Extraction of factors:
@@ -0,0 +1,122 @@
1
+ module Statsample
2
+ module Factor
3
+ # Performs Horn's 'parallel analysis' to a principal components analysis
4
+ # to adjust for sample bias in the retention of components.
5
+ # Can create the bootstrap samples using parameters (mean and standard
6
+ # deviation of each variable) or sampling for actual data.
7
+ # == Description
8
+ # "PA involves the construction of a number of correlation matrices of random variables based on the same sample size and number of variables in the real data set. The average eigenvalues from the random correlation matrices are then compared to the eigenvalues from the real data correlation matrix, such that the first observed eigenvalue is compared to the first random eigenvalue, the second observed eigenvalue is compared to the second random eigenvalue, and so on." (Hayton, Allen & Scarpello, 2004, p.194)
9
+ # == Usage
10
+ # # ds should be any valid dataset
11
+ # pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>100, :bootstrap_method=>:raw_data)
12
+ # == References:
13
+ # * Hayton, J., Allen, D. & Scarpello, V.(2004). Factor Retention Decisions in Exploratory Factor Analysis: a Tutorial on Parallel Analysis. <i>Organizational Research Methods, 7</i> (2), 191-205.
14
+ # * https://people.ok.ubc.ca/brioconn/nfactors/nfactors.html (for inspiration)
15
+ class ParallelAnalysis
16
+
17
+ include DirtyMemoize
18
+
19
+ # Number of random sets to produce. 50 by default
20
+ attr_accessor :iterations
21
+ # Name of analysis
22
+ attr_accessor :name
23
+ # Dataset. You could use mock vectors when use bootstrap method
24
+ attr_reader :ds
25
+ # Bootstrap method. <tt>:raw_data</tt> used by default
26
+ # * <tt>:parameter</tt>: uses mean and standard deviation of each variable
27
+ # * <tt>:raw_data</tt> : sample with replacement from actual data.
28
+ #
29
+ attr_accessor :bootstrap_method
30
+ # Factor method.
31
+ # Could be Statsample::Factor::PCA or Statsample::Factor::PrincipalAxis.
32
+ # PCA used by default.
33
+ attr_accessor :factor_class
34
+ # Percentil over bootstrap eigenvalue should be accepted. 95 by default
35
+ attr_accessor :percentil
36
+ # Correlation matrix used with :raw_data . <tt>:correlation_matrix</tt> used by default
37
+ attr_accessor :matrix_method
38
+ # Dataset with bootstrapped eigenvalues
39
+ attr_reader :ds_eigenvalues
40
+ # Show extra information if true
41
+ attr_accessor :debug
42
+
43
+
44
+ def initialize(ds, opts=Hash.new)
45
+ @ds=ds
46
+ @fields=@ds.fields
47
+ @n_variables=@fields.size
48
+ @n_cases=ds.cases
49
+ opts_default={
50
+ :name=>"Parallel Analysis",
51
+ :iterations=>50,
52
+ :bootstrap_method => :raw_data,
53
+ :factor_class => Statsample::Factor::PCA,
54
+ :percentil=>95,
55
+ :debug=>false,
56
+ :matrix_method=>:correlation_matrix
57
+ }
58
+ @opts=opts_default.merge(opts)
59
+ @opts[:matrix_method]==:correlation_matrix if @opts[:bootstrap_method]==:parameters
60
+ opts_default.keys.each {|k| send("#{k}=", @opts[k]) }
61
+ end
62
+ # Summary of results
63
+ def summary
64
+ ReportBuilder.new(:no_title=>true).add(self).to_text
65
+ end
66
+ # Number of factor to retent
67
+ def number_of_factors
68
+ total=0
69
+ ds_eigenvalues.fields.each_with_index do |f,i|
70
+ total+=1 if (@original[i]>0 and @original[i]>ds_eigenvalues[f].percentil(percentil))
71
+ end
72
+ total
73
+ end
74
+ def report_building(g) #:nodoc:
75
+ g.section(:name=>@name) do |s|
76
+ s.text "Bootstrap Method: #{bootstrap_method}"
77
+ s.text "Correlation Matrix type : #{matrix_method}"
78
+ s.text "Number of variables: #{@n_variables}"
79
+ s.text "Number of cases: #{@n_cases}"
80
+ s.text "Number of iterations: #{@iterations}"
81
+ s.text "Number or factors to preserve: #{number_of_factors}"
82
+ s.table(:name=>"Eigenvalues", :header=>["Eigenvalue", "actual", "mean","p.#{percentil}","preserve?"]) do |t|
83
+ ds_eigenvalues.fields.each_with_index do |f,i|
84
+ v=ds_eigenvalues[f]
85
+ t.row [i+1, "%0.4f" % @original[i], "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), (v.percentil(percentil)>0 and @original[i] > v.percentil(percentil)) ? "Yes":""]
86
+ end
87
+ end
88
+
89
+ end
90
+ end
91
+ # Perform calculation. Shouldn't be called directly for the user
92
+ def compute
93
+ @original=factor_class.new(Statsample::Bivariate.correlation_matrix(@ds), :m=>@n_variables).eigenvalues.sort.reverse
94
+ @ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v})
95
+ @ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale}
96
+
97
+ @iterations.times do |i|
98
+ # Create a dataset of dummy values
99
+ ds_bootstrap=Statsample::Dataset.new(@ds.fields)
100
+ if bootstrap_method==:parameter
101
+ rng = GSL::Rng.alloc()
102
+ end
103
+
104
+ @fields.each do |f|
105
+ if bootstrap_method==:parameter
106
+ ds_bootstrap[f]=@n_cases.times.map {|c| rng.gaussian( @ds[f].sd)+@ds[f].mean}.to_scale
107
+ elsif bootstrap_method==:raw_data
108
+ ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases).to_scale
109
+ end
110
+ end
111
+ fa=factor_class.new(Statsample::Bivariate.send(matrix_method, ds_bootstrap), :m=>@n_variables)
112
+ ev=fa.eigenvalues.sort.reverse
113
+ @ds_eigenvalues.add_case_array(ev)
114
+ puts "iteration #{i}" if $DEBUG or debug
115
+ end
116
+ @ds_eigenvalues.update_valid_data
117
+ end
118
+ dirty_memoize :number_of_factors, :ds_eigenvalues
119
+ dirty_writer :iterations, :bootstrap_method, :factor_class, :percentil
120
+ end
121
+ end
122
+ end
@@ -41,7 +41,7 @@ module Factor
41
41
  if matrix.respond_to? :to_gsl
42
42
  matrix=matrix.to_gsl
43
43
  end
44
- @name=""
44
+ @name=_("Principal Component Analysis")
45
45
  @matrix=matrix
46
46
  @n_variables=@matrix.size1
47
47
  @m=nil
@@ -51,7 +51,7 @@ module Factor
51
51
  calculate_eigenpairs
52
52
  if @m.nil?
53
53
  # Set number of factors with eigenvalues >= 1
54
- @m=@eigenpairs.find_all {|v| v[0]>=1.0}.size
54
+ @m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size
55
55
  end
56
56
 
57
57
  end
@@ -120,36 +120,31 @@ module Factor
120
120
  @eigenpairs=@eigenpairs.sort.reverse
121
121
  end
122
122
  def summary
123
- rp=ReportBuilder.new()
124
- rp.add(self)
125
- rp.to_text
123
+ ReportBuilder.new(:no_title=>true).add(self).to_text
126
124
  end
127
- def report_building(generator) # :nodoc:
128
- anchor=generator.toc_entry(_("PCA: ")+name)
129
- generator.html "<div class='pca'>"+_("PCA")+" #{@name}<a name='#{anchor}'></a>"
130
-
125
+ def report_building(builder) # :nodoc:
126
+ builder.section(:name=>@name) do |generator|
131
127
  generator.text "Number of factors: #{m}"
132
- t=ReportBuilder::Table.new(:name=>_("Communalities"), :header=>["Variable","Initial","Extraction"])
133
- communalities(m).each_with_index {|com,i|
134
- t.row([i, 1.0, sprintf("%0.3f", com)])
135
- }
136
- generator.parse_element(t)
137
-
138
- t=ReportBuilder::Table.new(:name=>_("Eigenvalues"), :header=>["Variable","Value"])
139
- eigenvalues.each_with_index {|eigenvalue,i|
140
- t.row([i, sprintf("%0.3f",eigenvalue)])
141
- }
142
- generator.parse_element(t)
143
-
144
- t=ReportBuilder::Table.new(:name=>_("Component Matrix"), :header=>["Variable"]+m.times.collect {|c| c+1})
128
+ generator.table(:name=>_("Communalities"), :header=>["Variable","Initial","Extraction"]) do |t|
129
+ communalities(m).each_with_index {|com,i|
130
+ t.row([i, 1.0, sprintf("%0.3f", com)])
131
+ }
132
+ end
133
+ generator.table(:name=>_("Eigenvalues"), :header=>["Variable","Value"]) do |t|
134
+ eigenvalues.each_with_index {|eigenvalue,i|
135
+ t.row([i, sprintf("%0.3f",eigenvalue)])
136
+ }
137
+ end
145
138
 
146
- i=0
147
- component_matrix(m).to_a.each do |row|
148
- t.row([i]+row.collect {|c| sprintf("%0.3f",c)})
149
- i+=1
139
+ generator.table(:name=>_("Component Matrix"), :header=>["Variable"]+m.times.collect {|c| c+1}) do |t|
140
+
141
+ i=0
142
+ component_matrix(m).to_a.each do |row|
143
+ t.row([i]+row.collect {|c| sprintf("%0.3f",c)})
144
+ i+=1
145
+ end
146
+ end
150
147
  end
151
- generator.parse_element(t)
152
- generator.html("</div>")
153
148
  end
154
149
  private :calculate_eigenpairs, :create_centered_ds
155
150
  end
@@ -27,6 +27,7 @@ module Factor
27
27
  # * Smith, L. (2002). A tutorial on Principal Component Analysis. Available on http://courses.eas.ualberta.ca/eas570/pca_tutorial.pdf
28
28
  #
29
29
  class PrincipalAxis
30
+ include DirtyMemoize
30
31
  # Minimum difference between successive iterations on sum of communalities
31
32
  DELTA=1e-3
32
33
  # Maximum number of iterations
@@ -51,7 +52,9 @@ module Factor
51
52
  # Maximum number of iterations
52
53
  attr_accessor :max_iterations
53
54
  # Eigenvalues of factor analysis
54
- attr_accessor :eigenvalues
55
+ attr_reader :eigenvalues
56
+
57
+
55
58
 
56
59
  def initialize(matrix, opts=Hash.new)
57
60
  @matrix=matrix
@@ -124,7 +127,7 @@ module Factor
124
127
  end
125
128
  @component_matrix=pca.component_matrix(m)
126
129
  end
127
-
130
+ alias :compute :iterate
128
131
 
129
132
  def initial_communalities
130
133
  if @initial_communalities.nil?
@@ -200,7 +203,9 @@ module Factor
200
203
  generator.html("</div>")
201
204
  end
202
205
 
203
-
206
+ dirty_writer :max_iterations, :epsilon, :smc
207
+ dirty_memoize :eigenvalues, :iterations, :initial_eigenvalues
208
+
204
209
  end
205
210
 
206
211
  end
@@ -1,7 +1,8 @@
1
1
  require 'matrix'
2
- if RUBY_VERSION<="1.9.0"
3
- class ::Vector
4
- alias_method :old_coerce, :coerce
2
+
3
+ class ::Vector
4
+ if RUBY_VERSION<="1.9.0"
5
+ alias_method :old_coerce, :coerce
5
6
  def coerce(other)
6
7
  case other
7
8
  when Numeric
@@ -10,9 +11,11 @@ if RUBY_VERSION<="1.9.0"
10
11
  raise TypeError, "#{self.class} can't be coerced into #{other.class}"
11
12
  end
12
13
  end
14
+
13
15
  end
14
16
  end
15
17
 
18
+
16
19
  class ::Matrix
17
20
  def to_gsl
18
21
  out=[]
@@ -21,7 +24,6 @@ class ::Matrix
21
24
  }
22
25
  GSL::Matrix[*out]
23
26
  end
24
-
25
27
  # Calculate marginal of rows
26
28
  def row_sum
27
29
  (0...row_size).collect {|i|
@@ -78,6 +80,7 @@ class ::Matrix
78
80
  def total_sum
79
81
  row_sum.inject(0){|a,v| a+v}
80
82
  end
83
+
81
84
  end
82
85
 
83
86
  module GSL
@@ -98,7 +101,7 @@ module Statsample
98
101
  # matrix.extend CovariateMatrix
99
102
  #
100
103
  module CovariateMatrix
101
- # Gives a nice
104
+ # Gives a nice summary
102
105
  def summary
103
106
  rp=ReportBuilder.new()
104
107
  rp.add(self)
@@ -158,16 +161,10 @@ module Statsample
158
161
  @fields_y=v
159
162
  end
160
163
  def fields_x
161
- if @fields_x.nil?
162
- @fields_x=row_size.times.collect {|i| i}
163
- end
164
- @fields_x
164
+ @fields_x||=row_size.times.collect {|i| i}
165
165
  end
166
166
  def fields_y
167
- if @fields_y.nil?
168
- @fields_y=column_size.times.collect {|i| i}
169
- end
170
- @fields_y
167
+ @fields_y||=column_size.times.collect {|i| i}
171
168
  end
172
169
 
173
170
  def name=(v)
@@ -176,11 +173,17 @@ module Statsample
176
173
  def name
177
174
  @name
178
175
  end
179
- # Select a submatrix of factors. You could use labels or index to select
180
- # the factors.
181
- # If you don't specify columns, will be equal to rows
176
+ # Select a submatrix of factors. If you have a correlation matrix
177
+ # with a, b and c, you could obtain a submatrix of correlations of
178
+ # a and b, b and c or a and c
179
+ #
180
+ # You could use labels or index to select the factors.
181
+ # If you don't specify columns, they will be equal to rows.
182
+ #
182
183
  # Example:
183
- # a=Matrix[[1.0, 0.3, 0.2], [0.3, 1.0, 0.5], [0.2, 0.5, 1.0]]
184
+ # a=Matrix[[1.0, 0.3, 0.2],
185
+ # [0.3, 1.0, 0.5],
186
+ # [0.2, 0.5, 1.0]]
184
187
  # a.extend CovariateMatrix
185
188
  # a.labels=%w{a b c}
186
189
  # a.submatrix(%w{c a}, %w{b})
@@ -188,6 +191,7 @@ module Statsample
188
191
  # a.submatrix(%w{c a})
189
192
  # => Matrix[[1.0, 0.2] , [0.2, 1.0]]
190
193
  def submatrix(rows,columns=nil)
194
+ raise ArgumentError, "rows shouldn't be empty" if rows.respond_to? :size and rows.size==0
191
195
  columns||=rows
192
196
  # Convert all labels on index
193
197
  row_index=rows.collect {|v|
@@ -196,8 +200,7 @@ module Statsample
196
200
  column_index=columns.collect {|v|
197
201
  v.is_a?(Numeric) ? v : fields_y.index(v)
198
202
  }
199
-
200
-
203
+
201
204
  fx=row_index.collect {|v| fields_x[v]}
202
205
  fy=column_index.collect {|v| fields_y[v]}
203
206
 
@@ -211,11 +214,11 @@ module Statsample
211
214
  end
212
215
  def report_building(generator)
213
216
  @name||= (type==:correlation ? "Correlation":"Covariance")+" Matrix"
214
- t=ReportBuilder::Table.new(:name=>@name, :header=>[""]+fields_y)
215
- row_size.times {|i|
216
- t.row([fields_x[i]]+@rows[i].collect {|i1| sprintf("%0.3f",i1).gsub("0.",".")})
217
- }
218
- generator.parse_element(t)
217
+ generator.table(:name=>@name, :header=>[""]+fields_y) do |t|
218
+ row_size.times {|i|
219
+ t.row([fields_x[i]]+@rows[i].collect {|i1| sprintf("%0.3f",i1).gsub("0.",".")})
220
+ }
221
+ end
219
222
  end
220
223
  end
221
224
  end
@@ -101,17 +101,17 @@ module Statsample
101
101
  parameters = parameters-(h.inverse*(fd))
102
102
 
103
103
  if @stop_criteria==:parameters
104
- flag=true
105
- k.times do |j|
106
- diff= ( parameters[j,0] - old_parameters[j,0] ) / parameters[j,0]
107
- flag=false if diff.abs >= MIN_DIFF_PARAMETERS
108
- @output.puts "Parameters #{j}: #{diff}" if @verbose
109
- end
110
- if flag
111
- @var_cov_matrix = h.inverse*-1.0
112
- return parameters
113
- end
114
- old_parameters=parameters
104
+ flag=true
105
+ k.times do |j|
106
+ diff= ( parameters[j,0] - old_parameters[j,0] ) / parameters[j,0]
107
+ flag=false if diff.abs >= MIN_DIFF_PARAMETERS
108
+ @output.puts "Parameters #{j}: #{diff}" if @verbose
109
+ end
110
+ if flag
111
+ @var_cov_matrix = h.inverse*-1.0
112
+ return parameters
113
+ end
114
+ old_parameters=parameters
115
115
  else
116
116
  begin
117
117
  new_likehood = log_likehood(x,y,parameters)
@@ -50,7 +50,7 @@ module Statsample
50
50
  end
51
51
  def reset
52
52
  @iterations=0
53
- @data=@original.dup
53
+ @data=@original.to_a.dup
54
54
  end
55
55
  def each
56
56
  reset
@@ -58,6 +58,7 @@ module Statsample
58
58
  yield next_value
59
59
  end
60
60
  end
61
+ # Returns permutations
61
62
  def permutations
62
63
  a=Array.new
63
64
  each {|c| a.push(c)}
@@ -65,23 +65,25 @@ module Statsample
65
65
  # Creates one of the Statsample::Regression::Multiple object,
66
66
  # for OLS multiple regression.
67
67
  # Parameters:
68
- # * ds: Dataset.
68
+ # * <tt>ds</tt>: Dataset.
69
69
  # * y: Name of dependent variable.
70
- # * missing_data: Could be
71
- # * :listwise: delete cases with one or more empty data (default).
72
- # * :pairwise: uses correlation matrix. Use with caution.
70
+ # * opts: A hash with options
71
+ # * missing_data: Could be
72
+ # * :listwise: delete cases with one or more empty data (default).
73
+ # * :pairwise: uses correlation matrix. Use with caution.
73
74
  #
74
75
  # <b>Usage:</b>
75
76
  # lr=Statsample::Regression::multiple(ds,'y')
76
- def self.multiple(ds,y_var, missing_data=:listwise)
77
+ def self.multiple(ds,y_var, opts=Hash.new)
78
+ missing_data= (opts[:missing_data].nil? ) ? :listwise : opts.delete(:missing_data)
77
79
  if missing_data==:pairwise
78
- RubyEngine.new(ds,y_var)
80
+ Statsample::Regression::Multiple::RubyEngine.new(ds,y_var, opts)
79
81
  else
80
82
  if Statsample.has_gsl?
81
- Statsample::Regression::Multiple::GslEngine.new(ds, y_var)
83
+ Statsample::Regression::Multiple::GslEngine.new(ds, y_var, opts)
82
84
  else
83
85
  ds2=ds.dup_only_valid
84
- Statsample::Regression::Multiple::RubyEngine.new(ds2,y_var)
86
+ Statsample::Regression::Multiple::RubyEngine.new(ds2,y_var, opts)
85
87
  end
86
88
  end
87
89
  end