statsample 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/statsample.rb CHANGED
@@ -109,7 +109,7 @@ end
109
109
  # * Dataset: A union of vectors.
110
110
  #
111
111
  module Statsample
112
- VERSION = '0.6.2'
112
+ VERSION = '0.6.3'
113
113
  SPLIT_TOKEN = ","
114
114
  autoload(:Database, 'statsample/converters')
115
115
  autoload(:Anova, 'statsample/anova')
@@ -21,31 +21,51 @@ module Statsample
21
21
  end
22
22
  end
23
23
  end
24
- # Compute polychoric correlation.
24
+ # == Polychoric correlation.
25
25
  #
26
- # The polychoric correlation estimate what the correlation between raters, who classified on a ordered category scale, would be if ratings were made on a continuous scale; they are, theoretically, invariant over changes in the number or "width" of rating categories.
27
- # See extensive documentation on http://www.john-uebersax.com/stat/tetra.htm
28
-
26
+ # The <em>polychoric</em> correlation is a measure of
27
+ # bivariate association arising when both observed variates
28
+ # are ordered, categorical variables that result from polychotomizing
29
+ # the two underlying continuous variables (Drasgow, 2006).
30
+ #
31
+ # According to Drasgow (2006), there are three methods to estimate
32
+ # the polychoric correlation:
33
+ #
34
+ # 1. Maximum Likelihood Estimator
35
+ # 2. Two-step estimator and
36
+ # 3. Polychoric series estimate.
37
+ #
38
+ # By default, the two-step estimator is used. You can select
39
+ # the estimation method with the +method+ attribute.
40
+ #
41
+ # See Uebersax (2002) and Drasgow (2006) for extensive documentation.
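
For orientation, here is a minimal usage sketch. Only the class name, the :method option and the r reader come from the code in this diff; the contingency table values are made up, and the statsample gem (with its GSL dependency) is assumed to be installed:

  require 'statsample'

  # Hypothetical 3x3 crosstab of two ordinal variables
  matrix = Matrix[[58, 52, 1], [26, 58, 3], [8, 12, 9]]

  # Two-step estimate (the default)
  poly = Statsample::Bivariate::Polychoric.new(matrix)
  puts poly.r

  # Joint (one-step ML) estimate
  joint = Statsample::Bivariate::Polychoric.new(matrix, :method => :joint)
  puts joint.r
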
29
42
  class Polychoric
30
43
  include GetText
31
44
  bindtextdomain("statsample")
32
45
  # Name of the analysis
33
46
  attr_accessor :name
34
- # Max number of iterations used on iterative methods. Default to 100
47
+ # Maximum number of iterations used in iterative methods. Defaults to MAX_ITERATIONS.
35
48
  attr_accessor :max_iterations
36
49
  # Debug algorithm (See iterations, for example)
37
50
  attr_accessor :debug
38
51
  # Minimizer type. Default GSL::Min::FMinimizer::BRENT
39
52
  # See http://rb-gsl.rubyforge.org/min.html for reference.
40
- attr_accessor :minimizer_type
41
- # Method of calculation.
42
- #
43
- # Drasgow (1988, cited by Uebersax, 2002) describe two method: joint maximum likelihood (ML) approach and two-step ML estimation.
44
- # For now, only implemented two-step ML (:two_step), with algorithm
45
- # based on Drasgow(1986, cited by Gegenfurtner, 1992)
46
- #
53
+ attr_accessor :minimizer_type_two_step
54
+
55
+ # Minimizer type for the joint estimate. Default "nmsimplex" (MINIMIZER_TYPE_JOINT).
56
+ # See http://rb-gsl.rubyforge.org/ for reference.
57
+ attr_accessor :minimizer_type_joint
58
+
59
+
60
+ # Method of calculation of the polychoric correlation.
61
+ #
62
+ # :two_step:: two-step ML, based on code by Gegenfurtner(1992)
63
+ # :polychoric_series:: polychoric series estimate, using
64
+ # algorithm AS87 by Martinson and Hamdan (1975)
65
+ # :joint:: one-step ML, based on R package 'polycor'
66
+ # by J.Fox.
47
67
  attr_accessor :method
48
- # Absolute error for iteration. Default to 0.001
68
+ # Absolute error for iteration. Defaults to EPSILON.
49
69
  attr_accessor :epsilon
50
70
 
51
71
  # Number of iterations
@@ -54,12 +74,31 @@ module Statsample
54
74
  # Log of algorithm
55
75
  attr_reader :log
56
76
  attr_reader :loglike
57
- MAX_ITERATIONS=100
58
- EPSILON=0.001
59
- MINIMIZER_TYPE=GSL::Min::FMinimizer::BRENT
77
+
78
+ METHOD=:two_step
79
+ MAX_ITERATIONS=300
80
+ EPSILON=0.000001
81
+ MINIMIZER_TYPE_TWO_STEP="brent"
82
+ MINIMIZER_TYPE_JOINT="nmsimplex"
60
83
  def new_with_vectors(v1,v2)
61
84
  Polychoric.new(Crosstab.new(v1,v2).to_matrix)
62
85
  end
86
+ # Calculate Polychoric correlation
87
+ # You should enter a Matrix with ordered data. For a crosstab like
88
+ # -------------------------
89
+ #       | y=0 | y=1 | y=2 |
90
+ # -------------------------
91
+ # x = 0 |   1 |  10 |  20 |
92
+ # -------------------------
93
+ # x = 1 |  20 |  20 |  50 |
94
+ # -------------------------
95
+ #
96
+ # The code would be:
97
+ #
98
+ # matrix=Matrix[[1,10,20],[20,20,50]]
99
+ # poly=Statsample::Bivariate::Polychoric.new(matrix, :method=>:joint)
100
+ # puts poly.r
101
+
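
As a follow-up to the example above, the estimated thresholds can be read back alongside the correlation; the reader names (r, threshold_x, threshold_y) are the ones defined in this class, and matrix is the same hypothetical table:

  poly = Statsample::Bivariate::Polychoric.new(matrix, :method => :joint)
  puts poly.r          # polychoric correlation estimate
  p poly.threshold_x   # row thresholds (one fewer than the number of rows)
  p poly.threshold_y   # column thresholds (one fewer than the number of columns)
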
63
102
 
64
103
  def initialize(matrix, opts=Hash.new)
65
104
  @matrix=matrix
@@ -68,68 +107,126 @@ module Statsample
68
107
  raise "row size <1" if @m<=1
69
108
  raise "column size <1" if @n<=1
70
109
 
71
- @method=:two_step
110
+ @method=METHOD
72
111
  @name="Polychoric correlation"
73
112
  @max_iterations=MAX_ITERATIONS
74
113
  @epsilon=EPSILON
75
- @minimizer_type=GSL::Min::FMinimizer::BRENT
114
+ @minimizer_type_two_step=MINIMIZER_TYPE_TWO_STEP
115
+ @minimizer_type_joint=MINIMIZER_TYPE_JOINT
76
116
  @debug=false
77
117
  @iteration=nil
78
118
  opts.each{|k,v|
79
119
  self.send("#{k}=",v) if self.respond_to? k
80
120
  }
81
121
  @r=nil
122
+ compute_basic_parameters
82
123
  end
124
+ # Returns the polychoric correlation
83
125
  def r
84
126
  if @r.nil?
85
127
  compute
86
128
  end
87
129
  @r
88
130
  end
131
+ # Returns the row thresholds.
89
132
 
90
133
  def threshold_x
91
134
  if @alpha.nil?
92
135
  compute
93
136
  end
94
- @alpha[0,@alpha.size-1]
137
+ @alpha
95
138
  end
139
+ # Returns the column thresholds
96
140
 
97
141
  def threshold_y
98
142
  if @beta.nil?
99
143
  compute
100
144
  end
101
- @beta[0,@beta.size-1]
145
+ @beta
102
146
  end
103
147
 
104
148
 
105
-
149
+ # Starts the computation of the polychoric correlation
150
+ # based on the +method+ attribute.
106
151
  def compute
107
152
  if @method==:two_step
108
153
  compute_two_step_mle_drasgow
109
- elsif @method==:as87
110
- compute_two_step_as87
154
+ elsif @method==:joint
155
+ compute_one_step_mle
156
+ elsif @method==:polychoric_series
157
+ compute_polychoric_series
111
158
  else
112
159
  raise "Not implemented"
113
160
  end
114
161
  end
115
- # *Computation of polychoric correlation usign two-step ML estimation.*
116
- #
117
- # Two-step ML estimation "first estimates the thresholds from the one-way marginal frequencies, then estimates rho, conditional on these thresholds, via maximum likelihood" (Uebersax, 2006).
118
- #
119
- # The algorithm is based on Drasgow(1986, cited by Gegenfurtner (1992)
120
- # References:
121
- # * Gegenfurtner, K. (1992). PRAXIS: Brent's algorithm for function minimization. Behavior Research Methods, Instruments & Computers, 24(4), 560-564. Available on http://www.allpsych.uni-giessen.de/karl/pdf/03.praxis.pdf
122
- # * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
123
- #
124
- def compute_two_step_mle_drasgow
162
+
163
+ def loglike_data
164
+ loglike=0
165
+ @nr.times { |i|
166
+ @nc.times { |j|
167
+ res=@matrix[i,j].quo(@total)
168
+ if (res==0)
169
+ # puts "Correccion"
170
+ res=1e-16
171
+ end
172
+ loglike+= @matrix[i,j] * Math::log(res )
173
+ }
174
+ }
175
+ loglike
176
+ end
177
+ def chi_square
178
+ if @loglike_model.nil?
179
+ compute
180
+ end
181
+ -2*(@loglike_model-loglike_data)
182
+ end
183
+ def chi_square_df
184
+ (@nr*@nc)-@nc-@nr
185
+ end
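
Taken together, chi_square and chi_square_df give a likelihood-ratio test of the bivariate normality assumption. A small sketch of turning them into a p-value, assuming poly is an already computed Polychoric instance (Distribution::ChiSquare.cdf is the same call used later in to_reportbuilder):

  x2 = poly.chi_square
  df = poly.chi_square_df
  p_value = 1 - Distribution::ChiSquare.cdf(x2, df)
  puts "X2 = #{x2}, df = #{df}, p = #{p_value}"
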
186
+ def loglike(alpha,beta,rho)
187
+ if rho.abs>0.9999
188
+ rho= (rho>0) ? 0.9999 : -0.9999
189
+ end
190
+
191
+ loglike=0
192
+ pd=@nr.times.collect{ [0]*@nc}
193
+ pc=@nr.times.collect{ [0]*@nc}
194
+ @nr.times { |i|
195
+ @nc.times { |j|
196
+ #puts "i:#{i} | j:#{j}"
197
+ if i==@nr-1 and j==@nc-1
198
+ pd[i][j]=1.0
199
+ else
200
+ a=(i==@nr-1) ? 100: alpha[i]
201
+ b=(j==@nc-1) ? 100: beta[j]
202
+ #puts "a:#{a} b:#{b}"
203
+ pd[i][j]=Distribution::NormalBivariate.cdf(a, b, rho)
204
+ end
205
+ pc[i][j] = pd[i][j]
206
+ pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
207
+ pd[i][j] = pd[i][j] - pc[i][j-1] if j>0
208
+ pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
209
+ res= pd[i][j]
210
+ #puts "i:#{i} | j:#{j} | ac: #{sprintf("%0.4f", pc[i][j])} | pd: #{sprintf("%0.4f", pd[i][j])} | res:#{sprintf("%0.4f", res)}"
211
+ if (res==0)
212
+ # puts "Correccion"
213
+ res=1e-16
214
+ end
215
+ loglike+= @matrix[i,j] * Math::log( res )
216
+ }
217
+ }
218
+ @pd=pd
219
+ -loglike
220
+ end
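
The pd/pc bookkeeping above accumulates, for every cell, the rectangle probability of the underlying bivariate normal between consecutive thresholds. As a standalone illustration (the thresholds and rho are made up; 100 stands in for +infinity, as in the code), a single cell probability can be written directly as:

  rho = 0.4
  a_prev, a_cur = -0.5, 100   # thresholds bounding row i
  b_prev, b_cur =  0.3, 100   # thresholds bounding column j
  p_ij = Distribution::NormalBivariate.cdf(a_cur,  b_cur,  rho) -
         Distribution::NormalBivariate.cdf(a_prev, b_cur,  rho) -
         Distribution::NormalBivariate.cdf(a_cur,  b_prev, rho) +
         Distribution::NormalBivariate.cdf(a_prev, b_prev, rho)
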
221
+ def compute_basic_parameters
125
222
  @nr=@matrix.row_size
126
223
  @nc=@matrix.column_size
127
224
  @sumr=[0]*@matrix.row_size
128
225
  @sumrac=[0]*@matrix.row_size
129
226
  @sumc=[0]*@matrix.column_size
130
227
  @sumcac=[0]*@matrix.column_size
131
- @alpha=[0]*@matrix.row_size
132
- @beta=[0]*@matrix.row_size
228
+ @alpha=[0]*(@nr-1)
229
+ @beta=[0]*(@nc-1)
133
230
  @total=0
134
231
  @nr.times do |i|
135
232
  @nc.times do |j|
@@ -150,44 +247,31 @@ module Statsample
150
247
  @beta[i]=Distribution::Normal.p_value(@sumcac[i] / @total.to_f)
151
248
  ac=@sumcac[i]
152
249
  end
153
- @alpha[@nr-1]=10
154
- @beta[@nc-1]=10
155
- fn1=GSL::Function.alloc {|x|
156
- loglike=0
157
- pd=@nr.times.collect{ [0]*@nc}
158
- pc=@nr.times.collect{ [0]*@nc}
159
-
160
- @nr.times { |i|
161
- @nc.times { |j|
162
- pd[i][j]=Distribution::NormalBivariate.cdf(@alpha[i], @beta[j], x)
163
- pc[i][j] = pd[i][j]
164
- pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
165
- pd[i][j] = pd[i][j] - pc[i][j-1] if j>0
166
- pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
167
- res= pd[i][j]
168
-
169
- if res==0.0
170
- res=1e-15
171
- end
172
-
173
- # puts "i:#{i} | j:#{j} | ac: #{sprintf("%0.4f", pc[i][j])} | pd: #{sprintf("%0.4f", pd[i][j])} | res:#{sprintf("%0.4f", res)}"
174
- loglike+= @matrix[i,j] * Math::log( res )
175
- }
176
- }
177
- # p pd
178
- @loglike=loglike
179
- @pd=pd
180
- -loglike
250
+ end
251
+ # Computation of polychoric correlation using two-step ML estimation.
252
+ #
253
+ # Two-step ML estimation "first estimates the thresholds from the one-way marginal frequencies, then estimates rho, conditional on these thresholds, via maximum likelihood" (Uebersax, 2006).
254
+ #
255
+ # The algorithm is based on code by Gegenfurtner(1992).
256
+ #
257
+ # <b>References</b>:
258
+ # * Gegenfurtner, K. (1992). PRAXIS: Brent's algorithm for function minimization. Behavior Research Methods, Instruments & Computers, 24(4), 560-564. Available on http://www.allpsych.uni-giessen.de/karl/pdf/03.praxis.pdf
259
+ # * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
260
+ #
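
In the first step, the thresholds come straight from the cumulative marginal proportions through the inverse normal CDF, which is what compute_basic_parameters does with Distribution::Normal.p_value. An illustrative fragment with made-up row totals:

  row_totals = [70, 30, 20]                    # hypothetical marginal counts
  total = row_totals.inject(:+).to_f
  cum = 0
  alpha = row_totals[0..-2].map do |n|         # no threshold after the last category
    cum += n
    Distribution::Normal.p_value(cum / total)  # inverse normal CDF of the cumulative proportion
  end
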
261
+ def compute_two_step_mle_drasgow
262
+
263
+ fn1=GSL::Function.alloc {|rho|
264
+ loglike(@alpha,@beta, rho)
181
265
  }
182
266
  @iteration = 0
183
267
  max_iter = @max_iterations
184
268
  m = 0 # initial guess
185
- m_expected = 0.5
186
- a=-0.99999
187
- b=+0.99999
188
- gmf = GSL::Min::FMinimizer.alloc(@minimizer_type)
269
+ m_expected = 0
270
+ a=-0.9999
271
+ b=+0.9999
272
+ gmf = GSL::Min::FMinimizer.alloc(@minimizer_type_two_step)
189
273
  gmf.set(fn1, m, a, b)
190
- header=sprintf("using %s method\n", gmf.name)
274
+ header=sprintf("Two-step minimization using %s method\n", gmf.name)
191
275
  header+=sprintf("%5s [%9s, %9s] %9s %10s %9s\n", "iter", "lower", "upper", "min",
192
276
  "err", "err(est)")
193
277
 
@@ -197,11 +281,11 @@ module Statsample
197
281
  begin
198
282
  @iteration += 1
199
283
  status = gmf.iterate
200
- status = gmf.test_interval(0.001, 0.0)
284
+ status = gmf.test_interval(@epsilon, 0.0)
201
285
 
202
286
  if status == GSL::SUCCESS
203
- @log+="Converged:"
204
- puts "Converged:" if @debug
287
+ @log+="converged:"
288
+ puts "converged:" if @debug
205
289
  end
206
290
  a = gmf.x_lower
207
291
  b = gmf.x_upper
@@ -212,26 +296,66 @@ module Statsample
212
296
  puts message if @debug
213
297
  end while status == GSL::CONTINUE and @iteration < @max_iterations
214
298
  @r=gmf.x_minimum
299
+ @loglike_model=-gmf.f_minimum
215
300
  end
216
- # Chi-square to test r=0
217
- def chi_square_independence
218
- Statsample::Test::chi_square(@matrix, expected)
219
- end
220
- # Chi-square to test model==independence
221
301
 
222
- def chi_square_model_expected
223
- calculate if @r.nil?
224
- model=Matrix.rows(@pd).collect {|c| c*@total}
225
- Statsample::Test::chi_square(model, expected)
226
-
227
- end
228
- # Chi-square to test real == calculated with rho
229
- def chi_square_model
230
- calculate if @r.nil?
231
- e=Matrix.rows(@pd).collect {|c| c*@total}
232
- Statsample::Test::chi_square(@matrix, e)
302
+ # Compute Polychoric correlation with joint estimate.
303
+ # Rho and thresholds are estimated at the same time.
304
+ # Code based on the R package "polycor", by J. Fox.
305
+ #
306
+
307
+ def compute_one_step_mle
308
+ # Get initial values with the two-step approach
309
+ compute_two_step_mle_drasgow
310
+ # Start the iteration from these initial values
311
+ rho=@r
312
+ cut_alpha=@alpha
313
+ cut_beta=@beta
314
+ parameters=[rho]+cut_alpha+cut_beta
315
+ minimization = Proc.new { |v, params|
316
+ rho=v[0]
317
+ alpha=v[1,@nr-1]
318
+ beta=v[@nr,@nc-1]
319
+ loglike(alpha,beta,rho)
320
+ }
321
+ np=@nc-1+@nr
322
+ my_func = GSL::MultiMin::Function.alloc(minimization, np)
323
+ my_func.set_params(parameters) # parameters
324
+
325
+ x = GSL::Vector.alloc(parameters.dup)
326
+
327
+ ss = GSL::Vector.alloc(np)
328
+ ss.set_all(1.0)
329
+
330
+ minimizer = GSL::MultiMin::FMinimizer.alloc(minimizer_type_joint,np)
331
+ minimizer.set(my_func, x, ss)
332
+
333
+ iter = 0
334
+ message=""
335
+ begin
336
+ iter += 1
337
+ status = minimizer.iterate()
338
+ status = minimizer.test_size(@epsilon)
339
+ if status == GSL::SUCCESS
340
+ message="Joint MLE converged to minimum at\n"
341
+ end
342
+ x = minimizer.x
343
+ message+= sprintf("%5d iterations", iter)+"\n";
344
+ for i in 0...np do
345
+ message+=sprintf("%10.3e ", x[i])
346
+ end
347
+ message+=sprintf("f() = %7.3f size = %.3f\n", minimizer.fval, minimizer.size)+"\n";
348
+ end while status == GSL::CONTINUE and iter < @max_iterations
349
+ @iteration=iter
350
+ @log+=message
351
+ puts message if @debug
352
+ @r=minimizer.x[0]
353
+ @alpha=minimizer.x[1,@nr-1].to_a
354
+ @beta=minimizer.x[@nr,@nc-1].to_a
355
+ @loglike_model= -minimizer.minimum
233
356
  end
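
Because the joint estimator iterates a Nelder-Mead simplex over rho and all thresholds at once, it may need a tighter tolerance or more iterations than the defaults; the option names below are the accessors defined in this class, and matrix is again a placeholder:

  poly = Statsample::Bivariate::Polychoric.new(matrix,
    :method         => :joint,
    :epsilon        => 1e-8,
    :max_iterations => 500)
  puts poly.r
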
234
- def matrix_for_rho(rho)
357
+
358
+ def matrix_for_rho(rho) # :nodoc:
235
359
  pd=@nr.times.collect{ [0]*@nc}
236
360
  pc=@nr.times.collect{ [0]*@nc}
237
361
  @nr.times { |i|
@@ -246,37 +370,8 @@ module Statsample
246
370
  }
247
371
  Matrix.rows(pc)
248
372
  end
249
- def g2
250
- raise "Doesn't work"
251
- e=expected
252
- no_r_likehood=0
253
- @nr.times {|i|
254
- @nc.times {|j|
255
- #p @matrix[i,j]
256
- if @matrix[i,j]!=0
257
- no_r_likehood+= @matrix[i,j]*Math::log(e[i,j])
258
- end
259
- }
260
- }
261
- p no_r_likehood
262
- model=Matrix.rows(@pd).collect {|c| c*@total}
263
-
264
- model_likehood=0
265
- @nr.times {|i|
266
- @nc.times {|j|
267
- #p @matrix[i,j]
268
- if @matrix[i,j]!=0
269
- model_likehood+= @matrix[i,j] * Math::log(model[i,j])
270
- end
271
- }
272
- }
273
-
274
- p model_likehood
275
-
276
- -2*(no_r_likehood-model_likehood)
277
-
278
- end
279
- def expected
373
+
374
+ def expected # :nodoc:
280
375
  rt=[]
281
376
  ct=[]
282
377
  t=0
@@ -300,10 +395,14 @@ module Statsample
300
395
 
301
396
  Matrix.rows(m)
302
397
  end
303
- # Compute polychoric using AS87.
304
- # Doesn't work for now! I can't find the error :(
305
398
 
306
- def compute_two_step_as87
399
+ # Compute polychoric correlation using polychoric series.
400
+ # Algorithm: AS87, by Martinson and Hamdan (1975).
401
+ #
402
+ # <b>Warning</b>: According to Drasgow (2006), this
403
+ # computation diverges greatly from the joint and two-step methods.
404
+ #
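
Selecting this estimator only requires passing the corresponding symbol; given the warning above, comparing its result with the default two-step estimate is advisable (matrix is a placeholder):

  series = Statsample::Bivariate::Polychoric.new(matrix, :method => :polychoric_series)
  puts series.r
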
405
+ def compute_polychoric_series
307
406
  @nn=@n-1
308
407
  @mm=@m-1
309
408
  @nn7=7*@nn
@@ -390,10 +489,10 @@ module Statsample
390
489
  (1..@nn).each do |i| #do 22
391
490
  beta[i]=Distribution::Normal.p_value(sumc[i] / sum.to_f)
392
491
  end # 21
393
- @alpha=alpha[1,alpha.size] << nil
394
- @beta=beta[1,beta.size] << nil
395
- @sumr=sumr
396
- @sumc=sumc
492
+ @alpha=alpha[1,alpha.size]
493
+ @beta=beta[1,beta.size]
494
+ @sumr=row[1,row.size]
495
+ @sumc=colmn[1,colmn.size]
397
496
  @total=sum
398
497
 
399
498
  # Compute Fourier coefficients a and b. Verified
@@ -522,9 +621,12 @@ module Statsample
522
621
  end # 43
523
622
  raise "Error" if norts==0
524
623
  @r=pcorl
624
+
625
+ @loglike_model=-loglike(@alpha, @beta, @r)
626
+
525
627
  end
526
628
  #Computes vector h(mm7) of orthogonal hermite...
527
- def hermit(s,k)
629
+ def hermit(s,k) # :nodoc:
528
630
  h=[]
529
631
  (1..k).each do |i| # do 14
530
632
  l=i
@@ -544,7 +646,7 @@ module Statsample
544
646
  end
545
647
  h
546
648
  end
547
- def xnorm(t)
649
+ def xnorm(t) # :nodoc:
548
650
  Math::exp(-0.5 * t **2) * (1.0/Math::sqrt(2*Math::PI))
549
651
  end
550
652
 
@@ -554,7 +656,7 @@ module Statsample
554
656
  rp.to_text
555
657
  end
556
658
 
557
- def to_reportbuilder(generator)
659
+ def to_reportbuilder(generator) # :nodoc:
558
660
  compute if @r.nil?
559
661
  section=ReportBuilder::Section.new(:name=>@name)
560
662
  t=ReportBuilder::Table.new(:name=>_("Contingency Table"),:header=>[""]+(@n.times.collect {|i| "Y=#{i}"})+["Total"])
@@ -574,6 +676,7 @@ module Statsample
574
676
  t.add_row(["Threshold Y #{i}", sprintf("%0.4f", val)])
575
677
  }
576
678
  section.add(t)
679
+ section.add(_("Test of bivariate normality: X2 = %0.3f, df = %d, p = %0.5f") % [chi_square, chi_square_df, 1-Distribution::ChiSquare.cdf(chi_square, chi_square_df)])
577
680
  generator.parse_element(section)
578
681
  end
579
682
  end