statsample 0.6.2 → 0.6.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/statsample.rb CHANGED
@@ -109,7 +109,7 @@ end
109
109
  # * Dataset: An union of vectors.
110
110
  #
111
111
  module Statsample
112
- VERSION = '0.6.2'
112
+ VERSION = '0.6.3'
113
113
  SPLIT_TOKEN = ","
114
114
  autoload(:Database, 'statsample/converters')
115
115
  autoload(:Anova, 'statsample/anova')
@@ -21,31 +21,51 @@ module Statsample
21
21
  end
22
22
  end
23
23
  end
24
- # Compute polychoric correlation.
24
+ # == Polychoric correlation.
25
25
  #
26
- # The polychoric correlation estimate what the correlation between raters, who classified on a ordered category scale, would be if ratings were made on a continuous scale; they are, theoretically, invariant over changes in the number or "width" of rating categories.
27
- # See extensive documentation on http://www.john-uebersax.com/stat/tetra.htm
28
-
26
+ # The <em>polychoric</em> correlation is a measure of
27
+ # bivariate association arising when both observed variates
28
+ # are ordered, categorical variables that result from polychotomizing
29
+ # the two underlying continuous variables (Drasgow, 2006)
30
+ #
31
+ # According to Drasgow(2006), there are three methods to estimate
32
+ # the polychoric correlation:
33
+ #
34
+ # 1. Maximum Likelihood Estimator
35
+ # 2. Two-step estimator and
36
+ # 3. Polychoric series estimate.
37
+ #
38
+ # By default, two-step estimation is used. You can select
39
+ # the estimation method with method attribute
40
+ #
41
+ # See extensive documentation on Uebersax(2002) and Drasgow(2006)
29
42
  class Polychoric
30
43
  include GetText
31
44
  bindtextdomain("statsample")
32
45
  # Name of the analysis
33
46
  attr_accessor :name
34
- # Max number of iterations used on iterative methods. Default to 100
47
+ # Max number of iterations used on iterative methods. Default to MAX_ITERATIONS
35
48
  attr_accessor :max_iterations
36
49
  # Debug algorithm (See iterations, for example)
37
50
  attr_accessor :debug
38
51
  # Minimizer type. Default GSL::Min::FMinimizer::BRENT
39
52
  # See http://rb-gsl.rubyforge.org/min.html for reference.
40
- attr_accessor :minimizer_type
41
- # Method of calculation.
42
- #
43
- # Drasgow (1988, cited by Uebersax, 2002) describe two method: joint maximum likelihood (ML) approach and two-step ML estimation.
44
- # For now, only implemented two-step ML (:two_step), with algorithm
45
- # based on Drasgow(1986, cited by Gegenfurtner, 1992)
46
- #
53
+ attr_accessor :minimizer_type_two_step
54
+
55
+ # Minimizer type for joint estimation. Default "nmsimplex" (MINIMIZER_TYPE_JOINT)
56
+ # See http://rb-gsl.rubyforge.org/min.html for reference.
57
+ attr_accessor :minimizer_type_joint
58
+
59
+
60
+ # Method of calculation of the polychoric correlation.
61
+ #
62
+ # :two_step:: two-step ML, based on code by Gegenfurtner(1992)
63
+ # :polychoric_series:: polychoric series estimate, using
64
+ # algorithm AS87 by Martinson and Hamdan (1975)
65
+ # :joint:: one-step ML, based on R package 'polycor'
66
+ # by J.Fox.
47
67
  attr_accessor :method
48
- # Absolute error for iteration. Default to 0.001
68
+ # Absolute error for iteration.
49
69
  attr_accessor :epsilon
50
70
 
51
71
  # Number of iterations
@@ -54,12 +74,31 @@ module Statsample
54
74
  # Log of algorithm
55
75
  attr_reader :log
56
76
  attr_reader :loglike
57
- MAX_ITERATIONS=100
58
- EPSILON=0.001
59
- MINIMIZER_TYPE=GSL::Min::FMinimizer::BRENT
77
+
78
+ METHOD=:two_step
79
+ MAX_ITERATIONS=300
80
+ EPSILON=0.000001
81
+ MINIMIZER_TYPE_TWO_STEP="brent"
82
+ MINIMIZER_TYPE_JOINT="nmsimplex"
60
83
  def new_with_vectors(v1,v2)
61
84
  Polychoric.new(Crosstab.new(v1,v2).to_matrix)
62
85
  end
86
+ # Calculate Polychoric correlation
87
+ # You should enter a Matrix with ordered data. For example, for the table
88
+ # -------------------
89
+ # | y=0 | y=1 | y=2 |
90
+ # -------------------
91
+ # x = 0 | 1 | 10 | 20 |
92
+ # -------------------
93
+ # x = 1 | 20 | 20 | 50 |
94
+ # -------------------
95
+ #
96
+ # The code will be
97
+ #
98
+ # matrix=Matrix[[1,10,20],[20,20,50]]
99
+ # poly=Statsample::Bivariate::Polychoric.new(matrix, :method=>:joint)
100
+ # puts poly.r
101
+
63
102
 
64
103
  def initialize(matrix, opts=Hash.new)
65
104
  @matrix=matrix
@@ -68,68 +107,126 @@ module Statsample
68
107
  raise "row size <1" if @m<=1
69
108
  raise "column size <1" if @n<=1
70
109
 
71
- @method=:two_step
110
+ @method=METHOD
72
111
  @name="Polychoric correlation"
73
112
  @max_iterations=MAX_ITERATIONS
74
113
  @epsilon=EPSILON
75
- @minimizer_type=GSL::Min::FMinimizer::BRENT
114
+ @minimizer_type_two_step=MINIMIZER_TYPE_TWO_STEP
115
+ @minimizer_type_joint=MINIMIZER_TYPE_JOINT
76
116
  @debug=false
77
117
  @iteration=nil
78
118
  opts.each{|k,v|
79
119
  self.send("#{k}=",v) if self.respond_to? k
80
120
  }
81
121
  @r=nil
122
+ compute_basic_parameters
82
123
  end
124
+ # Returns the polychoric correlation
83
125
  def r
84
126
  if @r.nil?
85
127
  compute
86
128
  end
87
129
  @r
88
130
  end
131
+ # Returns the row thresholds
89
132
 
90
133
  def threshold_x
91
134
  if @alpha.nil?
92
135
  compute
93
136
  end
94
- @alpha[0,@alpha.size-1]
137
+ @alpha
95
138
  end
139
+ # Returns the column thresholds
96
140
 
97
141
  def threshold_y
98
142
  if @beta.nil?
99
143
  compute
100
144
  end
101
- @beta[0,@beta.size-1]
145
+ @beta
102
146
  end
103
147
 
104
148
 
105
-
149
+ # Start the computation of polychoric correlation
150
+ # based on attribute method
106
151
  def compute
107
152
  if @method==:two_step
108
153
  compute_two_step_mle_drasgow
109
- elsif @method==:as87
110
- compute_two_step_as87
154
+ elsif @method==:joint
155
+ compute_one_step_mle
156
+ elsif @method==:polychoric_series
157
+ compute_polychoric_series
111
158
  else
112
159
  raise "Not implemented"
113
160
  end
114
161
  end
115
- # *Computation of polychoric correlation usign two-step ML estimation.*
116
- #
117
- # Two-step ML estimation "first estimates the thresholds from the one-way marginal frequencies, then estimates rho, conditional on these thresholds, via maximum likelihood" (Uebersax, 2006).
118
- #
119
- # The algorithm is based on Drasgow(1986, cited by Gegenfurtner (1992)
120
- # References:
121
- # * Gegenfurtner, K. (1992). PRAXIS: Brent's algorithm for function minimization. Behavior Research Methods, Instruments & Computers, 24(4), 560-564. Available on http://www.allpsych.uni-giessen.de/karl/pdf/03.praxis.pdf
122
- # * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
123
- #
124
- def compute_two_step_mle_drasgow
162
+
163
+ def loglike_data
164
+ loglike=0
165
+ @nr.times { |i|
166
+ @nc.times { |j|
167
+ res=@matrix[i,j].quo(@total)
168
+ if (res==0)
169
+ # puts "Correccion"
170
+ res=1e-16
171
+ end
172
+ loglike+= @matrix[i,j] * Math::log(res )
173
+ }
174
+ }
175
+ loglike
176
+ end
177
+ def chi_square
178
+ if @loglike_model.nil?
179
+ compute
180
+ end
181
+ -2*(@loglike_model-loglike_data)
182
+ end
183
+ def chi_square_df
184
+ (@nr*@nc)-@nc-@nr
185
+ end
186
+ def loglike(alpha,beta,rho)
187
+ if rho.abs>0.9999
188
+ rho= (rho>0) ? 0.9999 : -0.9999
189
+ end
190
+
191
+ loglike=0
192
+ pd=@nr.times.collect{ [0]*@nc}
193
+ pc=@nr.times.collect{ [0]*@nc}
194
+ @nr.times { |i|
195
+ @nc.times { |j|
196
+ #puts "i:#{i} | j:#{j}"
197
+ if i==@nr-1 and j==@nc-1
198
+ pd[i][j]=1.0
199
+ else
200
+ a=(i==@nr-1) ? 100: alpha[i]
201
+ b=(j==@nc-1) ? 100: beta[j]
202
+ #puts "a:#{a} b:#{b}"
203
+ pd[i][j]=Distribution::NormalBivariate.cdf(a, b, rho)
204
+ end
205
+ pc[i][j] = pd[i][j]
206
+ pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
207
+ pd[i][j] = pd[i][j] - pc[i][j-1] if j>0
208
+ pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
209
+ res= pd[i][j]
210
+ #puts "i:#{i} | j:#{j} | ac: #{sprintf("%0.4f", pc[i][j])} | pd: #{sprintf("%0.4f", pd[i][j])} | res:#{sprintf("%0.4f", res)}"
211
+ if (res==0)
212
+ # puts "Correccion"
213
+ res=1e-16
214
+ end
215
+ loglike+= @matrix[i,j] * Math::log( res )
216
+ }
217
+ }
218
+ @pd=pd
219
+ -loglike
220
+ end
221
+ def compute_basic_parameters
125
222
  @nr=@matrix.row_size
126
223
  @nc=@matrix.column_size
127
224
  @sumr=[0]*@matrix.row_size
128
225
  @sumrac=[0]*@matrix.row_size
129
226
  @sumc=[0]*@matrix.column_size
130
227
  @sumcac=[0]*@matrix.column_size
131
- @alpha=[0]*@matrix.row_size
132
- @beta=[0]*@matrix.row_size
228
+ @alpha=[0]*(@nr-1)
229
+ @beta=[0]*(@nc-1)
133
230
  @total=0
134
231
  @nr.times do |i|
135
232
  @nc.times do |j|
@@ -150,44 +247,31 @@ module Statsample
150
247
  @beta[i]=Distribution::Normal.p_value(@sumcac[i] / @total.to_f)
151
248
  ac=@sumcac[i]
152
249
  end
153
- @alpha[@nr-1]=10
154
- @beta[@nc-1]=10
155
- fn1=GSL::Function.alloc {|x|
156
- loglike=0
157
- pd=@nr.times.collect{ [0]*@nc}
158
- pc=@nr.times.collect{ [0]*@nc}
159
-
160
- @nr.times { |i|
161
- @nc.times { |j|
162
- pd[i][j]=Distribution::NormalBivariate.cdf(@alpha[i], @beta[j], x)
163
- pc[i][j] = pd[i][j]
164
- pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
165
- pd[i][j] = pd[i][j] - pc[i][j-1] if j>0
166
- pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
167
- res= pd[i][j]
168
-
169
- if res==0.0
170
- res=1e-15
171
- end
172
-
173
- # puts "i:#{i} | j:#{j} | ac: #{sprintf("%0.4f", pc[i][j])} | pd: #{sprintf("%0.4f", pd[i][j])} | res:#{sprintf("%0.4f", res)}"
174
- loglike+= @matrix[i,j] * Math::log( res )
175
- }
176
- }
177
- # p pd
178
- @loglike=loglike
179
- @pd=pd
180
- -loglike
250
+ end
251
+ # Computation of polychoric correlation using two-step ML estimation.
252
+ #
253
+ # Two-step ML estimation "first estimates the thresholds from the one-way marginal frequencies, then estimates rho, conditional on these thresholds, via maximum likelihood" (Uebersax, 2006).
254
+ #
255
+ # The algorithm is based on code by Gegenfurtner(1992).
256
+ #
257
+ # <b>References</b>:
258
+ # * Gegenfurtner, K. (1992). PRAXIS: Brent's algorithm for function minimization. Behavior Research Methods, Instruments & Computers, 24(4), 560-564. Available on http://www.allpsych.uni-giessen.de/karl/pdf/03.praxis.pdf
259
+ # * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
260
+ #
261
+ def compute_two_step_mle_drasgow
262
+
263
+ fn1=GSL::Function.alloc {|rho|
264
+ loglike(@alpha,@beta, rho)
181
265
  }
182
266
  @iteration = 0
183
267
  max_iter = @max_iterations
184
268
  m = 0 # initial guess
185
- m_expected = 0.5
186
- a=-0.99999
187
- b=+0.99999
188
- gmf = GSL::Min::FMinimizer.alloc(@minimizer_type)
269
+ m_expected = 0
270
+ a=-0.9999
271
+ b=+0.9999
272
+ gmf = GSL::Min::FMinimizer.alloc(@minimizer_type_two_step)
189
273
  gmf.set(fn1, m, a, b)
190
- header=sprintf("using %s method\n", gmf.name)
274
+ header=sprintf("Two step minimization using %s method\n", gmf.name)
191
275
  header+=sprintf("%5s [%9s, %9s] %9s %10s %9s\n", "iter", "lower", "upper", "min",
192
276
  "err", "err(est)")
193
277
 
@@ -197,11 +281,11 @@ module Statsample
197
281
  begin
198
282
  @iteration += 1
199
283
  status = gmf.iterate
200
- status = gmf.test_interval(0.001, 0.0)
284
+ status = gmf.test_interval(@epsilon, 0.0)
201
285
 
202
286
  if status == GSL::SUCCESS
203
- @log+="Converged:"
204
- puts "Converged:" if @debug
287
+ @log+="converged:"
288
+ puts "converged:" if @debug
205
289
  end
206
290
  a = gmf.x_lower
207
291
  b = gmf.x_upper
@@ -212,26 +296,66 @@ module Statsample
212
296
  puts message if @debug
213
297
  end while status == GSL::CONTINUE and @iteration < @max_iterations
214
298
  @r=gmf.x_minimum
299
+ @loglike_model=-gmf.f_minimum
215
300
  end
216
- # Chi-square to test r=0
217
- def chi_square_independence
218
- Statsample::Test::chi_square(@matrix, expected)
219
- end
220
- # Chi-square to test model==independence
221
301
 
222
- def chi_square_model_expected
223
- calculate if @r.nil?
224
- model=Matrix.rows(@pd).collect {|c| c*@total}
225
- Statsample::Test::chi_square(model, expected)
226
-
227
- end
228
- # Chi-square to test real == calculated with rho
229
- def chi_square_model
230
- calculate if @r.nil?
231
- e=Matrix.rows(@pd).collect {|c| c*@total}
232
- Statsample::Test::chi_square(@matrix, e)
302
+ # Compute Polychoric correlation with joint estimate.
303
+ # Rho and thresholds are estimated at same time.
304
+ # Code based on R package "polycor", by J.Fox.
305
+ #
306
+
307
+ def compute_one_step_mle
308
+ # Get initial values with two-step approach
309
+ compute_two_step_mle_drasgow
310
+ # Start iteration with past values
311
+ rho=@r
312
+ cut_alpha=@alpha
313
+ cut_beta=@beta
314
+ parameters=[rho]+cut_alpha+cut_beta
315
+ minimization = Proc.new { |v, params|
316
+ rho=v[0]
317
+ alpha=v[1,@nr-1]
318
+ beta=v[@nr,@nc-1]
319
+ loglike(alpha,beta,rho)
320
+ }
321
+ np=@nc-1+@nr
322
+ my_func = GSL::MultiMin::Function.alloc(minimization, np)
323
+ my_func.set_params(parameters) # parameters
324
+
325
+ x = GSL::Vector.alloc(parameters.dup)
326
+
327
+ ss = GSL::Vector.alloc(np)
328
+ ss.set_all(1.0)
329
+
330
+ minimizer = GSL::MultiMin::FMinimizer.alloc(minimizer_type_joint,np)
331
+ minimizer.set(my_func, x, ss)
332
+
333
+ iter = 0
334
+ message=""
335
+ begin
336
+ iter += 1
337
+ status = minimizer.iterate()
338
+ status = minimizer.test_size(@epsilon)
339
+ if status == GSL::SUCCESS
340
+ message="Joint MLE converged to minimum at\n"
341
+ end
342
+ x = minimizer.x
343
+ message+= sprintf("%5d iterations", iter)+"\n";
344
+ for i in 0...np do
345
+ message+=sprintf("%10.3e ", x[i])
346
+ end
347
+ message+=sprintf("f() = %7.3f size = %.3f\n", minimizer.fval, minimizer.size)+"\n";
348
+ end while status == GSL::CONTINUE and iter < @max_iterations
349
+ @iteration=@iter
350
+ @log+=message
351
+ puts message if @debug
352
+ @r=minimizer.x[0]
353
+ @alpha=minimizer.x[1,@nr-1].to_a
354
+ @beta=minimizer.x[@nr,@nc-1].to_a
355
+ @loglike_model= -minimizer.minimum
233
356
  end
234
- def matrix_for_rho(rho)
357
+
358
+ def matrix_for_rho(rho) # :nodoc:
235
359
  pd=@nr.times.collect{ [0]*@nc}
236
360
  pc=@nr.times.collect{ [0]*@nc}
237
361
  @nr.times { |i|
@@ -246,37 +370,8 @@ module Statsample
246
370
  }
247
371
  Matrix.rows(pc)
248
372
  end
249
- def g2
250
- raise "Doesn't work"
251
- e=expected
252
- no_r_likehood=0
253
- @nr.times {|i|
254
- @nc.times {|j|
255
- #p @matrix[i,j]
256
- if @matrix[i,j]!=0
257
- no_r_likehood+= @matrix[i,j]*Math::log(e[i,j])
258
- end
259
- }
260
- }
261
- p no_r_likehood
262
- model=Matrix.rows(@pd).collect {|c| c*@total}
263
-
264
- model_likehood=0
265
- @nr.times {|i|
266
- @nc.times {|j|
267
- #p @matrix[i,j]
268
- if @matrix[i,j]!=0
269
- model_likehood+= @matrix[i,j] * Math::log(model[i,j])
270
- end
271
- }
272
- }
273
-
274
- p model_likehood
275
-
276
- -2*(no_r_likehood-model_likehood)
277
-
278
- end
279
- def expected
373
+
374
+ def expected # :nodoc:
280
375
  rt=[]
281
376
  ct=[]
282
377
  t=0
@@ -300,10 +395,14 @@ module Statsample
300
395
 
301
396
  Matrix.rows(m)
302
397
  end
303
- # Compute polychoric using AS87.
304
- # Doesn't work for now! I can't find the error :(
305
398
 
306
- def compute_two_step_as87
399
+ # Compute polychoric correlation using polychoric series.
400
+ # Algorithm: AS87, by Martinson and Hamdan(1975).
401
+ #
402
+ # <b>Warning</b>: According to Drasgow(2006), this
403
+ # computation diverges greatly from the joint and two-step methods.
404
+ #
405
+ def compute_polychoric_series
307
406
  @nn=@n-1
308
407
  @mm=@m-1
309
408
  @nn7=7*@nn
@@ -390,10 +489,10 @@ module Statsample
390
489
  (1..@nn).each do |i| #do 22
391
490
  beta[i]=Distribution::Normal.p_value(sumc[i] / sum.to_f)
392
491
  end # 21
393
- @alpha=alpha[1,alpha.size] << nil
394
- @beta=beta[1,beta.size] << nil
395
- @sumr=sumr
396
- @sumc=sumc
492
+ @alpha=alpha[1,alpha.size]
493
+ @beta=beta[1,beta.size]
494
+ @sumr=row[1,row.size]
495
+ @sumc=colmn[1,colmn.size]
397
496
  @total=sum
398
497
 
399
498
  # Compute Fourier coefficients a and b. Verified
@@ -522,9 +621,12 @@ module Statsample
522
621
  end # 43
523
622
  raise "Error" if norts==0
524
623
  @r=pcorl
624
+
625
+ @loglike_model=-loglike(@alpha, @beta, @r)
626
+
525
627
  end
526
628
  #Computes vector h(mm7) of orthogonal hermite...
527
- def hermit(s,k)
629
+ def hermit(s,k) # :nodoc:
528
630
  h=[]
529
631
  (1..k).each do |i| # do 14
530
632
  l=i
@@ -544,7 +646,7 @@ module Statsample
544
646
  end
545
647
  h
546
648
  end
547
- def xnorm(t)
649
+ def xnorm(t) # :nodoc:
548
650
  Math::exp(-0.5 * t **2) * (1.0/Math::sqrt(2*Math::PI))
549
651
  end
550
652
 
@@ -554,7 +656,7 @@ module Statsample
554
656
  rp.to_text
555
657
  end
556
658
 
557
- def to_reportbuilder(generator)
659
+ def to_reportbuilder(generator) # :nodoc:
558
660
  compute if @r.nil?
559
661
  section=ReportBuilder::Section.new(:name=>@name)
560
662
  t=ReportBuilder::Table.new(:name=>_("Contingence Table"),:header=>[""]+(@n.times.collect {|i| "Y=#{i}"})+["Total"])
@@ -574,6 +676,7 @@ module Statsample
574
676
  t.add_row(["Threshold Y #{i}", sprintf("%0.4f", val)])
575
677
  }
576
678
  section.add(t)
679
+ section.add(_("Test of bivariate normality: X2 = %0.3f, df = %d, p= %0.5f" % [ chi_square, chi_square_df, 1-Distribution::ChiSquare.cdf(chi_square, chi_square_df)]))
577
680
  generator.parse_element(section)
578
681
  end
579
682
  end