statsample-bivariate-extension 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data.tar.gz.sig +0 -0
- data/History.txt +4 -0
- data/Manifest.txt +7 -0
- data/README.txt +50 -0
- data/Rakefile +18 -0
- data/lib/statsample/bivariate/polychoric.rb +886 -0
- data/lib/statsample/bivariate/tetrachoric.rb +450 -0
- data/spec/spec_helper.rb +9 -0
- metadata +137 -0
- metadata.gz.sig +1 -0
data.tar.gz.sig
ADDED
Binary file
|
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
= statsample-bivariate-extension
|
2
|
+
|
3
|
+
* http://ruby-statsample.rubyforge.org/
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Provides advanced bivariate statistics:
|
8
|
+
* Tetrachoric correlation
|
9
|
+
* Polychoric correlation
|
10
|
+
|
11
|
+
== FEATURES/PROBLEMS:
|
12
|
+
|
13
|
+
* Statsample::Bivariate::Polychoric class provides polychoric correlation
|
14
|
+
* Statsample::Bivariate::Tetrachoric class provides tetrachoric correlation
|
15
|
+
|
16
|
+
|
17
|
+
== SYNOPSIS:
|
18
|
+
|
19
|
+
=== Tetrachoric correlation
|
20
|
+
|
21
|
+
require 'statsample'
|
22
|
+
a=40
|
23
|
+
b=10
|
24
|
+
c=20
|
25
|
+
d=30
|
26
|
+
tetra=Statsample::Bivariate::Tetrachoric.new(a,b,c,d)
|
27
|
+
puts tetra.summary
|
28
|
+
|
29
|
+
=== Polychoric correlation
|
30
|
+
|
31
|
+
require 'statsample'
|
32
|
+
ct=Matrix[[58,52,1],[26,58,3],[8,12,9]]
|
33
|
+
|
34
|
+
poly=Statsample::Bivariate::Polychoric.new(ct)
|
35
|
+
puts poly.summary
|
36
|
+
|
37
|
+
|
38
|
+
== REQUIREMENTS:
|
39
|
+
|
40
|
+
* Statsample
|
41
|
+
|
42
|
+
== INSTALL:
|
43
|
+
|
44
|
+
This gem is a statsample dependency. If you want to install it separately:
|
45
|
+
|
46
|
+
* sudo gem install statsample-bivariate-extension
|
47
|
+
|
48
|
+
== LICENSE:
|
49
|
+
|
50
|
+
GPL-2
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
# -*- ruby -*-
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'hoe'
|
6
|
+
|
7
|
+
$:.unshift(File.dirname(__FILE__)+"/lib")
|
8
|
+
|
9
|
+
require 'statsample/bivariate/extension_version.rb'
|
10
|
+
Hoe.spec 'statsample-bivariate-extension' do
|
11
|
+
self.rubyforge_name = 'ruby-statsample'
|
12
|
+
self.version=Statsample::Bivariate::EXTENSION_VERSION
|
13
|
+
self.developer('Claudio Bustos', 'clbustos_at_gmail.com')
|
14
|
+
self.extra_deps << ["statsample","~>0.13.0"]
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
# vim: syntax=ruby
|
@@ -0,0 +1,886 @@
|
|
1
|
+
require 'minimization'
|
2
|
+
module Statsample
|
3
|
+
module Bivariate
|
4
|
+
# Calculate Polychoric correlation for two vectors.
|
5
|
+
def self.polychoric(v1,v2)
|
6
|
+
pc=Polychoric.new_with_vectors(v1,v2)
|
7
|
+
pc.r
|
8
|
+
end
|
9
|
+
|
10
|
+
# Polychoric correlation matrix.
|
11
|
+
# Order of rows and columns depends on Dataset#fields order
|
12
|
+
def self.polychoric_correlation_matrix(ds)
|
13
|
+
cache={}
|
14
|
+
matrix=ds.collect_matrix do |row,col|
|
15
|
+
if row==col
|
16
|
+
1.0
|
17
|
+
else
|
18
|
+
begin
|
19
|
+
if cache[[col,row]].nil?
|
20
|
+
poly=polychoric(ds[row],ds[col])
|
21
|
+
cache[[row,col]]=poly
|
22
|
+
poly
|
23
|
+
else
|
24
|
+
cache[[col,row]]
|
25
|
+
end
|
26
|
+
rescue RuntimeError
|
27
|
+
nil
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
matrix.extend CovariateMatrix
|
32
|
+
matrix.fields=ds.fields
|
33
|
+
matrix
|
34
|
+
end
|
35
|
+
|
36
|
+
# = Polychoric correlation.
|
37
|
+
#
|
38
|
+
# The <em>polychoric</em> correlation is a measure of
|
39
|
+
# bivariate association arising when both observed variates
|
40
|
+
# are ordered, categorical variables that result from polychotomizing
|
41
|
+
# the two underlying continuous variables (Drasgow, 2006)
|
42
|
+
#
|
43
|
+
# According to Drasgow(2006), there are three methods to estimate
|
44
|
+
# the polychoric correlation:
|
45
|
+
#
|
46
|
+
# 1. Maximum Likelihood Estimator
|
47
|
+
# 2. Two-step estimator and
|
48
|
+
# 3. Polychoric series estimate.
|
49
|
+
#
|
50
|
+
# By default, two-step estimation is used. You can select
|
51
|
+
# the estimation method with method attribute. Joint estimate and polychoric series requires gsl library and rb-gsl.
|
52
|
+
#
|
53
|
+
# == Use
|
54
|
+
#
|
55
|
+
# You should enter a Matrix with ordered data. For example:
|
56
|
+
# -------------------
|
57
|
+
# | y=0 | y=1 | y=2 |
|
58
|
+
# -------------------
|
59
|
+
# x = 0 | 1 | 10 | 20 |
|
60
|
+
# -------------------
|
61
|
+
# x = 1 | 20 | 20 | 50 |
|
62
|
+
# -------------------
|
63
|
+
#
|
64
|
+
# The code will be
|
65
|
+
#
|
66
|
+
# matrix=Matrix[[1,10,20],[20,20,50]]
|
67
|
+
# poly=Statsample::Bivariate::Polychoric.new(matrix, :method=>:joint)
|
68
|
+
# puts poly.r
|
69
|
+
#
|
70
|
+
# See extensive documentation on Uebersax(2002) and Drasgow(2006)
|
71
|
+
#
|
72
|
+
# == References
|
73
|
+
#
|
74
|
+
# * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
|
75
|
+
# * Drasgow F. (2006). Polychoric and polyserial correlations. In Kotz L, Johnson NL (Eds.), Encyclopedia of statistical sciences. Vol. 7 (pp. 69-74). New York: Wiley.
|
76
|
+
|
77
|
+
class Polychoric
|
78
|
+
include Summarizable
|
79
|
+
class Processor
|
80
|
+
attr_reader :alpha, :beta, :rho
|
81
|
+
def initialize(alpha,beta,rho)
|
82
|
+
@alpha=alpha
|
83
|
+
@beta=beta
|
84
|
+
@nr=@alpha.size+1
|
85
|
+
@nc=@beta.size+1
|
86
|
+
@rho=rho
|
87
|
+
@pd=nil
|
88
|
+
end
|
89
|
+
def bipdf(i,j)
|
90
|
+
Distribution::NormalBivariate.pdf(a(i), b(j), rho)
|
91
|
+
end
|
92
|
+
def a(i)
|
93
|
+
i < 0 ? -100 : (i==@nr-1 ? 100 : alpha[i])
|
94
|
+
end
|
95
|
+
def b(j)
|
96
|
+
j < 0 ? -100 : (j==@nc-1 ? 100 : beta[j])
|
97
|
+
end
|
98
|
+
# Equation(10) from Olsson(1979)
|
99
|
+
def fd_loglike_cell_a(i,j,k)
|
100
|
+
if k==i
|
101
|
+
Distribution::NormalBivariate.pd_cdf_x(a(k),b(j), rho) - Distribution::NormalBivariate.pd_cdf_x(a(k),b(j-1),rho)
|
102
|
+
elsif k==(i-1)
|
103
|
+
-Distribution::NormalBivariate.pd_cdf_x(a(k),b(j),rho) + Distribution::NormalBivariate.pd_cdf_x(a(k),b(j-1),rho)
|
104
|
+
else
|
105
|
+
0
|
106
|
+
end
|
107
|
+
|
108
|
+
end
|
109
|
+
# phi_ij for each i and j
|
110
|
+
# Uses equation(4) from Olsson(1979)
|
111
|
+
def pd
|
112
|
+
if @pd.nil?
|
113
|
+
@pd=@nr.times.collect{ [0] * @nc}
|
114
|
+
pc=@nr.times.collect{ [0] * @nc}
|
115
|
+
@nr.times do |i|
|
116
|
+
@nc.times do |j|
|
117
|
+
|
118
|
+
if i==@nr-1 and j==@nc-1
|
119
|
+
@pd[i][j]=1.0
|
120
|
+
else
|
121
|
+
a=(i==@nr-1) ? 100: alpha[i]
|
122
|
+
b=(j==@nc-1) ? 100: beta[j]
|
123
|
+
#puts "a:#{a} b:#{b}"
|
124
|
+
@pd[i][j]=Distribution::NormalBivariate.cdf(a, b, rho)
|
125
|
+
end
|
126
|
+
pc[i][j] = @pd[i][j]
|
127
|
+
@pd[i][j] = @pd[i][j] - pc[i-1][j] if i>0
|
128
|
+
@pd[i][j] = @pd[i][j] - pc[i][j-1] if j>0
|
129
|
+
@pd[i][j] = @pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
|
130
|
+
end
|
131
|
+
end
|
132
|
+
end
|
133
|
+
@pd
|
134
|
+
end
|
135
|
+
end
|
136
|
+
|
137
|
+
include DirtyMemoize
|
138
|
+
# Name of the analysis
|
139
|
+
attr_accessor :name
|
140
|
+
# Max number of iterations used on iterative methods. Default to MAX_ITERATIONS
|
141
|
+
attr_accessor :max_iterations
|
142
|
+
# Debug algorithm (See iterations, for example)
|
143
|
+
attr_accessor :debug
|
144
|
+
# Minimizer type for two step. Default "brent"
|
145
|
+
# See http://rb-gsl.rubyforge.org/min.html for reference.
|
146
|
+
attr_accessor :minimizer_type_two_step
|
147
|
+
|
148
|
+
# Minimizer type for joint estimate. Default "nmsimplex"
|
149
|
+
# See http://rb-gsl.rubyforge.org/min.html for reference.
|
150
|
+
attr_accessor :minimizer_type_joint
|
151
|
+
|
152
|
+
|
153
|
+
# Method of calculation of polychoric series.
|
154
|
+
# <tt>:two_step</tt> used by default.
|
155
|
+
#
|
156
|
+
# :two_step:: two-step ML, based on code by Gegenfurtner(1992).
|
157
|
+
# :polychoric_series:: polychoric series estimate, using
|
158
|
+
# algorithm AS87 by Martinson and Hamdan (1975).
|
159
|
+
# :joint:: one-step ML, based on R package 'polycor'
|
160
|
+
# by J.Fox.
|
161
|
+
attr_accessor :method
|
162
|
+
# Absolute error for iteration.
|
163
|
+
attr_accessor :epsilon
|
164
|
+
|
165
|
+
# Number of iterations
|
166
|
+
attr_reader :iteration
|
167
|
+
|
168
|
+
# Log of algorithm
|
169
|
+
attr_reader :log
|
170
|
+
|
171
|
+
|
172
|
+
attr_reader :loglike_model
|
173
|
+
|
174
|
+
METHOD=:two_step
|
175
|
+
MAX_ITERATIONS=300
|
176
|
+
EPSILON=1e-6
|
177
|
+
MINIMIZER_TYPE_TWO_STEP="brent"
|
178
|
+
MINIMIZER_TYPE_JOINT="nmsimplex"
|
179
|
+
def self.new_with_vectors(v1,v2)
|
180
|
+
Polychoric.new(Crosstab.new(v1,v2).to_matrix)
|
181
|
+
end
|
182
|
+
# Params:
|
183
|
+
# * matrix: Contingence table
|
184
|
+
# * opts: Any attribute
|
185
|
+
|
186
|
+
def initialize(matrix, opts=Hash.new)
|
187
|
+
@matrix=matrix
|
188
|
+
@n=matrix.column_size
|
189
|
+
@m=matrix.row_size
|
190
|
+
raise "row size <1" if @m<=1
|
191
|
+
raise "column size <1" if @n<=1
|
192
|
+
|
193
|
+
@method=METHOD
|
194
|
+
@name=_("Polychoric correlation")
|
195
|
+
@max_iterations=MAX_ITERATIONS
|
196
|
+
@epsilon=EPSILON
|
197
|
+
@minimizer_type_two_step=MINIMIZER_TYPE_TWO_STEP
|
198
|
+
@minimizer_type_joint=MINIMIZER_TYPE_JOINT
|
199
|
+
@debug=false
|
200
|
+
@iteration=nil
|
201
|
+
opts.each{|k,v|
|
202
|
+
self.send("#{k}=",v) if self.respond_to? k
|
203
|
+
}
|
204
|
+
@r=nil
|
205
|
+
@pd=nil
|
206
|
+
compute_basic_parameters
|
207
|
+
end
|
208
|
+
# Returns the polychoric correlation
|
209
|
+
attr_reader :r
|
210
|
+
# Returns the rows thresholds
|
211
|
+
attr_reader :alpha
|
212
|
+
# Returns the columns thresholds
|
213
|
+
attr_reader :beta
|
214
|
+
|
215
|
+
dirty_writer :max_iterations, :epsilon, :minimizer_type_two_step, :minimizer_type_joint, :method
|
216
|
+
dirty_memoize :r, :alpha, :beta
|
217
|
+
|
218
|
+
alias :threshold_x :alpha
|
219
|
+
alias :threshold_y :beta
|
220
|
+
|
221
|
+
|
222
|
+
# Start the computation of polychoric correlation
|
223
|
+
# based on attribute method
|
224
|
+
def compute
|
225
|
+
if @method==:two_step
|
226
|
+
compute_two_step_mle_drasgow
|
227
|
+
elsif @method==:joint
|
228
|
+
compute_one_step_mle
|
229
|
+
elsif @method==:polychoric_series
|
230
|
+
compute_polychoric_series
|
231
|
+
else
|
232
|
+
raise "Not implemented"
|
233
|
+
end
|
234
|
+
end
|
235
|
+
# Retrieve log likelihood for actual data.
|
236
|
+
def loglike_data
|
237
|
+
loglike=0
|
238
|
+
@nr.times do |i|
|
239
|
+
@nc.times do |j|
|
240
|
+
res=@matrix[i,j].quo(@total)
|
241
|
+
if (res==0)
|
242
|
+
res=1e-16
|
243
|
+
end
|
244
|
+
loglike+= @matrix[i,j] * Math::log(res )
|
245
|
+
end
|
246
|
+
end
|
247
|
+
loglike
|
248
|
+
end
|
249
|
+
|
250
|
+
# Chi Square of model
|
251
|
+
def chi_square
|
252
|
+
if @loglike_model.nil?
|
253
|
+
compute
|
254
|
+
end
|
255
|
+
-2*(@loglike_model-loglike_data)
|
256
|
+
end
|
257
|
+
|
258
|
+
def chi_square_df
|
259
|
+
(@nr*@nc)-@nc-@nr
|
260
|
+
end
|
261
|
+
|
262
|
+
|
263
|
+
|
264
|
+
|
265
|
+
# Retrieve all cell probabilities for givens alpha, beta and rho
|
266
|
+
def cell_probabilities(alpha,beta,rho)
|
267
|
+
pd=@nr.times.collect{ [0] * @nc}
|
268
|
+
pc=@nr.times.collect{ [0] * @nc}
|
269
|
+
@nr.times do |i|
|
270
|
+
@nc.times do |j|
|
271
|
+
|
272
|
+
if i==@nr-1 and j==@nc-1
|
273
|
+
pd[i][j]=1.0
|
274
|
+
else
|
275
|
+
a=(i==@nr-1) ? 100: alpha[i]
|
276
|
+
b=(j==@nc-1) ? 100: beta[j]
|
277
|
+
#puts "a:#{a} b:#{b}"
|
278
|
+
pd[i][j]=Distribution::NormalBivariate.cdf(a, b, rho)
|
279
|
+
end
|
280
|
+
pc[i][j] = pd[i][j]
|
281
|
+
pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
|
282
|
+
pd[i][j] = pd[i][j] - pc[i][j-1] if j>0
|
283
|
+
pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
|
284
|
+
end
|
285
|
+
end
|
286
|
+
@pd=pd
|
287
|
+
pd
|
288
|
+
end
|
289
|
+
def loglike(alpha,beta,rho)
|
290
|
+
if rho.abs>0.9999
|
291
|
+
rho= (rho>0) ? 0.9999 : -0.9999
|
292
|
+
end
|
293
|
+
pr=Processor.new(alpha,beta,rho)
|
294
|
+
loglike=0
|
295
|
+
|
296
|
+
|
297
|
+
@nr.times do |i|
|
298
|
+
@nc.times do |j|
|
299
|
+
res=pr.pd[i][j]+EPSILON
|
300
|
+
loglike+= @matrix[i,j] * Math::log( res )
|
301
|
+
end
|
302
|
+
end
|
303
|
+
-loglike
|
304
|
+
end
|
305
|
+
# First derivate for rho
|
306
|
+
# Uses equation (9) from Olsson(1979)
|
307
|
+
def fd_loglike_rho(alpha,beta,rho)
|
308
|
+
if rho.abs>0.9999
|
309
|
+
rho= (rho>0) ? 0.9999 : -0.9999
|
310
|
+
end
|
311
|
+
total=0
|
312
|
+
pr=Processor.new(alpha,beta,rho)
|
313
|
+
@nr.times do |i|
|
314
|
+
@nc.times do |j|
|
315
|
+
pi=pr.pd[i][j] + EPSILON
|
316
|
+
total+= (@matrix[i,j] / pi) * (pr.bipdf(i,j)-pr.bipdf(i-1,j)-pr.bipdf(i,j-1)+pr.bipdf(i-1,j-1))
|
317
|
+
end
|
318
|
+
end
|
319
|
+
total
|
320
|
+
end
|
321
|
+
|
322
|
+
# First derivative for alpha_k
|
323
|
+
def fd_loglike_a(alpha,beta,rho,k)
|
324
|
+
fd_loglike_a_eq6(alpha,beta,rho,k)
|
325
|
+
end
|
326
|
+
# Uses equation (6) from Olsson(1979)
|
327
|
+
def fd_loglike_a_eq6(alpha,beta,rho,k)
|
328
|
+
if rho.abs>0.9999
|
329
|
+
rho= (rho>0) ? 0.9999 : -0.9999
|
330
|
+
end
|
331
|
+
pr=Processor.new(alpha,beta,rho)
|
332
|
+
total=0
|
333
|
+
pd=pr.pd
|
334
|
+
@nr.times do |i|
|
335
|
+
@nc.times do |j|
|
336
|
+
total+=@matrix[i,j].quo(pd[i][j]+EPSILON) * pr.fd_loglike_cell_a(i,j,k)
|
337
|
+
end
|
338
|
+
end
|
339
|
+
total
|
340
|
+
end
|
341
|
+
# Uses equation(13) from Olsson(1979)
|
342
|
+
def fd_loglike_a_eq13(alpha,beta,rho,k)
|
343
|
+
if rho.abs>0.9999
|
344
|
+
rho= (rho>0) ? 0.9999 : -0.9999
|
345
|
+
end
|
346
|
+
pr=Processor.new(alpha,beta,rho)
|
347
|
+
total=0
|
348
|
+
a_k=pr.a(k)
|
349
|
+
pd=pr.pd
|
350
|
+
@nc.times do |j|
|
351
|
+
#puts "j: #{j}"
|
352
|
+
#puts "b #{j} : #{b.call(j)}"
|
353
|
+
#puts "b #{j-1} : #{b.call(j-1)}"
|
354
|
+
|
355
|
+
e_1=@matrix[k,j].quo(pd[k][j]+EPSILON) - @matrix[k+1,j].quo(pd[k+1][j]+EPSILON)
|
356
|
+
e_2=Distribution::Normal.pdf(a_k)
|
357
|
+
e_3=Distribution::Normal.cdf((pr.b(j)-rho*a_k).quo(Math::sqrt(1-rho**2))) - Distribution::Normal.cdf((pr.b(j-1)-rho*a_k).quo(Math::sqrt(1-rho**2)))
|
358
|
+
#puts "val #{j}: #{e_1} | #{e_2} | #{e_3}"
|
359
|
+
|
360
|
+
total+= e_1*e_2*e_3
|
361
|
+
end
|
362
|
+
total
|
363
|
+
end
|
364
|
+
# First derivative for beta_m
|
365
|
+
# Uses equation(14) from Olsson(1979)
|
366
|
+
def fd_loglike_b(alpha,beta,rho,m)
|
367
|
+
if rho.abs>0.9999
|
368
|
+
rho= (rho>0) ? 0.9999 : -0.9999
|
369
|
+
end
|
370
|
+
pr=Processor.new(alpha,beta,rho)
|
371
|
+
total=0
|
372
|
+
b_m=pr.b m
|
373
|
+
pd=pr.pd
|
374
|
+
@nr.times do |i|
|
375
|
+
#puts "j: #{j}"
|
376
|
+
#puts "b #{j} : #{b.call(j)}"
|
377
|
+
#puts "b #{j-1} : #{b.call(j-1)}"
|
378
|
+
|
379
|
+
e_1=@matrix[i,m].quo(pd[i][m]+EPSILON) - @matrix[i,m+1].quo(pd[i][m+1]+EPSILON)
|
380
|
+
e_2=Distribution::Normal.pdf(b_m)
|
381
|
+
e_3=Distribution::Normal.cdf((pr.a(i)-rho*b_m).quo(Math::sqrt(1-rho**2))) - Distribution::Normal.cdf((pr.a(i-1)-rho*b_m).quo(Math::sqrt(1-rho**2)))
|
382
|
+
#puts "val #{j}: #{e_1} | #{e_2} | #{e_3}"
|
383
|
+
|
384
|
+
total+= e_1*e_2*e_3
|
385
|
+
end
|
386
|
+
total
|
387
|
+
end
|
388
|
+
|
389
|
+
|
390
|
+
def compute_basic_parameters
|
391
|
+
@nr=@matrix.row_size
|
392
|
+
@nc=@matrix.column_size
|
393
|
+
@sumr=[0]*@matrix.row_size
|
394
|
+
@sumrac=[0]*@matrix.row_size
|
395
|
+
@sumc=[0]*@matrix.column_size
|
396
|
+
@sumcac=[0]*@matrix.column_size
|
397
|
+
@alpha=[0]*(@nr-1)
|
398
|
+
@beta=[0]*(@nc-1)
|
399
|
+
@total=0
|
400
|
+
@nr.times do |i|
|
401
|
+
@nc.times do |j|
|
402
|
+
@sumr[i]+=@matrix[i,j]
|
403
|
+
@sumc[j]+=@matrix[i,j]
|
404
|
+
@total+=@matrix[i,j]
|
405
|
+
end
|
406
|
+
end
|
407
|
+
ac=0
|
408
|
+
(@nr-1).times do |i|
|
409
|
+
@sumrac[i]=@sumr[i]+ac
|
410
|
+
@alpha[i]=Distribution::Normal.p_value(@sumrac[i] / @total.to_f)
|
411
|
+
ac=@sumrac[i]
|
412
|
+
end
|
413
|
+
ac=0
|
414
|
+
(@nc-1).times do |i|
|
415
|
+
@sumcac[i]=@sumc[i]+ac
|
416
|
+
@beta[i]=Distribution::Normal.p_value(@sumcac[i] / @total.to_f)
|
417
|
+
ac=@sumcac[i]
|
418
|
+
end
|
419
|
+
end
|
420
|
+
|
421
|
+
|
422
|
+
# Computation of polychoric correlation usign two-step ML estimation.
|
423
|
+
#
|
424
|
+
# Two-step ML estimation "first estimates the thresholds from the one-way marginal frequencies, then estimates rho, conditional on these thresholds, via maximum likelihood" (Uebersax, 2006).
|
425
|
+
#
|
426
|
+
# The algorithm is based on code by Gegenfurtner(1992).
|
427
|
+
#
|
428
|
+
# <b>References</b>:
|
429
|
+
# * Gegenfurtner, K. (1992). PRAXIS: Brent's algorithm for function minimization. Behavior Research Methods, Instruments & Computers, 24(4), 560-564. Available on http://www.allpsych.uni-giessen.de/karl/pdf/03.praxis.pdf
|
430
|
+
# * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
|
431
|
+
#
|
432
|
+
def compute_two_step_mle_drasgow
|
433
|
+
if Statsample.has_gsl?
|
434
|
+
compute_two_step_mle_drasgow_gsl
|
435
|
+
else
|
436
|
+
compute_two_step_mle_drasgow_ruby
|
437
|
+
end
|
438
|
+
end
|
439
|
+
|
440
|
+
# Depends on minimization algorithm.
|
441
|
+
|
442
|
+
def compute_two_step_mle_drasgow_ruby #:nodoc:
|
443
|
+
|
444
|
+
f=proc {|rho|
|
445
|
+
loglike(@alpha,@beta, rho)
|
446
|
+
}
|
447
|
+
@log=_("Minimizing using GSL Brent method\n")
|
448
|
+
min=Minimization::Brent.new(-0.9999,0.9999,f)
|
449
|
+
min.epsilon=@epsilon
|
450
|
+
min.expected=0
|
451
|
+
min.iterate
|
452
|
+
@log+=min.log.to_table.to_s
|
453
|
+
@r=min.x_minimum
|
454
|
+
@loglike_model=-min.f_minimum
|
455
|
+
puts @log if @debug
|
456
|
+
|
457
|
+
end
|
458
|
+
|
459
|
+
|
460
|
+
def compute_two_step_mle_drasgow_gsl #:nodoc:
|
461
|
+
|
462
|
+
fn1=GSL::Function.alloc {|rho|
|
463
|
+
loglike(@alpha,@beta, rho)
|
464
|
+
}
|
465
|
+
@iteration = 0
|
466
|
+
max_iter = @max_iterations
|
467
|
+
m = 0 # initial guess
|
468
|
+
m_expected = 0
|
469
|
+
a=-0.9999
|
470
|
+
b=+0.9999
|
471
|
+
gmf = GSL::Min::FMinimizer.alloc(@minimizer_type_two_step)
|
472
|
+
gmf.set(fn1, m, a, b)
|
473
|
+
header=_("Two step minimization using %s method\n") % gmf.name
|
474
|
+
header+=sprintf("%5s [%9s, %9s] %9s %10s %9s\n", "iter", "lower", "upper", "min",
|
475
|
+
"err", "err(est)")
|
476
|
+
|
477
|
+
header+=sprintf("%5d [%.7f, %.7f] %.7f %+.7f %.7f\n", @iteration, a, b, m, m - m_expected, b - a)
|
478
|
+
@log=header
|
479
|
+
puts header if @debug
|
480
|
+
begin
|
481
|
+
@iteration += 1
|
482
|
+
status = gmf.iterate
|
483
|
+
status = gmf.test_interval(@epsilon, 0.0)
|
484
|
+
|
485
|
+
if status == GSL::SUCCESS
|
486
|
+
@log+="converged:"
|
487
|
+
puts "converged:" if @debug
|
488
|
+
end
|
489
|
+
a = gmf.x_lower
|
490
|
+
b = gmf.x_upper
|
491
|
+
m = gmf.x_minimum
|
492
|
+
message=sprintf("%5d [%.7f, %.7f] %.7f %+.7f %.7f\n",
|
493
|
+
@iteration, a, b, m, m - m_expected, b - a);
|
494
|
+
@log+=message
|
495
|
+
puts message if @debug
|
496
|
+
end while status == GSL::CONTINUE and @iteration < @max_iterations
|
497
|
+
@r=gmf.x_minimum
|
498
|
+
@loglike_model=-gmf.f_minimum
|
499
|
+
end
|
500
|
+
|
501
|
+
# Compute Polychoric correlation with joint estimate.
|
502
|
+
# Rho and thresholds are estimated at same time.
|
503
|
+
# Code based on R package "polycor", by J.Fox.
|
504
|
+
#
|
505
|
+
|
506
|
+
def compute_one_step_mle
|
507
|
+
# Get initial values with two-step aproach
|
508
|
+
compute_two_step_mle_drasgow
|
509
|
+
# Start iteration with past values
|
510
|
+
rho=@r
|
511
|
+
cut_alpha=@alpha
|
512
|
+
cut_beta=@beta
|
513
|
+
parameters=[rho]+cut_alpha+cut_beta
|
514
|
+
minimization = Proc.new { |v, params|
|
515
|
+
rho=v[0]
|
516
|
+
alpha=v[1, @nr-1]
|
517
|
+
beta=v[@nr, @nc-1]
|
518
|
+
|
519
|
+
#puts "f'rho=#{fd_loglike_rho(alpha,beta,rho)}"
|
520
|
+
#(@nr-1).times {|k|
|
521
|
+
# puts "f'a(#{k}) = #{fd_loglike_a(alpha,beta,rho,k)}"
|
522
|
+
# puts "f'a(#{k}) v2 = #{fd_loglike_a2(alpha,beta,rho,k)}"
|
523
|
+
#
|
524
|
+
#}
|
525
|
+
#(@nc-1).times {|k|
|
526
|
+
# puts "f'b(#{k}) = #{fd_loglike_b(alpha,beta,rho,k)}"
|
527
|
+
#}
|
528
|
+
|
529
|
+
loglike(alpha,beta,rho)
|
530
|
+
}
|
531
|
+
np=@nc-1+@nr
|
532
|
+
my_func = GSL::MultiMin::Function.alloc(minimization, np)
|
533
|
+
my_func.set_params(parameters) # parameters
|
534
|
+
|
535
|
+
x = GSL::Vector.alloc(parameters.dup)
|
536
|
+
|
537
|
+
ss = GSL::Vector.alloc(np)
|
538
|
+
ss.set_all(1.0)
|
539
|
+
|
540
|
+
minimizer = GSL::MultiMin::FMinimizer.alloc(minimizer_type_joint,np)
|
541
|
+
minimizer.set(my_func, x, ss)
|
542
|
+
|
543
|
+
iter = 0
|
544
|
+
message=""
|
545
|
+
begin
|
546
|
+
iter += 1
|
547
|
+
status = minimizer.iterate()
|
548
|
+
status = minimizer.test_size(@epsilon)
|
549
|
+
if status == GSL::SUCCESS
|
550
|
+
message="Joint MLE converged to minimum at\n"
|
551
|
+
end
|
552
|
+
x = minimizer.x
|
553
|
+
message+= sprintf("%5d iterations", iter)+"\n";
|
554
|
+
for i in 0...np do
|
555
|
+
message+=sprintf("%10.3e ", x[i])
|
556
|
+
end
|
557
|
+
message+=sprintf("f() = %7.3f size = %.3f\n", minimizer.fval, minimizer.size)+"\n";
|
558
|
+
end while status == GSL::CONTINUE and iter < @max_iterations
|
559
|
+
@iteration=iter
|
560
|
+
@log+=message
|
561
|
+
@r=minimizer.x[0]
|
562
|
+
@alpha=minimizer.x[1,@nr-1].to_a
|
563
|
+
@beta=minimizer.x[@nr,@nc-1].to_a
|
564
|
+
@loglike_model= -minimizer.minimum
|
565
|
+
end
|
566
|
+
|
567
|
+
def matrix_for_rho(rho) # :nodoc:
|
568
|
+
pd=@nr.times.collect{ [0]*@nc}
|
569
|
+
pc=@nr.times.collect{ [0]*@nc}
|
570
|
+
@nr.times { |i|
|
571
|
+
@nc.times { |j|
|
572
|
+
pd[i][j]=Distribution::NormalBivariate.cdf(@alpha[i], @beta[j], rho)
|
573
|
+
pc[i][j] = pd[i][j]
|
574
|
+
pd[i][j] = pd[i][j] - pc[i-1][j] if i>0
|
575
|
+
pd[i][j] = pd[i][j] - pc[i][j-1] if j>0
|
576
|
+
pd[i][j] = pd[i][j] + pc[i-1][j-1] if (i>0 and j>0)
|
577
|
+
res= pd[i][j]
|
578
|
+
}
|
579
|
+
}
|
580
|
+
Matrix.rows(pc)
|
581
|
+
end
|
582
|
+
|
583
|
+
def expected # :nodoc:
|
584
|
+
rt=[]
|
585
|
+
ct=[]
|
586
|
+
t=0
|
587
|
+
@matrix.row_size.times {|i|
|
588
|
+
@matrix.column_size.times {|j|
|
589
|
+
rt[i]=0 if rt[i].nil?
|
590
|
+
ct[j]=0 if ct[j].nil?
|
591
|
+
rt[i]+=@matrix[i,j]
|
592
|
+
ct[j]+=@matrix[i,j]
|
593
|
+
t+=@matrix[i,j]
|
594
|
+
}
|
595
|
+
}
|
596
|
+
m=[]
|
597
|
+
@matrix.row_size.times {|i|
|
598
|
+
row=[]
|
599
|
+
@matrix.column_size.times {|j|
|
600
|
+
row[j]=(rt[i]*ct[j]).quo(t)
|
601
|
+
}
|
602
|
+
m.push(row)
|
603
|
+
}
|
604
|
+
|
605
|
+
Matrix.rows(m)
|
606
|
+
end
|
607
|
+
|
608
|
+
# Compute polychoric correlation using polychoric series.
|
609
|
+
# Algorithm: AS87, by Martinson and Hamdam(1975).
|
610
|
+
#
|
611
|
+
# <b>Warning</b>: According to Drasgow(2006), this
|
612
|
+
# computation diverges greatly of joint and two-step methods.
|
613
|
+
#
|
614
|
+
def compute_polychoric_series
|
615
|
+
@nn=@n-1
|
616
|
+
@mm=@m-1
|
617
|
+
@nn7=7*@nn
|
618
|
+
@mm7=7*@mm
|
619
|
+
@mn=@n*@m
|
620
|
+
@cont=[nil]
|
621
|
+
@n.times {|j|
|
622
|
+
@m.times {|i|
|
623
|
+
@cont.push(@matrix[i,j])
|
624
|
+
}
|
625
|
+
}
|
626
|
+
|
627
|
+
pcorl=0
|
628
|
+
cont=@cont
|
629
|
+
xmean=0.0
|
630
|
+
sum=0.0
|
631
|
+
row=[]
|
632
|
+
colmn=[]
|
633
|
+
(1..@m).each do |i|
|
634
|
+
row[i]=0.0
|
635
|
+
l=i
|
636
|
+
(1..@n).each do |j|
|
637
|
+
row[i]=row[i]+cont[l]
|
638
|
+
l+=@m
|
639
|
+
end
|
640
|
+
raise "Should not be empty rows" if(row[i]==0.0)
|
641
|
+
xmean=xmean+row[i]*i.to_f
|
642
|
+
sum+=row[i]
|
643
|
+
end
|
644
|
+
xmean=xmean/sum.to_f
|
645
|
+
ymean=0.0
|
646
|
+
(1..@n).each do |j|
|
647
|
+
colmn[j]=0.0
|
648
|
+
l=(j-1)*@m
|
649
|
+
(1..@m).each do |i|
|
650
|
+
l=l+1
|
651
|
+
colmn[j]=colmn[j]+cont[l] #12
|
652
|
+
end
|
653
|
+
raise "Should not be empty cols" if colmn[j]==0
|
654
|
+
ymean=ymean+colmn[j]*j.to_f
|
655
|
+
end
|
656
|
+
ymean=ymean/sum.to_f
|
657
|
+
covxy=0.0
|
658
|
+
(1..@m).each do |i|
|
659
|
+
l=i
|
660
|
+
(1..@n).each do |j|
|
661
|
+
conxy=covxy+cont[l]*(i.to_f-xmean)*(j.to_f-ymean)
|
662
|
+
l=l+@m
|
663
|
+
end
|
664
|
+
end
|
665
|
+
|
666
|
+
chisq=0.0
|
667
|
+
(1..@m).each do |i|
|
668
|
+
l=i
|
669
|
+
(1..@n).each do |j|
|
670
|
+
chisq=chisq+((cont[l]**2).quo(row[i]*colmn[j]))
|
671
|
+
l=l+@m
|
672
|
+
end
|
673
|
+
end
|
674
|
+
|
675
|
+
phisq=chisq-1.0-(@mm*@nn).to_f / sum.to_f
|
676
|
+
phisq=0 if(phisq<0.0)
|
677
|
+
# Compute cumulative sum of columns and rows
|
678
|
+
sumc=[]
|
679
|
+
sumr=[]
|
680
|
+
sumc[1]=colmn[1]
|
681
|
+
sumr[1]=row[1]
|
682
|
+
cum=0
|
683
|
+
(1..@nn).each do |i| # goto 17 r20
|
684
|
+
cum=cum+colmn[i]
|
685
|
+
sumc[i]=cum
|
686
|
+
end
|
687
|
+
cum=0
|
688
|
+
(1..@mm).each do |i|
|
689
|
+
cum=cum+row[i]
|
690
|
+
sumr[i]=cum
|
691
|
+
end
|
692
|
+
alpha=[]
|
693
|
+
beta=[]
|
694
|
+
# Compute points of polytomy
|
695
|
+
(1..@mm).each do |i| #do 21
|
696
|
+
alpha[i]=Distribution::Normal.p_value(sumr[i] / sum.to_f)
|
697
|
+
end # 21
|
698
|
+
(1..@nn).each do |i| #do 22
|
699
|
+
beta[i]=Distribution::Normal.p_value(sumc[i] / sum.to_f)
|
700
|
+
end # 21
|
701
|
+
@alpha=alpha[1,alpha.size]
|
702
|
+
@beta=beta[1,beta.size]
|
703
|
+
@sumr=row[1,row.size]
|
704
|
+
@sumc=colmn[1,colmn.size]
|
705
|
+
@total=sum
|
706
|
+
|
707
|
+
# Compute Fourier coefficients a and b. Verified
|
708
|
+
h=hermit(alpha,@mm)
|
709
|
+
hh=hermit(beta,@nn)
|
710
|
+
a=[]
|
711
|
+
b=[]
|
712
|
+
if @m!=2 # goto 24
|
713
|
+
mmm=@m-2
|
714
|
+
(1..mmm).each do |i| #do 23
|
715
|
+
a1=sum.quo(row[i+1] * sumr[i] * sumr[i+1])
|
716
|
+
a2=sumr[i] * xnorm(alpha[i+1])
|
717
|
+
a3=sumr[i+1] * xnorm(alpha[i])
|
718
|
+
l=i
|
719
|
+
(1..7).each do |j| #do 23
|
720
|
+
a[l]=Math::sqrt(a1.quo(j))*(h[l+1] * a2 - h[l] * a3)
|
721
|
+
l=l+@mm
|
722
|
+
end
|
723
|
+
end #23
|
724
|
+
end
|
725
|
+
# 24
|
726
|
+
|
727
|
+
|
728
|
+
if @n!=2 # goto 26
|
729
|
+
nnn=@n-2
|
730
|
+
(1..nnn).each do |i| #do 25
|
731
|
+
a1=sum.quo(colmn[i+1] * sumc[i] * sumc[i+1])
|
732
|
+
a2=sumc[i] * xnorm(beta[i+1])
|
733
|
+
a3=sumc[i+1] * xnorm(beta[i])
|
734
|
+
l=i
|
735
|
+
(1..7).each do |j| #do 25
|
736
|
+
b[l]=Math::sqrt(a1.quo(j))*(a2 * hh[l+1] - a3*hh[l])
|
737
|
+
l=l+@nn
|
738
|
+
end # 25
|
739
|
+
end # 25
|
740
|
+
end
|
741
|
+
#26 r20
|
742
|
+
l = @mm
|
743
|
+
a1 = -sum * xnorm(alpha[@mm])
|
744
|
+
a2 = row[@m] * sumr[@mm]
|
745
|
+
(1..7).each do |j| # do 27
|
746
|
+
a[l]=a1 * h[l].quo(Math::sqrt(j*a2))
|
747
|
+
l=l+@mm
|
748
|
+
end # 27
|
749
|
+
|
750
|
+
l = @nn
|
751
|
+
a1 = -sum * xnorm(beta[@nn])
|
752
|
+
a2 = colmn[@n] * sumc[@nn]
|
753
|
+
|
754
|
+
(1..7).each do |j| # do 28
|
755
|
+
b[l]=a1 * hh[l].quo(Math::sqrt(j*a2))
|
756
|
+
l = l + @nn
|
757
|
+
end # 28
|
758
|
+
rcof=[]
|
759
|
+
# compute coefficients rcof of polynomial of order 8
|
760
|
+
rcof[1]=-phisq
|
761
|
+
(2..9).each do |i| # do 30
|
762
|
+
rcof[i]=0.0
|
763
|
+
end #30
|
764
|
+
m1=@mm
|
765
|
+
(1..@mm).each do |i| # do 31
|
766
|
+
m1=m1+1
|
767
|
+
m2=m1+@mm
|
768
|
+
m3=m2+@mm
|
769
|
+
m4=m3+@mm
|
770
|
+
m5=m4+@mm
|
771
|
+
m6=m5+@mm
|
772
|
+
n1=@nn
|
773
|
+
(1..@nn).each do |j| # do 31
|
774
|
+
n1=n1+1
|
775
|
+
n2=n1+@nn
|
776
|
+
n3=n2+@nn
|
777
|
+
n4=n3+@nn
|
778
|
+
n5=n4+@nn
|
779
|
+
n6=n5+@nn
|
780
|
+
|
781
|
+
rcof[3] = rcof[3] + a[i]**2 * b[j]**2
|
782
|
+
|
783
|
+
rcof[4] = rcof[4] + 2.0 * a[i] * a[m1] * b[j] * b[n1]
|
784
|
+
|
785
|
+
rcof[5] = rcof[5] + a[m1]**2 * b[n1]**2 +
|
786
|
+
2.0 * a[i] * a[m2] * b[j] * b[n2]
|
787
|
+
|
788
|
+
rcof[6] = rcof[6] + 2.0 * (a[i] * a[m3] * b[j] *
|
789
|
+
b[n3] + a[m1] * a[m2] * b[n1] * b[n2])
|
790
|
+
|
791
|
+
rcof[7] = rcof[7] + a[m2]**2 * b[n2]**2 +
|
792
|
+
2.0 * (a[i] * a[m4] * b[j] * b[n4] + a[m1] * a[m3] *
|
793
|
+
b[n1] * b[n3])
|
794
|
+
|
795
|
+
rcof[8] = rcof[8] + 2.0 * (a[i] * a[m5] * b[j] * b[n5] +
|
796
|
+
a[m1] * a[m4] * b[n1] * b[n4] + a[m2] * a[m3] * b[n2] * b[n3])
|
797
|
+
|
798
|
+
rcof[9] = rcof[9] + a[m3]**2 * b[n3]**2 +
|
799
|
+
2.0 * (a[i] * a[m6] * b[j] * b[n6] + a[m1] * a[m5] * b[n1] *
|
800
|
+
b[n5] + (a[m2] * a[m4] * b[n2] * b[n4]))
|
801
|
+
end # 31
|
802
|
+
end # 31
|
803
|
+
|
804
|
+
rcof=rcof[1,rcof.size]
|
805
|
+
poly = GSL::Poly.alloc(rcof)
|
806
|
+
roots=poly.solve
|
807
|
+
rootr=[nil]
|
808
|
+
rooti=[nil]
|
809
|
+
roots.each {|c|
|
810
|
+
rootr.push(c.real)
|
811
|
+
rooti.push(c.im)
|
812
|
+
}
|
813
|
+
@rootr=rootr
|
814
|
+
@rooti=rooti
|
815
|
+
|
816
|
+
norts=0
|
817
|
+
(1..7).each do |i| # do 43
|
818
|
+
|
819
|
+
next if rooti[i]!=0.0
|
820
|
+
if (covxy>=0.0)
|
821
|
+
next if(rootr[i]<0.0 or rootr[i]>1.0)
|
822
|
+
pcorl=rootr[i]
|
823
|
+
norts=norts+1
|
824
|
+
else
|
825
|
+
if (rootr[i]>=-1.0 and rootr[i]<0.0)
|
826
|
+
pcorl=rootr[i]
|
827
|
+
norts=norts+1
|
828
|
+
end
|
829
|
+
end
|
830
|
+
end # 43
|
831
|
+
raise "Error" if norts==0
|
832
|
+
@r=pcorl
|
833
|
+
|
834
|
+
@loglike_model=-loglike(@alpha, @beta, @r)
|
835
|
+
|
836
|
+
end
|
837
|
+
#Computes vector h(mm7) of orthogonal hermite...
|
838
|
+
def hermit(s,k) # :nodoc:
|
839
|
+
h=[]
|
840
|
+
(1..k).each do |i| # do 14
|
841
|
+
l=i
|
842
|
+
ll=i+k
|
843
|
+
lll=ll+k
|
844
|
+
h[i]=1.0
|
845
|
+
h[ll]=s[i]
|
846
|
+
v=1.0
|
847
|
+
(2..6).each do |j| #do 14
|
848
|
+
w=Math::sqrt(j)
|
849
|
+
h[lll]=(s[i]*h[ll] - v*h[l]).quo(w)
|
850
|
+
v=w
|
851
|
+
l=l+k
|
852
|
+
ll=ll+k
|
853
|
+
lll=lll+k
|
854
|
+
end
|
855
|
+
end
|
856
|
+
h
|
857
|
+
end
|
858
|
+
def xnorm(t) # :nodoc:
|
859
|
+
Math::exp(-0.5 * t **2) * (1.0/Math::sqrt(2*Math::PI))
|
860
|
+
end
|
861
|
+
|
862
|
+
def report_building(generator) # :nodoc:
|
863
|
+
compute if dirty?
|
864
|
+
section=ReportBuilder::Section.new(:name=>@name)
|
865
|
+
t=ReportBuilder::Table.new(:name=>_("Contingence Table"), :header=>[""]+(@n.times.collect {|i| "Y=#{i}"})+["Total"])
|
866
|
+
@m.times do |i|
|
867
|
+
t.row(["X = #{i}"]+(@n.times.collect {|j| @matrix[i,j]}) + [@sumr[i]])
|
868
|
+
end
|
869
|
+
t.hr
|
870
|
+
t.row(["T"]+(@n.times.collect {|j| @sumc[j]})+[@total])
|
871
|
+
section.add(t)
|
872
|
+
section.add(sprintf("r: %0.4f",r))
|
873
|
+
t=ReportBuilder::Table.new(:name=>_("Thresholds"), :header=>["","Value"])
|
874
|
+
threshold_x.each_with_index {|val,i|
|
875
|
+
t.row([_("Threshold X %d") % i, sprintf("%0.4f", val)])
|
876
|
+
}
|
877
|
+
threshold_y.each_with_index {|val,i|
|
878
|
+
t.row([_("Threshold Y %d") % i, sprintf("%0.4f", val)])
|
879
|
+
}
|
880
|
+
section.add(t)
|
881
|
+
section.add(_("Test of bivariate normality: X2 = %0.3f, df = %d, p= %0.5f" % [ chi_square, chi_square_df, 1-Distribution::ChiSquare.cdf(chi_square, chi_square_df)]))
|
882
|
+
generator.parse_element(section)
|
883
|
+
end
|
884
|
+
end
|
885
|
+
end
|
886
|
+
end
|