statsample-bivariate-extension 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- data.tar.gz.sig +0 -0
- data/History.txt +4 -0
- data/Manifest.txt +7 -0
- data/README.txt +50 -0
- data/Rakefile +18 -0
- data/lib/statsample/bivariate/polychoric.rb +886 -0
- data/lib/statsample/bivariate/tetrachoric.rb +450 -0
- data/spec/spec_helper.rb +9 -0
- metadata +137 -0
- metadata.gz.sig +1 -0
@@ -0,0 +1,450 @@
|
|
1
|
+
module Statsample
|
2
|
+
module Bivariate
|
3
|
+
# Calculate Tetrachoric correlation for two vectors.
|
4
|
+
def self.tetrachoric(v1,v2)
|
5
|
+
tc=Tetrachoric.new_with_vectors(v1,v2)
|
6
|
+
tc.r
|
7
|
+
end
|
8
|
+
|
9
|
+
# Tetrachoric correlation matrix.
|
10
|
+
# Order of rows and columns depends on Dataset#fields order
|
11
|
+
def self.tetrachoric_correlation_matrix(ds)
|
12
|
+
ds.collect_matrix do |row,col|
|
13
|
+
if row==col
|
14
|
+
1.0
|
15
|
+
else
|
16
|
+
begin
|
17
|
+
tetrachoric(ds[row],ds[col])
|
18
|
+
rescue RuntimeError
|
19
|
+
nil
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
# Compute tetrachoric correlation.
|
25
|
+
#
|
26
|
+
# The <em>tetrachoric</em> correlation is a measure of
|
27
|
+
# bivariate association arising when both observed variates
|
28
|
+
# are categorical variables that result from dichotomizing
|
29
|
+
# the two undelying continuous variables (Drasgow, 2006).
|
30
|
+
# The tetrachoric correlation is a good way to measure rater agreement (Uebersax, 2006)
|
31
|
+
#
|
32
|
+
# This class uses Brown (1977) algorithm. You can see FORTRAN code on http://lib.stat.cmu.edu/apstat/116
|
33
|
+
#
|
34
|
+
#
|
35
|
+
# == Usage
|
36
|
+
# With two variables x and y on a crosstab like this:
|
37
|
+
#
|
38
|
+
# -------------
|
39
|
+
# | y=0 | y=1 |
|
40
|
+
# -------------
|
41
|
+
# x = 0 | a | b |
|
42
|
+
# -------------
|
43
|
+
# x = 1 | c | d |
|
44
|
+
# -------------
|
45
|
+
#
|
46
|
+
# The code will be
|
47
|
+
# tc=Statsample::Bivariate::Tetrachoric.new(a,b,c,d)
|
48
|
+
# tc.r # correlation
|
49
|
+
# tc.se # standard error
|
50
|
+
# tc.threshold_y # threshold for y variable
|
51
|
+
# tc.threshold_x # threshold for x variable
|
52
|
+
#
|
53
|
+
# == References:
|
54
|
+
#
|
55
|
+
# * Brown, MB. (1977) Algorithm AS 116: the tetrachoric correlation and its standard error. <em>Applied Statistics, 26</em>, 343-351.
|
56
|
+
# * Drasgow F. (2006). Polychoric and polyserial correlations. In Kotz L, Johnson NL (Eds.), Encyclopedia of statistical sciences. Vol. 7 (pp. 69-74). New York: Wiley.
|
57
|
+
# * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
|
58
|
+
|
59
|
+
class Tetrachoric
|
60
|
+
include Summarizable
|
61
|
+
attr_reader :r
|
62
|
+
attr_accessor :name
|
63
|
+
|
64
|
+
TWOPI=Math::PI*2
|
65
|
+
SQT2PI= 2.50662827
|
66
|
+
RLIMIT = 0.9999
|
67
|
+
RCUT= 0.95
|
68
|
+
UPLIM= 5.0
|
69
|
+
CONST= 1E-36
|
70
|
+
CHALF= 1E-18
|
71
|
+
CONV =1E-8
|
72
|
+
CITER = 1E-6
|
73
|
+
NITER = 25
|
74
|
+
X=[0,0.9972638618, 0.9856115115, 0.9647622556, 0.9349060759, 0.8963211558, 0.8493676137, 0.7944837960, 0.7321821187, 0.6630442669, 0.5877157572, 0.5068999089, 0.4213512761, 0.3318686023, 0.2392873623, 0.1444719616, 0.0483076657]
|
75
|
+
W=[0, 0.0070186100, 0.0162743947, 0.0253920653, 0.0342738629, 0.0428358980, 0.0509980593, 0.0586840935, 0.0658222228, 0.0723457941, 0.0781938958, 0.0833119242, 0.0876520930, 0.0911738787, 0.0938443991, 0.0956387201, 0.0965400885]
|
76
|
+
# Creates a Tetrachoric object based on a 2x2 Matrix.
|
77
|
+
def self.new_with_matrix(m)
|
78
|
+
Tetrachoric.new(m[0,0], m[0,1], m[1,0],m[1,1])
|
79
|
+
end
|
80
|
+
# Creates a Tetrachoric object based on two vectors.
|
81
|
+
# The vectors are dichotomized previously.
|
82
|
+
def self.new_with_vectors(v1,v2)
|
83
|
+
v1a, v2a=Statsample.only_valid(v1,v2)
|
84
|
+
v1a=v1a.dichotomize
|
85
|
+
v2a=v2a.dichotomize
|
86
|
+
raise "v1 have only 0" if v1a.factors==[0]
|
87
|
+
raise "v2 have only 0" if v2a.factors==[0]
|
88
|
+
a,b,c,d = 0,0,0,0
|
89
|
+
v1a.each_index{|i|
|
90
|
+
x,y=v1a[i],v2a[i]
|
91
|
+
a+=1 if x==0 and y==0
|
92
|
+
b+=1 if x==0 and y==1
|
93
|
+
c+=1 if x==1 and y==0
|
94
|
+
d+=1 if x==1 and y==1
|
95
|
+
}
|
96
|
+
Tetrachoric.new(a,b,c,d)
|
97
|
+
end
|
98
|
+
# Standard error
|
99
|
+
def se
|
100
|
+
@sdr
|
101
|
+
end
|
102
|
+
# Threshold for variable x (rows)
|
103
|
+
# Point on gauss curve under X rater select cases
|
104
|
+
def threshold_x
|
105
|
+
@zab
|
106
|
+
end
|
107
|
+
|
108
|
+
# Threshold for variable y (columns)
|
109
|
+
# Point on gauss curve under Y rater select cases
|
110
|
+
|
111
|
+
def threshold_y
|
112
|
+
@zac
|
113
|
+
end
|
114
|
+
def report_building(generator) # :nodoc:
|
115
|
+
generator.section(:name=>@name) do |s|
|
116
|
+
s.table(:name=>_("Contingence Table"),:header=>["","Y=0","Y=1", "T"]) do |t|
|
117
|
+
t.row(["X=0", @a,@b,@a+@b])
|
118
|
+
t.row(["X=1", @c,@d,@c+@d])
|
119
|
+
t.hr
|
120
|
+
t.row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
|
121
|
+
end
|
122
|
+
s.text(sprintf("r: %0.3f",r))
|
123
|
+
s.text(_("SE: %0.3f") % se)
|
124
|
+
s.text(_("Threshold X: %0.3f ") % threshold_x)
|
125
|
+
s.text(_("Threshold Y: %0.3f ") % threshold_y )
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
# Creates a new tetrachoric object for analysis
|
130
|
+
def initialize(a,b,c,d)
|
131
|
+
@a,@b,@c,@d=a,b,c,d
|
132
|
+
@name=_("Tetrachoric correlation")
|
133
|
+
#
|
134
|
+
# CHECK IF ANY CELL FREQUENCY IS NEGATIVE
|
135
|
+
#
|
136
|
+
raise "All frequencies should be positive" if (@a < 0 or @b < 0 or @c < 0 or @d < 0)
|
137
|
+
compute
|
138
|
+
end
|
139
|
+
# Compute the tetrachoric correlation.
|
140
|
+
# Called on object creation.
|
141
|
+
#
|
142
|
+
def compute
|
143
|
+
|
144
|
+
#
|
145
|
+
# INITIALIZATION
|
146
|
+
#
|
147
|
+
@r = 0
|
148
|
+
sdzero = 0
|
149
|
+
@sdr = 0
|
150
|
+
@itype = 0
|
151
|
+
@ifault = 0
|
152
|
+
|
153
|
+
#
|
154
|
+
# CHECK IF ANY FREQUENCY IS 0.0 AND SET kdelta
|
155
|
+
#
|
156
|
+
@kdelta = 1
|
157
|
+
delta = 0
|
158
|
+
@kdelta = 2 if (@a == 0 or @d == 0)
|
159
|
+
@kdelta += 2 if (@b == 0 or @c == 0)
|
160
|
+
#
|
161
|
+
# kdelta=4 MEANS TABLE HAS 0.0 ROW OR COLUMN, RUN IS TERMINATED
|
162
|
+
#
|
163
|
+
|
164
|
+
raise "Rows and columns should have more than 0 items" if @kdelta==4
|
165
|
+
|
166
|
+
# GOTO (4, 1, 2 , 92), kdelta
|
167
|
+
#
|
168
|
+
# delta IS 0.0, 0.5 OR -0.5 ACCORDING TO WHICH CELL IS 0.0
|
169
|
+
#
|
170
|
+
|
171
|
+
if(@kdelta==2)
|
172
|
+
# 1
|
173
|
+
delta=0.5
|
174
|
+
@r=-1 if (@a==0 and @d==0)
|
175
|
+
elsif(@kdelta==3)
|
176
|
+
# 2
|
177
|
+
delta=-0.5
|
178
|
+
@r=1 if (@b==0 and @c==0)
|
179
|
+
end
|
180
|
+
# 4
|
181
|
+
if @r!=0
|
182
|
+
@itype=3
|
183
|
+
end
|
184
|
+
|
185
|
+
#
|
186
|
+
# STORE FREQUENCIES IN AA, BB, CC AND DD
|
187
|
+
#
|
188
|
+
@aa = @a + delta
|
189
|
+
@bb = @b - delta
|
190
|
+
@cc = @c - delta
|
191
|
+
@dd = @d + delta
|
192
|
+
@tot = @aa+@bb+@cc+@dd
|
193
|
+
#
|
194
|
+
# CHECK IF CORRELATION IS NEGATIVE, 0.0, POSITIVE
|
195
|
+
# IF (AA * DD - BB * CC) 7, 5, 6
|
196
|
+
|
197
|
+
corr_dir=@aa * @dd - @bb * @cc
|
198
|
+
if(corr_dir < 0)
|
199
|
+
# 7
|
200
|
+
@probaa = @bb.quo(@tot)
|
201
|
+
@probac = (@bb + @dd).quo(@tot)
|
202
|
+
@ksign = 2
|
203
|
+
# -> 8
|
204
|
+
else
|
205
|
+
if (corr_dir==0)
|
206
|
+
# 5
|
207
|
+
@itype=4
|
208
|
+
end
|
209
|
+
# 6
|
210
|
+
#
|
211
|
+
# COMPUTE PROBABILITIES OF QUADRANT AND OF MARGINALS
|
212
|
+
# PROBAA AND PROBAC CHOSEN SO THAT CORRELATION IS POSITIVE.
|
213
|
+
# KSIGN INDICATES WHETHER QUADRANTS HAVE BEEN SWITCHED
|
214
|
+
#
|
215
|
+
|
216
|
+
@probaa = @aa.quo(@tot)
|
217
|
+
@probac = (@aa+@cc).quo(@tot)
|
218
|
+
@ksign=1
|
219
|
+
end
|
220
|
+
# 8
|
221
|
+
|
222
|
+
@probab = (@aa+@bb).quo(@tot)
|
223
|
+
|
224
|
+
#
|
225
|
+
# COMPUTE NORMAL DEVIATES FOR THE MARGINAL FREQUENCIES
|
226
|
+
# SINCE NO MARGINAL CAN BE 0.0, IE IS NOT CHECKED
|
227
|
+
#
|
228
|
+
@zac = Distribution::Normal.p_value(@probac)
|
229
|
+
@zab = Distribution::Normal.p_value(@probab)
|
230
|
+
@ss = Math::exp(-0.5 * (@zac ** 2 + @zab ** 2)).quo(TWOPI)
|
231
|
+
#
|
232
|
+
# WHEN R IS 0.0, 1.0 OR -1.0, TRANSFER TO COMPUTE SDZERO
|
233
|
+
#
|
234
|
+
if (@r != 0 or @itype > 0)
|
235
|
+
compute_sdzero
|
236
|
+
return true
|
237
|
+
end
|
238
|
+
#
|
239
|
+
# WHEN MARGINALS ARE EQUAL, COSINE EVALUATION IS USED
|
240
|
+
#
|
241
|
+
if (@a == @b and @b == @c)
|
242
|
+
calculate_cosine
|
243
|
+
return true
|
244
|
+
end
|
245
|
+
#
|
246
|
+
# INITIAL ESTIMATE OF CORRELATION IS YULES Y
|
247
|
+
#
|
248
|
+
@rr = ((Math::sqrt(@aa * @dd) - Math::sqrt(@bb * @cc)) ** 2) / (@aa * @dd - @bb * @cc).abs
|
249
|
+
@iter = 0
|
250
|
+
begin
|
251
|
+
#
|
252
|
+
# IF RR EXCEEDS RCUT, GAUSSIAN QUADRATURE IS USED
|
253
|
+
#
|
254
|
+
#10
|
255
|
+
if @rr>RCUT
|
256
|
+
gaussian_quadrature
|
257
|
+
return true
|
258
|
+
end
|
259
|
+
#
|
260
|
+
# TETRACHORIC SERIES IS COMPUTED
|
261
|
+
#
|
262
|
+
# INITIALIZATION
|
263
|
+
#
|
264
|
+
va=1.0
|
265
|
+
vb=@zac.to_f
|
266
|
+
wa=1.0
|
267
|
+
wb=@zab.to_f
|
268
|
+
term = 1.0
|
269
|
+
iterm = 0.0
|
270
|
+
@sum = @probab * @probac
|
271
|
+
deriv = 0.0
|
272
|
+
sr = @ss
|
273
|
+
#15
|
274
|
+
begin
|
275
|
+
if(sr.abs<=CONST)
|
276
|
+
#
|
277
|
+
# RESCALE TERMS TO AVOID OVERFLOWS AND UNDERFLOWS
|
278
|
+
#
|
279
|
+
sr = sr / CONST
|
280
|
+
va = va * CHALF
|
281
|
+
vb = vb * CHALF
|
282
|
+
wa = wa * CHALF
|
283
|
+
wb = wb * CHALF
|
284
|
+
end
|
285
|
+
#
|
286
|
+
# FORM SUM AND DERIVATIVE OF SERIES
|
287
|
+
#
|
288
|
+
# 20
|
289
|
+
dr = sr * va * wa
|
290
|
+
sr = sr * @rr / term
|
291
|
+
cof = sr * va * wa
|
292
|
+
#
|
293
|
+
# ITERM COUNTS NO. OF CONSECUTIVE TERMS < CONV
|
294
|
+
#
|
295
|
+
iterm+= 1
|
296
|
+
iterm=0 if (cof.abs > CONV)
|
297
|
+
@sum = @sum + cof
|
298
|
+
deriv += dr
|
299
|
+
vaa = va
|
300
|
+
waa = wa
|
301
|
+
va = vb
|
302
|
+
wa = wb
|
303
|
+
vb = @zac * va - term * vaa
|
304
|
+
wb = @zab * wa - term * waa
|
305
|
+
term += 1
|
306
|
+
end while (iterm < 2 or term < 6)
|
307
|
+
#
|
308
|
+
# CHECK IF ITERATION CONVERGED
|
309
|
+
#
|
310
|
+
if((@sum-@probaa).abs <= CITER)
|
311
|
+
@itype=term
|
312
|
+
calculate_sdr
|
313
|
+
return true
|
314
|
+
end
|
315
|
+
#
|
316
|
+
# CALCULATE NEXT ESTIMATE OF CORRELATION
|
317
|
+
#
|
318
|
+
#25
|
319
|
+
@iter += 1
|
320
|
+
#
|
321
|
+
# IF TOO MANY ITERATlONS, RUN IS TERMINATED
|
322
|
+
#
|
323
|
+
delta = (@sum - @probaa) / deriv
|
324
|
+
@rrprev = @rr
|
325
|
+
@rr = @rr - delta
|
326
|
+
@rr += 0.5 * delta if(@iter == 1)
|
327
|
+
@rr= RLIMIT if (@rr > RLIMIT)
|
328
|
+
@rr =0 if (@rr < 0.0)
|
329
|
+
end while @iter < NITER
|
330
|
+
raise "Too many iteration"
|
331
|
+
# GOTO 10
|
332
|
+
end
|
333
|
+
# GAUSSIAN QUADRATURE
|
334
|
+
# 40
|
335
|
+
def gaussian_quadrature
|
336
|
+
if(@iter==0)
|
337
|
+
# INITIALIZATION, IF THIS IS FIRST ITERATION
|
338
|
+
@sum=@probab*@probac
|
339
|
+
@rrprev=0
|
340
|
+
end
|
341
|
+
|
342
|
+
# 41
|
343
|
+
sumprv = @probab - @sum
|
344
|
+
@prob = @bb.quo(@tot)
|
345
|
+
@prob = @aa.quo(@tot) if (@ksign == 2)
|
346
|
+
@itype = 1
|
347
|
+
#
|
348
|
+
# LOOP TO FIND ESTIMATE OF CORRELATION
|
349
|
+
# COMPUTATION OF INTEGRAL (SUM) BY QUADRATURE
|
350
|
+
#
|
351
|
+
# 42
|
352
|
+
|
353
|
+
begin
|
354
|
+
rrsq = Math::sqrt(1 - @rr ** 2)
|
355
|
+
amid = 0.5 * (UPLIM + @zac)
|
356
|
+
xlen = UPLIM - amid
|
357
|
+
@sum = 0
|
358
|
+
(1..16).each do |iquad|
|
359
|
+
xla = amid + X[iquad] * xlen
|
360
|
+
xlb = amid - X[iquad] * xlen
|
361
|
+
|
362
|
+
|
363
|
+
#
|
364
|
+
# TO AVOID UNDERFLOWS, TEMPA AND TEMPB ARE USED
|
365
|
+
#
|
366
|
+
tempa = (@zab - @rr * xla) / rrsq
|
367
|
+
if (tempa >= -6.0)
|
368
|
+
@sum = @sum + W[iquad] * Math::exp(-0.5 * xla ** 2) * Distribution::Normal.cdf(tempa)
|
369
|
+
end
|
370
|
+
tempb = (@zab - @rr * xlb) / rrsq
|
371
|
+
|
372
|
+
if (tempb >= -6.0)
|
373
|
+
@sum = @sum + W[iquad] * Math::exp(-0.5 * xlb ** 2) * Distribution::Normal.cdf(tempb)
|
374
|
+
end
|
375
|
+
end # 44 ~ iquad
|
376
|
+
@sum=@sum*xlen / SQT2PI
|
377
|
+
#
|
378
|
+
# CHECK IF ITERATION HAS CONVERGED
|
379
|
+
#
|
380
|
+
if ((@prob - @sum).abs <= CITER)
|
381
|
+
calculate_sdr
|
382
|
+
return true
|
383
|
+
end
|
384
|
+
# ESTIMATE CORRELATION FOR NEXT ITERATION BY LINEAR INTERPOLATION
|
385
|
+
|
386
|
+
rrest = ((@prob - @sum) * @rrprev - (@prob - sumprv) * @rr) / (sumprv - @sum)
|
387
|
+
rrest = RLIMIT if (rrest > RLIMIT)
|
388
|
+
rrest = 0 if (rrest < 0)
|
389
|
+
@rrprev = @rr
|
390
|
+
@rr = rrest
|
391
|
+
sumprv = @sum
|
392
|
+
#
|
393
|
+
# if estimate has same value on two iterations, stop iteration
|
394
|
+
#
|
395
|
+
if @rr == @rrprev
|
396
|
+
calculate_sdr
|
397
|
+
return true
|
398
|
+
end
|
399
|
+
|
400
|
+
|
401
|
+
end while @iter < NITER
|
402
|
+
raise "Too many iterations"
|
403
|
+
# ir a 42
|
404
|
+
end
|
405
|
+
def calculate_cosine
|
406
|
+
#
|
407
|
+
# WHEN ALL MARGINALS ARE EQUAL THE COSINE FUNCTION IS USED
|
408
|
+
#
|
409
|
+
@rr = -Math::cos(TWOPI * @probaa)
|
410
|
+
@itype = 2
|
411
|
+
calculate_sdr
|
412
|
+
end
|
413
|
+
|
414
|
+
|
415
|
+
def calculate_sdr # :nodoc:
|
416
|
+
#
|
417
|
+
# COMPUTE SDR
|
418
|
+
#
|
419
|
+
@r = @rr
|
420
|
+
rrsq = Math::sqrt(1.0 - @r ** 2)
|
421
|
+
@itype = -@itype if (@kdelta > 1)
|
422
|
+
if (@ksign != 1)
|
423
|
+
@r = -@r
|
424
|
+
@zac = -@zac
|
425
|
+
end
|
426
|
+
# 71
|
427
|
+
pdf = Math::exp(-0.5 * (@zac ** 2 - 2 * @r * @zac * @zab + @zab ** 2) / rrsq ** 2) / (TWOPI * rrsq)
|
428
|
+
@pac = Distribution::Normal.cdf((@zac - @r * @zab) / rrsq) - 0.5
|
429
|
+
@pab = Distribution::Normal.cdf((@zab - @r * @zac) / rrsq) - 0.5
|
430
|
+
|
431
|
+
@sdr = ((@aa+@dd) * (@bb + @cc)).quo(4) + @pab ** 2 * (@aa + @cc) * (@bb + @dd) + @pac ** 2 * (@aa + @bb) * (@cc + @dd) + 2.0 * @pab * @pac * (@aa * @dd - @bb * @cc) - @pab * (@aa * @bb - @cc * @dd) - @pac * (@aa * @cc - @bb * @dd)
|
432
|
+
@sdr=0 if (@sdr<0)
|
433
|
+
@sdr= Math::sqrt(@sdr) / (@tot * pdf * Math::sqrt(@tot))
|
434
|
+
compute_sdzero
|
435
|
+
end
|
436
|
+
|
437
|
+
# 85
|
438
|
+
#
|
439
|
+
# COMPUTE SDZERO
|
440
|
+
#
|
441
|
+
def compute_sdzero
|
442
|
+
@sdzero = Math::sqrt(((@aa + @bb) * (@aa + @cc) * (@bb + @dd) * (@cc + @dd)).quo(@tot)).quo(@tot ** 2 * @ss)
|
443
|
+
@sdr = @sdzero if (@r == 0)
|
444
|
+
end
|
445
|
+
private :calculate_cosine, :calculate_sdr, :compute_sdzero, :compute, :gaussian_quadrature
|
446
|
+
end
|
447
|
+
end
|
448
|
+
end
|
449
|
+
|
450
|
+
|
data/spec/spec_helper.rb
ADDED