statsample-bivariate-extension 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,450 @@
1
+ module Statsample
2
+ module Bivariate
3
+ # Calculate Tetrachoric correlation for two vectors.
4
+ def self.tetrachoric(v1,v2)
5
+ tc=Tetrachoric.new_with_vectors(v1,v2)
6
+ tc.r
7
+ end
8
+
9
+ # Tetrachoric correlation matrix.
10
+ # Order of rows and columns depends on Dataset#fields order
11
+ def self.tetrachoric_correlation_matrix(ds)
12
+ ds.collect_matrix do |row,col|
13
+ if row==col
14
+ 1.0
15
+ else
16
+ begin
17
+ tetrachoric(ds[row],ds[col])
18
+ rescue RuntimeError
19
+ nil
20
+ end
21
+ end
22
+ end
23
+ end
24
+ # Compute tetrachoric correlation.
25
+ #
26
+ # The <em>tetrachoric</em> correlation is a measure of
27
+ # bivariate association arising when both observed variates
28
+ # are categorical variables that result from dichotomizing
29
+ # the two undelying continuous variables (Drasgow, 2006).
30
+ # The tetrachoric correlation is a good way to measure rater agreement (Uebersax, 2006)
31
+ #
32
+ # This class uses Brown (1977) algorithm. You can see FORTRAN code on http://lib.stat.cmu.edu/apstat/116
33
+ #
34
+ #
35
+ # == Usage
36
+ # With two variables x and y on a crosstab like this:
37
+ #
38
+ # -------------
39
+ # | y=0 | y=1 |
40
+ # -------------
41
+ # x = 0 | a | b |
42
+ # -------------
43
+ # x = 1 | c | d |
44
+ # -------------
45
+ #
46
+ # The code will be
47
+ # tc=Statsample::Bivariate::Tetrachoric.new(a,b,c,d)
48
+ # tc.r # correlation
49
+ # tc.se # standard error
50
+ # tc.threshold_y # threshold for y variable
51
+ # tc.threshold_x # threshold for x variable
52
+ #
53
+ # == References:
54
+ #
55
+ # * Brown, MB. (1977) Algorithm AS 116: the tetrachoric correlation and its standard error. <em>Applied Statistics, 26</em>, 343-351.
56
+ # * Drasgow F. (2006). Polychoric and polyserial correlations. In Kotz L, Johnson NL (Eds.), Encyclopedia of statistical sciences. Vol. 7 (pp. 69-74). New York: Wiley.
57
+ # * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
58
+
59
+ class Tetrachoric
60
+ include Summarizable
61
+ attr_reader :r
62
+ attr_accessor :name
63
+
64
+ TWOPI=Math::PI*2
65
+ SQT2PI= 2.50662827
66
+ RLIMIT = 0.9999
67
+ RCUT= 0.95
68
+ UPLIM= 5.0
69
+ CONST= 1E-36
70
+ CHALF= 1E-18
71
+ CONV =1E-8
72
+ CITER = 1E-6
73
+ NITER = 25
74
+ X=[0,0.9972638618, 0.9856115115, 0.9647622556, 0.9349060759, 0.8963211558, 0.8493676137, 0.7944837960, 0.7321821187, 0.6630442669, 0.5877157572, 0.5068999089, 0.4213512761, 0.3318686023, 0.2392873623, 0.1444719616, 0.0483076657]
75
+ W=[0, 0.0070186100, 0.0162743947, 0.0253920653, 0.0342738629, 0.0428358980, 0.0509980593, 0.0586840935, 0.0658222228, 0.0723457941, 0.0781938958, 0.0833119242, 0.0876520930, 0.0911738787, 0.0938443991, 0.0956387201, 0.0965400885]
76
+ # Creates a Tetrachoric object based on a 2x2 Matrix.
77
+ def self.new_with_matrix(m)
78
+ Tetrachoric.new(m[0,0], m[0,1], m[1,0],m[1,1])
79
+ end
80
+ # Creates a Tetrachoric object based on two vectors.
81
+ # The vectors are dichotomized previously.
82
+ def self.new_with_vectors(v1,v2)
83
+ v1a, v2a=Statsample.only_valid(v1,v2)
84
+ v1a=v1a.dichotomize
85
+ v2a=v2a.dichotomize
86
+ raise "v1 have only 0" if v1a.factors==[0]
87
+ raise "v2 have only 0" if v2a.factors==[0]
88
+ a,b,c,d = 0,0,0,0
89
+ v1a.each_index{|i|
90
+ x,y=v1a[i],v2a[i]
91
+ a+=1 if x==0 and y==0
92
+ b+=1 if x==0 and y==1
93
+ c+=1 if x==1 and y==0
94
+ d+=1 if x==1 and y==1
95
+ }
96
+ Tetrachoric.new(a,b,c,d)
97
+ end
98
+ # Standard error
99
+ def se
100
+ @sdr
101
+ end
102
+ # Threshold for variable x (rows)
103
+ # Point on gauss curve under X rater select cases
104
+ def threshold_x
105
+ @zab
106
+ end
107
+
108
+ # Threshold for variable y (columns)
109
+ # Point on gauss curve under Y rater select cases
110
+
111
+ def threshold_y
112
+ @zac
113
+ end
114
+ def report_building(generator) # :nodoc:
115
+ generator.section(:name=>@name) do |s|
116
+ s.table(:name=>_("Contingence Table"),:header=>["","Y=0","Y=1", "T"]) do |t|
117
+ t.row(["X=0", @a,@b,@a+@b])
118
+ t.row(["X=1", @c,@d,@c+@d])
119
+ t.hr
120
+ t.row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
121
+ end
122
+ s.text(sprintf("r: %0.3f",r))
123
+ s.text(_("SE: %0.3f") % se)
124
+ s.text(_("Threshold X: %0.3f ") % threshold_x)
125
+ s.text(_("Threshold Y: %0.3f ") % threshold_y )
126
+ end
127
+ end
128
+
129
+ # Creates a new tetrachoric object for analysis
130
+ def initialize(a,b,c,d)
131
+ @a,@b,@c,@d=a,b,c,d
132
+ @name=_("Tetrachoric correlation")
133
+ #
134
+ # CHECK IF ANY CELL FREQUENCY IS NEGATIVE
135
+ #
136
+ raise "All frequencies should be positive" if (@a < 0 or @b < 0 or @c < 0 or @d < 0)
137
+ compute
138
+ end
139
+ # Compute the tetrachoric correlation.
140
+ # Called on object creation.
141
+ #
142
+ def compute
143
+
144
+ #
145
+ # INITIALIZATION
146
+ #
147
+ @r = 0
148
+ sdzero = 0
149
+ @sdr = 0
150
+ @itype = 0
151
+ @ifault = 0
152
+
153
+ #
154
+ # CHECK IF ANY FREQUENCY IS 0.0 AND SET kdelta
155
+ #
156
+ @kdelta = 1
157
+ delta = 0
158
+ @kdelta = 2 if (@a == 0 or @d == 0)
159
+ @kdelta += 2 if (@b == 0 or @c == 0)
160
+ #
161
+ # kdelta=4 MEANS TABLE HAS 0.0 ROW OR COLUMN, RUN IS TERMINATED
162
+ #
163
+
164
+ raise "Rows and columns should have more than 0 items" if @kdelta==4
165
+
166
+ # GOTO (4, 1, 2 , 92), kdelta
167
+ #
168
+ # delta IS 0.0, 0.5 OR -0.5 ACCORDING TO WHICH CELL IS 0.0
169
+ #
170
+
171
+ if(@kdelta==2)
172
+ # 1
173
+ delta=0.5
174
+ @r=-1 if (@a==0 and @d==0)
175
+ elsif(@kdelta==3)
176
+ # 2
177
+ delta=-0.5
178
+ @r=1 if (@b==0 and @c==0)
179
+ end
180
+ # 4
181
+ if @r!=0
182
+ @itype=3
183
+ end
184
+
185
+ #
186
+ # STORE FREQUENCIES IN AA, BB, CC AND DD
187
+ #
188
+ @aa = @a + delta
189
+ @bb = @b - delta
190
+ @cc = @c - delta
191
+ @dd = @d + delta
192
+ @tot = @aa+@bb+@cc+@dd
193
+ #
194
+ # CHECK IF CORRELATION IS NEGATIVE, 0.0, POSITIVE
195
+ # IF (AA * DD - BB * CC) 7, 5, 6
196
+
197
+ corr_dir=@aa * @dd - @bb * @cc
198
+ if(corr_dir < 0)
199
+ # 7
200
+ @probaa = @bb.quo(@tot)
201
+ @probac = (@bb + @dd).quo(@tot)
202
+ @ksign = 2
203
+ # -> 8
204
+ else
205
+ if (corr_dir==0)
206
+ # 5
207
+ @itype=4
208
+ end
209
+ # 6
210
+ #
211
+ # COMPUTE PROBABILITIES OF QUADRANT AND OF MARGINALS
212
+ # PROBAA AND PROBAC CHOSEN SO THAT CORRELATION IS POSITIVE.
213
+ # KSIGN INDICATES WHETHER QUADRANTS HAVE BEEN SWITCHED
214
+ #
215
+
216
+ @probaa = @aa.quo(@tot)
217
+ @probac = (@aa+@cc).quo(@tot)
218
+ @ksign=1
219
+ end
220
+ # 8
221
+
222
+ @probab = (@aa+@bb).quo(@tot)
223
+
224
+ #
225
+ # COMPUTE NORMAL DEVIATES FOR THE MARGINAL FREQUENCIES
226
+ # SINCE NO MARGINAL CAN BE 0.0, IE IS NOT CHECKED
227
+ #
228
+ @zac = Distribution::Normal.p_value(@probac)
229
+ @zab = Distribution::Normal.p_value(@probab)
230
+ @ss = Math::exp(-0.5 * (@zac ** 2 + @zab ** 2)).quo(TWOPI)
231
+ #
232
+ # WHEN R IS 0.0, 1.0 OR -1.0, TRANSFER TO COMPUTE SDZERO
233
+ #
234
+ if (@r != 0 or @itype > 0)
235
+ compute_sdzero
236
+ return true
237
+ end
238
+ #
239
+ # WHEN MARGINALS ARE EQUAL, COSINE EVALUATION IS USED
240
+ #
241
+ if (@a == @b and @b == @c)
242
+ calculate_cosine
243
+ return true
244
+ end
245
+ #
246
+ # INITIAL ESTIMATE OF CORRELATION IS YULES Y
247
+ #
248
+ @rr = ((Math::sqrt(@aa * @dd) - Math::sqrt(@bb * @cc)) ** 2) / (@aa * @dd - @bb * @cc).abs
249
+ @iter = 0
250
+ begin
251
+ #
252
+ # IF RR EXCEEDS RCUT, GAUSSIAN QUADRATURE IS USED
253
+ #
254
+ #10
255
+ if @rr>RCUT
256
+ gaussian_quadrature
257
+ return true
258
+ end
259
+ #
260
+ # TETRACHORIC SERIES IS COMPUTED
261
+ #
262
+ # INITIALIZATION
263
+ #
264
+ va=1.0
265
+ vb=@zac.to_f
266
+ wa=1.0
267
+ wb=@zab.to_f
268
+ term = 1.0
269
+ iterm = 0.0
270
+ @sum = @probab * @probac
271
+ deriv = 0.0
272
+ sr = @ss
273
+ #15
274
+ begin
275
+ if(sr.abs<=CONST)
276
+ #
277
+ # RESCALE TERMS TO AVOID OVERFLOWS AND UNDERFLOWS
278
+ #
279
+ sr = sr / CONST
280
+ va = va * CHALF
281
+ vb = vb * CHALF
282
+ wa = wa * CHALF
283
+ wb = wb * CHALF
284
+ end
285
+ #
286
+ # FORM SUM AND DERIVATIVE OF SERIES
287
+ #
288
+ # 20
289
+ dr = sr * va * wa
290
+ sr = sr * @rr / term
291
+ cof = sr * va * wa
292
+ #
293
+ # ITERM COUNTS NO. OF CONSECUTIVE TERMS < CONV
294
+ #
295
+ iterm+= 1
296
+ iterm=0 if (cof.abs > CONV)
297
+ @sum = @sum + cof
298
+ deriv += dr
299
+ vaa = va
300
+ waa = wa
301
+ va = vb
302
+ wa = wb
303
+ vb = @zac * va - term * vaa
304
+ wb = @zab * wa - term * waa
305
+ term += 1
306
+ end while (iterm < 2 or term < 6)
307
+ #
308
+ # CHECK IF ITERATION CONVERGED
309
+ #
310
+ if((@sum-@probaa).abs <= CITER)
311
+ @itype=term
312
+ calculate_sdr
313
+ return true
314
+ end
315
+ #
316
+ # CALCULATE NEXT ESTIMATE OF CORRELATION
317
+ #
318
+ #25
319
+ @iter += 1
320
+ #
321
+ # IF TOO MANY ITERATlONS, RUN IS TERMINATED
322
+ #
323
+ delta = (@sum - @probaa) / deriv
324
+ @rrprev = @rr
325
+ @rr = @rr - delta
326
+ @rr += 0.5 * delta if(@iter == 1)
327
+ @rr= RLIMIT if (@rr > RLIMIT)
328
+ @rr =0 if (@rr < 0.0)
329
+ end while @iter < NITER
330
+ raise "Too many iteration"
331
+ # GOTO 10
332
+ end
333
+ # GAUSSIAN QUADRATURE
334
+ # 40
335
+ def gaussian_quadrature
336
+ if(@iter==0)
337
+ # INITIALIZATION, IF THIS IS FIRST ITERATION
338
+ @sum=@probab*@probac
339
+ @rrprev=0
340
+ end
341
+
342
+ # 41
343
+ sumprv = @probab - @sum
344
+ @prob = @bb.quo(@tot)
345
+ @prob = @aa.quo(@tot) if (@ksign == 2)
346
+ @itype = 1
347
+ #
348
+ # LOOP TO FIND ESTIMATE OF CORRELATION
349
+ # COMPUTATION OF INTEGRAL (SUM) BY QUADRATURE
350
+ #
351
+ # 42
352
+
353
+ begin
354
+ rrsq = Math::sqrt(1 - @rr ** 2)
355
+ amid = 0.5 * (UPLIM + @zac)
356
+ xlen = UPLIM - amid
357
+ @sum = 0
358
+ (1..16).each do |iquad|
359
+ xla = amid + X[iquad] * xlen
360
+ xlb = amid - X[iquad] * xlen
361
+
362
+
363
+ #
364
+ # TO AVOID UNDERFLOWS, TEMPA AND TEMPB ARE USED
365
+ #
366
+ tempa = (@zab - @rr * xla) / rrsq
367
+ if (tempa >= -6.0)
368
+ @sum = @sum + W[iquad] * Math::exp(-0.5 * xla ** 2) * Distribution::Normal.cdf(tempa)
369
+ end
370
+ tempb = (@zab - @rr * xlb) / rrsq
371
+
372
+ if (tempb >= -6.0)
373
+ @sum = @sum + W[iquad] * Math::exp(-0.5 * xlb ** 2) * Distribution::Normal.cdf(tempb)
374
+ end
375
+ end # 44 ~ iquad
376
+ @sum=@sum*xlen / SQT2PI
377
+ #
378
+ # CHECK IF ITERATION HAS CONVERGED
379
+ #
380
+ if ((@prob - @sum).abs <= CITER)
381
+ calculate_sdr
382
+ return true
383
+ end
384
+ # ESTIMATE CORRELATION FOR NEXT ITERATION BY LINEAR INTERPOLATION
385
+
386
+ rrest = ((@prob - @sum) * @rrprev - (@prob - sumprv) * @rr) / (sumprv - @sum)
387
+ rrest = RLIMIT if (rrest > RLIMIT)
388
+ rrest = 0 if (rrest < 0)
389
+ @rrprev = @rr
390
+ @rr = rrest
391
+ sumprv = @sum
392
+ #
393
+ # if estimate has same value on two iterations, stop iteration
394
+ #
395
+ if @rr == @rrprev
396
+ calculate_sdr
397
+ return true
398
+ end
399
+
400
+
401
+ end while @iter < NITER
402
+ raise "Too many iterations"
403
+ # ir a 42
404
+ end
405
+ def calculate_cosine
406
+ #
407
+ # WHEN ALL MARGINALS ARE EQUAL THE COSINE FUNCTION IS USED
408
+ #
409
+ @rr = -Math::cos(TWOPI * @probaa)
410
+ @itype = 2
411
+ calculate_sdr
412
+ end
413
+
414
+
415
+ def calculate_sdr # :nodoc:
416
+ #
417
+ # COMPUTE SDR
418
+ #
419
+ @r = @rr
420
+ rrsq = Math::sqrt(1.0 - @r ** 2)
421
+ @itype = -@itype if (@kdelta > 1)
422
+ if (@ksign != 1)
423
+ @r = -@r
424
+ @zac = -@zac
425
+ end
426
+ # 71
427
+ pdf = Math::exp(-0.5 * (@zac ** 2 - 2 * @r * @zac * @zab + @zab ** 2) / rrsq ** 2) / (TWOPI * rrsq)
428
+ @pac = Distribution::Normal.cdf((@zac - @r * @zab) / rrsq) - 0.5
429
+ @pab = Distribution::Normal.cdf((@zab - @r * @zac) / rrsq) - 0.5
430
+
431
+ @sdr = ((@aa+@dd) * (@bb + @cc)).quo(4) + @pab ** 2 * (@aa + @cc) * (@bb + @dd) + @pac ** 2 * (@aa + @bb) * (@cc + @dd) + 2.0 * @pab * @pac * (@aa * @dd - @bb * @cc) - @pab * (@aa * @bb - @cc * @dd) - @pac * (@aa * @cc - @bb * @dd)
432
+ @sdr=0 if (@sdr<0)
433
+ @sdr= Math::sqrt(@sdr) / (@tot * pdf * Math::sqrt(@tot))
434
+ compute_sdzero
435
+ end
436
+
437
+ # 85
438
+ #
439
+ # COMPUTE SDZERO
440
+ #
441
+ def compute_sdzero
442
+ @sdzero = Math::sqrt(((@aa + @bb) * (@aa + @cc) * (@bb + @dd) * (@cc + @dd)).quo(@tot)).quo(@tot ** 2 * @ss)
443
+ @sdr = @sdzero if (@r == 0)
444
+ end
445
+ private :calculate_cosine, :calculate_sdr, :compute_sdzero, :compute, :gaussian_quadrature
446
+ end
447
+ end
448
+ end
449
+
450
+
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
+ $:.unshift File.join(File.dirname(__FILE__), '..', 'lib')
4
+ require "rubygems"
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+ require 'statsample'
8
+ require 'statsample/bivariate/tetrachoric'
9
+ require 'statsample/bivariate/polychoric'