statsample-bivariate-extension 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,450 @@
1
+ module Statsample
2
+ module Bivariate
3
+ # Calculate Tetrachoric correlation for two vectors.
4
+ def self.tetrachoric(v1,v2)
5
+ tc=Tetrachoric.new_with_vectors(v1,v2)
6
+ tc.r
7
+ end
8
+
9
+ # Tetrachoric correlation matrix.
10
+ # Order of rows and columns depends on Dataset#fields order
11
+ def self.tetrachoric_correlation_matrix(ds)
12
+ ds.collect_matrix do |row,col|
13
+ if row==col
14
+ 1.0
15
+ else
16
+ begin
17
+ tetrachoric(ds[row],ds[col])
18
+ rescue RuntimeError
19
+ nil
20
+ end
21
+ end
22
+ end
23
+ end
24
+ # Compute tetrachoric correlation.
25
+ #
26
+ # The <em>tetrachoric</em> correlation is a measure of
27
+ # bivariate association arising when both observed variates
28
+ # are categorical variables that result from dichotomizing
29
+ # the two undelying continuous variables (Drasgow, 2006).
30
+ # The tetrachoric correlation is a good way to measure rater agreement (Uebersax, 2006)
31
+ #
32
+ # This class uses Brown (1977) algorithm. You can see FORTRAN code on http://lib.stat.cmu.edu/apstat/116
33
+ #
34
+ #
35
+ # == Usage
36
+ # With two variables x and y on a crosstab like this:
37
+ #
38
+ # -------------
39
+ # | y=0 | y=1 |
40
+ # -------------
41
+ # x = 0 | a | b |
42
+ # -------------
43
+ # x = 1 | c | d |
44
+ # -------------
45
+ #
46
+ # The code will be
47
+ # tc=Statsample::Bivariate::Tetrachoric.new(a,b,c,d)
48
+ # tc.r # correlation
49
+ # tc.se # standard error
50
+ # tc.threshold_y # threshold for y variable
51
+ # tc.threshold_x # threshold for x variable
52
+ #
53
+ # == References:
54
+ #
55
+ # * Brown, MB. (1977) Algorithm AS 116: the tetrachoric correlation and its standard error. <em>Applied Statistics, 26</em>, 343-351.
56
+ # * Drasgow F. (2006). Polychoric and polyserial correlations. In Kotz L, Johnson NL (Eds.), Encyclopedia of statistical sciences. Vol. 7 (pp. 69-74). New York: Wiley.
57
+ # * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February, 11, 2010
58
+
59
+ class Tetrachoric
60
+ include Summarizable
61
+ attr_reader :r
62
+ attr_accessor :name
63
+
64
+ TWOPI=Math::PI*2
65
+ SQT2PI= 2.50662827
66
+ RLIMIT = 0.9999
67
+ RCUT= 0.95
68
+ UPLIM= 5.0
69
+ CONST= 1E-36
70
+ CHALF= 1E-18
71
+ CONV =1E-8
72
+ CITER = 1E-6
73
+ NITER = 25
74
+ X=[0,0.9972638618, 0.9856115115, 0.9647622556, 0.9349060759, 0.8963211558, 0.8493676137, 0.7944837960, 0.7321821187, 0.6630442669, 0.5877157572, 0.5068999089, 0.4213512761, 0.3318686023, 0.2392873623, 0.1444719616, 0.0483076657]
75
+ W=[0, 0.0070186100, 0.0162743947, 0.0253920653, 0.0342738629, 0.0428358980, 0.0509980593, 0.0586840935, 0.0658222228, 0.0723457941, 0.0781938958, 0.0833119242, 0.0876520930, 0.0911738787, 0.0938443991, 0.0956387201, 0.0965400885]
76
+ # Creates a Tetrachoric object based on a 2x2 Matrix.
77
+ def self.new_with_matrix(m)
78
+ Tetrachoric.new(m[0,0], m[0,1], m[1,0],m[1,1])
79
+ end
80
+ # Creates a Tetrachoric object based on two vectors.
81
+ # The vectors are dichotomized previously.
82
+ def self.new_with_vectors(v1,v2)
83
+ v1a, v2a=Statsample.only_valid(v1,v2)
84
+ v1a=v1a.dichotomize
85
+ v2a=v2a.dichotomize
86
+ raise "v1 have only 0" if v1a.factors==[0]
87
+ raise "v2 have only 0" if v2a.factors==[0]
88
+ a,b,c,d = 0,0,0,0
89
+ v1a.each_index{|i|
90
+ x,y=v1a[i],v2a[i]
91
+ a+=1 if x==0 and y==0
92
+ b+=1 if x==0 and y==1
93
+ c+=1 if x==1 and y==0
94
+ d+=1 if x==1 and y==1
95
+ }
96
+ Tetrachoric.new(a,b,c,d)
97
+ end
98
+ # Standard error
99
+ def se
100
+ @sdr
101
+ end
102
+ # Threshold for variable x (rows)
103
+ # Point on gauss curve under X rater select cases
104
+ def threshold_x
105
+ @zab
106
+ end
107
+
108
+ # Threshold for variable y (columns)
109
+ # Point on gauss curve under Y rater select cases
110
+
111
+ def threshold_y
112
+ @zac
113
+ end
114
+ def report_building(generator) # :nodoc:
115
+ generator.section(:name=>@name) do |s|
116
+ s.table(:name=>_("Contingence Table"),:header=>["","Y=0","Y=1", "T"]) do |t|
117
+ t.row(["X=0", @a,@b,@a+@b])
118
+ t.row(["X=1", @c,@d,@c+@d])
119
+ t.hr
120
+ t.row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
121
+ end
122
+ s.text(sprintf("r: %0.3f",r))
123
+ s.text(_("SE: %0.3f") % se)
124
+ s.text(_("Threshold X: %0.3f ") % threshold_x)
125
+ s.text(_("Threshold Y: %0.3f ") % threshold_y )
126
+ end
127
+ end
128
+
129
+ # Creates a new tetrachoric object for analysis
130
+ def initialize(a,b,c,d)
131
+ @a,@b,@c,@d=a,b,c,d
132
+ @name=_("Tetrachoric correlation")
133
+ #
134
+ # CHECK IF ANY CELL FREQUENCY IS NEGATIVE
135
+ #
136
+ raise "All frequencies should be positive" if (@a < 0 or @b < 0 or @c < 0 or @d < 0)
137
+ compute
138
+ end
139
+ # Compute the tetrachoric correlation.
140
+ # Called on object creation.
141
+ #
142
+ def compute
143
+
144
+ #
145
+ # INITIALIZATION
146
+ #
147
+ @r = 0
148
+ sdzero = 0
149
+ @sdr = 0
150
+ @itype = 0
151
+ @ifault = 0
152
+
153
+ #
154
+ # CHECK IF ANY FREQUENCY IS 0.0 AND SET kdelta
155
+ #
156
+ @kdelta = 1
157
+ delta = 0
158
+ @kdelta = 2 if (@a == 0 or @d == 0)
159
+ @kdelta += 2 if (@b == 0 or @c == 0)
160
+ #
161
+ # kdelta=4 MEANS TABLE HAS 0.0 ROW OR COLUMN, RUN IS TERMINATED
162
+ #
163
+
164
+ raise "Rows and columns should have more than 0 items" if @kdelta==4
165
+
166
+ # GOTO (4, 1, 2 , 92), kdelta
167
+ #
168
+ # delta IS 0.0, 0.5 OR -0.5 ACCORDING TO WHICH CELL IS 0.0
169
+ #
170
+
171
+ if(@kdelta==2)
172
+ # 1
173
+ delta=0.5
174
+ @r=-1 if (@a==0 and @d==0)
175
+ elsif(@kdelta==3)
176
+ # 2
177
+ delta=-0.5
178
+ @r=1 if (@b==0 and @c==0)
179
+ end
180
+ # 4
181
+ if @r!=0
182
+ @itype=3
183
+ end
184
+
185
+ #
186
+ # STORE FREQUENCIES IN AA, BB, CC AND DD
187
+ #
188
+ @aa = @a + delta
189
+ @bb = @b - delta
190
+ @cc = @c - delta
191
+ @dd = @d + delta
192
+ @tot = @aa+@bb+@cc+@dd
193
+ #
194
+ # CHECK IF CORRELATION IS NEGATIVE, 0.0, POSITIVE
195
+ # IF (AA * DD - BB * CC) 7, 5, 6
196
+
197
+ corr_dir=@aa * @dd - @bb * @cc
198
+ if(corr_dir < 0)
199
+ # 7
200
+ @probaa = @bb.quo(@tot)
201
+ @probac = (@bb + @dd).quo(@tot)
202
+ @ksign = 2
203
+ # -> 8
204
+ else
205
+ if (corr_dir==0)
206
+ # 5
207
+ @itype=4
208
+ end
209
+ # 6
210
+ #
211
+ # COMPUTE PROBABILITIES OF QUADRANT AND OF MARGINALS
212
+ # PROBAA AND PROBAC CHOSEN SO THAT CORRELATION IS POSITIVE.
213
+ # KSIGN INDICATES WHETHER QUADRANTS HAVE BEEN SWITCHED
214
+ #
215
+
216
+ @probaa = @aa.quo(@tot)
217
+ @probac = (@aa+@cc).quo(@tot)
218
+ @ksign=1
219
+ end
220
+ # 8
221
+
222
+ @probab = (@aa+@bb).quo(@tot)
223
+
224
+ #
225
+ # COMPUTE NORMAL DEVIATES FOR THE MARGINAL FREQUENCIES
226
+ # SINCE NO MARGINAL CAN BE 0.0, IE IS NOT CHECKED
227
+ #
228
+ @zac = Distribution::Normal.p_value(@probac)
229
+ @zab = Distribution::Normal.p_value(@probab)
230
+ @ss = Math::exp(-0.5 * (@zac ** 2 + @zab ** 2)).quo(TWOPI)
231
+ #
232
+ # WHEN R IS 0.0, 1.0 OR -1.0, TRANSFER TO COMPUTE SDZERO
233
+ #
234
+ if (@r != 0 or @itype > 0)
235
+ compute_sdzero
236
+ return true
237
+ end
238
+ #
239
+ # WHEN MARGINALS ARE EQUAL, COSINE EVALUATION IS USED
240
+ #
241
+ if (@a == @b and @b == @c)
242
+ calculate_cosine
243
+ return true
244
+ end
245
+ #
246
+ # INITIAL ESTIMATE OF CORRELATION IS YULES Y
247
+ #
248
+ @rr = ((Math::sqrt(@aa * @dd) - Math::sqrt(@bb * @cc)) ** 2) / (@aa * @dd - @bb * @cc).abs
249
+ @iter = 0
250
+ begin
251
+ #
252
+ # IF RR EXCEEDS RCUT, GAUSSIAN QUADRATURE IS USED
253
+ #
254
+ #10
255
+ if @rr>RCUT
256
+ gaussian_quadrature
257
+ return true
258
+ end
259
+ #
260
+ # TETRACHORIC SERIES IS COMPUTED
261
+ #
262
+ # INITIALIZATION
263
+ #
264
+ va=1.0
265
+ vb=@zac.to_f
266
+ wa=1.0
267
+ wb=@zab.to_f
268
+ term = 1.0
269
+ iterm = 0.0
270
+ @sum = @probab * @probac
271
+ deriv = 0.0
272
+ sr = @ss
273
+ #15
274
+ begin
275
+ if(sr.abs<=CONST)
276
+ #
277
+ # RESCALE TERMS TO AVOID OVERFLOWS AND UNDERFLOWS
278
+ #
279
+ sr = sr / CONST
280
+ va = va * CHALF
281
+ vb = vb * CHALF
282
+ wa = wa * CHALF
283
+ wb = wb * CHALF
284
+ end
285
+ #
286
+ # FORM SUM AND DERIVATIVE OF SERIES
287
+ #
288
+ # 20
289
+ dr = sr * va * wa
290
+ sr = sr * @rr / term
291
+ cof = sr * va * wa
292
+ #
293
+ # ITERM COUNTS NO. OF CONSECUTIVE TERMS < CONV
294
+ #
295
+ iterm+= 1
296
+ iterm=0 if (cof.abs > CONV)
297
+ @sum = @sum + cof
298
+ deriv += dr
299
+ vaa = va
300
+ waa = wa
301
+ va = vb
302
+ wa = wb
303
+ vb = @zac * va - term * vaa
304
+ wb = @zab * wa - term * waa
305
+ term += 1
306
+ end while (iterm < 2 or term < 6)
307
+ #
308
+ # CHECK IF ITERATION CONVERGED
309
+ #
310
+ if((@sum-@probaa).abs <= CITER)
311
+ @itype=term
312
+ calculate_sdr
313
+ return true
314
+ end
315
+ #
316
+ # CALCULATE NEXT ESTIMATE OF CORRELATION
317
+ #
318
+ #25
319
+ @iter += 1
320
+ #
321
+ # IF TOO MANY ITERATlONS, RUN IS TERMINATED
322
+ #
323
+ delta = (@sum - @probaa) / deriv
324
+ @rrprev = @rr
325
+ @rr = @rr - delta
326
+ @rr += 0.5 * delta if(@iter == 1)
327
+ @rr= RLIMIT if (@rr > RLIMIT)
328
+ @rr =0 if (@rr < 0.0)
329
+ end while @iter < NITER
330
+ raise "Too many iteration"
331
+ # GOTO 10
332
+ end
333
+ # GAUSSIAN QUADRATURE
334
+ # 40
335
+ def gaussian_quadrature
336
+ if(@iter==0)
337
+ # INITIALIZATION, IF THIS IS FIRST ITERATION
338
+ @sum=@probab*@probac
339
+ @rrprev=0
340
+ end
341
+
342
+ # 41
343
+ sumprv = @probab - @sum
344
+ @prob = @bb.quo(@tot)
345
+ @prob = @aa.quo(@tot) if (@ksign == 2)
346
+ @itype = 1
347
+ #
348
+ # LOOP TO FIND ESTIMATE OF CORRELATION
349
+ # COMPUTATION OF INTEGRAL (SUM) BY QUADRATURE
350
+ #
351
+ # 42
352
+
353
+ begin
354
+ rrsq = Math::sqrt(1 - @rr ** 2)
355
+ amid = 0.5 * (UPLIM + @zac)
356
+ xlen = UPLIM - amid
357
+ @sum = 0
358
+ (1..16).each do |iquad|
359
+ xla = amid + X[iquad] * xlen
360
+ xlb = amid - X[iquad] * xlen
361
+
362
+
363
+ #
364
+ # TO AVOID UNDERFLOWS, TEMPA AND TEMPB ARE USED
365
+ #
366
+ tempa = (@zab - @rr * xla) / rrsq
367
+ if (tempa >= -6.0)
368
+ @sum = @sum + W[iquad] * Math::exp(-0.5 * xla ** 2) * Distribution::Normal.cdf(tempa)
369
+ end
370
+ tempb = (@zab - @rr * xlb) / rrsq
371
+
372
+ if (tempb >= -6.0)
373
+ @sum = @sum + W[iquad] * Math::exp(-0.5 * xlb ** 2) * Distribution::Normal.cdf(tempb)
374
+ end
375
+ end # 44 ~ iquad
376
+ @sum=@sum*xlen / SQT2PI
377
+ #
378
+ # CHECK IF ITERATION HAS CONVERGED
379
+ #
380
+ if ((@prob - @sum).abs <= CITER)
381
+ calculate_sdr
382
+ return true
383
+ end
384
+ # ESTIMATE CORRELATION FOR NEXT ITERATION BY LINEAR INTERPOLATION
385
+
386
+ rrest = ((@prob - @sum) * @rrprev - (@prob - sumprv) * @rr) / (sumprv - @sum)
387
+ rrest = RLIMIT if (rrest > RLIMIT)
388
+ rrest = 0 if (rrest < 0)
389
+ @rrprev = @rr
390
+ @rr = rrest
391
+ sumprv = @sum
392
+ #
393
+ # if estimate has same value on two iterations, stop iteration
394
+ #
395
+ if @rr == @rrprev
396
+ calculate_sdr
397
+ return true
398
+ end
399
+
400
+
401
+ end while @iter < NITER
402
+ raise "Too many iterations"
403
+ # ir a 42
404
+ end
405
+ def calculate_cosine
406
+ #
407
+ # WHEN ALL MARGINALS ARE EQUAL THE COSINE FUNCTION IS USED
408
+ #
409
+ @rr = -Math::cos(TWOPI * @probaa)
410
+ @itype = 2
411
+ calculate_sdr
412
+ end
413
+
414
+
415
+ def calculate_sdr # :nodoc:
416
+ #
417
+ # COMPUTE SDR
418
+ #
419
+ @r = @rr
420
+ rrsq = Math::sqrt(1.0 - @r ** 2)
421
+ @itype = -@itype if (@kdelta > 1)
422
+ if (@ksign != 1)
423
+ @r = -@r
424
+ @zac = -@zac
425
+ end
426
+ # 71
427
+ pdf = Math::exp(-0.5 * (@zac ** 2 - 2 * @r * @zac * @zab + @zab ** 2) / rrsq ** 2) / (TWOPI * rrsq)
428
+ @pac = Distribution::Normal.cdf((@zac - @r * @zab) / rrsq) - 0.5
429
+ @pab = Distribution::Normal.cdf((@zab - @r * @zac) / rrsq) - 0.5
430
+
431
+ @sdr = ((@aa+@dd) * (@bb + @cc)).quo(4) + @pab ** 2 * (@aa + @cc) * (@bb + @dd) + @pac ** 2 * (@aa + @bb) * (@cc + @dd) + 2.0 * @pab * @pac * (@aa * @dd - @bb * @cc) - @pab * (@aa * @bb - @cc * @dd) - @pac * (@aa * @cc - @bb * @dd)
432
+ @sdr=0 if (@sdr<0)
433
+ @sdr= Math::sqrt(@sdr) / (@tot * pdf * Math::sqrt(@tot))
434
+ compute_sdzero
435
+ end
436
+
437
+ # 85
438
+ #
439
+ # COMPUTE SDZERO
440
+ #
441
+ def compute_sdzero
442
+ @sdzero = Math::sqrt(((@aa + @bb) * (@aa + @cc) * (@bb + @dd) * (@cc + @dd)).quo(@tot)).quo(@tot ** 2 * @ss)
443
+ @sdr = @sdzero if (@r == 0)
444
+ end
445
+ private :calculate_cosine, :calculate_sdr, :compute_sdzero, :compute, :gaussian_quadrature
446
+ end
447
+ end
448
+ end
449
+
450
+
@@ -0,0 +1,9 @@
1
+ # encoding: utf-8
2
+
3
+ $:.unshift File.join(File.dirname(__FILE__), '..', 'lib')
4
+ require "rubygems"
5
+ require 'spec'
6
+ require 'spec/autorun'
7
+ require 'statsample'
8
+ require 'statsample/bivariate/tetrachoric'
9
+ require 'statsample/bivariate/polychoric'