statsample 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,418 @@
1
module Statsample
  module Bivariate
    # Calculate the tetrachoric correlation for two vectors.
    #
    # The vectors are handed to Tetrachoric.new_with_vectors, which
    # dichotomizes them and builds the 2x2 table.
    # Returns the estimated correlation (Tetrachoric#r).
    # Raises RuntimeError when the table cannot be processed.
    def self.tetrachoric(v1, v2)
      Tetrachoric.new_with_vectors(v1, v2).r
    end

    # Tetrachoric correlation matrix.
    # Order of rows and columns depends on Dataset#fields order.
    #
    # The diagonal is 1.0. A cell whose correlation raises RuntimeError
    # (for instance an empty row or column in the 2x2 table) is set to nil
    # on purpose, so that one degenerate pair does not abort the whole matrix.
    def self.tetrachoric_correlation_matrix(ds)
      ds.collect_matrix do |row, col|
        if row == col
          1.0
        else
          begin
            tetrachoric(ds[row], ds[col])
          rescue RuntimeError
            nil
          end
        end
      end
    end

    #
    # Compute tetrachoric correlation.
    #
    # See http://www.john-uebersax.com/stat/tetra.htm for extensive
    # documentation about tetrachoric correlation.
    #
    # This class uses algorithm AS116 from Applied Statistics(1977)
    # vol.26, no.3.
    #
    # You can see FORTRAN code on http://lib.stat.cmu.edu/apstat/116
    #
    # <b>Usage</b>.
    # With two variables x and y on a crosstab like this:
    #
    #           -------------
    #           | y=0 | y=1 |
    #           -------------
    #   x = 0   |  a  |  b  |
    #           -------------
    #   x = 1   |  c  |  d  |
    #           -------------
    #
    # Use:
    #   tc=Statsample::Bivariate::Tetrachoric.new(a,b,c,d)
    #   tc.r # correlation
    #   tc.se # standard error
    #   tc.threshold_y # threshold for y variable
    #   tc.threshold_x # threshold for x variable
    #
    # The numbered comments inside the methods (# 10, # 42, # 70 ...) are
    # the statement labels of the original FORTRAN routine.
    class Tetrachoric
      # Estimated tetrachoric correlation.
      attr_reader :r

      TWOPI = Math::PI * 2
      SQT2PI = 2.50662827 # sqrt(2*PI)
      RLIMIT = 0.9999     # largest estimate of |r| the iterations may reach
      RCUT = 0.95         # above this estimate, quadrature replaces the series
      UPLIM = 5.0         # upper integration limit of the quadrature
      CONST = 1E-36       # series terms below this magnitude are rescaled
      CHALF = 1E-18
      CONV = 1E-8         # series term size regarded as negligible
      CITER = 1E-6        # convergence tolerance on the cell probability
      NITER = 25          # maximum number of iterations
      # Abscissae (X) and weights (W) of the Gaussian quadrature.
      # Index 0 is padding so indexes match the 1-based FORTRAN arrays.
      X = [0, 0.9972638618, 0.9856115115, 0.9647622556, 0.9349060759, 0.8963211558, 0.8493676137, 0.7944837960, 0.7321821187, 0.6630442669, 0.5877157572, 0.5068999089, 0.4213512761, 0.3318686023, 0.2392873623, 0.1444719616, 0.0483076657].freeze
      W = [0, 0.0070186100, 0.0162743947, 0.0253920653, 0.0342738629, 0.0428358980, 0.0509980593, 0.0586840935, 0.0658222228, 0.0723457941, 0.0781938958, 0.0833119242, 0.0876520930, 0.0911738787, 0.0938443991, 0.0956387201, 0.0965400885].freeze

      # Creates a Tetrachoric object based on two vectors.
      # Only the cases valid on both vectors are used, and each vector is
      # dichotomized before the 2x2 table is counted.
      #
      # Raises RuntimeError if a dichotomized vector contains only 0.
      def self.new_with_vectors(v1, v2)
        v1a, v2a = Statsample.only_valid(v1, v2)
        v1a = v1a.dichotomize
        v2a = v2a.dichotomize
        raise "v1 have only 0" if v1a.factors == [0]
        raise "v2 have only 0" if v2a.factors == [0]
        a, b, c, d = 0, 0, 0, 0
        v1a.each_index do |i|
          x, y = v1a[i], v2a[i]
          a += 1 if x == 0 && y == 0
          b += 1 if x == 0 && y == 1
          c += 1 if x == 1 && y == 0
          d += 1 if x == 1 && y == 1
        end
        Tetrachoric.new(a, b, c, d)
      end

      # Standard error
      def se
        @sdr
      end

      # Threshold for variable x (rows)
      def threshold_x
        @zac
      end

      # Threshold for variable y (columns)
      def threshold_y
        @zab
      end

      # Builds the estimate from the four cell frequencies of the 2x2 table
      # (a: x=0,y=0; b: x=0,y=1; c: x=1,y=0; d: x=1,y=1) and computes it
      # immediately.
      #
      # Raises RuntimeError if any frequency is negative, if the table has
      # an empty row or column, or if the iterations do not converge.
      def initialize(a, b, c, d)
        @a, @b, @c, @d = a, b, c, d
        #
        # CHECK IF ANY CELL FREQUENCY IS NEGATIVE
        #
        raise "All frequencies should be positive" if @a < 0 || @b < 0 || @c < 0 || @d < 0
        compute
      end

      # Runs algorithm AS116: sets @r, @sdr, @zac and @zab.
      def compute
        #
        # INITIALIZATION
        #
        @r = 0
        @sdr = 0
        @itype = 0
        @ifault = 0

        #
        # CHECK IF ANY FREQUENCY IS 0.0 AND SET kdelta
        #
        @kdelta = 1
        delta = 0
        @kdelta = 2 if @a == 0 || @d == 0
        @kdelta += 2 if @b == 0 || @c == 0
        #
        # kdelta=4 MEANS TABLE HAS 0.0 ROW OR COLUMN, RUN IS TERMINATED
        #
        raise "Rows and columns should have more than 0 items" if @kdelta == 4

        #
        # delta IS 0.0, 0.5 OR -0.5 ACCORDING TO WHICH CELL IS 0.0
        #
        if @kdelta == 2
          # 1
          delta = 0.5
          @r = -1 if @a == 0 && @d == 0
        elsif @kdelta == 3
          # 2
          delta = -0.5
          @r = 1 if @b == 0 && @c == 0
        end
        # 4
        @itype = 3 if @r != 0

        #
        # STORE FREQUENCIES IN AA, BB, CC AND DD
        #
        @aa = @a + delta
        @bb = @b - delta
        @cc = @c - delta
        @dd = @d + delta
        @tot = @aa + @bb + @cc + @dd
        #
        # CHECK IF CORRELATION IS NEGATIVE, 0.0, POSITIVE
        #
        corr_dir = @aa * @dd - @bb * @cc
        if corr_dir < 0
          # 7
          @probaa = @bb.quo(@tot)
          @probac = (@bb + @dd).quo(@tot)
          @ksign = 2
        else
          # 5
          @itype = 4 if corr_dir == 0
          # 6
          #
          # COMPUTE PROBABILITIES OF QUADRANT AND OF MARGINALS
          # PROBAA AND PROBAC CHOSEN SO THAT CORRELATION IS POSITIVE.
          # KSIGN INDICATES WHETHER QUADRANTS HAVE BEEN SWITCHED
          #
          @probaa = @aa.quo(@tot)
          @probac = (@aa + @cc).quo(@tot)
          @ksign = 1
        end
        # 8
        @probab = (@aa + @bb).quo(@tot)

        #
        # COMPUTE NORMAL DEVIATES FOR THE MARGINAL FREQUENCIES
        # SINCE NO MARGINAL CAN BE 0.0, IE IS NOT CHECKED
        #
        @zac = Distribution::Normal.p_value(@probac)
        @zab = Distribution::Normal.p_value(@probab)
        @ss = Math.exp(-0.5 * (@zac ** 2 + @zab ** 2)).quo(TWOPI)
        #
        # WHEN R IS 0.0, 1.0 OR -1.0, TRANSFER TO COMPUTE SDZERO
        #
        if @r != 0 || @itype > 0
          compute_sdzero
          return true
        end
        #
        # WHEN MARGINALS ARE EQUAL, COSINE EVALUATION IS USED
        #
        # Both marginals are equal (a+b == c+d and a+c == b+d) exactly when
        # a == d and b == c; only then are both thresholds 0 and the closed
        # form r = -cos(2*PI*probaa) valid.
        if @a == @d && @b == @c
          calculate_cosine
          return true
        end
        #
        # INITIAL ESTIMATE OF CORRELATION IS YULES Y
        #
        @rr = ((Math.sqrt(@aa * @dd) - Math.sqrt(@bb * @cc)) ** 2) / (@aa * @dd - @bb * @cc).abs
        @iter = 0
        loop do
          #
          # IF RR EXCEEDS RCUT, GAUSSIAN QUADRATURE IS USED
          #
          # 10
          if @rr > RCUT
            gaussian_quadrature
            return true
          end
          #
          # TETRACHORIC SERIES IS COMPUTED
          #
          deriv, term = tetrachoric_series
          #
          # CHECK IF ITERATION CONVERGED
          #
          if (@sum - @probaa).abs <= CITER
            @itype = term
            calculate_sdr
            return true
          end
          #
          # CALCULATE NEXT ESTIMATE OF CORRELATION
          #
          # 25
          @iter += 1
          delta = (@sum - @probaa) / deriv
          @rrprev = @rr
          @rr -= delta
          # the first correction is halved
          @rr += 0.5 * delta if @iter == 1
          @rr = RLIMIT if @rr > RLIMIT
          @rr = 0 if @rr < 0.0
          #
          # IF TOO MANY ITERATIONS, RUN IS TERMINATED
          #
          break unless @iter < NITER
        end
        raise "Too many iteration"
      end

      # Evaluates the tetrachoric series at the current estimate @rr.
      # Stores the value of the series in @sum and returns
      # [derivative of the series with respect to rr, number of terms + 1].
      def tetrachoric_series
        #
        # INITIALIZATION
        #
        va = 1.0
        vb = @zac.to_f
        wa = 1.0
        wb = @zab.to_f
        term = 1.0
        iterm = 0.0
        @sum = @probab * @probac
        deriv = 0.0
        sr = @ss
        # 15
        loop do
          if sr.abs <= CONST
            #
            # RESCALE TERMS TO AVOID OVERFLOWS AND UNDERFLOWS
            #
            sr /= CONST
            va *= CHALF
            vb *= CHALF
            wa *= CHALF
            wb *= CHALF
          end
          #
          # FORM SUM AND DERIVATIVE OF SERIES
          #
          # 20
          dr = sr * va * wa
          sr = sr * @rr / term
          cof = sr * va * wa
          #
          # ITERM COUNTS NO. OF CONSECUTIVE TERMS < CONV
          #
          iterm += 1
          iterm = 0 if cof.abs > CONV
          @sum += cof
          deriv += dr
          # advance both recurrences one step (right-hand sides use old values)
          va, vb = vb, @zac * vb - term * va
          wa, wb = wb, @zab * wb - term * wa
          term += 1
          break unless iterm < 2 || term < 6
        end
        [deriv, term]
      end

      # GAUSSIAN QUADRATURE
      #
      # Used when the estimate exceeds RCUT: the cell probability is
      # integrated numerically and the estimate is refined by linear
      # interpolation until it reproduces the observed probability.
      # Raises RuntimeError after NITER iterations without convergence.
      # 40
      def gaussian_quadrature
        if @iter == 0
          # INITIALIZATION, IF THIS IS FIRST ITERATION
          @sum = @probab * @probac
          @rrprev = 0
        end

        # 41
        sumprv = @probab - @sum
        @prob = @ksign == 2 ? @aa.quo(@tot) : @bb.quo(@tot)
        @itype = 1
        #
        # LOOP TO FIND ESTIMATE OF CORRELATION
        # COMPUTATION OF INTEGRAL (SUM) BY QUADRATURE
        #
        # 42
        loop do
          rrsq = Math.sqrt(1 - @rr ** 2)
          amid = 0.5 * (UPLIM + @zac)
          xlen = UPLIM - amid
          @sum = 0
          (1..16).each do |iquad|
            xla = amid + X[iquad] * xlen
            xlb = amid - X[iquad] * xlen
            #
            # TO AVOID UNDERFLOWS, TEMPA AND TEMPB ARE USED
            #
            tempa = (@zab - @rr * xla) / rrsq
            if tempa >= -6.0
              @sum += W[iquad] * Math.exp(-0.5 * xla ** 2) * Distribution::Normal.cdf(tempa)
            end
            tempb = (@zab - @rr * xlb) / rrsq
            if tempb >= -6.0
              @sum += W[iquad] * Math.exp(-0.5 * xlb ** 2) * Distribution::Normal.cdf(tempb)
            end
          end # 44 ~ iquad
          @sum = @sum * xlen / SQT2PI
          #
          # CHECK IF ITERATION HAS CONVERGED
          #
          if (@prob - @sum).abs <= CITER
            calculate_sdr
            return true
          end
          # Count the iteration: without this the NITER guard below could
          # never trigger and a non-converging search would loop forever.
          @iter += 1
          # ESTIMATE CORRELATION FOR NEXT ITERATION BY LINEAR INTERPOLATION
          rrest = ((@prob - @sum) * @rrprev - (@prob - sumprv) * @rr) / (sumprv - @sum)
          rrest = RLIMIT if rrest > RLIMIT
          rrest = 0 if rrest < 0
          @rrprev = @rr
          @rr = rrest
          sumprv = @sum
          #
          # if estimate has same value on two iterations, stop iteration
          #
          if @rr == @rrprev
            calculate_sdr
            return true
          end
          break unless @iter < NITER
        end
        raise "Too many iterations"
        # go to 42
      end

      #
      # WHEN ALL MARGINALS ARE EQUAL THE COSINE FUNCTION IS USED
      #
      def calculate_cosine
        @rr = -Math.cos(TWOPI * @probaa)
        @itype = 2
        calculate_sdr
      end

      #
      # COMPUTE SDR
      #
      # Copies the estimate @rr into @r, restores its sign when the
      # quadrants were switched (@ksign != 1) and computes the standard
      # error @sdr.
      # 70
      def calculate_sdr
        @r = @rr
        rrsq = Math.sqrt(1.0 - @r ** 2)
        @itype = -@itype if @kdelta > 1
        if @ksign != 1
          @r = -@r
          @zac = -@zac
        end
        # 71
        pdf = Math.exp(-0.5 * (@zac ** 2 - 2 * @r * @zac * @zab + @zab ** 2) / rrsq ** 2) / (TWOPI * rrsq)
        @pac = Distribution::Normal.cdf((@zac - @r * @zab) / rrsq) - 0.5
        @pab = Distribution::Normal.cdf((@zab - @r * @zac) / rrsq) - 0.5
        @sdr = ((@aa + @dd) * (@bb + @cc)).quo(4) +
               @pab ** 2 * (@aa + @cc) * (@bb + @dd) +
               @pac ** 2 * (@aa + @bb) * (@cc + @dd) +
               2.0 * @pab * @pac * (@aa * @dd - @bb * @cc) -
               @pab * (@aa * @bb - @cc * @dd) -
               @pac * (@aa * @cc - @bb * @dd)
        @sdr = 0 if @sdr < 0
        @sdr = Math.sqrt(@sdr) / (@tot * pdf * Math.sqrt(@tot))
        compute_sdzero
      end

      # 85
      #
      # COMPUTE SDZERO
      #
      # Sets @sdzero; when the estimate is exactly 0 it is also used as
      # the reported standard error.
      def compute_sdzero
        @sdzero = Math.sqrt(((@aa + @bb) * (@aa + @cc) * (@bb + @dd) * (@cc + @dd)).quo(@tot)).quo(@tot ** 2 * @ss)
        @sdr = @sdzero if @r == 0
      end

      private :calculate_cosine, :calculate_sdr, :compute, :gaussian_quadrature, :tetrachoric_series
    end
  end
end
417
+
418
+
@@ -1,30 +1,30 @@
1
1
  require 'yaml'
2
2
 
3
3
  module Statsample
4
- # Codification
5
- #
6
- # This tool aids to code open questions
7
- # * Load one or more vectors on the workflow, to create a file on yaml of values. If data have Statsample::SEPARATOR_TOKEN, the value will be splitted on two or more values
8
- # * Edit the yaml and replace the values with your codes. If you need to create two or mores codes for an answer, use the separator (default Statsample::SEPARATOR_TOKEN)
4
+ # This module helps to code open questions
5
+ # * Select one or more vectors of a dataset to create a yaml file, in which each vector is a hash whose keys and values are the vector's factors. If a value contains Statsample::SPLIT_TOKEN, it will be separated into two or more hash keys.
6
+ # * Edit the yaml and replace the values of the hashes with your codes. If you need to create two or more codes for an answer, use the separator (default Statsample::SPLIT_TOKEN)
9
7
  # * Recode the vectors, loading the yaml file:
10
- # * The new vectors have the same name of the original plus "_recoded"
11
- # * Instead of load new recoded vectors, create many vectors as values, as add_vectors_by_split
8
+ # * recode_dataset_simple!() : The new vectors have the same name of the original plus "_recoded"
9
+ # * recode_dataset_split!() : Creates as many vectors as there are values. See Vector.add_vectors_by_split() for arguments
12
10
  #
13
11
  # Usage:
14
12
  # recode_file="recodification.yaml"
15
13
  # phase=:first # flag
16
14
  # if phase==:first
17
- # File.open(recode_file,"w") {|fp|
18
- # Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
19
- # } # Edit the file recodification.yaml
15
+ # File.open(recode_file,"w") {|fp|
16
+ # Statsample::Codification.create_yaml(ds,%w{vector1 vector2}, ",",fp)
17
+ # }
18
+ # # Edit the file recodification.yaml and verify changes
20
19
  # elsif phase==:second
21
- # File.open(recode_file,"r") {|fp|
20
+ # File.open(recode_file,"r") {|fp|
22
21
  # Statsample::Codification.verify(fp,['vector1'])
23
- # }
22
+ # }
23
+ # # Add new vectors to the dataset
24
24
  # elsif phase==:third
25
- # File.open(recode_file,"r") {|fp|
26
- # Statsample::Codification.recode_dataset_split!(ds,fp,"*")
27
- # }
25
+ # File.open(recode_file,"r") {|fp|
26
+ # Statsample::Codification.recode_dataset_split!(ds,fp,"*")
27
+ # }
28
28
  # end
29
29
  #
30
30
  module Codification