statsample 0.12.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data.tar.gz.sig +2 -1
  2. data/History.txt +11 -0
  3. data/Manifest.txt +2 -3
  4. data/README.txt +0 -17
  5. data/Rakefile +10 -9
  6. data/data/locale/es/LC_MESSAGES/statsample.mo +0 -0
  7. data/examples/principal_axis.rb +2 -0
  8. data/examples/u_test.rb +8 -0
  9. data/lib/distribution.rb +1 -1
  10. data/lib/statsample.rb +12 -12
  11. data/lib/statsample/anova/oneway.rb +4 -4
  12. data/lib/statsample/bivariate.rb +10 -3
  13. data/lib/statsample/bivariate/pearson.rb +55 -0
  14. data/lib/statsample/dataset.rb +57 -49
  15. data/lib/statsample/dominanceanalysis.rb +1 -2
  16. data/lib/statsample/dominanceanalysis/bootstrap.rb +46 -54
  17. data/lib/statsample/factor.rb +0 -1
  18. data/lib/statsample/factor/parallelanalysis.rb +9 -13
  19. data/lib/statsample/factor/pca.rb +5 -10
  20. data/lib/statsample/factor/principalaxis.rb +27 -33
  21. data/lib/statsample/matrix.rb +11 -11
  22. data/lib/statsample/mle.rb +0 -1
  23. data/lib/statsample/regression.rb +0 -1
  24. data/lib/statsample/reliability.rb +2 -2
  25. data/lib/statsample/reliability/multiscaleanalysis.rb +62 -15
  26. data/lib/statsample/reliability/scaleanalysis.rb +5 -6
  27. data/lib/statsample/test/f.rb +2 -5
  28. data/lib/statsample/test/levene.rb +2 -5
  29. data/lib/statsample/test/t.rb +4 -13
  30. data/lib/statsample/test/umannwhitney.rb +19 -19
  31. data/po/es/statsample.mo +0 -0
  32. data/po/es/statsample.po +304 -111
  33. data/po/statsample.pot +224 -90
  34. data/test/test_bivariate.rb +8 -69
  35. data/test/test_reliability.rb +3 -4
  36. metadata +30 -18
  37. metadata.gz.sig +0 -0
  38. data/lib/statsample/bivariate/polychoric.rb +0 -893
  39. data/lib/statsample/bivariate/tetrachoric.rb +0 -457
  40. data/test/test_bivariate_polychoric.rb +0 -70
@@ -1,457 +0,0 @@
1
- module Statsample
2
- module Bivariate
3
- # Calculate Tetrachoric correlation for two vectors.
4
- def self.tetrachoric(v1,v2)
5
- tc=Tetrachoric.new_with_vectors(v1,v2)
6
- tc.r
7
- end
8
-
9
- # Tetrachoric correlation matrix.
10
- # Order of rows and columns depends on Dataset#fields order
11
- def self.tetrachoric_correlation_matrix(ds)
12
- ds.collect_matrix do |row,col|
13
- if row==col
14
- 1.0
15
- else
16
- begin
17
- tetrachoric(ds[row],ds[col])
18
- rescue RuntimeError
19
- nil
20
- end
21
- end
22
- end
23
- end
24
- # Compute tetrachoric correlation.
25
- #
26
- # The <em>tetrachoric</em> correlation is a measure of
27
- # bivariate association arising when both observed variates
28
- # are categorical variables that result from dichotomizing
29
- # the two underlying continuous variables (Drasgow, 2006).
30
- # The tetrachoric correlation is a good way to measure rater agreement (Uebersax, 2006)
31
- #
32
- # This class uses the Brown (1977) algorithm. You can see the FORTRAN code at http://lib.stat.cmu.edu/apstat/116
33
- #
34
- #
35
- # == Usage
36
- # With two variables x and y on a crosstab like this:
37
- #
38
- # -------------
39
- # | y=0 | y=1 |
40
- # -------------
41
- # x = 0 | a | b |
42
- # -------------
43
- # x = 1 | c | d |
44
- # -------------
45
- #
46
- # The code will be
47
- # tc=Statsample::Bivariate::Tetrachoric.new(a,b,c,d)
48
- # tc.r # correlation
49
- # tc.se # standard error
50
- # tc.threshold_y # threshold for y variable
51
- # tc.threshold_x # threshold for x variable
52
- #
53
- # == References:
54
- #
55
- # * Brown, MB. (1977) Algorithm AS 116: the tetrachoric correlation and its standard error. <em>Applied Statistics, 26</em>, 343-351.
56
- # * Drasgow F. (2006). Polychoric and polyserial correlations. In Kotz L, Johnson NL (Eds.), Encyclopedia of statistical sciences. Vol. 7 (pp. 69-74). New York: Wiley.
57
- # * Uebersax, J.S. (2006). The tetrachoric and polychoric correlation coefficients. Statistical Methods for Rater Agreement web site. 2006. Available at: http://john-uebersax.com/stat/tetra.htm . Accessed February 11, 2010.
58
-
59
- class Tetrachoric
60
- include GetText
61
- bindtextdomain("statsample")
62
- attr_reader :r
63
- attr_accessor :name
64
-
65
- TWOPI=Math::PI*2
66
- SQT2PI= 2.50662827
67
- RLIMIT = 0.9999
68
- RCUT= 0.95
69
- UPLIM= 5.0
70
- CONST= 1E-36
71
- CHALF= 1E-18
72
- CONV =1E-8
73
- CITER = 1E-6
74
- NITER = 25
75
- X=[0,0.9972638618, 0.9856115115, 0.9647622556, 0.9349060759, 0.8963211558, 0.8493676137, 0.7944837960, 0.7321821187, 0.6630442669, 0.5877157572, 0.5068999089, 0.4213512761, 0.3318686023, 0.2392873623, 0.1444719616, 0.0483076657]
76
- W=[0, 0.0070186100, 0.0162743947, 0.0253920653, 0.0342738629, 0.0428358980, 0.0509980593, 0.0586840935, 0.0658222228, 0.0723457941, 0.0781938958, 0.0833119242, 0.0876520930, 0.0911738787, 0.0938443991, 0.0956387201, 0.0965400885]
77
- # Creates a Tetrachoric object based on a 2x2 Matrix.
78
- def self.new_with_matrix(m)
79
- Tetrachoric.new(m[0,0], m[0,1], m[1,0],m[1,1])
80
- end
81
- # Creates a Tetrachoric object based on two vectors.
82
- # The vectors are dichotomized before the 2x2 table is built.
83
- def self.new_with_vectors(v1,v2)
84
- v1a, v2a=Statsample.only_valid(v1,v2)
85
- v1a=v1a.dichotomize
86
- v2a=v2a.dichotomize
87
- raise "v1 have only 0" if v1a.factors==[0]
88
- raise "v2 have only 0" if v2a.factors==[0]
89
- a,b,c,d = 0,0,0,0
90
- v1a.each_index{|i|
91
- x,y=v1a[i],v2a[i]
92
- a+=1 if x==0 and y==0
93
- b+=1 if x==0 and y==1
94
- c+=1 if x==1 and y==0
95
- d+=1 if x==1 and y==1
96
- }
97
- Tetrachoric.new(a,b,c,d)
98
- end
99
- # Standard error
100
- def se
101
- @sdr
102
- end
103
- # Threshold for variable x (rows)
104
- # Point on gauss curve under X rater select cases
105
- def threshold_x
106
- @zab
107
- end
108
-
109
- # Threshold for variable y (columns)
110
- # Point on gauss curve under Y rater select cases
111
-
112
- def threshold_y
113
- @zac
114
- end
115
- # Summary of the analysis
116
- def summary
117
- ReportBuilder.new(:name=>@name).add(self).to_text
118
- end
119
-
120
- def report_building(generator) # :nodoc:
121
- section=ReportBuilder::Section.new(:name=>@name)
122
- t=ReportBuilder::Table.new(:name=>_("Contingence Table"),:header=>["","Y=0","Y=1", "T"])
123
- t.row(["X=0", @a,@b,@a+@b])
124
- t.row(["X=1", @c,@d,@c+@d])
125
- t.hr
126
- t.row(["T", @a+@c,@b+@d,@a+@b+@c+@d])
127
- section.add(t)
128
- #generator.parse_element(t)
129
- section.add(sprintf("r: %0.3f",r))
130
- section.add(_("SE: %0.3f") % se)
131
- section.add(_("Threshold X: %0.3f ") % [threshold_x] )
132
- section.add(_("Threshold Y: %0.3f ") % [threshold_y] )
133
- generator.parse_element(section)
134
- end
135
-
136
- # Creates a new tetrachoric object for analysis
137
- def initialize(a,b,c,d)
138
- @a,@b,@c,@d=a,b,c,d
139
- @name=_("Tetrachoric correlation")
140
- #
141
- # CHECK IF ANY CELL FREQUENCY IS NEGATIVE
142
- #
143
- raise "All frequencies should be positive" if (@a < 0 or @b < 0 or @c < 0 or @d < 0)
144
- compute
145
- end
146
- # Compute the tetrachoric correlation.
147
- # Called on object creation.
148
- #
149
- def compute
150
-
151
- #
152
- # INITIALIZATION
153
- #
154
- @r = 0
155
- sdzero = 0
156
- @sdr = 0
157
- @itype = 0
158
- @ifault = 0
159
-
160
- #
161
- # CHECK IF ANY FREQUENCY IS 0.0 AND SET kdelta
162
- #
163
- @kdelta = 1
164
- delta = 0
165
- @kdelta = 2 if (@a == 0 or @d == 0)
166
- @kdelta += 2 if (@b == 0 or @c == 0)
167
- #
168
- # kdelta=4 MEANS TABLE HAS 0.0 ROW OR COLUMN, RUN IS TERMINATED
169
- #
170
-
171
- raise "Rows and columns should have more than 0 items" if @kdelta==4
172
-
173
- # GOTO (4, 1, 2 , 92), kdelta
174
- #
175
- # delta IS 0.0, 0.5 OR -0.5 ACCORDING TO WHICH CELL IS 0.0
176
- #
177
-
178
- if(@kdelta==2)
179
- # 1
180
- delta=0.5
181
- @r=-1 if (@a==0 and @d==0)
182
- elsif(@kdelta==3)
183
- # 2
184
- delta=-0.5
185
- @r=1 if (@b==0 and @c==0)
186
- end
187
- # 4
188
- if @r!=0
189
- @itype=3
190
- end
191
-
192
- #
193
- # STORE FREQUENCIES IN AA, BB, CC AND DD
194
- #
195
- @aa = @a + delta
196
- @bb = @b - delta
197
- @cc = @c - delta
198
- @dd = @d + delta
199
- @tot = @aa+@bb+@cc+@dd
200
- #
201
- # CHECK IF CORRELATION IS NEGATIVE, 0.0, POSITIVE
202
- # IF (AA * DD - BB * CC) 7, 5, 6
203
-
204
- corr_dir=@aa * @dd - @bb * @cc
205
- if(corr_dir < 0)
206
- # 7
207
- @probaa = @bb.quo(@tot)
208
- @probac = (@bb + @dd).quo(@tot)
209
- @ksign = 2
210
- # -> 8
211
- else
212
- if (corr_dir==0)
213
- # 5
214
- @itype=4
215
- end
216
- # 6
217
- #
218
- # COMPUTE PROBABILITIES OF QUADRANT AND OF MARGINALS
219
- # PROBAA AND PROBAC CHOSEN SO THAT CORRELATION IS POSITIVE.
220
- # KSIGN INDICATES WHETHER QUADRANTS HAVE BEEN SWITCHED
221
- #
222
-
223
- @probaa = @aa.quo(@tot)
224
- @probac = (@aa+@cc).quo(@tot)
225
- @ksign=1
226
- end
227
- # 8
228
-
229
- @probab = (@aa+@bb).quo(@tot)
230
-
231
- #
232
- # COMPUTE NORMAL DEVIATES FOR THE MARGINAL FREQUENCIES
233
- # SINCE NO MARGINAL CAN BE 0.0, IE IS NOT CHECKED
234
- #
235
- @zac = Distribution::Normal.p_value(@probac)
236
- @zab = Distribution::Normal.p_value(@probab)
237
- @ss = Math::exp(-0.5 * (@zac ** 2 + @zab ** 2)).quo(TWOPI)
238
- #
239
- # WHEN R IS 0.0, 1.0 OR -1.0, TRANSFER TO COMPUTE SDZERO
240
- #
241
- if (@r != 0 or @itype > 0)
242
- compute_sdzero
243
- return true
244
- end
245
- #
246
- # WHEN MARGINALS ARE EQUAL, COSINE EVALUATION IS USED
247
- #
248
- if (@a == @b and @b == @c)
249
- calculate_cosine
250
- return true
251
- end
252
- #
253
- # INITIAL ESTIMATE OF CORRELATION IS YULE'S Y
254
- #
255
- @rr = ((Math::sqrt(@aa * @dd) - Math::sqrt(@bb * @cc)) ** 2) / (@aa * @dd - @bb * @cc).abs
256
- @iter = 0
257
- begin
258
- #
259
- # IF RR EXCEEDS RCUT, GAUSSIAN QUADRATURE IS USED
260
- #
261
- #10
262
- if @rr>RCUT
263
- gaussian_quadrature
264
- return true
265
- end
266
- #
267
- # TETRACHORIC SERIES IS COMPUTED
268
- #
269
- # INITIALIZATION
270
- #
271
- va=1.0
272
- vb=@zac.to_f
273
- wa=1.0
274
- wb=@zab.to_f
275
- term = 1.0
276
- iterm = 0.0
277
- @sum = @probab * @probac
278
- deriv = 0.0
279
- sr = @ss
280
- #15
281
- begin
282
- if(sr.abs<=CONST)
283
- #
284
- # RESCALE TERMS TO AVOID OVERFLOWS AND UNDERFLOWS
285
- #
286
- sr = sr / CONST
287
- va = va * CHALF
288
- vb = vb * CHALF
289
- wa = wa * CHALF
290
- wb = wb * CHALF
291
- end
292
- #
293
- # FORM SUM AND DERIVATIVE OF SERIES
294
- #
295
- # 20
296
- dr = sr * va * wa
297
- sr = sr * @rr / term
298
- cof = sr * va * wa
299
- #
300
- # ITERM COUNTS NO. OF CONSECUTIVE TERMS < CONV
301
- #
302
- iterm+= 1
303
- iterm=0 if (cof.abs > CONV)
304
- @sum = @sum + cof
305
- deriv += dr
306
- vaa = va
307
- waa = wa
308
- va = vb
309
- wa = wb
310
- vb = @zac * va - term * vaa
311
- wb = @zab * wa - term * waa
312
- term += 1
313
- end while (iterm < 2 or term < 6)
314
- #
315
- # CHECK IF ITERATION CONVERGED
316
- #
317
- if((@sum-@probaa).abs <= CITER)
318
- @itype=term
319
- calculate_sdr
320
- return true
321
- end
322
- #
323
- # CALCULATE NEXT ESTIMATE OF CORRELATION
324
- #
325
- #25
326
- @iter += 1
327
- #
328
- # IF TOO MANY ITERATIONS, RUN IS TERMINATED
329
- #
330
- delta = (@sum - @probaa) / deriv
331
- @rrprev = @rr
332
- @rr = @rr - delta
333
- @rr += 0.5 * delta if(@iter == 1)
334
- @rr= RLIMIT if (@rr > RLIMIT)
335
- @rr =0 if (@rr < 0.0)
336
- end while @iter < NITER
337
- raise "Too many iteration"
338
- # GOTO 10
339
- end
340
- # GAUSSIAN QUADRATURE
341
- # 40
342
- def gaussian_quadrature
343
- if(@iter==0)
344
- # INITIALIZATION, IF THIS IS FIRST ITERATION
345
- @sum=@probab*@probac
346
- @rrprev=0
347
- end
348
-
349
- # 41
350
- sumprv = @probab - @sum
351
- @prob = @bb.quo(@tot)
352
- @prob = @aa.quo(@tot) if (@ksign == 2)
353
- @itype = 1
354
- #
355
- # LOOP TO FIND ESTIMATE OF CORRELATION
356
- # COMPUTATION OF INTEGRAL (SUM) BY QUADRATURE
357
- #
358
- # 42
359
-
360
- begin
361
- rrsq = Math::sqrt(1 - @rr ** 2)
362
- amid = 0.5 * (UPLIM + @zac)
363
- xlen = UPLIM - amid
364
- @sum = 0
365
- (1..16).each do |iquad|
366
- xla = amid + X[iquad] * xlen
367
- xlb = amid - X[iquad] * xlen
368
-
369
-
370
- #
371
- # TO AVOID UNDERFLOWS, TEMPA AND TEMPB ARE USED
372
- #
373
- tempa = (@zab - @rr * xla) / rrsq
374
- if (tempa >= -6.0)
375
- @sum = @sum + W[iquad] * Math::exp(-0.5 * xla ** 2) * Distribution::Normal.cdf(tempa)
376
- end
377
- tempb = (@zab - @rr * xlb) / rrsq
378
-
379
- if (tempb >= -6.0)
380
- @sum = @sum + W[iquad] * Math::exp(-0.5 * xlb ** 2) * Distribution::Normal.cdf(tempb)
381
- end
382
- end # 44 ~ iquad
383
- @sum=@sum*xlen / SQT2PI
384
- #
385
- # CHECK IF ITERATION HAS CONVERGED
386
- #
387
- if ((@prob - @sum).abs <= CITER)
388
- calculate_sdr
389
- return true
390
- end
391
- # ESTIMATE CORRELATION FOR NEXT ITERATION BY LINEAR INTERPOLATION
392
-
393
- rrest = ((@prob - @sum) * @rrprev - (@prob - sumprv) * @rr) / (sumprv - @sum)
394
- rrest = RLIMIT if (rrest > RLIMIT)
395
- rrest = 0 if (rrest < 0)
396
- @rrprev = @rr
397
- @rr = rrest
398
- sumprv = @sum
399
- #
400
- # if estimate has same value on two iterations, stop iteration
401
- #
402
- if @rr == @rrprev
403
- calculate_sdr
404
- return true
405
- end
406
-
407
-
408
- end while @iter < NITER
409
- raise "Too many iterations"
410
- # go to 42 (NOTE(review): @iter is not incremented inside this loop, so the NITER limit cannot end it)
411
- end
412
- def calculate_cosine
413
- #
414
- # WHEN ALL MARGINALS ARE EQUAL THE COSINE FUNCTION IS USED
415
- #
416
- @rr = -Math::cos(TWOPI * @probaa)
417
- @itype = 2
418
- calculate_sdr
419
- end
420
-
421
-
422
- def calculate_sdr # :nodoc:
423
- #
424
- # COMPUTE SDR
425
- #
426
- @r = @rr
427
- rrsq = Math::sqrt(1.0 - @r ** 2)
428
- @itype = -@itype if (@kdelta > 1)
429
- if (@ksign != 1)
430
- @r = -@r
431
- @zac = -@zac
432
- end
433
- # 71
434
- pdf = Math::exp(-0.5 * (@zac ** 2 - 2 * @r * @zac * @zab + @zab ** 2) / rrsq ** 2) / (TWOPI * rrsq)
435
- @pac = Distribution::Normal.cdf((@zac - @r * @zab) / rrsq) - 0.5
436
- @pab = Distribution::Normal.cdf((@zab - @r * @zac) / rrsq) - 0.5
437
-
438
- @sdr = ((@aa+@dd) * (@bb + @cc)).quo(4) + @pab ** 2 * (@aa + @cc) * (@bb + @dd) + @pac ** 2 * (@aa + @bb) * (@cc + @dd) + 2.0 * @pab * @pac * (@aa * @dd - @bb * @cc) - @pab * (@aa * @bb - @cc * @dd) - @pac * (@aa * @cc - @bb * @dd)
439
- @sdr=0 if (@sdr<0)
440
- @sdr= Math::sqrt(@sdr) / (@tot * pdf * Math::sqrt(@tot))
441
- compute_sdzero
442
- end
443
-
444
- # 85
445
- #
446
- # COMPUTE SDZERO
447
- #
448
- def compute_sdzero
449
- @sdzero = Math::sqrt(((@aa + @bb) * (@aa + @cc) * (@bb + @dd) * (@cc + @dd)).quo(@tot)).quo(@tot ** 2 * @ss)
450
- @sdr = @sdzero if (@r == 0)
451
- end
452
- private :calculate_cosine, :calculate_sdr, :compute_sdzero, :compute, :gaussian_quadrature
453
- end
454
- end
455
- end
456
-
457
-