enumerable-stats 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/enumerable_stats/enumerable_ext.rb +164 -54
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 67bc0d2458936c421a69879c70fa9eb553e48c7447012e87810ce3c4ab028d66
|
4
|
+
data.tar.gz: 819fcf5e9446b4f7e4a1d7e75aeb914d4ab6ca6a76e65065877d5c3d3bab3634
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c5dfe0571aa591684f9ae8cf8e26a472695b6a26c424185878a6e7a9bfd2786ee20c205c2eca0be234a88fb25c895fc0d2cb25a599aabbc569375657b5ee1705
|
7
|
+
data.tar.gz: cd39ddb06cc2cbc1f204deaece6cce9fcf1835583510559a8dd7d24615df95aac16c9a2ff6bd0ba1678c1dce80c0e7636eae6c121e4b7fd31d418b619fed8604
|
@@ -32,6 +32,24 @@ module EnumerableStats
|
|
32
32
|
# @see Enumerable
|
33
33
|
# @since 0.1.0
|
34
34
|
module EnumerableExt
|
35
|
+
# Absolute tolerance used when matching a requested alpha against a tabulated alpha,
# so that tiny floating point differences do not defeat the lookup
EPSILON = 1.0e-10

# Upper-tail alpha levels that are requested most often, mapped to their
# high-precision standard-normal z-scores (alpha => z)
COMMON_ALPHA_VALUES = [
  [0.10, 1.2815515655446004],
  [0.05, 1.6448536269514722],
  [0.025, 1.9599639845400545],
  [0.01, 2.3263478740408408],
  [0.005, 2.5758293035489004],
  [0.001, 3.0902323061678132]
].to_h.freeze

# Divisor of the fourth-order term in the Cornish-Fisher expansion of the t quantile
CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR = 92_160.0
# Scale factor of the extra correction applied for small degrees of freedom
EDGEWORTH_SMALL_SAMPLE_COEFF = 4.0
# Tail probabilities below this value bypass the rational approximation in inverse_normal_cdf
BSM_THRESHOLD = 1.0e-20
|
52
|
+
|
35
53
|
# Calculates the percentage difference between this collection's mean and another value or collection's mean
|
36
54
|
# Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
|
37
55
|
# This is useful for comparing datasets or metrics where direction doesn't matter
|
@@ -321,74 +339,166 @@ module EnumerableStats
|
|
321
339
|
private
|
322
340
|
|
323
341
|
# Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
|
324
|
-
# Uses
|
342
|
+
# Uses a Cornish-Fisher expansion (with closed forms for df = 1 and df = 2) to invert the t-distribution
|
325
343
|
#
|
326
344
|
# @param df [Float] Degrees of freedom
|
327
345
|
# @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
|
328
346
|
# @return [Float] Critical t-value for one-tailed test
|
329
347
|
def critical_t_value(df, alpha)
  if df >= 1000
    # With this many degrees of freedom the t-distribution is treated as
    # standard normal, so the normal quantile is returned directly.
    inverse_normal_cdf(alpha)
  else
    # Otherwise invert the t-distribution itself; the helper accepts any
    # df/alpha combination, so no lookup table is needed.
    inverse_t_distribution(df, alpha)
  end
end
|
353
355
|
|
354
|
-
|
356
|
+
# Calculates the inverse t-distribution using Cornish-Fisher expansion
|
357
|
+
# This provides accurate critical t-values for any degrees of freedom and alpha level
|
358
|
+
# Based on methods used in statistical software like R and MATLAB
|
359
|
+
#
|
360
|
+
# @param df [Float] Degrees of freedom
|
361
|
+
# @param alpha [Float] Significance level for one-tailed test
|
362
|
+
# @return [Float] Critical t-value
|
363
|
+
def inverse_t_distribution(df, alpha)
  # Computes the upper-tail critical value t such that P(T > t) = alpha for a
  # Student t-distribution with df degrees of freedom.
  #
  # * df <= 0 or alpha <= 0 -> +Infinity, alpha >= 1 -> -Infinity
  # * df >= 200             -> standard normal quantile
  # * df == 1, df == 2      -> exact closed-form quantiles
  # * otherwise             -> four-term Cornish-Fisher expansion
  #                            (Abramowitz & Stegun 26.7.5)
  #
  # NOTE(review): the expansion is asymptotic in 1/df. For df >= 3 and
  # alpha between 0.01 and 0.99 it agrees with t-tables to about 1e-3; for
  # fractional df below 3 or very extreme alpha the error grows.
  return Float::INFINITY if df <= 0 || alpha <= 0
  return -Float::INFINITY if alpha >= 1
  return inverse_normal_cdf(alpha) if df >= 200

  # df = 1 is the Cauchy distribution, whose quantile function is exact.
  return Math.tan(Math::PI * (0.5 - alpha)) if df == 1

  # df = 2: t = (2p - 1) / sqrt(2p(1 - p)) with p = 1 - alpha. Written in
  # terms of alpha directly so that small alphas are not lost in 1 - alpha.
  return (1.0 - (2.0 * alpha)) / Math.sqrt(2.0 * alpha * (1.0 - alpha)) if df == 2

  z = inverse_normal_cdf(alpha)
  # An infinite z would turn the polynomial terms below into Inf - Inf = NaN.
  return z if z.infinite?

  z2 = z * z
  nu = df.to_f

  # Each term is applied exactly once and for every df. The higher-order
  # terms matter most for small df, so they must not be skipped there.
  g1 = z * (z2 + 1.0) / 4.0
  g2 = z * ((5.0 * (z2**2)) + (16.0 * z2) + 3.0) / 96.0
  g3 = z * ((3.0 * (z2**3)) + (19.0 * (z2**2)) + (17.0 * z2) - 15.0) / 384.0
  g4 = z * ((79.0 * (z2**4)) + (776.0 * (z2**3)) + (1482.0 * (z2**2)) - (1920.0 * z2) - 945.0) /
       CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR

  z + (g1 / nu) + (g2 / (nu**2)) + (g3 / (nu**3)) + (g4 / (nu**4))
end
|
367
445
|
|
368
|
-
#
|
369
|
-
#
|
446
|
+
# Calculates the inverse normal CDF (quantile function) from the Abramowitz-Stegun 26.2.23 rational approximation
|
447
|
+
# This is more accurate than the previous hard-coded approach
|
370
448
|
#
|
371
|
-
# @param alpha [Float] Significance level
|
372
|
-
# @return [Float]
|
373
|
-
def
|
374
|
-
#
|
375
|
-
|
376
|
-
if
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
449
|
+
# @param alpha [Float] Significance level (0 < alpha < 1)
|
450
|
+
# @return [Float] Z-score corresponding to the upper-tail probability alpha
|
451
|
+
def inverse_normal_cdf(alpha)
  # Returns the z-score whose upper-tail probability under the standard
  # normal distribution equals alpha, i.e. P(Z > z) = alpha.
  #
  # alpha <= 0 yields +Infinity and alpha >= 1 yields -Infinity.
  return Float::INFINITY if alpha <= 0
  return -Float::INFINITY if alpha >= 1

  # Frequently used alpha levels keep their tabulated constants so results
  # for them stay bit-identical; the tolerance absorbs float noise in alpha.
  COMMON_ALPHA_VALUES.each do |target_alpha, z_score|
    return z_score if (alpha - target_alpha).abs < EPSILON
  end

  # Work with the smaller tail directly. Computing 1 - (1 - alpha) would
  # destroy very small alphas through cancellation (anything below ~1e-16
  # would collapse to 0 and produce Infinity).
  upper = alpha < 0.5
  tail = upper ? alpha.to_f : 1.0 - alpha

  # Starting estimate: rational approximation from Abramowitz & Stegun
  # 26.2.23, valid for 0 < tail <= 0.5 with absolute error below 4.5e-4.
  t = Math.sqrt(-2.0 * Math.log(tail))
  numerator = 2.515517 + (0.802853 * t) + (0.010328 * (t**2))
  denominator = 1.0 + (1.432788 * t) + (0.189269 * (t**2)) + (0.001308 * (t**3))
  x = t - (numerator / denominator)

  # Newton refinement on Q(x) = erfc(x / sqrt(2)) / 2, whose derivative is
  # minus the normal density. Three steps from the estimate above reach
  # double precision.
  sqrt_two = Math.sqrt(2.0)
  sqrt_two_pi = Math.sqrt(2.0 * Math::PI)
  3.times do
    density = Math.exp(-0.5 * x * x) / sqrt_two_pi
    # The density underflows to zero only in the extreme tail; keep the estimate.
    break unless density.positive?

    x += ((0.5 * Math.erfc(x / sqrt_two)) - tail) / density
  end

  upper ? x : -x
end
|
393
503
|
end
|
394
504
|
end
|