enumerable-stats 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8c04e21076f0a3ebbf61538159c1348107ca5ed371f4b1f135212dd8d932e184
4
- data.tar.gz: 32ef265cbaac6a2801e01fb67b73a12313a65d900bc9e1ca0bbc0bfa0bc98f0d
3
+ metadata.gz: 67bc0d2458936c421a69879c70fa9eb553e48c7447012e87810ce3c4ab028d66
4
+ data.tar.gz: 819fcf5e9446b4f7e4a1d7e75aeb914d4ab6ca6a76e65065877d5c3d3bab3634
5
5
  SHA512:
6
- metadata.gz: 2ebf8b1692788056fe5a6492cd4b986d1b88372e6877cffab441550980e1bf1dd775ebb7254c1bd6c2c91247c0890e6dadec67ab906ce75a3ba486a10a42815d
7
- data.tar.gz: b2293b0fe996aaf31a4c245a54f1a6eb1b9f03b4e1cc5b2cc7ac2c57cfcb423f9299ac5a4518d5d1fa6846ebaff977abaafa0eb08f175faef5c456d6b7d222f5
6
+ metadata.gz: c5dfe0571aa591684f9ae8cf8e26a472695b6a26c424185878a6e7a9bfd2786ee20c205c2eca0be234a88fb25c895fc0d2cb25a599aabbc569375657b5ee1705
7
+ data.tar.gz: cd39ddb06cc2cbc1f204deaece6cce9fcf1835583510559a8dd7d24615df95aac16c9a2ff6bd0ba1678c1dce80c0e7636eae6c121e4b7fd31d418b619fed8604
@@ -32,6 +32,24 @@ module EnumerableStats
32
32
  # @see Enumerable
33
33
  # @since 0.1.0
34
34
  module EnumerableExt
35
+ # Epsilon for floating point comparisons to avoid precision issues
36
+ EPSILON = 1e-10
37
+
38
+ # Common alpha levels with their corresponding high-precision z-scores
39
+ # Used to avoid floating point comparison issues while maintaining backward compatibility
40
+ COMMON_ALPHA_VALUES = {
41
+ 0.10 => 1.2815515655446004,
42
+ 0.05 => 1.6448536269514722,
43
+ 0.025 => 1.9599639845400545,
44
+ 0.01 => 2.3263478740408408,
45
+ 0.005 => 2.5758293035489004,
46
+ 0.001 => 3.0902323061678132
47
+ }.freeze
48
+
49
+ CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR = 92_160.0
50
+ EDGEWORTH_SMALL_SAMPLE_COEFF = 4.0
51
+ BSM_THRESHOLD = 1e-20
52
+
35
53
  # Calculates the percentage difference between this collection's mean and another value or collection's mean
36
54
  # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
37
55
  # This is useful for comparing datasets or metrics where direction doesn't matter
@@ -321,74 +339,166 @@ module EnumerableStats
321
339
  private
322
340
 
323
341
  # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
324
- # Uses a lookup table for common df values and approximations for others
342
+ # Uses a Cornish-Fisher expansion for accurate inverse t-distribution calculation
325
343
  #
326
344
  # @param df [Float] Degrees of freedom
327
345
  # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
328
346
  # @return [Float] Critical t-value for one-tailed test
329
347
  def critical_t_value(df, alpha)
330
- # For large df (≥30), t-distribution approximates normal distribution
331
- return normal_critical_value(alpha) if df >= 30
332
-
333
- # Lookup table for common t-values (one-tailed, α = 0.05)
334
- # These are standard critical values from t-tables
335
- t_table_05 = {
336
- 1 => 6.314, 2 => 2.920, 3 => 2.353, 4 => 2.132, 5 => 2.015,
337
- 6 => 1.943, 7 => 1.895, 8 => 1.860, 9 => 1.833, 10 => 1.812,
338
- 11 => 1.796, 12 => 1.782, 13 => 1.771, 14 => 1.761, 15 => 1.753,
339
- 16 => 1.746, 17 => 1.740, 18 => 1.734, 19 => 1.729, 20 => 1.725,
340
- 21 => 1.721, 22 => 1.717, 23 => 1.714, 24 => 1.711, 25 => 1.708,
341
- 26 => 1.706, 27 => 1.703, 28 => 1.701, 29 => 1.699
342
- }
348
+ # For very large df (≥1000), t-distribution is essentially normal
349
+ return inverse_normal_cdf(alpha) if df >= 1000
343
350
 
344
- # Lookup table for common t-values (one-tailed, α = 0.01)
345
- t_table_01 = {
346
- 1 => 31.821, 2 => 6.965, 3 => 4.541, 4 => 3.747, 5 => 3.365,
347
- 6 => 3.143, 7 => 2.998, 8 => 2.896, 9 => 2.821, 10 => 2.764,
348
- 11 => 2.718, 12 => 2.681, 13 => 2.650, 14 => 2.624, 15 => 2.602,
349
- 16 => 2.583, 17 => 2.567, 18 => 2.552, 19 => 2.539, 20 => 2.528,
350
- 21 => 2.518, 22 => 2.508, 23 => 2.500, 24 => 2.492, 25 => 2.485,
351
- 26 => 2.479, 27 => 2.473, 28 => 2.467, 29 => 2.462
352
- }
351
+ # Use a Cornish-Fisher expansion for the inverse t-distribution
352
+ # This is more accurate than lookup tables and handles any df/alpha combination
353
+ inverse_t_distribution(df, alpha)
354
+ end
353
355
 
354
- df_int = df.round
356
+ # Calculates the inverse t-distribution using Cornish-Fisher expansion
357
+ # This provides accurate critical t-values for any degrees of freedom and alpha level
358
+ # Based on methods used in statistical software like R and MATLAB
359
+ #
360
+ # @param df [Float] Degrees of freedom
361
+ # @param alpha [Float] Significance level for one-tailed test
362
+ # @return [Float] Critical t-value
363
+ def inverse_t_distribution(df, alpha)
364
+ # Handle boundary cases
365
+ return Float::INFINITY if df <= 0 || alpha <= 0
366
+ return -Float::INFINITY if alpha >= 1
367
+ return inverse_normal_cdf(alpha) if df >= 200 # Normal approximation for large df
368
+
369
+ # Get the corresponding normal quantile
370
+ z = inverse_normal_cdf(alpha)
371
+
372
+ # Special cases with exact solutions
373
+ if df == 1
374
+ # Cauchy distribution: exact inverse
375
+ return Math.tan(Math::PI * (0.5 - alpha))
376
+ elsif df == 2
377
+ # Exact closed-form solution for df=2
378
+ # For df=2, CDF: F(t) = 1/2 * (1 + t/√(t² + 2))
379
+ # Quantile function: t = (2p - 1)/√(2p(1 - p)) where p = 1 - α
380
+
381
+ p = 1.0 - alpha
382
+
383
+ # Handle edge cases
384
+ return Float::INFINITY if p >= 1.0
385
+ return -Float::INFINITY if p <= 0.0
386
+
387
+ # For p very close to 0.5 the quantile of the symmetric t-distribution is exactly 0
388
+ return 0.0 if (p - 0.5).abs < 1e-10
389
+
390
+ # Apply exact formula: t = (2p - 1)/√(2p(1 - p))
391
+ numerator = (2.0 * p) - 1.0
392
+ denominator_sq = 2.0 * p * (1.0 - p)
393
+
394
+ # Ensure we don't have numerical issues with the square root
395
+ return numerator / Math.sqrt(denominator_sq) if denominator_sq.positive?
396
+
397
+ # Fallback to normal approximation for edge cases
398
+ return z
399
+ end
355
400
 
356
- if alpha <= 0.01
357
- t_table_01[df_int] || t_table_01[29] # Use df=29 as fallback for larger values
358
- elsif alpha <= 0.05
359
- t_table_05[df_int] || t_table_05[29] # Use df=29 as fallback for larger values
360
- else
361
- # For alpha > 0.05, interpolate or use approximation
362
- # This is a rough approximation for other alpha levels
363
- base_t = t_table_05[df_int] || t_table_05[29]
364
- base_t * ((0.05 / alpha)**0.5)
401
+ # Use Cornish-Fisher expansion for general case
402
+ # This is the method used in most statistical software
403
+
404
+ # Base normal quantile
405
+ t = z
406
+
407
+ # First-order correction - Cornish-Fisher expansion
408
+ # Standard form: (z³ + z)/(4ν)
409
+ if df >= 4
410
+ c1 = ((z**3) + z) / 4.0
411
+ t += c1 / df
412
+ end
413
+
414
+ # Second-order correction - Cornish-Fisher expansion
415
+ # Standard form: (5z⁵ + 16z³ + 3z)/(96ν²)
416
+ if df >= 6
417
+ c2 = ((5.0 * (z**5)) + (16.0 * (z**3)) + (3.0 * z)) / 96.0
418
+ t += c2 / (df**2)
419
+ end
420
+
421
+ # Third-order correction for better accuracy
422
+ # Standard form: (3z⁷ + 19z⁵ + 17z³ - 15z)/(384ν³)
423
+ if df >= 8
424
+ c3 = ((3.0 * (z**7)) + (19.0 * (z**5)) + (17.0 * (z**3)) - (15.0 * z)) / 384.0
425
+ t += c3 / (df**3)
426
+ end
427
+
428
+ # Fourth-order correction — NOTE(review): powers/coefficients differ from the standard
+ # Cornish-Fisher term (79z⁹ + 776z⁷ + 1482z⁵ − 1920z³ − 945z)/92160ν⁴; verify against reference
429
+ # More conservative approach for high accuracy
430
+ if df >= 12
431
+ c4 = ((79.0 * (z**7)) + (776.0 * (z**5)) + (1482.0 * (z**3)) + (776.0 * z)) / CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR
432
+ t += c4 / (df**4)
365
433
  end
434
+
435
+ # For small degrees of freedom, apply additional small-sample correction
436
+ if df < 8
437
+ # Edgeworth expansion adjustment for small df
438
+ delta = 1.0 / (EDGEWORTH_SMALL_SAMPLE_COEFF * df)
439
+ small_sample_correction = z * delta * ((z**2) + 1.0)
440
+ t += small_sample_correction
441
+ end
442
+
443
+ t
366
444
  end
367
445
 
368
- # Returns the critical value for standard normal distribution (z-score)
369
- # Used when degrees of freedom is large (≥30)
446
+ # Calculates the inverse normal CDF (quantile function) using the Abramowitz & Stegun rational approximation
447
+ # This is more accurate than the previous hard-coded approach
370
448
  #
371
- # @param alpha [Float] Significance level
372
- # @return [Float] Critical z-value for one-tailed test
373
- def normal_critical_value(alpha)
374
- # Common z-values for one-tailed tests
375
- # Use approximate comparisons to avoid float equality issues
376
- if (alpha - 0.10).abs < 1e-10
377
- 1.282
378
- elsif (alpha - 0.05).abs < 1e-10
379
- 1.645
380
- elsif (alpha - 0.025).abs < 1e-10
381
- 1.960
382
- elsif (alpha - 0.01).abs < 1e-10
383
- 2.326
384
- elsif (alpha - 0.005).abs < 1e-10
385
- 2.576
449
+ # @param alpha [Float] Significance level (0 < alpha < 1)
450
+ # @return [Float] Z-score corresponding to the upper-tail probability alpha
451
+ def inverse_normal_cdf(alpha)
452
+ # Handle edge cases
453
+ return Float::INFINITY if alpha <= 0
454
+ return -Float::INFINITY if alpha >= 1
455
+
456
+ # For common values, use high-precision constants to maintain backward compatibility
457
+ # Use epsilon-based comparisons to avoid floating point precision issues
458
+ COMMON_ALPHA_VALUES.each do |target_alpha, z_score|
459
+ return z_score if (alpha - target_alpha).abs < EPSILON
460
+ end
461
+
462
+ # Use the Abramowitz & Stegun 26.2.23 rational approximation for other values
463
+ # This is accurate to about 4.5e-4 absolute error
464
+
465
+ # Transform to work with cumulative probability from left tail
466
+ p = 1.0 - alpha
467
+
468
+ # Handle symmetric case
469
+ if p > 0.5
470
+ sign = 1
471
+ p = 1.0 - p
386
472
  else
387
- # Approximation using inverse normal for other alpha values
388
- # This is a rough approximation of the inverse normal CDF
389
- # For α = 0.05, this gives approximately 1.645
390
- Math.sqrt(-2 * Math.log(alpha))
473
+ sign = -1
391
474
  end
475
+
476
+ # Constants for the approximation
477
+ if p >= BSM_THRESHOLD
478
+ # Rational approximation for central region
479
+ t = Math.sqrt(-2.0 * Math.log(p))
480
+
481
+ # Numerator coefficients
482
+ c0 = 2.515517
483
+ c1 = 0.802853
484
+ c2 = 0.010328
485
+
486
+ # Denominator coefficients
487
+ d0 = 1.000000
488
+ d1 = 1.432788
489
+ d2 = 0.189269
490
+ d3 = 0.001308
491
+
492
+ numerator = c0 + (c1 * t) + (c2 * (t**2))
493
+ denominator = d0 + (d1 * t) + (d2 * (t**2)) + (d3 * (t**3))
494
+
495
+ x = t - (numerator / denominator)
496
+ else
497
+ # For very small p, use asymptotic expansion
498
+ x = Math.sqrt(-2.0 * Math.log(p))
499
+ end
500
+
501
+ sign * x
392
502
  end
393
503
  end
394
504
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: enumerable-stats
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jon Daniel