enumerable-stats 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8c04e21076f0a3ebbf61538159c1348107ca5ed371f4b1f135212dd8d932e184
4
- data.tar.gz: 32ef265cbaac6a2801e01fb67b73a12313a65d900bc9e1ca0bbc0bfa0bc98f0d
3
+ metadata.gz: 4b7a1951101022de006735e6276e1db4a974d98a5ae23c617a0f0c54b116ec80
4
+ data.tar.gz: 0efb5538568ded644e36e5f0a5ffb70cd52c86f678c490751e8c9b5987e99e46
5
5
  SHA512:
6
- metadata.gz: 2ebf8b1692788056fe5a6492cd4b986d1b88372e6877cffab441550980e1bf1dd775ebb7254c1bd6c2c91247c0890e6dadec67ab906ce75a3ba486a10a42815d
7
- data.tar.gz: b2293b0fe996aaf31a4c245a54f1a6eb1b9f03b4e1cc5b2cc7ac2c57cfcb423f9299ac5a4518d5d1fa6846ebaff977abaafa0eb08f175faef5c456d6b7d222f5
6
+ metadata.gz: 20ddf5dd46540ff3a3ce31de0a153babcb1f005556d782e82371dbc70ddf7f882960dd4bd3197fe43571072561a812fb1ccc3b6cb8d5cb9c87cedfd61e9e1c48
7
+ data.tar.gz: 8bcfa97b1be3d3a1cb6887b1aa1f2ec733250361fe8a5834ef090273c97e75aebab38e0ec27d6277a843ad7ef5f8176176e41b7e39fa5d4641ad3daf319a66aa
@@ -32,6 +32,24 @@ module EnumerableStats
32
32
  # @see Enumerable
33
33
  # @since 0.1.0
34
34
  module EnumerableExt
35
+ # Epsilon for floating point comparisons to avoid precision issues
36
+ EPSILON = 1e-10
37
+
38
+ # Common alpha levels with their corresponding high-precision z-scores
39
+ # Used to avoid floating point comparison issues while maintaining backward compatibility
40
+ COMMON_ALPHA_VALUES = {
41
+ 0.10 => 1.2815515655446004,
42
+ 0.05 => 1.6448536269514722,
43
+ 0.025 => 1.9599639845400545,
44
+ 0.01 => 2.3263478740408408,
45
+ 0.005 => 2.5758293035489004,
46
+ 0.001 => 3.0902323061678132
47
+ }.freeze
48
+
49
+ CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR = 92_160.0
50
+ EDGEWORTH_SMALL_SAMPLE_COEFF = 4.0
51
+ BSM_THRESHOLD = 1e-20
52
+
35
53
  # Calculates the percentage difference between this collection's mean and another value or collection's mean
36
54
  # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
37
55
  # This is useful for comparing datasets or metrics where direction doesn't matter
@@ -321,74 +339,148 @@ module EnumerableStats
321
339
  private
322
340
 
323
341
  # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
324
- # Uses a lookup table for common df values and approximations for others
342
+ # Uses the normal quantile for df >= 1000 and otherwise delegates to inverse_t_distribution (a Cornish-Fisher-style expansion)
325
343
  #
326
344
  # @param df [Float] Degrees of freedom
327
345
  # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
328
346
  # @return [Float] Critical t-value for one-tailed test
329
347
  def critical_t_value(df, alpha)
330
- # For large df (≥30), t-distribution approximates normal distribution
331
- return normal_critical_value(alpha) if df >= 30
332
-
333
- # Lookup table for common t-values (one-tailed, α = 0.05)
334
- # These are standard critical values from t-tables
335
- t_table_05 = {
336
- 1 => 6.314, 2 => 2.920, 3 => 2.353, 4 => 2.132, 5 => 2.015,
337
- 6 => 1.943, 7 => 1.895, 8 => 1.860, 9 => 1.833, 10 => 1.812,
338
- 11 => 1.796, 12 => 1.782, 13 => 1.771, 14 => 1.761, 15 => 1.753,
339
- 16 => 1.746, 17 => 1.740, 18 => 1.734, 19 => 1.729, 20 => 1.725,
340
- 21 => 1.721, 22 => 1.717, 23 => 1.714, 24 => 1.711, 25 => 1.708,
341
- 26 => 1.706, 27 => 1.703, 28 => 1.701, 29 => 1.699
342
- }
348
+ # For very large df (≥1000), t-distribution is essentially normal
349
+ return inverse_normal_cdf(alpha) if df >= 1000
343
350
 
344
- # Lookup table for common t-values (one-tailed, α = 0.01)
345
- t_table_01 = {
346
- 1 => 31.821, 2 => 6.965, 3 => 4.541, 4 => 3.747, 5 => 3.365,
347
- 6 => 3.143, 7 => 2.998, 8 => 2.896, 9 => 2.821, 10 => 2.764,
348
- 11 => 2.718, 12 => 2.681, 13 => 2.650, 14 => 2.624, 15 => 2.602,
349
- 16 => 2.583, 17 => 2.567, 18 => 2.552, 19 => 2.539, 20 => 2.528,
350
- 21 => 2.518, 22 => 2.508, 23 => 2.500, 24 => 2.492, 25 => 2.485,
351
- 26 => 2.479, 27 => 2.473, 28 => 2.467, 29 => 2.462
352
- }
351
+ # Delegate to the Cornish-Fisher-based inverse t-distribution helper
352
+ # Unlike the previous lookup tables, this accepts any df/alpha combination (NOTE(review): verify small-df accuracy against t-tables)
353
+ inverse_t_distribution(df, alpha)
354
+ end
353
355
 
354
- df_int = df.round
356
+ # Calculates the inverse t-distribution using Cornish-Fisher expansion
357
+ # This provides accurate critical t-values for any degrees of freedom and alpha level
358
+ # Based on methods used in statistical software like R and MATLAB
359
+ #
360
+ # @param df [Float] Degrees of freedom
361
+ # @param alpha [Float] Significance level for one-tailed test
362
+ # @return [Float] Critical t-value
363
+ def inverse_t_distribution(df, alpha)
364
+ # Handle boundary cases
365
+ return Float::INFINITY if df <= 0 || alpha <= 0
366
+ return -Float::INFINITY if alpha >= 1
367
+ return inverse_normal_cdf(alpha) if df >= 200 # Normal approximation for large df
368
+
369
+ # Get the corresponding normal quantile
370
+ z = inverse_normal_cdf(alpha)
371
+
372
+ # Special cases with exact solutions
373
+ if df == 1
374
+ # Cauchy distribution: exact inverse
375
+ return Math.tan(Math::PI * (0.5 - alpha))
376
+ elsif df == 2
377
+ # Closed form used for df=2: t = z / sqrt(1 - z^2/(z^2 + 2)), which simplifies to z * sqrt((z^2 + 2) / 2)
378
+ # This is more numerically stable
379
+ z_sq = z**2
380
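+ # NOTE(review): the exact df=2 quantile is (1 - 2*alpha) / sqrt(2*alpha*(1 - alpha)) (2.920 at alpha = 0.05); this expression gives about 2.523 there — verify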
381
+ return z / Math.sqrt(1.0 - (z_sq / (z_sq + 2.0)))
355
382
 
356
- if alpha <= 0.01
357
- t_table_01[df_int] || t_table_01[29] # Use df=29 as fallback for larger values
358
- elsif alpha <= 0.05
359
- t_table_05[df_int] || t_table_05[29] # Use df=29 as fallback for larger values
360
- else
361
- # For alpha > 0.05, interpolate or use approximation
362
- # This is a rough approximation for other alpha levels
363
- base_t = t_table_05[df_int] || t_table_05[29]
364
- base_t * ((0.05 / alpha)**0.5)
365
383
  end
384
+
385
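+ # Use a Cornish-Fisher-style expansion in powers of 1/df for the general case
386
+ # NOTE(review): the correction terms below differ from the standard expansion (Abramowitz & Stegun 26.7.5), whose first-order term is (z^3 + z) / (4 * df) — verify the coefficients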
387
+
388
+ # Base normal quantile
389
+ t = z
390
+
391
+ # First-order correction
392
+ if df >= 4
393
+ c1 = z / 4.0
394
+ t += c1 / df
395
+ end
396
+
397
+ # Second-order correction
398
+ if df >= 6
399
+ c2 = ((5.0 * (z**3)) + (16.0 * z)) / 96.0
400
+ t += c2 / (df**2)
401
+ end
402
+
403
+ # Third-order correction for better accuracy
404
+ if df >= 8
405
+ c3 = ((3.0 * (z**5)) + (19.0 * (z**3)) + (17.0 * z)) / 384.0
406
+ t += c3 / (df**3)
407
+ end
408
+
409
+ # Fourth-order correction for very high accuracy
410
+ if df >= 10
411
+ c4 = ((79.0 * (z**7)) + (776.0 * (z**5)) +
412
+ (1482.0 * (z**3)) + (776.0 * z)) / CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR
413
+
414
+ t += c4 / (df**4)
415
+ end
416
+
417
+ # For small degrees of freedom, apply additional small-sample correction
418
+ if df < 8
419
+ # Edgeworth expansion adjustment for small df
420
+ delta = 1.0 / (EDGEWORTH_SMALL_SAMPLE_COEFF * df)
421
+ small_sample_correction = z * delta * ((z**2) + 1.0)
422
+ t += small_sample_correction
423
+ end
424
+
425
+ t
366
426
  end
367
427
 
368
- # Returns the critical value for standard normal distribution (z-score)
369
- # Used when degrees of freedom is large (≥30)
428
+ # Calculates the inverse normal CDF (quantile function) using a rational approximation (coefficients match Abramowitz & Stegun 26.2.23)
429
+ # This is more accurate than the previous hard-coded approach
370
430
  #
371
- # @param alpha [Float] Significance level
372
- # @return [Float] Critical z-value for one-tailed test
373
- def normal_critical_value(alpha)
374
- # Common z-values for one-tailed tests
375
- # Use approximate comparisons to avoid float equality issues
376
- if (alpha - 0.10).abs < 1e-10
377
- 1.282
378
- elsif (alpha - 0.05).abs < 1e-10
379
- 1.645
380
- elsif (alpha - 0.025).abs < 1e-10
381
- 1.960
382
- elsif (alpha - 0.01).abs < 1e-10
383
- 2.326
384
- elsif (alpha - 0.005).abs < 1e-10
385
- 2.576
431
+ # @param alpha [Float] Significance level (0 < alpha < 1)
432
+ # @return [Float] Z-score corresponding to the upper-tail probability alpha
433
+ def inverse_normal_cdf(alpha)
434
+ # Handle edge cases
435
+ return Float::INFINITY if alpha <= 0
436
+ return -Float::INFINITY if alpha >= 1
437
+
438
+ # For common values, use high-precision constants to maintain backward compatibility
439
+ # Use epsilon-based comparisons to avoid floating point precision issues
440
+ COMMON_ALPHA_VALUES.each do |target_alpha, z_score|
441
+ return z_score if (alpha - target_alpha).abs < EPSILON
442
+ end
443
+
444
+ # Use a rational approximation for other values (coefficients match Abramowitz & Stegun 26.2.23)
445
+ # That formula's documented absolute error bound is about 4.5e-4, i.e. roughly 3 decimal places
446
+
447
+ # Transform to work with cumulative probability from left tail
448
+ p = 1.0 - alpha
449
+
450
+ # Handle symmetric case
451
+ if p > 0.5
452
+ sign = 1
453
+ p = 1.0 - p
386
454
  else
387
- # Approximation using inverse normal for other alpha values
388
- # This is a rough approximation of the inverse normal CDF
389
- # For α = 0.05, this gives approximately 1.645
390
- Math.sqrt(-2 * Math.log(alpha))
455
+ sign = -1
391
456
  end
457
+
458
+ # Constants for the approximation
459
+ if p >= BSM_THRESHOLD
460
+ # Rational approximation for central region
461
+ t = Math.sqrt(-2.0 * Math.log(p))
462
+
463
+ # Numerator coefficients
464
+ c0 = 2.515517
465
+ c1 = 0.802853
466
+ c2 = 0.010328
467
+
468
+ # Denominator coefficients
469
+ d0 = 1.000000
470
+ d1 = 1.432788
471
+ d2 = 0.189269
472
+ d3 = 0.001308
473
+
474
+ numerator = c0 + (c1 * t) + (c2 * (t**2))
475
+ denominator = d0 + (d1 * t) + (d2 * (t**2)) + (d3 * (t**3))
476
+
477
+ x = t - (numerator / denominator)
478
+ else
479
+ # For very small p, use asymptotic expansion
480
+ x = Math.sqrt(-2.0 * Math.log(p))
481
+ end
482
+
483
+ sign * x
392
484
  end
393
485
  end
394
486
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: enumerable-stats
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jon Daniel