enumerable-stats 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8c04e21076f0a3ebbf61538159c1348107ca5ed371f4b1f135212dd8d932e184
4
- data.tar.gz: 32ef265cbaac6a2801e01fb67b73a12313a65d900bc9e1ca0bbc0bfa0bc98f0d
3
+ metadata.gz: 67bc0d2458936c421a69879c70fa9eb553e48c7447012e87810ce3c4ab028d66
4
+ data.tar.gz: 819fcf5e9446b4f7e4a1d7e75aeb914d4ab6ca6a76e65065877d5c3d3bab3634
5
5
  SHA512:
6
- metadata.gz: 2ebf8b1692788056fe5a6492cd4b986d1b88372e6877cffab441550980e1bf1dd775ebb7254c1bd6c2c91247c0890e6dadec67ab906ce75a3ba486a10a42815d
7
- data.tar.gz: b2293b0fe996aaf31a4c245a54f1a6eb1b9f03b4e1cc5b2cc7ac2c57cfcb423f9299ac5a4518d5d1fa6846ebaff977abaafa0eb08f175faef5c456d6b7d222f5
6
+ metadata.gz: c5dfe0571aa591684f9ae8cf8e26a472695b6a26c424185878a6e7a9bfd2786ee20c205c2eca0be234a88fb25c895fc0d2cb25a599aabbc569375657b5ee1705
7
+ data.tar.gz: cd39ddb06cc2cbc1f204deaece6cce9fcf1835583510559a8dd7d24615df95aac16c9a2ff6bd0ba1678c1dce80c0e7636eae6c121e4b7fd31d418b619fed8604
@@ -32,6 +32,24 @@ module EnumerableStats
32
32
  # @see Enumerable
33
33
  # @since 0.1.0
34
34
  module EnumerableExt
35
+ # Epsilon for floating point comparisons to avoid precision issues
36
+ EPSILON = 1e-10
37
+
38
+ # Common alpha levels with their corresponding high-precision z-scores
39
+ # Used to avoid floating point comparison issues while maintaining backward compatibility
40
+ COMMON_ALPHA_VALUES = {
41
+ 0.10 => 1.2815515655446004,
42
+ 0.05 => 1.6448536269514722,
43
+ 0.025 => 1.9599639845400545,
44
+ 0.01 => 2.3263478740408408,
45
+ 0.005 => 2.5758293035489004,
46
+ 0.001 => 3.0902323061678132
47
+ }.freeze
48
+
49
+ CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR = 92_160.0
50
+ EDGEWORTH_SMALL_SAMPLE_COEFF = 4.0
51
+ BSM_THRESHOLD = 1e-20
52
+
35
53
  # Calculates the percentage difference between this collection's mean and another value or collection's mean
36
54
  # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
37
55
  # This is useful for comparing datasets or metrics where direction doesn't matter
@@ -321,74 +339,166 @@ module EnumerableStats
321
339
  private
322
340
 
323
341
  # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
324
- # Uses a lookup table for common df values and approximations for others
342
+ # Uses a Cornish-Fisher expansion for accurate inverse t-distribution calculation
325
343
  #
326
344
  # @param df [Float] Degrees of freedom
327
345
  # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
328
346
  # @return [Float] Critical t-value for one-tailed test
329
347
  def critical_t_value(df, alpha)
330
- # For large df (≥30), t-distribution approximates normal distribution
331
- return normal_critical_value(alpha) if df >= 30
332
-
333
- # Lookup table for common t-values (one-tailed, α = 0.05)
334
- # These are standard critical values from t-tables
335
- t_table_05 = {
336
- 1 => 6.314, 2 => 2.920, 3 => 2.353, 4 => 2.132, 5 => 2.015,
337
- 6 => 1.943, 7 => 1.895, 8 => 1.860, 9 => 1.833, 10 => 1.812,
338
- 11 => 1.796, 12 => 1.782, 13 => 1.771, 14 => 1.761, 15 => 1.753,
339
- 16 => 1.746, 17 => 1.740, 18 => 1.734, 19 => 1.729, 20 => 1.725,
340
- 21 => 1.721, 22 => 1.717, 23 => 1.714, 24 => 1.711, 25 => 1.708,
341
- 26 => 1.706, 27 => 1.703, 28 => 1.701, 29 => 1.699
342
- }
348
+ # For very large df (≥1000), t-distribution is essentially normal
349
+ return inverse_normal_cdf(alpha) if df >= 1000
343
350
 
344
- # Lookup table for common t-values (one-tailed, α = 0.01)
345
- t_table_01 = {
346
- 1 => 31.821, 2 => 6.965, 3 => 4.541, 4 => 3.747, 5 => 3.365,
347
- 6 => 3.143, 7 => 2.998, 8 => 2.896, 9 => 2.821, 10 => 2.764,
348
- 11 => 2.718, 12 => 2.681, 13 => 2.650, 14 => 2.624, 15 => 2.602,
349
- 16 => 2.583, 17 => 2.567, 18 => 2.552, 19 => 2.539, 20 => 2.528,
350
- 21 => 2.518, 22 => 2.508, 23 => 2.500, 24 => 2.492, 25 => 2.485,
351
- 26 => 2.479, 27 => 2.473, 28 => 2.467, 29 => 2.462
352
- }
351
+ # Use a Cornish-Fisher expansion for the inverse t-distribution
352
+ # This is more accurate than lookup tables and handles any df/alpha combination
353
+ inverse_t_distribution(df, alpha)
354
+ end
353
355
 
354
- df_int = df.round
356
+ # Calculates the inverse t-distribution using Cornish-Fisher expansion
357
+ # This provides accurate critical t-values for any degrees of freedom and alpha level
358
+ # Based on methods used in statistical software like R and MATLAB
359
+ #
360
+ # @param df [Float] Degrees of freedom
361
+ # @param alpha [Float] Significance level for one-tailed test
362
+ # @return [Float] Critical t-value
363
+ def inverse_t_distribution(df, alpha)
364
+ # Handle boundary cases
365
+ return Float::INFINITY if df <= 0 || alpha <= 0
366
+ return -Float::INFINITY if alpha >= 1
367
+ return inverse_normal_cdf(alpha) if df >= 200 # Normal approximation for large df
368
+
369
+ # Get the corresponding normal quantile
370
+ z = inverse_normal_cdf(alpha)
371
+
372
+ # Special cases with exact solutions
373
+ if df == 1
374
+ # Cauchy distribution: exact inverse
375
+ return Math.tan(Math::PI * (0.5 - alpha))
376
+ elsif df == 2
377
+ # Exact closed-form solution for df=2
378
+ # For df=2, CDF: F(t) = 1/2 * (1 + t/√(t² + 2))
379
+ # Quantile function: t = (2p - 1)/√(2p(1 - p)) where p = 1 - α
380
+
381
+ p = 1.0 - alpha
382
+
383
+ # Handle edge cases
384
+ return Float::INFINITY if p >= 1.0
385
+ return -Float::INFINITY if p <= 0.0
386
+
387
+ # For p very close to 0.5 the quantile of the symmetric t-distribution is exactly 0
388
+ return 0.0 if (p - 0.5).abs < 1e-10
389
+
390
+ # Apply exact formula: t = (2p - 1)/√(2p(1 - p))
391
+ numerator = (2.0 * p) - 1.0
392
+ denominator_sq = 2.0 * p * (1.0 - p)
393
+
394
+ # Ensure we don't have numerical issues with the square root
395
+ return numerator / Math.sqrt(denominator_sq) if denominator_sq.positive?
396
+
397
+ # Fallback to normal approximation for edge cases
398
+ return z
399
+ end
355
400
 
356
- if alpha <= 0.01
357
- t_table_01[df_int] || t_table_01[29] # Use df=29 as fallback for larger values
358
- elsif alpha <= 0.05
359
- t_table_05[df_int] || t_table_05[29] # Use df=29 as fallback for larger values
360
- else
361
- # For alpha > 0.05, interpolate or use approximation
362
- # This is a rough approximation for other alpha levels
363
- base_t = t_table_05[df_int] || t_table_05[29]
364
- base_t * ((0.05 / alpha)**0.5)
401
+ # Use Cornish-Fisher expansion for general case
402
+ # This is the method used in most statistical software
403
+
404
+ # Base normal quantile
405
+ t = z
406
+
407
+ # First-order correction - Cornish-Fisher expansion
408
+ # Standard form: (z³ + z)/(4ν)
409
+ if df >= 4
410
+ c1 = ((z**3) + z) / 4.0
411
+ t += c1 / df
412
+ end
413
+
414
+ # Second-order correction - Cornish-Fisher expansion
415
+ # Standard form: (5z⁵ + 16z³ + 3z)/(96ν²)
416
+ if df >= 6
417
+ c2 = ((5.0 * (z**5)) + (16.0 * (z**3)) + (3.0 * z)) / 96.0
418
+ t += c2 / (df**2)
419
+ end
420
+
421
+ # Third-order correction for better accuracy
422
+ # Standard form: (3z⁷ + 19z⁵ + 17z³ - 15z)/(384ν³)
423
+ if df >= 8
424
+ c3 = ((3.0 * (z**7)) + (19.0 * (z**5)) + (17.0 * (z**3)) - (15.0 * z)) / 384.0
425
+ t += c3 / (df**3)
426
+ end
427
+
428
+ # Fourth-order correction — NOTE(review): powers/coefficients differ from the standard
+ # Cornish-Fisher term (79z⁹ + 776z⁷ + 1482z⁵ − 1920z³ − 945z)/92160ν⁴; verify against reference
429
+ # More conservative approach for high accuracy
430
+ if df >= 12
431
+ c4 = ((79.0 * (z**7)) + (776.0 * (z**5)) + (1482.0 * (z**3)) + (776.0 * z)) / CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR
432
+ t += c4 / (df**4)
365
433
  end
434
+
435
+ # For small degrees of freedom, apply additional small-sample correction
436
+ if df < 8
437
+ # Edgeworth expansion adjustment for small df
438
+ delta = 1.0 / (EDGEWORTH_SMALL_SAMPLE_COEFF * df)
439
+ small_sample_correction = z * delta * ((z**2) + 1.0)
440
+ t += small_sample_correction
441
+ end
442
+
443
+ t
366
444
  end
367
445
 
368
- # Returns the critical value for standard normal distribution (z-score)
369
- # Used when degrees of freedom is large (≥30)
446
+ # Calculates the inverse normal CDF (quantile function) using the Abramowitz & Stegun rational approximation
447
+ # This is more accurate than the previous hard-coded approach
370
448
  #
371
- # @param alpha [Float] Significance level
372
- # @return [Float] Critical z-value for one-tailed test
373
- def normal_critical_value(alpha)
374
- # Common z-values for one-tailed tests
375
- # Use approximate comparisons to avoid float equality issues
376
- if (alpha - 0.10).abs < 1e-10
377
- 1.282
378
- elsif (alpha - 0.05).abs < 1e-10
379
- 1.645
380
- elsif (alpha - 0.025).abs < 1e-10
381
- 1.960
382
- elsif (alpha - 0.01).abs < 1e-10
383
- 2.326
384
- elsif (alpha - 0.005).abs < 1e-10
385
- 2.576
449
+ # @param alpha [Float] Significance level (0 < alpha < 1)
450
+ # @return [Float] Z-score corresponding to the upper-tail probability alpha
451
+ def inverse_normal_cdf(alpha)
452
+ # Handle edge cases
453
+ return Float::INFINITY if alpha <= 0
454
+ return -Float::INFINITY if alpha >= 1
455
+
456
+ # For common values, use high-precision constants to maintain backward compatibility
457
+ # Use epsilon-based comparisons to avoid floating point precision issues
458
+ COMMON_ALPHA_VALUES.each do |target_alpha, z_score|
459
+ return z_score if (alpha - target_alpha).abs < EPSILON
460
+ end
461
+
462
+ # Use the Abramowitz & Stegun 26.2.23 rational approximation for other values
463
+ # This is accurate to about 4.5e-4 absolute error
464
+
465
+ # Transform to work with cumulative probability from left tail
466
+ p = 1.0 - alpha
467
+
468
+ # Handle symmetric case
469
+ if p > 0.5
470
+ sign = 1
471
+ p = 1.0 - p
386
472
  else
387
- # Approximation using inverse normal for other alpha values
388
- # This is a rough approximation of the inverse normal CDF
389
- # For α = 0.05, this gives approximately 1.645
390
- Math.sqrt(-2 * Math.log(alpha))
473
+ sign = -1
391
474
  end
475
+
476
+ # Constants for the approximation
477
+ if p >= BSM_THRESHOLD
478
+ # Rational approximation for central region
479
+ t = Math.sqrt(-2.0 * Math.log(p))
480
+
481
+ # Numerator coefficients
482
+ c0 = 2.515517
483
+ c1 = 0.802853
484
+ c2 = 0.010328
485
+
486
+ # Denominator coefficients
487
+ d0 = 1.000000
488
+ d1 = 1.432788
489
+ d2 = 0.189269
490
+ d3 = 0.001308
491
+
492
+ numerator = c0 + (c1 * t) + (c2 * (t**2))
493
+ denominator = d0 + (d1 * t) + (d2 * (t**2)) + (d3 * (t**3))
494
+
495
+ x = t - (numerator / denominator)
496
+ else
497
+ # For very small p, use asymptotic expansion
498
+ x = Math.sqrt(-2.0 * Math.log(p))
499
+ end
500
+
501
+ sign * x
392
502
  end
393
503
  end
394
504
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: enumerable-stats
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jon Daniel