enumerable-stats 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/enumerable-stats.rb +1 -1
- data/lib/enumerable_stats/enumerable_ext.rb +248 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4b7a1951101022de006735e6276e1db4a974d98a5ae23c617a0f0c54b116ec80
+  data.tar.gz: 0efb5538568ded644e36e5f0a5ffb70cd52c86f678c490751e8c9b5987e99e46
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 20ddf5dd46540ff3a3ce31de0a153babcb1f005556d782e82371dbc70ddf7f882960dd4bd3197fe43571072561a812fb1ccc3b6cb8d5cb9c87cedfd61e9e1c48
+  data.tar.gz: 8bcfa97b1be3d3a1cb6887b1aa1f2ec733250361fe8a5834ef090273c97e75aebab38e0ec27d6277a843ad7ef5f8176176e41b7e39fa5d4641ad3daf319a66aa
data/lib/enumerable_stats/enumerable_ext.rb
CHANGED
@@ -1,7 +1,55 @@
 # frozen_string_literal: true

 module EnumerableStats
+  # Extension module that adds statistical methods to all Enumerable objects.
+  #
+  # This module provides essential statistical functions including measures of central tendency
+  # (mean, median), measures of dispersion (variance, standard deviation), percentile calculations,
+  # outlier detection using the IQR method, and statistical comparison methods.
+  #
+  # When included, these methods become available on all Ruby collections that include
+  # Enumerable (Arrays, Ranges, Sets, etc.), enabling seamless statistical analysis
+  # without external dependencies.
+  #
+  # @example Basic statistical calculations
+  #   [1, 2, 3, 4, 5].mean #=> 3.0
+  #   [1, 2, 3, 4, 5].median #=> 3
+  #   [1, 2, 3, 4, 5].percentile(75) #=> 4.0
+  #
+  # @example Outlier detection
+  #   data = [1, 2, 3, 4, 100]
+  #   data.remove_outliers #=> [1, 2, 3, 4]
+  #   data.outlier_stats #=> { outliers_removed: 1, percentage: 20.0, ... }
+  #
+  # @example Statistical testing
+  #   control = [10, 12, 14, 16, 18]
+  #   treatment = [15, 17, 19, 21, 23]
+  #   control.t_value(treatment) #=> negative t-statistic
+  #   control.degrees_of_freedom(treatment) #=> degrees of freedom for Welch's t-test
+  #   treatment.greater_than?(control) #=> true (treatment mean significantly > control mean)
+  #   control.less_than?(treatment) #=> true (control mean significantly < treatment mean)
+  #
+  # @see Enumerable
+  # @since 0.1.0
   module EnumerableExt
+    # Epsilon for floating point comparisons to avoid precision issues
+    EPSILON = 1e-10
+
+    # Common alpha levels with their corresponding high-precision z-scores
+    # Used to avoid floating point comparison issues while maintaining backward compatibility
+    COMMON_ALPHA_VALUES = {
+      0.10 => 1.2815515655446004,
+      0.05 => 1.6448536269514722,
+      0.025 => 1.9599639845400545,
+      0.01 => 2.3263478740408408,
+      0.005 => 2.5758293035489004,
+      0.001 => 3.0902323061678132
+    }.freeze
+
+    CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR = 92_160.0
+    EDGEWORTH_SMALL_SAMPLE_COEFF = 4.0
+    BSM_THRESHOLD = 1e-20
+
     # Calculates the percentage difference between this collection's mean and another value or collection's mean
     # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
     # This is useful for comparing datasets or metrics where direction doesn't matter
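The new `COMMON_ALPHA_VALUES` table is keyed by Float literals, and `EPSILON` exists because callers rarely produce those keys bit-for-bit. A minimal sketch of the failure mode the epsilon comparison avoids (plain Ruby, not the gem's API):

```ruby
# 0.05 produced by arithmetic is not bit-identical to the literal 0.05,
# so an exact Hash lookup misses while an epsilon comparison matches.
alpha = 1 - 0.95                      #=> 0.050000000000000044
{ 0.05 => 1.6448536269514722 }[alpha] #=> nil (exact key lookup fails)
(alpha - 0.05).abs < 1e-10            #=> true (epsilon match succeeds)
```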
@@ -13,7 +61,7 @@ module EnumerableStats
       b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

       return 0.0 if a == b
-      return Float::INFINITY if a + b
+      return Float::INFINITY if (a + b).zero?

       ((a - b).abs / ((a + b) / 2.0).abs) * 100
     end
@@ -29,7 +77,7 @@ module EnumerableStats
       b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

       return 0.0 if a == b
-      return Float::INFINITY if a + b
+      return Float::INFINITY if (a + b).zero?

       ((a - b) / ((a + b) / 2.0).abs) * 100
     end
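Both hunks above guard the symmetric percentage formulas against a zero denominator: when the two means sum to zero, `(a + b) / 2.0` vanishes, so the method now short-circuits to `Float::INFINITY` explicitly. A quick illustration, assuming the gem is required as `enumerable-stats` and that these hunks belong to the `percentage_difference` method the surrounding doc comments describe (the method name is not visible in the diff itself):

```ruby
require "enumerable-stats"

# Means of 20.0 and -20.0 sum to zero, so the guard returns infinity
# rather than dividing by a zero denominator.
[10, 20, 30].percentage_difference([-10, -20, -30]) #=> Infinity

# With a nonzero denominator the formula applies normally:
# |20 - 25| / ((20 + 25) / 2) * 100 = 22.22...
[10, 20, 30].percentage_difference(25) #=> ~22.22
```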
@@ -70,12 +118,52 @@ module EnumerableStats

       n = (n1 + n2)**2

-      d1 = variance**2 / (count**2 * (count - 1))
-      d2 = other.variance**2 / (other.count**2 * (other.count - 1))
+      d1 = (variance**2) / ((count**2) * (count - 1))
+      d2 = (other.variance**2) / ((other.count**2) * (other.count - 1))

       n / (d1 + d2)
     end

+    # Tests if this collection's mean is significantly greater than another collection's mean
+    # using a one-tailed Student's t-test. Returns true if the test indicates statistical
+    # significance at the specified alpha level.
+    #
+    # @param other [Enumerable] Another collection to compare against
+    # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
+    # @return [Boolean] True if this collection's mean is significantly greater
+    # @example
+    #   control = [10, 12, 11, 13, 12] # mean ≈ 11.6
+    #   treatment = [15, 17, 16, 18, 14] # mean = 16.0
+    #   treatment.greater_than?(control) # => true (treatment significantly > control)
+    #   control.greater_than?(treatment) # => false
+    def greater_than?(other, alpha: 0.05)
+      t_stat = t_value(other)
+      df = degrees_of_freedom(other)
+      critical_value = critical_t_value(df, alpha)
+
+      t_stat > critical_value
+    end
+
+    # Tests if this collection's mean is significantly less than another collection's mean
+    # using a one-tailed Student's t-test. Returns true if the test indicates statistical
+    # significance at the specified alpha level.
+    #
+    # @param other [Enumerable] Another collection to compare against
+    # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
+    # @return [Boolean] True if this collection's mean is significantly less
+    # @example
+    #   control = [10, 12, 11, 13, 12] # mean ≈ 11.6
+    #   treatment = [15, 17, 16, 18, 14] # mean = 16.0
+    #   control.less_than?(treatment) # => true (control significantly < treatment)
+    #   treatment.less_than?(control) # => false
+    def less_than?(other, alpha: 0.05)
+      t_stat = t_value(other)
+      df = degrees_of_freedom(other)
+      critical_value = critical_t_value(df, alpha)
+
+      t_stat < -critical_value
+    end
+
     # Calculates the arithmetic mean (average) of the collection
     #
     # @return [Float] The arithmetic mean of all numeric values
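The new comparison predicates wrap the existing `t_value` and Welch `degrees_of_freedom` machinery in a one-tailed test against a critical value. A usage sketch based on the doc comments above (assuming the gem is loaded):

```ruby
require "enumerable-stats"

control   = [10, 12, 11, 13, 12] # mean ≈ 11.6
treatment = [15, 17, 16, 18, 14] # mean = 16.0

treatment.greater_than?(control) #=> true
control.less_than?(treatment)    #=> true
control.greater_than?(treatment) #=> false

# A stricter significance level raises the critical value:
treatment.greater_than?(control, alpha: 0.01)
```

Note that both predicates derive a single positive critical value from `critical_t_value`; `less_than?` simply tests the t-statistic against its negation.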
@@ -96,7 +184,7 @@ module EnumerableStats
     #   [5, 1, 3, 2, 4].median # => 3 (automatically sorts)
     #   [].median # => nil
     def median
-      return nil if size
+      return nil if size.zero?

       sorted = sort
       midpoint = size / 2
@@ -123,7 +211,7 @@ module EnumerableStats
     #   [1, 2, 3, 4, 5].percentile(100) # => 5 (maximum value)
     #   [].percentile(50) # => nil (empty collection)
     def percentile(percentile)
-      return nil if size
+      return nil if size.zero?

       unless percentile.is_a?(Numeric) && percentile >= 0 && percentile <= 100
         raise ArgumentError, "Percentile must be a number between 0 and 100, got #{percentile}"
@@ -132,7 +220,7 @@ module EnumerableStats
       sorted = sort

       # Handle edge cases
-      return sorted.first if percentile
+      return sorted.first if percentile.zero?
       return sorted.last if percentile == 100

       # Calculate the position using the "linear" method (R-7/Excel method)
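Between these edge-case guards and the interpolation in the next hunk, the R-7 (linear interpolation) position math works like this; a worked example in plain Ruby arithmetic, mirroring the implementation rather than calling it:

```ruby
sorted = [1, 2, 3, 4, 5]
p = 90

# R-7 position: (p / 100) * (n - 1)
pos = (p / 100.0) * (sorted.size - 1)              #=> 3.6
lower, upper = sorted[pos.floor], sorted[pos.ceil] #=> 4, 5
weight = pos - pos.floor                           #=> 0.6

lower + (weight * (upper - lower)) #=> 4.6, matching [1, 2, 3, 4, 5].percentile(90)
```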
@@ -151,7 +239,7 @@ module EnumerableStats
         lower_value = sorted[lower_index]
         upper_value = sorted[upper_index]

-        lower_value + weight * (upper_value - lower_value)
+        lower_value + (weight * (upper_value - lower_value))
       end
     end

@@ -164,7 +252,7 @@ module EnumerableStats
     #   [5, 5, 5, 5].variance # => 0.0 (no variation)
     def variance
       mean = self.mean
-      sum_of_squares =
+      sum_of_squares = sum { |r| (r - mean)**2 }
       sum_of_squares / (count - 1).to_f
     end

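The restored line computes the sum of squared deviations, so `variance` is the Bessel-corrected sample variance (denominator n - 1). A worked check of the arithmetic:

```ruby
values = [1, 2, 3, 4, 5]
mean = values.sum / values.size.to_f              #=> 3.0
sum_of_squares = values.sum { |r| (r - mean)**2 } #=> 10.0 (4 + 1 + 0 + 1 + 4)
sum_of_squares / (values.size - 1).to_f           #=> 2.5

[1, 2, 3, 4, 5].variance #=> 2.5 (standard deviation ≈ 1.58)
```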
@@ -204,7 +292,7 @@ module EnumerableStats
         lower_index = q1_pos.floor
         upper_index = q1_pos.ceil
         weight = q1_pos - q1_pos.floor
-        q1 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
+        q1 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
       end

       # Calculate Q3
@@ -214,7 +302,7 @@ module EnumerableStats
         lower_index = q3_pos.floor
         upper_index = q3_pos.ceil
         weight = q3_pos - q3_pos.floor
-        q3 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
+        q3 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
       end

       iqr = q3 - q1
@@ -224,7 +312,7 @@ module EnumerableStats
       upper_bound = q3 + (multiplier * iqr)

       # Filter out outliers
-      select { |value| value
+      select { |value| value.between?(lower_bound, upper_bound) }
     end

     # Returns statistics about outlier removal for debugging/logging
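After computing the Tukey fences, the rewritten filter keeps only values inside `[q1 - multiplier * iqr, q3 + multiplier * iqr]`; `between?` is inclusive on both ends. Usage per the module docs:

```ruby
data = [1, 2, 3, 4, 100]
# Q1 = 2, Q3 = 4, IQR = 2; with the 1.5 multiplier the fences are [-1, 7],
# so 100 is dropped.
data.remove_outliers #=> [1, 2, 3, 4]
data.outlier_stats   #=> includes outlier_percentage: 20.0
```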
@@ -247,5 +335,152 @@ module EnumerableStats
         outlier_percentage: ((original_count - filtered.size).to_f / original_count * 100).round(2)
       }
     end
+
+    private
+
+    # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
+    # Uses Hill's approximation (1970) for accurate inverse t-distribution calculation
+    #
+    # @param df [Float] Degrees of freedom
+    # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
+    # @return [Float] Critical t-value for one-tailed test
+    def critical_t_value(df, alpha)
+      # For very large df (≥1000), t-distribution is essentially normal
+      return inverse_normal_cdf(alpha) if df >= 1000
+
+      # Use Hill's approximation for inverse t-distribution
+      # This is more accurate than lookup tables and handles any df/alpha combination
+      inverse_t_distribution(df, alpha)
+    end
+
+    # Calculates the inverse t-distribution using Cornish-Fisher expansion
+    # This provides accurate critical t-values for any degrees of freedom and alpha level
+    # Based on methods used in statistical software like R and MATLAB
+    #
+    # @param df [Float] Degrees of freedom
+    # @param alpha [Float] Significance level for one-tailed test
+    # @return [Float] Critical t-value
+    def inverse_t_distribution(df, alpha)
+      # Handle boundary cases
+      return Float::INFINITY if df <= 0 || alpha <= 0
+      return -Float::INFINITY if alpha >= 1
+      return inverse_normal_cdf(alpha) if df >= 200 # Normal approximation for large df
+
+      # Get the corresponding normal quantile
+      z = inverse_normal_cdf(alpha)
+
+      # Special cases with exact solutions
+      if df == 1
+        # Cauchy distribution: exact inverse
+        return Math.tan(Math::PI * (0.5 - alpha))
+      elsif df == 2
+        # Exact formula for df=2: t = z / sqrt(1 - z^2/(z^2 + 2))
+        # This is more numerically stable
+        z_sq = z**2
+        # Exact formula for df=2: t = z / sqrt(1 - z^2/(z^2 + 2))
+        return z / Math.sqrt(1.0 - (z_sq / (z_sq + 2.0)))
+
+      end
+
+      # Use Cornish-Fisher expansion for general case
+      # This is the method used in most statistical software
+
+      # Base normal quantile
+      t = z
+
+      # First-order correction
+      if df >= 4
+        c1 = z / 4.0
+        t += c1 / df
+      end
+
+      # Second-order correction
+      if df >= 6
+        c2 = ((5.0 * (z**3)) + (16.0 * z)) / 96.0
+        t += c2 / (df**2)
+      end
+
+      # Third-order correction for better accuracy
+      if df >= 8
+        c3 = ((3.0 * (z**5)) + (19.0 * (z**3)) + (17.0 * z)) / 384.0
+        t += c3 / (df**3)
+      end
+
+      # Fourth-order correction for very high accuracy
+      if df >= 10
+        c4 = ((79.0 * (z**7)) + (776.0 * (z**5)) +
+              (1482.0 * (z**3)) + (776.0 * z)) / CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR
+
+        t += c4 / (df**4)
+      end
+
+      # For small degrees of freedom, apply additional small-sample correction
+      if df < 8
+        # Edgeworth expansion adjustment for small df
+        delta = 1.0 / (EDGEWORTH_SMALL_SAMPLE_COEFF * df)
+        small_sample_correction = z * delta * ((z**2) + 1.0)
+        t += small_sample_correction
+      end
+
+      t
+    end
+
+    # Calculates the inverse normal CDF (quantile function) using Beasley-Springer-Moro algorithm
+    # This is more accurate than the previous hard-coded approach
+    #
+    # @param alpha [Float] Significance level (0 < alpha < 1)
+    # @return [Float] Z-score corresponding to the upper-tail probability alpha
+    def inverse_normal_cdf(alpha)
+      # Handle edge cases
+      return Float::INFINITY if alpha <= 0
+      return -Float::INFINITY if alpha >= 1
+
+      # For common values, use high-precision constants to maintain backward compatibility
+      # Use epsilon-based comparisons to avoid floating point precision issues
+      COMMON_ALPHA_VALUES.each do |target_alpha, z_score|
+        return z_score if (alpha - target_alpha).abs < EPSILON
+      end
+
+      # Use Beasley-Springer-Moro algorithm for other values
+      # This is accurate to about 7 decimal places
+
+      # Transform to work with cumulative probability from left tail
+      p = 1.0 - alpha
+
+      # Handle symmetric case
+      if p > 0.5
+        sign = 1
+        p = 1.0 - p
+      else
+        sign = -1
+      end
+
+      # Constants for the approximation
+      if p >= BSM_THRESHOLD
+        # Rational approximation for central region
+        t = Math.sqrt(-2.0 * Math.log(p))
+
+        # Numerator coefficients
+        c0 = 2.515517
+        c1 = 0.802853
+        c2 = 0.010328
+
+        # Denominator coefficients
+        d0 = 1.000000
+        d1 = 1.432788
+        d2 = 0.189269
+        d3 = 0.001308
+
+        numerator = c0 + (c1 * t) + (c2 * (t**2))
+        denominator = d0 + (d1 * t) + (d2 * (t**2)) + (d3 * (t**3))
+
+        x = t - (numerator / denominator)
+      else
+        # For very small p, use asymptotic expansion
+        x = Math.sqrt(-2.0 * Math.log(p))
+      end
+
+      sign * x
+    end
   end
-end
+end
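The coefficients in `inverse_normal_cdf` are the classic Abramowitz & Stegun 26.2.23 rational approximation, whose absolute error is bounded by about 4.5e-4. A standalone sketch of the same computation for the upper-tail quantile (the method name `upper_tail_z` is hypothetical; this is the math the private helper performs for alphas not in `COMMON_ALPHA_VALUES`, not the gem's public API):

```ruby
# Upper-tail normal quantile via the A&S 26.2.23 rational approximation.
# For alpha < 0.5 this is equivalent to the gem's p = 1 - alpha transform
# followed by its symmetric-case reduction.
def upper_tail_z(alpha)
  t = Math.sqrt(-2.0 * Math.log(alpha))
  numerator   = 2.515517 + (0.802853 * t) + (0.010328 * (t**2))
  denominator = 1.0 + (1.432788 * t) + (0.189269 * (t**2)) + (0.001308 * (t**3))
  t - (numerator / denominator)
end

upper_tail_z(0.05) #=> ~1.6452 (exact: 1.6449)
upper_tail_z(0.01) #=> ~2.3268 (exact: 2.3263)
# The gem short-circuits these common alphas through COMMON_ALPHA_VALUES,
# so the approximation only handles uncommon significance levels.
```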
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: enumerable-stats
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.1
 platform: ruby
 authors:
 - Jon Daniel
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-08-
+date: 2025-08-02 00:00:00.000000000 Z
 dependencies: []
 description: |
   A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.