enumerable-stats 1.1.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5a52c483b38592d8be9155651e63a4d238850927bc3282f27f22a4859e1db1a
-  data.tar.gz: 0e807826f7103e049effcd1bd925279e5b1ea7d34932fddebf7167e184a6b887
+  metadata.gz: 4b7a1951101022de006735e6276e1db4a974d98a5ae23c617a0f0c54b116ec80
+  data.tar.gz: 0efb5538568ded644e36e5f0a5ffb70cd52c86f678c490751e8c9b5987e99e46
 SHA512:
-  metadata.gz: df23176506fd05b2769fc6ee49ed62bc7954ec9f810d6153a69754df90a3cbed7b4744e269c7f26b08b1e2bb56b0384d3c1a46bdacc7a57ecba7f9029c4bcb19
-  data.tar.gz: 8e80e9d596018a704773ca2836f951d1565876b35aedf7428117a61d031f685ea788ac115d02f04fc7b27ab9b06ebae5a617fe891a97130b8da628cdee807c20
+  metadata.gz: 20ddf5dd46540ff3a3ce31de0a153babcb1f005556d782e82371dbc70ddf7f882960dd4bd3197fe43571072561a812fb1ccc3b6cb8d5cb9c87cedfd61e9e1c48
+  data.tar.gz: 8bcfa97b1be3d3a1cb6887b1aa1f2ec733250361fe8a5834ef090273c97e75aebab38e0ec27d6277a843ad7ef5f8176176e41b7e39fa5d4641ad3daf319a66aa
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-require_relative 'enumerable_stats/enumerable_ext'
+require_relative "enumerable_stats/enumerable_ext"
 
 module Enumerable
   include EnumerableStats::EnumerableExt
@@ -1,7 +1,55 @@
 # frozen_string_literal: true
 
 module EnumerableStats
+  # Extension module that adds statistical methods to all Enumerable objects.
+  #
+  # This module provides essential statistical functions, including measures of central tendency
+  # (mean, median), measures of dispersion (variance, standard deviation), percentile calculations,
+  # outlier detection using the IQR method, and statistical comparison methods.
+  #
+  # When included, these methods become available on all Ruby collections that include
+  # Enumerable (Arrays, Ranges, Sets, etc.), enabling seamless statistical analysis
+  # without external dependencies.
+  #
+  # @example Basic statistical calculations
+  #   [1, 2, 3, 4, 5].mean #=> 3.0
+  #   [1, 2, 3, 4, 5].median #=> 3
+  #   [1, 2, 3, 4, 5].percentile(75) #=> 4.0
+  #
+  # @example Outlier detection
+  #   data = [1, 2, 3, 4, 100]
+  #   data.remove_outliers #=> [1, 2, 3, 4]
+  #   data.outlier_stats #=> { outliers_removed: 1, outlier_percentage: 20.0, ... }
+  #
+  # @example Statistical testing
+  #   control = [10, 12, 14, 16, 18]
+  #   treatment = [15, 17, 19, 21, 23]
+  #   control.t_value(treatment) #=> negative t-statistic
+  #   control.degrees_of_freedom(treatment) #=> degrees of freedom for Welch's t-test
+  #   treatment.greater_than?(control) #=> true (treatment mean significantly > control mean)
+  #   control.less_than?(treatment) #=> true (control mean significantly < treatment mean)
+  #
+  # @see Enumerable
+  # @since 0.1.0
   module EnumerableExt
+    # Epsilon for floating-point comparisons to avoid precision issues
+    EPSILON = 1e-10
+
+    # Common alpha levels with their corresponding high-precision z-scores
+    # Used to avoid floating-point comparison issues while maintaining backward compatibility
+    COMMON_ALPHA_VALUES = {
+      0.10 => 1.2815515655446004,
+      0.05 => 1.6448536269514722,
+      0.025 => 1.9599639845400545,
+      0.01 => 2.3263478740408408,
+      0.005 => 2.5758293035489004,
+      0.001 => 3.0902323061678132
+    }.freeze
+
+    CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR = 92_160.0
+    EDGEWORTH_SMALL_SAMPLE_COEFF = 4.0
+    BSM_THRESHOLD = 1e-20
+
     # Calculates the percentage difference between this collection's mean and another value or collection's mean
     # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
     # This is useful for comparing datasets or metrics where direction doesn't matter
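The expanded module documentation above is the bulk of this release. As it notes, the methods land on anything Enumerable once the gem is loaded. A minimal console sketch, assuming the top-level require mirrors the gem name (the entry-point path is not shown in this diff):

    require "enumerable-stats"  # assumed entry point matching the gem name

    [1, 2, 3, 4, 5].mean            #=> 3.0
    (1..5).median                   #=> 3
    [1, 2, 3, 4, 5].percentile(75)  #=> 4.0

    require "set"
    Set[2, 4, 6, 8].variance        #=> 6.666666666666667 (sample variance, n - 1 denominator)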
@@ -13,7 +61,7 @@ module EnumerableStats
       b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
 
       return 0.0 if a == b
-      return Float::INFINITY if a + b == 0
+      return Float::INFINITY if (a + b).zero?
 
       ((a - b).abs / ((a + b) / 2.0).abs) * 100
     end
@@ -29,7 +77,7 @@ module EnumerableStats
       b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
 
       return 0.0 if a == b
-      return Float::INFINITY if a + b == 0
+      return Float::INFINITY if (a + b).zero?
 
       ((a - b) / ((a + b) / 2.0).abs) * 100
     end
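Both percentage-difference hunks are the same predicate-style cleanup (`(a + b).zero?` instead of `== 0`); the math is unchanged. Written out for means a = 4.0 and b = 5.0, the symmetric form divides the gap by the average of the two values, so both orderings give the same magnitude, while the signed form keeps direction. A sketch, assuming the doc comments map to method names percentage_difference and signed_percentage_difference (the def lines are outside these hunks, so treat the names as illustrative):

    a = [2, 4, 6] # mean 4.0
    b = [4, 5, 6] # mean 5.0

    a.percentage_difference(b)        #=> 22.22... (|4 - 5| / 4.5 * 100)
    b.percentage_difference(a)        #=> 22.22... (order does not matter)
    a.signed_percentage_difference(b) #=> -22.22... (a's mean sits below b's)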
@@ -70,12 +118,52 @@ module EnumerableStats
 
       n = (n1 + n2)**2
 
-      d1 = variance**2 / (count**2 * (count - 1))
-      d2 = other.variance**2 / (other.count**2 * (other.count - 1))
+      d1 = (variance**2) / ((count**2) * (count - 1))
+      d2 = (other.variance**2) / ((other.count**2) * (other.count - 1))
 
       n / (d1 + d2)
     end
 
+    # Tests if this collection's mean is significantly greater than another collection's mean
+    # using a one-tailed Welch's t-test. Returns true if the test indicates statistical
+    # significance at the specified alpha level.
+    #
+    # @param other [Enumerable] Another collection to compare against
+    # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
+    # @return [Boolean] True if this collection's mean is significantly greater
+    # @example
+    #   control = [10, 12, 11, 13, 12]   # mean ≈ 11.6
+    #   treatment = [15, 17, 16, 18, 14] # mean = 16.0
+    #   treatment.greater_than?(control) # => true (treatment significantly > control)
+    #   control.greater_than?(treatment) # => false
+    def greater_than?(other, alpha: 0.05)
+      t_stat = t_value(other)
+      df = degrees_of_freedom(other)
+      critical_value = critical_t_value(df, alpha)
+
+      t_stat > critical_value
+    end
+
+    # Tests if this collection's mean is significantly less than another collection's mean
+    # using a one-tailed Welch's t-test. Returns true if the test indicates statistical
+    # significance at the specified alpha level.
+    #
+    # @param other [Enumerable] Another collection to compare against
+    # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
+    # @return [Boolean] True if this collection's mean is significantly less
+    # @example
+    #   control = [10, 12, 11, 13, 12]   # mean ≈ 11.6
+    #   treatment = [15, 17, 16, 18, 14] # mean = 16.0
+    #   control.less_than?(treatment)    # => true (control significantly < treatment)
+    #   treatment.less_than?(control)    # => false
+    def less_than?(other, alpha: 0.05)
+      t_stat = t_value(other)
+      df = degrees_of_freedom(other)
+      critical_value = critical_t_value(df, alpha)
+
+      t_stat < -critical_value
+    end
+
     # Calculates the arithmetic mean (average) of the collection
     #
     # @return [Float] The arithmetic mean of all numeric values
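The degrees_of_freedom change is parenthesization only; the underlying formula is the Welch–Satterthwaite approximation, df = (s₁²/n₁ + s₂²/n₂)² / [(s₁²/n₁)²/(n₁ − 1) + (s₂²/n₂)²/(n₂ − 1)], which the new comparison helpers feed into critical_t_value. Re-using the data from the doc comments (boolean results per those examples; the df value below is hand-computed from the formula):

    control   = [10, 12, 11, 13, 12]  # mean 11.6, sample variance 1.3
    treatment = [15, 17, 16, 18, 14]  # mean 16.0, sample variance 2.5

    control.degrees_of_freedom(treatment) #=> ≈7.27 (not n1 + n2 - 2, since variances differ)
    treatment.greater_than?(control)      #=> true
    control.less_than?(treatment)         #=> true
    treatment.greater_than?(control, alpha: 0.01) # stricter threshold, same mechanics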
@@ -96,7 +184,7 @@ module EnumerableStats
     #   [5, 1, 3, 2, 4].median # => 3 (automatically sorts)
     #   [].median # => nil
     def median
-      return nil if size == 0
+      return nil if size.zero?
 
       sorted = sort
       midpoint = size / 2
@@ -123,7 +211,7 @@ module EnumerableStats
     #   [1, 2, 3, 4, 5].percentile(100) # => 5 (maximum value)
     #   [].percentile(50) # => nil (empty collection)
     def percentile(percentile)
-      return nil if size == 0
+      return nil if size.zero?
 
       unless percentile.is_a?(Numeric) && percentile >= 0 && percentile <= 100
         raise ArgumentError, "Percentile must be a number between 0 and 100, got #{percentile}"
@@ -132,7 +220,7 @@ module EnumerableStats
       sorted = sort
 
       # Handle edge cases
-      return sorted.first if percentile == 0
+      return sorted.first if percentile.zero?
       return sorted.last if percentile == 100
 
       # Calculate the position using the "linear" method (R-7/Excel method)
@@ -151,7 +239,7 @@ module EnumerableStats
         lower_value = sorted[lower_index]
         upper_value = sorted[upper_index]
 
-        lower_value + weight * (upper_value - lower_value)
+        lower_value + (weight * (upper_value - lower_value))
       end
     end
 
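These percentile hunks only add explicit parentheses, but the surrounding method rewards a worked example: under the R-7/Excel rule named in the comment, the rank is (n − 1) × p/100, and the fractional part of the rank blends the two neighboring order statistics. For instance:

    data = [1, 2, 3, 4]
    # rank  = (4 - 1) * 0.75 = 2.25
    # lower = data[2] = 3, upper = data[3] = 4, weight = 0.25
    data.percentile(75) #=> 3.25 (3 + 0.25 * (4 - 3))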
@@ -164,7 +252,7 @@ module EnumerableStats
     #   [5, 5, 5, 5].variance # => 0.0 (no variation)
     def variance
       mean = self.mean
-      sum_of_squares = map { |r| (r - mean)**2 }.sum
+      sum_of_squares = sum { |r| (r - mean)**2 }
       sum_of_squares / (count - 1).to_f
     end
 
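The variance change swaps map { ... }.sum for sum { ... }, which skips the intermediate array but computes the same sum of squared deviations; the (count - 1) denominator makes this the Bessel-corrected sample variance. A quick check with a small dataset:

    data = [2, 4, 4, 4, 5, 5, 7, 9]
    data.mean     #=> 5.0
    data.variance #=> 4.571428571428571 (32 / 7.0; a population variance would be 32 / 8.0 = 4.0)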
@@ -204,7 +292,7 @@ module EnumerableStats
         lower_index = q1_pos.floor
         upper_index = q1_pos.ceil
         weight = q1_pos - q1_pos.floor
-        q1 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
+        q1 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
       end
 
       # Calculate Q3
@@ -214,7 +302,7 @@ module EnumerableStats
         lower_index = q3_pos.floor
         upper_index = q3_pos.ceil
         weight = q3_pos - q3_pos.floor
-        q3 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
+        q3 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
       end
 
       iqr = q3 - q1
@@ -224,7 +312,7 @@ module EnumerableStats
       upper_bound = q3 + (multiplier * iqr)
 
       # Filter out outliers
-      select { |value| value >= lower_bound && value <= upper_bound }
+      select { |value| value.between?(lower_bound, upper_bound) }
     end
 
     # Returns statistics about outlier removal for debugging/logging
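The filter rewrite to value.between?(lower_bound, upper_bound) keeps the same inclusive Tukey fences as the old chained comparison: anything outside q1 − multiplier × IQR or q3 + multiplier × IQR is dropped. Tracing the module doc's own example (1.5 appears to be the default multiplier, given the doc's parameterless call):

    data = [1, 2, 3, 4, 100]
    # sorted -> q1 = 2, q3 = 4, iqr = 2, fences at -1 and 7
    data.remove_outliers #=> [1, 2, 3, 4]
    data.outlier_stats   #=> { ..., outlier_percentage: 20.0 }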
@@ -247,5 +335,152 @@ module EnumerableStats
         outlier_percentage: ((original_count - filtered.size).to_f / original_count * 100).round(2)
       }
     end
+
+    private
+
+    # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
+    # Uses a Cornish-Fisher expansion of the inverse t-distribution (see #inverse_t_distribution)
+    #
+    # @param df [Float] Degrees of freedom
+    # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
+    # @return [Float] Critical t-value for one-tailed test
+    def critical_t_value(df, alpha)
+      # For very large df (≥1000), t-distribution is essentially normal
+      return inverse_normal_cdf(alpha) if df >= 1000
+
+      # Use the Cornish-Fisher-based inverse t-distribution
+      # This is more accurate than lookup tables and handles any df/alpha combination
+      inverse_t_distribution(df, alpha)
+    end
+
+    # Calculates the inverse t-distribution using Cornish-Fisher expansion
+    # This provides accurate critical t-values for any degrees of freedom and alpha level
+    # Based on methods used in statistical software like R and MATLAB
+    #
+    # @param df [Float] Degrees of freedom
+    # @param alpha [Float] Significance level for one-tailed test
+    # @return [Float] Critical t-value
+    def inverse_t_distribution(df, alpha)
+      # Handle boundary cases
+      return Float::INFINITY if df <= 0 || alpha <= 0
+      return -Float::INFINITY if alpha >= 1
+      return inverse_normal_cdf(alpha) if df >= 200 # Normal approximation for large df
+
+      # Get the corresponding normal quantile
+      z = inverse_normal_cdf(alpha)
+
+      # Special cases with closed-form solutions
+      if df == 1
+        # Cauchy distribution: exact inverse
+        return Math.tan(Math::PI * (0.5 - alpha))
+      elsif df == 2
+        # Closed-form formula for df=2: t = z / sqrt(1 - z^2/(z^2 + 2))
+        # This is more numerically stable
+        z_sq = z**2
+        return z / Math.sqrt(1.0 - (z_sq / (z_sq + 2.0)))
+      end
+
+      # Use Cornish-Fisher expansion for the general case
+      # This is the method used in most statistical software
+
+      # Base normal quantile
+      t = z
+
+      # First-order correction
+      if df >= 4
+        c1 = z / 4.0
+        t += c1 / df
+      end
+
+      # Second-order correction
+      if df >= 6
+        c2 = ((5.0 * (z**3)) + (16.0 * z)) / 96.0
+        t += c2 / (df**2)
+      end
+
+      # Third-order correction for better accuracy
+      if df >= 8
+        c3 = ((3.0 * (z**5)) + (19.0 * (z**3)) + (17.0 * z)) / 384.0
+        t += c3 / (df**3)
+      end
+
+      # Fourth-order correction for very high accuracy
+      if df >= 10
+        c4 = ((79.0 * (z**7)) + (776.0 * (z**5)) +
+              (1482.0 * (z**3)) + (776.0 * z)) / CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR
+        t += c4 / (df**4)
+      end
+
+      # For small degrees of freedom, apply additional small-sample correction
+      if df < 8
+        # Edgeworth expansion adjustment for small df
+        delta = 1.0 / (EDGEWORTH_SMALL_SAMPLE_COEFF * df)
+        small_sample_correction = z * delta * ((z**2) + 1.0)
+        t += small_sample_correction
+      end
+
+      t
+    end
+
+    # Calculates the inverse normal CDF (quantile function) using the Beasley-Springer-Moro algorithm
+    # This is more accurate than the previous hard-coded approach
+    #
+    # @param alpha [Float] Significance level (0 < alpha < 1)
+    # @return [Float] Z-score corresponding to the upper-tail probability alpha
+    def inverse_normal_cdf(alpha)
+      # Handle edge cases
+      return Float::INFINITY if alpha <= 0
+      return -Float::INFINITY if alpha >= 1
+
+      # For common values, use high-precision constants to maintain backward compatibility
+      # Use epsilon-based comparisons to avoid floating-point precision issues
+      COMMON_ALPHA_VALUES.each do |target_alpha, z_score|
+        return z_score if (alpha - target_alpha).abs < EPSILON
+      end
+
+      # Use the Beasley-Springer-Moro algorithm for other values
+      # This is accurate to about 7 decimal places
+
+      # Transform to work with cumulative probability from the left tail
+      p = 1.0 - alpha
+
+      # Handle the symmetric case
+      if p > 0.5
+        sign = 1
+        p = 1.0 - p
+      else
+        sign = -1
+      end
+
+      # Constants for the approximation
+      if p >= BSM_THRESHOLD
+        # Rational approximation for the central region
+        t = Math.sqrt(-2.0 * Math.log(p))
+
+        # Numerator coefficients
+        c0 = 2.515517
+        c1 = 0.802853
+        c2 = 0.010328
+
+        # Denominator coefficients
+        d0 = 1.000000
+        d1 = 1.432788
+        d2 = 0.189269
+        d3 = 0.001308
+
+        numerator = c0 + (c1 * t) + (c2 * (t**2))
+        denominator = d0 + (d1 * t) + (d2 * (t**2)) + (d3 * (t**3))
+
+        x = t - (numerator / denominator)
+      else
+        # For very small p, use asymptotic expansion
+        x = Math.sqrt(-2.0 * Math.log(p))
+      end
+
+      sign * x
+    end
   end
-end
+end
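The three private helpers above are the substance of this release: critical_t_value dispatches to a closed form where one exists and to the series expansion otherwise. Since the helpers are private, a console sanity check has to go through send; the df = 1 and df ≥ 1000 branches can be verified directly against standard one-tailed tables (6.314 and 1.645 at alpha = 0.05), while intermediate df take the Cornish-Fisher path. A sketch, again assuming the require path matches the gem name:

    require "enumerable-stats"  # assumed entry point matching the gem name

    probe = [0] # receiver is irrelevant; the helpers only use their arguments

    probe.send(:critical_t_value, 1, 0.05)    #=> 6.313751514675043 (Math.tan(Math::PI * 0.45); table value 6.314)
    probe.send(:critical_t_value, 1000, 0.05) #=> 1.6448536269514722 (normal-quantile fast path)
    probe.send(:critical_t_value, 10, 0.05)   # series-expansion path; one-tailed table value for df = 10 is 1.812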
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: enumerable-stats
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.1
 platform: ruby
 authors:
 - Jon Daniel
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-08-01 00:00:00.000000000 Z
+date: 2025-08-02 00:00:00.000000000 Z
 dependencies: []
 description: |
   A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.