enumerable-stats 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/enumerable-stats.rb +1 -1
- data/lib/enumerable_stats/enumerable_ext.rb +248 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4b7a1951101022de006735e6276e1db4a974d98a5ae23c617a0f0c54b116ec80
+  data.tar.gz: 0efb5538568ded644e36e5f0a5ffb70cd52c86f678c490751e8c9b5987e99e46
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 20ddf5dd46540ff3a3ce31de0a153babcb1f005556d782e82371dbc70ddf7f882960dd4bd3197fe43571072561a812fb1ccc3b6cb8d5cb9c87cedfd61e9e1c48
+  data.tar.gz: 8bcfa97b1be3d3a1cb6887b1aa1f2ec733250361fe8a5834ef090273c97e75aebab38e0ec27d6277a843ad7ef5f8176176e41b7e39fa5d4641ad3daf319a66aa
data/lib/enumerable_stats/enumerable_ext.rb
CHANGED
@@ -1,7 +1,55 @@
 # frozen_string_literal: true

 module EnumerableStats
+  # Extension module that adds statistical methods to all Enumerable objects.
+  #
+  # This module provides essential statistical functions including measures of central tendency
+  # (mean, median), measures of dispersion (variance, standard deviation), percentile calculations,
+  # outlier detection using the IQR method, and statistical comparison methods.
+  #
+  # When included, these methods become available on all Ruby collections that include
+  # Enumerable (Arrays, Ranges, Sets, etc.), enabling seamless statistical analysis
+  # without external dependencies.
+  #
+  # @example Basic statistical calculations
+  #   [1, 2, 3, 4, 5].mean #=> 3.0
+  #   [1, 2, 3, 4, 5].median #=> 3
+  #   [1, 2, 3, 4, 5].percentile(75) #=> 4.0
+  #
+  # @example Outlier detection
+  #   data = [1, 2, 3, 4, 100]
+  #   data.remove_outliers #=> [1, 2, 3, 4]
+  #   data.outlier_stats #=> { outliers_removed: 1, percentage: 20.0, ... }
+  #
+  # @example Statistical testing
+  #   control = [10, 12, 14, 16, 18]
+  #   treatment = [15, 17, 19, 21, 23]
+  #   control.t_value(treatment) #=> negative t-statistic
+  #   control.degrees_of_freedom(treatment) #=> degrees of freedom for Welch's t-test
+  #   treatment.greater_than?(control) #=> true (treatment mean significantly > control mean)
+  #   control.less_than?(treatment) #=> true (control mean significantly < treatment mean)
+  #
+  # @see Enumerable
+  # @since 0.1.0
   module EnumerableExt
+    # Epsilon for floating point comparisons to avoid precision issues
+    EPSILON = 1e-10
+
+    # Common alpha levels with their corresponding high-precision z-scores
+    # Used to avoid floating point comparison issues while maintaining backward compatibility
+    COMMON_ALPHA_VALUES = {
+      0.10 => 1.2815515655446004,
+      0.05 => 1.6448536269514722,
+      0.025 => 1.9599639845400545,
+      0.01 => 2.3263478740408408,
+      0.005 => 2.5758293035489004,
+      0.001 => 3.0902323061678132
+    }.freeze
+
+    CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR = 92_160.0
+    EDGEWORTH_SMALL_SAMPLE_COEFF = 4.0
+    BSM_THRESHOLD = 1e-20
+
     # Calculates the percentage difference between this collection's mean and another value or collection's mean
     # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
     # This is useful for comparing datasets or metrics where direction doesn't matter
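The new `COMMON_ALPHA_VALUES` table is keyed by Float literals, and `EPSILON` exists because callers rarely produce those keys bit-for-bit. A minimal sketch of the failure mode the epsilon comparison avoids (plain Ruby, not the gem's API):

```ruby
# 0.05 produced by arithmetic is not bit-identical to the literal 0.05,
# so an exact Hash lookup misses while an epsilon comparison matches.
alpha = 1 - 0.95                      #=> 0.050000000000000044
{ 0.05 => 1.6448536269514722 }[alpha] #=> nil (exact key lookup fails)
(alpha - 0.05).abs < 1e-10            #=> true (epsilon match succeeds)
```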
@@ -13,7 +61,7 @@ module EnumerableStats
       b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

       return 0.0 if a == b
-      return Float::INFINITY if a + b
+      return Float::INFINITY if (a + b).zero?

       ((a - b).abs / ((a + b) / 2.0).abs) * 100
     end
@@ -29,7 +77,7 @@ module EnumerableStats
       b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

       return 0.0 if a == b
-      return Float::INFINITY if a + b
+      return Float::INFINITY if (a + b).zero?

       ((a - b) / ((a + b) / 2.0).abs) * 100
     end
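Both hunks above guard the symmetric percentage formulas against a zero denominator: when the two means sum to zero, `(a + b) / 2.0` vanishes, so the method now short-circuits to `Float::INFINITY` explicitly. A quick illustration, assuming the gem is required as `enumerable-stats` and that these hunks belong to the `percentage_difference` method the surrounding doc comments describe (the method name is not visible in the diff itself):

```ruby
require "enumerable-stats"

# Means of 20.0 and -20.0 sum to zero, so the guard returns infinity
# rather than dividing by a zero denominator.
[10, 20, 30].percentage_difference([-10, -20, -30]) #=> Infinity

# With a nonzero denominator the formula applies normally:
# |20 - 25| / ((20 + 25) / 2) * 100 = 22.22...
[10, 20, 30].percentage_difference(25) #=> ~22.22
```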
@@ -70,12 +118,52 @@ module EnumerableStats

       n = (n1 + n2)**2

-      d1 = variance**2 / (count**2 * (count - 1))
-      d2 = other.variance**2 / (other.count**2 * (other.count - 1))
+      d1 = (variance**2) / ((count**2) * (count - 1))
+      d2 = (other.variance**2) / ((other.count**2) * (other.count - 1))

       n / (d1 + d2)
     end

+    # Tests if this collection's mean is significantly greater than another collection's mean
+    # using a one-tailed Student's t-test. Returns true if the test indicates statistical
+    # significance at the specified alpha level.
+    #
+    # @param other [Enumerable] Another collection to compare against
+    # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
+    # @return [Boolean] True if this collection's mean is significantly greater
+    # @example
+    #   control = [10, 12, 11, 13, 12] # mean ≈ 11.6
+    #   treatment = [15, 17, 16, 18, 14] # mean = 16.0
+    #   treatment.greater_than?(control) # => true (treatment significantly > control)
+    #   control.greater_than?(treatment) # => false
+    def greater_than?(other, alpha: 0.05)
+      t_stat = t_value(other)
+      df = degrees_of_freedom(other)
+      critical_value = critical_t_value(df, alpha)
+
+      t_stat > critical_value
+    end
+
+    # Tests if this collection's mean is significantly less than another collection's mean
+    # using a one-tailed Student's t-test. Returns true if the test indicates statistical
+    # significance at the specified alpha level.
+    #
+    # @param other [Enumerable] Another collection to compare against
+    # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
+    # @return [Boolean] True if this collection's mean is significantly less
+    # @example
+    #   control = [10, 12, 11, 13, 12] # mean ≈ 11.6
+    #   treatment = [15, 17, 16, 18, 14] # mean = 16.0
+    #   control.less_than?(treatment) # => true (control significantly < treatment)
+    #   treatment.less_than?(control) # => false
+    def less_than?(other, alpha: 0.05)
+      t_stat = t_value(other)
+      df = degrees_of_freedom(other)
+      critical_value = critical_t_value(df, alpha)
+
+      t_stat < -critical_value
+    end
+
     # Calculates the arithmetic mean (average) of the collection
     #
     # @return [Float] The arithmetic mean of all numeric values
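The new comparison predicates wrap the existing `t_value` and Welch `degrees_of_freedom` machinery in a one-tailed test against a critical value. A usage sketch based on the doc comments above (assuming the gem is loaded):

```ruby
require "enumerable-stats"

control   = [10, 12, 11, 13, 12] # mean ≈ 11.6
treatment = [15, 17, 16, 18, 14] # mean = 16.0

treatment.greater_than?(control) #=> true
control.less_than?(treatment)    #=> true
control.greater_than?(treatment) #=> false

# A stricter significance level raises the critical value:
treatment.greater_than?(control, alpha: 0.01)
```

Note that both predicates derive a single positive critical value from `critical_t_value`; `less_than?` simply tests the t-statistic against its negation.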
@@ -96,7 +184,7 @@ module EnumerableStats
     #   [5, 1, 3, 2, 4].median # => 3 (automatically sorts)
     #   [].median # => nil
     def median
-      return nil if size
+      return nil if size.zero?

       sorted = sort
       midpoint = size / 2
@@ -123,7 +211,7 @@ module EnumerableStats
     #   [1, 2, 3, 4, 5].percentile(100) # => 5 (maximum value)
     #   [].percentile(50) # => nil (empty collection)
     def percentile(percentile)
-      return nil if size
+      return nil if size.zero?

       unless percentile.is_a?(Numeric) && percentile >= 0 && percentile <= 100
         raise ArgumentError, "Percentile must be a number between 0 and 100, got #{percentile}"
@@ -132,7 +220,7 @@ module EnumerableStats
       sorted = sort

       # Handle edge cases
-      return sorted.first if percentile
+      return sorted.first if percentile.zero?
       return sorted.last if percentile == 100

       # Calculate the position using the "linear" method (R-7/Excel method)
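Between these edge-case guards and the interpolation in the next hunk, the R-7 (linear interpolation) position math works like this; a worked example in plain Ruby arithmetic, mirroring the implementation rather than calling it:

```ruby
sorted = [1, 2, 3, 4, 5]
p = 90

# R-7 position: (p / 100) * (n - 1)
pos = (p / 100.0) * (sorted.size - 1)              #=> 3.6
lower, upper = sorted[pos.floor], sorted[pos.ceil] #=> 4, 5
weight = pos - pos.floor                           #=> 0.6

lower + (weight * (upper - lower)) #=> 4.6, matching [1, 2, 3, 4, 5].percentile(90)
```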
@@ -151,7 +239,7 @@ module EnumerableStats
         lower_value = sorted[lower_index]
         upper_value = sorted[upper_index]

-        lower_value + weight * (upper_value - lower_value)
+        lower_value + (weight * (upper_value - lower_value))
       end
     end

@@ -164,7 +252,7 @@ module EnumerableStats
     #   [5, 5, 5, 5].variance # => 0.0 (no variation)
     def variance
       mean = self.mean
-      sum_of_squares =
+      sum_of_squares = sum { |r| (r - mean)**2 }
       sum_of_squares / (count - 1).to_f
     end

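The restored line computes the sum of squared deviations, so `variance` is the Bessel-corrected sample variance (denominator n - 1). A worked check of the arithmetic:

```ruby
values = [1, 2, 3, 4, 5]
mean = values.sum / values.size.to_f              #=> 3.0
sum_of_squares = values.sum { |r| (r - mean)**2 } #=> 10.0 (4 + 1 + 0 + 1 + 4)
sum_of_squares / (values.size - 1).to_f           #=> 2.5

[1, 2, 3, 4, 5].variance #=> 2.5 (standard deviation ≈ 1.58)
```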
@@ -204,7 +292,7 @@ module EnumerableStats
         lower_index = q1_pos.floor
         upper_index = q1_pos.ceil
         weight = q1_pos - q1_pos.floor
-        q1 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
+        q1 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
       end

       # Calculate Q3
@@ -214,7 +302,7 @@ module EnumerableStats
         lower_index = q3_pos.floor
         upper_index = q3_pos.ceil
         weight = q3_pos - q3_pos.floor
-        q3 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
+        q3 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
       end

       iqr = q3 - q1
@@ -224,7 +312,7 @@ module EnumerableStats
       upper_bound = q3 + (multiplier * iqr)

       # Filter out outliers
-      select { |value| value
+      select { |value| value.between?(lower_bound, upper_bound) }
     end

     # Returns statistics about outlier removal for debugging/logging
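After computing the Tukey fences, the rewritten filter keeps only values inside `[q1 - multiplier * iqr, q3 + multiplier * iqr]`; `between?` is inclusive on both ends. Usage per the module docs:

```ruby
data = [1, 2, 3, 4, 100]
# Q1 = 2, Q3 = 4, IQR = 2; with the 1.5 multiplier the fences are [-1, 7],
# so 100 is dropped.
data.remove_outliers #=> [1, 2, 3, 4]
data.outlier_stats   #=> includes outlier_percentage: 20.0
```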
@@ -247,5 +335,152 @@ module EnumerableStats
         outlier_percentage: ((original_count - filtered.size).to_f / original_count * 100).round(2)
       }
     end
+
+    private
+
+    # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
+    # Uses Hill's approximation (1970) for accurate inverse t-distribution calculation
+    #
+    # @param df [Float] Degrees of freedom
+    # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
+    # @return [Float] Critical t-value for one-tailed test
+    def critical_t_value(df, alpha)
+      # For very large df (≥1000), t-distribution is essentially normal
+      return inverse_normal_cdf(alpha) if df >= 1000
+
+      # Use Hill's approximation for inverse t-distribution
+      # This is more accurate than lookup tables and handles any df/alpha combination
+      inverse_t_distribution(df, alpha)
+    end
+
+    # Calculates the inverse t-distribution using Cornish-Fisher expansion
+    # This provides accurate critical t-values for any degrees of freedom and alpha level
+    # Based on methods used in statistical software like R and MATLAB
+    #
+    # @param df [Float] Degrees of freedom
+    # @param alpha [Float] Significance level for one-tailed test
+    # @return [Float] Critical t-value
+    def inverse_t_distribution(df, alpha)
+      # Handle boundary cases
+      return Float::INFINITY if df <= 0 || alpha <= 0
+      return -Float::INFINITY if alpha >= 1
+      return inverse_normal_cdf(alpha) if df >= 200 # Normal approximation for large df
+
+      # Get the corresponding normal quantile
+      z = inverse_normal_cdf(alpha)
+
+      # Special cases with exact solutions
+      if df == 1
+        # Cauchy distribution: exact inverse
+        return Math.tan(Math::PI * (0.5 - alpha))
+      elsif df == 2
+        # Exact formula for df=2: t = z / sqrt(1 - z^2/(z^2 + 2))
+        # This is more numerically stable
+        z_sq = z**2
+        # Exact formula for df=2: t = z / sqrt(1 - z^2/(z^2 + 2))
+        return z / Math.sqrt(1.0 - (z_sq / (z_sq + 2.0)))
+
+      end
+
+      # Use Cornish-Fisher expansion for general case
+      # This is the method used in most statistical software
+
+      # Base normal quantile
+      t = z
+
+      # First-order correction
+      if df >= 4
+        c1 = z / 4.0
+        t += c1 / df
+      end
+
+      # Second-order correction
+      if df >= 6
+        c2 = ((5.0 * (z**3)) + (16.0 * z)) / 96.0
+        t += c2 / (df**2)
+      end
+
+      # Third-order correction for better accuracy
+      if df >= 8
+        c3 = ((3.0 * (z**5)) + (19.0 * (z**3)) + (17.0 * z)) / 384.0
+        t += c3 / (df**3)
+      end
+
+      # Fourth-order correction for very high accuracy
+      if df >= 10
+        c4 = ((79.0 * (z**7)) + (776.0 * (z**5)) +
+              (1482.0 * (z**3)) + (776.0 * z)) / CORNISH_FISHER_FOURTH_ORDER_DENOMINATOR
+
+        t += c4 / (df**4)
+      end
+
+      # For small degrees of freedom, apply additional small-sample correction
+      if df < 8
+        # Edgeworth expansion adjustment for small df
+        delta = 1.0 / (EDGEWORTH_SMALL_SAMPLE_COEFF * df)
+        small_sample_correction = z * delta * ((z**2) + 1.0)
+        t += small_sample_correction
+      end
+
+      t
+    end
+
+    # Calculates the inverse normal CDF (quantile function) using Beasley-Springer-Moro algorithm
+    # This is more accurate than the previous hard-coded approach
+    #
+    # @param alpha [Float] Significance level (0 < alpha < 1)
+    # @return [Float] Z-score corresponding to the upper-tail probability alpha
+    def inverse_normal_cdf(alpha)
+      # Handle edge cases
+      return Float::INFINITY if alpha <= 0
+      return -Float::INFINITY if alpha >= 1
+
+      # For common values, use high-precision constants to maintain backward compatibility
+      # Use epsilon-based comparisons to avoid floating point precision issues
+      COMMON_ALPHA_VALUES.each do |target_alpha, z_score|
+        return z_score if (alpha - target_alpha).abs < EPSILON
+      end
+
+      # Use Beasley-Springer-Moro algorithm for other values
+      # This is accurate to about 7 decimal places
+
+      # Transform to work with cumulative probability from left tail
+      p = 1.0 - alpha
+
+      # Handle symmetric case
+      if p > 0.5
+        sign = 1
+        p = 1.0 - p
+      else
+        sign = -1
+      end
+
+      # Constants for the approximation
+      if p >= BSM_THRESHOLD
+        # Rational approximation for central region
+        t = Math.sqrt(-2.0 * Math.log(p))
+
+        # Numerator coefficients
+        c0 = 2.515517
+        c1 = 0.802853
+        c2 = 0.010328
+
+        # Denominator coefficients
+        d0 = 1.000000
+        d1 = 1.432788
+        d2 = 0.189269
+        d3 = 0.001308
+
+        numerator = c0 + (c1 * t) + (c2 * (t**2))
+        denominator = d0 + (d1 * t) + (d2 * (t**2)) + (d3 * (t**3))
+
+        x = t - (numerator / denominator)
+      else
+        # For very small p, use asymptotic expansion
+        x = Math.sqrt(-2.0 * Math.log(p))
+      end
+
+      sign * x
+    end
   end
-end
+end
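The coefficients in `inverse_normal_cdf` are the classic Abramowitz & Stegun 26.2.23 rational approximation, whose absolute error is bounded by about 4.5e-4. A standalone sketch of the same computation for the upper-tail quantile (the method name `upper_tail_z` is hypothetical; this is the math the private helper performs for alphas not in `COMMON_ALPHA_VALUES`, not the gem's public API):

```ruby
# Upper-tail normal quantile via the A&S 26.2.23 rational approximation.
# For alpha < 0.5 this is equivalent to the gem's p = 1 - alpha transform
# followed by its symmetric-case reduction.
def upper_tail_z(alpha)
  t = Math.sqrt(-2.0 * Math.log(alpha))
  numerator   = 2.515517 + (0.802853 * t) + (0.010328 * (t**2))
  denominator = 1.0 + (1.432788 * t) + (0.189269 * (t**2)) + (0.001308 * (t**3))
  t - (numerator / denominator)
end

upper_tail_z(0.05) #=> ~1.6452 (exact: 1.6449)
upper_tail_z(0.01) #=> ~2.3268 (exact: 2.3263)
# The gem short-circuits these common alphas through COMMON_ALPHA_VALUES,
# so the approximation only handles uncommon significance levels.
```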
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: enumerable-stats
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 1.2.1
 platform: ruby
 authors:
 - Jon Daniel
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-08-
+date: 2025-08-02 00:00:00.000000000 Z
 dependencies: []
 description: |
   A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.