enumerable-stats 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/enumerable-stats.rb +1 -1
- data/lib/enumerable_stats/enumerable_ext.rb +156 -13
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c04e21076f0a3ebbf61538159c1348107ca5ed371f4b1f135212dd8d932e184
|
4
|
+
data.tar.gz: 32ef265cbaac6a2801e01fb67b73a12313a65d900bc9e1ca0bbc0bfa0bc98f0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ebf8b1692788056fe5a6492cd4b986d1b88372e6877cffab441550980e1bf1dd775ebb7254c1bd6c2c91247c0890e6dadec67ab906ce75a3ba486a10a42815d
|
7
|
+
data.tar.gz: b2293b0fe996aaf31a4c245a54f1a6eb1b9f03b4e1cc5b2cc7ac2c57cfcb423f9299ac5a4518d5d1fa6846ebaff977abaafa0eb08f175faef5c456d6b7d222f5
|
data/lib/enumerable-stats.rb
CHANGED
@@ -1,6 +1,36 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module EnumerableStats
|
4
|
+
# Extension module that adds statistical methods to all Enumerable objects.
|
5
|
+
#
|
6
|
+
# This module provides essential statistical functions including measures of central tendency
|
7
|
+
# (mean, median), measures of dispersion (variance, standard deviation), percentile calculations,
|
8
|
+
# outlier detection using the IQR method, and statistical comparison methods.
|
9
|
+
#
|
10
|
+
# When included, these methods become available on all Ruby collections that include
|
11
|
+
# Enumerable (Arrays, Ranges, Sets, etc.), enabling seamless statistical analysis
|
12
|
+
# without external dependencies.
|
13
|
+
#
|
14
|
+
# @example Basic statistical calculations
|
15
|
+
# [1, 2, 3, 4, 5].mean #=> 3.0
|
16
|
+
# [1, 2, 3, 4, 5].median #=> 3
|
17
|
+
# [1, 2, 3, 4, 5].percentile(75) #=> 4.0
|
18
|
+
#
|
19
|
+
# @example Outlier detection
|
20
|
+
# data = [1, 2, 3, 4, 100]
|
21
|
+
# data.remove_outliers #=> [1, 2, 3, 4]
|
22
|
+
# data.outlier_stats #=> { outliers_removed: 1, outlier_percentage: 20.0, ... }
|
23
|
+
#
|
24
|
+
# @example Statistical testing
|
25
|
+
# control = [10, 12, 14, 16, 18]
|
26
|
+
# treatment = [15, 17, 19, 21, 23]
|
27
|
+
# control.t_value(treatment) #=> negative t-statistic
|
28
|
+
# control.degrees_of_freedom(treatment) #=> degrees of freedom for Welch's t-test
|
29
|
+
# treatment.greater_than?(control) #=> true (treatment mean significantly > control mean)
|
30
|
+
# control.less_than?(treatment) #=> true (control mean significantly < treatment mean)
|
31
|
+
#
|
32
|
+
# @see Enumerable
|
33
|
+
# @since 0.1.0
|
4
34
|
module EnumerableExt
|
5
35
|
# Calculates the percentage difference between this collection's mean and another value or collection's mean
|
6
36
|
# Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
|
@@ -13,7 +43,7 @@ module EnumerableStats
|
|
13
43
|
b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
|
14
44
|
|
15
45
|
return 0.0 if a == b
|
16
|
-
return Float::INFINITY if a + b
|
46
|
+
return Float::INFINITY if (a + b).zero?
|
17
47
|
|
18
48
|
((a - b).abs / ((a + b) / 2.0).abs) * 100
|
19
49
|
end
|
@@ -29,7 +59,7 @@ module EnumerableStats
|
|
29
59
|
b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
|
30
60
|
|
31
61
|
return 0.0 if a == b
|
32
|
-
return Float::INFINITY if a + b
|
62
|
+
return Float::INFINITY if (a + b).zero?
|
33
63
|
|
34
64
|
((a - b) / ((a + b) / 2.0).abs) * 100
|
35
65
|
end
|
@@ -70,12 +100,52 @@ module EnumerableStats
|
|
70
100
|
|
71
101
|
n = (n1 + n2)**2
|
72
102
|
|
73
|
-
d1 = variance**2 / (count**2 * (count - 1))
|
74
|
-
d2 = other.variance**2 / (other.count**2 * (other.count - 1))
|
103
|
+
d1 = (variance**2) / ((count**2) * (count - 1))
|
104
|
+
d2 = (other.variance**2) / ((other.count**2) * (other.count - 1))
|
75
105
|
|
76
106
|
n / (d1 + d2)
|
77
107
|
end
|
78
108
|
|
109
|
+
# Tests if this collection's mean is significantly greater than another collection's mean
|
110
|
+
# using a one-tailed t-test with Welch's degrees of freedom. Returns true if the test indicates statistical
|
111
|
+
# significance at the specified alpha level.
|
112
|
+
#
|
113
|
+
# @param other [Enumerable] Another collection to compare against
|
114
|
+
# @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
|
115
|
+
# @return [Boolean] True if this collection's mean is significantly greater
|
116
|
+
# @example
|
117
|
+
# control = [10, 12, 11, 13, 12] # mean ≈ 11.6
|
118
|
+
# treatment = [15, 17, 16, 18, 14] # mean = 16.0
|
119
|
+
# treatment.greater_than?(control) # => true (treatment significantly > control)
|
120
|
+
# control.greater_than?(treatment) # => false
|
121
|
+
def greater_than?(other, alpha: 0.05)
|
122
|
+
t_stat = t_value(other)
|
123
|
+
df = degrees_of_freedom(other)
|
124
|
+
critical_value = critical_t_value(df, alpha)
|
125
|
+
|
126
|
+
t_stat > critical_value
|
127
|
+
end
|
128
|
+
|
129
|
+
# Tests if this collection's mean is significantly less than another collection's mean
|
130
|
+
# using a one-tailed t-test with Welch's degrees of freedom. Returns true if the test indicates statistical
|
131
|
+
# significance at the specified alpha level.
|
132
|
+
#
|
133
|
+
# @param other [Enumerable] Another collection to compare against
|
134
|
+
# @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
|
135
|
+
# @return [Boolean] True if this collection's mean is significantly less
|
136
|
+
# @example
|
137
|
+
# control = [10, 12, 11, 13, 12] # mean ≈ 11.6
|
138
|
+
# treatment = [15, 17, 16, 18, 14] # mean = 16.0
|
139
|
+
# control.less_than?(treatment) # => true (control significantly < treatment)
|
140
|
+
# treatment.less_than?(control) # => false
|
141
|
+
def less_than?(other, alpha: 0.05)
|
142
|
+
t_stat = t_value(other)
|
143
|
+
df = degrees_of_freedom(other)
|
144
|
+
critical_value = critical_t_value(df, alpha)
|
145
|
+
|
146
|
+
t_stat < -critical_value
|
147
|
+
end
|
148
|
+
|
79
149
|
# Calculates the arithmetic mean (average) of the collection
|
80
150
|
#
|
81
151
|
# @return [Float] The arithmetic mean of all numeric values
|
@@ -96,7 +166,7 @@ module EnumerableStats
|
|
96
166
|
# [5, 1, 3, 2, 4].median # => 3 (automatically sorts)
|
97
167
|
# [].median # => nil
|
98
168
|
def median
|
99
|
-
return nil if size
|
169
|
+
return nil if size.zero?
|
100
170
|
|
101
171
|
sorted = sort
|
102
172
|
midpoint = size / 2
|
@@ -123,7 +193,7 @@ module EnumerableStats
|
|
123
193
|
# [1, 2, 3, 4, 5].percentile(100) # => 5 (maximum value)
|
124
194
|
# [].percentile(50) # => nil (empty collection)
|
125
195
|
def percentile(percentile)
|
126
|
-
return nil if size
|
196
|
+
return nil if size.zero?
|
127
197
|
|
128
198
|
unless percentile.is_a?(Numeric) && percentile >= 0 && percentile <= 100
|
129
199
|
raise ArgumentError, "Percentile must be a number between 0 and 100, got #{percentile}"
|
@@ -132,7 +202,7 @@ module EnumerableStats
|
|
132
202
|
sorted = sort
|
133
203
|
|
134
204
|
# Handle edge cases
|
135
|
-
return sorted.first if percentile
|
205
|
+
return sorted.first if percentile.zero?
|
136
206
|
return sorted.last if percentile == 100
|
137
207
|
|
138
208
|
# Calculate the position using the "linear" method (R-7/Excel method)
|
@@ -151,7 +221,7 @@ module EnumerableStats
|
|
151
221
|
lower_value = sorted[lower_index]
|
152
222
|
upper_value = sorted[upper_index]
|
153
223
|
|
154
|
-
lower_value + weight * (upper_value - lower_value)
|
224
|
+
lower_value + (weight * (upper_value - lower_value))
|
155
225
|
end
|
156
226
|
end
|
157
227
|
|
@@ -164,7 +234,7 @@ module EnumerableStats
|
|
164
234
|
# [5, 5, 5, 5].variance # => 0.0 (no variation)
|
165
235
|
def variance
|
166
236
|
mean = self.mean
|
167
|
-
sum_of_squares =
|
237
|
+
sum_of_squares = sum { |r| (r - mean)**2 }
|
168
238
|
sum_of_squares / (count - 1).to_f
|
169
239
|
end
|
170
240
|
|
@@ -204,7 +274,7 @@ module EnumerableStats
|
|
204
274
|
lower_index = q1_pos.floor
|
205
275
|
upper_index = q1_pos.ceil
|
206
276
|
weight = q1_pos - q1_pos.floor
|
207
|
-
q1 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
|
277
|
+
q1 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
|
208
278
|
end
|
209
279
|
|
210
280
|
# Calculate Q3
|
@@ -214,7 +284,7 @@ module EnumerableStats
|
|
214
284
|
lower_index = q3_pos.floor
|
215
285
|
upper_index = q3_pos.ceil
|
216
286
|
weight = q3_pos - q3_pos.floor
|
217
|
-
q3 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
|
287
|
+
q3 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
|
218
288
|
end
|
219
289
|
|
220
290
|
iqr = q3 - q1
|
@@ -224,7 +294,7 @@ module EnumerableStats
|
|
224
294
|
upper_bound = q3 + (multiplier * iqr)
|
225
295
|
|
226
296
|
# Filter out outliers
|
227
|
-
select { |value| value
|
297
|
+
select { |value| value.between?(lower_bound, upper_bound) }
|
228
298
|
end
|
229
299
|
|
230
300
|
# Returns statistics about outlier removal for debugging/logging
|
@@ -247,5 +317,78 @@ module EnumerableStats
|
|
247
317
|
outlier_percentage: ((original_count - filtered.size).to_f / original_count * 100).round(2)
|
248
318
|
}
|
249
319
|
end
|
320
|
+
|
321
|
+
private
|
322
|
+
|
323
|
+
# Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
|
324
|
+
# Uses a lookup table for common df values and approximations for others
|
325
|
+
#
|
326
|
+
# @param df [Float] Degrees of freedom
|
327
|
+
# @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
|
328
|
+
# @return [Float] Critical t-value for one-tailed test
|
329
|
+
def critical_t_value(df, alpha)
|
330
|
+
# For large df (≥30), t-distribution approximates normal distribution
|
331
|
+
return normal_critical_value(alpha) if df >= 30
|
332
|
+
|
333
|
+
# Lookup table for common t-values (one-tailed, α = 0.05)
|
334
|
+
# These are standard critical values from t-tables
|
335
|
+
t_table_05 = {
|
336
|
+
1 => 6.314, 2 => 2.920, 3 => 2.353, 4 => 2.132, 5 => 2.015,
|
337
|
+
6 => 1.943, 7 => 1.895, 8 => 1.860, 9 => 1.833, 10 => 1.812,
|
338
|
+
11 => 1.796, 12 => 1.782, 13 => 1.771, 14 => 1.761, 15 => 1.753,
|
339
|
+
16 => 1.746, 17 => 1.740, 18 => 1.734, 19 => 1.729, 20 => 1.725,
|
340
|
+
21 => 1.721, 22 => 1.717, 23 => 1.714, 24 => 1.711, 25 => 1.708,
|
341
|
+
26 => 1.706, 27 => 1.703, 28 => 1.701, 29 => 1.699
|
342
|
+
}
|
343
|
+
|
344
|
+
# Lookup table for common t-values (one-tailed, α = 0.01)
|
345
|
+
t_table_01 = {
|
346
|
+
1 => 31.821, 2 => 6.965, 3 => 4.541, 4 => 3.747, 5 => 3.365,
|
347
|
+
6 => 3.143, 7 => 2.998, 8 => 2.896, 9 => 2.821, 10 => 2.764,
|
348
|
+
11 => 2.718, 12 => 2.681, 13 => 2.650, 14 => 2.624, 15 => 2.602,
|
349
|
+
16 => 2.583, 17 => 2.567, 18 => 2.552, 19 => 2.539, 20 => 2.528,
|
350
|
+
21 => 2.518, 22 => 2.508, 23 => 2.500, 24 => 2.492, 25 => 2.485,
|
351
|
+
26 => 2.479, 27 => 2.473, 28 => 2.467, 29 => 2.462
|
352
|
+
}
|
353
|
+
|
354
|
+
df_int = df.round
|
355
|
+
|
356
|
+
if alpha <= 0.01
|
357
|
+
t_table_01[df_int] || t_table_01[29] # Use df=29 as fallback for larger values
|
358
|
+
elsif alpha <= 0.05
|
359
|
+
t_table_05[df_int] || t_table_05[29] # Use df=29 as fallback for larger values
|
360
|
+
else
|
361
|
+
# For alpha > 0.05, interpolate or use approximation
|
362
|
+
# This is a rough approximation for other alpha levels
|
363
|
+
base_t = t_table_05[df_int] || t_table_05[29]
|
364
|
+
base_t * ((0.05 / alpha)**0.5)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
# Returns the critical value for standard normal distribution (z-score)
|
369
|
+
# Used when degrees of freedom is large (≥30)
|
370
|
+
#
|
371
|
+
# @param alpha [Float] Significance level
|
372
|
+
# @return [Float] Critical z-value for one-tailed test
|
373
|
+
def normal_critical_value(alpha)
|
374
|
+
# Common z-values for one-tailed tests
|
375
|
+
# Use approximate comparisons to avoid float equality issues
|
376
|
+
if (alpha - 0.10).abs < 1e-10
|
377
|
+
1.282
|
378
|
+
elsif (alpha - 0.05).abs < 1e-10
|
379
|
+
1.645
|
380
|
+
elsif (alpha - 0.025).abs < 1e-10
|
381
|
+
1.960
|
382
|
+
elsif (alpha - 0.01).abs < 1e-10
|
383
|
+
2.326
|
384
|
+
elsif (alpha - 0.005).abs < 1e-10
|
385
|
+
2.576
|
386
|
+
else
|
387
|
+
# Approximation using inverse normal for other alpha values
|
388
|
+
# This is a rough approximation of the inverse normal CDF
|
389
|
+
# It is a conservative upper bound: for α = 0.05 it gives approximately 2.448 (the exact z-value is 1.645)
|
390
|
+
Math.sqrt(-2 * Math.log(alpha))
|
391
|
+
end
|
392
|
+
end
|
250
393
|
end
|
251
|
-
end
|
394
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: enumerable-stats
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jon Daniel
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-08-
|
11
|
+
date: 2025-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |
|
14
14
|
A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.
|