enumerable-stats 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/enumerable-stats.rb +1 -1
- data/lib/enumerable_stats/enumerable_ext.rb +200 -10
- metadata +6 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8c04e21076f0a3ebbf61538159c1348107ca5ed371f4b1f135212dd8d932e184
|
4
|
+
data.tar.gz: 32ef265cbaac6a2801e01fb67b73a12313a65d900bc9e1ca0bbc0bfa0bc98f0d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ebf8b1692788056fe5a6492cd4b986d1b88372e6877cffab441550980e1bf1dd775ebb7254c1bd6c2c91247c0890e6dadec67ab906ce75a3ba486a10a42815d
|
7
|
+
data.tar.gz: b2293b0fe996aaf31a4c245a54f1a6eb1b9f03b4e1cc5b2cc7ac2c57cfcb423f9299ac5a4518d5d1fa6846ebaff977abaafa0eb08f175faef5c456d6b7d222f5
|
data/lib/enumerable-stats.rb
CHANGED
@@ -1,6 +1,36 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module EnumerableStats
|
4
|
+
# Extension module that adds statistical methods to all Enumerable objects.
|
5
|
+
#
|
6
|
+
# This module provides essential statistical functions including measures of central tendency
|
7
|
+
# (mean, median), measures of dispersion (variance, standard deviation), percentile calculations,
|
8
|
+
# outlier detection using the IQR method, and statistical comparison methods.
|
9
|
+
#
|
10
|
+
# When included, these methods become available on all Ruby collections that include
|
11
|
+
# Enumerable (Arrays, Ranges, Sets, etc.), enabling seamless statistical analysis
|
12
|
+
# without external dependencies.
|
13
|
+
#
|
14
|
+
# @example Basic statistical calculations
|
15
|
+
# [1, 2, 3, 4, 5].mean #=> 3.0
|
16
|
+
# [1, 2, 3, 4, 5].median #=> 3
|
17
|
+
# [1, 2, 3, 4, 5].percentile(75) #=> 4
|
18
|
+
#
|
19
|
+
# @example Outlier detection
|
20
|
+
# data = [1, 2, 3, 4, 100]
|
21
|
+
# data.remove_outliers #=> [1, 2, 3, 4]
|
22
|
+
# data.outlier_stats #=> { outliers_removed: 1, outlier_percentage: 20.0, ... }
|
23
|
+
#
|
24
|
+
# @example Statistical testing
|
25
|
+
# control = [10, 12, 14, 16, 18]
|
26
|
+
# treatment = [15, 17, 19, 21, 23]
|
27
|
+
# control.t_value(treatment) #=> negative t-statistic
|
28
|
+
# control.degrees_of_freedom(treatment) #=> degrees of freedom for Welch's t-test
|
29
|
+
# treatment.greater_than?(control) #=> true (treatment mean significantly > control mean)
|
30
|
+
# control.less_than?(treatment) #=> true (control mean significantly < treatment mean)
|
31
|
+
#
|
32
|
+
# @see Enumerable
|
33
|
+
# @since 0.1.0
|
4
34
|
module EnumerableExt
|
5
35
|
# Calculates the percentage difference between this collection's mean and another value or collection's mean
|
6
36
|
# Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
|
@@ -13,7 +43,7 @@ module EnumerableStats
|
|
13
43
|
b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
|
14
44
|
|
15
45
|
return 0.0 if a == b
|
16
|
-
return Float::INFINITY if a + b
|
46
|
+
return Float::INFINITY if (a + b).zero?
|
17
47
|
|
18
48
|
((a - b).abs / ((a + b) / 2.0).abs) * 100
|
19
49
|
end
|
@@ -29,7 +59,7 @@ module EnumerableStats
|
|
29
59
|
b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
|
30
60
|
|
31
61
|
return 0.0 if a == b
|
32
|
-
return Float::INFINITY if a + b
|
62
|
+
return Float::INFINITY if (a + b).zero?
|
33
63
|
|
34
64
|
((a - b) / ((a + b) / 2.0).abs) * 100
|
35
65
|
end
|
@@ -70,12 +100,52 @@ module EnumerableStats
|
|
70
100
|
|
71
101
|
n = (n1 + n2)**2
|
72
102
|
|
73
|
-
d1 = variance**2 / (count**2 * (count - 1))
|
74
|
-
d2 = other.variance**2 / (other.count**2 * (other.count - 1))
|
103
|
+
d1 = (variance**2) / ((count**2) * (count - 1))
|
104
|
+
d2 = (other.variance**2) / ((other.count**2) * (other.count - 1))
|
75
105
|
|
76
106
|
n / (d1 + d2)
|
77
107
|
end
|
78
108
|
|
109
|
+
# Tests if this collection's mean is significantly greater than another collection's mean
|
110
|
+
# using a one-tailed Welch's t-test. Returns true if the test indicates statistical
|
111
|
+
# significance at the specified alpha level.
|
112
|
+
#
|
113
|
+
# @param other [Enumerable] Another collection to compare against
|
114
|
+
# @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
|
115
|
+
# @return [Boolean] True if this collection's mean is significantly greater
|
116
|
+
# @example
|
117
|
+
# control = [10, 12, 11, 13, 12] # mean ≈ 11.6
|
118
|
+
# treatment = [15, 17, 16, 18, 14] # mean = 16.0
|
119
|
+
# treatment.greater_than?(control) # => true (treatment significantly > control)
|
120
|
+
# control.greater_than?(treatment) # => false
|
121
|
+
def greater_than?(other, alpha: 0.05)
|
122
|
+
t_stat = t_value(other)
|
123
|
+
df = degrees_of_freedom(other)
|
124
|
+
critical_value = critical_t_value(df, alpha)
|
125
|
+
|
126
|
+
t_stat > critical_value
|
127
|
+
end
|
128
|
+
|
129
|
+
# Tests if this collection's mean is significantly less than another collection's mean
|
130
|
+
# using a one-tailed Welch's t-test. Returns true if the test indicates statistical
|
131
|
+
# significance at the specified alpha level.
|
132
|
+
#
|
133
|
+
# @param other [Enumerable] Another collection to compare against
|
134
|
+
# @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
|
135
|
+
# @return [Boolean] True if this collection's mean is significantly less
|
136
|
+
# @example
|
137
|
+
# control = [10, 12, 11, 13, 12] # mean ≈ 11.6
|
138
|
+
# treatment = [15, 17, 16, 18, 14] # mean = 16.0
|
139
|
+
# control.less_than?(treatment) # => true (control significantly < treatment)
|
140
|
+
# treatment.less_than?(control) # => false
|
141
|
+
def less_than?(other, alpha: 0.05)
|
142
|
+
t_stat = t_value(other)
|
143
|
+
df = degrees_of_freedom(other)
|
144
|
+
critical_value = critical_t_value(df, alpha)
|
145
|
+
|
146
|
+
t_stat < -critical_value
|
147
|
+
end
|
148
|
+
|
79
149
|
# Calculates the arithmetic mean (average) of the collection
|
80
150
|
#
|
81
151
|
# @return [Float] The arithmetic mean of all numeric values
|
@@ -96,7 +166,7 @@ module EnumerableStats
|
|
96
166
|
# [5, 1, 3, 2, 4].median # => 3 (automatically sorts)
|
97
167
|
# [].median # => nil
|
98
168
|
def median
|
99
|
-
return nil if size
|
169
|
+
return nil if size.zero?
|
100
170
|
|
101
171
|
sorted = sort
|
102
172
|
midpoint = size / 2
|
@@ -108,6 +178,53 @@ module EnumerableStats
|
|
108
178
|
end
|
109
179
|
end
|
110
180
|
|
181
|
+
# Calculates the specified percentile of the collection
|
182
|
+
# Uses linear interpolation between data points when the exact percentile falls between values
|
183
|
+
# This is equivalent to the "linear" method used by many statistical software packages
|
184
|
+
#
|
185
|
+
# @param percentile [Numeric] The percentile to calculate (0-100)
|
186
|
+
# @return [Numeric, nil] The value at the specified percentile, or nil if the collection is empty
|
187
|
+
# @raise [ArgumentError] If percentile is not between 0 and 100
|
188
|
+
# @example
|
189
|
+
# [1, 2, 3, 4, 5].percentile(50) # => 3 (same as median)
|
190
|
+
# [1, 2, 3, 4, 5].percentile(25) # => 2 (25th percentile; exact element, no interpolation)
|
191
|
+
# [1, 2, 3, 4, 5].percentile(75) # => 4 (75th percentile; exact element, no interpolation)
|
192
|
+
# [1, 2, 3, 4, 5].percentile(0) # => 1 (minimum value)
|
193
|
+
# [1, 2, 3, 4, 5].percentile(100) # => 5 (maximum value)
|
194
|
+
# [].percentile(50) # => nil (empty collection)
|
195
|
+
def percentile(percentile)
|
196
|
+
return nil if size.zero?
|
197
|
+
|
198
|
+
unless percentile.is_a?(Numeric) && percentile >= 0 && percentile <= 100
|
199
|
+
raise ArgumentError, "Percentile must be a number between 0 and 100, got #{percentile}"
|
200
|
+
end
|
201
|
+
|
202
|
+
sorted = sort
|
203
|
+
|
204
|
+
# Handle edge cases
|
205
|
+
return sorted.first if percentile.zero?
|
206
|
+
return sorted.last if percentile == 100
|
207
|
+
|
208
|
+
# Calculate the position using the "linear" method (R-7/Excel method)
|
209
|
+
# This is the most commonly used method in statistical software
|
210
|
+
position = (size - 1) * (percentile / 100.0)
|
211
|
+
|
212
|
+
# If position is an integer, return that exact element
|
213
|
+
if position == position.floor
|
214
|
+
sorted[position.to_i]
|
215
|
+
else
|
216
|
+
# Linear interpolation between the two surrounding values
|
217
|
+
lower_index = position.floor
|
218
|
+
upper_index = position.ceil
|
219
|
+
weight = position - position.floor
|
220
|
+
|
221
|
+
lower_value = sorted[lower_index]
|
222
|
+
upper_value = sorted[upper_index]
|
223
|
+
|
224
|
+
lower_value + (weight * (upper_value - lower_value))
|
225
|
+
end
|
226
|
+
end
|
227
|
+
|
111
228
|
# Calculates the sample variance of the collection
|
112
229
|
# Uses the unbiased formula with n-1 degrees of freedom (Bessel's correction)
|
113
230
|
#
|
@@ -117,7 +234,7 @@ module EnumerableStats
|
|
117
234
|
# [5, 5, 5, 5].variance # => 0.0 (no variation)
|
118
235
|
def variance
|
119
236
|
mean = self.mean
|
120
|
-
sum_of_squares =
|
237
|
+
sum_of_squares = sum { |r| (r - mean)**2 }
|
121
238
|
sum_of_squares / (count - 1).to_f
|
122
239
|
end
|
123
240
|
|
@@ -157,7 +274,7 @@ module EnumerableStats
|
|
157
274
|
lower_index = q1_pos.floor
|
158
275
|
upper_index = q1_pos.ceil
|
159
276
|
weight = q1_pos - q1_pos.floor
|
160
|
-
q1 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
|
277
|
+
q1 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
|
161
278
|
end
|
162
279
|
|
163
280
|
# Calculate Q3
|
@@ -167,7 +284,7 @@ module EnumerableStats
|
|
167
284
|
lower_index = q3_pos.floor
|
168
285
|
upper_index = q3_pos.ceil
|
169
286
|
weight = q3_pos - q3_pos.floor
|
170
|
-
q3 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
|
287
|
+
q3 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
|
171
288
|
end
|
172
289
|
|
173
290
|
iqr = q3 - q1
|
@@ -177,7 +294,7 @@ module EnumerableStats
|
|
177
294
|
upper_bound = q3 + (multiplier * iqr)
|
178
295
|
|
179
296
|
# Filter out outliers
|
180
|
-
select { |value| value
|
297
|
+
select { |value| value.between?(lower_bound, upper_bound) }
|
181
298
|
end
|
182
299
|
|
183
300
|
# Returns statistics about outlier removal for debugging/logging
|
@@ -200,5 +317,78 @@ module EnumerableStats
|
|
200
317
|
outlier_percentage: ((original_count - filtered.size).to_f / original_count * 100).round(2)
|
201
318
|
}
|
202
319
|
end
|
320
|
+
|
321
|
+
private
|
322
|
+
|
323
|
+
# Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
|
324
|
+
# Uses a lookup table for common df values and approximations for others
|
325
|
+
#
|
326
|
+
# @param df [Float] Degrees of freedom
|
327
|
+
# @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
|
328
|
+
# @return [Float] Critical t-value for one-tailed test
|
329
|
+
def critical_t_value(df, alpha)
|
330
|
+
# For large df (≥30), t-distribution approximates normal distribution
|
331
|
+
return normal_critical_value(alpha) if df >= 30
|
332
|
+
|
333
|
+
# Lookup table for common t-values (one-tailed, α = 0.05)
|
334
|
+
# These are standard critical values from t-tables
|
335
|
+
t_table_05 = {
|
336
|
+
1 => 6.314, 2 => 2.920, 3 => 2.353, 4 => 2.132, 5 => 2.015,
|
337
|
+
6 => 1.943, 7 => 1.895, 8 => 1.860, 9 => 1.833, 10 => 1.812,
|
338
|
+
11 => 1.796, 12 => 1.782, 13 => 1.771, 14 => 1.761, 15 => 1.753,
|
339
|
+
16 => 1.746, 17 => 1.740, 18 => 1.734, 19 => 1.729, 20 => 1.725,
|
340
|
+
21 => 1.721, 22 => 1.717, 23 => 1.714, 24 => 1.711, 25 => 1.708,
|
341
|
+
26 => 1.706, 27 => 1.703, 28 => 1.701, 29 => 1.699
|
342
|
+
}
|
343
|
+
|
344
|
+
# Lookup table for common t-values (one-tailed, α = 0.01)
|
345
|
+
t_table_01 = {
|
346
|
+
1 => 31.821, 2 => 6.965, 3 => 4.541, 4 => 3.747, 5 => 3.365,
|
347
|
+
6 => 3.143, 7 => 2.998, 8 => 2.896, 9 => 2.821, 10 => 2.764,
|
348
|
+
11 => 2.718, 12 => 2.681, 13 => 2.650, 14 => 2.624, 15 => 2.602,
|
349
|
+
16 => 2.583, 17 => 2.567, 18 => 2.552, 19 => 2.539, 20 => 2.528,
|
350
|
+
21 => 2.518, 22 => 2.508, 23 => 2.500, 24 => 2.492, 25 => 2.485,
|
351
|
+
26 => 2.479, 27 => 2.473, 28 => 2.467, 29 => 2.462
|
352
|
+
}
|
353
|
+
|
354
|
+
df_int = df.round
|
355
|
+
|
356
|
+
if alpha <= 0.01
|
357
|
+
t_table_01[df_int] || t_table_01[29] # Use df=29 as fallback for larger values
|
358
|
+
elsif alpha <= 0.05
|
359
|
+
t_table_05[df_int] || t_table_05[29] # Use df=29 as fallback for larger values
|
360
|
+
else
|
361
|
+
# For alpha > 0.05, interpolate or use approximation
|
362
|
+
# This is a rough approximation for other alpha levels
|
363
|
+
base_t = t_table_05[df_int] || t_table_05[29]
|
364
|
+
base_t * ((0.05 / alpha)**0.5)
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
# Returns the critical value for standard normal distribution (z-score)
|
369
|
+
# Used when degrees of freedom is large (≥30)
|
370
|
+
#
|
371
|
+
# @param alpha [Float] Significance level
|
372
|
+
# @return [Float] Critical z-value for one-tailed test
|
373
|
+
def normal_critical_value(alpha)
|
374
|
+
# Common z-values for one-tailed tests
|
375
|
+
# Use approximate comparisons to avoid float equality issues
|
376
|
+
if (alpha - 0.10).abs < 1e-10
|
377
|
+
1.282
|
378
|
+
elsif (alpha - 0.05).abs < 1e-10
|
379
|
+
1.645
|
380
|
+
elsif (alpha - 0.025).abs < 1e-10
|
381
|
+
1.960
|
382
|
+
elsif (alpha - 0.01).abs < 1e-10
|
383
|
+
2.326
|
384
|
+
elsif (alpha - 0.005).abs < 1e-10
|
385
|
+
2.576
|
386
|
+
else
|
387
|
+
# Approximation using inverse normal for other alpha values
|
388
|
+
# This is a rough approximation of the inverse normal CDF
|
389
|
+
# Note: this overestimates the true z-value (e.g., α = 0.05 would give ≈ 2.448 here, versus the exact 1.645)
|
390
|
+
Math.sqrt(-2 * Math.log(alpha))
|
391
|
+
end
|
392
|
+
end
|
203
393
|
end
|
204
|
-
end
|
394
|
+
end
|
metadata
CHANGED
@@ -1,13 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: enumerable-stats
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jon Daniel
|
8
|
+
autorequire:
|
8
9
|
bindir: bin
|
9
10
|
cert_chain: []
|
10
|
-
date: 2025-
|
11
|
+
date: 2025-08-02 00:00:00.000000000 Z
|
11
12
|
dependencies: []
|
12
13
|
description: |
|
13
14
|
A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.
|
@@ -28,6 +29,7 @@ metadata:
|
|
28
29
|
source_code_uri: https://github.com/binarycleric/enumerable-stats
|
29
30
|
github_repo: ssh://github.com/binarycleric/enumerable-stats
|
30
31
|
rubygems_mfa_required: 'true'
|
32
|
+
post_install_message:
|
31
33
|
rdoc_options: []
|
32
34
|
require_paths:
|
33
35
|
- lib
|
@@ -42,7 +44,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
42
44
|
- !ruby/object:Gem::Version
|
43
45
|
version: '0'
|
44
46
|
requirements: []
|
45
|
-
rubygems_version: 3.
|
47
|
+
rubygems_version: 3.5.22
|
48
|
+
signing_key:
|
46
49
|
specification_version: 4
|
47
50
|
summary: Statistical Methods for Enumerable Collections
|
48
51
|
test_files: []
|