enumerable-stats 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/enumerable-stats.rb +7 -0
- data/lib/enumerable_stats/enumerable_ext.rb +204 -0
- metadata +48 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bd06a4ad69a379469e16e45d5eca8debb2a786edc66c3208429adde1c6f80f20
|
4
|
+
data.tar.gz: 2a8823cdf6f5fe2793e9df99dbd71f4427a63146fdd165475204fe4493607355
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 308025ee228b384520cff759f5664beace555bd66e3894c389e1547359e2eb90e7fad1bfd72c6774f5b1b327c222a5dd9e8a278b9e16155a9a7e48a693bc60bc
|
7
|
+
data.tar.gz: 224af711d1b3b24d8218cf15adb116216dcb57c81f677b5af8c02c238e2ea304e2ac3537aa9c221750055b68a61d15cce578ee5e7a7687283cce4fb0f917f5dd
|
@@ -0,0 +1,204 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module EnumerableStats
  # Statistical helpers meant to be mixed into Enumerable collections.
  #
  # Every method relies only on methods that Enumerable itself provides
  # (+count+, +sum+, +sort+, +select+), so any class that defines +each+ and
  # includes Enumerable can use them; +size+ is deliberately not used because
  # Enumerable does not define it.
  module EnumerableExt
    # Calculates the percentage difference between this collection's mean and
    # another value or another collection's mean.
    # Uses the symmetric percentage difference formula: |a - b| / |(a + b) / 2| * 100
    # This is useful for comparing datasets or metrics where direction doesn't matter.
    #
    # @param other [Numeric, Enumerable] Value, or collection responding to +mean+, to compare against
    # @return [Float] Absolute percentage difference; 0.0 when both means are equal,
    #   Float::INFINITY when the means differ but sum to zero
    def percentage_difference(other)
      a = mean.to_f
      b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

      return 0.0 if a == b
      # The average of the two means is zero, so the ratio is unbounded.
      return Float::INFINITY if (a + b).zero?

      ((a - b).abs / ((a + b) / 2.0).abs) * 100
    end

    # Calculates the signed percentage difference between this collection's mean
    # and another value or another collection's mean.
    # Uses the signed percentage difference formula: (a - b) / |(a + b) / 2| * 100
    # Useful for performance comparisons where direction matters (e.g., improvements vs regressions).
    #
    # @param other [Numeric, Enumerable] Value, or collection responding to +mean+, to compare against
    # @return [Float] Signed percentage difference (positive = this collection is higher,
    #   negative = other is higher); 0.0 when both means are equal, and a signed
    #   infinity when the means differ but sum to zero
    def signed_percentage_difference(other)
      a = mean.to_f
      b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

      return 0.0 if a == b
      # The denominator is zero; keep the sign of (a - b) so the direction of the
      # difference is still reported correctly.
      return (a > b ? Float::INFINITY : -Float::INFINITY) if (a + b).zero?

      ((a - b) / ((a + b) / 2.0).abs) * 100
    end

    # Calculates the t-statistic for comparing the means of two samples.
    # Uses Welch's t-test formula which doesn't assume equal variances.
    # A larger absolute t-value indicates a greater difference between sample means.
    #
    # @param other [Enumerable] Another collection (responding to +mean+, +variance+ and +count+)
    # @return [Float] The t-statistic value (can be positive or negative)
    # @example
    #   control = [10, 12, 11, 13, 12]
    #   treatment = [15, 17, 16, 18, 14]
    #   t_stat = control.t_value(treatment)  # => ~-5.05 (negative means treatment > control)
    def t_value(other)
      signal = mean - other.mean
      # Use the variances directly instead of squaring the standard deviation,
      # which would take a square root only to undo it again.
      noise = Math.sqrt((variance / count) + (other.variance / other.count))

      signal / noise
    end

    # Calculates the degrees of freedom for comparing two samples using Welch's formula.
    # This is used in statistical hypothesis testing when sample variances are unequal.
    # The formula accounts for different sample sizes and variances between groups.
    #
    # @param other [Enumerable] Another collection (responding to +variance+ and +count+)
    # @return [Float] Degrees of freedom for statistical testing
    # @example
    #   sample_a = [10, 12, 14, 16, 18]
    #   sample_b = [5, 15, 25, 35, 45, 55]
    #   df = sample_a.degrees_of_freedom(sample_b)  # => ~5.34
    def degrees_of_freedom(other)
      n1 = variance / count
      n2 = other.variance / other.count

      n = (n1 + n2)**2

      d1 = variance**2 / (count**2 * (count - 1))
      d2 = other.variance**2 / (other.count**2 * (other.count - 1))

      n / (d1 + d2)
    end

    # Calculates the arithmetic mean (average) of the collection.
    #
    # @return [Float] The arithmetic mean of all numeric values
    #   (NaN for an empty collection, since that divides 0 by 0.0)
    # @example
    #   [1, 2, 3, 4, 5].mean  # => 3.0
    #   (1..10).mean          # => 5.5
    def mean
      sum / count.to_f
    end

    # Calculates the median (middle value) of the collection.
    # For collections with an even number of elements, returns the average of the two middle values.
    #
    # @return [Numeric, nil] The median value, or nil if the collection is empty
    # @example
    #   [1, 2, 3, 4, 5].median  # => 3
    #   [1, 2, 3, 4].median     # => 2.5
    #   [5, 1, 3, 2, 4].median  # => 3 (automatically sorts)
    #   [].median               # => nil
    def median
      sorted = sort
      return nil if sorted.empty?

      midpoint = sorted.size / 2

      if sorted.size.even?
        sorted[midpoint - 1, 2].sum / 2.0
      else
        sorted[midpoint]
      end
    end

    # Calculates the sample variance of the collection.
    # Uses the unbiased formula with n-1 degrees of freedom (Bessel's correction).
    #
    # @return [Float] The sample variance (NaN for a single-element collection,
    #   because the n-1 divisor is then zero)
    # @example
    #   [1, 2, 3, 4, 5].variance  # => 2.5
    #   [5, 5, 5, 5].variance     # => 0.0 (no variation)
    def variance
      average = mean
      sum_of_squares = sum { |value| (value - average)**2 }
      sum_of_squares / (count - 1).to_f
    end

    # Calculates the sample standard deviation of the collection.
    # Returns the square root of the sample variance.
    #
    # @return [Float] The sample standard deviation
    # @example
    #   [1, 2, 3, 4, 5].standard_deviation  # => 1.58
    #   [5, 5, 5, 5].standard_deviation     # => 0.0
    def standard_deviation
      Math.sqrt(variance)
    end

    # Removes extreme outliers using the IQR (Interquartile Range) method.
    # This is particularly effective for performance data which often has
    # extreme values due to network issues, CPU scheduling, GC pauses, etc.
    #
    # Values are kept when they lie within
    # [Q1 - multiplier * IQR, Q3 + multiplier * IQR], bounds included.
    #
    # @param multiplier [Float] IQR multiplier (1.5 is standard, 2.0 is more conservative)
    # @return [Enumerable] The receiver itself when it has fewer than four elements;
    #   otherwise the result of +select+ (an Array for Arrays and Ranges) with outliers removed
    def remove_outliers(multiplier: 1.5)
      return self if count < 4 # Need minimum data points for quartiles

      sorted = sort
      last_index = sorted.size - 1

      # Quartile at the given fraction of the sorted data, using position
      # (n - 1) * fraction and linear interpolation between neighbouring values.
      # Kept as a local lambda so no extra helper method is added to every
      # class that mixes this module in.
      quartile = lambda do |fraction|
        position = last_index * fraction
        lower = position.floor

        if position == lower
          sorted[lower]
        else
          sorted[lower] + ((position - lower) * (sorted[position.ceil] - sorted[lower]))
        end
      end

      q1 = quartile.call(0.25)
      q3 = quartile.call(0.75)
      iqr = q3 - q1

      lower_bound = q1 - (multiplier * iqr)
      upper_bound = q3 + (multiplier * iqr)

      select { |value| value >= lower_bound && value <= upper_bound }
    end

    # Returns statistics about outlier removal for debugging/logging.
    # Provides detailed information about how many outliers were removed and their percentage.
    #
    # @param multiplier [Float] IQR multiplier for outlier detection (1.5 is standard, 2.0 is more conservative)
    # @return [Hash] Statistics hash containing :original_count, :filtered_count,
    #   :outliers_removed, :outlier_percentage (rounded to two decimals; 0.0 for an empty collection)
    # @example
    #   data = [1, 2, 3, 4, 5, 100]
    #   stats = data.outlier_stats
    #   # => {original_count: 6, filtered_count: 5, outliers_removed: 1, outlier_percentage: 16.67}
    def outlier_stats(multiplier: 1.5)
      original_count = count
      filtered_count = remove_outliers(multiplier: multiplier).count
      outliers_removed = original_count - filtered_count

      # Guard the empty collection: 0.0 / 0 would otherwise report NaN.
      outlier_percentage =
        if original_count.zero?
          0.0
        else
          (outliers_removed.to_f / original_count * 100).round(2)
        end

      {
        original_count: original_count,
        filtered_count: filtered_count,
        outliers_removed: outliers_removed,
        outlier_percentage: outlier_percentage
      }
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: enumerable-stats
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jon Daniel
|
8
|
+
bindir: bin
|
9
|
+
cert_chain: []
|
10
|
+
date: 2025-07-31 00:00:00.000000000 Z
|
11
|
+
dependencies: []
|
12
|
+
description: |
|
13
|
+
A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.
|
14
|
+
Provides mean, median, variance, and standard deviation calculations, along with robust outlier detection using the IQR method.
|
15
|
+
Perfect for data analysis, performance monitoring, A/B testing, and cleaning datasets with extreme values.
|
16
|
+
Zero dependencies and works seamlessly with any Ruby collection that includes Enumerable.
|
17
|
+
email: binarycleric@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- lib/enumerable-stats.rb
|
23
|
+
- lib/enumerable_stats/enumerable_ext.rb
|
24
|
+
homepage: https://github.com/binarycleric/enumerable-stats
|
25
|
+
licenses:
|
26
|
+
- MIT
|
27
|
+
metadata:
|
28
|
+
source_code_uri: https://github.com/binarycleric/enumerable-stats
|
29
|
+
github_repo: ssh://github.com/binarycleric/enumerable-stats
|
30
|
+
rubygems_mfa_required: 'true'
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - ">="
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 3.1.0
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubygems_version: 3.6.2
|
46
|
+
specification_version: 4
|
47
|
+
summary: Statistical Methods for Enumerable Collections
|
48
|
+
test_files: []
|