enumerable-stats 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: bd06a4ad69a379469e16e45d5eca8debb2a786edc66c3208429adde1c6f80f20
4
+ data.tar.gz: 2a8823cdf6f5fe2793e9df99dbd71f4427a63146fdd165475204fe4493607355
5
+ SHA512:
6
+ metadata.gz: 308025ee228b384520cff759f5664beace555bd66e3894c389e1547359e2eb90e7fad1bfd72c6774f5b1b327c222a5dd9e8a278b9e16155a9a7e48a693bc60bc
7
+ data.tar.gz: 224af711d1b3b24d8218cf15adb116216dcb57c81f677b5af8c02c238e2ea304e2ac3537aa9c221750055b68a61d15cce578ee5e7a7687283cce4fb0f917f5dd
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'enumerable_stats/enumerable_ext'
4
+
5
# Mix the statistical helpers into Enumerable itself so that every collection
# that includes Enumerable picks up the methods defined in
# EnumerableStats::EnumerableExt.
Enumerable.include(EnumerableStats::EnumerableExt)
@@ -0,0 +1,204 @@
1
+ # frozen_string_literal: true
2
+
3
module EnumerableStats
  # Statistical helpers designed to be mixed into Enumerable.
  #
  # Every method is written against the Enumerable protocol only (+each+ plus
  # the methods Enumerable derives from it: +count+, +sum+, +sort+, +select+),
  # so it also works for collections that do not define +size+.
  module EnumerableExt
    # Calculates the percentage difference between this collection's mean and another value or collection's mean
    # Uses the symmetric percentage difference formula: |a - b| / |(a + b) / 2| * 100
    # This is useful for comparing datasets or metrics where direction doesn't matter
    #
    # @param other [Numeric, Enumerable] Value or collection to compare against
    # @return [Float] Absolute percentage difference (always positive); 0.0 when both
    #   means are equal, Float::INFINITY when the means are distinct but sum to zero
    def percentage_difference(other)
      a = mean.to_f
      b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

      return 0.0 if a == b
      # Distinct means that cancel out give a zero midpoint: the ratio is unbounded.
      return Float::INFINITY if (a + b).zero?

      ((a - b).abs / ((a + b) / 2.0).abs) * 100
    end

    # Calculates the signed percentage difference between this collection's mean and another value or collection's mean
    # Uses the formula: (a - b) / |(a + b) / 2| * 100
    # The midpoint is taken as an absolute value so the sign of the result always
    # follows the sign of (a - b), even when the means are negative.
    # Useful for performance comparisons where direction matters (e.g., improvements vs regressions)
    #
    # @param other [Numeric, Enumerable] Value or collection to compare against
    # @return [Float] Signed percentage difference (positive = this collection is higher,
    #   negative = other is higher); 0.0 when both means are equal, and a signed
    #   infinity when the means are distinct but sum to zero
    def signed_percentage_difference(other)
      a = mean.to_f
      b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f

      return 0.0 if a == b
      # Zero midpoint: the magnitude is unbounded, but the direction is still known.
      return (a > b ? Float::INFINITY : -Float::INFINITY) if (a + b).zero?

      ((a - b) / ((a + b) / 2.0).abs) * 100
    end

    # Calculates the t-statistic for comparing the means of two samples
    # Uses Welch's t-test formula which doesn't assume equal variances
    # A larger absolute t-value indicates a greater difference between sample means
    #
    # @param other [Enumerable] Another collection to compare against
    # @return [Float] The t-statistic value (can be positive or negative)
    # @example
    #   control = [10, 12, 11, 13, 12]
    #   treatment = [15, 17, 16, 18, 14]
    #   t_stat = control.t_value(treatment)  # => ~-5.05 (negative means treatment > control)
    def t_value(other)
      signal = mean - other.mean
      noise = Math.sqrt(
        ((standard_deviation**2) / count) +
        ((other.standard_deviation**2) / other.count)
      )

      signal / noise
    end

    # Calculates the degrees of freedom for comparing two samples using Welch's formula
    # (the Welch-Satterthwaite approximation)
    # This is used in statistical hypothesis testing when sample variances are unequal
    # The formula accounts for different sample sizes and variances between groups
    #
    # @param other [Enumerable] Another collection to compare against
    # @return [Float] Degrees of freedom for statistical testing
    # @example
    #   sample_a = [10, 12, 14, 16, 18]
    #   sample_b = [5, 15, 25, 35, 45, 55]
    #   df = sample_a.degrees_of_freedom(sample_b)  # => ~5.34
    def degrees_of_freedom(other)
      # Compute each statistic once; variance walks the whole collection.
      own_n = count
      other_n = other.count
      own_variance = variance
      other_variance = other.variance

      numerator = ((own_variance / own_n) + (other_variance / other_n))**2
      denominator = (own_variance**2 / (own_n**2 * (own_n - 1))) +
                    (other_variance**2 / (other_n**2 * (other_n - 1)))

      numerator / denominator
    end

    # Calculates the arithmetic mean (average) of the collection
    #
    # @return [Float] The arithmetic mean of all numeric values (NaN for an empty collection)
    # @example
    #   [1, 2, 3, 4, 5].mean  # => 3.0
    #   (1..10).mean          # => 5.5
    def mean
      # count (not size) is the element counter Enumerable itself provides.
      sum / count.to_f
    end

    # Calculates the median (middle value) of the collection
    # For collections with an even number of elements, returns the average of the two middle values
    #
    # @return [Numeric, nil] The median value, or nil if the collection is empty
    # @example
    #   [1, 2, 3, 4, 5].median  # => 3
    #   [1, 2, 3, 4].median     # => 2.5
    #   [5, 1, 3, 2, 4].median  # => 3 (automatically sorts)
    #   [].median               # => nil
    def median
      sorted = sort
      return nil if sorted.empty?

      midpoint = sorted.size / 2

      if sorted.size.even?
        sorted[midpoint - 1, 2].sum / 2.0
      else
        sorted[midpoint]
      end
    end

    # Calculates the sample variance of the collection
    # Uses the unbiased formula with n-1 degrees of freedom (Bessel's correction)
    #
    # @return [Float] The sample variance (NaN for a single-element collection,
    #   because the n-1 divisor is zero)
    # @example
    #   [1, 2, 3, 4, 5].variance  # => 2.5
    #   [5, 5, 5, 5].variance     # => 0.0 (no variation)
    def variance
      average = mean
      sum_of_squares = sum { |value| (value - average)**2 }
      sum_of_squares / (count - 1).to_f
    end

    # Calculates the sample standard deviation of the collection
    # Returns the square root of the sample variance
    #
    # @return [Float] The sample standard deviation
    # @example
    #   [1, 2, 3, 4, 5].standard_deviation  # => 1.58
    #   [5, 5, 5, 5].standard_deviation     # => 0.0
    def standard_deviation
      Math.sqrt(variance)
    end

    # Removes extreme outliers using the IQR (Interquartile Range) method
    # This is particularly effective for performance data which often has
    # extreme values due to network issues, CPU scheduling, GC pauses, etc.
    #
    # Quartiles are linearly interpolated at positions (n-1) * 0.25 and
    # (n-1) * 0.75 of the sorted values; anything outside
    # [Q1 - multiplier * IQR, Q3 + multiplier * IQR] is dropped.
    #
    # @param multiplier [Float] IQR multiplier (1.5 is standard, 2.0 is more conservative)
    # @return [Enumerable] The result of +select+ with outliers removed (an Array for
    #   most collections); the receiver itself, unfiltered, when it has fewer than
    #   4 elements
    def remove_outliers(multiplier: 1.5)
      return self if count < 4 # Need minimum data points for quartiles

      sorted = sort
      last_index = sorted.size - 1

      # Value at a (possibly fractional) index of the sorted data.
      value_at = lambda do |position|
        lower_index = position.floor
        next sorted[lower_index] if position == lower_index

        weight = position - lower_index
        sorted[lower_index] + (weight * (sorted[position.ceil] - sorted[lower_index]))
      end

      q1 = value_at.call(last_index * 0.25)
      q3 = value_at.call(last_index * 0.75)
      iqr = q3 - q1

      lower_bound = q1 - (multiplier * iqr)
      upper_bound = q3 + (multiplier * iqr)

      select { |value| value >= lower_bound && value <= upper_bound }
    end

    # Returns statistics about outlier removal for debugging/logging
    # Provides detailed information about how many outliers were removed and their percentage
    #
    # @param multiplier [Float] IQR multiplier for outlier detection (1.5 is standard, 2.0 is more conservative)
    # @return [Hash] Statistics hash containing :original_count, :filtered_count, :outliers_removed,
    #   :outlier_percentage (rounded to 2 decimals; 0.0 for an empty collection)
    # @example
    #   data = [1, 2, 3, 4, 5, 100]
    #   stats = data.outlier_stats
    #   # => {original_count: 6, filtered_count: 5, outliers_removed: 1, outlier_percentage: 16.67}
    def outlier_stats(multiplier: 1.5)
      original_count = count
      filtered_count = remove_outliers(multiplier: multiplier).count
      removed = original_count - filtered_count

      # Guard the empty case: 0 / 0 would otherwise yield NaN.
      percentage = original_count.zero? ? 0.0 : (removed.to_f / original_count * 100).round(2)

      {
        original_count: original_count,
        filtered_count: filtered_count,
        outliers_removed: removed,
        outlier_percentage: percentage
      }
    end
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: enumerable-stats
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Jon Daniel
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 2025-07-31 00:00:00.000000000 Z
11
+ dependencies: []
12
+ description: |
13
+ A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.
14
+ Provides mean, median, variance, and standard deviation calculations, along with robust outlier detection using the IQR method.
15
+ Perfect for data analysis, performance monitoring, A/B testing, and cleaning datasets with extreme values.
16
+ Zero dependencies and works seamlessly with any Ruby collection that includes Enumerable.
17
+ email: binarycleric@gmail.com
18
+ executables: []
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - lib/enumerable-stats.rb
23
+ - lib/enumerable_stats/enumerable_ext.rb
24
+ homepage: https://github.com/binarycleric/enumerable-stats
25
+ licenses:
26
+ - MIT
27
+ metadata:
28
+ source_code_uri: https://github.com/binarycleric/enumerable-stats
29
+ github_repo: ssh://github.com/binarycleric/enumerable-stats
30
+ rubygems_mfa_required: 'true'
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - ">="
37
+ - !ruby/object:Gem::Version
38
+ version: 3.1.0
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubygems_version: 3.6.2
46
+ specification_version: 4
47
+ summary: Statistical Methods for Enumerable Collections
48
+ test_files: []