enumerable-stats 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: bd06a4ad69a379469e16e45d5eca8debb2a786edc66c3208429adde1c6f80f20
4
- data.tar.gz: 2a8823cdf6f5fe2793e9df99dbd71f4427a63146fdd165475204fe4493607355
3
+ metadata.gz: 8c04e21076f0a3ebbf61538159c1348107ca5ed371f4b1f135212dd8d932e184
4
+ data.tar.gz: 32ef265cbaac6a2801e01fb67b73a12313a65d900bc9e1ca0bbc0bfa0bc98f0d
5
5
  SHA512:
6
- metadata.gz: 308025ee228b384520cff759f5664beace555bd66e3894c389e1547359e2eb90e7fad1bfd72c6774f5b1b327c222a5dd9e8a278b9e16155a9a7e48a693bc60bc
7
- data.tar.gz: 224af711d1b3b24d8218cf15adb116216dcb57c81f677b5af8c02c238e2ea304e2ac3537aa9c221750055b68a61d15cce578ee5e7a7687283cce4fb0f917f5dd
6
+ metadata.gz: 2ebf8b1692788056fe5a6492cd4b986d1b88372e6877cffab441550980e1bf1dd775ebb7254c1bd6c2c91247c0890e6dadec67ab906ce75a3ba486a10a42815d
7
+ data.tar.gz: b2293b0fe996aaf31a4c245a54f1a6eb1b9f03b4e1cc5b2cc7ac2c57cfcb423f9299ac5a4518d5d1fa6846ebaff977abaafa0eb08f175faef5c456d6b7d222f5
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'enumerable_stats/enumerable_ext'
3
+ require_relative "enumerable_stats/enumerable_ext"
4
4
 
5
5
  module Enumerable
6
6
  include EnumerableStats::EnumerableExt
@@ -1,6 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EnumerableStats
4
+ # Extension module that adds statistical methods to all Enumerable objects.
5
+ #
6
+ # This module provides essential statistical functions including measures of central tendency
7
+ # (mean, median), measures of dispersion (variance, standard deviation), percentile calculations,
8
+ # outlier detection using the IQR method, and statistical comparison methods.
9
+ #
10
+ # When included, these methods become available on all Ruby collections that include
11
+ # Enumerable (Arrays, Ranges, Sets, etc.), enabling seamless statistical analysis
12
+ # without external dependencies.
13
+ #
14
+ # @example Basic statistical calculations
15
+ # [1, 2, 3, 4, 5].mean #=> 3.0
16
+ # [1, 2, 3, 4, 5].median #=> 3
17
+ # [1, 2, 3, 4, 5].percentile(75) #=> 4
18
+ #
19
+ # @example Outlier detection
20
+ # data = [1, 2, 3, 4, 100]
21
+ # data.remove_outliers #=> [1, 2, 3, 4]
22
+ # data.outlier_stats #=> { outliers_removed: 1, outlier_percentage: 20.0, ... }
23
+ #
24
+ # @example Statistical testing
25
+ # control = [10, 12, 14, 16, 18]
26
+ # treatment = [15, 17, 19, 21, 23]
27
+ # control.t_value(treatment) #=> negative t-statistic
28
+ # control.degrees_of_freedom(treatment) #=> degrees of freedom for Welch's t-test
29
+ # treatment.greater_than?(control) #=> true (treatment mean significantly > control mean)
30
+ # control.less_than?(treatment) #=> true (control mean significantly < treatment mean)
31
+ #
32
+ # @see Enumerable
33
+ # @since 0.1.0
4
34
  module EnumerableExt
5
35
  # Calculates the percentage difference between this collection's mean and another value or collection's mean
6
36
  # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
@@ -13,7 +43,7 @@ module EnumerableStats
13
43
  b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
14
44
 
15
45
  return 0.0 if a == b
16
- return Float::INFINITY if a + b == 0
46
+ return Float::INFINITY if (a + b).zero?
17
47
 
18
48
  ((a - b).abs / ((a + b) / 2.0).abs) * 100
19
49
  end
@@ -29,7 +59,7 @@ module EnumerableStats
29
59
  b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
30
60
 
31
61
  return 0.0 if a == b
32
- return Float::INFINITY if a + b == 0
62
+ return Float::INFINITY if (a + b).zero?
33
63
 
34
64
  ((a - b) / ((a + b) / 2.0).abs) * 100
35
65
  end
@@ -70,12 +100,52 @@ module EnumerableStats
70
100
 
71
101
  n = (n1 + n2)**2
72
102
 
73
- d1 = variance**2 / (count**2 * (count - 1))
74
- d2 = other.variance**2 / (other.count**2 * (other.count - 1))
103
+ d1 = (variance**2) / ((count**2) * (count - 1))
104
+ d2 = (other.variance**2) / ((other.count**2) * (other.count - 1))
75
105
 
76
106
  n / (d1 + d2)
77
107
  end
78
108
 
109
+ # Tests if this collection's mean is significantly greater than another collection's mean
110
+ # using a one-tailed Welch's t-test (unequal variances). Returns true if the test indicates statistical
111
+ # significance at the specified alpha level.
112
+ #
113
+ # @param other [Enumerable] Another collection to compare against
114
+ # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
115
+ # @return [Boolean] True if this collection's mean is significantly greater
116
+ # @example
117
+ # control = [10, 12, 11, 13, 12] # mean ≈ 11.6
118
+ # treatment = [15, 17, 16, 18, 14] # mean = 16.0
119
+ # treatment.greater_than?(control) # => true (treatment significantly > control)
120
+ # control.greater_than?(treatment) # => false
121
+ def greater_than?(other, alpha: 0.05)
122
+ t_stat = t_value(other)
123
+ df = degrees_of_freedom(other)
124
+ critical_value = critical_t_value(df, alpha)
125
+
126
+ t_stat > critical_value
127
+ end
128
+
129
+ # Tests if this collection's mean is significantly less than another collection's mean
130
+ # using a one-tailed Welch's t-test (unequal variances). Returns true if the test indicates statistical
131
+ # significance at the specified alpha level.
132
+ #
133
+ # @param other [Enumerable] Another collection to compare against
134
+ # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
135
+ # @return [Boolean] True if this collection's mean is significantly less
136
+ # @example
137
+ # control = [10, 12, 11, 13, 12] # mean ≈ 11.6
138
+ # treatment = [15, 17, 16, 18, 14] # mean = 16.0
139
+ # control.less_than?(treatment) # => true (control significantly < treatment)
140
+ # treatment.less_than?(control) # => false
141
+ def less_than?(other, alpha: 0.05)
142
+ t_stat = t_value(other)
143
+ df = degrees_of_freedom(other)
144
+ critical_value = critical_t_value(df, alpha)
145
+
146
+ t_stat < -critical_value
147
+ end
148
+
79
149
  # Calculates the arithmetic mean (average) of the collection
80
150
  #
81
151
  # @return [Float] The arithmetic mean of all numeric values
@@ -96,7 +166,7 @@ module EnumerableStats
96
166
  # [5, 1, 3, 2, 4].median # => 3 (automatically sorts)
97
167
  # [].median # => nil
98
168
  def median
99
- return nil if size == 0
169
+ return nil if size.zero?
100
170
 
101
171
  sorted = sort
102
172
  midpoint = size / 2
@@ -108,6 +178,53 @@ module EnumerableStats
108
178
  end
109
179
  end
110
180
 
181
+ # Calculates the specified percentile of the collection
182
+ # Uses linear interpolation between data points when the exact percentile falls between values
183
+ # This is equivalent to the "linear" method used by many statistical software packages
184
+ #
185
+ # @param percentile [Numeric] The percentile to calculate (0-100)
186
+ # @return [Numeric, nil] The value at the specified percentile, or nil if the collection is empty
187
+ # @raise [ArgumentError] If percentile is not between 0 and 100
188
+ # @example
189
+ # [1, 2, 3, 4, 5].percentile(50) # => 3 (same as median)
190
+ # [1, 2, 3, 4, 5].percentile(25) # => 2 (25th percentile; exact element, no interpolation)
191
+ # [1, 2, 3, 4, 5].percentile(75) # => 4 (75th percentile; exact element, no interpolation)
192
+ # [1, 2, 3, 4, 5].percentile(0) # => 1 (minimum value)
193
+ # [1, 2, 3, 4, 5].percentile(100) # => 5 (maximum value)
194
+ # [].percentile(50) # => nil (empty collection)
195
+ def percentile(percentile)
196
+ return nil if size.zero?
197
+
198
+ unless percentile.is_a?(Numeric) && percentile >= 0 && percentile <= 100
199
+ raise ArgumentError, "Percentile must be a number between 0 and 100, got #{percentile}"
200
+ end
201
+
202
+ sorted = sort
203
+
204
+ # Handle edge cases
205
+ return sorted.first if percentile.zero?
206
+ return sorted.last if percentile == 100
207
+
208
+ # Calculate the position using the "linear" method (R-7/Excel method)
209
+ # This is the most commonly used method in statistical software
210
+ position = (size - 1) * (percentile / 100.0)
211
+
212
+ # If position is an integer, return that exact element
213
+ if position == position.floor
214
+ sorted[position.to_i]
215
+ else
216
+ # Linear interpolation between the two surrounding values
217
+ lower_index = position.floor
218
+ upper_index = position.ceil
219
+ weight = position - position.floor
220
+
221
+ lower_value = sorted[lower_index]
222
+ upper_value = sorted[upper_index]
223
+
224
+ lower_value + (weight * (upper_value - lower_value))
225
+ end
226
+ end
227
+
111
228
  # Calculates the sample variance of the collection
112
229
  # Uses the unbiased formula with n-1 degrees of freedom (Bessel's correction)
113
230
  #
@@ -117,7 +234,7 @@ module EnumerableStats
117
234
  # [5, 5, 5, 5].variance # => 0.0 (no variation)
118
235
  def variance
119
236
  mean = self.mean
120
- sum_of_squares = map { |r| (r - mean)**2 }.sum
237
+ sum_of_squares = sum { |r| (r - mean)**2 }
121
238
  sum_of_squares / (count - 1).to_f
122
239
  end
123
240
 
@@ -157,7 +274,7 @@ module EnumerableStats
157
274
  lower_index = q1_pos.floor
158
275
  upper_index = q1_pos.ceil
159
276
  weight = q1_pos - q1_pos.floor
160
- q1 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
277
+ q1 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
161
278
  end
162
279
 
163
280
  # Calculate Q3
@@ -167,7 +284,7 @@ module EnumerableStats
167
284
  lower_index = q3_pos.floor
168
285
  upper_index = q3_pos.ceil
169
286
  weight = q3_pos - q3_pos.floor
170
- q3 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
287
+ q3 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
171
288
  end
172
289
 
173
290
  iqr = q3 - q1
@@ -177,7 +294,7 @@ module EnumerableStats
177
294
  upper_bound = q3 + (multiplier * iqr)
178
295
 
179
296
  # Filter out outliers
180
- select { |value| value >= lower_bound && value <= upper_bound }
297
+ select { |value| value.between?(lower_bound, upper_bound) }
181
298
  end
182
299
 
183
300
  # Returns statistics about outlier removal for debugging/logging
@@ -200,5 +317,78 @@ module EnumerableStats
200
317
  outlier_percentage: ((original_count - filtered.size).to_f / original_count * 100).round(2)
201
318
  }
202
319
  end
320
+
321
+ private
322
+
323
+ # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
324
+ # Uses a lookup table for common df values and approximations for others
325
+ #
326
+ # @param df [Float] Degrees of freedom
327
+ # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
328
+ # @return [Float] Critical t-value for one-tailed test
329
+ def critical_t_value(df, alpha)
330
+ # For large df (≥30), t-distribution approximates normal distribution
331
+ return normal_critical_value(alpha) if df >= 30
332
+
333
+ # Lookup table for common t-values (one-tailed, α = 0.05)
334
+ # These are standard critical values from t-tables
335
+ t_table_05 = {
336
+ 1 => 6.314, 2 => 2.920, 3 => 2.353, 4 => 2.132, 5 => 2.015,
337
+ 6 => 1.943, 7 => 1.895, 8 => 1.860, 9 => 1.833, 10 => 1.812,
338
+ 11 => 1.796, 12 => 1.782, 13 => 1.771, 14 => 1.761, 15 => 1.753,
339
+ 16 => 1.746, 17 => 1.740, 18 => 1.734, 19 => 1.729, 20 => 1.725,
340
+ 21 => 1.721, 22 => 1.717, 23 => 1.714, 24 => 1.711, 25 => 1.708,
341
+ 26 => 1.706, 27 => 1.703, 28 => 1.701, 29 => 1.699
342
+ }
343
+
344
+ # Lookup table for common t-values (one-tailed, α = 0.01)
345
+ t_table_01 = {
346
+ 1 => 31.821, 2 => 6.965, 3 => 4.541, 4 => 3.747, 5 => 3.365,
347
+ 6 => 3.143, 7 => 2.998, 8 => 2.896, 9 => 2.821, 10 => 2.764,
348
+ 11 => 2.718, 12 => 2.681, 13 => 2.650, 14 => 2.624, 15 => 2.602,
349
+ 16 => 2.583, 17 => 2.567, 18 => 2.552, 19 => 2.539, 20 => 2.528,
350
+ 21 => 2.518, 22 => 2.508, 23 => 2.500, 24 => 2.492, 25 => 2.485,
351
+ 26 => 2.479, 27 => 2.473, 28 => 2.467, 29 => 2.462
352
+ }
353
+
354
+ df_int = df.round
355
+
356
+ if alpha <= 0.01
357
+ t_table_01[df_int] || t_table_01[29] # Use df=29 as fallback for larger values
358
+ elsif alpha <= 0.05
359
+ t_table_05[df_int] || t_table_05[29] # Use df=29 as fallback for larger values
360
+ else
361
+ # For alpha > 0.05, interpolate or use approximation
362
+ # This is a rough approximation for other alpha levels
363
+ base_t = t_table_05[df_int] || t_table_05[29]
364
+ base_t * ((0.05 / alpha)**0.5)
365
+ end
366
+ end
367
+
368
+ # Returns the critical value for standard normal distribution (z-score)
369
+ # Used when degrees of freedom is large (≥30)
370
+ #
371
+ # @param alpha [Float] Significance level
372
+ # @return [Float] Critical z-value for one-tailed test
373
+ def normal_critical_value(alpha)
374
+ # Common z-values for one-tailed tests
375
+ # Use approximate comparisons to avoid float equality issues
376
+ if (alpha - 0.10).abs < 1e-10
377
+ 1.282
378
+ elsif (alpha - 0.05).abs < 1e-10
379
+ 1.645
380
+ elsif (alpha - 0.025).abs < 1e-10
381
+ 1.960
382
+ elsif (alpha - 0.01).abs < 1e-10
383
+ 2.326
384
+ elsif (alpha - 0.005).abs < 1e-10
385
+ 2.576
386
+ else
387
+ # Approximation using inverse normal for other alpha values
388
+ # This is a rough approximation of the inverse normal CDF
389
+ # NOTE: this is the Gaussian tail bound, which overestimates the exact quantile (for α = 0.05 it gives ≈ 2.448, versus the exact 1.645)
390
+ Math.sqrt(-2 * Math.log(alpha))
391
+ end
392
+ end
203
393
  end
204
- end
394
+ end
metadata CHANGED
@@ -1,13 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: enumerable-stats
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jon Daniel
8
+ autorequire:
8
9
  bindir: bin
9
10
  cert_chain: []
10
- date: 2025-07-31 00:00:00.000000000 Z
11
+ date: 2025-08-02 00:00:00.000000000 Z
11
12
  dependencies: []
12
13
  description: |
13
14
  A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.
@@ -28,6 +29,7 @@ metadata:
28
29
  source_code_uri: https://github.com/binarycleric/enumerable-stats
29
30
  github_repo: ssh://github.com/binarycleric/enumerable-stats
30
31
  rubygems_mfa_required: 'true'
32
+ post_install_message:
31
33
  rdoc_options: []
32
34
  require_paths:
33
35
  - lib
@@ -42,7 +44,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
42
44
  - !ruby/object:Gem::Version
43
45
  version: '0'
44
46
  requirements: []
45
- rubygems_version: 3.6.2
47
+ rubygems_version: 3.5.22
48
+ signing_key:
46
49
  specification_version: 4
47
50
  summary: Statistical Methods for Enumerable Collections
48
51
  test_files: []