enumerable-stats 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5a52c483b38592d8be9155651e63a4d238850927bc3282f27f22a4859e1db1a
4
- data.tar.gz: 0e807826f7103e049effcd1bd925279e5b1ea7d34932fddebf7167e184a6b887
3
+ metadata.gz: 8c04e21076f0a3ebbf61538159c1348107ca5ed371f4b1f135212dd8d932e184
4
+ data.tar.gz: 32ef265cbaac6a2801e01fb67b73a12313a65d900bc9e1ca0bbc0bfa0bc98f0d
5
5
  SHA512:
6
- metadata.gz: df23176506fd05b2769fc6ee49ed62bc7954ec9f810d6153a69754df90a3cbed7b4744e269c7f26b08b1e2bb56b0384d3c1a46bdacc7a57ecba7f9029c4bcb19
7
- data.tar.gz: 8e80e9d596018a704773ca2836f951d1565876b35aedf7428117a61d031f685ea788ac115d02f04fc7b27ab9b06ebae5a617fe891a97130b8da628cdee807c20
6
+ metadata.gz: 2ebf8b1692788056fe5a6492cd4b986d1b88372e6877cffab441550980e1bf1dd775ebb7254c1bd6c2c91247c0890e6dadec67ab906ce75a3ba486a10a42815d
7
+ data.tar.gz: b2293b0fe996aaf31a4c245a54f1a6eb1b9f03b4e1cc5b2cc7ac2c57cfcb423f9299ac5a4518d5d1fa6846ebaff977abaafa0eb08f175faef5c456d6b7d222f5
@@ -1,6 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'enumerable_stats/enumerable_ext'
3
+ require_relative "enumerable_stats/enumerable_ext"
4
4
 
5
5
  module Enumerable
6
6
  include EnumerableStats::EnumerableExt
@@ -1,6 +1,36 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EnumerableStats
4
+ # Extension module that adds statistical methods to all Enumerable objects.
5
+ #
6
+ # This module provides essential statistical functions including measures of central tendency
7
+ # (mean, median), measures of dispersion (variance, standard deviation), percentile calculations,
8
+ # outlier detection using the IQR method, and statistical comparison methods.
9
+ #
10
+ # When included, these methods become available on all Ruby collections that include
11
+ # Enumerable (Arrays, Ranges, Sets, etc.), enabling seamless statistical analysis
12
+ # without external dependencies.
13
+ #
14
+ # @example Basic statistical calculations
15
+ # [1, 2, 3, 4, 5].mean #=> 3.0
16
+ # [1, 2, 3, 4, 5].median #=> 3
17
+ # [1, 2, 3, 4, 5].percentile(75) #=> 4.0
18
+ #
19
+ # @example Outlier detection
20
+ # data = [1, 2, 3, 4, 100]
21
+ # data.remove_outliers #=> [1, 2, 3, 4]
22
+ # data.outlier_stats #=> { outliers_removed: 1, outlier_percentage: 20.0, ... }
23
+ #
24
+ # @example Statistical testing
25
+ # control = [10, 12, 14, 16, 18]
26
+ # treatment = [15, 17, 19, 21, 23]
27
+ # control.t_value(treatment) #=> negative t-statistic
28
+ # control.degrees_of_freedom(treatment) #=> degrees of freedom for Welch's t-test
29
+ # treatment.greater_than?(control) #=> true (treatment mean significantly > control mean)
30
+ # control.less_than?(treatment) #=> true (control mean significantly < treatment mean)
31
+ #
32
+ # @see Enumerable
33
+ # @since 0.1.0
4
34
  module EnumerableExt
5
35
  # Calculates the percentage difference between this collection's mean and another value or collection's mean
6
36
  # Uses the symmetric percentage difference formula: |a - b| / ((a + b) / 2) * 100
@@ -13,7 +43,7 @@ module EnumerableStats
13
43
  b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
14
44
 
15
45
  return 0.0 if a == b
16
- return Float::INFINITY if a + b == 0
46
+ return Float::INFINITY if (a + b).zero?
17
47
 
18
48
  ((a - b).abs / ((a + b) / 2.0).abs) * 100
19
49
  end
@@ -29,7 +59,7 @@ module EnumerableStats
29
59
  b = other.respond_to?(:mean) ? other.mean.to_f : other.to_f
30
60
 
31
61
  return 0.0 if a == b
32
- return Float::INFINITY if a + b == 0
62
+ return Float::INFINITY if (a + b).zero?
33
63
 
34
64
  ((a - b) / ((a + b) / 2.0).abs) * 100
35
65
  end
@@ -70,12 +100,52 @@ module EnumerableStats
70
100
 
71
101
  n = (n1 + n2)**2
72
102
 
73
- d1 = variance**2 / (count**2 * (count - 1))
74
- d2 = other.variance**2 / (other.count**2 * (other.count - 1))
103
+ d1 = (variance**2) / ((count**2) * (count - 1))
104
+ d2 = (other.variance**2) / ((other.count**2) * (other.count - 1))
75
105
 
76
106
  n / (d1 + d2)
77
107
  end
78
108
 
109
+ # Tests if this collection's mean is significantly greater than another collection's mean
110
+ # using a one-tailed Welch's t-test. Returns true if the test indicates statistical
111
+ # significance at the specified alpha level.
112
+ #
113
+ # @param other [Enumerable] Another collection to compare against
114
+ # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
115
+ # @return [Boolean] True if this collection's mean is significantly greater
116
+ # @example
117
+ # control = [10, 12, 11, 13, 12] # mean = 11.6
118
+ # treatment = [15, 17, 16, 18, 14] # mean = 16.0
119
+ # treatment.greater_than?(control) # => true (treatment significantly > control)
120
+ # control.greater_than?(treatment) # => false
121
+ def greater_than?(other, alpha: 0.05)
122
+ t_stat = t_value(other)
123
+ df = degrees_of_freedom(other)
124
+ critical_value = critical_t_value(df, alpha)
125
+
126
+ t_stat > critical_value
127
+ end
128
+
129
+ # Tests if this collection's mean is significantly less than another collection's mean
130
+ # using a one-tailed Welch's t-test. Returns true if the test indicates statistical
131
+ # significance at the specified alpha level.
132
+ #
133
+ # @param other [Enumerable] Another collection to compare against
134
+ # @param alpha [Float] Significance level (default: 0.05 for 95% confidence)
135
+ # @return [Boolean] True if this collection's mean is significantly less
136
+ # @example
137
+ # control = [10, 12, 11, 13, 12] # mean = 11.6
138
+ # treatment = [15, 17, 16, 18, 14] # mean = 16.0
139
+ # control.less_than?(treatment) # => true (control significantly < treatment)
140
+ # treatment.less_than?(control) # => false
141
+ def less_than?(other, alpha: 0.05)
142
+ t_stat = t_value(other)
143
+ df = degrees_of_freedom(other)
144
+ critical_value = critical_t_value(df, alpha)
145
+
146
+ t_stat < -critical_value
147
+ end
148
+
79
149
  # Calculates the arithmetic mean (average) of the collection
80
150
  #
81
151
  # @return [Float] The arithmetic mean of all numeric values
@@ -96,7 +166,7 @@ module EnumerableStats
96
166
  # [5, 1, 3, 2, 4].median # => 3 (automatically sorts)
97
167
  # [].median # => nil
98
168
  def median
99
- return nil if size == 0
169
+ return nil if size.zero?
100
170
 
101
171
  sorted = sort
102
172
  midpoint = size / 2
@@ -123,7 +193,7 @@ module EnumerableStats
123
193
  # [1, 2, 3, 4, 5].percentile(100) # => 5 (maximum value)
124
194
  # [].percentile(50) # => nil (empty collection)
125
195
  def percentile(percentile)
126
- return nil if size == 0
196
+ return nil if size.zero?
127
197
 
128
198
  unless percentile.is_a?(Numeric) && percentile >= 0 && percentile <= 100
129
199
  raise ArgumentError, "Percentile must be a number between 0 and 100, got #{percentile}"
@@ -132,7 +202,7 @@ module EnumerableStats
132
202
  sorted = sort
133
203
 
134
204
  # Handle edge cases
135
- return sorted.first if percentile == 0
205
+ return sorted.first if percentile.zero?
136
206
  return sorted.last if percentile == 100
137
207
 
138
208
  # Calculate the position using the "linear" method (R-7/Excel method)
@@ -151,7 +221,7 @@ module EnumerableStats
151
221
  lower_value = sorted[lower_index]
152
222
  upper_value = sorted[upper_index]
153
223
 
154
- lower_value + weight * (upper_value - lower_value)
224
+ lower_value + (weight * (upper_value - lower_value))
155
225
  end
156
226
  end
157
227
 
@@ -164,7 +234,7 @@ module EnumerableStats
164
234
  # [5, 5, 5, 5].variance # => 0.0 (no variation)
165
235
  def variance
166
236
  mean = self.mean
167
- sum_of_squares = map { |r| (r - mean)**2 }.sum
237
+ sum_of_squares = sum { |r| (r - mean)**2 }
168
238
  sum_of_squares / (count - 1).to_f
169
239
  end
170
240
 
@@ -204,7 +274,7 @@ module EnumerableStats
204
274
  lower_index = q1_pos.floor
205
275
  upper_index = q1_pos.ceil
206
276
  weight = q1_pos - q1_pos.floor
207
- q1 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
277
+ q1 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
208
278
  end
209
279
 
210
280
  # Calculate Q3
@@ -214,7 +284,7 @@ module EnumerableStats
214
284
  lower_index = q3_pos.floor
215
285
  upper_index = q3_pos.ceil
216
286
  weight = q3_pos - q3_pos.floor
217
- q3 = sorted[lower_index] + weight * (sorted[upper_index] - sorted[lower_index])
287
+ q3 = sorted[lower_index] + (weight * (sorted[upper_index] - sorted[lower_index]))
218
288
  end
219
289
 
220
290
  iqr = q3 - q1
@@ -224,7 +294,7 @@ module EnumerableStats
224
294
  upper_bound = q3 + (multiplier * iqr)
225
295
 
226
296
  # Filter out outliers
227
- select { |value| value >= lower_bound && value <= upper_bound }
297
+ select { |value| value.between?(lower_bound, upper_bound) }
228
298
  end
229
299
 
230
300
  # Returns statistics about outlier removal for debugging/logging
@@ -247,5 +317,78 @@ module EnumerableStats
247
317
  outlier_percentage: ((original_count - filtered.size).to_f / original_count * 100).round(2)
248
318
  }
249
319
  end
320
+
321
+ private
322
+
323
+ # Calculates the critical t-value for a one-tailed test given degrees of freedom and alpha level
324
+ # Uses a lookup table for common df values and approximations for others
325
+ #
326
+ # @param df [Float] Degrees of freedom
327
+ # @param alpha [Float] Significance level (e.g., 0.05 for 95% confidence)
328
+ # @return [Float] Critical t-value for one-tailed test
329
+ def critical_t_value(df, alpha)
330
+ # For large df (≥30), t-distribution approximates normal distribution
331
+ return normal_critical_value(alpha) if df >= 30
332
+
333
+ # Lookup table for common t-values (one-tailed, α = 0.05)
334
+ # These are standard critical values from t-tables
335
+ t_table_05 = {
336
+ 1 => 6.314, 2 => 2.920, 3 => 2.353, 4 => 2.132, 5 => 2.015,
337
+ 6 => 1.943, 7 => 1.895, 8 => 1.860, 9 => 1.833, 10 => 1.812,
338
+ 11 => 1.796, 12 => 1.782, 13 => 1.771, 14 => 1.761, 15 => 1.753,
339
+ 16 => 1.746, 17 => 1.740, 18 => 1.734, 19 => 1.729, 20 => 1.725,
340
+ 21 => 1.721, 22 => 1.717, 23 => 1.714, 24 => 1.711, 25 => 1.708,
341
+ 26 => 1.706, 27 => 1.703, 28 => 1.701, 29 => 1.699
342
+ }
343
+
344
+ # Lookup table for common t-values (one-tailed, α = 0.01)
345
+ t_table_01 = {
346
+ 1 => 31.821, 2 => 6.965, 3 => 4.541, 4 => 3.747, 5 => 3.365,
347
+ 6 => 3.143, 7 => 2.998, 8 => 2.896, 9 => 2.821, 10 => 2.764,
348
+ 11 => 2.718, 12 => 2.681, 13 => 2.650, 14 => 2.624, 15 => 2.602,
349
+ 16 => 2.583, 17 => 2.567, 18 => 2.552, 19 => 2.539, 20 => 2.528,
350
+ 21 => 2.518, 22 => 2.508, 23 => 2.500, 24 => 2.492, 25 => 2.485,
351
+ 26 => 2.479, 27 => 2.473, 28 => 2.467, 29 => 2.462
352
+ }
353
+
354
+ df_int = df.round
355
+
356
+ if alpha <= 0.01
357
+ t_table_01[df_int] || t_table_01[29] # Use df=29 as fallback for larger values
358
+ elsif alpha <= 0.05
359
+ t_table_05[df_int] || t_table_05[29] # Use df=29 as fallback for larger values
360
+ else
361
+ # For alpha > 0.05, interpolate or use approximation
362
+ # This is a rough approximation for other alpha levels
363
+ base_t = t_table_05[df_int] || t_table_05[29]
364
+ base_t * ((0.05 / alpha)**0.5)
365
+ end
366
+ end
367
+
368
+ # Returns the critical value for standard normal distribution (z-score)
369
+ # Used when degrees of freedom is large (≥30)
370
+ #
371
+ # @param alpha [Float] Significance level
372
+ # @return [Float] Critical z-value for one-tailed test
373
+ def normal_critical_value(alpha)
374
+ # Common z-values for one-tailed tests
375
+ # Use approximate comparisons to avoid float equality issues
376
+ if (alpha - 0.10).abs < 1e-10
377
+ 1.282
378
+ elsif (alpha - 0.05).abs < 1e-10
379
+ 1.645
380
+ elsif (alpha - 0.025).abs < 1e-10
381
+ 1.960
382
+ elsif (alpha - 0.01).abs < 1e-10
383
+ 2.326
384
+ elsif (alpha - 0.005).abs < 1e-10
385
+ 2.576
386
+ else
387
+ # Approximation using inverse normal for other alpha values
388
+ # This is a rough approximation of the inverse normal CDF
389
+ # For α = 0.05, this gives approximately 1.645
390
+ Math.sqrt(-2 * Math.log(alpha))
391
+ end
392
+ end
250
393
  end
251
- end
394
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: enumerable-stats
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jon Daniel
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-08-01 00:00:00.000000000 Z
11
+ date: 2025-08-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: |
14
14
  A Ruby gem that extends all Enumerable objects (Arrays, Ranges, Sets, etc.) with essential statistical methods.