more_math 1.5.0 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,111 +2,179 @@ require 'more_math/sequence/moving_average'
2
2
  require 'more_math/sequence/refinement'
3
3
 
4
4
  module MoreMath
5
- # This class is used to contain elements and compute various statistical
6
- # values for them.
5
+ # A sequence class for statistical analysis and mathematical operations.
6
+ #
7
+ # This class provides comprehensive statistical functionality including:
8
+ # - Basic sequence operations (iteration, size, etc.)
9
+ # - Statistical measures (mean, variance, standard deviation)
10
+ # - Advanced statistical methods (percentiles, confidence intervals)
11
+ # - Time series analysis (moving averages, autocorrelation)
12
+ # - Hypothesis testing (t-tests, confidence intervals)
13
+ # - Data visualization tools (histograms)
14
+ #
15
+ # @example Basic usage
16
+ # sequence = Sequence.new([1, 2, 3, 4, 5])
17
+ # puts sequence.mean # => 3.0
18
+ # puts sequence.variance # => 2.0
19
+ # sequence.simple_moving_average(3) # => [2.0, 3.0, 4.0]
20
+ #
21
+ # @example Statistical analysis
22
+ # data = Sequence.new([10, 15, 20, 25, 30])
23
+ # puts data.percentile(90) # => 28.0
24
+ # puts data.confidence_interval(0.05) # => 17.0..23.0
7
25
  class Sequence
8
26
  include MoreMath::Sequence::MovingAverage
9
27
 
28
+ # Initializes a new Sequence instance with the given elements.
29
+ #
30
+ # @param elements [Array] The array of elements to store in this sequence
10
31
  def initialize(elements)
11
32
  @elements = elements.dup.freeze
12
33
  end
13
34
 
14
35
  # Returns the array of elements.
36
+ #
37
+ # @return [Array] The frozen array of elements in this sequence
15
38
  attr_reader :elements
16
39
 
17
- # Calls the +block+ for every element of this Sequence.
40
+ # Calls the block for every element of this Sequence.
41
+ #
42
+ # @yield [element] Yields each element to the block
43
+ # @yieldparam element [Object] Each element in the sequence
44
+ # @return [Array, Enumerator] The result of calling each on the stored elements: the elements array itself, or an Enumerator when no block is given
18
45
  def each(&block)
19
46
  @elements.each(&block)
20
47
  end
21
48
  include Enumerable
22
49
 
23
50
  # Returns true if this sequence is empty, otherwise false.
51
+ #
52
+ # @return [Boolean] true if sequence has no elements, false otherwise
24
53
  def empty?
25
54
  @elements.empty?
26
55
  end
27
56
 
28
- # Returns the number of elements, on which the analysis is based.
57
+ # Returns the number of elements in this sequence.
58
+ #
59
+ # @return [Integer] The count of elements in the sequence
29
60
  def size
30
61
  @elements.size
31
62
  end
32
63
 
33
64
  # Reset all memoized values of this sequence.
65
+ #
66
+ # @return [self] Returns self after clearing memoization cache
34
67
  def reset
35
68
  self.class.mize_cache_clear
36
69
  self
37
70
  end
38
71
 
72
+ # Converts the sequence to an array.
73
+ #
74
+ # @return [Array] A duplicate of the internal elements array
39
75
  def to_ary
40
76
  @elements.dup
41
77
  end
42
78
 
43
79
  alias to_a to_ary
44
80
 
45
- # Push +element+ on this Sequence and return a new Sequence instance with
46
- # +element+ as its last element.
81
+ # Returns a new Sequence with the element appended as its last element; this Sequence itself is not modified.
82
+ #
83
+ # @param element [Object] The element to add to the sequence
84
+ # @return [Sequence] A new Sequence instance with the element added
47
85
  def push(element)
48
86
  Sequence.new(@elements.dup.push(element))
49
87
  end
88
+
50
89
  alias << push
51
90
 
52
91
  # Returns the variance of the elements.
92
+ #
93
+ # Variance measures how far each number in the set is from the mean.
94
+ #
95
+ # @return [Float] The population variance of the elements
96
+ # @note Uses the formula: Σ(xi - μ)² / n
53
97
  memoize method:
54
98
  def variance
55
99
  sum_of_squares / size
56
100
  end
57
101
 
58
- # Returns the sample_variance of the elements.
102
+ # Returns the sample variance of the elements.
103
+ #
104
+ # Sample variance is used when the data represents a sample rather than a population.
105
+ #
106
+ # @return [Float] The sample variance of the elements
107
+ # @note Uses the formula: Σ(xi - μ)² / (n-1)
59
108
  memoize method:
60
109
  def sample_variance
61
110
  size > 1 ? sum_of_squares / (size - 1.0) : 0.0
62
111
  end
63
112
 
64
- # Returns the sum of squares (the sum of the squared deviations) of the
65
- # elements.
113
+ # Returns the sum of squares of the elements.
114
+ #
115
+ # Sum of squares is used in variance and standard deviation calculations.
116
+ #
117
+ # @return [Float] The sum of squared deviations from the mean
66
118
  memoize method:
67
119
  def sum_of_squares
68
120
  @elements.inject(0.0) { |s, t| s + (t - arithmetic_mean) ** 2 }
69
121
  end
70
122
 
71
123
  # Returns the standard deviation of the elements.
124
+ #
125
+ # Standard deviation measures the amount of variation or dispersion in a set of values.
126
+ #
127
+ # @return [Float] The population standard deviation
72
128
  memoize method:
73
129
  def standard_deviation
74
130
  Math.sqrt(variance)
75
131
  end
76
132
 
77
133
  # Returns the Z-score sequence derived from the current sequence.
134
+ #
135
+ # Z-scores standardize data by transforming it to have a mean of 0 and standard deviation of 1.
136
+ #
137
+ # @return [Sequence] A new Sequence with z-score values
78
138
  memoize method:
79
139
  def z_score
80
140
  self.class.new(elements.map { |t| t.to_f - mean / standard_deviation })
81
141
  end
82
142
 
83
- # Returns the standard deviation of the elements in percentage of the
84
- # arithmetic mean.
143
+ # Returns the standard deviation as a percentage of the arithmetic mean.
144
+ #
145
+ # @return [Float] Standard deviation expressed as a percentage of the mean
85
146
  memoize method:
86
147
  def standard_deviation_percentage
87
148
  100.0 * standard_deviation / arithmetic_mean
88
149
  end
89
150
 
90
151
  # Returns the sample standard deviation of the elements.
152
+ #
153
+ # @return [Float] The sample standard deviation
91
154
  memoize method:
92
155
  def sample_standard_deviation
93
156
  Math.sqrt(sample_variance)
94
157
  end
95
158
 
96
- # Returns the sample standard deviation of the elements in percentage
97
- # of the arithmetic mean.
159
+ # Returns the sample standard deviation as a percentage of the arithmetic mean.
160
+ #
161
+ # @return [Float] Sample standard deviation expressed as a percentage of the mean
98
162
  memoize method:
99
163
  def sample_standard_deviation_percentage
100
164
  100.0 * sample_standard_deviation / arithmetic_mean
101
165
  end
102
166
 
103
167
  # Returns the sum of all elements.
168
+ #
169
+ # @return [Float] The sum of all elements in the sequence
104
170
  memoize method:
105
171
  def sum
106
172
  @elements.inject(0.0) { |s, t| s + t }
107
173
  end
108
174
 
109
175
  # Returns the arithmetic mean of the elements.
176
+ #
177
+ # @return [Float] The arithmetic mean (average) of the elements
110
178
  memoize method:
111
179
  def arithmetic_mean
112
180
  sum / size
@@ -114,8 +182,11 @@ module MoreMath
114
182
 
115
183
  alias mean arithmetic_mean
116
184
 
117
- # Returns the harmonic mean of the elements. If any of the elements
118
- # is less than or equal to 0.0, this method returns NaN.
185
+ # Returns the harmonic mean of the elements.
186
+ #
187
+ # The harmonic mean is useful for rates and ratios. Returns NaN if any element is <= 0.
188
+ #
189
+ # @return [Float] The harmonic mean, or NaN if invalid input
119
190
  memoize method:
120
191
  def harmonic_mean
121
192
  sum = @elements.inject(0.0) { |s, t|
@@ -128,8 +199,12 @@ module MoreMath
128
199
  sum ? size / sum : 0 / 0.0
129
200
  end
130
201
 
131
- # Returns the geometric mean of the elements. If any of the
132
- # elements is less than 0.0, this method returns NaN.
202
+ # Returns the geometric mean of the elements.
203
+ #
204
+ # The geometric mean is useful for sets of positive numbers that are to be multiplied together.
205
+ # Returns NaN if any element is negative, 0 if any element is zero.
206
+ #
207
+ # @return [Float] The geometric mean, or NaN if invalid input
133
208
  memoize method:
134
209
  def geometric_mean
135
210
  sum = @elements.inject(0.0) { |s, t|
@@ -153,27 +228,36 @@ module MoreMath
153
228
  end
154
229
 
155
230
  # Returns the minimum of the elements.
231
+ #
232
+ # @return [Object] The minimum element in the sequence
156
233
  memoize method:
157
234
  def min
158
235
  @elements.min
159
236
  end
160
237
 
161
238
  # Returns the maximum of the elements.
239
+ #
240
+ # @return [Object] The maximum element in the sequence
162
241
  memoize method:
163
242
  def max
164
243
  @elements.max
165
244
  end
166
245
 
167
- # Return a sorted array of the elements.
246
+ # Returns a sorted array of the elements.
247
+ #
248
+ # @return [Array] A new array containing elements sorted in ascending order
168
249
  memoize method:
169
250
  def sorted
170
251
  @elements.sort
171
252
  end
172
253
 
173
- # Returns the +p+-percentile of the elements.
174
- # There are many methods to compute the percentile, this method uses the
175
- # the weighted average at x_(n + 1)p, which allows p to be in 0...100
176
- # (excluding the 100).
254
+ # Returns the p-percentile of the elements.
255
+ #
256
+ # Uses weighted average at x_(n + 1)p for interpolation between percentiles.
257
+ #
258
+ # @param p [Integer, Float] The percentile to calculate (0 <= p < 100)
259
+ # @return [Float] The p-th percentile value
260
+ # @raise [ArgumentError] If p is not in the range (0...100)
177
261
  def percentile(p = 50)
178
262
  (0...100).include?(p) or
179
263
  raise ArgumentError, "p = #{p}, but has to be in (0...100)"
@@ -195,16 +279,20 @@ module MoreMath
195
279
 
196
280
  alias median percentile
197
281
 
198
- # Use an approximation of the Welch-Satterthwaite equation to compute the
199
- # degrees of freedom for Welch's t-test.
282
+ # Computes the degrees of freedom for Welch's t-test.
283
+ #
284
+ # @param other [Sequence] The other sequence to compare against
285
+ # @return [Float] The degrees of freedom for Welch's t-test
200
286
  def compute_welch_df(other)
201
287
  (sample_variance / size + other.sample_variance / other.size) ** 2 / (
202
288
  (sample_variance ** 2 / (size ** 2 * (size - 1))) +
203
289
  (other.sample_variance ** 2 / (other.size ** 2 * (other.size - 1))))
204
290
  end
205
291
 
206
- # Returns the t value of the Welch's t-test between this Sequence
207
- # instance and the +other+.
292
+ # Returns the t value of the Welch's t-test between this sequence and another.
293
+ #
294
+ # @param other [Sequence] The other sequence to compare against
295
+ # @return [Float] The t-statistic value
208
296
  def t_welch(other)
209
297
  signal = arithmetic_mean - other.arithmetic_mean
210
298
  noise = Math.sqrt(sample_variance / size +
@@ -214,26 +302,35 @@ module MoreMath
214
302
  0.0
215
303
  end
216
304
 
217
- # Returns an estimation of the common standard deviation of the
218
- # elements of this and +other+.
305
+ # Returns an estimation of the common standard deviation of this and another sequence.
306
+ #
307
+ # @param other [Sequence] The other sequence to compare against
308
+ # @return [Float] The pooled standard deviation estimate
219
309
  def common_standard_deviation(other)
220
310
  Math.sqrt(common_variance(other))
221
311
  end
222
312
 
223
- # Returns an estimation of the common variance of the elements of this
224
- # and +other+.
313
+ # Returns an estimation of the common variance of this and another sequence.
314
+ #
315
+ # @param other [Sequence] The other sequence to compare against
316
+ # @return [Float] The pooled variance estimate
225
317
  def common_variance(other)
226
318
  (size - 1) * sample_variance + (other.size - 1) *
227
319
  other.sample_variance / (size + other.size - 2)
228
320
  end
229
321
 
230
- # Compute the # degrees of freedom for Student's t-test.
322
+ # Computes the degrees of freedom for Student's t-test.
323
+ #
324
+ # @param other [Sequence] The other sequence to compare against
325
+ # @return [Integer] The degrees of freedom for Student's t-test
231
326
  def compute_student_df(other)
232
327
  size + other.size - 2
233
328
  end
234
329
 
235
- # Returns the t value of the Student's t-test between this Sequence
236
- # instance and the +other+.
330
+ # Returns the t value of the Student's t-test between this sequence and another.
331
+ #
332
+ # @param other [Sequence] The other sequence to compare against
333
+ # @return [Float] The t-statistic value
237
334
  def t_student(other)
238
335
  signal = arithmetic_mean - other.arithmetic_mean
239
336
  noise = common_standard_deviation(other) *
@@ -243,9 +340,12 @@ module MoreMath
243
340
  0.0
244
341
  end
245
342
 
246
- # Compute a sample size, that will more likely yield a mean difference
247
- # between this instance's elements and those of +other+. Use +alpha+
248
- # and +beta+ as levels for the first- and second-order errors.
343
+ # Computes the suggested sample size for detecting a mean difference.
344
+ #
345
+ # @param other [Sequence] The other sequence to compare against
346
+ # @param alpha [Float] The significance level (default: 0.05)
347
+ # @param beta [Float] The Type II error probability (default: 0.05)
348
+ # @return [Float] The suggested sample size
249
349
  def suggested_sample_size(other, alpha = 0.05, beta = 0.05)
250
350
  alpha, beta = alpha.abs, beta.abs
251
351
  signal = arithmetic_mean - other.arithmetic_mean
@@ -256,17 +356,21 @@ module MoreMath
256
356
  Math.sqrt(pooled_variance_estimate)) / signal) ** 2
257
357
  end
258
358
 
259
- # Return true, if the Sequence instance covers the +other+, that is their
260
- # arithmetic mean value is most likely to be equal for the +alpha+ error
261
- # level.
359
+ # Determines if this sequence covers another sequence at the given alpha level.
360
+ #
361
+ # @param other [Sequence] The other sequence to compare against
362
+ # @param alpha [Float] The significance level (default: 0.05)
363
+ # @return [Boolean] true if Welch's t-test finds no significant difference between the two arithmetic means at the given alpha level
262
364
  def cover?(other, alpha = 0.05)
263
365
  t = t_welch(other)
264
366
  td = TDistribution.new(compute_welch_df(other))
265
367
  t.abs < td.inverse_probability(1 - alpha.abs / 2.0)
266
368
  end
267
369
 
268
- # Return the confidence interval for the arithmetic mean with alpha level +alpha+ of
269
- # the elements of this Sequence instance as a Range object.
370
+ # Returns the confidence interval for the arithmetic mean.
371
+ #
372
+ # @param alpha [Float] The significance level (default: 0.05)
373
+ # @return [Range] The confidence interval as a range object
270
374
  def confidence_interval(alpha = 0.05)
271
375
  td = TDistribution.new(size - 1)
272
376
  t = td.inverse_probability(alpha / 2).abs
@@ -274,7 +378,9 @@ module MoreMath
274
378
  (arithmetic_mean - delta)..(arithmetic_mean + delta)
275
379
  end
276
380
 
277
- # Returns the array of autovariances (of length size - 1).
381
+ # Returns the array of autovariances.
382
+ #
383
+ # @return [Array<Float>] Array of autovariance values
278
384
  def autovariance
279
385
  Array.new(size - 1) do |k|
280
386
  s = 0.0
@@ -285,15 +391,17 @@ module MoreMath
285
391
  end
286
392
  end
287
393
 
288
- # Returns the array of autocorrelation values c_k / c_0 (of length size -
289
- # 1).
394
+ # Returns the array of autocorrelation values.
395
+ #
396
+ # @return [Array<Float>] Array of autocorrelation values (normalized by first variance)
290
397
  def autocorrelation
291
398
  c = autovariance
292
399
  Array.new(c.size) { |k| c[k] / c[0] }
293
400
  end
294
401
 
295
- # Returns the d-value for the Durbin-Watson statistic. The value is d << 2
296
- # for positive, d >> 2 for negative and d around 2 for no autocorrelation.
402
+ # Returns the d-value for the Durbin-Watson statistic.
403
+ #
404
+ # @return [Float] The Durbin-Watson statistic value (close to 2 indicates no autocorrelation)
297
405
  def durbin_watson_statistic
298
406
  e = linear_regression.residuals
299
407
  e.size <= 1 and return 2.0
@@ -301,10 +409,10 @@ module MoreMath
301
409
  e.inject(0.0) { |s, x| s + x ** 2 }
302
410
  end
303
411
 
304
- # Returns the q value of the Ljung-Box statistic for the number of lags
305
- # +lags+. A higher value might indicate autocorrelation in the elements of
306
- # this Sequence instance. This method returns nil if there weren't enough
307
- # (at least lags) lags available.
412
+ # Returns the q value of the Ljung-Box statistic.
413
+ #
414
+ # @param lags [Integer] The number of lags to consider (default: 20)
415
+ # @return [Float, nil] The Ljung-Box statistic value or nil if insufficient data
308
416
  def ljung_box_statistic(lags = 20)
309
417
  r = autocorrelation
310
418
  lags >= r.size and return
@@ -312,14 +420,11 @@ module MoreMath
312
420
  n * (n + 2) * (1..lags).inject(0.0) { |s, i| s + r[i] ** 2 / (n - i) }
313
421
  end
314
422
 
315
- # This method tries to detect autocorrelation with the Ljung-Box
316
- # statistic. If enough lags can be considered it returns a hash with
317
- # results, otherwise nil is returned. The keys are
318
- # :lags:: the number of lags,
319
- # :alpha_level:: the alpha level for the test,
320
- # :q:: the value of the ljung_box_statistic,
321
- # :p:: the p-value computed, if p is higher than alpha no correlation was detected,
322
- # :detected:: true if a correlation was found.
423
+ # Detects autocorrelation using the Ljung-Box statistic.
424
+ #
425
+ # @param lags [Integer] The number of lags to consider (default: 20)
426
+ # @param alpha_level [Float] The significance level (default: 0.05)
427
+ # @return [Hash, nil] Results hash or nil if insufficient data
323
428
  def detect_autocorrelation(lags = 20, alpha_level = 0.05)
324
429
  if q = ljung_box_statistic(lags)
325
430
  p = ChiSquareDistribution.new(lags).probability(q)
@@ -334,16 +439,19 @@ module MoreMath
334
439
  end
335
440
 
336
441
  # Returns the interquartile range for this sequence.
442
+ #
443
+ # @return [Float] The difference between 75th and 25th percentiles
337
444
  def interquartile_range
338
445
  quartile1 = percentile(25)
339
446
  quartile3 = percentile(75)
340
447
  quartile3 - quartile1
341
448
  end
342
449
 
343
- # Return a result hash with the number of :very_low, :low, :high, and
344
- # :very_high outliers, determined by the box plotting algorithm run with
345
- # :median and :iqr parameters. If no outliers were found or the iqr is
346
- # less than epsilon, nil is returned.
450
+ # Detects outliers using the boxplot algorithm.
451
+ #
452
+ # @param factor [Float] The multiplier for IQR to define outlier boundaries (default: 3.0)
453
+ # @param epsilon [Float] Small value for numerical stability (default: 1E-5)
454
+ # @return [Hash, nil] Outlier statistics or nil if no outliers or insufficient data
347
455
  def detect_outliers(factor = 3.0, epsilon = 1E-5)
348
456
  half_factor = factor / 2.0
349
457
  quartile1 = percentile(25)
@@ -372,15 +480,18 @@ module MoreMath
372
480
  end
373
481
  end
374
482
 
375
- # Returns the LinearRegression object for the equation a * x + b which
376
- # represents the line computed by the linear regression algorithm.
483
+ # Returns the LinearRegression object for this sequence.
484
+ #
485
+ # @return [LinearRegression] The linear regression model for this data
377
486
  memoize method:
378
487
  def linear_regression
379
488
  LinearRegression.new @elements
380
489
  end
381
490
 
382
- # Returns a Histogram instance with +bins+ as the number of bins for this
383
- # analysis' elements.
491
+ # Creates a Histogram instance from this sequence.
492
+ #
493
+ # @param bins [Integer] The number of bins for the histogram
494
+ # @return [Histogram] A new Histogram instance
384
495
  def histogram(bins)
385
496
  Histogram.new(self, bins)
386
497
  end