more_math 1.5.0 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGES.md +28 -1
- data/README.md +25 -54
- data/Rakefile +8 -2
- data/lib/more_math/cantor_pairing_function.rb +59 -0
- data/lib/more_math/constants/functions_constants.rb +37 -0
- data/lib/more_math/continued_fraction.rb +170 -60
- data/lib/more_math/distributions.rb +98 -9
- data/lib/more_math/entropy.rb +74 -2
- data/lib/more_math/exceptions.rb +26 -0
- data/lib/more_math/functions.rb +140 -4
- data/lib/more_math/histogram.rb +86 -3
- data/lib/more_math/linear_regression.rb +108 -7
- data/lib/more_math/newton_bisection.rb +71 -8
- data/lib/more_math/numberify_string_function.rb +96 -20
- data/lib/more_math/permutation.rb +132 -27
- data/lib/more_math/ranking_common.rb +38 -10
- data/lib/more_math/sequence/moving_average.rb +27 -0
- data/lib/more_math/sequence/refinement.rb +26 -0
- data/lib/more_math/sequence.rb +177 -66
- data/lib/more_math/string_numeral.rb +172 -4
- data/lib/more_math/subset.rb +49 -5
- data/lib/more_math/version.rb +1 -1
- data/lib/more_math.rb +1 -0
- data/more_math.gemspec +4 -3
- metadata +17 -3
data/lib/more_math/sequence.rb
CHANGED
@@ -2,111 +2,179 @@ require 'more_math/sequence/moving_average'
|
|
2
2
|
require 'more_math/sequence/refinement'
|
3
3
|
|
4
4
|
module MoreMath
|
5
|
-
#
|
6
|
-
#
|
5
|
+
# A sequence class for statistical analysis and mathematical operations.
|
6
|
+
#
|
7
|
+
# This class provides comprehensive statistical functionality including:
|
8
|
+
# - Basic sequence operations (iteration, size, etc.)
|
9
|
+
# - Statistical measures (mean, variance, standard deviation)
|
10
|
+
# - Advanced statistical methods (percentiles, confidence intervals)
|
11
|
+
# - Time series analysis (moving averages, autocorrelation)
|
12
|
+
# - Hypothesis testing (t-tests, confidence intervals)
|
13
|
+
# - Data visualization tools (histograms)
|
14
|
+
#
|
15
|
+
# @example Basic usage
|
16
|
+
# sequence = Sequence.new([1, 2, 3, 4, 5])
|
17
|
+
# puts sequence.mean # => 3.0
|
18
|
+
# puts sequence.variance # => 2.0
|
19
|
+
# sequence.simple_moving_average(3) # => [2.0, 3.0, 4.0]
|
20
|
+
#
|
21
|
+
# @example Statistical analysis
|
22
|
+
# data = Sequence.new([10, 15, 20, 25, 30])
|
23
|
+
# puts data.percentile(90) # => 28.0
|
24
|
+
# puts data.confidence_interval(0.05) # => 17.0..23.0
|
7
25
|
class Sequence
|
8
26
|
include MoreMath::Sequence::MovingAverage
|
9
27
|
|
28
|
+
# Initializes a new Sequence instance with the given elements.
|
29
|
+
#
|
30
|
+
# @param elements [Array] The array of elements to store in this sequence
|
10
31
|
def initialize(elements)
|
11
32
|
@elements = elements.dup.freeze
|
12
33
|
end
|
13
34
|
|
14
35
|
# Returns the array of elements.
|
36
|
+
#
|
37
|
+
# @return [Array] The frozen array of elements in this sequence
|
15
38
|
attr_reader :elements
|
16
39
|
|
17
|
-
# Calls the
|
40
|
+
# Calls the block for every element of this Sequence.
|
41
|
+
#
|
42
|
+
# @yield [element] Yields each element to the block
|
43
|
+
# @yieldparam element [Object] Each element in the sequence
|
44
|
+
# @return [self] Returns self to allow method chaining
|
18
45
|
def each(&block)
  # Without a block, hand back the enumerator of the underlying array so
  # that external iteration (e.g. seq.each.with_index) keeps working.
  return @elements.each unless block

  @elements.each(&block)
  # Return the receiver, as documented and as Enumerable-style #each
  # implementations conventionally do, instead of leaking the internal
  # frozen elements array.
  self
end
|
21
48
|
include Enumerable
|
22
49
|
|
23
50
|
# Returns true if this sequence is empty, otherwise false.
|
51
|
+
#
|
52
|
+
# @return [Boolean] true if sequence has no elements, false otherwise
|
24
53
|
def empty?
|
25
54
|
@elements.empty?
|
26
55
|
end
|
27
56
|
|
28
|
-
# Returns the number of elements
|
57
|
+
# Returns the number of elements in this sequence.
|
58
|
+
#
|
59
|
+
# @return [Integer] The count of elements in the sequence
|
29
60
|
def size
|
30
61
|
@elements.size
|
31
62
|
end
|
32
63
|
|
33
64
|
# Reset all memoized values of this sequence.
|
65
|
+
#
|
66
|
+
# @return [self] Returns self after clearing memoization cache
|
34
67
|
def reset
|
35
68
|
self.class.mize_cache_clear
|
36
69
|
self
|
37
70
|
end
|
38
71
|
|
72
|
+
# Converts the sequence to an array.
|
73
|
+
#
|
74
|
+
# @return [Array] A duplicate of the internal elements array
|
39
75
|
def to_ary
|
40
76
|
@elements.dup
|
41
77
|
end
|
42
78
|
|
43
79
|
alias to_a to_ary
|
44
80
|
|
45
|
-
#
|
46
|
-
#
|
81
|
+
# Pushes an element onto this Sequence and returns a new Sequence instance.
|
82
|
+
#
|
83
|
+
# @param element [Object] The element to add to the sequence
|
84
|
+
# @return [Sequence] A new Sequence instance with the element added
|
47
85
|
def push(element)
|
48
86
|
Sequence.new(@elements.dup.push(element))
|
49
87
|
end
|
88
|
+
|
50
89
|
alias << push
|
51
90
|
|
52
91
|
# Returns the variance of the elements.
|
92
|
+
#
|
93
|
+
# Variance measures how far each number in the set is from the mean.
|
94
|
+
#
|
95
|
+
# @return [Float] The population variance of the elements
|
96
|
+
# @note Uses the formula: Σ(xi - μ)² / n
|
53
97
|
memoize method:
|
54
98
|
def variance
|
55
99
|
sum_of_squares / size
|
56
100
|
end
|
57
101
|
|
58
|
-
# Returns the
|
102
|
+
# Returns the sample variance of the elements.
|
103
|
+
#
|
104
|
+
# Sample variance is used when the data represents a sample rather than a population.
|
105
|
+
#
|
106
|
+
# @return [Float] The sample variance of the elements, or 0.0 if the sequence has fewer than two elements
|
107
|
+
# @note Uses the formula: Σ(xi - μ)² / (n-1)
|
59
108
|
memoize method:
|
60
109
|
def sample_variance
|
61
110
|
size > 1 ? sum_of_squares / (size - 1.0) : 0.0
|
62
111
|
end
|
63
112
|
|
64
|
-
# Returns the sum of squares
|
65
|
-
#
|
113
|
+
# Returns the sum of squares of the elements.
|
114
|
+
#
|
115
|
+
# Sum of squares is used in variance and standard deviation calculations.
|
116
|
+
#
|
117
|
+
# @return [Float] The sum of squared deviations from the mean
|
66
118
|
memoize method:
|
67
119
|
def sum_of_squares
|
68
120
|
@elements.inject(0.0) { |s, t| s + (t - arithmetic_mean) ** 2 }
|
69
121
|
end
|
70
122
|
|
71
123
|
# Returns the standard deviation of the elements.
|
124
|
+
#
|
125
|
+
# Standard deviation measures the amount of variation or dispersion in a set of values.
|
126
|
+
#
|
127
|
+
# @return [Float] The population standard deviation
|
72
128
|
memoize method:
|
73
129
|
def standard_deviation
|
74
130
|
Math.sqrt(variance)
|
75
131
|
end
|
76
132
|
|
77
133
|
# Returns the Z-score sequence derived from the current sequence.
|
134
|
+
#
|
135
|
+
# Z-scores standardize data by transforming it to have a mean of 0 and standard deviation of 1.
|
136
|
+
#
|
137
|
+
# @return [Sequence] A new Sequence with z-score values
|
78
138
|
memoize method:
|
79
139
|
def z_score
  # Standardize every element as (x - mean) / standard_deviation. The
  # subtraction has to be parenthesized: without it Ruby evaluates
  # mean / standard_deviation first and merely shifts the values by a
  # constant instead of centering and scaling them.
  mu = mean
  sigma = standard_deviation
  # A new instance of the receiver's class is returned; note that a zero
  # standard deviation yields NaN/Infinity entries (float division).
  self.class.new(elements.map { |t| (t.to_f - mu) / sigma })
end
|
82
142
|
|
83
|
-
# Returns the standard deviation
|
84
|
-
#
|
143
|
+
# Returns the standard deviation as a percentage of the arithmetic mean.
|
144
|
+
#
|
145
|
+
# @return [Float] Standard deviation expressed as a percentage of the mean
|
85
146
|
memoize method:
|
86
147
|
def standard_deviation_percentage
|
87
148
|
100.0 * standard_deviation / arithmetic_mean
|
88
149
|
end
|
89
150
|
|
90
151
|
# Returns the sample standard deviation of the elements.
|
152
|
+
#
|
153
|
+
# @return [Float] The sample standard deviation
|
91
154
|
memoize method:
|
92
155
|
def sample_standard_deviation
|
93
156
|
Math.sqrt(sample_variance)
|
94
157
|
end
|
95
158
|
|
96
|
-
# Returns the sample standard deviation of the
|
97
|
-
#
|
159
|
+
# Returns the sample standard deviation as a percentage of the arithmetic mean.
|
160
|
+
#
|
161
|
+
# @return [Float] Sample standard deviation expressed as a percentage of the mean
|
98
162
|
memoize method:
|
99
163
|
def sample_standard_deviation_percentage
|
100
164
|
100.0 * sample_standard_deviation / arithmetic_mean
|
101
165
|
end
|
102
166
|
|
103
167
|
# Returns the sum of all elements.
|
168
|
+
#
|
169
|
+
# @return [Float] The sum of all elements in the sequence
|
104
170
|
memoize method:
|
105
171
|
def sum
|
106
172
|
@elements.inject(0.0) { |s, t| s + t }
|
107
173
|
end
|
108
174
|
|
109
175
|
# Returns the arithmetic mean of the elements.
|
176
|
+
#
|
177
|
+
# @return [Float] The arithmetic mean (average) of the elements
|
110
178
|
memoize method:
|
111
179
|
def arithmetic_mean
|
112
180
|
sum / size
|
@@ -114,8 +182,11 @@ module MoreMath
|
|
114
182
|
|
115
183
|
alias mean arithmetic_mean
|
116
184
|
|
117
|
-
# Returns the harmonic mean of the elements.
|
118
|
-
#
|
185
|
+
# Returns the harmonic mean of the elements.
|
186
|
+
#
|
187
|
+
# The harmonic mean is useful for rates and ratios. Returns NaN if any element is <= 0.
|
188
|
+
#
|
189
|
+
# @return [Float] The harmonic mean, or NaN if invalid input
|
119
190
|
memoize method:
|
120
191
|
def harmonic_mean
|
121
192
|
sum = @elements.inject(0.0) { |s, t|
|
@@ -128,8 +199,12 @@ module MoreMath
|
|
128
199
|
sum ? size / sum : 0 / 0.0
|
129
200
|
end
|
130
201
|
|
131
|
-
# Returns the geometric mean of the elements.
|
132
|
-
#
|
202
|
+
# Returns the geometric mean of the elements.
|
203
|
+
#
|
204
|
+
# The geometric mean is useful for sets of positive numbers that are to be multiplied together.
|
205
|
+
# Returns NaN if any element is negative, 0 if any element is zero.
|
206
|
+
#
|
207
|
+
# @return [Float] The geometric mean, or NaN if invalid input
|
133
208
|
memoize method:
|
134
209
|
def geometric_mean
|
135
210
|
sum = @elements.inject(0.0) { |s, t|
|
@@ -153,27 +228,36 @@ module MoreMath
|
|
153
228
|
end
|
154
229
|
|
155
230
|
# Returns the minimum of the elements.
|
231
|
+
#
|
232
|
+
# @return [Object] The minimum element in the sequence
|
156
233
|
memoize method:
|
157
234
|
def min
|
158
235
|
@elements.min
|
159
236
|
end
|
160
237
|
|
161
238
|
# Returns the maximum of the elements.
|
239
|
+
#
|
240
|
+
# @return [Object] The maximum element in the sequence
|
162
241
|
memoize method:
|
163
242
|
def max
|
164
243
|
@elements.max
|
165
244
|
end
|
166
245
|
|
167
|
-
#
|
246
|
+
# Returns a sorted array of the elements.
|
247
|
+
#
|
248
|
+
# @return [Array] A new array containing elements sorted in ascending order
|
168
249
|
memoize method:
|
169
250
|
def sorted
|
170
251
|
@elements.sort
|
171
252
|
end
|
172
253
|
|
173
|
-
# Returns the
|
174
|
-
#
|
175
|
-
#
|
176
|
-
#
|
254
|
+
# Returns the p-percentile of the elements.
|
255
|
+
#
|
256
|
+
# Uses weighted average at x_(n + 1)p for interpolation between percentiles.
|
257
|
+
#
|
258
|
+
# @param p [Integer, Float] The percentile to calculate, with 0 <= p < 100 (default: 50)
|
259
|
+
# @return [Float] The p-th percentile value
|
260
|
+
# @raise [ArgumentError] If p is not in the range (0...100)
|
177
261
|
def percentile(p = 50)
|
178
262
|
(0...100).include?(p) or
|
179
263
|
raise ArgumentError, "p = #{p}, but has to be in (0...100)"
|
@@ -195,16 +279,20 @@ module MoreMath
|
|
195
279
|
|
196
280
|
alias median percentile
|
197
281
|
|
198
|
-
#
|
199
|
-
#
|
282
|
+
# Computes the degrees of freedom for Welch's t-test.
|
283
|
+
#
|
284
|
+
# @param other [Sequence] The other sequence to compare against
|
285
|
+
# @return [Float] The degrees of freedom for Welch's t-test
|
200
286
|
def compute_welch_df(other)
|
201
287
|
(sample_variance / size + other.sample_variance / other.size) ** 2 / (
|
202
288
|
(sample_variance ** 2 / (size ** 2 * (size - 1))) +
|
203
289
|
(other.sample_variance ** 2 / (other.size ** 2 * (other.size - 1))))
|
204
290
|
end
|
205
291
|
|
206
|
-
# Returns the t value of the Welch's t-test between this
|
207
|
-
#
|
292
|
+
# Returns the t value of the Welch's t-test between this sequence and another.
|
293
|
+
#
|
294
|
+
# @param other [Sequence] The other sequence to compare against
|
295
|
+
# @return [Float] The t-statistic value
|
208
296
|
def t_welch(other)
|
209
297
|
signal = arithmetic_mean - other.arithmetic_mean
|
210
298
|
noise = Math.sqrt(sample_variance / size +
|
@@ -214,26 +302,35 @@ module MoreMath
|
|
214
302
|
0.0
|
215
303
|
end
|
216
304
|
|
217
|
-
# Returns an estimation of the common standard deviation of
|
218
|
-
#
|
305
|
+
# Returns an estimation of the common standard deviation of this and another sequence.
|
306
|
+
#
|
307
|
+
# @param other [Sequence] The other sequence to compare against
|
308
|
+
# @return [Float] The pooled standard deviation estimate
|
219
309
|
def common_standard_deviation(other)
|
220
310
|
Math.sqrt(common_variance(other))
|
221
311
|
end
|
222
312
|
|
223
|
-
# Returns an estimation of the common variance of
|
224
|
-
#
|
313
|
+
# Returns an estimation of the common variance of this and another sequence.
|
314
|
+
#
|
315
|
+
# @param other [Sequence] The other sequence to compare against
|
316
|
+
# @return [Float] The pooled variance estimate
|
225
317
|
def common_variance(other)
  # Pooled variance estimate:
  #   ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)
  # The whole weighted sum has to be divided by the pooled degrees of
  # freedom; without the grouping only the second term is divided.
  weighted_sum = (size - 1) * sample_variance +
    (other.size - 1) * other.sample_variance
  # other must respond to #size and #sample_variance. If both sequences
  # have a single element the divisor is 0 and the Float result is NaN.
  weighted_sum / (size + other.size - 2)
end
|
229
321
|
|
230
|
-
#
|
322
|
+
# Computes the degrees of freedom for Student's t-test.
|
323
|
+
#
|
324
|
+
# @param other [Sequence] The other sequence to compare against
|
325
|
+
# @return [Integer] The degrees of freedom for Student's t-test
|
231
326
|
def compute_student_df(other)
|
232
327
|
size + other.size - 2
|
233
328
|
end
|
234
329
|
|
235
|
-
# Returns the t value of the Student's t-test between this
|
236
|
-
#
|
330
|
+
# Returns the t value of the Student's t-test between this sequence and another.
|
331
|
+
#
|
332
|
+
# @param other [Sequence] The other sequence to compare against
|
333
|
+
# @return [Float] The t-statistic value
|
237
334
|
def t_student(other)
|
238
335
|
signal = arithmetic_mean - other.arithmetic_mean
|
239
336
|
noise = common_standard_deviation(other) *
|
@@ -243,9 +340,12 @@ module MoreMath
|
|
243
340
|
0.0
|
244
341
|
end
|
245
342
|
|
246
|
-
#
|
247
|
-
#
|
248
|
-
#
|
343
|
+
# Computes the suggested sample size for detecting a mean difference.
|
344
|
+
#
|
345
|
+
# @param other [Sequence] The other sequence to compare against
|
346
|
+
# @param alpha [Float] The significance level (default: 0.05)
|
347
|
+
# @param beta [Float] The Type II error probability (default: 0.05)
|
348
|
+
# @return [Float] The suggested sample size
|
249
349
|
def suggested_sample_size(other, alpha = 0.05, beta = 0.05)
|
250
350
|
alpha, beta = alpha.abs, beta.abs
|
251
351
|
signal = arithmetic_mean - other.arithmetic_mean
|
@@ -256,17 +356,21 @@ module MoreMath
|
|
256
356
|
Math.sqrt(pooled_variance_estimate)) / signal) ** 2
|
257
357
|
end
|
258
358
|
|
259
|
-
#
|
260
|
-
#
|
261
|
-
#
|
359
|
+
# Determines if this sequence covers another sequence at the given alpha level.
|
360
|
+
#
|
361
|
+
# @param other [Sequence] The other sequence to compare against
|
362
|
+
# @param alpha [Float] The significance level (default: 0.05)
|
363
|
+
# @return [Boolean] true if Welch's t-test finds no significant difference between the two means at level alpha
|
262
364
|
def cover?(other, alpha = 0.05)
|
263
365
|
t = t_welch(other)
|
264
366
|
td = TDistribution.new(compute_welch_df(other))
|
265
367
|
t.abs < td.inverse_probability(1 - alpha.abs / 2.0)
|
266
368
|
end
|
267
369
|
|
268
|
-
#
|
269
|
-
#
|
370
|
+
# Returns the confidence interval for the arithmetic mean.
|
371
|
+
#
|
372
|
+
# @param alpha [Float] The significance level (default: 0.05)
|
373
|
+
# @return [Range] The confidence interval as a range object
|
270
374
|
def confidence_interval(alpha = 0.05)
|
271
375
|
td = TDistribution.new(size - 1)
|
272
376
|
t = td.inverse_probability(alpha / 2).abs
|
@@ -274,7 +378,9 @@ module MoreMath
|
|
274
378
|
(arithmetic_mean - delta)..(arithmetic_mean + delta)
|
275
379
|
end
|
276
380
|
|
277
|
-
# Returns the array of autovariances
|
381
|
+
# Returns the array of autovariances.
|
382
|
+
#
|
383
|
+
# @return [Array<Float>] Array of autovariance values
|
278
384
|
def autovariance
|
279
385
|
Array.new(size - 1) do |k|
|
280
386
|
s = 0.0
|
@@ -285,15 +391,17 @@ module MoreMath
|
|
285
391
|
end
|
286
392
|
end
|
287
393
|
|
288
|
-
# Returns the array of autocorrelation values
|
289
|
-
#
|
394
|
+
# Returns the array of autocorrelation values.
|
395
|
+
#
|
396
|
+
# @return [Array<Float>] Array of autocorrelation values (normalized by first variance)
|
290
397
|
def autocorrelation
|
291
398
|
c = autovariance
|
292
399
|
Array.new(c.size) { |k| c[k] / c[0] }
|
293
400
|
end
|
294
401
|
|
295
|
-
# Returns the d-value for the Durbin-Watson statistic.
|
296
|
-
#
|
402
|
+
# Returns the d-value for the Durbin-Watson statistic.
|
403
|
+
#
|
404
|
+
# @return [Float] The Durbin-Watson statistic value (close to 2 indicates no autocorrelation)
|
297
405
|
def durbin_watson_statistic
|
298
406
|
e = linear_regression.residuals
|
299
407
|
e.size <= 1 and return 2.0
|
@@ -301,10 +409,10 @@ module MoreMath
|
|
301
409
|
e.inject(0.0) { |s, x| s + x ** 2 }
|
302
410
|
end
|
303
411
|
|
304
|
-
# Returns the q value of the Ljung-Box statistic
|
305
|
-
#
|
306
|
-
#
|
307
|
-
#
|
412
|
+
# Returns the q value of the Ljung-Box statistic.
|
413
|
+
#
|
414
|
+
# @param lags [Integer] The number of lags to consider (default: 20)
|
415
|
+
# @return [Float, nil] The Ljung-Box statistic value or nil if insufficient data
|
308
416
|
def ljung_box_statistic(lags = 20)
|
309
417
|
r = autocorrelation
|
310
418
|
lags >= r.size and return
|
@@ -312,14 +420,11 @@ module MoreMath
|
|
312
420
|
n * (n + 2) * (1..lags).inject(0.0) { |s, i| s + r[i] ** 2 / (n - i) }
|
313
421
|
end
|
314
422
|
|
315
|
-
#
|
316
|
-
#
|
317
|
-
#
|
318
|
-
#
|
319
|
-
#
|
320
|
-
# :q:: the value of the ljung_box_statistic,
|
321
|
-
# :p:: the p-value computed, if p is higher than alpha no correlation was detected,
|
322
|
-
# :detected:: true if a correlation was found.
|
423
|
+
# Detects autocorrelation using the Ljung-Box statistic.
|
424
|
+
#
|
425
|
+
# @param lags [Integer] The number of lags to consider (default: 20)
|
426
|
+
# @param alpha_level [Float] The significance level (default: 0.05)
|
427
|
+
# @return [Hash, nil] Results hash or nil if insufficient data
|
323
428
|
def detect_autocorrelation(lags = 20, alpha_level = 0.05)
|
324
429
|
if q = ljung_box_statistic(lags)
|
325
430
|
p = ChiSquareDistribution.new(lags).probability(q)
|
@@ -334,16 +439,19 @@ module MoreMath
|
|
334
439
|
end
|
335
440
|
|
336
441
|
# Returns the interquartile range for this sequence.
|
442
|
+
#
|
443
|
+
# @return [Float] The difference between 75th and 25th percentiles
|
337
444
|
def interquartile_range
|
338
445
|
quartile1 = percentile(25)
|
339
446
|
quartile3 = percentile(75)
|
340
447
|
quartile3 - quartile1
|
341
448
|
end
|
342
449
|
|
343
|
-
#
|
344
|
-
#
|
345
|
-
#
|
346
|
-
#
|
450
|
+
# Detects outliers using the boxplot algorithm.
|
451
|
+
#
|
452
|
+
# @param factor [Float] The multiplier for IQR to define outlier boundaries (default: 3.0)
|
453
|
+
# @param epsilon [Float] Small value for numerical stability (default: 1E-5)
|
454
|
+
# @return [Hash, nil] Outlier statistics or nil if no outliers or insufficient data
|
347
455
|
def detect_outliers(factor = 3.0, epsilon = 1E-5)
|
348
456
|
half_factor = factor / 2.0
|
349
457
|
quartile1 = percentile(25)
|
@@ -372,15 +480,18 @@ module MoreMath
|
|
372
480
|
end
|
373
481
|
end
|
374
482
|
|
375
|
-
# Returns the LinearRegression object for
|
376
|
-
#
|
483
|
+
# Returns the LinearRegression object for this sequence.
|
484
|
+
#
|
485
|
+
# @return [LinearRegression] The linear regression model for this data
|
377
486
|
memoize method:
|
378
487
|
def linear_regression
|
379
488
|
LinearRegression.new @elements
|
380
489
|
end
|
381
490
|
|
382
|
-
#
|
383
|
-
#
|
491
|
+
# Creates a Histogram instance from this sequence.
|
492
|
+
#
|
493
|
+
# @param bins [Integer] The number of bins for the histogram
|
494
|
+
# @return [Histogram] A new Histogram instance
|
384
495
|
def histogram(bins)
|
385
496
|
Histogram.new(self, bins)
|
386
497
|
end
|