davidrichards-just_enumerable_stats 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +147 -0
- data/VERSION.yml +4 -0
- data/bin/jes +27 -0
- data/lib/fixed_range.rb +46 -0
- data/lib/just_enumerable_stats/stats.rb +503 -0
- data/lib/just_enumerable_stats.rb +498 -0
- data/spec/fixed_range_spec.rb +77 -0
- data/spec/just_enumerable_stats/stats_spec.rb +459 -0
- data/spec/just_enumerable_stats_spec.rb +449 -0
- data/spec/spec_helper.rb +8 -0
- metadata +65 -0
@@ -0,0 +1,498 @@
|
|
1
|
+
# Borrowed this from my own gem, sirb
|
2
|
+
|
3
|
+
class Object

  # Returns a random number between +a+ and +b+. The bounds may be given in
  # either order.
  #
  # When both bounds are integers the result is an integer and both bounds
  # are inclusive. When either bound is a Float the call is handed to
  # rand_in_floats.
  def rand_between(a, b)
    return rand_in_floats(a, b) if a.is_a?(Float) || b.is_a?(Float)
    # +1 so that the larger bound can be drawn as well.
    span = (a - b).abs + 1
    rand(span) + [a, b].min
  end

  # Handles non-integers: scales Kernel#rand (a Float that is >= 0.0 and
  # < 1.0) by the distance between the bounds and offsets it by the smaller
  # bound, so the smaller bound is inclusive and the larger one is not.
  def rand_in_floats(a, b)
    span = (a - b).abs
    (rand * span) + [a, b].min
  end

end
|
19
|
+
|
20
|
+
# Statistical helpers mixed into every Enumerable.
#
# Many methods accept an optional block, and fall back to the receiver's
# default_block when no block is given. For the statistics (sum, average,
# variance, median, rank, cum_sum, ...) that block takes ONE element and
# returns the value to use in its place. For max and min it is a comparator
# taking TWO elements (see block_sorter).
#
# Several methods rely on +size+, and the set-like helpers rely on Array
# operators, so not every method works on every kind of enumerable.
#
# NOTE(review): a class that defines its own version of one of these methods
# (Array does for several of them on current Rubies, e.g. sum, max and min)
# keeps using its own version; the definitions here only apply where the
# method lookup reaches Enumerable.
module Enumerable

  # Keep handles on the core implementations before they are replaced below.
  # The guards stop a second load of this file from aliasing the replacement
  # methods onto themselves, which would make max(n) / min(n) recurse forever.
  alias_method :original_max, :max unless method_defined?(:original_max)
  alias_method :original_min, :min unless method_defined?(:original_min)

  # To keep max and min DRY. Compares +a+ with +b+ using, in order of
  # preference, the given block, the default block, or <=>.
  #
  # Whichever block is used is called with BOTH values and must return a
  # number in the manner of <=>. A default block written as a one-argument
  # lambda therefore raises ArgumentError here.
  def block_sorter(a, b, &block)
    return yield(a, b) if block
    return default_block.call(a, b) if default_block
    a <=> b
  end
  protected :block_sorter

  # Returns the max, using an optional comparator block (or the default
  # block). On ties the later element wins. Returns nil for an empty list.
  #
  # When a count +n+ is given the call is passed straight to the core
  # implementation (original_max), which ignores the default block.
  def max(n = nil, &block)
    return original_max(n, &block) unless n.nil?
    inject do |best, e|
      block_sorter(best, e, &block) > 0 ? best : e
    end
  end

  # Returns the first index of the max value, or nil for an empty list.
  def max_index(&block)
    find_index(max(&block))
  end

  # Min of any number of items, using an optional comparator block (or the
  # default block). On ties the later element wins. Returns nil for an empty
  # list.
  #
  # When a count +n+ is given the call is passed straight to the core
  # implementation (original_min), which ignores the default block.
  def min(n = nil, &block)
    return original_min(n, &block) unless n.nil?
    inject do |best, e|
      block_sorter(best, e, &block) < 0 ? best : e
    end
  end

  # Returns the first index of the min value, or nil for an empty list.
  def min_index(&block)
    find_index(min(&block))
  end

  # The block called to filter the values in the object, or nil when none
  # has been set.
  def default_block
    @default_stat_block
  end

  # Allows me to setup a block for a series of operations. Example:
  #   a = [1,2,3]
  #   a.cum_sum # => [1, 3, 6]
  #   a.default_block = lambda{|e| e * 2}
  #   a.cum_sum # => [2, 6, 12]
  # The block is stored in an instance variable on the receiver.
  def default_block=(block)
    @default_stat_block = block
  end

  # Provides zero in the right class: 0.0 if any element is a Float,
  # otherwise 0.
  def zero
    any? { |e| e.is_a?(Float) } ? 0.0 : 0
  end
  protected :zero

  # Provides one in the right class: 1.0 if any element is a Float,
  # otherwise 1.
  def one
    any? { |e| e.is_a?(Float) } ? 1.0 : 1
  end
  protected :one

  # Divides for the statistics below. Two Integers that do not divide
  # evenly are divided as Floats so that, for example, the mean of [1, 2]
  # is 1.5 rather than 1. Every other case uses plain /, so exact integer
  # results stay Integers and an Integer zero denominator still raises
  # ZeroDivisionError.
  def stat_quotient(numerator, denominator)
    inexact = numerator.is_a?(Integer) && denominator.is_a?(Integer) &&
              denominator != 0 && numerator % denominator != 0
    inexact ? numerator / denominator.to_f : numerator / denominator
  end
  protected :stat_quotient

  # Adds up the list. Uses a block or default block if present: each
  # element is passed through it and the results are added.
  #
  # +init+ is the starting value. When omitted, the sum starts from zero in
  # the right class (see zero).
  def sum(init = nil)
    total = init.nil? ? zero : init
    if block_given?
      each { |i| total += yield(i) }
    elsif default_block
      each { |i| total += default_block[*i] }
    else
      each { |i| total += i }
    end
    total
  end

  # The arithmetic mean, uses a block or default block.
  # Needs +size+. An empty list of integers raises ZeroDivisionError.
  def average(&block)
    stat_quotient(sum(&block), size)
  end
  alias :mean :average
  alias :avg :average

  # The variance, uses a block or default block. This is the sample
  # variance: the squared differences from the mean are divided by
  # size - 1, so a single integer element raises ZeroDivisionError.
  def variance(&block)
    m = mean(&block)
    squared = lambda { |value| (m - value) ** 2 }
    sum_of_differences =
      if block_given?
        sum { |i| squared.call(yield(i)) }
      elsif default_block
        sum { |i| squared.call(default_block[*i]) }
      else
        sum { |i| squared.call(i) }
      end
    stat_quotient(sum_of_differences, size - 1)
  end
  alias :var :variance

  # The standard deviation. Uses a block or default block.
  def standard_deviation(&block)
    Math::sqrt(variance(&block))
  end
  alias :std :standard_deviation

  # The median of the values, after they have been passed through the
  # block or default block (if any) and sorted.
  #
  # With an odd number of values, or when the two middle values are equal,
  # that value is returned as is. Otherwise the result is
  # lower + (upper - lower) * ratio, so the default ratio of 0.5 gives the
  # midpoint. Returns nil for an empty list.
  def median(ratio=0.5, &block)
    sorted = new_sort(&block)
    mid1, mid2 = middle_two
    med1, med2 = sorted[mid1], sorted[mid2]
    return med1 if med1 == med2
    med1 + ((med2 - med1) * ratio)
  end

  # The positions of the two middle values in a sorted copy of the list.
  # Both are the same position when the size is odd. For an empty list this
  # is [-1, 0].
  def middle_two
    mid2 = size.div(2)
    mid1 = (size % 2 == 0) ? mid2 - 1 : mid2
    return mid1, mid2
  end
  protected :middle_two

  # The upper of the two middle positions.
  def median_position
    middle_two.last
  end
  protected :median_position

  # The elements from the start up to and including median_position, in
  # the receiver's own order (nothing is sorted here). The block is not
  # used.
  def first_half(&block)
    first(median_position + 1)
  end
  protected :first_half

  # The elements from the middle to the end, in the receiver's own order.
  # The block is not used.
  def second_half(&block)
    # Total crap, but it's the way R does things, and this will most likely
    # only be used to feed R some numbers to plot, if at all.
    start = size <= 5 ? median_position : median_position - 1
    drop(start)
  end
  protected :second_half

  # An iterative version of median: walks the sorted values only as far as
  # the upper middle position instead of indexing into them.
  def iterate_midway(ratio, &block)
    mid1, mid2 = middle_two
    med1 = med2 = nil
    new_sort(&block).each_with_index do |value, j|
      med1 = value if j == mid1
      med2 = value if j == mid2
      break if j >= mid2
    end
    return med1 if med1 == med2
    med1 + ((med2 - med1) * ratio)
  end
  protected :iterate_midway

  # Just an array of [min, max] to comply with R uses of the work. Use
  # range_as_range if you want a real Range.
  def range(&block)
    [min(&block), max(&block)]
  end

  # Useful for setting a real range class (FixedRange).
  def range_class=(klass)
    @range_class = klass
  end

  # When creating a range, what class will it be? Defaults to Range, but
  # other classes are sometimes useful.
  def range_class
    @range_class ||= Range
  end

  # Actually instantiates the range, instead of producing a min and max array.
  def range_as_range(&block)
    range_class.new(min(&block), max(&block))
  end

  # Returns a new sorted Array of the values. The block here is NOT a sort
  # block like {|x,y| x <=> y}: it takes one element and returns the value
  # to sort. Without a block the default block is used the same way, and
  # without either the elements themselves are sorted.
  def new_sort(&block)
    if block_given?
      map { |i| yield(i) }.sort
    elsif default_block
      map { |i| default_block[*i] }.sort
    else
      sort
    end
  end

  # The 1-based position of each value in the sorted values. Equal values
  # share the lowest position. Uses a block or default block.
  # Only defined when Enumerable does not already have a rank method.
  def rank(&block)

    sorted = new_sort(&block)

    if block_given?
      map { |i| sorted.index(yield(i)) + 1 }
    elsif default_block
      map { |i| sorted.index(default_block[*i]) + 1 }
    else
      map { |i| sorted.index(i) + 1 }
    end

  end unless method_defined?(:rank)

  # Given values like [10,5,5,1]
  # Rank should produce something like [4,2,2,1]
  # And order should produce something like [4,2,3,1]
  # The trick is that rank skips as many as were duplicated, so there
  # could not be a 3 in the rank from the example above.
  def order(&block)
    taken = {}
    rank(&block).map do |x|
      # Bump a repeated rank up to the next position nobody has claimed.
      x += 1 while taken.key?(x)
      taken[x] = true
      x
    end
  end

  # First quartile: nth_split_by_m(1, 4)
  # Third quartile: nth_split_by_m(3, 4)
  # Median: nth_split_by_m(1, 2)
  # Doesn't match R, and it's silly to try to.
  # def nth_split_by_m(n, m)
  #   sorted  = new_sort
  #   dividers = m - 1
  #   if size % m == dividers # Divides evenly
  #     # Because we have a 0-based list, we get the floor
  #     i = ((size / m.to_f) * n).floor
  #     j = i
  #   else
  #     # This reflects R's approach, which I don't think I agree with.
  #     i = (((size / m.to_f) * n) - 1)
  #     i = i > (size / m.to_f) ? i.floor : i.ceil
  #     j = i + 1
  #   end
  #   sorted[i] + ((n / m.to_f) * (sorted[j] - sorted[i]))
  # end

  # Returns [min, first quartile, median, third quartile, max] of the
  # values, after they have been passed through the block or default block
  # (if any) and sorted. The quartiles are medians of the two halves of the
  # sorted values, weighted 0.25 and 0.75. Every entry is nil for an empty
  # list.
  def quantile(&block)
    sorted = new_sort(&block)
    [
      sorted.first,
      sorted.first_half.median(0.25),
      sorted.median,
      sorted.second_half.median(0.75),
      sorted.last
    ]
  end

  # The cumulative sum. Example:
  #   [1,2,3].cum_sum # => [1, 3, 6]
  # Uses a block or default block. With +sorted+ set, the values come from
  # new_sort first.
  def cum_sum(sorted=false, &block)
    running = zero
    obj = sorted ? new_sort : self
    if block_given?
      obj.map { |i| running += yield(i) }
    elsif default_block && !sorted
      obj.map { |i| running += default_block[*i] }
    else
      # Either there is nothing to apply, or new_sort has already applied
      # the default block and it must not be applied a second time.
      obj.map { |i| running += i }
    end
  end
  alias :cumulative_sum :cum_sum

  # The cumulative product. Example:
  #   [1,2,3].cum_prod # => [1, 2, 6]
  # Uses a block or default block. With +sorted+ set, the values come from
  # new_sort first.
  def cum_prod(sorted=false, &block)
    running = one
    obj = sorted ? new_sort : self
    if block_given?
      obj.map { |i| running *= yield(i) }
    elsif default_block && !sorted
      obj.map { |i| running *= default_block[*i] }
    else
      # See cum_sum: new_sort has already applied the default block.
      obj.map { |i| running *= i }
    end
  end
  alias :cumulative_product :cum_prod

  # Used to preprocess the list: maps it through the block, or else the
  # default block. Returns self untouched when there is neither.
  def morph_list(&block)
    if block
      map { |e| block.call(e) }
    elsif default_block
      map { |e| default_block.call(e) }
    else
      self
    end
  end
  protected :morph_list

  # The largest value seen so far at each position. Example:
  #   [1,2,3,0,5].cum_max # => [1,2,3,3,5]
  def cum_max(&block)
    morph_list(&block).inject([]) do |list, e|
      # The last entry is already the max of everything before e.
      list << (list.empty? ? e : [list.last, e].max)
    end
  end
  alias :cumulative_max :cum_max

  # The smallest value seen so far at each position. Example:
  #   [1,2,3,0,5].cum_min # => [1,1,1,0,0]
  def cum_min(&block)
    morph_list(&block).inject([]) do |list, e|
      # The last entry is already the min of everything before e.
      list << (list.empty? ? e : [list.last, e].min)
    end
  end
  alias :cumulative_min :cum_min

  # Multiplies the values together, starting from one in the right class
  # (see one).
  def product
    inject(one) { |total, a| total * a }
  end

  # Walks self and other side by side, up to the shorter of the two, and
  # returns what the block gives for each pair. The block is required, and
  # both lists need +size+ and integer indexing.
  # There are going to be a lot more of these kinds of things, so pay
  # attention.
  def to_pairs(other, &block)
    n = [size, other.size].min
    (0...n).map { |i| block.call(self[i], other[i]) }
  end

  # Finds the tanimoto coefficient: the intersection set size / union set
  # size. This is used to find the distance between two vectors.
  # >> [1,2,3].cor([2,3,5])
  # => 0.981980506061966
  # >> [1,2,3].tanimoto_pairs([2,3,5])
  # => 0.5
  def tanimoto_pairs(other)
    intersect(other).size / union(other).size.to_f
  end
  alias :tanimoto_correlation :tanimoto_pairs

  # Sometimes it just helps to have things spelled out. These are all
  # part of the Array class. This means, you have methods that you can't
  # run on some kinds of enumerables.

  # All of the left and right hand sides, excluding duplicates.
  # "The union of x and y"
  def union(other)
    self | other
  end

  # What's shared on the left and right hand sides
  # "The intersection of x and y"
  def intersect(other)
    self & other
  end

  # Everything on the left hand side except what's shared on the right
  # hand side.
  # "The relative complement of y in x"
  def compliment(other)
    self - other
  end

  # Everything but what's shared
  def exclusive_not(other)
    (self | other) - (self & other)
  end

  # Finds the cartesian product, excluding duplicate items and self-
  # referential pairs. Yields each pair to the block, if given, and
  # returns the block values instead.
  def cartesian_product(other, &block)
    x, y = uniq, other.uniq
    # Both sides are already unique, so every pair built here is too.
    pairs = x.inject([]) do |cp, i|
      cp.concat(y.reject { |b| i == b }.map { |b| [i, b] })
    end
    return pairs unless block_given?
    pairs.map { |pair| yield pair.first, pair.last }
  end
  alias :cp :cartesian_product
  alias :permutations :cartesian_product

  # Sigma of pairs: adds up the block values of to_pairs, starting from z.
  # Returns a single float, or whatever object is sent in.
  # Example: [1,2,3].sigma_pairs([4,5,6], 0) {|x, y| x + y}
  # returns 21 instead of 21.0.
  def sigma_pairs(other, z=zero, &block)
    to_pairs(other, &block).inject(z) { |total, i| total + i }
  end

  # Returns the Euclidean distance between all points of a set of enumerables
  def euclidian_distance(other)
    Math.sqrt(sigma_pairs(other) { |a, b| (a - b) ** 2 })
  end

  # Returns a random integer in the range for any number of lists. This
  # is a way to get a random vector that is tenable based on the sample
  # data. For example, given two sets of numbers:
  #
  # a = [1,2,3]; b = [8,8,8]
  #
  # a.rand_in_range(b) will return a value >= 1 and <= 8 in the first
  # place, >= 2 and <= 8 in the second place, and >= 3 and <= 8 in the
  # last place.
  # The result is as long as the shortest list involved.
  # Works for integers. Rethink this for floats. May consider setting up
  # FixedRange for floats.
  def rand_in_range(*args)
    lows = min_of_lists(*args)
    highs = max_of_lists(*args)
    lows.zip(highs).map { |low, high| rand_between(low, high) }
  end

  # Finds the correlation between two enumerables.
  # Example: [1,2,3].cor [2,3,5]
  # returns 0.981980506061966
  # When the sizes differ, only the first n items of each side are used,
  # where n is the shorter size.
  def correlation(other)
    n = [size, other.size].min
    # Keep the sums on the same n pairs as the products. Lists that are
    # already n long are used as they are.
    xs = size == n ? self : first(n)
    ys = other.size == n ? other : other.first(n)

    sum_of_products_of_pairs = xs.sigma_pairs(ys) { |a, b| a * b }
    self_sum = xs.sum
    other_sum = ys.sum
    sum_of_squared_self_scores = xs.sum { |e| e * e }
    sum_of_squared_other_scores = ys.sum { |e| e * e }

    numerator = (n * sum_of_products_of_pairs) - (self_sum * other_sum)
    self_denominator = ((n * sum_of_squared_self_scores) - (self_sum ** 2))
    other_denominator = ((n * sum_of_squared_other_scores) - (other_sum ** 2))
    denominator = Math.sqrt(self_denominator * other_denominator)
    numerator / denominator
  end
  alias :cor :correlation

  # Transposes arrays of arrays and yields a block on the value.
  # The regular Array#transpose ignores blocks.
  # Self is the first list; the result is as long as the shortest list.
  # Without a block each transposed row is returned as is.
  def yield_transpose(*enums, &block)
    lists = [self] + enums
    n = lists.map { |x| x.size }.min
    block ||= lambda { |e| e }
    (0...n).map { |i| block.call(lists.map { |x| x[i] }) }
  end

  # Returns the max of two or more enumerables.
  # >> [1,2,3].max_of_lists([0,5,6], [0,2,9])
  # => [1, 5, 9]
  def max_of_lists(*enums)
    yield_transpose(*enums) { |e| e.max }
  end

  # Returns the min of two or more enumerables.
  # >> [1,2,3].min_of_lists([4,5,6], [0,2,9])
  # => [0, 2, 3]
  def min_of_lists(*enums)
    yield_transpose(*enums) { |e| e.min }
  end
end
|