davidrichards-just_enumerable_stats 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +147 -0
- data/VERSION.yml +4 -0
- data/bin/jes +27 -0
- data/lib/fixed_range.rb +46 -0
- data/lib/just_enumerable_stats/stats.rb +503 -0
- data/lib/just_enumerable_stats.rb +498 -0
- data/spec/fixed_range_spec.rb +77 -0
- data/spec/just_enumerable_stats/stats_spec.rb +459 -0
- data/spec/just_enumerable_stats_spec.rb +449 -0
- data/spec/spec_helper.rb +8 -0
- metadata +65 -0
@@ -0,0 +1,498 @@
|
|
1
|
+
# Borrowed this from my own gem, sirb
|
2
|
+
|
3
|
+
class Object

  # Simpler way to get a random number between two values. The arguments
  # may be given in either order.
  #
  # When either argument is a Float the work is handed to rand_in_floats.
  # Otherwise the result is an integer drawn from the closed interval
  # between the two arguments (both endpoints can be returned).
  def rand_between(a, b)
    return rand_in_floats(a, b) if a.is_a?(Float) || b.is_a?(Float)
    # +1 so that the larger endpoint is itself a possible result.
    span = (a - b).abs + 1
    rand(span) + [a, b].min
  end

  # Handles non-integers: scales rand (called without an argument) by the
  # distance between the two values and offsets it by the smaller one.
  def rand_in_floats(a, b)
    span = (a - b).abs
    (rand * span) + [a, b].min
  end

end
|
19
|
+
|
20
|
+
module Enumerable

  # Keep the built-in implementations reachable under new names before
  # max and min are redefined below.
  alias :original_max :max
  alias :original_min :min

  # To keep max and min DRY. Compares a and b with, in order of
  # preference: the block handed to the caller, the default block, or <=>.
  #
  # NOTE(review): here the default block is called with two arguments (as a
  # comparator), while every other method in this module calls it with a
  # single element (as a transform). Confirm which contract is intended.
  def block_sorter(a, b, &block)
    if block
      yield(a, b)
    elsif default_block
      default_block.call(a, b)
    else
      a <=> b
    end
  end
  protected :block_sorter

  # Returns the max, using an optional comparison block. When two items
  # compare as equal the later one is kept.
  def max(&block)
    inject do |best, e|
      block_sorter(best, e, &block) > 0 ? best : e
    end
  end

  # Returns the first index of the max value. Relies on the receiver
  # responding to index.
  def max_index(&block)
    index(max(&block))
  end

  # Returns the min, using an optional comparison block. When two items
  # compare as equal the later one is kept.
  def min(&block)
    inject do |best, e|
      block_sorter(best, e, &block) < 0 ? best : e
    end
  end

  # Returns the first index of the min value. Relies on the receiver
  # responding to index.
  def min_index(&block)
    index(min(&block))
  end

  # The block called to filter the values in the object (nil until set).
  def default_block
    @default_stat_block
  end

  # Allows me to setup a block for a series of operations. The block is
  # stored on the receiver and used by this module's methods whenever no
  # explicit block is passed. Example, using this module's sum:
  #   a.sum # => 6 (for the values 1, 2, 3)
  #   a.default_block = lambda{|e| e * 2}
  #   a.sum # => 12
  def default_block=(block)
    @default_stat_block = block
  end

  # Provides zero in the right class: 0.0 if any element is a Float,
  # otherwise 0.
  def zero
    any? { |e| e.is_a?(Float) } ? 0.0 : 0
  end
  protected :zero

  # Provides one in the right class: 1.0 if any element is a Float,
  # otherwise 1.
  def one
    any? { |e| e.is_a?(Float) } ? 1.0 : 1
  end
  protected :one

  # Adds up the list. Uses a block or default block if present to
  # transform each element before it is added.
  def sum
    total = zero
    if block_given?
      each { |i| total += yield(i) }
    elsif default_block
      each { |i| total += default_block[*i] }
    else
      each { |i| total += i }
    end
    total
  end

  # The arithmetic mean, uses a block or default block.
  # NOTE(review): when the sum is an Integer this is an integer division,
  # so the result is truncated.
  def average(&block)
    sum(&block) / size
  end
  alias :mean :average
  alias :avg :average

  # The variance, uses a block or default block. The sum of squared
  # differences from the mean is divided by size - 1.
  def variance(&block)
    m = mean(&block)
    sum_of_differences =
      if block_given?
        sum { |i| (m - yield(i)) ** 2 }
      elsif default_block
        sum { |i| (m - default_block[*i]) ** 2 }
      else
        sum { |i| (m - i) ** 2 }
      end
    sum_of_differences / (size - 1)
  end
  alias :var :variance

  # The standard deviation: the square root of the variance. Uses a block
  # or default block.
  def standard_deviation(&block)
    Math.sqrt(variance(&block))
  end
  alias :std :standard_deviation

  # The median of the sorted values. When the two middle values differ,
  # the result is the lower one plus ratio times the gap between them
  # (ratio defaults to 0.5, i.e. half way).
  #
  # With a block, each value is transformed by the block first and the
  # work is done by iterate_midway. Without one the middle positions are
  # read by index, falling back to iterate_midway if that raises.
  def median(ratio=0.5, &block)
    return iterate_midway(ratio, &block) if block_given?
    begin
      mid1, mid2 = middle_two
      sorted = new_sort
      med1, med2 = sorted[mid1], sorted[mid2]
      return med1 if med1 == med2
      return med1 + ((med2 - med1) * ratio)
    rescue
      iterate_midway(ratio, &block)
    end
  end

  # The zero-based positions of the two middle items: the same position
  # twice for an odd size, the two central positions for an even size.
  def middle_two
    mid2 = size.div(2)
    mid1 = (size % 2 == 0) ? mid2 - 1 : mid2
    return mid1, mid2
  end
  protected :middle_two

  # The upper of the two middle positions.
  def median_position
    middle_two.last
  end
  protected :median_position

  # The lower part of the sorted values, up to and including the median
  # position. The values are sorted first so that quantile also works on
  # unsorted input. The block parameter is accepted but not used.
  def first_half(&block)
    sort[0..median_position]
  end
  protected :first_half

  # The upper part of the sorted values. The block parameter is accepted
  # but not used.
  def second_half(&block)
    # Total crap, but it's the way R does things, and this will most likely
    # only be used to feed R some numbers to plot, if at all.
    start = size <= 5 ? median_position : median_position - 1
    sort[start..-1]
  end
  protected :second_half

  # An iterative version of median: walks the sorted (and, when a block or
  # default block is present, transformed) values until both middle
  # positions have been seen, then interpolates between them by ratio.
  def iterate_midway(ratio, &block)
    mid1, mid2 = middle_two
    med1 = med2 = nil

    new_sort(&block).each_with_index do |value, j|
      med1 = value if j == mid1
      med2 = value if j == mid2
      # Nothing past the upper middle position is needed.
      break if j >= mid2
    end

    return med1 if med1 == med2
    med1 + ((med2 - med1) * ratio)
  end
  protected :iterate_midway

  # Just an array of [min, max] to comply with R uses of the work. Use
  # range_as_range if you want a real Range.
  def range(&block)
    [min(&block), max(&block)]
  end

  # Useful for setting a real range class (FixedRange).
  def range_class=(klass)
    @range_class = klass
  end

  # When creating a range, what class will it be? Defaults to Range, but
  # other classes are sometimes useful.
  def range_class
    @range_class ||= Range
  end

  # Actually instantiates the range, instead of producing a min and max array.
  def range_as_range(&block)
    range_class.new(min(&block), max(&block))
  end

  # Returns a new sorted array. A block or default block, when present, is
  # used to transform each value before sorting.
  #
  # I don't pass the block to the sort, because a sort block needs to look
  # something like: {|x,y| x <=> y}. To get around this, set the default
  # block on the object.
  def new_sort(&block)
    if block_given?
      map { |i| yield(i) }.sort
    elsif default_block
      map { |i| default_block[*i] }.sort
    else
      sort
    end
  end

  # The one-based position of each value within the sorted values; equal
  # values share the position of their first occurrence.
  #
  # Only defined when the module has no rank instance method yet.
  unless method_defined?(:rank)
    def rank(&block)
      sorted = new_sort(&block)

      if block_given?
        map { |i| sorted.index(yield(i)) + 1 }
      elsif default_block
        map { |i| sorted.index(default_block[*i]) + 1 }
      else
        map { |i| sorted.index(i) + 1 }
      end
    end
  end

  # Given values like [10,5,5,1]
  # Rank should produce something like [4,2,2,1]
  # And order should produce something like [4,2,3,1]
  # The trick is that rank skips as many as were duplicated, so there
  # could not be a 3 in the rank from the example above.
  def order(&block)
    hold = []
    rank(&block).each do |x|
      # Bump a repeated rank to the next position not yet taken.
      x += 1 while hold.include?(x)
      hold << x
    end
    hold
  end

  # First quartile: nth_split_by_m(1, 4)
  # Third quartile: nth_split_by_m(3, 4)
  # Median: nth_split_by_m(1, 2)
  # Doesn't match R, and it's silly to try to.
  # def nth_split_by_m(n, m)
  #   sorted  = new_sort
  #   dividers = m - 1
  #   if size % m == dividers # Divides evenly
  #     # Because we have a 0-based list, we get the floor
  #     i = ((size / m.to_f) * n).floor
  #     j = i
  #   else
  #     # This reflects R's approach, which I don't think I agree with.
  #     i = (((size / m.to_f) * n) - 1)
  #     i = i > (size / m.to_f) ? i.floor : i.ceil
  #     j = i + 1
  #   end
  #   sorted[i] + ((n / m.to_f) * (sorted[j] - sorted[i]))
  # end

  # Five values: the min, the median of the first half taken at ratio
  # 0.25, the median, the median of the second half taken at ratio 0.75,
  # and the max.
  def quantile(&block)
    [
      min(&block),
      first_half(&block).median(0.25, &block),
      median(&block),
      second_half(&block).median(0.75, &block),
      max(&block)
    ]
  end

  # The cumulative sum. Example:
  # [1,2,3].cum_sum # => [1, 3, 6]
  #
  # Pass true to accumulate over the values in sorted order. Each value is
  # transformed by the block or default block, if present, before it is
  # added.
  def cum_sum(sorted=false, &block)
    total = zero
    # Sort the raw values here: the transform is applied exactly once below.
    obj = sorted ? sort : self
    if block_given?
      obj.map { |i| total += yield(i) }
    elsif default_block
      obj.map { |i| total += default_block[*i] }
    else
      obj.map { |i| total += i }
    end
  end
  alias :cumulative_sum :cum_sum

  # The cumulative product. Example:
  # [1,2,3].cum_prod # => [1, 2, 6]
  #
  # Pass true to accumulate over the values in sorted order. Each value is
  # transformed by the block or default block, if present, before it is
  # multiplied in.
  def cum_prod(sorted=false, &block)
    prod = one
    # Sort the raw values here: the transform is applied exactly once below.
    obj = sorted ? sort : self
    if block_given?
      obj.map { |i| prod *= yield(i) }
    elsif default_block
      obj.map { |i| prod *= default_block[*i] }
    else
      obj.map { |i| prod *= i }
    end
  end
  alias :cumulative_product :cum_prod

  # Used to preprocess the list: maps it through the block, or through the
  # default block, or returns the receiver itself when neither is present.
  def morph_list(&block)
    if block
      map { |e| block.call(e) }
    elsif default_block
      map { |e| default_block.call(e) }
    else
      self
    end
  end
  protected :morph_list

  # The running maximum. Example:
  # [1,2,3,0,5].cum_max # => [1,2,3,3,5]
  def cum_max(&block)
    morph_list(&block).inject([]) do |list, e|
      found = (list | [e]).max
      list << (found || e)
    end
  end
  alias :cumulative_max :cum_max

  # The running minimum. Example:
  # [1,2,3,0,5].cum_min # => [1,1,1,0,0]
  def cum_min(&block)
    morph_list(&block).inject([]) do |list, e|
      found = (list | [e]).min
      list << (found || e)
    end
  end
  alias :cumulative_min :cum_min

  # Multiplies the values together, starting from one (so the result is a
  # Float when any value is a Float). For the values 1, 2, 3 the result
  # is 6.
  def product
    inject(one) { |acc, a| acc * a }
  end

  # There are going to be a lot more of these kinds of things, so pay
  # attention.
  #
  # Walks self and other in step, up to the shorter of the two sizes, and
  # returns the block's result for each pair of items. Without a block the
  # pairs themselves are returned as two-element arrays.
  def to_pairs(other, &block)
    block ||= lambda { |a, b| [a, b] }
    n = [self.size, other.size].min
    (0...n).map { |i| block.call(self[i], other[i]) }
  end

  # Finds the tanimoto coefficient: the intersection set size / union set
  # size. This is used to find the distance between two vectors.
  # >> [1,2,3].tanimoto_pairs([2,3,5])
  # => 0.5
  def tanimoto_pairs(other)
    intersect(other).size / union(other).size.to_f
  end
  alias :tanimoto_correlation :tanimoto_pairs

  # Sometimes it just helps to have things spelled out. These are all
  # part of the Array class. This means, you have methods that you can't
  # run on some kinds of enumerables.

  # All of the left and right hand sides, excluding duplicates.
  # "The union of x and y"
  def union(other)
    self | other
  end

  # What's shared on the left and right hand sides
  # "The intersection of x and y"
  def intersect(other)
    self & other
  end

  # Everything on the left hand side except what's shared on the right
  # hand side.
  # "The relative compliment of y in x"
  def compliment(other)
    self - other
  end

  # Everything but what's shared
  def exclusive_not(other)
    (self | other) - (self & other)
  end

  # Finds the cartesian product, excluding duplicates items and self-
  # referential pairs. Yields the block value if given.
  def cartesian_product(other, &block)
    x, y = uniq, other.uniq
    pairs = x.inject([]) do |cp, i|
      # Pairs of an item with itself are dropped.
      cp | y.map { |b| i == b ? nil : [i, b] }.compact
    end
    return pairs unless block_given?
    pairs.map { |pair| yield pair.first, pair.last }
  end
  alias :cp :cartesian_product
  alias :permutations :cartesian_product

  # Sigma of pairs: adds up the block's result for each pair, starting
  # from z. Returns a single float, or whatever object is sent in.
  # Example: [1,2,3].sigma_pairs([4,5,6], 0) {|x, y| x + y}
  # returns 21 instead of 21.0.
  def sigma_pairs(other, z=zero, &block)
    to_pairs(other, &block).inject(z) { |acc, i| acc + i }
  end

  # Returns the Euclidian distance between all points of a set of enumerables
  def euclidian_distance(other)
    Math.sqrt(sigma_pairs(other) { |a, b| (a - b) ** 2 })
  end

  # Returns a random integer in the range for any number of lists. This
  # is a way to get a random vector that is tenable based on the sample
  # data. For example, given two sets of numbers:
  #
  # a = [1,2,3]; b = [8,8,8]
  #
  # a.rand_in_range(b) will return a value >= 1 and <= 8 in the first
  # place, >= 2 and <= 8 in the second place, and >= 3 and <= 8 in the
  # last place.
  # Works for integers. Rethink this for floats. May consider setting up
  # FixedRange for floats.
  #
  # The result has one entry per position shared by all of the lists, i.e.
  # it is as long as the shortest list.
  def rand_in_range(*args)
    lows = min_of_lists(*args)
    highs = max_of_lists(*args)
    lows.zip(highs).map { |low, high| rand_between(low, high) }
  end

  # Finds the correlation between two enumerables.
  # Example: [1,2,3].cor [2,3,5]
  # returns 0.981980506061966
  def correlation(other)
    n = [self.size, other.size].min
    sum_of_products_of_pairs = sigma_pairs(other) { |a, b| a * b }
    self_sum = self.sum
    other_sum = other.sum
    sum_of_squared_self_scores = self.sum { |e| e * e }
    sum_of_squared_other_scores = other.sum { |e| e * e }

    numerator = (n * sum_of_products_of_pairs) - (self_sum * other_sum)
    self_denominator = (n * sum_of_squared_self_scores) - (self_sum ** 2)
    other_denominator = (n * sum_of_squared_other_scores) - (other_sum ** 2)
    numerator / Math.sqrt(self_denominator * other_denominator)
  end
  alias :cor :correlation

  # Transposes arrays of arrays and yields a block on the value.
  # The regular Array#transpose ignores blocks
  #
  # The receiver is treated as the first list. Only the positions present
  # in every list are used, and without a block each transposed row is
  # returned unchanged.
  def yield_transpose(*enums, &block)
    lists = [self] + enums
    n = lists.map { |x| x.size }.min
    block ||= lambda { |e| e }
    (0...n).map { |i| block.call(lists.map { |x| x[i] }) }
  end

  # Returns the max of two or more enumerables.
  # >> [1,2,3].max_of_lists([0,5,6], [0,2,9])
  # => [1, 5, 9]
  def max_of_lists(*enums)
    yield_transpose(*enums) { |e| e.max }
  end

  # Returns the min of two or more enumerables.
  # >> [1,2,3].min_of_lists([4,5,6], [0,2,9])
  # => [0, 2, 3]
  def min_of_lists(*enums)
    yield_transpose(*enums) { |e| e.min }
  end

end
|