davidrichards-just_enumerable_stats 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,498 @@
1
+ # Borrowed this from my own gem, sirb
2
+
3
class Object

  # Simpler way to get a random number between two values.
  #
  # With two non-Float bounds the result is rand(span) + the smaller bound,
  # where span = |a - b| + 1, so both bounds can be returned. If either bound
  # is a Float the work is delegated to rand_in_floats.
  #
  # a, b - the two bounds, in either order.
  def rand_between(a, b)
    return rand_in_floats(a, b) if a.is_a?(Float) || b.is_a?(Float)
    span = (a - b).abs + 1
    rand(span) + [a, b].min
  end

  # Handles non-integers: scales rand (a Float in [0, 1)) by the distance
  # between the bounds and offsets it by the smaller bound.
  #
  # a, b - the two bounds, in either order.
  def rand_in_floats(a, b)
    span = (a - b).abs
    (rand * span) + [a, b].min
  end

end
19
+
20
module Enumerable

  # Keep the previous implementations reachable once max and min are
  # redefined below.
  alias :original_max :max
  alias :original_min :min

  # To keep max and min DRY. Compares a and b with, in order of preference,
  # the given block, the default block, or <=>. Returns whatever the chosen
  # comparator returns.
  #
  # NOTE(review): the default block is called here with two arguments (as a
  # comparator), while sum, variance, new_sort, etc. call it with a single
  # element (as a transform). A one-argument lambda stored as the default
  # block will raise here - confirm which contract is intended.
  def block_sorter(a, b, &block)
    if block
      yield(a, b)
    elsif default_block
      default_block.call(a, b)
    else
      a <=> b
    end
  end
  protected :block_sorter

  # Returns the max, using an optional comparator block (see block_sorter).
  # Returns nil for an empty collection.
  def max(&block)
    inject do |best, e|
      block_sorter(best, e, &block) > 0 ? best : e
    end
  end

  # Returns the first index of the max value
  def max_index(&block)
    index(max(&block))
  end

  # Returns the min, using an optional comparator block (see block_sorter).
  # Returns nil for an empty collection.
  def min(&block)
    inject do |best, e|
      block_sorter(best, e, &block) < 0 ? best : e
    end
  end

  # Returns the first index of the min value
  def min_index(&block)
    index(min(&block))
  end

  # The block used to transform the values when no explicit block is given.
  def default_block
    @default_stat_block
  end

  # Allows me to setup a block for a series of operations. Example:
  # a = [1.0, 2.0, 4.0]
  # a.sum # => 7.0
  # a.default_block = lambda{|e| 1 / e}
  # a.sum # => 1.75
  def default_block=(block)
    @default_stat_block = block
  end

  # Provides zero in the right class: 0.0 if any element is a Float,
  # otherwise the Integer 0.
  def zero
    any? {|e| e.is_a?(Float)} ? 0.0 : 0
  end
  protected :zero

  # Provides one in the right class: 1.0 if any element is a Float,
  # otherwise the Integer 1.
  def one
    any? {|e| e.is_a?(Float)} ? 1.0 : 1
  end
  protected :one

  # Adds up the list. Each element is first passed through the block, or the
  # default block if present. The accumulator starts at zero (see above), so
  # an all-Integer list sums to an Integer.
  def sum
    total = zero
    if block_given?
      each { |i| total += yield(i) }
    elsif default_block
      each { |i| total += default_block[*i] }
    else
      each { |i| total += i }
    end
    total
  end

  # The arithmetic mean, uses a block or default block.
  #
  # An Integer total is divided as a Float so the mean is not truncated
  # ([1, 2].average is 1.5, not 1). Other numeric totals keep their own
  # division. An empty collection with an Integer total still raises
  # ZeroDivisionError.
  def average(&block)
    total = sum(&block)
    count = size
    if total.is_a?(Integer) && count != 0
      total / count.to_f
    else
      total / count
    end
  end
  alias :mean :average
  alias :avg :average

  # The sample variance (divides by size - 1), uses a block or default block.
  # With a single element the division is 0.0 / 0, which yields NaN.
  def variance(&block)
    m = mean(&block)
    sum_of_differences = if block_given?
      sum { |i| (m - yield(i)) ** 2 }
    elsif default_block
      sum { |i| (m - default_block[*i]) ** 2 }
    else
      sum { |i| (m - i) ** 2 }
    end
    sum_of_differences / (size - 1)
  end
  alias :var :variance

  # The standard deviation. Uses a block or default block.
  def standard_deviation(&block)
    Math::sqrt(variance(&block))
  end
  alias :std :standard_deviation

  # The median of the values after they have been passed through the block
  # (or default block) and sorted.
  #
  # ratio - where to land between the two middle values when they differ;
  #         0.5 gives the usual midpoint.
  #
  # Returns the middle value itself when both middle values are equal (this
  # covers odd sizes), and nil for an empty collection.
  def median(ratio=0.5, &block)
    sorted = new_sort(&block)
    mid1, mid2 = middle_two(sorted.size)
    med1, med2 = sorted[mid1], sorted[mid2]
    return med1 if med1 == med2
    med1 + ((med2 - med1) * ratio)
  end

  # The indices of the two middle elements of a list of n items (n defaults
  # to this collection's size). Both indices are the same for odd n.
  def middle_two(n = size)
    mid2 = n.div(2)
    mid1 = (n % 2 == 0) ? mid2 - 1 : mid2
    return mid1, mid2
  end
  protected :middle_two

  # The upper of the two middle indices.
  def median_position
    middle_two.last
  end
  protected :median_position

  # Elements from the start through the median position, in current order.
  # The block is accepted for signature compatibility and is not used.
  def first_half(&block)
    self[0..median_position].dup
  end
  protected :first_half

  # Elements from around the median position to the end, in current order.
  # The block is accepted for signature compatibility and is not used.
  def second_half(&block)
    # Total crap, but it's the way R does things, and this will most likely
    # only be used to feed R some numbers to plot, if at all.
    size <= 5 ? self[median_position..-1].dup : self[median_position - 1..-1].dup
  end
  protected :second_half

  # An iterative version of median: walks the sorted (and transformed)
  # values only as far as the upper middle index, then interpolates exactly
  # like median does.
  def iterate_midway(ratio, &block)
    sorted = new_sort(&block)
    mid1, mid2 = middle_two(sorted.size)
    med1 = med2 = nil
    sorted.each_with_index do |value, j|
      med1 = value if j == mid1
      med2 = value if j == mid2
      break if j >= mid2
    end
    return med1 if med1 == med2
    med1 + ((med2 - med1) * ratio)
  end
  protected :iterate_midway

  # Just an array of [min, max] to comply with R uses of the word. Use
  # range_as_range if you want a real Range.
  def range(&block)
    [min(&block), max(&block)]
  end

  # Useful for setting a real range class (FixedRange).
  def range_class=(klass)
    @range_class = klass
  end

  # When creating a range, what class will it be? Defaults to Range, but
  # other classes are sometimes useful.
  def range_class
    @range_class ||= Range
  end

  # Actually instantiates the range, instead of producing a min and max array.
  def range_as_range(&block)
    range_class.new(min(&block), max(&block))
  end

  # Returns a new sorted Array of the values, after passing each one through
  # the block or the default block if either is present.
  #
  # The block is a per-element transform, not a sort comparator like
  # {|x,y| x <=> y}.
  def new_sort(&block)
    if block_given?
      map { |i| yield(i) }.sort
    elsif default_block
      map { |i| default_block[*i] }.sort
    else
      sort
    end
  end

  # The 1-based position of each (transformed) value in the sorted list.
  # Duplicates share the position of their first occurrence.
  #
  # NOTE(review): the guard below is evaluated once, against the module
  # object itself, when this file is loaded - confirm it gives the intended
  # "don't overwrite things like Matrix#rank" protection.
  def rank(&block)

    sorted = new_sort(&block)

    if block_given?
      map { |i| sorted.index(yield(i)) + 1 }
    elsif default_block
      map { |i| sorted.index(default_block[*i]) + 1 }
    else
      map { |i| sorted.index(i) + 1 }
    end

  end unless defined?(rank)

  # Given values like [10,5,5,1]
  # Rank should produce something like [4,2,2,1]
  # And order should produce something like [4,2,3,1]
  # The trick is that rank skips as many as were duplicated, so there
  # could not be a 3 in the rank from the example above.
  def order(&block)
    hold = []
    rank(&block).each do |x|
      while hold.include?(x) do
        x += 1
      end
      hold << x
    end
    hold
  end

  # First quartile: nth_split_by_m(1, 4)
  # Third quartile: nth_split_by_m(3, 4)
  # Median: nth_split_by_m(1, 2)
  # Doesn't match R, and it's silly to try to.
  # def nth_split_by_m(n, m)
  #   sorted = new_sort
  #   dividers = m - 1
  #   if size % m == dividers # Divides evenly
  #     # Because we have a 0-based list, we get the floor
  #     i = ((size / m.to_f) * n).floor
  #     j = i
  #   else
  #     # This reflects R's approach, which I don't think I agree with.
  #     i = (((size / m.to_f) * n) - 1)
  #     i = i > (size / m.to_f) ? i.floor : i.ceil
  #     j = i + 1
  #   end
  #   sorted[i] + ((n / m.to_f) * (sorted[j] - sorted[i]))
  # end

  # Five-number summary: [min, lower hinge, median, upper hinge, max].
  #
  # The values are transformed (block or default block) and sorted once up
  # front, so the result does not depend on the order of the receiver and the
  # block is applied exactly once per element.
  def quantile(&block)
    sorted = new_sort(&block)
    [
      sorted.first,
      sorted.first_half.median(0.25),
      sorted.median,
      sorted.second_half.median(0.75),
      sorted.last
    ]
  end

  # The cumulative sum. Example:
  # [1,2,3].cum_sum # => [1, 3, 6]
  #
  # sorted - when true, the raw values are sorted before accumulating. The
  #          block or default block is then applied once per element.
  def cum_sum(sorted=false, &block)
    total = zero
    obj = sorted ? sort : self
    if block_given?
      obj.map { |i| total += yield(i) }
    elsif default_block
      obj.map { |i| total += default_block[*i] }
    else
      obj.map { |i| total += i }
    end
  end
  alias :cumulative_sum :cum_sum

  # The cumulative product. Example:
  # [1,2,3].cum_prod # => [1, 2, 6]
  #
  # sorted - when true, the raw values are sorted before accumulating. The
  #          block or default block is then applied once per element.
  def cum_prod(sorted=false, &block)
    prod = one
    obj = sorted ? sort : self
    if block_given?
      obj.map { |i| prod *= yield(i) }
    elsif default_block
      obj.map { |i| prod *= default_block[*i] }
    else
      obj.map { |i| prod *= i }
    end
  end
  alias :cumulative_product :cum_prod

  # Used to preprocess the list: maps through the block, or the default
  # block, or returns self untouched when neither is present.
  def morph_list(&block)
    if block
      self.map{ |e| block.call(e) }
    elsif self.default_block
      self.map{ |e| self.default_block.call(e) }
    else
      self
    end
  end
  protected :morph_list

  # Running maximum. Example:
  # [1,2,3,0,5].cum_max # => [1,2,3,3,5]
  def cum_max(&block)
    morph_list(&block).inject([]) do |list, e|
      # The list is non-decreasing, so its last entry is the max so far.
      list << (list.empty? ? e : [list.last, e].max)
    end
  end
  alias :cumulative_max :cum_max

  # Running minimum. Example:
  # [1,2,3,0,5].cum_min # => [1,1,1,0,0]
  def cum_min(&block)
    morph_list(&block).inject([]) do |list, e|
      # The list is non-increasing, so its last entry is the min so far.
      list << (list.empty? ? e : [list.last, e].min)
    end
  end
  alias :cumulative_min :cum_min

  # Multiplies the values, starting from one (see above), so an all-Integer
  # collection yields an Integer.
  def product
    self.inject(one) {|acc, a| acc * a}
  end

  # Calls the block with each pair self[i], other[i], up to the length of
  # the shorter list, and returns the block results. The block is required.
  def to_pairs(other, &block)
    n = [self.size, other.size].min
    (0...n).map {|i| block.call(self[i], other[i]) }
  end

  # Finds the tanimoto coefficient: the intersection set size / union set
  # size. This is used to find the distance between two vectors.
  # >> [1,2,3].tanimoto_pairs([2,3,5])
  # => 0.5
  def tanimoto_pairs(other)
    intersect(other).size / union(other).size.to_f
  end
  alias :tanimoto_correlation :tanimoto_pairs

  # Sometimes it just helps to have things spelled out. The operators used
  # by the next four methods (|, & and -) must be supplied by the receiver,
  # so they cannot be run on every kind of enumerable.

  # All of the left and right hand sides, excluding duplicates.
  # "The union of x and y"
  def union(other)
    self | other
  end

  # What's shared on the left and right hand sides
  # "The intersection of x and y"
  def intersect(other)
    self & other
  end

  # Everything on the left hand side except what's shared on the right
  # hand side.
  # "The relative complement of y in x"
  def compliment(other)
    self - other
  end

  # Everything but what's shared
  def exclusive_not(other)
    (self | other) - (self & other)
  end

  # Finds the cartesian product, excluding duplicate items and self-
  # referential pairs. Yields each pair's two members to the block, if
  # given, and returns the block results instead of the pairs.
  def cartesian_product(other, &block)
    x, y = self.uniq, other.uniq
    pairs = x.inject([]) do |cp, i|
      cp | y.map{|b| i == b ? nil : [i,b]}.compact
    end
    return pairs unless block_given?
    pairs.map{|p| yield p.first, p.last}
  end
  alias :cp :cartesian_product
  alias :permutations :cartesian_product

  # Sigma of pairs: sums the block results of to_pairs, starting from z.
  # z defaults to zero (see above); pass it explicitly to control the class
  # of the result.
  # Example: [1,2,3].sigma_pairs([4,5,6], 0) {|x, y| x + y}
  # returns 21 instead of 21.0.
  def sigma_pairs(other, z=zero, &block)
    self.to_pairs(other,&block).inject(z) {|acc, i| acc + i}
  end

  # Returns the Euclidean distance between this list and other, treated as
  # two points.
  def euclidian_distance(other)
    Math.sqrt(self.sigma_pairs(other) {|a, b| (a - b) ** 2})
  end

  # Returns a random vector that is tenable based on the sample data: each
  # position gets a value between the smallest and largest value found at
  # that position across self and the given lists. For example, given
  #
  #   a = [1,2,3]; b = [8,8,8]
  #
  # a.rand_in_range(b) will return a value >= 1 and <= 8 in the first
  # place, >= 2 and <= 8 in the second place, and >= 3 and <= 8 in the
  # last place.
  #
  # The result is as long as the shortest list involved. The values come
  # from rand_between, which this file adds to Object.
  def rand_in_range(*args)
    lows = self.min_of_lists(*args)
    highs = self.max_of_lists(*args)
    lows.zip(highs).map { |low, high| rand_between(low, high) }
  end

  # Finds the Pearson correlation between two enumerables. When the lists
  # differ in length only the first n elements of each are used, where n is
  # the shorter length, so every term covers the same pairs.
  # Example: [1,2,3].cor [2,3,5]
  # returns 0.981980506061966
  def correlation(other)
    n = [self.size, other.size].min
    xs, ys = self.first(n), other.first(n)
    sum_of_products_of_pairs = xs.sigma_pairs(ys) {|a, b| a * b}
    self_sum = xs.sum
    other_sum = ys.sum
    sum_of_squared_self_scores = xs.sum { |e| e * e }
    sum_of_squared_other_scores = ys.sum { |e| e * e }

    numerator = (n * sum_of_products_of_pairs) - (self_sum * other_sum)
    self_denominator = ((n * sum_of_squared_self_scores) - (self_sum ** 2))
    other_denominator = ((n * sum_of_squared_other_scores) - (other_sum ** 2))
    denominator = Math.sqrt(self_denominator * other_denominator)
    numerator / denominator
  end
  alias :cor :correlation

  # Transposes self plus the given lists and calls the block with each
  # column (an Array holding the i-th element of every list), up to the
  # length of the shortest list. Without a block the columns are returned.
  def yield_transpose(*enums, &block)
    lists = [self] + enums
    n = lists.map{ |x| x.size }.min
    block ||= lambda{|e| e}
    (0...n).map { |i| block.call lists.map{ |x| x[i] } }
  end

  # Returns the max of two or more enumerables.
  # >> [1,2,3].max_of_lists([0,5,6], [0,2,9])
  # => [1, 5, 9]
  def max_of_lists(*enums)
    yield_transpose(*enums) {|e| e.max}
  end

  # Returns the min of two or more enumerables.
  # >> [1,2,3].min_of_lists([4,5,6], [0,2,9])
  # => [0, 2, 3]
  def min_of_lists(*enums)
    yield_transpose(*enums) {|e| e.min}
  end
end