davidrichards-just_enumerable_stats 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,498 @@
1
+ # Borrowed this from my own gem, sirb
2
+
3
class Object

  # Picks a random number between two values, in either order.
  #
  # When both arguments are non-Float the result is drawn from the
  # inclusive span min(a, b)..max(a, b). If either argument is a Float the
  # work is handed to rand_in_floats instead.
  def rand_between(a, b)
    if a.is_a?(Float) || b.is_a?(Float)
      rand_in_floats(a, b)
    else
      low = [a, b].min
      # +1 so that the upper bound itself can be drawn.
      span = (a - b).abs + 1
      rand(span) + low
    end
  end

  # Float-friendly variant of rand_between: scales Kernel#rand by the
  # distance between the two values and offsets it by the smaller one.
  def rand_in_floats(a, b)
    low = [a, b].min
    width = (a - b).abs
    (rand * width) + low
  end

end
19
+
20
# Statistical helpers mixed into every Enumerable.
#
# Most methods accept an optional block that transforms each element before
# it is used; when no block is given, the per-object default block (see
# default_block=) is used instead, if one has been set. max, min and range
# are the exception: their block is a two-argument comparator.
#
# NOTE(review): on Ruby versions where Array, Range or Hash define their own
# max, min, sum, product or union, those native methods take precedence over
# the versions defined here for those classes -- confirm against the target
# Ruby before relying on default blocks with them.
module Enumerable

  # Keep the core implementations reachable. The guards stop a second load
  # of this file from aliasing the overrides below onto themselves.
  alias_method :original_max, :max unless method_defined?(:original_max)
  alias_method :original_min, :min unless method_defined?(:original_min)

  # To keep max and min DRY. Returns a <=>-style comparison of a and b:
  # * with a block, whatever the block returns for (a, b);
  # * with a default block declared with exactly two parameters, whatever
  #   it returns for (a, b);
  # * with any other default block, the comparison of the two transformed
  #   values (the documented default block takes a single element, so
  #   calling it with two arguments would raise for a lambda);
  # * otherwise a <=> b.
  def block_sorter(a, b, &block)
    if block
      yield(a, b)
    elsif default_block
      if default_block.arity == 2
        default_block.call(a, b)
      else
        default_block[*a] <=> default_block[*b]
      end
    else
      a <=> b
    end
  end
  protected :block_sorter

  # Resolves the per-element transform: the given block, else a wrapper
  # around the default block (called with the element splatted), else nil.
  def stat_transform(&block)
    return block if block
    transform = default_block
    transform && lambda { |item| transform[*item] }
  end
  protected :stat_transform

  # The elements as an Array, run through the block or default block when
  # one is available.
  def stat_values(&block)
    transform = stat_transform(&block)
    transform ? map { |item| transform.call(item) } : to_a
  end
  protected :stat_values

  # Returns the max, using an optional comparator block. On ties the later
  # element wins. Returns nil for an empty collection.
  #
  # Any positional arguments (e.g. max(2)) are handed straight to the core
  # implementation so existing callers of that form keep working.
  def max(*args, &block)
    return original_max(*args, &block) unless args.empty?
    inject do |best, e|
      block_sorter(best, e, &block) > 0 ? best : e
    end
  end

  # Returns the first index of the max value
  def max_index(&block)
    # find_index is available on every Enumerable; index is Array-only.
    find_index(max(&block))
  end

  # Returns the min, using an optional comparator block. On ties the later
  # element wins. Returns nil for an empty collection.
  #
  # Any positional arguments (e.g. min(2)) are handed straight to the core
  # implementation.
  def min(*args, &block)
    return original_min(*args, &block) unless args.empty?
    inject do |best, e|
      block_sorter(best, e, &block) < 0 ? best : e
    end
  end

  # Returns the first index of the min value
  def min_index(&block)
    find_index(min(&block))
  end

  # The block called to filter the values in the object.
  def default_block
    @default_stat_block
  end

  # Allows me to setup a block for a series of operations. Example:
  #   a = [1,2,3]
  #   a.cum_sum # => [1, 3, 6]
  #   a.default_block = lambda{|e| e * 2}
  #   a.cum_sum # => [2, 6, 12]
  def default_block=(block)
    @default_stat_block = block
  end

  # Provides zero in the right class: 0.0 if any element is a Float, else 0.
  def zero
    any? {|e| e.is_a?(Float)} ? 0.0 : 0
  end
  protected :zero

  # Provides one in the right class: 1.0 if any element is a Float, else 1.
  def one
    any? {|e| e.is_a?(Float)} ? 1.0 : 1
  end
  protected :one

  # Adds up the list. Uses a block or default block if present.
  #
  # identity is an optional starting value; when omitted the sum starts
  # from zero (0 or 0.0, see #zero).
  def sum(identity = nil, &block)
    start = identity.nil? ? zero : identity
    stat_values(&block).inject(start) { |total, value| total + value }
  end

  # The arithmetic mean, uses a block or default block.
  #
  # Always divides as a Float, so [1, 2].average is 1.5 rather than the
  # truncated 1. An empty collection yields NaN.
  def average(&block)
    values = stat_values(&block)
    values.inject(zero) { |total, value| total + value } / values.size.to_f
  end
  alias :mean :average
  alias :avg :average

  # The sample variance (divides by n - 1), uses a block or default block.
  # A single-element collection yields NaN.
  def variance(&block)
    values = stat_values(&block)
    m = values.inject(zero) { |total, value| total + value } / values.size.to_f
    sum_of_differences = values.inject(0.0) { |total, value| total + (m - value) ** 2 }
    sum_of_differences / (values.size - 1)
  end
  alias :var :variance

  # The standard deviation. Uses a block or default block.
  def standard_deviation(&block)
    Math.sqrt(variance(&block))
  end
  alias :std :standard_deviation

  # The median of the (optionally transformed) values. When the two middle
  # values differ, the result is med1 + (med2 - med1) * ratio, so the
  # default ratio of 0.5 gives their midpoint. Returns nil when empty.
  #
  # With a block the values are walked up to the middle point; without one
  # the sorted list is indexed directly.
  def median(ratio=0.5, &block)
    return iterate_midway(ratio, &block) if block_given?
    mid1, mid2 = middle_two
    sorted = new_sort
    med1, med2 = sorted[mid1], sorted[mid2]
    return med1 if med1 == med2
    med1 + ((med2 - med1) * ratio)
  end

  # Zero-based indices of the two middle positions; both are the same
  # index when the size is odd.
  def middle_two
    mid2 = size.div(2)
    mid1 = (size % 2 == 0) ? mid2 - 1 : mid2
    return mid1, mid2
  end
  protected :middle_two

  # The upper of the two middle indices.
  def median_position
    middle_two.last
  end
  protected :median_position

  # The sorted (and transformed) values up to and including the median
  # position. Sorting first keeps quantile independent of input order.
  def first_half(&block)
    new_sort(&block)[0..median_position]
  end
  protected :first_half

  # The sorted (and transformed) values from the median position onwards.
  def second_half(&block)
    # Total crap, but it's the way R does things, and this will most likely
    # only be used to feed R some numbers to plot, if at all.
    start = size <= 5 ? median_position : median_position - 1
    new_sort(&block)[start..-1]
  end
  protected :second_half

  # An iterative version of median: walks the sorted, transformed values
  # only as far as the upper middle position.
  def iterate_midway(ratio, &block)
    mid1, mid2 = middle_two
    med1 = med2 = nil
    new_sort(&block).each_with_index do |value, j|
      med1 = value if j == mid1
      med2 = value if j == mid2
      break if j >= mid2
    end
    return med1 if med1 == med2
    med1 + ((med2 - med1) * ratio)
  end
  protected :iterate_midway

  # Just an array of [min, max] to comply with R uses of the word. Use
  # range_as_range if you want a real Range.
  def range(&block)
    [min(&block), max(&block)]
  end

  # Useful for setting a real range class (FixedRange).
  def range_class=(klass)
    @range_class = klass
  end

  # When creating a range, what class will it be? Defaults to Range, but
  # other classes are sometimes useful.
  def range_class
    @range_class ||= Range
  end

  # Actually instantiates the range, instead of producing a min and max array.
  def range_as_range(&block)
    range_class.new(min(&block), max(&block))
  end

  # Returns a new sorted Array of the values after the block (or default
  # block) has been applied to each. The block is a per-element transform,
  # not a sort comparator like {|x,y| x <=> y}.
  def new_sort(&block)
    stat_values(&block).sort
  end

  # Rank of each value: its one-based position in the sorted list, with
  # equal values sharing the lowest position.
  # Only defined when no instance method named rank exists here already.
  unless method_defined?(:rank)
    def rank(&block)
      values = stat_values(&block)
      sorted = values.sort
      values.map { |value| sorted.index(value) + 1 }
    end
  end

  # Given values like [10,5,5,1]
  # Rank should produce something like [4,2,2,1]
  # And order should produce something like [4,2,3,1]
  # The trick is that rank skips as many as were duplicated, so there
  # could not be a 3 in the rank from the example above.
  def order(&block)
    rank(&block).inject([]) do |taken, position|
      # Bump a duplicated rank up to the next free slot.
      position += 1 while taken.include?(position)
      taken << position
    end
  end

  # First quartile: nth_split_by_m(1, 4)
  # Third quartile: nth_split_by_m(3, 4)
  # Median: nth_split_by_m(1, 2)
  # Doesn't match R, and it's silly to try to.
  # def nth_split_by_m(n, m)
  #   sorted = new_sort
  #   dividers = m - 1
  #   if size % m == dividers # Divides evenly
  #     # Because we have a 0-based list, we get the floor
  #     i = ((size / m.to_f) * n).floor
  #     j = i
  #   else
  #     # This reflects R's approach, which I don't think I agree with.
  #     i = (((size / m.to_f) * n) - 1)
  #     i = i > (size / m.to_f) ? i.floor : i.ceil
  #     j = i + 1
  #   end
  #   sorted[i] + ((n / m.to_f) * (sorted[j] - sorted[i]))
  # end

  # Five-number summary: [min, first quartile, median, third quartile, max]
  # of the (optionally transformed) values. Every entry is taken from the
  # sorted values, so the result does not depend on input order. The block,
  # if given, is a per-element transform.
  def quantile(&block)
    sorted = new_sort(&block)
    [
      sorted.first,
      # The halves are already transformed, so no block is passed on.
      first_half(&block).median(0.25),
      median(&block),
      second_half(&block).median(0.75),
      sorted.last
    ]
  end

  # The cumulative sum. Example:
  # [1,2,3].cum_sum # => [1, 3, 6]
  # With sorted=true the elements are sorted before the block (or default
  # block) is applied to them.
  def cum_sum(sorted=false, &block)
    running = zero
    source = sorted ? sort : self
    transform = stat_transform(&block)
    source.map { |i| running += (transform ? transform.call(i) : i) }
  end
  alias :cumulative_sum :cum_sum

  # The cumulative product. Example:
  # [1,2,3].cum_prod # => [1, 2, 6]
  # With sorted=true the elements are sorted before the block (or default
  # block) is applied to them.
  def cum_prod(sorted=false, &block)
    running = one
    source = sorted ? sort : self
    transform = stat_transform(&block)
    source.map { |i| running *= (transform ? transform.call(i) : i) }
  end
  alias :cumulative_product :cum_prod

  # Used to preprocess the list: maps through the block, else the default
  # block, else returns self untouched.
  def morph_list(&block)
    if block
      self.map{ |e| block.call(e) }
    elsif self.default_block
      self.map{ |e| self.default_block.call(e) }
    else
      self
    end
  end
  protected :morph_list

  # Example:
  # [1,2,3,0,5].cum_max # => [1,2,3,3,5]
  def cum_max(&block)
    morph_list(&block).inject([]) do |list, e|
      # list.last is the running maximum so far.
      list << (list.empty? ? e : [list.last, e].max)
    end
  end
  alias :cumulative_max :cum_max

  # Example:
  # [1,2,3,0,5].cum_min # => [1,1,1,0,0]
  def cum_min(&block)
    morph_list(&block).inject([]) do |list, e|
      # list.last is the running minimum so far.
      list << (list.empty? ? e : [list.last, e].min)
    end
  end
  alias :cumulative_min :cum_min

  # Multiplies the values together, starting from one (1 or 1.0, see #one).
  def product
    inject(one) { |total, value| total * value }
  end

  # Pairs up the elements of self and other by index, stopping at the
  # shorter of the two, and returns the block's value for each pair.
  # Without a block each pair is returned as [a, b].
  def to_pairs(other, &block)
    block ||= lambda { |a, b| [a, b] }
    n = [self.size, other.size].min
    (0...n).map {|i| block.call(self[i], other[i]) }
  end

  # Finds the tanimoto coefficient: the intersection set size / union set
  # size. This is used to find the distance between two vectors.
  # >> [1,2,3].cor([2,3,5])
  # => 0.981980506061966
  # >> [1,2,3].tanimoto_pairs([2,3,5])
  # => 0.5
  def tanimoto_pairs(other)
    intersect(other).size / union(other).size.to_f
  end
  alias :tanimoto_correlation :tanimoto_pairs

  # Sometimes it just helps to have things spelled out. These are all
  # part of the Array class. This means, you have methods that you can't
  # run on some kinds of enumerables.

  # All of the left and right hand sides, excluding duplicates.
  # "The union of x and y"
  def union(other)
    self | other
  end

  # What's shared on the left and right hand sides
  # "The intersection of x and y"
  def intersect(other)
    self & other
  end

  # Everything on the left hand side except what's shared on the right
  # hand side.
  # "The relative complement of y in x"
  def compliment(other)
    self - other
  end
  # Correctly spelled name; the original spelling is kept for callers.
  alias :complement :compliment

  # Everything but what's shared
  def exclusive_not(other)
    (self | other) - (self & other)
  end

  # Finds the cartesian product, excluding duplicate items and self-
  # referential pairs. Yields each pair's two members to the block if one
  # is given and returns the block's values instead.
  def cartesian_product(other, &block)
    left, right = self.uniq, other.uniq
    pairs = left.inject([]) do |found, a|
      found | right.reject { |b| a == b }.map { |b| [a, b] }
    end
    return pairs unless block_given?
    pairs.map { |pair| yield pair.first, pair.last }
  end
  alias :cp :cartesian_product
  alias :permutations :cartesian_product

  # Sigma of pairs: sums the block's value over to_pairs(other), starting
  # from z (defaults to #zero).
  # Example: [1,2,3].sigma_pairs([4,5,6], 0) {|x, y| x + y}
  # returns 21 instead of 21.0.
  def sigma_pairs(other, z=zero, &block)
    self.to_pairs(other, &block).inject(z) { |total, value| total + value }
  end

  # Returns the Euclidian distance between self and other, treating each
  # as a point with one coordinate per element.
  def euclidian_distance(other)
    Math.sqrt(self.sigma_pairs(other) {|a, b| (a - b) ** 2})
  end

  # Returns a random integer in the range for any number of lists. This
  # is a way to get a random vector that is tenable based on the sample
  # data. For example, given two sets of numbers:
  #
  # a = [1,2,3]; b = [8,8,8]
  #
  # a.rand_in_range(b) will return a value >= 1 and <= 8 in the first
  # place, >= 2 and <= 8 in the second place, and >= 3 and <= 8 in the
  # last place. The result is as long as the shortest list.
  # Works for integers. Rethink this for floats. May consider setting up
  # FixedRange for floats.
  def rand_in_range(*args)
    lows = self.min_of_lists(*args)
    highs = self.max_of_lists(*args)
    lows.zip(highs).map { |low, high| rand_between(low, high) }
  end

  # Finds the correlation between two enumerables.
  # Example: [1,2,3].cor [2,3,5]
  # returns 0.981980506061966
  #
  # Only the first n elements of each side are used, where n is the size
  # of the shorter one, so that the sums agree with the paired products.
  def correlation(other)
    n = [self.size, other.size].min
    xs, ys = self.first(n), other.first(n)
    sum_of_products_of_pairs = xs.to_pairs(ys) { |a, b| a * b }.inject(0) { |t, v| t + v }
    self_sum = xs.inject(0) { |t, v| t + v }
    other_sum = ys.inject(0) { |t, v| t + v }
    sum_of_squared_self_scores = xs.inject(0) { |t, v| t + v * v }
    sum_of_squared_other_scores = ys.inject(0) { |t, v| t + v * v }

    numerator = (n * sum_of_products_of_pairs) - (self_sum * other_sum)
    self_denominator = ((n * sum_of_squared_self_scores) - (self_sum ** 2))
    other_denominator = ((n * sum_of_squared_other_scores) - (other_sum ** 2))
    denominator = Math.sqrt(self_denominator * other_denominator)
    numerator / denominator
  end
  alias :cor :correlation

  # Transposes arrays of arrays and yields a block on the value.
  # The regular Array#transpose ignores blocks. Stops at the shortest
  # list; without a block each transposed row is returned as is.
  def yield_transpose(*enums, &block)
    enums.unshift(self)
    n = enums.map{ |x| x.size}.min
    block ||= lambda{|e| e}
    (0...n).map { |i| block.call enums.map{ |x| x[i] } }
  end

  # Returns the max of two or more enumerables.
  # >> [1,2,3].max_of_lists([0,5,6], [0,2,9])
  # => [1, 5, 9]
  def max_of_lists(*enums)
    yield_transpose(*enums) {|e| e.max}
  end

  # Returns the min of two or more enumerables.
  # >> [1,2,3].min_of_lists([4,5,6], [0,2,9])
  # => [0, 2, 3]
  def min_of_lists(*enums)
    yield_transpose(*enums) {|e| e.min}
  end
end