tb 0.3 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/README +2 -1
  2. data/lib/tb.rb +7 -3
  3. data/lib/tb/basic.rb +1 -1
  4. data/lib/tb/cmd_cat.rb +1 -3
  5. data/lib/tb/cmd_consecutive.rb +4 -6
  6. data/lib/tb/cmd_crop.rb +5 -7
  7. data/lib/tb/cmd_cross.rb +51 -49
  8. data/lib/tb/cmd_cut.rb +2 -6
  9. data/lib/tb/cmd_git_log.rb +20 -11
  10. data/lib/tb/cmd_grep.rb +1 -3
  11. data/lib/tb/cmd_group.rb +18 -44
  12. data/lib/tb/cmd_gsub.rb +2 -4
  13. data/lib/tb/cmd_join.rb +1 -3
  14. data/lib/tb/cmd_ls.rb +8 -15
  15. data/lib/tb/cmd_mheader.rb +3 -4
  16. data/lib/tb/cmd_nest.rb +4 -9
  17. data/lib/tb/cmd_newfield.rb +1 -3
  18. data/lib/tb/cmd_rename.rb +2 -4
  19. data/lib/tb/cmd_shape.rb +2 -3
  20. data/lib/tb/cmd_sort.rb +3 -5
  21. data/lib/tb/cmd_svn_log.rb +3 -5
  22. data/lib/tb/cmd_tar_tvf.rb +2 -4
  23. data/lib/tb/cmd_to_csv.rb +1 -1
  24. data/lib/tb/cmd_unnest.rb +1 -3
  25. data/lib/tb/cmdutil.rb +57 -135
  26. data/lib/tb/csv.rb +11 -54
  27. data/lib/tb/customcmp.rb +41 -0
  28. data/lib/tb/customeq.rb +41 -0
  29. data/lib/tb/enumerable.rb +225 -435
  30. data/lib/tb/enumerator.rb +22 -14
  31. data/lib/tb/ex_enumerable.rb +659 -0
  32. data/lib/tb/ex_enumerator.rb +102 -0
  33. data/lib/tb/fileenumerator.rb +2 -2
  34. data/lib/tb/func.rb +141 -0
  35. data/lib/tb/json.rb +1 -1
  36. data/lib/tb/reader.rb +4 -4
  37. data/lib/tb/search.rb +2 -4
  38. data/lib/tb/zipper.rb +60 -0
  39. data/test/test_cmd_cat.rb +40 -0
  40. data/test/test_cmd_git_log.rb +116 -0
  41. data/test/test_cmd_ls.rb +90 -0
  42. data/test/test_cmd_svn_log.rb +87 -0
  43. data/test/test_cmd_to_csv.rb +14 -0
  44. data/test/test_cmdutil.rb +25 -10
  45. data/test/test_csv.rb +10 -0
  46. data/test/test_customcmp.rb +14 -0
  47. data/test/test_customeq.rb +20 -0
  48. data/test/{test_enumerable.rb → test_ex_enumerable.rb} +181 -3
  49. data/test/test_search.rb +2 -10
  50. data/test/test_tbenum.rb +3 -3
  51. data/test/test_zipper.rb +22 -0
  52. metadata +20 -8
  53. data/lib/tb/enum.rb +0 -294
  54. data/lib/tb/pairs.rb +0 -227
  55. data/test/test_pairs.rb +0 -122
@@ -27,10 +27,10 @@
27
27
  # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
28
 
29
29
  class Tb::Yielder
30
- def initialize(header_proc, each_proc)
30
+ def initialize(header_proc, base_yielder)
31
31
  @header_proc_called = false
32
32
  @header_proc = header_proc
33
- @each_proc = each_proc
33
+ @base_yielder = base_yielder
34
34
  end
35
35
  attr_reader :header_proc_called
36
36
 
@@ -44,30 +44,38 @@ class Tb::Yielder
44
44
  if !@header_proc_called
45
45
  set_header(nil)
46
46
  end
47
- @each_proc.call(*args)
47
+ @base_yielder.yield(*args)
48
48
  end
49
49
  alias << yield
50
50
  end
51
51
 
52
- class Tb::Enumerator
53
- include Tb::Enum
52
+ class Tb::Enumerator < Enumerator
53
+ include Tb::Enumerable
54
54
 
55
- def initialize(&enumerator_proc)
56
- @enumerator_proc = enumerator_proc
55
+ def self.new(&enumerator_proc)
56
+ super() {|y|
57
+ header_proc = Thread.current[:tb_enumerator_header_proc]
58
+ ty = Tb::Yielder.new(header_proc, y)
59
+ enumerator_proc.call(ty)
60
+ if !ty.header_proc_called
61
+ header_proc.call(nil)
62
+ end
63
+ }
57
64
  end
58
65
 
59
66
  def each(&each_proc)
60
- yielder = Tb::Yielder.new(nil, each_proc)
61
- @enumerator_proc.call(yielder)
62
- nil
67
+ header_and_each(nil, &each_proc)
63
68
  end
64
69
 
65
70
  def header_and_each(header_proc, &each_proc)
66
- yielder = Tb::Yielder.new(header_proc, each_proc)
67
- @enumerator_proc.call(yielder)
68
- if !yielder.header_proc_called
69
- header_proc.call(nil)
71
+ old = Thread.current[:tb_enumerator_header_proc]
72
+ begin
73
+ Thread.current[:tb_enumerator_header_proc] = header_proc
74
+ Enumerator.instance_method(:each).bind(self).call(&each_proc)
75
+ ensure
76
+ Thread.current[:tb_enumerator_header_proc] = old
70
77
  end
71
78
  nil
72
79
  end
80
+
73
81
  end
@@ -0,0 +1,659 @@
1
+ # lib/tb/ex_enumerable.rb - extensions for Enumerable
2
+ #
3
+ # Copyright (C) 2010-2012 Tanaka Akira <akr@fsij.org>
4
+ #
5
+ # Redistribution and use in source and binary forms, with or without
6
+ # modification, are permitted provided that the following conditions are met:
7
+ #
8
+ # 1. Redistributions of source code must retain the above copyright notice, this
9
+ # list of conditions and the following disclaimer.
10
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
11
+ # this list of conditions and the following disclaimer in the documentation
12
+ # and/or other materials provided with the distribution.
13
+ # 3. The name of the author may not be used to endorse or promote products
14
+ # derived from this software without specific prior written permission.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
17
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
19
+ # EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21
+ # OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
24
+ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
25
+ # OF SUCH DAMAGE.
26
+
27
+ module Enumerable
28
+ # :call-seq:
29
+ # enum.tb_categorize(ksel1, ksel2, ..., vsel, [opts])
30
+ # enum.tb_categorize(ksel1, ksel2, ..., vsel, [opts]) {|ks, vs| ... }
31
+ #
32
+ # categorizes the elements in _enum_ and returns a hash.
33
+ # This method assumes multiple elements for a category.
34
+ #
35
+ # +tb_categorize+ takes one or more key selectors,
36
+ # one value selector and
37
+ # an optional option hash.
38
+ # It also takes an optional block.
39
+ #
40
+ # The selectors specify how to extract a value from an element in _enum_.
41
+ #
42
+ # The key selectors, _kselN_, are used to extract hash keys from an element.
43
+ # If two or more key selectors are specified, the result hash will be nested.
44
+ #
45
+ # The value selector, _vsel_, is used for the values of innermost hashes.
46
+ # By default, all values extracted by _vsel_ from the elements which
47
+ # key selectors extracts same value are composed as an array.
48
+ # The array is set to the values of the innermost hashes.
49
+ # This behavior can be customized by the options: :seed, :op and :update.
50
+ #
51
+ # a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
52
+ # {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
53
+ # {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
54
+ # p a.tb_categorize(:color, :fruit)
55
+ # #=> {"yellow"=>["banana", "grapefruit"], "green"=>["melon"]}
56
+ # p a.tb_categorize(:taste, :fruit)
57
+ # #=> {"sweet"=>["banana", "melon"], "tart"=>["grapefruit"]}
58
+ # p a.tb_categorize(:taste, :color, :fruit)
59
+ # #=> {"sweet"=>{"yellow"=>["banana"], "green"=>["melon"]}, "tart"=>{"yellow"=>["grapefruit"]}}
60
+ # p a.tb_categorize(:taste, :color)
61
+ # #=> {"sweet"=>["yellow", "green"], "tart"=>["yellow"]}
62
+ #
63
+ # In the above example, :fruit, :color and :taste are specified as selectors.
64
+ # There are several types of selectors as follows:
65
+ #
66
+ # - object with +call+ method (procedure, etc.): extracts a value from the element by calling the procedure with the element as an argument.
67
+ # - array of selectors: make an array which contains the values extracted by the selectors.
68
+ # - other object: extracts a value from the element using +[]+ method as +element[selector]+.
69
+ #
70
+ # So the selector :fruit extracts the value from the element
71
+ # {:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100}
72
+ # as {...}[:fruit].
73
+ #
74
+ # p a.tb_categorize(lambda {|elt| elt[:fruit][4] }, :fruit)
75
+ # #=> {"n"=>["banana", "melon"], "e"=>["grapefruit"]}
76
+ #
77
+ # When the key selectors return the same key for two or more elements,
78
+ # corresponding values extracted by the value selector are combined.
79
+ # By default, all values are collected as an array.
80
+ # :seed, :op and :update option in the option hash customizes this behavior.
81
+ # :seed option and :op option are similar to Enumerable#inject.
82
+ # :seed option specifies an initial value.
83
+ # (If :seed option is not given, the first value for each category is treated as an initial value.)
84
+ # :op option specifies a procedure to combine a seed and an element into a next seed.
85
+ # :update option is same as :op option except it takes three arguments instead of two:
86
+ # keys, seed and element.
87
+ # +to_proc+ method is used to convert :op and :update option to a procedure.
88
+ # So a symbol can be used for them.
89
+ #
90
+ # # count categorized elements.
91
+ # p a.tb_categorize(:color, lambda {|e| 1 }, :op=>:+)
92
+ # #=> {"yellow"=>2, "green"=>1}
93
+ #
94
+ # p a.tb_categorize(:color, :fruit, :seed=>"", :op=>:+)
95
+ # #=> {"yellow"=>"bananagrapefruit", "green"=>"melon"}
96
+ #
97
+ # The default behavior, collecting all values as an array, is implemented as follows.
98
+ # :seed => nil
99
+ # :update => {|ks, s, v| !s ? [v] : (s << v) }
100
+ #
101
+ # :op and :update option are disjoint.
102
+ # ArgumentError is raised if both are specified.
103
+ #
104
+ # The block for +tb_categorize+ method converts combined values to final innermost hash values.
105
+ #
106
+ # p a.tb_categorize(:color, :fruit) {|ks, vs| vs.join(",") }
107
+ # #=> {"yellow"=>"banana,grapefruit", "green"=>"melon"}
108
+ #
109
+ # # calculates the average price for fruits of each color.
110
+ # p a.tb_categorize(:color, :price) {|ks, vs| vs.inject(0.0, &:+) / vs.length }
111
+ # #=> {"yellow"=>150.0, "green"=>300.0}
112
+ #
113
+ def tb_categorize(*args, &reduce_proc)
114
+ opts = args.last.kind_of?(Hash) ? args.pop : {}
115
+ if args.length < 2
116
+ raise ArgumentError, "needs 2 or more arguments without option hash (but #{args.length})"
117
+ end
118
+ value_selector = tb_cat_selector_proc(args.pop)
119
+ key_selectors = args.map {|a| tb_cat_selector_proc(a) }
120
+ has_seed = opts.has_key? :seed
121
+ seed_value = opts[:seed]
122
+ if opts.has_key?(:update) && opts.has_key?(:op)
123
+ raise ArgumentError, "both :op and :update option specified"
124
+ elsif opts.has_key? :update
125
+ update_proc = opts[:update].to_proc
126
+ elsif opts.has_key? :op
127
+ op_proc = opts[:op].to_proc
128
+ update_proc = lambda {|ks, s, v| op_proc.call(s, v) }
129
+ else
130
+ has_seed = true
131
+ seed_value = nil
132
+ update_proc = lambda {|ks, s, v| !s ? [v] : (s << v) }
133
+ end
134
+ result = {}
135
+ each {|*elts|
136
+ elt = elts.length <= 1 ? elts[0] : elts
137
+ ks = key_selectors.map {|ksel| ksel.call(elt) }
138
+ v = value_selector.call(elt)
139
+ h = result
140
+ 0.upto(ks.length-2) {|i|
141
+ k = ks[i]
142
+ h[k] = {} if !h.has_key?(k)
143
+ h = h[k]
144
+ }
145
+ lastk = ks.last
146
+ if !h.has_key?(lastk)
147
+ if has_seed
148
+ h[lastk] = update_proc.call(ks, seed_value, v)
149
+ else
150
+ h[lastk] = v
151
+ end
152
+ else
153
+ h[lastk] = update_proc.call(ks, h[lastk], v)
154
+ end
155
+ }
156
+ if reduce_proc
157
+ tb_cat_reduce(result, [], key_selectors.length-1, reduce_proc)
158
+ end
159
+ result
160
+ end
161
+
162
+ def tb_cat_selector_proc(selector)
163
+ if selector.respond_to?(:call)
164
+ selector
165
+ elsif selector.respond_to? :to_ary
166
+ selector_procs = selector.to_ary.map {|sel| tb_cat_selector_proc(sel) }
167
+ lambda {|elt| selector_procs.map {|selproc| selproc.call(elt) } }
168
+ else
169
+ lambda {|elt| elt[selector] }
170
+ end
171
+ end
172
+ private :tb_cat_selector_proc
173
+
174
+ def tb_cat_reduce(hash, ks, nestlevel, reduce_proc)
175
+ if nestlevel.zero?
176
+ hash.each {|k, v|
177
+ ks << k
178
+ begin
179
+ hash[k] = reduce_proc.call(ks.dup, v)
180
+ ensure
181
+ ks.pop
182
+ end
183
+ }
184
+ else
185
+ hash.each {|k, h|
186
+ ks << k
187
+ begin
188
+ tb_cat_reduce(h, ks, nestlevel-1, reduce_proc)
189
+ ensure
190
+ ks.pop
191
+ end
192
+ }
193
+ end
194
+ end
195
+ private :tb_cat_reduce
196
+
197
+ # :call-seq:
198
+ # enum.tb_unique_categorize(ksel1, ksel2, ..., vsel, [opts]) -> hash
199
+ # enum.tb_unique_categorize(ksel1, ksel2, ..., vsel, [opts]) {|s, v| ... } -> hash
200
+ #
201
+ # categorizes the elements in _enum_ and returns a hash.
202
+ # This method assumes one element for a category by default.
203
+ #
204
+ # +tb_unique_categorize+ takes one or more key selectors,
205
+ # one value selector and
206
+ # an optional option hash.
207
+ # It also takes an optional block.
208
+ #
209
+ # The selectors specify how to extract a value from an element in _enum_.
210
+ # See Enumerable#tb_categorize for details of selectors.
211
+ #
212
+ # The key selectors, _kselN_, are used to extract hash keys from an element.
213
+ # If two or more key selectors are specified, the result hash will be nested.
214
+ #
215
+ # The value selector, _vsel_, is used for the values of innermost hashes.
216
+ # By default, this method assumes the key selectors categorize elements in enum uniquely.
217
+ # If the key selectors generate the same keys for two or more elements, ArgumentError is raised.
218
+ # This behavior can be customized by :seed option and the block.
219
+ #
220
+ # a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
221
+ # {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
222
+ # {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
223
+ # p a.tb_unique_categorize(:fruit, :price)
224
+ # #=> {"banana"=>100, "melon"=>300, "grapefruit"=>200}
225
+ #
226
+ # p a.tb_unique_categorize(:color, :price)
227
+ # # ArgumentError
228
+ #
229
+ # If the block is given, it is used for combining values in a category.
230
+ # The arguments for the block are a seed and the value extracted by _vsel_.
231
+ # The return value of the block is used as the next seed.
232
+ # :seed option specifies the initial seed.
233
+ # If :seed is not given, the first value for each category is used for the seed.
234
+ #
235
+ # p a.tb_unique_categorize(:taste, :price) {|s, v| s + v }
236
+ # #=> {"sweet"=>400, "tart"=>200}
237
+ #
238
+ # p a.tb_unique_categorize(:color, :price) {|s, v| s + v }
239
+ # #=> {"yellow"=>300, "green"=>300}
240
+ #
241
+ def tb_unique_categorize(*args, &update_proc)
242
+ opts = args.last.kind_of?(Hash) ? args.pop.dup : {}
243
+ if update_proc
244
+ opts[:update] = lambda {|ks, s, v| update_proc.call(s, v) }
245
+ else
246
+ seed = Object.new
247
+ opts[:seed] = seed
248
+ opts[:update] = lambda {|ks, s, v|
249
+ if s.equal? seed
250
+ v
251
+ else
252
+ raise ArgumentError, "ambiguous key: #{ks.map {|k| k.inspect }.join(',')}"
253
+ end
254
+ }
255
+ end
256
+ tb_categorize(*(args + [opts]))
257
+ end
258
+
259
+ # :call-seq:
260
+ # enum.tb_category_count(ksel1, ksel2, ...)
261
+ #
262
+ # counts elements in _enum_ for each category defined by the key selectors.
263
+ #
264
+ # a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
265
+ # {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
266
+ # {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
267
+ #
268
+ # p a.tb_category_count(:color)
269
+ # #=> {"yellow"=>2, "green"=>1}
270
+ #
271
+ # p a.tb_category_count(:taste)
272
+ # #=> {"sweet"=>2, "tart"=>1}
273
+ #
274
+ # p a.tb_category_count(:taste, :color)
275
+ # #=> {"sweet"=>{"yellow"=>1, "green"=>1}, "tart"=>{"yellow"=>1}}
276
+ #
277
+ # The selectors specify how to extract a value from an element in _enum_.
278
+ # See Enumerable#tb_categorize for details of selectors.
279
+ #
280
+ def tb_category_count(*args)
281
+ tb_categorize(*(args + [lambda {|e| 1 }, {:update => lambda {|ks, s, v| s + v }}]))
282
+ end
283
+
284
+ def dump_objsfile(title, tempfile)
285
+ tempfile.flush
286
+ path = tempfile
287
+ a = []
288
+ open(path) {|f|
289
+ until f.eof?
290
+ pair = Marshal.load(f)
291
+ a << (pair ? pair.last : :sep)
292
+ end
293
+ }
294
+ puts "#{title}: #{a.inspect}"
295
+ end
296
+ private :dump_objsfile
297
+
298
+ # :call-seq:
299
+ # enum.extsort_by(options={}) {|value| cmpvalue }
300
+ #
301
+ # +extsort_by+ returns an enumerator which yields elements in the receiver in sorted order.
302
+ # The block defines the order which cmpvalue is ascending.
303
+ #
304
+ # options:
305
+ # :map : a procedure to convert the element. It is applied after cmpvalue is obtained. (default: nil)
306
+ # :unique : a procedure to merge two values which have the same cmpvalue. (default: nil)
307
+ # :memsize : limit in-memory sorting size in bytes (default: 10000000)
308
+ #
309
+ # If :unique option is given, it is used to merge
310
+ # elements which have same cmpvalue.
311
+ # The procedure should take two elements and return one.
312
+ # The procedure should be associative. (f(x,f(y,z)) = f(f(x,y),z))
313
+ #
314
+ def extsort_by(opts={}, &cmpvalue_from)
315
+ mapfunc = opts[:map]
316
+ opts = opts.dup
317
+ opts[:map] = mapfunc ?
318
+ lambda {|v| Marshal.dump(mapfunc.call(v)) } :
319
+ lambda {|v| Marshal.dump(v) }
320
+ uniqfunc = opts[:unique]
321
+ if uniqfunc
322
+ opts[:unique] = lambda {|x, y| Marshal.dump(uniqfunc.call(Marshal.load(x), Marshal.load(y))) }
323
+ end
324
+ reducefunc = opts[:unique]
325
+ mapfunc2 = opts[:map] || lambda {|v| v }
326
+ self.lazy_map {|v|
327
+ [cmpvalue_from.call(v), mapfunc2.call(v)]
328
+ }.send(:extsort_internal0, reducefunc, opts).lazy_map {|k, d|
329
+ Marshal.load(d)
330
+ }
331
+ end
332
+
333
+ # :call-seq:
334
+ # enum.extsort_reduce(op, [opts]) {|element| [key, val] }
335
+ #
336
+ def extsort_reduce(op, opts={}, &key_val_proc)
337
+ lazy_map(&key_val_proc).send(:extsort_internal0, op, opts)
338
+ end
339
+
340
+ def extsort_internal0(reducefunc, opts={})
341
+ if reducefunc.is_a? Symbol
342
+ reducefunc = reducefunc.to_proc
343
+ end
344
+ opts = opts.dup
345
+ opts[:memsize] ||= 10000000
346
+ Enumerator.new {|y|
347
+ extsort_internal1(reducefunc, opts, y)
348
+ }
349
+ end
350
+ private :extsort_internal0
351
+
352
+ def extsort_internal1(reducefunc, opts, y)
353
+ tmp1 = Tempfile.new("tbsortA")
354
+ tmp2 = Tempfile.new("tbsortB")
355
+ extsort_first_split(tmp1, tmp2, reducefunc, opts)
356
+ if tmp1.size == 0 && tmp2.size == 0
357
+ return Enumerator.new {|_| }
358
+ end
359
+ tmp3 = Tempfile.new("tbsortC")
360
+ tmp4 = Tempfile.new("tbsortD")
361
+ while tmp2.size != 0
362
+ #dump_objsfile(:tmp1, tmp1)
363
+ #dump_objsfile(:tmp2, tmp2)
364
+ #dump_objsfile(:tmp3, tmp3)
365
+ #dump_objsfile(:tmp4, tmp4)
366
+ extsort_merge(tmp1, tmp2, tmp3, tmp4, reducefunc, opts)
367
+ tmp1.rewind
368
+ tmp1.truncate(0)
369
+ tmp2.rewind
370
+ tmp2.truncate(0)
371
+ tmp1, tmp2, tmp3, tmp4 = tmp3, tmp4, tmp1, tmp2
372
+ end
373
+ #dump_objsfile(:tmp1, tmp1)
374
+ #dump_objsfile(:tmp2, tmp2)
375
+ #dump_objsfile(:tmp3, tmp3)
376
+ #dump_objsfile(:tmp4, tmp4)
377
+ extsort_yield(tmp1, y)
378
+ ensure
379
+ tmp1.close(true) if tmp1
380
+ tmp2.close(true) if tmp2
381
+ tmp3.close(true) if tmp3
382
+ tmp4.close(true) if tmp4
383
+ end
384
+ private :extsort_internal1
385
+
386
+ def extsort_first_split(tmp1, tmp2, reducefunc, opts)
387
+ prevobj_cv = nil
388
+ prevobj_dumped = nil
389
+ tmp_current, tmp_another = tmp1, tmp2
390
+ buf = {}
391
+ buf_size = 0
392
+ buf_mode = true
393
+ self.each_with_index {|v, i|
394
+ obj_cv, obj = v
395
+ #p [obj, obj_cv]
396
+ #p [prevobj_cv, buf_mode, obj, obj_cv]
397
+ if buf_mode
398
+ dumped = Marshal.dump([obj_cv, obj])
399
+ ary = (buf[obj_cv] ||= [])
400
+ ary << [obj_cv, i, dumped]
401
+ buf_size += dumped.size
402
+ if reducefunc && ary.length == 2
403
+ obj1_cv, i1, dumped1 = ary[0]
404
+ _, _, dumped2 = ary[1]
405
+ _, obj1 = Marshal.load(dumped1)
406
+ _, obj2 = Marshal.load(dumped2)
407
+ obju = reducefunc.call(obj1, obj2)
408
+ buf[obj1_cv] = [[obj1_cv, i1, Marshal.dump([obj1_cv, obju])]]
409
+ end
410
+ if opts[:memsize] < buf_size
411
+ buf_keys = buf.keys.sort
412
+ (0...(buf_keys.length-1)).each {|j|
413
+ cv = buf_keys[j]
414
+ buf[cv].each {|_, _, d|
415
+ tmp_current.write d
416
+ }
417
+ }
418
+ ary = buf[buf_keys.last]
419
+ (0...(ary.length-1)).each {|j|
420
+ _, _, d = ary[j]
421
+ tmp_current.write d
422
+ }
423
+ prevobj_cv, _, prevobj_dumped = ary[-1]
424
+ buf.clear
425
+ buf_mode = false
426
+ end
427
+ elsif (cmp = (prevobj_cv <=> obj_cv)) == 0 && reducefunc
428
+ _, obj1 = Marshal.load(prevobj_dumped)
429
+ obj2 = obj
430
+ obju = reducefunc.call(obj1, obj2)
431
+ prevobj_dumped = Marshal.dump([prevobj_cv, obju])
432
+ elsif cmp <= 0
433
+ tmp_current.write prevobj_dumped
434
+ prevobj_dumped = Marshal.dump([obj_cv, obj])
435
+ prevobj_cv = obj_cv
436
+ else
437
+ tmp_current.write prevobj_dumped
438
+ Marshal.dump(nil, tmp_current)
439
+ dumped = Marshal.dump([obj_cv, obj])
440
+ buf = { obj_cv => [[obj_cv, i, dumped]] }
441
+ buf_size = dumped.size
442
+ buf_mode = true
443
+ tmp_current, tmp_another = tmp_another, tmp_current
444
+ prevobj_cv = nil
445
+ prevobj_dumped = nil
446
+ end
447
+ }
448
+ if buf_mode
449
+ buf_keys = buf.keys.sort
450
+ buf_keys.each {|cv|
451
+ buf[cv].each {|_, _, d|
452
+ tmp_current.write d
453
+ }
454
+ }
455
+ else
456
+ tmp_current.write prevobj_dumped
457
+ end
458
+ if !buf_mode || !buf.empty?
459
+ Marshal.dump(nil, tmp_current)
460
+ end
461
+ end
462
+ private :extsort_first_split
463
+
464
+ def extsort_merge(src1, src2, dst1, dst2, reducefunc, opts)
465
+ src1.rewind
466
+ src2.rewind
467
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
468
+ obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
469
+ prefer1 = true
470
+ while true
471
+ cmp = obj1_cv <=> obj2_cv
472
+ if prefer1 ? cmp > 0 : cmp >= 0
473
+ obj1_pair, obj1_cv, obj1, src1, obj2_pair, obj2_cv, obj2, src2 =
474
+ obj2_pair, obj2_cv, obj2, src2, obj1_pair, obj1_cv, obj1, src1
475
+ prefer1 = !prefer1
476
+ end
477
+ if reducefunc && cmp == 0
478
+ Marshal.dump([obj1_cv, reducefunc.call(obj1, obj2)], dst1)
479
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
480
+ obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
481
+ if obj1_pair && !obj2_pair
482
+ obj1_pair, obj1_cv, obj1, src1, obj2_pair, obj2_cv, obj2, src2 =
483
+ obj2_pair, obj2_cv, obj2, src2, obj1_pair, obj1_cv, obj1, src1
484
+ prefer1 = !prefer1
485
+ end
486
+ else
487
+ Marshal.dump([obj1_cv, obj1], dst1)
488
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
489
+ end
490
+ if !obj1_pair
491
+ while obj2_pair
492
+ Marshal.dump(obj2_pair, dst1)
493
+ obj2_pair = Marshal.load(src2)
494
+ end
495
+ Marshal.dump(nil, dst1)
496
+ dst1, dst2 = dst2, dst1
497
+ break if src1.eof?
498
+ break if src2.eof?
499
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
500
+ obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
501
+ end
502
+ end
503
+ if !src1.eof?
504
+ restsrc = src1
505
+ elsif !src2.eof?
506
+ restsrc = src2
507
+ else
508
+ return
509
+ end
510
+ until restsrc.eof?
511
+ restobj_pair = Marshal.load(restsrc)
512
+ Marshal.dump(restobj_pair, dst1)
513
+ end
514
+ end
515
+ private :extsort_merge
516
+
517
+ def extsort_yield(tmp1, y)
518
+ tmp1.rewind
519
+ while true
520
+ pair = Marshal.load(tmp1)
521
+ break if !pair
522
+ y.yield pair
523
+ end
524
+ end
525
+ private :extsort_yield
526
+
527
+ # splits self by _representative_ which is called with an element.
528
+ #
529
+ # _before_group_ is called before each group with the first element.
530
+ # _after_group_ is called after each group with the last element.
531
+ # _body_ is called for each element.
532
+ #
533
+ def each_group_element_by(representative, before_group, body, after_group)
534
+ detect_group_by(before_group, after_group, &representative).each(&body)
535
+ end
536
+
537
+ # creates an enumerator which yields same as self but
538
+ # given block and procedures are called between each element for grouping.
539
+ #
540
+ # The block is called for each element to define groups.
541
+ # A group is consecutive elements for which the block returns the same value.
542
+ #
543
+ # _before_group_ is called before each group with the first element.
544
+ #
545
+ # _after_group_ is called after each group with the last element.
546
+ #
547
+ # _before_group_ and _after_group_ are optional.
548
+ #
549
+ # The grouping mechanism is called "control break" in some cultures such as COBOL.
550
+ #
551
+ # Consecutive even numbers and odd numbers can be grouped as follows.
552
+ #
553
+ # [1,3,5,4,8].detect_group_by(
554
+ # lambda {|v| puts "start" },
555
+ # lambda {|v| puts "end" }) {|v| v.even? }.each {|x| p x }
556
+ # #=> start
557
+ # # 1
558
+ # # 3
559
+ # # 5
560
+ # # end
561
+ # # start
562
+ # # 4
563
+ # # 8
564
+ # # end
565
+ #
566
+ # Note that +detect_group_by+ can be cascaded but
567
+ # it doesn't work in a nested manner.
568
+ #
569
+ # (0..9).detect_group_by(
570
+ # lambda {|v| print "[" },
571
+ # lambda {|v| print "]" }) {|v|
572
+ # v.even?
573
+ # }.detect_group_by(
574
+ # lambda {|v| print "(" },
575
+ # lambda {|v| print ")" }) {|v|
576
+ # (v/2).even?
577
+ # }.each {|x| print x }
578
+ # #=> [(0][1][)(2][3][)(4][5][)(6][7][)(8][9])
579
+ #
580
+ # Consider +detect_nested_group_by+ for nested groups.
581
+ #
582
+ def detect_group_by(before_group=nil, after_group=nil, &representative_proc)
583
+ detect_nested_group_by([[representative_proc, before_group, after_group]])
584
+ end
585
+
586
+ # creates an enumerator which yields same as self but
587
+ # nested groups detected by _group_specs_
588
+ #
589
+ # _group_specs_ is an array of three-procedure arrays, as follows:
590
+ #
591
+ # [[representative_proc1, before_proc1, after_proc1],
592
+ # [representative_proc2, before_proc2, after_proc2],
593
+ # ...]
594
+ #
595
+ # _representative_proc1_ splits elements as groups.
596
+ # The group is defined as consecutive elements which _representative_proc1_ returns same value.
597
+ # _before_proc1_ is called before each group.
598
+ # _after_proc1_ is called after each group.
599
+ #
600
+ # Subsequent procedures, _representative_proc2_, _before_proc2_, _after_proc2_, ..., are
601
+ # used to split elements in the above groups.
602
+ #
603
+ # (0..9).detect_nested_group_by(
604
+ # [[lambda {|v| (v/2).even? },
605
+ # lambda {|v| print "(" },
606
+ # lambda {|v| print ")" }],
607
+ # [lambda {|v| v.even? },
608
+ # lambda {|v| print "[" },
609
+ # lambda {|v| print "]" }]]).each {|x| print x }
610
+ # #=> ([0][1])([2][3])([4][5])([6][7])([8][9])
611
+ #
612
+ def detect_nested_group_by(group_specs)
613
+ Enumerator.new {|y|
614
+ first = true
615
+ prev_reps = nil
616
+ prev = nil
617
+ self.each {|*curr|
618
+ reps = group_specs.map {|representative_proc, _, _|
619
+ representative_proc.call(*curr)
620
+ }
621
+ if first
622
+ first = false
623
+ group_specs.each {|_, before_proc, _|
624
+ before_proc.call(*curr) if before_proc
625
+ }
626
+ else
627
+ different_index = (0...group_specs.length).find {|i| prev_reps[i] != reps[i] }
628
+ if different_index
629
+ (group_specs.length-1).downto(different_index) {|i|
630
+ _, _, after_proc = group_specs[i]
631
+ after_proc.call(*prev) if after_proc
632
+ }
633
+ different_index.upto(group_specs.length-1) {|i|
634
+ _, before_proc, _ = group_specs[i]
635
+ before_proc.call(*curr) if before_proc
636
+ }
637
+ end
638
+ end
639
+ y.yield(*curr)
640
+ prev_reps = reps
641
+ prev = curr
642
+ }
643
+ if !first
644
+ (group_specs.length-1).downto(0) {|i|
645
+ _, _, after_proc = group_specs[i]
646
+ after_proc.call(*prev) if after_proc
647
+ }
648
+ end
649
+ }
650
+ end
651
+
652
+ def lazy_map
653
+ Enumerator.new {|y|
654
+ self.each {|*vs|
655
+ y.yield(yield(*vs))
656
+ }
657
+ }
658
+ end
659
+ end