tb 0.3 → 0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/README +2 -1
  2. data/lib/tb.rb +7 -3
  3. data/lib/tb/basic.rb +1 -1
  4. data/lib/tb/cmd_cat.rb +1 -3
  5. data/lib/tb/cmd_consecutive.rb +4 -6
  6. data/lib/tb/cmd_crop.rb +5 -7
  7. data/lib/tb/cmd_cross.rb +51 -49
  8. data/lib/tb/cmd_cut.rb +2 -6
  9. data/lib/tb/cmd_git_log.rb +20 -11
  10. data/lib/tb/cmd_grep.rb +1 -3
  11. data/lib/tb/cmd_group.rb +18 -44
  12. data/lib/tb/cmd_gsub.rb +2 -4
  13. data/lib/tb/cmd_join.rb +1 -3
  14. data/lib/tb/cmd_ls.rb +8 -15
  15. data/lib/tb/cmd_mheader.rb +3 -4
  16. data/lib/tb/cmd_nest.rb +4 -9
  17. data/lib/tb/cmd_newfield.rb +1 -3
  18. data/lib/tb/cmd_rename.rb +2 -4
  19. data/lib/tb/cmd_shape.rb +2 -3
  20. data/lib/tb/cmd_sort.rb +3 -5
  21. data/lib/tb/cmd_svn_log.rb +3 -5
  22. data/lib/tb/cmd_tar_tvf.rb +2 -4
  23. data/lib/tb/cmd_to_csv.rb +1 -1
  24. data/lib/tb/cmd_unnest.rb +1 -3
  25. data/lib/tb/cmdutil.rb +57 -135
  26. data/lib/tb/csv.rb +11 -54
  27. data/lib/tb/customcmp.rb +41 -0
  28. data/lib/tb/customeq.rb +41 -0
  29. data/lib/tb/enumerable.rb +225 -435
  30. data/lib/tb/enumerator.rb +22 -14
  31. data/lib/tb/ex_enumerable.rb +659 -0
  32. data/lib/tb/ex_enumerator.rb +102 -0
  33. data/lib/tb/fileenumerator.rb +2 -2
  34. data/lib/tb/func.rb +141 -0
  35. data/lib/tb/json.rb +1 -1
  36. data/lib/tb/reader.rb +4 -4
  37. data/lib/tb/search.rb +2 -4
  38. data/lib/tb/zipper.rb +60 -0
  39. data/test/test_cmd_cat.rb +40 -0
  40. data/test/test_cmd_git_log.rb +116 -0
  41. data/test/test_cmd_ls.rb +90 -0
  42. data/test/test_cmd_svn_log.rb +87 -0
  43. data/test/test_cmd_to_csv.rb +14 -0
  44. data/test/test_cmdutil.rb +25 -10
  45. data/test/test_csv.rb +10 -0
  46. data/test/test_customcmp.rb +14 -0
  47. data/test/test_customeq.rb +20 -0
  48. data/test/{test_enumerable.rb → test_ex_enumerable.rb} +181 -3
  49. data/test/test_search.rb +2 -10
  50. data/test/test_tbenum.rb +3 -3
  51. data/test/test_zipper.rb +22 -0
  52. metadata +20 -8
  53. data/lib/tb/enum.rb +0 -294
  54. data/lib/tb/pairs.rb +0 -227
  55. data/test/test_pairs.rb +0 -122
@@ -27,10 +27,10 @@
27
27
  # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
28
 
29
29
  class Tb::Yielder
30
- def initialize(header_proc, each_proc)
30
+ def initialize(header_proc, base_yielder)
31
31
  @header_proc_called = false
32
32
  @header_proc = header_proc
33
- @each_proc = each_proc
33
+ @base_yielder = base_yielder
34
34
  end
35
35
  attr_reader :header_proc_called
36
36
 
@@ -44,30 +44,38 @@ class Tb::Yielder
44
44
  if !@header_proc_called
45
45
  set_header(nil)
46
46
  end
47
- @each_proc.call(*args)
47
+ @base_yielder.yield(*args)
48
48
  end
49
49
  alias << yield
50
50
  end
51
51
 
52
- class Tb::Enumerator
53
- include Tb::Enum
52
+ class Tb::Enumerator < Enumerator
53
+ include Tb::Enumerable
54
54
 
55
- def initialize(&enumerator_proc)
56
- @enumerator_proc = enumerator_proc
55
+ def self.new(&enumerator_proc)
56
+ super() {|y|
57
+ header_proc = Thread.current[:tb_enumerator_header_proc]
58
+ ty = Tb::Yielder.new(header_proc, y)
59
+ enumerator_proc.call(ty)
60
+ if !ty.header_proc_called
61
+ header_proc.call(nil)
62
+ end
63
+ }
57
64
  end
58
65
 
59
66
  def each(&each_proc)
60
- yielder = Tb::Yielder.new(nil, each_proc)
61
- @enumerator_proc.call(yielder)
62
- nil
67
+ header_and_each(nil, &each_proc)
63
68
  end
64
69
 
65
70
  def header_and_each(header_proc, &each_proc)
66
- yielder = Tb::Yielder.new(header_proc, each_proc)
67
- @enumerator_proc.call(yielder)
68
- if !yielder.header_proc_called
69
- header_proc.call(nil)
71
+ old = Thread.current[:tb_enumerator_header_proc]
72
+ begin
73
+ Thread.current[:tb_enumerator_header_proc] = header_proc
74
+ Enumerator.instance_method(:each).bind(self).call(&each_proc)
75
+ ensure
76
+ Thread.current[:tb_enumerator_header_proc] = old
70
77
  end
71
78
  nil
72
79
  end
80
+
73
81
  end
@@ -0,0 +1,659 @@
1
+ # lib/tb/ex_enumerable.rb - extensions for Enumerable
2
+ #
3
+ # Copyright (C) 2010-2012 Tanaka Akira <akr@fsij.org>
4
+ #
5
+ # Redistribution and use in source and binary forms, with or without
6
+ # modification, are permitted provided that the following conditions are met:
7
+ #
8
+ # 1. Redistributions of source code must retain the above copyright notice, this
9
+ # list of conditions and the following disclaimer.
10
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
11
+ # this list of conditions and the following disclaimer in the documentation
12
+ # and/or other materials provided with the distribution.
13
+ # 3. The name of the author may not be used to endorse or promote products
14
+ # derived from this software without specific prior written permission.
15
+ #
16
+ # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
17
+ # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
18
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
19
+ # EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21
+ # OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
24
+ # IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
25
+ # OF SUCH DAMAGE.
26
+
27
+ module Enumerable
28
+ # :call-seq:
29
+ # enum.tb_categorize(ksel1, ksel2, ..., vsel, [opts])
30
+ # enum.tb_categorize(ksel1, ksel2, ..., vsel, [opts]) {|ks, vs| ... }
31
+ #
32
+ # categorizes the elements in _enum_ and returns a hash.
33
+ # This method assumes multiple elements for a category.
34
+ #
35
+ # +tb_categorize+ takes one or more key selectors,
36
+ # one value selector and
37
+ # an optional option hash.
38
+ # It also takes an optional block.
39
+ #
40
+ # The selectors specify how to extract a value from an element in _enum_.
41
+ #
42
+ # The key selectors, _kselN_, are used to extract hash keys from an element.
43
+ # If two or more key selectors are specified, the result hash will be nested.
44
+ #
45
+ # The value selector, _vsel_, is used for the values of innermost hashes.
46
+ # By default, all values extracted by _vsel_ from the elements for which the
47
+ # key selectors extract the same value are collected into an array.
48
+ # The array is set to the values of the innermost hashes.
49
+ # This behavior can be customized by the options: :seed, :op and :update.
50
+ #
51
+ # a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
52
+ # {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
53
+ # {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
54
+ # p a.tb_categorize(:color, :fruit)
55
+ # #=> {"yellow"=>["banana", "grapefruit"], "green"=>["melon"]}
56
+ # p a.tb_categorize(:taste, :fruit)
57
+ # #=> {"sweet"=>["banana", "melon"], "tart"=>["grapefruit"]}
58
+ # p a.tb_categorize(:taste, :color, :fruit)
59
+ # #=> {"sweet"=>{"yellow"=>["banana"], "green"=>["melon"]}, "tart"=>{"yellow"=>["grapefruit"]}}
60
+ # p a.tb_categorize(:taste, :color)
61
+ # #=> {"sweet"=>["yellow", "green"], "tart"=>["yellow"]}
62
+ #
63
+ # In the above example, :fruit, :color and :taste are specified as selectors.
64
+ # There are several types of selectors as follows:
65
+ #
66
+ # - object with +call+ method (procedure, etc.): extracts a value from the element by calling the procedure with the element as an argument.
67
+ # - array of selectors: make an array which contains the values extracted by the selectors.
68
+ # - other object: extracts a value from the element using +[]+ method as +element[selector]+.
69
+ #
70
+ # So the selector :fruit extracts the value from the element
71
+ # {:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100}
72
+ # as {...}[:fruit].
73
+ #
74
+ # p a.tb_categorize(lambda {|elt| elt[:fruit][4] }, :fruit)
75
+ # #=> {"n"=>["banana", "melon"], "e"=>["grapefruit"]}
76
+ #
77
+ # When the key selectors return the same key for two or more elements,
78
+ # corresponding values extracted by the value selector are combined.
79
+ # By default, all values are collected as an array.
80
+ # :seed, :op and :update option in the option hash customizes this behavior.
81
+ # :seed option and :op option are similar to Enumerable#inject.
82
+ # :seed option specifies an initial value.
83
+ # (If :seed option is not given, the first value for each category is treated as an initial value.)
84
+ # :op option specifies a procedure to combine a seed and an element into a next seed.
85
+ # :update option is the same as :op option except it takes three arguments instead of two:
86
+ # keys, seed and element.
87
+ # +to_proc+ method is used to convert :op and :update option to a procedure.
88
+ # So a symbol can be used for them.
89
+ #
90
+ # # count categorized elements.
91
+ # p a.tb_categorize(:color, lambda {|e| 1 }, :op=>:+)
92
+ # #=> {"yellow"=>2, "green"=>1}
93
+ #
94
+ # p a.tb_categorize(:color, :fruit, :seed=>"", :op=>:+)
95
+ # #=> {"yellow"=>"bananagrapefruit", "green"=>"melon"}
96
+ #
97
+ # The default behavior, collecting all values as an array, is implemented as follows.
98
+ # :seed => nil
99
+ # :update => lambda {|ks, s, v| !s ? [v] : (s << v) }
100
+ #
101
+ # :op and :update option are disjoint.
102
+ # ArgumentError is raised if both are specified.
103
+ #
104
+ # The block for +tb_categorize+ method converts combined values to final innermost hash values.
105
+ #
106
+ # p a.tb_categorize(:color, :fruit) {|ks, vs| vs.join(",") }
107
+ # #=> {"yellow"=>"banana,grapefruit", "green"=>"melon"}
108
+ #
109
+ # # calculates the average price for fruits of each color.
110
+ # p a.tb_categorize(:color, :price) {|ks, vs| vs.inject(0.0, &:+) / vs.length }
111
+ # #=> {"yellow"=>150.0, "green"=>300.0}
112
+ #
113
+ def tb_categorize(*args, &reduce_proc)
114
+ opts = args.last.kind_of?(Hash) ? args.pop : {}
115
+ if args.length < 2
116
+ raise ArgumentError, "needs 2 or more arguments without option hash (but #{args.length})"
117
+ end
118
+ value_selector = tb_cat_selector_proc(args.pop)
119
+ key_selectors = args.map {|a| tb_cat_selector_proc(a) }
120
+ has_seed = opts.has_key? :seed
121
+ seed_value = opts[:seed]
122
+ if opts.has_key?(:update) && opts.has_key?(:op)
123
+ raise ArgumentError, "both :op and :update option specified"
124
+ elsif opts.has_key? :update
125
+ update_proc = opts[:update].to_proc
126
+ elsif opts.has_key? :op
127
+ op_proc = opts[:op].to_proc
128
+ update_proc = lambda {|ks, s, v| op_proc.call(s, v) }
129
+ else
130
+ has_seed = true
131
+ seed_value = nil
132
+ update_proc = lambda {|ks, s, v| !s ? [v] : (s << v) }
133
+ end
134
+ result = {}
135
+ each {|*elts|
136
+ elt = elts.length <= 1 ? elts[0] : elts
137
+ ks = key_selectors.map {|ksel| ksel.call(elt) }
138
+ v = value_selector.call(elt)
139
+ h = result
140
+ 0.upto(ks.length-2) {|i|
141
+ k = ks[i]
142
+ h[k] = {} if !h.has_key?(k)
143
+ h = h[k]
144
+ }
145
+ lastk = ks.last
146
+ if !h.has_key?(lastk)
147
+ if has_seed
148
+ h[lastk] = update_proc.call(ks, seed_value, v)
149
+ else
150
+ h[lastk] = v
151
+ end
152
+ else
153
+ h[lastk] = update_proc.call(ks, h[lastk], v)
154
+ end
155
+ }
156
+ if reduce_proc
157
+ tb_cat_reduce(result, [], key_selectors.length-1, reduce_proc)
158
+ end
159
+ result
160
+ end
161
+
162
+ def tb_cat_selector_proc(selector)
163
+ if selector.respond_to?(:call)
164
+ selector
165
+ elsif selector.respond_to? :to_ary
166
+ selector_procs = selector.to_ary.map {|sel| tb_cat_selector_proc(sel) }
167
+ lambda {|elt| selector_procs.map {|selproc| selproc.call(elt) } }
168
+ else
169
+ lambda {|elt| elt[selector] }
170
+ end
171
+ end
172
+ private :tb_cat_selector_proc
173
+
174
+ def tb_cat_reduce(hash, ks, nestlevel, reduce_proc)
175
+ if nestlevel.zero?
176
+ hash.each {|k, v|
177
+ ks << k
178
+ begin
179
+ hash[k] = reduce_proc.call(ks.dup, v)
180
+ ensure
181
+ ks.pop
182
+ end
183
+ }
184
+ else
185
+ hash.each {|k, h|
186
+ ks << k
187
+ begin
188
+ tb_cat_reduce(h, ks, nestlevel-1, reduce_proc)
189
+ ensure
190
+ ks.pop
191
+ end
192
+ }
193
+ end
194
+ end
195
+ private :tb_cat_reduce
196
+
197
+ # :call-seq:
198
+ # enum.tb_unique_categorize(ksel1, ksel2, ..., vsel, [opts]) -> hash
199
+ # enum.tb_unique_categorize(ksel1, ksel2, ..., vsel, [opts]) {|s, v| ... } -> hash
200
+ #
201
+ # categorizes the elements in _enum_ and returns a hash.
202
+ # This method assumes one element for a category by default.
203
+ #
204
+ # +tb_unique_categorize+ takes one or more key selectors,
205
+ # one value selector and
206
+ # an optional option hash.
207
+ # It also takes an optional block.
208
+ #
209
+ # The selectors specify how to extract a value from an element in _enum_.
210
+ # See Enumerable#tb_categorize for details of selectors.
211
+ #
212
+ # The key selectors, _kselN_, are used to extract hash keys from an element.
213
+ # If two or more key selectors are specified, the result hash will be nested.
214
+ #
215
+ # The value selector, _vsel_, is used for the values of innermost hashes.
216
+ # By default, this method assumes the key selectors categorize the elements in enum uniquely.
217
+ # If the key selectors generate the same keys for two or more elements, ArgumentError is raised.
218
+ # This behavior can be customized by :seed option and the block.
219
+ #
220
+ # a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
221
+ # {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
222
+ # {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
223
+ # p a.tb_unique_categorize(:fruit, :price)
224
+ # #=> {"banana"=>100, "melon"=>300, "grapefruit"=>200}
225
+ #
226
+ # p a.tb_unique_categorize(:color, :price)
227
+ # # ArgumentError
228
+ #
229
+ # If the block is given, it is used for combining values in a category.
230
+ # The arguments for the block are a seed and the value extracted by _vsel_.
231
+ # The return value of the block is used as the next seed.
232
+ # :seed option specifies the initial seed.
233
+ # If :seed is not given, the first value for each category is used for the seed.
234
+ #
235
+ # p a.tb_unique_categorize(:taste, :price) {|s, v| s + v }
236
+ # #=> {"sweet"=>400, "tart"=>200}
237
+ #
238
+ # p a.tb_unique_categorize(:color, :price) {|s, v| s + v }
239
+ # #=> {"yellow"=>300, "green"=>300}
240
+ #
241
+ def tb_unique_categorize(*args, &update_proc)
242
+ opts = args.last.kind_of?(Hash) ? args.pop.dup : {}
243
+ if update_proc
244
+ opts[:update] = lambda {|ks, s, v| update_proc.call(s, v) }
245
+ else
246
+ seed = Object.new
247
+ opts[:seed] = seed
248
+ opts[:update] = lambda {|ks, s, v|
249
+ if s.equal? seed
250
+ v
251
+ else
252
+ raise ArgumentError, "ambiguous key: #{ks.map {|k| k.inspect }.join(',')}"
253
+ end
254
+ }
255
+ end
256
+ tb_categorize(*(args + [opts]))
257
+ end
258
+
259
+ # :call-seq:
260
+ # enum.tb_category_count(ksel1, ksel2, ...)
261
+ #
262
+ # counts elements in _enum_ for each category defined by the key selectors.
263
+ #
264
+ # a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
265
+ # {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
266
+ # {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
267
+ #
268
+ # p a.tb_category_count(:color)
269
+ # #=> {"yellow"=>2, "green"=>1}
270
+ #
271
+ # p a.tb_category_count(:taste)
272
+ # #=> {"sweet"=>2, "tart"=>1}
273
+ #
274
+ # p a.tb_category_count(:taste, :color)
275
+ # #=> {"sweet"=>{"yellow"=>1, "green"=>1}, "tart"=>{"yellow"=>1}}
276
+ #
277
+ # The selectors specify how to extract a value from an element in _enum_.
278
+ # See Enumerable#tb_categorize for details of selectors.
279
+ #
280
+ def tb_category_count(*args)
281
+ tb_categorize(*(args + [lambda {|e| 1 }, {:update => lambda {|ks, s, v| s + v }}]))
282
+ end
283
+
284
+ def dump_objsfile(title, tempfile)
285
+ tempfile.flush
286
+ path = tempfile
287
+ a = []
288
+ open(path) {|f|
289
+ until f.eof?
290
+ pair = Marshal.load(f)
291
+ a << (pair ? pair.last : :sep)
292
+ end
293
+ }
294
+ puts "#{title}: #{a.inspect}"
295
+ end
296
+ private :dump_objsfile
297
+
298
+ # :call-seq:
299
+ # enum.extsort_by(options={}) {|value| cmpvalue }
300
+ #
301
+ # +extsort_by+ returns an enumerator which yields elements in the receiver in sorted order.
302
+ # The block returns cmpvalue for an element; elements are yielded in ascending order of cmpvalue.
303
+ #
304
+ # options:
305
+ # :map : a procedure to convert the element. It is applied after cmpvalue is obtained. (default: nil)
306
+ # :unique : a procedure to merge two values which have the same cmpvalue. (default: nil)
307
+ # :memsize : limit in-memory sorting size in bytes (default: 10000000)
308
+ #
309
+ # If :unique option is given, it is used to merge
310
+ # elements which have same cmpvalue.
311
+ # The procedure should take two elements and return one.
312
+ # The procedure should be associative. (f(x,f(y,z)) = f(f(x,y),z))
313
+ #
314
+ def extsort_by(opts={}, &cmpvalue_from)
315
+ mapfunc = opts[:map]
316
+ opts = opts.dup
317
+ opts[:map] = mapfunc ?
318
+ lambda {|v| Marshal.dump(mapfunc.call(v)) } :
319
+ lambda {|v| Marshal.dump(v) }
320
+ uniqfunc = opts[:unique]
321
+ if uniqfunc
322
+ opts[:unique] = lambda {|x, y| Marshal.dump(uniqfunc.call(Marshal.load(x), Marshal.load(y))) }
323
+ end
324
+ reducefunc = opts[:unique]
325
+ mapfunc2 = opts[:map] || lambda {|v| v }
326
+ self.lazy_map {|v|
327
+ [cmpvalue_from.call(v), mapfunc2.call(v)]
328
+ }.send(:extsort_internal0, reducefunc, opts).lazy_map {|k, d|
329
+ Marshal.load(d)
330
+ }
331
+ end
332
+
333
+ # :call-seq:
334
+ # enum.extsort_reduce(op, [opts]) {|element| [key, val] }
335
+ #
336
+ def extsort_reduce(op, opts={}, &key_val_proc)
337
+ lazy_map(&key_val_proc).send(:extsort_internal0, op, opts)
338
+ end
339
+
340
+ def extsort_internal0(reducefunc, opts={})
341
+ if reducefunc.is_a? Symbol
342
+ reducefunc = reducefunc.to_proc
343
+ end
344
+ opts = opts.dup
345
+ opts[:memsize] ||= 10000000
346
+ Enumerator.new {|y|
347
+ extsort_internal1(reducefunc, opts, y)
348
+ }
349
+ end
350
+ private :extsort_internal0
351
+
352
+ def extsort_internal1(reducefunc, opts, y)
353
+ tmp1 = Tempfile.new("tbsortA")
354
+ tmp2 = Tempfile.new("tbsortB")
355
+ extsort_first_split(tmp1, tmp2, reducefunc, opts)
356
+ if tmp1.size == 0 && tmp2.size == 0
357
+ return Enumerator.new {|_| }
358
+ end
359
+ tmp3 = Tempfile.new("tbsortC")
360
+ tmp4 = Tempfile.new("tbsortD")
361
+ while tmp2.size != 0
362
+ #dump_objsfile(:tmp1, tmp1)
363
+ #dump_objsfile(:tmp2, tmp2)
364
+ #dump_objsfile(:tmp3, tmp3)
365
+ #dump_objsfile(:tmp4, tmp4)
366
+ extsort_merge(tmp1, tmp2, tmp3, tmp4, reducefunc, opts)
367
+ tmp1.rewind
368
+ tmp1.truncate(0)
369
+ tmp2.rewind
370
+ tmp2.truncate(0)
371
+ tmp1, tmp2, tmp3, tmp4 = tmp3, tmp4, tmp1, tmp2
372
+ end
373
+ #dump_objsfile(:tmp1, tmp1)
374
+ #dump_objsfile(:tmp2, tmp2)
375
+ #dump_objsfile(:tmp3, tmp3)
376
+ #dump_objsfile(:tmp4, tmp4)
377
+ extsort_yield(tmp1, y)
378
+ ensure
379
+ tmp1.close(true) if tmp1
380
+ tmp2.close(true) if tmp2
381
+ tmp3.close(true) if tmp3
382
+ tmp4.close(true) if tmp4
383
+ end
384
+ private :extsort_internal1
385
+
386
+ def extsort_first_split(tmp1, tmp2, reducefunc, opts)
387
+ prevobj_cv = nil
388
+ prevobj_dumped = nil
389
+ tmp_current, tmp_another = tmp1, tmp2
390
+ buf = {}
391
+ buf_size = 0
392
+ buf_mode = true
393
+ self.each_with_index {|v, i|
394
+ obj_cv, obj = v
395
+ #p [obj, obj_cv]
396
+ #p [prevobj_cv, buf_mode, obj, obj_cv]
397
+ if buf_mode
398
+ dumped = Marshal.dump([obj_cv, obj])
399
+ ary = (buf[obj_cv] ||= [])
400
+ ary << [obj_cv, i, dumped]
401
+ buf_size += dumped.size
402
+ if reducefunc && ary.length == 2
403
+ obj1_cv, i1, dumped1 = ary[0]
404
+ _, _, dumped2 = ary[1]
405
+ _, obj1 = Marshal.load(dumped1)
406
+ _, obj2 = Marshal.load(dumped2)
407
+ obju = reducefunc.call(obj1, obj2)
408
+ buf[obj1_cv] = [[obj1_cv, i1, Marshal.dump([obj1_cv, obju])]]
409
+ end
410
+ if opts[:memsize] < buf_size
411
+ buf_keys = buf.keys.sort
412
+ (0...(buf_keys.length-1)).each {|j|
413
+ cv = buf_keys[j]
414
+ buf[cv].each {|_, _, d|
415
+ tmp_current.write d
416
+ }
417
+ }
418
+ ary = buf[buf_keys.last]
419
+ (0...(ary.length-1)).each {|j|
420
+ _, _, d = ary[j]
421
+ tmp_current.write d
422
+ }
423
+ prevobj_cv, _, prevobj_dumped = ary[-1]
424
+ buf.clear
425
+ buf_mode = false
426
+ end
427
+ elsif (cmp = (prevobj_cv <=> obj_cv)) == 0 && reducefunc
428
+ _, obj1 = Marshal.load(prevobj_dumped)
429
+ obj2 = obj
430
+ obju = reducefunc.call(obj1, obj2)
431
+ prevobj_dumped = Marshal.dump([prevobj_cv, obju])
432
+ elsif cmp <= 0
433
+ tmp_current.write prevobj_dumped
434
+ prevobj_dumped = Marshal.dump([obj_cv, obj])
435
+ prevobj_cv = obj_cv
436
+ else
437
+ tmp_current.write prevobj_dumped
438
+ Marshal.dump(nil, tmp_current)
439
+ dumped = Marshal.dump([obj_cv, obj])
440
+ buf = { obj_cv => [[obj_cv, i, dumped]] }
441
+ buf_size = dumped.size
442
+ buf_mode = true
443
+ tmp_current, tmp_another = tmp_another, tmp_current
444
+ prevobj_cv = nil
445
+ prevobj_dumped = nil
446
+ end
447
+ }
448
+ if buf_mode
449
+ buf_keys = buf.keys.sort
450
+ buf_keys.each {|cv|
451
+ buf[cv].each {|_, _, d|
452
+ tmp_current.write d
453
+ }
454
+ }
455
+ else
456
+ tmp_current.write prevobj_dumped
457
+ end
458
+ if !buf_mode || !buf.empty?
459
+ Marshal.dump(nil, tmp_current)
460
+ end
461
+ end
462
+ private :extsort_first_split
463
+
464
+ def extsort_merge(src1, src2, dst1, dst2, reducefunc, opts)
465
+ src1.rewind
466
+ src2.rewind
467
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
468
+ obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
469
+ prefer1 = true
470
+ while true
471
+ cmp = obj1_cv <=> obj2_cv
472
+ if prefer1 ? cmp > 0 : cmp >= 0
473
+ obj1_pair, obj1_cv, obj1, src1, obj2_pair, obj2_cv, obj2, src2 =
474
+ obj2_pair, obj2_cv, obj2, src2, obj1_pair, obj1_cv, obj1, src1
475
+ prefer1 = !prefer1
476
+ end
477
+ if reducefunc && cmp == 0
478
+ Marshal.dump([obj1_cv, reducefunc.call(obj1, obj2)], dst1)
479
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
480
+ obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
481
+ if obj1_pair && !obj2_pair
482
+ obj1_pair, obj1_cv, obj1, src1, obj2_pair, obj2_cv, obj2, src2 =
483
+ obj2_pair, obj2_cv, obj2, src2, obj1_pair, obj1_cv, obj1, src1
484
+ prefer1 = !prefer1
485
+ end
486
+ else
487
+ Marshal.dump([obj1_cv, obj1], dst1)
488
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
489
+ end
490
+ if !obj1_pair
491
+ while obj2_pair
492
+ Marshal.dump(obj2_pair, dst1)
493
+ obj2_pair = Marshal.load(src2)
494
+ end
495
+ Marshal.dump(nil, dst1)
496
+ dst1, dst2 = dst2, dst1
497
+ break if src1.eof?
498
+ break if src2.eof?
499
+ obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
500
+ obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
501
+ end
502
+ end
503
+ if !src1.eof?
504
+ restsrc = src1
505
+ elsif !src2.eof?
506
+ restsrc = src2
507
+ else
508
+ return
509
+ end
510
+ until restsrc.eof?
511
+ restobj_pair = Marshal.load(restsrc)
512
+ Marshal.dump(restobj_pair, dst1)
513
+ end
514
+ end
515
+ private :extsort_merge
516
+
517
+ def extsort_yield(tmp1, y)
518
+ tmp1.rewind
519
+ while true
520
+ pair = Marshal.load(tmp1)
521
+ break if !pair
522
+ y.yield pair
523
+ end
524
+ end
525
+ private :extsort_yield
526
+
527
+ # splits self by _representative_ which is called with an element.
528
+ #
529
+ # _before_group_ is called before each group with the first element.
530
+ # _after_group_ is called after each group with the last element.
531
+ # _body_ is called for each element.
532
+ #
533
+ def each_group_element_by(representative, before_group, body, after_group)
534
+ detect_group_by(before_group, after_group, &representative).each(&body)
535
+ end
536
+
537
+ # creates an enumerator which yields same as self but
538
+ # given block and procedures are called between each element for grouping.
539
+ #
540
+ # The block is called for each element to define groups.
541
+ # A group is consecutive elements for which the block returns the same value.
542
+ #
543
+ # _before_group_ is called before each group with the first element.
544
+ #
545
+ # _after_group_ is called after each group with the last element.
546
+ #
547
+ # _before_group_ and _after_group_ are optional.
548
+ #
549
+ # The grouping mechanism is called "control break" in some cultures such as COBOL.
550
+ #
551
+ # Consecutive even numbers and odd numbers can be grouped as follows.
552
+ #
553
+ # [1,3,5,4,8].detect_group_by(
554
+ # lambda {|v| puts "start" },
555
+ # lambda {|v| puts "end" }) {|v| v.even? }.each {|x| p x }
556
+ # #=> start
557
+ # # 1
558
+ # # 3
559
+ # # 5
560
+ # # end
561
+ # # start
562
+ # # 4
563
+ # # 8
564
+ # # end
565
+ #
566
+ # Note that +detect_group_by+ can be cascaded but
567
+ # it doesn't work in a nested manner.
568
+ #
569
+ # (0..9).detect_group_by(
570
+ # lambda {|v| print "[" },
571
+ # lambda {|v| print "]" }) {|v|
572
+ # v.even?
573
+ # }.detect_group_by(
574
+ # lambda {|v| print "(" },
575
+ # lambda {|v| print ")" }) {|v|
576
+ # (v/2).even?
577
+ # }.each {|x| print x }
578
+ # #=> [(0][1][)(2][3][)(4][5][)(6][7][)(8][9])
579
+ #
580
+ # Consider +detect_nested_group_by+ for nested groups.
581
+ #
582
+ def detect_group_by(before_group=nil, after_group=nil, &representative_proc)
583
+ detect_nested_group_by([[representative_proc, before_group, after_group]])
584
+ end
585
+
586
+ # creates an enumerator which yields same as self but
587
+ # nested groups detected by _group_specs_
588
+ #
589
+ # _group_specs_ is an array of three-procedure arrays, as follows:
590
+ #
591
+ # [[representative_proc1, before_proc1, after_proc1],
592
+ # [representative_proc2, before_proc2, after_proc2],
593
+ # ...]
594
+ #
595
+ # _representative_proc1_ splits elements as groups.
596
+ # The group is defined as consecutive elements for which _representative_proc1_ returns the same value.
597
+ # _before_proc1_ is called before each group.
598
+ # _after_proc1_ is called after each group.
599
+ #
600
+ # Subsequent procedures, _representative_proc2_, _before_proc2_, _after_proc2_, ..., are
601
+ # used to split elements in the above groups.
602
+ #
603
+ # (0..9).detect_nested_group_by(
604
+ # [[lambda {|v| (v/2).even? },
605
+ # lambda {|v| print "(" },
606
+ # lambda {|v| print ")" }],
607
+ # [lambda {|v| v.even? },
608
+ # lambda {|v| print "[" },
609
+ # lambda {|v| print "]" }]]).each {|x| print x }
610
+ # #=> ([0][1])([2][3])([4][5])([6][7])([8][9])
611
+ #
612
+ def detect_nested_group_by(group_specs)
613
+ Enumerator.new {|y|
614
+ first = true
615
+ prev_reps = nil
616
+ prev = nil
617
+ self.each {|*curr|
618
+ reps = group_specs.map {|representative_proc, _, _|
619
+ representative_proc.call(*curr)
620
+ }
621
+ if first
622
+ first = false
623
+ group_specs.each {|_, before_proc, _|
624
+ before_proc.call(*curr) if before_proc
625
+ }
626
+ else
627
+ different_index = (0...group_specs.length).find {|i| prev_reps[i] != reps[i] }
628
+ if different_index
629
+ (group_specs.length-1).downto(different_index) {|i|
630
+ _, _, after_proc = group_specs[i]
631
+ after_proc.call(*prev) if after_proc
632
+ }
633
+ different_index.upto(group_specs.length-1) {|i|
634
+ _, before_proc, _ = group_specs[i]
635
+ before_proc.call(*curr) if before_proc
636
+ }
637
+ end
638
+ end
639
+ y.yield(*curr)
640
+ prev_reps = reps
641
+ prev = curr
642
+ }
643
+ if !first
644
+ (group_specs.length-1).downto(0) {|i|
645
+ _, _, after_proc = group_specs[i]
646
+ after_proc.call(*prev) if after_proc
647
+ }
648
+ end
649
+ }
650
+ end
651
+
652
+ def lazy_map
653
+ Enumerator.new {|y|
654
+ self.each {|*vs|
655
+ y.yield(yield(*vs))
656
+ }
657
+ }
658
+ end
659
+ end