tb 0.3 → 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README +2 -1
- data/lib/tb.rb +7 -3
- data/lib/tb/basic.rb +1 -1
- data/lib/tb/cmd_cat.rb +1 -3
- data/lib/tb/cmd_consecutive.rb +4 -6
- data/lib/tb/cmd_crop.rb +5 -7
- data/lib/tb/cmd_cross.rb +51 -49
- data/lib/tb/cmd_cut.rb +2 -6
- data/lib/tb/cmd_git_log.rb +20 -11
- data/lib/tb/cmd_grep.rb +1 -3
- data/lib/tb/cmd_group.rb +18 -44
- data/lib/tb/cmd_gsub.rb +2 -4
- data/lib/tb/cmd_join.rb +1 -3
- data/lib/tb/cmd_ls.rb +8 -15
- data/lib/tb/cmd_mheader.rb +3 -4
- data/lib/tb/cmd_nest.rb +4 -9
- data/lib/tb/cmd_newfield.rb +1 -3
- data/lib/tb/cmd_rename.rb +2 -4
- data/lib/tb/cmd_shape.rb +2 -3
- data/lib/tb/cmd_sort.rb +3 -5
- data/lib/tb/cmd_svn_log.rb +3 -5
- data/lib/tb/cmd_tar_tvf.rb +2 -4
- data/lib/tb/cmd_to_csv.rb +1 -1
- data/lib/tb/cmd_unnest.rb +1 -3
- data/lib/tb/cmdutil.rb +57 -135
- data/lib/tb/csv.rb +11 -54
- data/lib/tb/customcmp.rb +41 -0
- data/lib/tb/customeq.rb +41 -0
- data/lib/tb/enumerable.rb +225 -435
- data/lib/tb/enumerator.rb +22 -14
- data/lib/tb/ex_enumerable.rb +659 -0
- data/lib/tb/ex_enumerator.rb +102 -0
- data/lib/tb/fileenumerator.rb +2 -2
- data/lib/tb/func.rb +141 -0
- data/lib/tb/json.rb +1 -1
- data/lib/tb/reader.rb +4 -4
- data/lib/tb/search.rb +2 -4
- data/lib/tb/zipper.rb +60 -0
- data/test/test_cmd_cat.rb +40 -0
- data/test/test_cmd_git_log.rb +116 -0
- data/test/test_cmd_ls.rb +90 -0
- data/test/test_cmd_svn_log.rb +87 -0
- data/test/test_cmd_to_csv.rb +14 -0
- data/test/test_cmdutil.rb +25 -10
- data/test/test_csv.rb +10 -0
- data/test/test_customcmp.rb +14 -0
- data/test/test_customeq.rb +20 -0
- data/test/{test_enumerable.rb → test_ex_enumerable.rb} +181 -3
- data/test/test_search.rb +2 -10
- data/test/test_tbenum.rb +3 -3
- data/test/test_zipper.rb +22 -0
- metadata +20 -8
- data/lib/tb/enum.rb +0 -294
- data/lib/tb/pairs.rb +0 -227
- data/test/test_pairs.rb +0 -122
data/lib/tb/enumerator.rb
CHANGED
@@ -27,10 +27,10 @@
|
|
27
27
|
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
28
28
|
|
29
29
|
class Tb::Yielder
|
30
|
-
def initialize(header_proc,
|
30
|
+
def initialize(header_proc, base_yielder)
|
31
31
|
@header_proc_called = false
|
32
32
|
@header_proc = header_proc
|
33
|
-
@
|
33
|
+
@base_yielder = base_yielder
|
34
34
|
end
|
35
35
|
attr_reader :header_proc_called
|
36
36
|
|
@@ -44,30 +44,38 @@ class Tb::Yielder
|
|
44
44
|
if !@header_proc_called
|
45
45
|
set_header(nil)
|
46
46
|
end
|
47
|
-
@
|
47
|
+
@base_yielder.yield(*args)
|
48
48
|
end
|
49
49
|
alias << yield
|
50
50
|
end
|
51
51
|
|
52
|
-
class Tb::Enumerator
|
53
|
-
include Tb::
|
52
|
+
class Tb::Enumerator < Enumerator
|
53
|
+
include Tb::Enumerable
|
54
54
|
|
55
|
-
def
|
56
|
-
|
55
|
+
def self.new(&enumerator_proc)
|
56
|
+
super() {|y|
|
57
|
+
header_proc = Thread.current[:tb_enumerator_header_proc]
|
58
|
+
ty = Tb::Yielder.new(header_proc, y)
|
59
|
+
enumerator_proc.call(ty)
|
60
|
+
if !ty.header_proc_called
|
61
|
+
header_proc.call(nil)
|
62
|
+
end
|
63
|
+
}
|
57
64
|
end
|
58
65
|
|
59
66
|
def each(&each_proc)
|
60
|
-
|
61
|
-
@enumerator_proc.call(yielder)
|
62
|
-
nil
|
67
|
+
header_and_each(nil, &each_proc)
|
63
68
|
end
|
64
69
|
|
65
70
|
def header_and_each(header_proc, &each_proc)
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
71
|
+
old = Thread.current[:tb_enumerator_header_proc]
|
72
|
+
begin
|
73
|
+
Thread.current[:tb_enumerator_header_proc] = header_proc
|
74
|
+
Enumerator.instance_method(:each).bind(self).call(&each_proc)
|
75
|
+
ensure
|
76
|
+
Thread.current[:tb_enumerator_header_proc] = old
|
70
77
|
end
|
71
78
|
nil
|
72
79
|
end
|
80
|
+
|
73
81
|
end
|
@@ -0,0 +1,659 @@
|
|
1
|
+
# lib/tb/ex_enumerable.rb - extensions for Enumerable
|
2
|
+
#
|
3
|
+
# Copyright (C) 2010-2012 Tanaka Akira <akr@fsij.org>
|
4
|
+
#
|
5
|
+
# Redistribution and use in source and binary forms, with or without
|
6
|
+
# modification, are permitted provided that the following conditions are met:
|
7
|
+
#
|
8
|
+
# 1. Redistributions of source code must retain the above copyright notice, this
|
9
|
+
# list of conditions and the following disclaimer.
|
10
|
+
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
11
|
+
# this list of conditions and the following disclaimer in the documentation
|
12
|
+
# and/or other materials provided with the distribution.
|
13
|
+
# 3. The name of the author may not be used to endorse or promote products
|
14
|
+
# derived from this software without specific prior written permission.
|
15
|
+
#
|
16
|
+
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
|
17
|
+
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
18
|
+
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
|
19
|
+
# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
20
|
+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
21
|
+
# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
22
|
+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
23
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
|
24
|
+
# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
|
25
|
+
# OF SUCH DAMAGE.
|
26
|
+
|
27
|
+
module Enumerable
|
28
|
+
# :call-seq:
|
29
|
+
# enum.tb_categorize(ksel1, ksel2, ..., vsel, [opts])
|
30
|
+
# enum.tb_categorize(ksel1, ksel2, ..., vsel, [opts]) {|ks, vs| ... }
|
31
|
+
#
|
32
|
+
# categorizes the elements in _enum_ and returns a hash.
|
33
|
+
# This method assumes multiple elements for a category.
|
34
|
+
#
|
35
|
+
# +tb_categorize+ takes one or more key selectors,
|
36
|
+
# one value selector and
|
37
|
+
# an optional option hash.
|
38
|
+
# It also takes an optional block.
|
39
|
+
#
|
40
|
+
# The selectors specify how to extract a value from an element in _enum_.
|
41
|
+
#
|
42
|
+
# The key selectors, _kselN_, are used to extract hash keys from an element.
|
43
|
+
# If two or more key selectors are specified, the result hash will be nested.
|
44
|
+
#
|
45
|
+
# The value selector, _vsel_, is used for the values of innermost hashes.
|
46
|
+
# By default, all values extracted by _vsel_ from the elements which
|
47
|
+
# key selectors extracts same value are composed as an array.
|
48
|
+
# The array is set to the values of the innermost hashes.
|
49
|
+
# This behavior can be customized by the options: :seed, :op and :update.
|
50
|
+
#
|
51
|
+
# a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
|
52
|
+
# {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
|
53
|
+
# {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
|
54
|
+
# p a.tb_categorize(:color, :fruit)
|
55
|
+
# #=> {"yellow"=>["banana", "grapefruit"], "green"=>["melon"]}
|
56
|
+
# p a.tb_categorize(:taste, :fruit)
|
57
|
+
# #=> {"sweet"=>["banana", "melon"], "tart"=>["grapefruit"]}
|
58
|
+
# p a.tb_categorize(:taste, :color, :fruit)
|
59
|
+
# #=> {"sweet"=>{"yellow"=>["banana"], "green"=>["melon"]}, "tart"=>{"yellow"=>["grapefruit"]}}
|
60
|
+
# p a.tb_categorize(:taste, :color)
|
61
|
+
# #=> {"sweet"=>["yellow", "green"], "tart"=>["yellow"]}
|
62
|
+
#
|
63
|
+
# In the above example, :fruit, :color and :taste are specified as selectors.
|
64
|
+
# There are several types of selectors as follows:
|
65
|
+
#
|
66
|
+
# - object with +call+ method (procedure, etc.): extracts a value from the element by calling the procedure with the element as an argument.
|
67
|
+
# - array of selectors: make an array which contains the values extracted by the selectors.
|
68
|
+
# - other object: extracts a value from the element using +[]+ method as +element[selector]+.
|
69
|
+
#
|
70
|
+
# So the selector :fruit extracts the value from the element
|
71
|
+
# {:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100}
|
72
|
+
# as {...}[:fruit].
|
73
|
+
#
|
74
|
+
# p a.tb_categorize(lambda {|elt| elt[:fruit][4] }, :fruit)
|
75
|
+
# #=> {"n"=>["banana", "melon"], "e"=>["grapefruit"]}
|
76
|
+
#
|
77
|
+
# When the key selectors return the same key for two or more elements,
|
78
|
+
# corresponding values extracted by the value selector are combined.
|
79
|
+
# By default, all values are collected as an array.
|
80
|
+
# :seed, :op and :update option in the option hash customizes this behavior.
|
81
|
+
# :seed option and :op option are similar to Enumerable#inject.
|
82
|
+
# :seed option specifies an initial value.
|
83
|
+
# (If :seed option is not given, the first value for each category is treated as an initial value.)
|
84
|
+
# :op option specifies a procedure to combine a seed and an element into a next seed.
|
85
|
+
# :update option is same as :op option except it takes three arguments instead of two:
|
86
|
+
# keys, seed and element.
|
87
|
+
# +to_proc+ method is used to convert :op and :update option to a procedure.
|
88
|
+
# So a symbol can be used for them.
|
89
|
+
#
|
90
|
+
# # count categorized elements.
|
91
|
+
# p a.tb_categorize(:color, lambda {|e| 1 }, :op=>:+)
|
92
|
+
# #=> {"yellow"=>2, "green"=>1}
|
93
|
+
#
|
94
|
+
# p a.tb_categorize(:color, :fruit, :seed=>"", :op=>:+)
|
95
|
+
# #=> {"yellow"=>"bananagrapefruit", "green"=>"melon"}
|
96
|
+
#
|
97
|
+
# The default behavior, collecting all values as an array, is implemented as follows.
|
98
|
+
# :seed => nil
|
99
|
+
# :update => {|ks, s, v| !s ? [v] : (s << v) }
|
100
|
+
#
|
101
|
+
# :op and :update option are disjoint.
|
102
|
+
# ArgumentError is raised if both are specified.
|
103
|
+
#
|
104
|
+
# The block for +tb_categorize+ method converts combined values to final innermost hash values.
|
105
|
+
#
|
106
|
+
# p a.tb_categorize(:color, :fruit) {|ks, vs| vs.join(",") }
|
107
|
+
# #=> {"yellow"=>"banana,grapefruit", "green"=>"melon"}
|
108
|
+
#
|
109
|
+
# # calculates the average price for fruits of each color.
|
110
|
+
# p a.tb_categorize(:color, :price) {|ks, vs| vs.inject(0.0, &:+) / vs.length }
|
111
|
+
# #=> {"yellow"=>150.0, "green"=>300.0}
|
112
|
+
#
|
113
|
+
def tb_categorize(*args, &reduce_proc)
|
114
|
+
opts = args.last.kind_of?(Hash) ? args.pop : {}
|
115
|
+
if args.length < 2
|
116
|
+
raise ArgumentError, "needs 2 or more arguments without option hash (but #{args.length})"
|
117
|
+
end
|
118
|
+
value_selector = tb_cat_selector_proc(args.pop)
|
119
|
+
key_selectors = args.map {|a| tb_cat_selector_proc(a) }
|
120
|
+
has_seed = opts.has_key? :seed
|
121
|
+
seed_value = opts[:seed]
|
122
|
+
if opts.has_key?(:update) && opts.has_key?(:op)
|
123
|
+
raise ArgumentError, "both :op and :update option specified"
|
124
|
+
elsif opts.has_key? :update
|
125
|
+
update_proc = opts[:update].to_proc
|
126
|
+
elsif opts.has_key? :op
|
127
|
+
op_proc = opts[:op].to_proc
|
128
|
+
update_proc = lambda {|ks, s, v| op_proc.call(s, v) }
|
129
|
+
else
|
130
|
+
has_seed = true
|
131
|
+
seed_value = nil
|
132
|
+
update_proc = lambda {|ks, s, v| !s ? [v] : (s << v) }
|
133
|
+
end
|
134
|
+
result = {}
|
135
|
+
each {|*elts|
|
136
|
+
elt = elts.length <= 1 ? elts[0] : elts
|
137
|
+
ks = key_selectors.map {|ksel| ksel.call(elt) }
|
138
|
+
v = value_selector.call(elt)
|
139
|
+
h = result
|
140
|
+
0.upto(ks.length-2) {|i|
|
141
|
+
k = ks[i]
|
142
|
+
h[k] = {} if !h.has_key?(k)
|
143
|
+
h = h[k]
|
144
|
+
}
|
145
|
+
lastk = ks.last
|
146
|
+
if !h.has_key?(lastk)
|
147
|
+
if has_seed
|
148
|
+
h[lastk] = update_proc.call(ks, seed_value, v)
|
149
|
+
else
|
150
|
+
h[lastk] = v
|
151
|
+
end
|
152
|
+
else
|
153
|
+
h[lastk] = update_proc.call(ks, h[lastk], v)
|
154
|
+
end
|
155
|
+
}
|
156
|
+
if reduce_proc
|
157
|
+
tb_cat_reduce(result, [], key_selectors.length-1, reduce_proc)
|
158
|
+
end
|
159
|
+
result
|
160
|
+
end
|
161
|
+
|
162
|
+
def tb_cat_selector_proc(selector)
|
163
|
+
if selector.respond_to?(:call)
|
164
|
+
selector
|
165
|
+
elsif selector.respond_to? :to_ary
|
166
|
+
selector_procs = selector.to_ary.map {|sel| tb_cat_selector_proc(sel) }
|
167
|
+
lambda {|elt| selector_procs.map {|selproc| selproc.call(elt) } }
|
168
|
+
else
|
169
|
+
lambda {|elt| elt[selector] }
|
170
|
+
end
|
171
|
+
end
|
172
|
+
private :tb_cat_selector_proc
|
173
|
+
|
174
|
+
def tb_cat_reduce(hash, ks, nestlevel, reduce_proc)
|
175
|
+
if nestlevel.zero?
|
176
|
+
hash.each {|k, v|
|
177
|
+
ks << k
|
178
|
+
begin
|
179
|
+
hash[k] = reduce_proc.call(ks.dup, v)
|
180
|
+
ensure
|
181
|
+
ks.pop
|
182
|
+
end
|
183
|
+
}
|
184
|
+
else
|
185
|
+
hash.each {|k, h|
|
186
|
+
ks << k
|
187
|
+
begin
|
188
|
+
tb_cat_reduce(h, ks, nestlevel-1, reduce_proc)
|
189
|
+
ensure
|
190
|
+
ks.pop
|
191
|
+
end
|
192
|
+
}
|
193
|
+
end
|
194
|
+
end
|
195
|
+
private :tb_cat_reduce
|
196
|
+
|
197
|
+
# :call-seq:
|
198
|
+
# enum.tb_unique_categorize(ksel1, ksel2, ..., vsel, [opts]) -> hash
|
199
|
+
# enum.tb_unique_categorize(ksel1, ksel2, ..., vsel, [opts]) {|s, v| ... } -> hash
|
200
|
+
#
|
201
|
+
# categorizes the elements in _enum_ and returns a hash.
|
202
|
+
# This method assumes one element for a category by default.
|
203
|
+
#
|
204
|
+
# +tb_unique_categorize+ takes one or more key selectors,
|
205
|
+
# one value selector and
|
206
|
+
# an optional option hash.
|
207
|
+
# It also takes an optional block.
|
208
|
+
#
|
209
|
+
# The selectors specify how to extract a value from an element in _enum_.
|
210
|
+
# See Enumerable#tb_categorize for details of selectors.
|
211
|
+
#
|
212
|
+
# The key selectors, _kselN_, are used to extract hash keys from an element.
|
213
|
+
# If two or more key selectors are specified, the result hash will be nested.
|
214
|
+
#
|
215
|
+
# The value selector, _vsel_, is used for the values of innermost hashes.
|
216
|
+
# By default, this method assumes the key selectors categorizes elements in enum uniquely.
|
217
|
+
# If the key selectors generates same keys for two or more elements, ArgumentError is raised.
|
218
|
+
# This behavior can be customized by :seed option and the block.
|
219
|
+
#
|
220
|
+
# a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
|
221
|
+
# {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
|
222
|
+
# {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
|
223
|
+
# p a.tb_unique_categorize(:fruit, :price)
|
224
|
+
# #=> {"banana"=>100, "melon"=>300, "grapefruit"=>200}
|
225
|
+
#
|
226
|
+
# p a.tb_unique_categorize(:color, :price)
|
227
|
+
# # ArgumentError
|
228
|
+
#
|
229
|
+
# If the block is given, it is used for combining values in a category.
|
230
|
+
# The arguments for the block are a seed and the value extracted by _vsel_.
|
231
|
+
# The return value of the block is used as the next seed.
|
232
|
+
# :seed option specifies the initial seed.
|
233
|
+
# If :seed is not given, the first value for each category is used for the seed.
|
234
|
+
#
|
235
|
+
# p a.tb_unique_categorize(:taste, :price) {|s, v| s + v }
|
236
|
+
# #=> {"sweet"=>400, "tart"=>200}
|
237
|
+
#
|
238
|
+
# p a.tb_unique_categorize(:color, :price) {|s, v| s + v }
|
239
|
+
# #=> {"yellow"=>300, "green"=>300}
|
240
|
+
#
|
241
|
+
def tb_unique_categorize(*args, &update_proc)
|
242
|
+
opts = args.last.kind_of?(Hash) ? args.pop.dup : {}
|
243
|
+
if update_proc
|
244
|
+
opts[:update] = lambda {|ks, s, v| update_proc.call(s, v) }
|
245
|
+
else
|
246
|
+
seed = Object.new
|
247
|
+
opts[:seed] = seed
|
248
|
+
opts[:update] = lambda {|ks, s, v|
|
249
|
+
if s.equal? seed
|
250
|
+
v
|
251
|
+
else
|
252
|
+
raise ArgumentError, "ambiguous key: #{ks.map {|k| k.inspect }.join(',')}"
|
253
|
+
end
|
254
|
+
}
|
255
|
+
end
|
256
|
+
tb_categorize(*(args + [opts]))
|
257
|
+
end
|
258
|
+
|
259
|
+
# :call-seq:
|
260
|
+
# enum.tb_category_count(ksel1, ksel2, ...)
|
261
|
+
#
|
262
|
+
# counts elements in _enum_ for each category defined by the key selectors.
|
263
|
+
#
|
264
|
+
# a = [{:fruit => "banana", :color => "yellow", :taste => "sweet", :price => 100},
|
265
|
+
# {:fruit => "melon", :color => "green", :taste => "sweet", :price => 300},
|
266
|
+
# {:fruit => "grapefruit", :color => "yellow", :taste => "tart", :price => 200}]
|
267
|
+
#
|
268
|
+
# p a.tb_category_count(:color)
|
269
|
+
# #=> {"yellow"=>2, "green"=>1}
|
270
|
+
#
|
271
|
+
# p a.tb_category_count(:taste)
|
272
|
+
# #=> {"sweet"=>2, "tart"=>1}
|
273
|
+
#
|
274
|
+
# p a.tb_category_count(:taste, :color)
|
275
|
+
# #=> {"sweet"=>{"yellow"=>1, "green"=>1}, "tart"=>{"yellow"=>1}}
|
276
|
+
#
|
277
|
+
# The selectors specify how to extract a value from an element in _enum_.
|
278
|
+
# See Enumerable#tb_categorize for details of selectors.
|
279
|
+
#
|
280
|
+
def tb_category_count(*args)
|
281
|
+
tb_categorize(*(args + [lambda {|e| 1 }, {:update => lambda {|ks, s, v| s + v }}]))
|
282
|
+
end
|
283
|
+
|
284
|
+
def dump_objsfile(title, tempfile)
|
285
|
+
tempfile.flush
|
286
|
+
path = tempfile
|
287
|
+
a = []
|
288
|
+
open(path) {|f|
|
289
|
+
until f.eof?
|
290
|
+
pair = Marshal.load(f)
|
291
|
+
a << (pair ? pair.last : :sep)
|
292
|
+
end
|
293
|
+
}
|
294
|
+
puts "#{title}: #{a.inspect}"
|
295
|
+
end
|
296
|
+
private :dump_objsfile
|
297
|
+
|
298
|
+
# :call-seq:
|
299
|
+
# enum.extsort_by(options={}) {|value| cmpvalue }
|
300
|
+
#
|
301
|
+
# +extsort_by+ returns an enumerator which yields elements in the receiver in sorted order.
|
302
|
+
# The block defines the order which cmpvalue is ascending.
|
303
|
+
#
|
304
|
+
# options:
|
305
|
+
# :map : a procedure to convert the element. It is applied after cmpvalue is obtained. (default: nil)
|
306
|
+
# :unique : a procedure to merge two values which has same cmpvalue. (default: nil)
|
307
|
+
# :memsize : limit in-memory sorting size in bytes (default: 10000000)
|
308
|
+
#
|
309
|
+
# If :unique option is given, it is used to merge
|
310
|
+
# elements which have same cmpvalue.
|
311
|
+
# The procedure should take two elements and return one.
|
312
|
+
# The procedure should be associative. (f(x,f(y,z)) = f(f(x,y),z))
|
313
|
+
#
|
314
|
+
def extsort_by(opts={}, &cmpvalue_from)
|
315
|
+
mapfunc = opts[:map]
|
316
|
+
opts = opts.dup
|
317
|
+
opts[:map] = mapfunc ?
|
318
|
+
lambda {|v| Marshal.dump(mapfunc.call(v)) } :
|
319
|
+
lambda {|v| Marshal.dump(v) }
|
320
|
+
uniqfunc = opts[:unique]
|
321
|
+
if uniqfunc
|
322
|
+
opts[:unique] = lambda {|x, y| Marshal.dump(uniqfunc.call(Marshal.load(x), Marshal.load(y))) }
|
323
|
+
end
|
324
|
+
reducefunc = opts[:unique]
|
325
|
+
mapfunc2 = opts[:map] || lambda {|v| v }
|
326
|
+
self.lazy_map {|v|
|
327
|
+
[cmpvalue_from.call(v), mapfunc2.call(v)]
|
328
|
+
}.send(:extsort_internal0, reducefunc, opts).lazy_map {|k, d|
|
329
|
+
Marshal.load(d)
|
330
|
+
}
|
331
|
+
end
|
332
|
+
|
333
|
+
# :call-seq:
|
334
|
+
# enum.extsort_reduce(op, [opts]) {|element| [key, val] }
|
335
|
+
#
|
336
|
+
def extsort_reduce(op, opts={}, &key_val_proc)
|
337
|
+
lazy_map(&key_val_proc).send(:extsort_internal0, op, opts)
|
338
|
+
end
|
339
|
+
|
340
|
+
def extsort_internal0(reducefunc, opts={})
|
341
|
+
if reducefunc.is_a? Symbol
|
342
|
+
reducefunc = reducefunc.to_proc
|
343
|
+
end
|
344
|
+
opts = opts.dup
|
345
|
+
opts[:memsize] ||= 10000000
|
346
|
+
Enumerator.new {|y|
|
347
|
+
extsort_internal1(reducefunc, opts, y)
|
348
|
+
}
|
349
|
+
end
|
350
|
+
private :extsort_internal0
|
351
|
+
|
352
|
+
def extsort_internal1(reducefunc, opts, y)
|
353
|
+
tmp1 = Tempfile.new("tbsortA")
|
354
|
+
tmp2 = Tempfile.new("tbsortB")
|
355
|
+
extsort_first_split(tmp1, tmp2, reducefunc, opts)
|
356
|
+
if tmp1.size == 0 && tmp2.size == 0
|
357
|
+
return Enumerator.new {|_| }
|
358
|
+
end
|
359
|
+
tmp3 = Tempfile.new("tbsortC")
|
360
|
+
tmp4 = Tempfile.new("tbsortD")
|
361
|
+
while tmp2.size != 0
|
362
|
+
#dump_objsfile(:tmp1, tmp1)
|
363
|
+
#dump_objsfile(:tmp2, tmp2)
|
364
|
+
#dump_objsfile(:tmp3, tmp3)
|
365
|
+
#dump_objsfile(:tmp4, tmp4)
|
366
|
+
extsort_merge(tmp1, tmp2, tmp3, tmp4, reducefunc, opts)
|
367
|
+
tmp1.rewind
|
368
|
+
tmp1.truncate(0)
|
369
|
+
tmp2.rewind
|
370
|
+
tmp2.truncate(0)
|
371
|
+
tmp1, tmp2, tmp3, tmp4 = tmp3, tmp4, tmp1, tmp2
|
372
|
+
end
|
373
|
+
#dump_objsfile(:tmp1, tmp1)
|
374
|
+
#dump_objsfile(:tmp2, tmp2)
|
375
|
+
#dump_objsfile(:tmp3, tmp3)
|
376
|
+
#dump_objsfile(:tmp4, tmp4)
|
377
|
+
extsort_yield(tmp1, y)
|
378
|
+
ensure
|
379
|
+
tmp1.close(true) if tmp1
|
380
|
+
tmp2.close(true) if tmp2
|
381
|
+
tmp3.close(true) if tmp3
|
382
|
+
tmp4.close(true) if tmp4
|
383
|
+
end
|
384
|
+
private :extsort_internal1
|
385
|
+
|
386
|
+
def extsort_first_split(tmp1, tmp2, reducefunc, opts)
|
387
|
+
prevobj_cv = nil
|
388
|
+
prevobj_dumped = nil
|
389
|
+
tmp_current, tmp_another = tmp1, tmp2
|
390
|
+
buf = {}
|
391
|
+
buf_size = 0
|
392
|
+
buf_mode = true
|
393
|
+
self.each_with_index {|v, i|
|
394
|
+
obj_cv, obj = v
|
395
|
+
#p [obj, obj_cv]
|
396
|
+
#p [prevobj_cv, buf_mode, obj, obj_cv]
|
397
|
+
if buf_mode
|
398
|
+
dumped = Marshal.dump([obj_cv, obj])
|
399
|
+
ary = (buf[obj_cv] ||= [])
|
400
|
+
ary << [obj_cv, i, dumped]
|
401
|
+
buf_size += dumped.size
|
402
|
+
if reducefunc && ary.length == 2
|
403
|
+
obj1_cv, i1, dumped1 = ary[0]
|
404
|
+
_, _, dumped2 = ary[1]
|
405
|
+
_, obj1 = Marshal.load(dumped1)
|
406
|
+
_, obj2 = Marshal.load(dumped2)
|
407
|
+
obju = reducefunc.call(obj1, obj2)
|
408
|
+
buf[obj1_cv] = [[obj1_cv, i1, Marshal.dump([obj1_cv, obju])]]
|
409
|
+
end
|
410
|
+
if opts[:memsize] < buf_size
|
411
|
+
buf_keys = buf.keys.sort
|
412
|
+
(0...(buf_keys.length-1)).each {|j|
|
413
|
+
cv = buf_keys[j]
|
414
|
+
buf[cv].each {|_, _, d|
|
415
|
+
tmp_current.write d
|
416
|
+
}
|
417
|
+
}
|
418
|
+
ary = buf[buf_keys.last]
|
419
|
+
(0...(ary.length-1)).each {|j|
|
420
|
+
_, _, d = ary[j]
|
421
|
+
tmp_current.write d
|
422
|
+
}
|
423
|
+
prevobj_cv, _, prevobj_dumped = ary[-1]
|
424
|
+
buf.clear
|
425
|
+
buf_mode = false
|
426
|
+
end
|
427
|
+
elsif (cmp = (prevobj_cv <=> obj_cv)) == 0 && reducefunc
|
428
|
+
_, obj1 = Marshal.load(prevobj_dumped)
|
429
|
+
obj2 = obj
|
430
|
+
obju = reducefunc.call(obj1, obj2)
|
431
|
+
prevobj_dumped = Marshal.dump([prevobj_cv, obju])
|
432
|
+
elsif cmp <= 0
|
433
|
+
tmp_current.write prevobj_dumped
|
434
|
+
prevobj_dumped = Marshal.dump([obj_cv, obj])
|
435
|
+
prevobj_cv = obj_cv
|
436
|
+
else
|
437
|
+
tmp_current.write prevobj_dumped
|
438
|
+
Marshal.dump(nil, tmp_current)
|
439
|
+
dumped = Marshal.dump([obj_cv, obj])
|
440
|
+
buf = { obj_cv => [[obj_cv, i, dumped]] }
|
441
|
+
buf_size = dumped.size
|
442
|
+
buf_mode = true
|
443
|
+
tmp_current, tmp_another = tmp_another, tmp_current
|
444
|
+
prevobj_cv = nil
|
445
|
+
prevobj_dumped = nil
|
446
|
+
end
|
447
|
+
}
|
448
|
+
if buf_mode
|
449
|
+
buf_keys = buf.keys.sort
|
450
|
+
buf_keys.each {|cv|
|
451
|
+
buf[cv].each {|_, _, d|
|
452
|
+
tmp_current.write d
|
453
|
+
}
|
454
|
+
}
|
455
|
+
else
|
456
|
+
tmp_current.write prevobj_dumped
|
457
|
+
end
|
458
|
+
if !buf_mode || !buf.empty?
|
459
|
+
Marshal.dump(nil, tmp_current)
|
460
|
+
end
|
461
|
+
end
|
462
|
+
private :extsort_first_split
|
463
|
+
|
464
|
+
def extsort_merge(src1, src2, dst1, dst2, reducefunc, opts)
|
465
|
+
src1.rewind
|
466
|
+
src2.rewind
|
467
|
+
obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
|
468
|
+
obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
|
469
|
+
prefer1 = true
|
470
|
+
while true
|
471
|
+
cmp = obj1_cv <=> obj2_cv
|
472
|
+
if prefer1 ? cmp > 0 : cmp >= 0
|
473
|
+
obj1_pair, obj1_cv, obj1, src1, obj2_pair, obj2_cv, obj2, src2 =
|
474
|
+
obj2_pair, obj2_cv, obj2, src2, obj1_pair, obj1_cv, obj1, src1
|
475
|
+
prefer1 = !prefer1
|
476
|
+
end
|
477
|
+
if reducefunc && cmp == 0
|
478
|
+
Marshal.dump([obj1_cv, reducefunc.call(obj1, obj2)], dst1)
|
479
|
+
obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
|
480
|
+
obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
|
481
|
+
if obj1_pair && !obj2_pair
|
482
|
+
obj1_pair, obj1_cv, obj1, src1, obj2_pair, obj2_cv, obj2, src2 =
|
483
|
+
obj2_pair, obj2_cv, obj2, src2, obj1_pair, obj1_cv, obj1, src1
|
484
|
+
prefer1 = !prefer1
|
485
|
+
end
|
486
|
+
else
|
487
|
+
Marshal.dump([obj1_cv, obj1], dst1)
|
488
|
+
obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
|
489
|
+
end
|
490
|
+
if !obj1_pair
|
491
|
+
while obj2_pair
|
492
|
+
Marshal.dump(obj2_pair, dst1)
|
493
|
+
obj2_pair = Marshal.load(src2)
|
494
|
+
end
|
495
|
+
Marshal.dump(nil, dst1)
|
496
|
+
dst1, dst2 = dst2, dst1
|
497
|
+
break if src1.eof?
|
498
|
+
break if src2.eof?
|
499
|
+
obj1_cv, obj1 = obj1_pair = Marshal.load(src1)
|
500
|
+
obj2_cv, obj2 = obj2_pair = Marshal.load(src2)
|
501
|
+
end
|
502
|
+
end
|
503
|
+
if !src1.eof?
|
504
|
+
restsrc = src1
|
505
|
+
elsif !src2.eof?
|
506
|
+
restsrc = src2
|
507
|
+
else
|
508
|
+
return
|
509
|
+
end
|
510
|
+
until restsrc.eof?
|
511
|
+
restobj_pair = Marshal.load(restsrc)
|
512
|
+
Marshal.dump(restobj_pair, dst1)
|
513
|
+
end
|
514
|
+
end
|
515
|
+
private :extsort_merge
|
516
|
+
|
517
|
+
def extsort_yield(tmp1, y)
|
518
|
+
tmp1.rewind
|
519
|
+
while true
|
520
|
+
pair = Marshal.load(tmp1)
|
521
|
+
break if !pair
|
522
|
+
y.yield pair
|
523
|
+
end
|
524
|
+
end
|
525
|
+
private :extsort_yield
|
526
|
+
|
527
|
+
# splits self by _representative_ which is called with an element.
|
528
|
+
#
|
529
|
+
# _before_group_ is called before each group with the first element.
|
530
|
+
# _after_group_ is called after each group with the last element.
|
531
|
+
# _body_ is called for each element.
|
532
|
+
#
|
533
|
+
def each_group_element_by(representative, before_group, body, after_group)
|
534
|
+
detect_group_by(before_group, after_group, &representative).each(&body)
|
535
|
+
end
|
536
|
+
|
537
|
+
# creates an enumerator which yields same as self but
|
538
|
+
# given block and procedures are called between each element for grouping.
|
539
|
+
#
|
540
|
+
# The block is called for each element to define groups.
|
541
|
+
# A group is consecutive elements for which the block returns the same value.
|
542
|
+
#
|
543
|
+
# _before_group_ is called before each group with the first element.
|
544
|
+
#
|
545
|
+
# _after_group_ is called after each group with the last element.
|
546
|
+
#
|
547
|
+
# _before_group_ and _after_group_ are optional.
|
548
|
+
#
|
549
|
+
# The grouping mechanism is called "control break" in some cultures such as COBOL.
|
550
|
+
#
|
551
|
+
# Consecutive even numbers and odd numbers can be grouped as follows.
|
552
|
+
#
|
553
|
+
# [1,3,5,4,8].detect_group_by(
|
554
|
+
# lambda {|v| puts "start" },
|
555
|
+
# lambda {|v| puts "end" }) {|v| v.even? }.each {|x| p x }
|
556
|
+
# #=> start
|
557
|
+
# # 1
|
558
|
+
# # 3
|
559
|
+
# # 5
|
560
|
+
# # end
|
561
|
+
# # start
|
562
|
+
# # 4
|
563
|
+
# # 8
|
564
|
+
# # end
|
565
|
+
#
|
566
|
+
# Note that +detect_group_by+ can be cascaded but
|
567
|
+
# it doesn't work in a nested manner.
|
568
|
+
#
|
569
|
+
# (0..9).detect_group_by(
|
570
|
+
# lambda {|v| print "[" },
|
571
|
+
# lambda {|v| print "]" }) {|v|
|
572
|
+
# v.even?
|
573
|
+
# }.detect_group_by(
|
574
|
+
# lambda {|v| print "(" },
|
575
|
+
# lambda {|v| print ")" }) {|v|
|
576
|
+
# (v/2).even?
|
577
|
+
# }.each {|x| print x }
|
578
|
+
# #=> [(0][1][)(2][3][)(4][5][)(6][7][)(8][9])
|
579
|
+
#
|
580
|
+
# Consider +detect_nested_group_by+ for nested groups.
|
581
|
+
#
|
582
|
+
def detect_group_by(before_group=nil, after_group=nil, &representative_proc)
|
583
|
+
detect_nested_group_by([[representative_proc, before_group, after_group]])
|
584
|
+
end
|
585
|
+
|
586
|
+
# creates an enumerator which yields same as self but
|
587
|
+
# nested groups detected by _group_specs_
|
588
|
+
#
|
589
|
+
# _group_specs_ is an array of three-procedure arrays, as follows:
|
590
|
+
#
|
591
|
+
# [[representative_proc1, before_proc1, after_proc1],
|
592
|
+
# [representative_proc2, before_proc2, after_proc2],
|
593
|
+
# ...]
|
594
|
+
#
|
595
|
+
# _representative_proc1_ splits elements as groups.
|
596
|
+
# The group is defined as consecutive elements which _representative_proc1_ returns same value.
|
597
|
+
# _before_proc1_ is called before each group.
|
598
|
+
# _after_proc1_ is called after each group.
|
599
|
+
#
|
600
|
+
# Subsequent procedures, _representative_proc2_, _before_proc2_, _after_proc2_, ..., are
|
601
|
+
# used to split elements in the above groups.
|
602
|
+
#
|
603
|
+
# (0..9).detect_nested_group_by(
|
604
|
+
# [[lambda {|v| (v/2).even? },
|
605
|
+
# lambda {|v| print "(" },
|
606
|
+
# lambda {|v| print ")" }],
|
607
|
+
# [lambda {|v| v.even? },
|
608
|
+
# lambda {|v| print "[" },
|
609
|
+
# lambda {|v| print "]" }]]).each {|x| print x }
|
610
|
+
# #=> ([0][1])([2][3])([4][5])([6][7])([8][9])
|
611
|
+
#
|
612
|
+
def detect_nested_group_by(group_specs)
|
613
|
+
Enumerator.new {|y|
|
614
|
+
first = true
|
615
|
+
prev_reps = nil
|
616
|
+
prev = nil
|
617
|
+
self.each {|*curr|
|
618
|
+
reps = group_specs.map {|representative_proc, _, _|
|
619
|
+
representative_proc.call(*curr)
|
620
|
+
}
|
621
|
+
if first
|
622
|
+
first = false
|
623
|
+
group_specs.each {|_, before_proc, _|
|
624
|
+
before_proc.call(*curr) if before_proc
|
625
|
+
}
|
626
|
+
else
|
627
|
+
different_index = (0...group_specs.length).find {|i| prev_reps[i] != reps[i] }
|
628
|
+
if different_index
|
629
|
+
(group_specs.length-1).downto(different_index) {|i|
|
630
|
+
_, _, after_proc = group_specs[i]
|
631
|
+
after_proc.call(*prev) if after_proc
|
632
|
+
}
|
633
|
+
different_index.upto(group_specs.length-1) {|i|
|
634
|
+
_, before_proc, _ = group_specs[i]
|
635
|
+
before_proc.call(*curr) if before_proc
|
636
|
+
}
|
637
|
+
end
|
638
|
+
end
|
639
|
+
y.yield(*curr)
|
640
|
+
prev_reps = reps
|
641
|
+
prev = curr
|
642
|
+
}
|
643
|
+
if !first
|
644
|
+
(group_specs.length-1).downto(0) {|i|
|
645
|
+
_, _, after_proc = group_specs[i]
|
646
|
+
after_proc.call(*prev) if after_proc
|
647
|
+
}
|
648
|
+
end
|
649
|
+
}
|
650
|
+
end
|
651
|
+
|
652
|
+
def lazy_map
|
653
|
+
Enumerator.new {|y|
|
654
|
+
self.each {|*vs|
|
655
|
+
y.yield(yield(*vs))
|
656
|
+
}
|
657
|
+
}
|
658
|
+
end
|
659
|
+
end
|