mrtoolkit 0.1.2
- data/.document +5 -0
- data/.gitignore +6 -0
- data/Makefile +6 -0
- data/README.rdoc +19 -0
- data/Rakefile +57 -0
- data/VERSION.yml +4 -0
- data/examples/Rakefile +80 -0
- data/examples/Readme +12 -0
- data/examples/hour.rb +57 -0
- data/examples/import-logs +14 -0
- data/examples/import.rb +22 -0
- data/examples/ip-result.rb +33 -0
- data/examples/ip-size.rb +33 -0
- data/examples/ip-ua.rb +36 -0
- data/examples/ip.rb +10 -0
- data/examples/section.rb +37 -0
- data/examples/top-file.rb +36 -0
- data/lib/mrtoolkit.rb +908 -0
- data/lib/regression.rb +33 -0
- data/lib/stream_runner.rb +100 -0
- data/mrtoolkit.gemspec +79 -0
- data/standalone/hadoop +104 -0
- data/test/Rakefile +21 -0
- data/test/test-in/test1-in +2 -0
- data/test/test-in/test2-in +4 -0
- data/test/test-in/test3-in +5 -0
- data/test/test-in/test4-in +6 -0
- data/test/test-in/test5-in +12 -0
- data/test/test-in/test6-in +3 -0
- data/test/test-in/test7-in +20 -0
- data/test/test-in/test8-in +12 -0
- data/test/test-in/test9-in +6 -0
- data/test/utest.rb +471 -0
- metadata +104 -0
data/lib/mrtoolkit.rb
ADDED
@@ -0,0 +1,908 @@
require 'pp'
require 'stream_runner'

# Store information about a processing stage.
# Includes input and output field names, field separators,
# and the filenames processed by the stage.
class Stage
  attr_reader :in_fields, :out_fields
  attr_reader :in_sep, :out_sep
  attr_reader :errors

  def initialize(*args)
  end

  def field name
    @in_fields = [] unless @in_fields
    @in_fields << name.to_sym
  end
  def emit name
    @out_fields = [] unless @out_fields
    @out_fields << name.to_sym
  end
  def field_separator sep
    @in_sep = sep
  end
  def emit_separator sep
    @out_sep = sep
  end
  def catch_errors
    @catch_errors = true
  end

  def declare
  end

  # Create the input and output structures.
  def prepare
    @in_sep = "\t" unless @in_sep
    @out_sep = "\t" unless @out_sep
    @input_type = Struct.new(*@in_fields)
    @output_type = Struct.new(*@out_fields)
    @errors = 0
  end

  # Copies all fields of one struct to another.
  # Some initial fields can be skipped.
  def copy_struct(src, dest, skip = 0)
    (0..src.length-1-skip).each {|i| dest[i] = src[i+skip]}
    dest
  end

  # Write any output.
  def write_out(output)
    if output
      outs = @out_fields.collect { |f| output[f].to_s.chomp }
      @out_fd.puts outs.join(@out_sep)
    end
  end

  def new_input(line = nil)
    input = @input_type.new
    return input unless line
    fields = line.chomp.split(@in_sep)
    @in_fields.each_index { |i| input[i] = fields[i] }
    input
  end
  def new_output
    @output_type.new
  end

  # Process one line of a map or reduce file:
  # create the output record, call the given function,
  # then collect the output and write it out.
  def process_step(fun, input = nil)
    begin
      out = send(fun, input, new_output)
      if out
        out = [out] unless out.class == Array
        out.each {|o| write_out(o)}
      end
    rescue StandardError
      STDERR.puts "Error: #{$!}"
      @errors += 1
      raise unless @catch_errors
    end
  end
end
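
# Editorial note (not part of the original file): a worked example of the
# declaration DSL above. A stage that declares
#
#   field :ip
#   field :bytes
#
# builds @input_type = Struct.new(:ip, :bytes), so new_input on the
# tab-separated line "1.2.3.4\t512" yields input.ip == "1.2.3.4" and
# input.bytes == "512". Fields arrive as strings; numeric work needs an
# explicit to_i or to_f.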

# This class allows uniform processing of File and STDIN/STDOUT
# file descriptors.
# It must be passed a block, which gets the open file descriptor.
class StreamIO
  def self.open(f, mode = "r")
    if f.class == String
      fp = File.open(f, mode)
      yield(fp)
      fp.close
    elsif f.class == IO
      yield(f)
    end
  end
end

############################
# Base class for map
############################
# Map Stage
# Creates an object to hold input lines after they have been
# parsed and separated into fields.
# Reads input and feeds it to the process method, then collects output.
class MapBase < Stage

  # Called at the beginning of map.
  # No input.
  def process_begin(dummy, output)
    nil
  end
  # Called for each record.
  def process(input, output)
    nil
  end
  # Called at the end of map.
  def process_end(dummy, output)
    nil
  end

  def run(in_fd, out_fd)
    @out_fd = out_fd
    process_step(:process_begin, nil)
    input = nil
    in_fd.each_line do |line|
      @raw_input = line
      input = new_input(line)
      process_step(:process, input)
    end
    process_step(:process_end, nil)
  end
end
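
# Editorial example (not part of the original file): a minimal sketch of a
# concrete mapper. The IpBytesMap name and its fields are assumptions made
# up for illustration.
#
#   class IpBytesMap < MapBase
#     def declare
#       field :ip
#       field :bytes
#
#       emit :ip
#       emit :bytes
#     end
#     def process(input, output)
#       output.ip = input.ip
#       output.bytes = input.bytes
#       output    # returning nil would emit nothing for this record
#     end
#   end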

############################
# Base class for reduce
############################
# Reduce Stage
# Creates an object to hold input lines after they have been
# parsed and separated into fields.
# Reads input and feeds it to the process method, then collects output.
# Reduce input is map output.
class ReduceBase < Stage
  # This suite of functions is called on the records based on
  # their first field.
  # For each value of the first field, process_init is called first,
  # then process_each is called for each record,
  # then process_term is called after the last one.
  # The client can implement only process_term to see each unique value once.
  # When process_term is called, input is already on the next record, so the
  # first field of the run that just ended is in @last.

  # Called at the beginning of a run of equal values of the first field.
  def process_init(input, output)
    nil
  end
  # Called for each one of the equal values.
  def process_each(input, output)
    nil
  end
  # Called after the run of equal values.
  # No input record. Previous value of the first field is in @last.
  def process_term(dummy, output)
    nil
  end

  # Called at the beginning of reduction.
  # No input.
  def process_begin(dummy, output)
    nil
  end
  # Called for each record.
  def process(input, output)
    nil
  end
  # Called at the end of reduction.
  def process_end(dummy, output)
    nil
  end

  # This suite of functions is called on all records.
  # The function process_begin is called first,
  # then process is called on each record,
  # then process_end is called last.
  # This default implementation drives the calls to process_init,
  # process_each, and process_term.
  # The client can omit process_begin and process_end
  # and just implement process to see each record.
  def process_internal(input)
    v = input[0]
    if @last.nil?
      process_step(:process_init, input)
      process_step(:process_each, input)
      @last = v
      return
    end
    if v == @last
      # As long as the key is the same, just process it.
      process_step(:process_each, input)
      return
    end
    # The run has ended.
    process_step(:process_term, input) if @last
    @last = v

    process_step(:process_init, input)
    process_step(:process_each, input)
  end
  def process_end_internal(dummy)
    process_step(:process_term, nil) if @last
  end

  # Run the reducer.
  # Call process_begin, then for each line call
  # process, then call process_end.
  # At each step, collect any output and write it out.
  def run(in_fd, out_fd)
    @out_fd = out_fd
    @last = nil
    process_step(:process_begin, nil)

    input = nil # so it will survive the loop
    in_fd.each_line do |line|
      @raw_input = line
      input = new_input(line)
      process_internal(input)
      process_step(:process, input)
    end
    process_end_internal(nil)
    process_step(:process_end, nil)
  end
end
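
# Editorial example (not part of the original file): a minimal sketch of a
# reducer built on the process_init / process_each / process_term protocol,
# summing the second column within each run of equal first-column values.
# The BytesPerIpReduce name and its fields are assumptions made up for
# illustration; note that when process_term runs, the key of the run that
# just ended is in @last.
#
#   class BytesPerIpReduce < ReduceBase
#     def declare
#       field :ip
#       field :bytes
#
#       emit :ip
#       emit :total
#     end
#     def process_init(input, output)
#       @total = 0
#       nil
#     end
#     def process_each(input, output)
#       @total += input.bytes.to_i
#       nil
#     end
#     def process_term(dummy, output)
#       output.ip = @last
#       output.total = @total
#       output
#     end
#   end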

#########################################
# Pre-written map and reduce classes
#########################################

# Map that just copies its fields.
class CopyMap < MapBase
  def initialize(*args)
    if args.size < 1
      @n = 0
    else
      @n = args[0].to_i - 1
    end
  end
  def declare
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    copy_struct(input, output)
  end
end

# Map that selects rows according to a regular expression.
class SelectMap < MapBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @re = args[0]
    if args[1]
      @field = args[1]
    else
      @field = 0
    end
    if args[2]
      @n = args[2].to_i - 1
    else
      @n = 0
    end
  end
  def declare
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    if input[@field] =~ @re
      return copy_struct(input, output)
    end
    nil
  end
end
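
# Editorial note (not part of the original file): a usage sketch for the two
# maps above, with made-up arguments. CopyMap.new(3) passes three columns
# through unchanged; SelectMap.new(/^10\./, 0, 3) passes through only the
# rows whose first column matches /^10\./, copying three columns.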

# Reducer that collects all values.
# Outputs as many lines as input.
# Init with the number of fields to copy (default 1).
# Optional second arg is the number of initial fields to skip.
class CopyReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end
  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    copy_struct(input, output, @m+1)
  end
end

# Reducer that collects unique values.
# Outputs as many lines as there are unique values in the first field.
class UniqueReduce < ReduceBase
  def declare
    field :value

    emit :value
  end

  def process_term(input, output)
    output.value = @last
    output
  end
end

# Reducer that sums the given fields.
# Specify how many fields to sum (default 1).
# May optionally specify how many initial fields to skip.
# Outputs one line of sums.
class SumReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end
  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "count#{i}"}

    (0..@n).each {|i| emit "sum#{i}"}
  end

  def process_begin(dummy, output)
    @sum = Array.new(@n+1, 0)
    nil
  end
  def process(input, output)
    (0..@n).each {|i| @sum[i] += input[i+@m+1].to_f}
    nil
  end
  def process_end(dummy, output)
    (0..@n).each {|i| output[i] = @sum[i]}
    output
  end
end

# This reducer sums within each unique value of the first field.
# Outputs one line of sums for each unique value of the first field.
class UniqueSumReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    field :unique
    (0..@n).each {|i| field "count#{i}"}
    (0..@m).each {|i| field "extra#{i}"}

    emit :value
    (0..@n).each {|i| emit "sum#{i}"}
    (0..@m).each {|i| emit "extra#{i}"}
  end
  def process_init(input, output)
    @sum = Array.new(@n+1, 0)
    @extra = Array.new(@m+1)
    nil
  end
  def process_each(input, output)
    (0..@n).each {|i| @sum[i] += input[i+1].to_i}
    (0..@m).each {|i| @extra[i] = input[i+@n+2]}
    nil
  end
  def process_term(dummy, output)
    output.value = @last
    (0..@n).each {|i| output[i+1] = @sum[i]}
    (0..@m).each {|i| output[i+@n+2] = @extra[i]}
    output
  end
end
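
# Editorial note (not part of the original file): a worked example for
# UniqueSumReduce.new(1) with no extra columns. The sorted input
#
#   a  1
#   a  2
#   b  5
#
# reduces to
#
#   a  3
#   b  5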

# Reducer that counts within each unique value of the first field.
# Outputs one line of counts for each unique value of the first field.
class UniqueCountReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @m = args[0].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    field :unique
    (0..@m).each {|i| field "extra#{i}"}

    emit :value
    emit :count
    (0..@m).each {|i| emit "extra#{i}"}
  end
  def process_init(input, output)
    @count = 0
    @extra = Array.new(@m+1)
    nil
  end
  def process_each(input, output)
    @count += 1
    (0..@m).each {|i| @extra[i] = input[i+1]}
    nil
  end
  def process_term(dummy, output)
    output.value = @last
    output.count = @count
    (0..@m).each {|i| output[i+2] = @extra[i]}
    output
  end
end

# Reducer that works on groups where the first field is the same.
# For each distinct value of the second field, sums up the values
# of the third field.
class UniqueIndexedSumReduce < ReduceBase
  def declare
    field :unique
    field :index
    field :value

    emit :unique
    emit :index
    emit :value
  end
  def process_init(input, output)
    @sum = {}
    nil
  end
  def process_each(input, output)
    index = input.index
    @sum[index] = 0 unless @sum.has_key?(index)
    @sum[index] += input.value.to_i
    nil
  end
  def process_term(dummy, output)
    output = []
    @sum.each do |index, value|
      item = new_output
      item.unique = @last
      item.index = index
      item.value = value
      output << item
    end
    output
  end
end

# Reducer that works on groups where the first field is the same.
# Counts the number of distinct occurrences of the second field.
class UniqueIndexedCountReduce < ReduceBase
  def declare
    field :unique
    field :index

    emit :unique
    emit :index
    emit :value
  end
  def process_init(input, output)
    @sum = {}
    nil
  end
  def process_each(input, output)
    index = input.index
    @sum[index] = 0 unless @sum.has_key?(index)
    @sum[index] += 1
    nil
  end
  def process_term(dummy, output)
    output = []
    @sum.each do |index, value|
      item = new_output
      item.unique = @last
      item.index = index
      item.value = value
      output << item
    end
    output
  end
end

# Reducer that samples the input.
# One argument must be given: the number of samples to retain.
# Outputs that many lines.
# TODO: store the whole input object in the pool,
# or else take another argument of columns to store.
class SampleReduce < ReduceBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @m = args[0].to_i
  end

  def declare
    field :value

    emit :value
  end
  def process_begin(dummy, output)
    @pool = []
    @n = 0
    nil
  end
  def process(input, output)
    if @pool.size < @m
      @pool << input.value
    elsif rand < (@m.to_f / @n.to_f)
      @pool[rand(@m)] = input.value
    end
    @n += 1
    nil
  end
  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.value = elem
      output << item
    end
    output
  end
end
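
# Editorial note (not part of the original file): this is a form of
# reservoir sampling. Once the pool holds @m items, a later record replaces
# a random pool slot with probability @m/@n, where @n counts previously seen
# records, keeping the retained sample roughly uniform over the whole input
# without storing all of it.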

# Reducer that retains the M maximum values in column 2.
# Column 2 must be numeric.
# TODO: store the rest of the fields too.
class MaxReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @m = args[0].to_i
    else
      @m = 1
    end
  end

  def declare
    field :key
    field :value

    emit :key
    emit :value
  end

  def compare(x, y)
    y <=> x
  end

  def sort_pool
    @pool.sort! {|x, y| compare(x[1], y[1])}
  end

  def process_begin(dummy, output)
    @pool = []
    nil
  end
  def process(input, output)
    val = input.value.to_i
    if @pool.size < @m
      @pool << [input.key, val]
      sort_pool
    elsif val > @pool[-1][1]
      @pool[-1] = [input.key, val]
      sort_pool
    end
    nil
  end
  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.key, item.value = elem
      output << item
    end
    output
  end
end
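
# Editorial note (not part of the original file): the pool above is a top-M
# selection. It stays sorted descending by value, so @pool[-1] is the
# smallest retained value; a new record displaces it only when it is larger,
# and process_end emits the M largest pairs in descending order.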

# Reducer that sums the values for each unique key,
# then outputs only the M max values.
class MaxUniqueSumReduce < ReduceBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @m = args[0].to_i
  end

  def declare
    field :key
    field :value

    emit :key
    emit :value
  end

  def sort_pool
    @pool.sort! {|x, y| y[1] <=> x[1]}
  end

  def process_begin(dummy, output)
    @pool = []
    nil
  end
  # These three do the sum.
  def process_init(input, output)
    @sum = 0
    nil
  end
  def process_each(input, output)
    @sum += input.value.to_i
    nil
  end
  def process_term(dummy, output)
    if @pool.size < @m
      @pool << [@last, @sum]
      sort_pool
    elsif @sum > @pool[-1][1]
      @pool[-1] = [@last, @sum]
      sort_pool
    end
    nil
  end
  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.key, item.value = elem
      output << item
    end
    output
  end
end

# Like MaxReduce, but retains the minimum values in the given fields.
class MinReduce < MaxReduce
  def process(input, output)
    if @pool.size < @m
      @pool << [input.key, input.value]
      sort_pool
    elsif input.value.to_i < @pool[-1][1].to_i
      @pool[-1] = [input.key, input.value]
      sort_pool
    end
    nil
  end
end

# Outputs the first record of each unique first field.
# Drops the given number of columns.
# By default, drops the first column.
class UniqueFirstReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end
  # Copy over all dest fields.
  def process_init(input, output)
    copy_struct(input, output, @m+1)
  end
end

############################
# Base class for jobs
############################

class JobBase
  @@testing = false

  def initialize(*args)
    @stages = []
  end

  def JobBase.testing(val)
    @@testing = val
  end

  # Change a filename so a path maps into a simple name:
  #   /   ==> -
  #   *   ==> all
  #   []? ==> _
  def JobBase.filename_map(filename)
    filename.gsub(/\*/, 'all').gsub(/\//, '-').gsub(/[\[\]?]/, '_')
  end

  # These store the job declarations.
  def mapper map_class, *args
    @map_class = map_class
    @map_args = args
    @map_opts = {}
    @in_dirs = []
    @extras = []
  end
  def reducer reduce_class, *args
    @reduce_class = reduce_class
    @reduce_args = args
    @reducers = 1
    @reduce_opts = {}
  end
  def indir in_dir
    @in_dirs << in_dir
  end
  alias infiles indir
  def outdir out_dir
    @out_dir = JobBase.filename_map(out_dir)
  end
  alias outfiles outdir
  def reducers n
    @reducers = n
  end
  def extra ex
    @extras << ex
  end
  def map_opt n, v
    @map_opts[n] = v
  end
  def reduce_opt n, v
    @reduce_opts[n] = v
  end
  def hadoop_opts name
    @hadoop_opts = [] unless @hadoop_opts
    @hadoop_opts << name
  end

  # This gathers the declarations and stores them in a stage record.
  def add_stage
    case
    when @map_class.nil? then raise "Map class not specified"
    when @reduce_class.nil? then raise "Reduce class not specified"
    when @in_dirs.empty? then raise "Indir not specified"
    when @out_dir.nil? then raise "Outdir not specified"
    end
    @stages << [@map_class, @map_args, @map_opts,
        @reduce_class, @reduce_args, @reduce_opts,
        @in_dirs, @out_dir, @reducers, @extras]
  end

  # For each method in the class named "job" or starting with "stage",
  # call the method, then call add_stage. This can be used to create
  # multi-stage map-reduce programs.
  def prepare
    ms = self.class.instance_methods.find_all do |m|
      m.to_s =~ /(^stage)|(^job$)/   # to_s: methods may be strings or symbols
    end
    ms.sort.each do |m|
      self.method(m).call
      add_stage
    end
  end

  # Run the job in test mode.
  # For each stage, run the mapper, then sort the
  # intermediate output, then run the reducer.
  def run_test
    map_out_file = "/tmp/map-out"
    red_in_file = "/tmp/reduce-in"
    @stages.each do |s|
      map_class, map_args, map_opts,
          reduce_class, reduce_args, reduce_opts,
          in_dirs, out_dir, reducers, extras = *s
      mapper = map_class.new(*map_args)
      mapper.declare
      mapper.prepare
      in_dirs.each do |in_dir|
        StreamIO.open(in_dir, "r") do |in_fd|
          StreamIO.open(map_out_file, "w") do |out_fd|
            mapper.run in_fd, out_fd
          end
        end
      end

      system "sort <#{map_out_file} >#{red_in_file}"

      reducer = reduce_class.new(*reduce_args)
      reducer.declare
      reducer.prepare
      StreamIO.open(red_in_file, "r") do |in_fd|
        StreamIO.open(out_dir, "w") do |out_fd|
          reducer.run in_fd, out_fd
        end
      end
    end
  end

  def build_command(fname, klass, args)
    res = "#{fname} -s #{klass.to_s}"
    if args
      res += " #{args.join(' ')}"
    end
    res
  end

  def self.get_job_opts
    opts = {}
    if ARGV[0] == '-v'
      opts[:verbose] = true
      ARGV.shift
    end
    opts
  end

  # Run the job for real, via StreamRunner.
  def run(fname, opts)
    sr = StreamRunner.new
    out_dir = "out"
    @stages.each do |s|
      map_class, map_args, map_opts,
          reduce_class, reduce_args, reduce_opts,
          in_dirs, out_dir, reducers, extras = *s
      opts = opts.merge({:hadoop_opts => @hadoop_opts.join(" ")}) if @hadoop_opts && @hadoop_opts.size > 0
      sr.run_map_reduce(in_dirs, out_dir,
          build_command(fname, map_class, map_args),
          build_command(fname, reduce_class, reduce_args),
          reducers,
          [__FILE__, 'stream_runner.rb'] + extras,
          map_opts, reduce_opts, opts)
    end
  end

  def self.run_test
    job = self.new
    job.prepare
    job.run_test
  end

  def self.run_command(opt = nil)
    return if @@testing && opt == :at_exit
    return run_test if @@testing

    filename = $0
    if ARGV[0] == '-s'
      ARGV.shift
      class_name = ARGV.shift
      action = Object.const_get(class_name).new(*ARGV)
      action.declare
      action.prepare
      action.run(STDIN, STDOUT)
    else
      opts = get_job_opts
      # Create an instance of the class that was called originally.
      action = self.new
      action.prepare
      action.run(File.basename(filename), opts)
    end
  end
end
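
# Editorial example (not part of the original file): a minimal sketch of a
# job declared with the DSL above, using the pre-written classes. The class
# name must end in "Job" so the at_exit hook below picks it up; the class
# name and the input and output paths are assumptions made up for
# illustration.
#
#   class TopTalkersJob < JobBase
#     def job
#       mapper CopyMap, 2             # pass two columns through
#       reducer UniqueSumReduce, 1    # sum column 2 per unique column 1
#       indir "logs"
#       outdir "top-talkers"
#     end
#   end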

# At exit, call run_command in each class whose name is of the form xxxJob.
at_exit do
  ObjectSpace.each_object(Class) do |klass|
    if klass.name =~ /^\w+Job$/
      klass.run_command(:at_exit)
    end
  end
end