mrtoolkit 0.1.2

data/lib/mrtoolkit.rb ADDED
require 'pp'
require 'stream_runner'

# Store information about a processing stage.
# Includes input and output field names, field separators,
# and the filenames processed by the stage.
class Stage
  attr_reader :in_fields, :out_fields
  attr_reader :in_sep, :out_sep
  attr_reader :errors

  def initialize(*args)
  end

  def field name
    @in_fields = [] unless @in_fields
    @in_fields << name.to_sym
  end
  def emit name
    @out_fields = [] unless @out_fields
    @out_fields << name.to_sym
  end
  def field_separator sep
    @in_sep = sep
  end
  def emit_separator sep
    @out_sep = sep
  end
  def catch_errors
    @catch_errors = true
  end

  def declare
  end

  # Create the input and output structures.
  def prepare
    @in_sep = "\t" unless @in_sep
    @out_sep = "\t" unless @out_sep
    @input_type = Struct.new(*@in_fields)
    @output_type = Struct.new(*@out_fields)
    @errors = 0
  end

  # Copy all fields of one struct to another.
  # A given number of initial source fields can be skipped.
  def copy_struct(src, dest, skip = 0)
    (0..src.length-1-skip).each {|i| dest[i] = src[i+skip]}
    dest
  end

  # Write one output record, if there is one.
  def write_out(output)
    if output
      outs = @out_fields.collect { |f| output[f].to_s.chomp }
      @out_fd.puts outs.join(@out_sep)
    end
  end

  def new_input(line = nil)
    input = @input_type.new
    return input unless line
    fields = line.chomp.split(@in_sep)
    @in_fields.each_index { |i| input[i] = fields[i] }
    input
  end
  def new_output
    @output_type.new
  end

  # Process one step of a map or reduce:
  # create an output record, call the given function,
  # then collect the output and write it out.
  def process_step(fun, input = nil)
    begin
      out = send(fun, input, new_output)
      if out
        out = [out] unless out.class == Array
        out.each {|o| write_out(o)}
      end
    rescue StandardError
      STDERR.puts "Error: #{$!}"
      @errors += 1
      raise unless @catch_errors
    end
  end
end

# This class allows uniform processing of File and STDIN/STDOUT
# file descriptors.
# It must be passed a block, which gets the open file descriptor.
class StreamIO
  def self.open(f, mode = "r")
    if f.class == String
      fp = File.open(f, mode)
      yield(fp)
      fp.close
    elsif f.class == IO
      yield(f)
    end
  end
end
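
# Example (illustrative sketch): StreamIO.open accepts either a filename
# or an already-open IO, so the same code path can serve local test files
# and STDIN/STDOUT under streaming. The filename below is invented.
#
#   StreamIO.open("map-out.txt", "w") {|fd| fd.puts "a\tb"}  # File: opened and closed
#   StreamIO.open(STDOUT, "w") {|fd| fd.puts "a\tb"}         # IO: used as-is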

############################
# Base class for map
############################
# Map Stage
# Creates an object to hold input lines after they have been
# parsed and separated into fields.
# Reads input and feeds it to the process method, then collects output.
class MapBase < Stage

  # Called at the beginning of map.
  # No input.
  def process_begin(dummy, output)
    nil
  end
  # Called for each record.
  def process(input, output)
    nil
  end
  # Called at the end of map.
  def process_end(dummy, output)
    nil
  end

  def run(in_fd, out_fd)
    @out_fd = out_fd
    process_step(:process_begin, nil)
    input = nil
    in_fd.each_line do |line|
      @raw_input = line
      input = new_input(line)
      process_step(:process, input)
    end
    process_step(:process_end, nil)
  end
end
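
# Example (illustrative sketch): a minimal mapper. The field and emit
# names are invented here; input is assumed to be one word per line.
#
#   class WordLengthMap < MapBase
#     def declare
#       field :word
#
#       emit :word
#       emit :length
#     end
#
#     def process(input, output)
#       output.word = input.word
#       output.length = input.word.to_s.length
#       output
#     end
#   end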

############################
# Base class for reduce
############################
# Reduce Stage
# Creates an object to hold input lines after they have been
# parsed and separated into fields.
# Reads input and feeds it to the process method, then collects output.
# Reduce input is map output.
class ReduceBase < Stage
  # This suite of functions is called on the records, grouped by
  # their first field.
  # For each value of the first field, process_init is called first,
  # then process_each is called for each record,
  # then process_term is called after the last one.
  # The client can implement only process_term to see each unique value once.
  # When process_term is called, input is on the next record, so the first
  # field of the completed run is in @last.

  # Called at the beginning of a run of equal values of the first field.
  def process_init(input, output)
    nil
  end
  # Called for each one of the equal values.
  def process_each(input, output)
    nil
  end
  # Called after the run of equal values.
  # No input record. Previous value of the first field is in @last.
  def process_term(dummy, output)
    nil
  end

  # Called at the beginning of reduction.
  # No input.
  def process_begin(dummy, output)
    nil
  end
  # Called for each record.
  def process(input, output)
    nil
  end
  # Called at the end of reduction.
  def process_end(dummy, output)
    nil
  end

  # This suite of functions is called on all records.
  # The function process_begin is called first,
  # then process is called on each record,
  # then process_end is called last.
  # This default implementation implements the calls to process_init,
  # process_each, and process_term.
  # The client can omit process_begin and process_end
  # and just implement process to see each record.
  def process_internal(input)
    v = input[0]
    if @last.nil?
      process_step(:process_init, input)
      process_step(:process_each, input)
      @last = v
      return
    end
    if v == @last
      # As long as the key is the same, just process it.
      process_step(:process_each, input)
      return
    end
    # The run has ended.
    process_step(:process_term, input) if @last
    @last = v

    process_step(:process_init, input)
    process_step(:process_each, input)
  end
  def process_end_internal(dummy)
    process_step(:process_term, nil) if @last
  end

  # Run the reducer.
  # Call process_begin, then for each line call
  # process, then call process_end.
  # At each step, collect any output and write it out.
  def run(in_fd, out_fd)
    @out_fd = out_fd
    @last = nil
    process_step(:process_begin, nil)

    input = nil # so it will survive the loop
    in_fd.each_line do |line|
      @raw_input = line
      input = new_input(line)
      process_internal(input)
      process_step(:process, input)
    end
    process_end_internal(nil)
    process_step(:process_end, nil)
  end
end
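
# Example (illustrative sketch): a reducer that averages the second column
# within each unique value of the first, using the process_init /
# process_each / process_term contract above. The names are invented here.
#
#   class AverageReduce < ReduceBase
#     def declare
#       field :key
#       field :value
#
#       emit :key
#       emit :mean
#     end
#
#     def process_init(input, output)
#       @sum, @count = 0.0, 0
#       nil
#     end
#     def process_each(input, output)
#       @sum += input.value.to_f
#       @count += 1
#       nil
#     end
#     def process_term(dummy, output)
#       output.key = @last          # key of the run that just ended
#       output.mean = @sum / @count
#       output
#     end
#   end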

#########################################
# Pre-written map and reduce classes
#########################################

# Map just copies its fields.
class CopyMap < MapBase
  def initialize(*args)
    if args.size < 1
      @n = 0
    else
      @n = args[0].to_i - 1
    end
  end
  def declare
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    copy_struct(input, output)
  end
end
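
# Note (illustrative): the pre-written classes are configured through
# constructor arguments, which a job declaration passes along, e.g.
#
#   mapper CopyMap, 2          # copy two columns (col0, col1)
#   reducer SumReduce, 2, 1    # sum two columns after one leading key field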

# Map selects records whose given field matches a regular expression.
class SelectMap < MapBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @re = args[0]
    if args[1]
      @field = args[1]
    else
      @field = 0
    end
    if args[2]
      @n = args[2].to_i - 1
    else
      @n = 0
    end
  end
  def declare
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    if input[@field] =~ @re
      return copy_struct(input, output)
    end
    nil
  end
end

# Reducer collects all values.
# Outputs as many lines as input.
# Init with the number of fields to copy (default 1).
# Optional second arg is the number of initial fields to skip.
class CopyReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end
  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    copy_struct(input, output, @m+1)
  end
end

# Reducer collects unique values.
# Outputs as many lines as there are unique values in the first field.
class UniqueReduce < ReduceBase
  def declare
    field :value

    emit :value
  end

  def process_term(input, output)
    output.value = @last
    output
  end
end

# Reducer sums the given fields.
# Specify how many fields to sum (default 1).
# May optionally specify how many initial fields to skip.
# Outputs one line of sums.
class SumReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end
  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "count#{i}"}

    (0..@n).each {|i| emit "sum#{i}"}
  end

  def process_begin(dummy, output)
    @sum = Array.new(@n+1, 0)
    nil
  end
  def process(input, output)
    (0..@n).each {|i| @sum[i] += input[i+@m+1].to_f}
    nil
  end
  def process_end(dummy, output)
    (0..@n).each {|i| output[i] = @sum[i]}
    output
  end
end

# This reducer sums within each unique value of the first field.
# Outputs one line of sums for each unique value of the first field.
class UniqueSumReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    field :unique
    (0..@n).each {|i| field "count#{i}"}
    (0..@m).each {|i| field "extra#{i}"}

    emit :value
    (0..@n).each {|i| emit "sum#{i}"}
    (0..@m).each {|i| emit "extra#{i}"}
  end
  def process_init(input, output)
    @sum = Array.new(@n+1, 0)
    @extra = Array.new(@m+1)
    nil
  end
  def process_each(input, output)
    (0..@n).each {|i| @sum[i] += input[i+1].to_i}
    (0..@m).each {|i| @extra[i] = input[i+@n+2]}
    nil
  end
  def process_term(dummy, output)
    output.value = @last
    (0..@n).each {|i| output[i+1] = @sum[i]}
    (0..@m).each {|i| output[i+@n+2] = @extra[i]}
    output
  end
end

# Reducer counts within each unique value of the first field.
# Outputs one line of counts for each unique value of the first field.
class UniqueCountReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @m = args[0].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    field :unique
    (0..@m).each {|i| field "extra#{i}"}

    emit :value
    emit :count
    (0..@m).each {|i| emit "extra#{i}"}
  end
  def process_init(input, output)
    @count = 0
    @extra = Array.new(@m+1)
    nil
  end
  def process_each(input, output)
    @count += 1
    (0..@m).each {|i| @extra[i] = input[i+1]}
    nil
  end
  def process_term(dummy, output)
    output.value = @last
    output.count = @count
    (0..@m).each {|i| output[i+2] = @extra[i]}
    output
  end
end

# Reducer works on groups where the first field is the same.
# For each distinct value of the second field, sum up the values
# of the third field.
class UniqueIndexedSumReduce < ReduceBase
  def declare
    field :unique
    field :index
    field :value

    emit :unique
    emit :index
    emit :value
  end
  def process_init(input, output)
    @sum = {}
    nil
  end
  def process_each(input, output)
    index = input.index
    @sum[index] = 0 unless @sum.has_key?(index)
    @sum[index] += input.value.to_i
    nil
  end
  def process_term(dummy, output)
    output = []
    @sum.each do |index, value|
      item = new_output
      item.unique = @last
      item.index = index
      item.value = value
      output << item
    end
    output
  end
end

# Reducer works on groups where the first field is the same.
# Counts the number of distinct occurrences of the second field.
class UniqueIndexedCountReduce < ReduceBase
  def declare
    field :unique
    field :index

    emit :unique
    emit :index
    emit :value
  end
  def process_init(input, output)
    @sum = {}
    nil
  end
  def process_each(input, output)
    index = input.index
    @sum[index] = 0 unless @sum.has_key?(index)
    @sum[index] += 1
    nil
  end
  def process_term(dummy, output)
    output = []
    @sum.each do |index, value|
      item = new_output
      item.unique = @last
      item.index = index
      item.value = value
      output << item
    end
    output
  end
end

# Reducer samples the input.
# One argument must be given: the number of samples to retain.
# Outputs that many lines.
# TODO store the whole input object in pool?
# or else take another argument of columns to store
class SampleReduce < ReduceBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @m = args[0].to_i
  end

  def declare
    field :value

    emit :value
  end
  def process_begin(dummy, output)
    @pool = []
    @n = 0
    nil
  end
  def process(input, output)
    # Reservoir sampling: the current record is the (@n+1)-th seen, so
    # once the pool is full it replaces a random pool entry with
    # probability @m/(@n+1).
    if @pool.size < @m
      @pool << input.value
    elsif rand < (@m.to_f / (@n + 1))
      @pool[rand(@m)] = input.value
    end
    @n += 1
    nil
  end
  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.value = elem
      output << item
    end
    output
  end
end

# Reducer retains the M maximum values in column 2.
# Column 2 must be numeric.
# TODO store rest of fields too
class MaxReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @m = args[0].to_i
    else
      @m = 1
    end
  end

  def declare
    field :key
    field :value

    emit :key
    emit :value
  end

  def compare(x, y)
    y <=> x
  end

  def sort_pool
    @pool.sort! {|x, y| compare(x[1], y[1])}
  end

  def process_begin(dummy, output)
    @pool = []
    nil
  end
  def process(input, output)
    val = input.value.to_i
    if @pool.size < @m
      @pool << [input.key, val]
      sort_pool
    elsif val > @pool[-1][1]
      @pool[-1] = [input.key, val]
      sort_pool
    end
    nil
  end
  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.key, item.value = elem
      output << item
    end
    output
  end
end

# Reducer sums the values for each unique key.
# Outputs only the M max values.
class MaxUniqueSumReduce < ReduceBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @m = args[0].to_i
  end

  def declare
    field :key
    field :value

    emit :key
    emit :value
  end

  def sort_pool
    @pool.sort! {|x, y| y[1] <=> x[1]}
  end

  def process_begin(dummy, output)
    @pool = []
    nil
  end
  # These three do the sum.
  def process_init(input, output)
    @sum = 0
    nil
  end
  def process_each(input, output)
    @sum += input.value.to_i
    nil
  end
  def process_term(dummy, output)
    if @pool.size < @m
      @pool << [@last, @sum]
      sort_pool
    elsif @sum > @pool[-1][1]
      @pool[-1] = [@last, @sum]
      sort_pool
    end
    nil
  end
  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.key, item.value = elem
      output << item
    end
    output
  end
end

# Like MaxReduce, but retains the M minimum values in column 2.
class MinReduce < MaxReduce
  # Sort ascending, so @pool[-1] is the largest value in the pool
  # and is the one displaced when a smaller value arrives.
  def compare(x, y)
    x <=> y
  end

  def process(input, output)
    val = input.value.to_i
    if @pool.size < @m
      @pool << [input.key, val]
      sort_pool
    elsif val < @pool[-1][1]
      @pool[-1] = [input.key, val]
      sort_pool
    end
    nil
  end
end

# Outputs the first record of each unique value of the first field.
# Optional first arg gives the number of columns to copy (default 1).
# Optional second arg gives the number of initial columns to skip (default 0).
class UniqueFirstReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end
  # Copy over all destination fields.
  def process_init(input, output)
    copy_struct(input, output, @m+1)
  end
end

############################
# Base class for jobs
############################

class JobBase
  @@testing = false

  def initialize(*args)
    @stages = []
  end

  def JobBase.testing(val)
    @@testing = val
  end

  # Change a filename so a path maps into a simple name.
  # / ==> -
  # * ==> all
  # []? ==> _
  def JobBase.filename_map(filename)
    filename.gsub(/\*/, 'all').gsub(/\//, '-').gsub(/[\[\]?]/, '_')
  end

  # These store job declarations.
  def mapper map_class, *args
    @map_class = map_class
    @map_args = args
    @map_opts = {}
    @in_dirs = []
    @extras = []
  end
  def reducer reduce_class, *args
    @reduce_class = reduce_class
    @reduce_args = args
    @reducers = 1
    @reduce_opts = {}
  end
  def indir in_dir
    @in_dirs << in_dir
  end
  alias infiles indir
  def outdir out_dir
    @out_dir = JobBase.filename_map(out_dir)
  end
  alias outfiles outdir
  def reducers n
    @reducers = n
  end
  def extra ex
    @extras << ex
  end
  def map_opt n, v
    @map_opts[n] = v
  end
  def reduce_opt n, v
    @reduce_opts[n] = v
  end
  def hadoop_opts name
    @hadoop_opts = [] unless @hadoop_opts
    @hadoop_opts << name
  end
  # This gathers the declarations and stores them in a stage record.
  def add_stage
    case
    when @map_class.nil? then raise "Map class not specified"
    when @reduce_class.nil? then raise "Reduce class not specified"
    when @in_dirs.empty? then raise "Indir not specified"
    when @out_dir.nil? then raise "Outdir not specified"
    end
    @stages << [@map_class, @map_args, @map_opts,
      @reduce_class, @reduce_args, @reduce_opts,
      @in_dirs, @out_dir, @reducers, @extras]
  end

  # For each method in the class starting with "stage" (or named "job"),
  # call the method, then call add_stage. This can be used to create
  # multi-stage map-reduce programs.
  def prepare
    ms = self.class.instance_methods.find_all do |m|
      m.to_s =~ /(^stage)|(^job$)/
    end
    ms.sort.each do |m|
      self.method(m).call
      add_stage
    end
  end

  # Run the job in local test mode.
  # For each stage, run the mapper, then sort the
  # intermediate output, then run the reducer.
  def run_test
    map_out_file = "/tmp/map-out"
    red_in_file = "/tmp/reduce-in"
    @stages.each do |s|
      map_class, map_args, map_opts,
        reduce_class, reduce_args, reduce_opts,
        in_dirs, out_dir, reducers, extras = *s
      mapper = map_class.new(*map_args)
      mapper.declare
      mapper.prepare
      in_dirs.each do |in_dir|
        StreamIO.open(in_dir, "r") do |in_fd|
          StreamIO.open(map_out_file, "w") do |out_fd|
            mapper.run in_fd, out_fd
          end
        end
      end

      system "sort <#{map_out_file} >#{red_in_file}"

      reducer = reduce_class.new(*reduce_args)
      reducer.declare
      reducer.prepare
      StreamIO.open(red_in_file, "r") do |in_fd|
        StreamIO.open(out_dir, "w") do |out_fd|
          reducer.run in_fd, out_fd
        end
      end
    end
  end

  def build_command(fname, klass, args)
    res = "#{fname} -s #{klass.to_s}"
    if args
      res += " #{args.join(' ')}"
    end
    res
  end

  def self.get_job_opts
    opts = {}
    if ARGV[0] == '-v'
      opts[:verbose] = true
      ARGV.shift
    end
    opts
  end

  # Run the job under Hadoop streaming.
  def run(fname, opts)
    sr = StreamRunner.new
    out_dir = "out"
    @stages.each do |s|
      map_class, map_args, map_opts,
        reduce_class, reduce_args, reduce_opts,
        in_dirs, out_dir, reducers, extras = *s
      opts = opts.merge({:hadoop_opts => @hadoop_opts.join(" ")}) if @hadoop_opts && @hadoop_opts.size > 0
      sr.run_map_reduce(in_dirs, out_dir,
        build_command(fname, map_class, map_args),
        build_command(fname, reduce_class, reduce_args),
        reducers,
        [__FILE__, 'stream_runner.rb'] + extras,
        map_opts, reduce_opts, opts)
    end
  end

  def self.run_test
    job = self.new
    job.prepare
    job.run_test
  end

  def self.run_command(opt = nil)
    return if @@testing && opt == :at_exit
    return run_test if @@testing

    filename = $0
    if ARGV[0] == '-s'
      # Run a single map or reduce stage: the next argument names the
      # map or reduce class, and the rest are its constructor arguments.
      ARGV.shift
      class_name = ARGV.shift
      action = Object.const_get(class_name).new(*ARGV)
      action.declare
      action.prepare
      action.run(STDIN, STDOUT)
    else
      opts = get_job_opts
      # Create an instance of the class that was invoked originally.
      action = self.new
      action.prepare
      action.run(File.basename(filename), opts)
    end
  end
end

# At exit, call run_command in each class whose name is of the form xxxJob.
at_exit do
  ObjectSpace.each_object(Class) do |klass|
    if klass.name =~ /^\w+Job$/
      klass.run_command(:at_exit)
    end
  end
end
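
# Example (illustrative sketch): a complete job script built from the
# pre-written classes. The class name must end in "Job" for the at_exit
# hook above to pick it up, and "job" is one of the method names
# JobBase#prepare looks for. The directory names are invented here.
#
#   class WordCountJob < JobBase
#     def job
#       mapper CopyMap               # pass the single input column through
#       reducer UniqueCountReduce    # count records per unique value
#       indir "words"
#       outdir "word-counts"
#     end
#   end
#
# A multi-stage job defines stage1, stage2, ... methods instead; prepare
# calls them in sorted name order and adds one stage per method. To run
# locally, call JobBase.testing(true) and then WordCountJob.run_command,
# which maps, sorts, and reduces ordinary files; otherwise the at_exit
# hook launches the job under Hadoop streaming via StreamRunner.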