mrtoolkit 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mrtoolkit.rb ADDED
require 'pp'
require 'stream_runner'

# Store information about a processing stage.
# Includes input and output field names, field separators,
# and the number of errors encountered by the stage.
class Stage
  attr_reader :in_fields, :out_fields
  attr_reader :in_sep, :out_sep
  attr_reader :errors

  def initialize(*args)
  end

  # Declare an input field.
  def field name
    @in_fields ||= []
    @in_fields << name.to_sym
  end

  # Declare an output field.
  def emit name
    @out_fields ||= []
    @out_fields << name.to_sym
  end

  def field_separator sep
    @in_sep = sep
  end

  def emit_separator sep
    @out_sep = sep
  end

  # Count errors but keep processing instead of re-raising.
  def catch_errors
    @catch_errors = true
  end

  def declare
  end

  # Create the input and output structures.
  def prepare
    @in_sep ||= "\t"
    @out_sep ||= "\t"
    @input_type = Struct.new(*@in_fields)
    @output_type = Struct.new(*@out_fields)
    @errors = 0
  end

  # Copies all fields of a struct to another.
  # Some leading source fields can be skipped.
  def copy_struct(src, dest, skip = 0)
    (0..src.length-1-skip).each {|i| dest[i] = src[i+skip]}
    dest
  end

  # Write any output.
  def write_out(output)
    if output
      outs = @out_fields.collect { |f| output[f].to_s.chomp }
      @out_fd.puts outs.join(@out_sep)
    end
  end

  # Build an input record, parsing a line into fields if one is given.
  def new_input(line = nil)
    input = @input_type.new
    return input unless line
    fields = line.chomp.split(@in_sep)
    @in_fields.each_index { |i| input[i] = fields[i] }
    input
  end

  def new_output
    @output_type.new
  end

  # Process one step of a map or reduce stage:
  # create an output record and call the given function,
  # then collect the output (a record, an array of records, or nil)
  # and write it out.
  def process_step(fun, input = nil)
    begin
      out = send(fun, input, new_output)
      if out
        out = [out] unless out.is_a?(Array)
        out.each {|o| write_out(o)}
      end
    rescue StandardError
      STDERR.puts "Error: #{$!}"
      @errors += 1
      raise unless @catch_errors
    end
  end
end

# This class allows uniform processing of File and STDIN/STDOUT
# file descriptors.
# It must be passed a block, which receives the open file descriptor.
class StreamIO
  def self.open(f, mode = "r")
    if f.is_a?(String)
      fp = File.open(f, mode)
      yield(fp)
      fp.close
    elsif f.is_a?(IO)
      yield(f)
    end
  end
end
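
# Example usage (illustrative, not part of the library):
#
#   StreamIO.open("counts.txt", "w") { |fd| fd.puts "hello" }  # a path opens a File
#   StreamIO.open(STDOUT, "w") { |fd| fd.puts "hello" }        # an IO is used as-is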

############################
# Base class for map
############################
# Map stage.
# Creates an object to hold input lines after they have been
# parsed and separated into fields.
# Reads input, feeds it to the process method, then collects output.
class MapBase < Stage

  # Called at the beginning of the map.
  # No input.
  def process_begin(dummy, output)
    nil
  end

  # Called for each record.
  def process(input, output)
    nil
  end

  # Called at the end of the map.
  def process_end(dummy, output)
    nil
  end

  def run(in_fd, out_fd)
    @out_fd = out_fd
    process_step(:process_begin, nil)
    input = nil
    in_fd.each_line do |line|
      @raw_input = line
      input = new_input(line)
      process_step(:process, input)
    end
    process_step(:process_end, nil)
  end
end
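
# A minimal example of a client map stage (an illustrative sketch; the
# RequestMap name and the two-column log format are hypothetical):
#
#   class RequestMap < MapBase
#     def declare
#       field :ip           # input columns, in order
#       field :url
#
#       emit :url           # output columns, in order
#       emit :hits
#     end
#
#     def process(input, output)
#       output.url = input.url
#       output.hits = 1
#       output              # return a record, an array of records, or nil
#     end
#   end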

############################
# Base class for reduce
############################
# Reduce stage.
# Creates an object to hold input lines after they have been
# parsed and separated into fields.
# Reads input, feeds it to the process method, then collects output.
# Reduce input is map output.
class ReduceBase < Stage
  # This suite of functions is called on the records grouped by
  # their first field.
  # For each value of the first field, process_init is called first,
  # then process_each is called for each record,
  # then process_term is called after the last one.
  # The client can implement only process_term to see each unique value once.
  # When process_term is called, input is on the next record, so the first
  # field of the run that just ended is in @last.

  # Called at the beginning of a run of equal values of the first field.
  def process_init(input, output)
    nil
  end

  # Called for each one of the equal values.
  def process_each(input, output)
    nil
  end

  # Called after the run of equal values.
  # No input record. The previous value of the first field is in @last.
  def process_term(dummy, output)
    nil
  end

  # Called at the beginning of reduction.
  # No input.
  def process_begin(dummy, output)
    nil
  end

  # Called for each record.
  def process(input, output)
    nil
  end

  # Called at the end of reduction.
  def process_end(dummy, output)
    nil
  end

  # This suite of functions is called on all records.
  # The function process_begin is called first,
  # then process is called on each record,
  # then process_end is called last.
  # This default implementation implements the calls to process_init,
  # process_each, and process_term.
  # The client can omit process_begin and process_end
  # and just implement process to see each record.
  def process_internal(input)
    v = input[0]
    if @last.nil?
      process_step(:process_init, input)
      process_step(:process_each, input)
      @last = v
      return
    end
    if v == @last
      # As long as the key is the same, just process it.
      process_step(:process_each, input)
      return
    end
    # The run has ended.
    process_step(:process_term, input) if @last
    @last = v

    process_step(:process_init, input)
    process_step(:process_each, input)
  end

  def process_end_internal(dummy)
    process_step(:process_term, nil) if @last
  end

  # Run the reducer.
  # Call process_begin, then for each line call
  # process, then call process_end.
  # At each step, collect any output and write it out.
  def run(in_fd, out_fd)
    @out_fd = out_fd
    @last = nil
    process_step(:process_begin, nil)

    input = nil # so it will survive the loop
    in_fd.each_line do |line|
      @raw_input = line
      input = new_input(line)
      process_internal(input)
      process_step(:process, input)
    end
    process_end_internal(nil)
    process_step(:process_end, nil)
  end
end
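
# A minimal example of a client reduce stage (an illustrative sketch; it
# assumes sorted map output of tab-separated url/hits pairs, as a mapper
# like the hypothetical RequestMap above would emit):
#
#   class HitsReduce < ReduceBase
#     def declare
#       field :url
#       field :hits
#
#       emit :url
#       emit :total
#     end
#
#     def process_init(input, output)
#       @total = 0
#       nil                  # emit nothing yet
#     end
#
#     def process_each(input, output)
#       @total += input.hits.to_i
#       nil
#     end
#
#     def process_term(dummy, output)
#       output.url = @last   # the key of the run that just ended
#       output.total = @total
#       output               # one record per unique url
#     end
#   end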

#########################################
# Pre-written map and reduce classes
#########################################

# Map just copies its fields.
class CopyMap < MapBase
  def initialize(*args)
    if args.size < 1
      @n = 0
    else
      @n = args[0].to_i - 1
    end
  end

  def declare
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    copy_struct(input, output)
  end
end

# Map selects records according to an RE.
class SelectMap < MapBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    # Arguments arrive as command-line strings, so build the regexp
    # and convert the field index explicitly.
    @re = Regexp.new(args[0])
    if args[1]
      @field = args[1].to_i
    else
      @field = 0
    end
    if args[2]
      @n = args[2].to_i - 1
    else
      @n = 0
    end
  end

  def declare
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    if input[@field] =~ @re
      return copy_struct(input, output)
    end
    nil
  end
end

# Reducer collects all values.
# Outputs as many lines as the input.
# Init with the number of fields to copy (default 1).
# Optional second arg is the number of initial fields to skip.
class CopyReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  def process(input, output)
    copy_struct(input, output, @m+1)
  end
end

# Reducer collects unique values.
# Outputs as many lines as there are unique values in the first field.
class UniqueReduce < ReduceBase
  def declare
    field :value

    emit :value
  end

  def process_term(input, output)
    output.value = @last
    output
  end
end

# Reducer sums the given fields.
# Specify how many fields to sum (default 1).
# May optionally specify how many initial fields to skip.
# Outputs one line of sums.
class SumReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "count#{i}"}

    (0..@n).each {|i| emit "sum#{i}"}
  end

  def process_begin(dummy, output)
    @sum = Array.new(@n+1, 0)
    nil
  end

  def process(input, output)
    (0..@n).each {|i| @sum[i] += input[i+@m+1].to_f}
    nil
  end

  def process_end(dummy, output)
    (0..@n).each {|i| output[i] = @sum[i]}
    output
  end
end

# This reducer sums within each unique value of the first field.
# Outputs one line of sums for each unique value of the first field.
class UniqueSumReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    field :unique
    (0..@n).each {|i| field "count#{i}"}
    (0..@m).each {|i| field "extra#{i}"}

    emit :value
    (0..@n).each {|i| emit "sum#{i}"}
    (0..@m).each {|i| emit "extra#{i}"}
  end

  def process_init(input, output)
    @sum = Array.new(@n+1, 0)
    @extra = Array.new(@m+1)
    nil
  end

  def process_each(input, output)
    (0..@n).each {|i| @sum[i] += input[i+1].to_i}
    (0..@m).each {|i| @extra[i] = input[i+@n+2]}
    nil
  end

  def process_term(dummy, output)
    output.value = @last
    (0..@n).each {|i| output[i+1] = @sum[i]}
    (0..@m).each {|i| output[i+@n+2] = @extra[i]}
    output
  end
end

# Reducer counts within each unique value of the first field.
# Outputs one line of counts for each unique value of the first field.
class UniqueCountReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @m = args[0].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    field :unique
    (0..@m).each {|i| field "extra#{i}"}

    emit :value
    emit :count
    (0..@m).each {|i| emit "extra#{i}"}
  end

  def process_init(input, output)
    @count = 0
    @extra = Array.new(@m+1)
    nil
  end

  def process_each(input, output)
    @count += 1
    (0..@m).each {|i| @extra[i] = input[i+1]}
    nil
  end

  def process_term(dummy, output)
    output.value = @last
    output.count = @count
    (0..@m).each {|i| output[i+2] = @extra[i]}
    output
  end
end
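
# For example, given input sorted on the first column (tab-separated,
# hypothetical data), UniqueCountReduce with no arguments emits one
# count per unique value:
#
#   apple        apple  2
#   apple  ==>   pear   1
#   pear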

# Reducer works on groups where the first field is the same.
# For each distinct value of the second field, sums up the values
# of the third field.
class UniqueIndexedSumReduce < ReduceBase
  def declare
    field :unique
    field :index
    field :value

    emit :unique
    emit :index
    emit :value
  end

  def process_init(input, output)
    @sum = {}
    nil
  end

  def process_each(input, output)
    index = input.index
    @sum[index] = 0 unless @sum.has_key?(index)
    @sum[index] += input.value.to_i
    nil
  end

  def process_term(dummy, output)
    output = []
    @sum.each do |index, value|
      item = new_output
      item.unique = @last
      item.index = index
      item.value = value
      output << item
    end
    output
  end
end

# Reducer works on groups where the first field is the same.
# Counts the occurrences of each distinct value of the second field.
class UniqueIndexedCountReduce < ReduceBase
  def declare
    field :unique
    field :index

    emit :unique
    emit :index
    emit :value
  end

  def process_init(input, output)
    @sum = {}
    nil
  end

  def process_each(input, output)
    index = input.index
    @sum[index] = 0 unless @sum.has_key?(index)
    @sum[index] += 1
    nil
  end

  def process_term(dummy, output)
    output = []
    @sum.each do |index, value|
      item = new_output
      item.unique = @last
      item.index = index
      item.value = value
      output << item
    end
    output
  end
end

# Reducer samples the input by reservoir sampling.
# One argument must be given: the number of samples to retain.
# Outputs that many lines.
# TODO: store the whole input object in the pool?
# Or else take another argument listing the columns to store.
class SampleReduce < ReduceBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @m = args[0].to_i
  end

  def declare
    field :value

    emit :value
  end

  def process_begin(dummy, output)
    @pool = []
    @n = 0
    nil
  end

  def process(input, output)
    # Keep the first @m values; after that, the @n-th value replaces
    # a random pool entry with probability @m/@n.
    @n += 1
    if @pool.size < @m
      @pool << input.value
    elsif rand < (@m.to_f / @n.to_f)
      @pool[rand(@m)] = input.value
    end
    nil
  end

  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.value = elem
      output << item
    end
    output
  end
end

# Reducer retains the M maximum values in column 2.
# Column 2 must be numeric.
# TODO: store the rest of the fields too.
class MaxReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @m = args[0].to_i
    else
      @m = 1
    end
  end

  def declare
    field :key
    field :value

    emit :key
    emit :value
  end

  # Descending order, so the smallest retained value is last.
  def compare(x, y)
    y <=> x
  end

  def sort_pool
    @pool.sort! {|x, y| compare(x[1], y[1])}
  end

  def process_begin(dummy, output)
    @pool = []
    nil
  end

  def process(input, output)
    val = input.value.to_i
    if @pool.size < @m
      @pool << [input.key, val]
      sort_pool
    elsif val > @pool[-1][1]
      @pool[-1] = [input.key, val]
      sort_pool
    end
    nil
  end

  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.key, item.value = elem
      output << item
    end
    output
  end
end

# Reducer sums the values for each unique key.
# Outputs only the M max values.
class MaxUniqueSumReduce < ReduceBase
  def initialize(*args)
    raise ArgumentError if args.size < 1
    @m = args[0].to_i
  end

  def declare
    field :key
    field :value

    emit :key
    emit :value
  end

  def sort_pool
    @pool.sort! {|x, y| y[1] <=> x[1]}
  end

  def process_begin(dummy, output)
    @pool = []
    nil
  end

  # These three do the sum.
  def process_init(input, output)
    @sum = 0
    nil
  end

  def process_each(input, output)
    @sum += input.value.to_i
    nil
  end

  def process_term(dummy, output)
    if @pool.size < @m
      @pool << [@last, @sum]
      sort_pool
    elsif @sum > @pool[-1][1]
      @pool[-1] = [@last, @sum]
      sort_pool
    end
    nil
  end

  def process_end(dummy, output)
    output = []
    @pool.each do |elem|
      item = new_output
      item.key, item.value = elem
      output << item
    end
    output
  end
end

# Reducer retains the M minimum values in column 2.
class MinReduce < MaxReduce
  # Ascending order, so the largest retained value is last.
  def compare(x, y)
    x <=> y
  end

  def process(input, output)
    val = input.value.to_i
    if @pool.size < @m
      @pool << [input.key, val]
      sort_pool
    elsif val < @pool[-1][1]
      @pool[-1] = [input.key, val]
      sort_pool
    end
    nil
  end
end

# Reducer emits the first record of each unique value of the first field.
# Init with the number of columns to copy (default 1).
# Optional second arg is the number of initial columns to skip;
# skipped columns are dropped from the output.
class UniqueFirstReduce < ReduceBase
  def initialize(*args)
    if args[0]
      @n = args[0].to_i - 1
    else
      @n = 0
    end
    if args[1]
      @m = args[1].to_i - 1
    else
      @m = -1
    end
  end

  def declare
    (0..@m).each {|i| field "skip#{i}"}
    (0..@n).each {|i| field "col#{i}"}

    (0..@n).each {|i| emit "col#{i}"}
  end

  # Copy over all dest fields from the first record of the run.
  def process_init(input, output)
    copy_struct(input, output, @m+1)
  end
end

############################
# Base class for jobs
############################

class JobBase
  @@testing = false

  def initialize(*args)
    @stages = []
  end

  def JobBase.testing(val)
    @@testing = val
  end

  # Change a filename so a path maps into a simple name.
  #   /    ==> -
  #   *    ==> all
  #   []?  ==> _
  def JobBase.filename_map(filename)
    filename.gsub(/\*/, 'all').gsub(/\//, '-').gsub(/[\[\]?]/, '_')
  end

  # These store job declarations.
  def mapper map_class, *args
    @map_class = map_class
    @map_args = args
    @map_opts = {}
    @in_dirs = []
    @extras = []
  end

  def reducer reduce_class, *args
    @reduce_class = reduce_class
    @reduce_args = args
    @reducers = 1
    @reduce_opts = {}
  end

  def indir in_dir
    @in_dirs << in_dir
  end
  alias infiles indir

  def outdir out_dir
    @out_dir = JobBase.filename_map(out_dir)
  end
  alias outfiles outdir

  def reducers n
    @reducers = n
  end

  def extra ex
    @extras << ex
  end

  def map_opt n, v
    @map_opts[n] = v
  end

  def reduce_opt n, v
    @reduce_opts[n] = v
  end

  def hadoop_opts name
    @hadoop_opts ||= []
    @hadoop_opts << name
  end

  # This gathers the declarations and stores them in a stage record.
  def add_stage
    case
    when @map_class.nil? then raise "Map class not specified"
    when @reduce_class.nil? then raise "Reduce class not specified"
    when @in_dirs.empty? then raise "Indir not specified"
    when @out_dir.nil? then raise "Outdir not specified"
    end
    @stages << [@map_class, @map_args, @map_opts,
      @reduce_class, @reduce_args, @reduce_opts,
      @in_dirs, @out_dir, @reducers, @extras]
  end

  # For each method in the class starting with "stage" (or named "job"),
  # call the method, then call add_stage. This can be used to create
  # multi-stage map-reduce programs.
  def prepare
    ms = self.class.instance_methods.find_all do |m|
      m.to_s =~ /(^stage)|(^job$)/
    end
    ms.sort.each do |m|
      self.method(m).call
      add_stage
    end
  end

  # Run the job locally in test mode.
  # For each stage, run the mapper, then sort the
  # intermediate output, then run the reducer.
  def run_test
    map_out_file = "/tmp/map-out"
    red_in_file = "/tmp/reduce-in"
    @stages.each do |s|
      map_class, map_args, map_opts,
        reduce_class, reduce_args, reduce_opts,
        in_dirs, out_dir, reducers, extras = *s
      mapper = map_class.new(*map_args)
      mapper.declare
      mapper.prepare
      in_dirs.each do |in_dir|
        StreamIO.open(in_dir, "r") do |in_fd|
          StreamIO.open(map_out_file, "w") do |out_fd|
            mapper.run in_fd, out_fd
          end
        end
      end

      system "sort <#{map_out_file} >#{red_in_file}"

      reducer = reduce_class.new(*reduce_args)
      reducer.declare
      reducer.prepare
      StreamIO.open(red_in_file, "r") do |in_fd|
        StreamIO.open(out_dir, "w") do |out_fd|
          reducer.run in_fd, out_fd
        end
      end
    end
  end

  def build_command(fname, klass, args)
    res = "#{fname} -s #{klass.to_s}"
    if args
      res += " #{args.join(' ')}"
    end
    res
  end

  def self.get_job_opts
    opts = {}
    if ARGV[0] == '-v'
      opts[:verbose] = true
      ARGV.shift
    end
    opts
  end

  # Run the job on the cluster through StreamRunner.
  def run(fname, opts)
    sr = StreamRunner.new
    @stages.each do |s|
      map_class, map_args, map_opts,
        reduce_class, reduce_args, reduce_opts,
        in_dirs, out_dir, reducers, extras = *s
      opts = opts.merge({:hadoop_opts => @hadoop_opts.join(" ")}) if @hadoop_opts && @hadoop_opts.size > 0
      sr.run_map_reduce(in_dirs, out_dir,
        build_command(fname, map_class, map_args),
        build_command(fname, reduce_class, reduce_args),
        reducers,
        [__FILE__, 'stream_runner.rb'] + extras,
        map_opts, reduce_opts, opts)
    end
  end

  def self.run_test
    job = self.new
    job.prepare
    job.run_test
  end

  def self.run_command(opt = nil)
    return if @@testing && opt == :at_exit
    return run_test if @@testing

    filename = $0
    if ARGV[0] == '-s'
      # Invoked on a worker: run a single map or reduce stage over stdin.
      ARGV.shift
      class_name = ARGV.shift
      action = Object.const_get(class_name).new(*ARGV)
      action.declare
      action.prepare
      action.run(STDIN, STDOUT)
    else
      opts = get_job_opts
      # Create an instance of the class that was invoked originally.
      action = self.new
      action.prepare
      action.run(File.basename(filename), opts)
    end
  end
end
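
# Example of a complete job built from the pre-written stages (an
# illustrative sketch; the input path and output directory are
# hypothetical):
#
#   class TopUrlsJob < JobBase
#     def job
#       mapper CopyMap, 2               # pass two columns through: url, hits
#       reducer MaxUniqueSumReduce, 10  # sum hits per url, keep the top 10
#       indir "logs"
#       outdir "top-urls"
#     end
#   end
#
# The at_exit hook below picks up every class whose name ends in "Job",
# so running the script launches the job; with -s the same script
# re-invokes itself as a single map or reduce stage on the workers.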

# At exit, call run_command in each class whose name has the form xxxJob.
at_exit do
  ObjectSpace.each_object(Class) do |klass|
    if klass.name =~ /^\w+Job$/
      klass.run_command(:at_exit)
    end
  end
end