wukong-hadoop 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +59 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/README.md +339 -0
- data/Rakefile +13 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-bzip +23 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-cp +3 -0
- data/bin/hdp-du +86 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-ls +11 -0
- data/bin/hdp-mkdir +2 -0
- data/bin/hdp-mkdirp +12 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +32 -0
- data/bin/hdp-sort +40 -0
- data/bin/hdp-stream +40 -0
- data/bin/hdp-stream-flat +22 -0
- data/bin/hdp-stream2 +39 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/wu-hadoop +14 -0
- data/examples/counter.rb +17 -0
- data/examples/map_only.rb +28 -0
- data/examples/processors.rb +4 -0
- data/examples/sonnet_18.txt +14 -0
- data/examples/tokenizer.rb +28 -0
- data/examples/word_count.rb +44 -0
- data/features/step_definitions/wu_hadoop_steps.rb +4 -0
- data/features/support/env.rb +1 -0
- data/features/wu_hadoop.feature +113 -0
- data/lib/wukong-hadoop.rb +21 -0
- data/lib/wukong-hadoop/configuration.rb +133 -0
- data/lib/wukong-hadoop/driver.rb +190 -0
- data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
- data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
- data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
- data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
- data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
- data/lib/wukong-hadoop/extensions.rb +2 -0
- data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
- data/lib/wukong-hadoop/version.rb +6 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +39 -0
- data/spec/wukong-hadoop/driver_spec.rb +117 -0
- data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
- data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
- data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
- data/wukong-hadoop.gemspec +33 -0
- metadata +168 -0
@@ -0,0 +1,104 @@
|
|
1
|
+
module Wukong
  module Hadoop

    # Implements logic for figuring out the correct mapper commandline
    # given wu-hadoop's arguments.
    #
    # This module is a mixin.  It calls the following methods, none of
    # which are defined here, so the including class must supply them:
    # +settings+, +args+, +command_prefix+, +params_to_pass+,
    # +processor_registered?+, +file_is_processor?+,
    # +processor_name_from_file+, +map_only?+, +single_job_arg?+,
    # +explicit_reducer?+ and +separate_map_and_reduce_args?+.
    module MapLogic

      # Return the actual commandline used by the mapper, whether
      # running in local or Hadoop mode.
      #
      # You should be able to copy, paste, and run this command
      # unmodified to debug the mapper.
      #
      # An explicit <tt>--map_command</tt> is returned verbatim.
      # Otherwise the command is assembled from the command prefix,
      # <tt>wu-local</tt>, the mapper argument, an optional
      # <tt>--run</tt> argument and the pass-through parameters; nil
      # and empty parts are dropped before joining with spaces.
      #
      # @return [String]
      def mapper_commandline
        return settings[:map_command] if explicit_map_command?
        [command_prefix, 'wu-local', mapper_arg].tap do |cmd|
          cmd << "--run=#{mapper_name}" if mapper_needs_run_arg?
          cmd << params_to_pass
        end.compact.map(&:to_s).reject(&:empty?).join(' ')
      end

      # Were we given an explicit map command (like 'cut -f 1') or are
      # we to introspect and construct the command?
      #
      # @return [true, false]
      def explicit_map_command?
        # Coerce so the predicate honours its documented boolean
        # contract instead of leaking the command string itself.
        !!settings[:map_command]
      end

      # Were we given a processor to use as our mapper explicitly by
      # name or are we to introspect to discover the correct
      # processor?
      #
      # @return [true, false]
      def explicit_map_processor?
        !!settings[:mapper]
      end

      # Were we given an explicit mapper (either as a command or as a
      # processor) or should we introspect to find one?
      #
      # @return [true, false]
      def explicit_mapper?
        explicit_map_processor? || explicit_map_command?
      end

      # The argument that we should introspect on to turn into our
      # mapper: the first of the positional arguments.
      #
      # @return [String]
      def mapper_arg
        args.first
      end

      # Does the mapper commandline need an explicit --run argument?
      #
      # Will not be used if the processor name is the same as the
      # mapper argument, or the same as the argument's basename with a
      # trailing '.rb' removed.
      #
      # @return [true, false]
      def mapper_needs_run_arg?
        return false if mapper_arg.to_s == mapper_name.to_s
        return false if File.basename(mapper_arg.to_s, '.rb') == mapper_name.to_s
        true
      end

      # Return the name of the processor to use as the mapper.
      #
      # Will raise a <tt>Wukong::Error</tt> if a given mapper is
      # invalid or if none can be guessed.
      #
      # Most of the logic that examines explicit command line
      # arguments and checks for the existence of named processors or
      # files is here.  The branches are tried in order and the first
      # one that applies wins.
      #
      # @return [String]
      # @raise [Wukong::Error] if the explicitly named processor is not
      #   registered, or if no branch yields a mapper
      def mapper_name
        case
        when explicit_mapper?
          if processor_registered?(settings[:mapper])
            settings[:mapper]
          else
            raise Error, "No such processor: '#{settings[:mapper]}'"
          end
        when map_only? && processor_registered?(mapper_arg)
          mapper_arg
        when map_only? && file_is_processor?(mapper_arg)
          processor_name_from_file(mapper_arg)
        when single_job_arg? && explicit_reducer? && processor_registered?(mapper_arg)
          mapper_arg
        when separate_map_and_reduce_args? && processor_registered?(mapper_arg)
          mapper_arg
        when separate_map_and_reduce_args? && file_is_processor?(mapper_arg)
          processor_name_from_file(mapper_arg)
        when processor_registered?('mapper')
          # Fall back to a processor conventionally named 'mapper'.
          'mapper'
        else
          raise Error, "Could not find a processor to use as a mapper"
        end
      end
    end
  end
end
|
@@ -0,0 +1,129 @@
|
|
1
|
+
module Wukong
  module Hadoop

    # Implements logic for figuring out the correct reducer
    # commandline given wu-hadoop's arguments and whether or not to
    # run a map-only (no-reduce) job.
    #
    # This module is a mixin.  It calls the following methods, none of
    # which are defined here, so the including class must supply them:
    # +settings+, +args+, +command_prefix+, +params_to_pass+,
    # +processor_registered?+, +file_is_processor?+,
    # +processor_name_from_file+, +single_job_arg?+,
    # +explicit_mapper?+ and +separate_map_and_reduce_args?+.
    module ReduceLogic

      # Return the actual commandline used by the reducer, whether
      # running in local or Hadoop mode.
      #
      # You should be able to copy, paste, and run this command
      # unmodified to debug the reducer.
      #
      # Returns the empty string for a map-only job and returns an
      # explicit <tt>--reduce_command</tt> verbatim.
      #
      # @return [String]
      def reducer_commandline
        return '' unless reduce?
        return settings[:reduce_command] if explicit_reduce_command?
        [command_prefix, 'wu-local', reducer_arg].tap do |cmd|
          cmd << "--run=#{reducer_name}" if reducer_needs_run_arg?
          cmd << params_to_pass
        end.compact.map(&:to_s).reject(&:empty?).join(' ')
      end

      # Were we given an explicit reduce command (like 'uniq -c') or
      # are we to introspect and construct the command?
      #
      # @return [true, false]
      def explicit_reduce_command?
        # Coerce so the predicate honours its documented boolean
        # contract instead of leaking the command string itself.
        !!settings[:reduce_command]
      end

      # Were we given a processor to use as our reducer explicitly by
      # name or are we to introspect to discover the correct
      # processor?
      #
      # @return [true, false]
      def explicit_reduce_processor?
        !!settings[:reducer]
      end

      # Were we given an explicit reducer (either as a command or as a
      # processor) or should we introspect to find one?
      #
      # @return [true, false]
      def explicit_reducer?
        explicit_reduce_processor? || explicit_reduce_command?
      end

      # The argument that we should introspect on to turn into our
      # reducer: the last of the positional arguments.
      #
      # @return [String]
      def reducer_arg
        args.last
      end

      # Should we perform a reduce or is this a map-only job?
      #
      # We will definitely reduce if
      #
      # - given an explicit <tt>--reduce_command</tt>
      # - we discovered a reducer
      #
      # We will not reduce if:
      #
      # - <tt>--reduce_tasks</tt> was explicitly set to 0
      #
      # The <tt>--reduce_tasks</tt> check is made first, so it wins
      # over an explicit reduce command.
      #
      # @return [true, false]
      def reduce?
        return false if settings[:reduce_tasks] && settings[:reduce_tasks].to_i == 0
        return true  if settings[:reduce_command]
        return true  if reducer_name
        false
      end

      # Is this a map-only job?
      #
      # @see #reduce?
      #
      # @return [true, false]
      def map_only?
        !reduce?
      end

      # Does the reducer commandline need an explicit --run argument?
      #
      # Will not be used if the processor name is the same as the
      # reducer argument, or the same as the argument's basename with a
      # trailing '.rb' removed.
      #
      # @return [true, false]
      def reducer_needs_run_arg?
        return false if reducer_arg.to_s == reducer_name.to_s
        # Compare as strings on both sides (as the mapper logic does) so
        # a processor name given as a Symbol still matches the script.
        return false if File.basename(reducer_arg.to_s, '.rb') == reducer_name.to_s
        true
      end

      # Return the name of the processor to use as the reducer.
      #
      # Will raise a <tt>Wukong::Error</tt> if a given reducer is
      # invalid.  Will return nil if no reducer can be guessed.
      #
      # Most of the logic that examines explicit command line
      # arguments and checks for the existence of named processors or
      # files is here.  The branches are tried in order and the first
      # one that applies wins.
      #
      # @return [String, nil]
      # @raise [Wukong::Error] if the explicitly named processor is not
      #   registered
      def reducer_name
        case
        when explicit_reducer?
          if processor_registered?(settings[:reducer])
            settings[:reducer]
          else
            raise Error, "No such processor: '#{settings[:reducer]}'"
          end
        when single_job_arg? && explicit_mapper? && processor_registered?(reducer_arg)
          reducer_arg
        when separate_map_and_reduce_args? && processor_registered?(reducer_arg)
          reducer_arg
        when separate_map_and_reduce_args? && file_is_processor?(reducer_arg)
          processor_name_from_file(reducer_arg)
        when processor_registered?('reducer')
          # Fall back to a processor conventionally named 'reducer'.
          'reducer'
        end
      end

    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module Wukong
|
2
|
+
module Hadoop
|
3
|
+
|
4
|
+
# Hadoop streaming exposes several environment variables to
# scripts it executes.  This module contains methods that make
# these variables easily accessed from within a processor.
#
# Since these environment variables are ultimately set by Hadoop's
# streaming jar when executing inside Hadoop, you'll have to set
# them manually when testing locally.
#
# Via @pskomoroch via @tlipcon:
#
#   "there is a little known Hadoop Streaming trick buried in this Python
#    script. You will notice that the date is not actually in the raw log
#    data itself, but is part of the filename. It turns out that Hadoop makes
#    job parameters you would fetch in Java with something like
#    job.get("mapred.input.file") available as environment variables for
#    streaming jobs, with periods replaced with underscores:
#
#      filepath = os.environ["map_input_file"]
#      filename = os.path.split(filepath)[-1]"
module EnvMethods

  # Fetch a parameter set by Hadoop streaming in the environment
  # of the currently executing process.
  #
  # Every '.' in the name is replaced with '_' before the lookup.
  #
  # @param [String, Symbol] name the '.' separated parameter name to fetch
  # @return [String, nil] the value from the process' environment,
  #   or nil when the variable is not set
  def hadoop_streaming_parameter(name)
    # to_s lets callers pass a Symbol; tr is a plain character swap.
    ENV[name.to_s.tr('.', '_')]
  end

  # Path of the (data) file currently being processed.
  #
  # @return [String, nil] the value of +map_input_file+
  def input_file
    hadoop_streaming_parameter('map.input.file')
  end

  # Directory of the (data) file currently being processed.
  #
  # @return [String, nil] the value of +mapred_input_dir+
  def input_dir
    hadoop_streaming_parameter('mapred.input.dir')
  end

  # Offset of the chunk currently being processed within the current input file.
  #
  # @return [String, nil] the value of +map_input_start+
  def map_input_start_offset
    hadoop_streaming_parameter('map.input.start')
  end

  # Length of the chunk currently being processed within the current input file.
  #
  # @return [String, nil] the value of +map_input_length+
  def map_input_length
    hadoop_streaming_parameter('map.input.length')
  end

  # ID of the current map/reduce attempt.
  #
  # @return [String, nil] the value of +mapred_task_id+
  def attempt_id
    hadoop_streaming_parameter('mapred.task.id')
  end

  # ID of the current map/reduce task.
  #
  # @return [String, nil] the value of +mapred_tip_id+
  def curr_task_id
    hadoop_streaming_parameter('mapred.tip.id')
  end

end
|
77
|
+
end
|
78
|
+
|
79
|
+
# Mix the Hadoop environment accessors into Processor so that they
# are available as instance methods on it.
Processor.class_eval{ include Hadoop::EnvMethods }
|
80
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'wukong-hadoop'
|
2
|
+
require_relative('support/integration_helper')
|
3
|
+
require_relative('support/driver_helper')
|
4
|
+
require 'wukong/spec_helpers'
|
5
|
+
|
6
|
+
# Suite-wide RSpec setup: pulls in the helper modules and restores
# the Wukong registry after every example.
RSpec.configure do |config|

  include Wukong::SpecHelpers
  include Wukong::Hadoop::IntegrationHelper
  include Wukong::Hadoop::DriverHelper

  # Keep whatever the registry reports before the example runs ...
  config.before(:each) { @orig_reg = Wukong.registry.show }

  # ... then empty the registry and merge the saved contents back
  # in once the example has finished.
  config.after(:each) do
    Wukong.registry.clear!
    Wukong.registry.merge!(@orig_reg)
  end

end
|
21
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Wukong
  module Hadoop
    # Spec helper for building a Wukong::Hadoop::Driver.
    module DriverHelper

      # Build a driver from positional arguments and optional settings.
      #
      # A trailing Hash, if present, is removed from the arguments and
      # merged into the configured and resolved params; everything
      # else is handed to the driver as positional arguments.
      #
      # @param args positional driver arguments, optionally followed
      #   by a Hash of settings
      # @return [Wukong::Hadoop::Driver]
      def driver(*args)
        overrides = args.pop if args.last.is_a?(Hash)

        params = ::Wukong::Hadoop.configure(Configliere::Param.new)
        params.resolve!
        params.merge!(overrides) if overrides

        Wukong::Hadoop::Driver.new(params, *args)
      end

    end
  end
end
|
15
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
# Pathname is used below; require it here rather than relying on
# another file having loaded it first.
require 'pathname'

module Wukong
  module Hadoop
    # Spec helper exposing paths within the project and the environment
    # to use when running its executables.
    module IntegrationHelper

      # The directory three levels up from this file's path.
      #
      # @return [Pathname]
      def root
        @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
      end

      # @return [Pathname] the 'lib' directory under #root
      def lib_dir
        root.join('lib')
      end

      # @return [Pathname] the 'bin' directory under #root
      def bin_dir
        root.join('bin')
      end

      # @return [Pathname] the 'examples' directory under #root
      def examples_dir
        root.join('examples')
      end

      # Environment variables that put this project's bin and lib
      # directories ahead of whatever PATH and RUBYLIB already hold.
      #
      # Unset variables are skipped, so no stray separator is emitted.
      #
      # @return [Hash{String => String}]
      def integration_env
        {
          # File::PATH_SEPARATOR is the platform's search-path
          # delimiter (':' on Unix).
          "PATH"    => [bin_dir.to_s, ENV["PATH"]].compact.join(File::PATH_SEPARATOR),
          "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(File::PATH_SEPARATOR)
        }
      end

      # @return [String] #root as a String
      def integration_cwd
        root.to_s
      end

      # Path to a file within the examples directory.
      #
      # @param args path components below #examples_dir
      # @return [Pathname]
      def example_script(*args)
        examples_dir.join(*args)
      end

    end
  end
end
|
39
|
+
|
@@ -0,0 +1,117 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for how the driver turns its arguments and settings into
# mapper and reducer commandlines.
describe Wukong::Hadoop::Driver do

  context "processing its arguments" do
    it "raises an error when it can't find a file" do
      lambda { driver(example_script('processors.rb'), example_script('doesnt_exist.rb')) }.should raise_error(Wukong::Error, /No such processor or file/)
    end
    it "raises an error when it can't find a widget" do
      lambda { driver('regexp', 'doesnt_exist') }.should raise_error(Wukong::Error, /No such processor or file/)
    end
    it "raises an error when given more than two arguments" do
      lambda { driver('regexp', example_script('counter.rb'), 'extra') }.should raise_error(Wukong::Error, /more than two/)
    end
  end

  context "will execute a map-only job" do
    context "with an explicit map command" do
      subject { driver(:map_command => 'cut -f 1') }
      its(:reduce?)            { should be_false }
      its(:mapper_commandline) { should match(/^cut -f 1$/) }
    end
    context "with a single widget" do
      subject { driver('regexp') }
      its(:reduce?)            { should be_false }
      its(:mapper_commandline) { should match(/^wu-local regexp$/) }
    end
    context "with a single file" do
      context "defining a processor named 'mapper'" do
        subject { driver(example_script('map_only.rb')) }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*map_only\.rb --run=mapper$/) }
      end
      context "defining a processor named after the file" do
        subject { driver(example_script('tokenizer.rb')) }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*tokenizer\.rb$/) }
      end
      context "using the given --mapper option " do
        subject { driver(example_script('processors.rb'), :mapper => 'tokenizer') }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*processors\.rb --run=tokenizer$/) }
      end
      context "defining a processor named 'reducer' but with --reduce_tasks=0" do
        subject { driver(example_script('word_count.rb'), :reduce_tasks => 0) }
        its(:reduce?)            { should be_false }
        its(:mapper_commandline) { should match(/^wu-local .*word_count\.rb --run=mapper$/) }
      end
    end
    context "with two files but with --reduce_tasks=0" do
      subject { driver(example_script('tokenizer.rb'), example_script('counter.rb'), :reduce_tasks => 0) }
      its(:reduce?)            { should be_false }
      its(:mapper_commandline) { should match(/^wu-local .*tokenizer\.rb$/) }
    end
  end

  context "will execute a map-reduce job" do
    context "with explicit map and reduce commands" do
      subject { driver(:map_command => 'cut -f 1', :reduce_command => 'uniq -c') }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should == 'cut -f 1' }
      its(:reducer_commandline) { should == 'uniq -c' }
    end
    context "with two widgets" do
      subject { driver('regexp', 'count') }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local regexp$/) }
      its(:reducer_commandline) { should match(/^wu-local count$/) }
    end
    context "with a single file" do
      context "defining processors named 'mapper' and 'reducer'" do
        subject { driver(example_script('word_count.rb')) }
        its(:reduce?)             { should be_true }
        its(:mapper_commandline)  { should match(/^wu-local .*word_count\.rb --run=mapper$/) }
        its(:reducer_commandline) { should match(/^wu-local .*word_count\.rb --run=reducer$/) }
      end
    end
    context "with two files" do
      subject { driver(example_script('tokenizer.rb'), example_script('counter.rb')) }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local .*tokenizer\.rb$/) }
      its(:reducer_commandline) { should match(/^wu-local .*counter\.rb$/) }
    end
    context "with a widget and a file" do
      subject { driver('regexp', example_script('counter.rb')) }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local regexp$/) }
      its(:reducer_commandline) { should match(/^wu-local .*counter\.rb$/) }
    end
    context "with a file and a widget" do
      subject { driver(example_script('tokenizer.rb'), 'count') }
      its(:reduce?)             { should be_true }
      its(:mapper_commandline)  { should match(/^wu-local .*tokenizer\.rb$/) }
      its(:reducer_commandline) { should match(/^wu-local count$/) }
    end
  end

  context "handling arguments" do
    subject { driver('regexp', :clean => 'hi', :messy => 'hi "there"', :reduce_tasks => 0, :dry_run => true, :rm => true) }
    it "passes arguments it doesn't know about to wu-local" do
      subject.mapper_commandline.should include('--clean=hi')
    end
    it "correctly passes messy arguments" do
      # The expected text contains literal backslashes before the
      # space and the double quotes.
      subject.mapper_commandline.should include('--messy=hi\\ \\"there\\"')
    end
    it "does not pass arguments that are internal to wukong-hadoop" do
      subject.mapper_commandline.should_not include('--reduce_tasks', '--dry_run', '--rm')
    end
  end

  context "given the --command_prefix option" do
    subject { driver('regexp', 'count', :command_prefix => 'bundle exec') }
    its(:mapper_commandline)  { should match(/^bundle exec wu-local/) }
    its(:reducer_commandline) { should match(/^bundle exec wu-local/) }
  end

end
|