wukong-hadoop 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -1
- data/lib/wukong-hadoop.rb +1 -1
- data/lib/wukong-hadoop/runner.rb +1 -18
- data/lib/wukong-hadoop/runner/hadoop_invocation.rb +15 -11
- data/lib/wukong-hadoop/version.rb +1 -1
- data/spec/spec_helper.rb +1 -1
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +3 -3
- data/spec/wukong-hadoop/runner_spec.rb +1 -1
- data/wukong-hadoop.gemspec +1 -1
- metadata +7 -7
data/Gemfile
CHANGED
data/lib/wukong-hadoop.rb
CHANGED
@@ -21,8 +21,8 @@ module Wukong
|
|
21
21
|
return unless program_name == 'wu-hadoop'
|
22
22
|
|
23
23
|
# Hadoop Options
|
24
|
-
settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
|
25
24
|
settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
|
25
|
+
settings.define :hadoop_streaming_jar, wukong_hadoop: true, description: 'Path to hadoop streaming jar. Use this for non-standard hadoop installations.'
|
26
26
|
|
27
27
|
# Translate simplified args to their hairy hadoop equivalents
|
28
28
|
settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
|
data/lib/wukong-hadoop/runner.rb
CHANGED
@@ -111,7 +111,7 @@ EOF
|
|
111
111
|
log.info "Launching local!"
|
112
112
|
execute_command!(local_commandline)
|
113
113
|
else
|
114
|
-
remove_output_path
|
114
|
+
remove_output_path if settings[:rm] || settings[:overwrite]
|
115
115
|
hadoop_commandline
|
116
116
|
log.info "Launching Hadoop!"
|
117
117
|
execute_command!(hadoop_commandline)
|
@@ -187,23 +187,6 @@ EOF
|
|
187
187
|
"--#{param}=#{Shellwords.escape(val.to_s)}"
|
188
188
|
end.join(" ")
|
189
189
|
end
|
190
|
-
|
191
|
-
# Execute a command composed of the given parts.
|
192
|
-
#
|
193
|
-
# Will print the command instead of the <tt>--dry_run</tt>
|
194
|
-
# option was given.
|
195
|
-
#
|
196
|
-
# @param [Array<String>] argv
|
197
|
-
def execute_command!(*argv)
|
198
|
-
command = argv.flatten.reject(&:blank?).join(" \\\n ")
|
199
|
-
if settings[:dry_run]
|
200
|
-
log.info("Dry run:")
|
201
|
-
puts command
|
202
|
-
else
|
203
|
-
puts `#{command}`
|
204
|
-
raise Error.new("Command failed!") unless $?.success?
|
205
|
-
end
|
206
|
-
end
|
207
190
|
|
208
191
|
end
|
209
192
|
end
|
@@ -10,10 +10,8 @@ module Wukong
|
|
10
10
|
#
|
11
11
|
# Will not actually do anything if the <tt>--dry_run</tt> option
|
12
12
|
# is also given.
|
13
|
-
def remove_output_path
|
14
|
-
|
15
|
-
log.info "Removing output file #{output_path}: #{cmd}"
|
16
|
-
puts `#{cmd}` unless settings[:dry_run]
|
13
|
+
def remove_output_path
|
14
|
+
execute_command("#{hadoop_runner} fs -rmr '#{output_path}'")
|
17
15
|
end
|
18
16
|
|
19
17
|
# Return the Hadoop command used to launch this job in a Hadoop
|
@@ -26,11 +24,11 @@ module Wukong
|
|
26
24
|
def hadoop_commandline
|
27
25
|
[
|
28
26
|
hadoop_runner,
|
29
|
-
"jar #{
|
27
|
+
"jar #{hadoop_streaming_jar}",
|
30
28
|
hadoop_jobconf_options,
|
31
29
|
"-D mapred.job.name='#{job_name}'",
|
32
|
-
hadoop_other_args,
|
33
30
|
hadoop_files,
|
31
|
+
hadoop_other_args,
|
34
32
|
"-mapper '#{mapper_commandline}'",
|
35
33
|
"-reducer '#{reducer_commandline}'",
|
36
34
|
"-input '#{input_paths}'",
|
@@ -84,7 +82,17 @@ module Wukong
|
|
84
82
|
#
|
85
83
|
# @return [String]
|
86
84
|
def hadoop_runner
|
87
|
-
settings[:hadoop_runner] ||
|
85
|
+
settings[:hadoop_runner] || 'hadoop'
|
86
|
+
end
|
87
|
+
|
88
|
+
# The path (glob) to the Hadoop streaming jar.
|
89
|
+
#
|
90
|
+
# Respects the value of <tt>--hadoop_streaming_jar</tt> if
|
91
|
+
# given. Otherwise uses the default CDH4 location.
|
92
|
+
#
|
93
|
+
# @return [String]
|
94
|
+
def hadoop_streaming_jar
|
95
|
+
settings[:hadoop_streaming_jar] || '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh*.jar'
|
88
96
|
end
|
89
97
|
|
90
98
|
# Return an array of jobconf (-D) options that will be passed to Hadoop.
|
@@ -99,10 +107,6 @@ module Wukong
|
|
99
107
|
settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
|
100
108
|
# If no reducer and no reduce_command, then skip the reduce phase
|
101
109
|
settings[:reduce_tasks] ||= 0 unless reduce?
|
102
|
-
# Fields hadoop should use to distribute records to reducers
|
103
|
-
unless settings[:partition_fields].blank?
|
104
|
-
jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
|
105
|
-
end
|
106
110
|
jobconf_options += [
|
107
111
|
:io_sort_mb, :io_sort_record_percent,
|
108
112
|
:map_speculative, :map_tasks,
|
data/spec/spec_helper.rb
CHANGED
@@ -69,13 +69,13 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
69
69
|
context "removing existing output paths" do
|
70
70
|
|
71
71
|
it "will not remove the output path by default" do
|
72
|
-
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path
|
72
|
+
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path) }
|
73
73
|
end
|
74
74
|
it "will remove the output path when given the --rm option" do
|
75
|
-
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path
|
75
|
+
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path) }
|
76
76
|
end
|
77
77
|
it "will not remove the output path when given the --rm option AND the --dry_run option" do
|
78
|
-
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path
|
78
|
+
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path) }
|
79
79
|
end
|
80
80
|
end
|
81
81
|
|
@@ -4,7 +4,7 @@ describe Wukong::Hadoop::HadoopRunner do
|
|
4
4
|
|
5
5
|
context "handling errors" do
|
6
6
|
it "raises an error when it can't find a file" do
|
7
|
-
expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /
|
7
|
+
expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /cannot load such file/)
|
8
8
|
end
|
9
9
|
|
10
10
|
it "raises an error in Hadoop mode when called without input and output paths" do
|
data/wukong-hadoop.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong-hadoop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date:
|
14
|
+
date: 2014-03-19 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: wukong
|
@@ -20,7 +20,7 @@ dependencies:
|
|
20
20
|
requirements:
|
21
21
|
- - '='
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 4.0.0
|
24
24
|
type: :runtime
|
25
25
|
prerelease: false
|
26
26
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -28,7 +28,7 @@ dependencies:
|
|
28
28
|
requirements:
|
29
29
|
- - '='
|
30
30
|
- !ruby/object:Gem::Version
|
31
|
-
version:
|
31
|
+
version: 4.0.0
|
32
32
|
description: ! " Treat your dataset like a:\n\n * stream of lines when it's
|
33
33
|
efficient to process by lines\n * stream of field arrays when it's efficient
|
34
34
|
to deal directly with fields\n * stream of lightweight objects when it's efficient
|
@@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
113
113
|
version: '0'
|
114
114
|
segments:
|
115
115
|
- 0
|
116
|
-
hash: -
|
116
|
+
hash: -3970581689600624425
|
117
117
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
118
|
none: false
|
119
119
|
requirements:
|
@@ -122,10 +122,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
122
|
version: '0'
|
123
123
|
segments:
|
124
124
|
- 0
|
125
|
-
hash: -
|
125
|
+
hash: -3970581689600624425
|
126
126
|
requirements: []
|
127
127
|
rubyforge_project:
|
128
|
-
rubygems_version: 1.8.
|
128
|
+
rubygems_version: 1.8.23
|
129
129
|
signing_key:
|
130
130
|
specification_version: 3
|
131
131
|
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
|