wukong-hadoop 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -1
- data/lib/wukong-hadoop.rb +1 -1
- data/lib/wukong-hadoop/runner.rb +1 -18
- data/lib/wukong-hadoop/runner/hadoop_invocation.rb +15 -11
- data/lib/wukong-hadoop/version.rb +1 -1
- data/spec/spec_helper.rb +1 -1
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +3 -3
- data/spec/wukong-hadoop/runner_spec.rb +1 -1
- data/wukong-hadoop.gemspec +1 -1
- metadata +7 -7
data/Gemfile
CHANGED
data/lib/wukong-hadoop.rb
CHANGED
@@ -21,8 +21,8 @@ module Wukong
|
|
21
21
|
return unless program_name == 'wu-hadoop'
|
22
22
|
|
23
23
|
# Hadoop Options
|
24
|
-
settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
|
25
24
|
settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
|
25
|
+
settings.define :hadoop_streaming_jar, wukong_hadoop: true, description: 'Path to hadoop streaming jar. Use this for non-standard hadoop installations.'
|
26
26
|
|
27
27
|
# Translate simplified args to their hairy hadoop equivalents
|
28
28
|
settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
|
data/lib/wukong-hadoop/runner.rb
CHANGED
@@ -111,7 +111,7 @@ EOF
|
|
111
111
|
log.info "Launching local!"
|
112
112
|
execute_command!(local_commandline)
|
113
113
|
else
|
114
|
-
remove_output_path
|
114
|
+
remove_output_path if settings[:rm] || settings[:overwrite]
|
115
115
|
hadoop_commandline
|
116
116
|
log.info "Launching Hadoop!"
|
117
117
|
execute_command!(hadoop_commandline)
|
@@ -187,23 +187,6 @@ EOF
|
|
187
187
|
"--#{param}=#{Shellwords.escape(val.to_s)}"
|
188
188
|
end.join(" ")
|
189
189
|
end
|
190
|
-
|
191
|
-
# Execute a command composed of the given parts.
|
192
|
-
#
|
193
|
-
# Will print the command instead of the <tt>--dry_run</tt>
|
194
|
-
# option was given.
|
195
|
-
#
|
196
|
-
# @param [Array<String>] argv
|
197
|
-
def execute_command!(*argv)
|
198
|
-
command = argv.flatten.reject(&:blank?).join(" \\\n ")
|
199
|
-
if settings[:dry_run]
|
200
|
-
log.info("Dry run:")
|
201
|
-
puts command
|
202
|
-
else
|
203
|
-
puts `#{command}`
|
204
|
-
raise Error.new("Command failed!") unless $?.success?
|
205
|
-
end
|
206
|
-
end
|
207
190
|
|
208
191
|
end
|
209
192
|
end
|
@@ -10,10 +10,8 @@ module Wukong
|
|
10
10
|
#
|
11
11
|
# Will not actually do anything if the <tt>--dry_run</tt> option
|
12
12
|
# is also given.
|
13
|
-
def remove_output_path
|
14
|
-
|
15
|
-
log.info "Removing output file #{output_path}: #{cmd}"
|
16
|
-
puts `#{cmd}` unless settings[:dry_run]
|
13
|
+
def remove_output_path
|
14
|
+
execute_command("#{hadoop_runner} fs -rmr '#{output_path}'")
|
17
15
|
end
|
18
16
|
|
19
17
|
# Return the Hadoop command used to launch this job in a Hadoop
|
@@ -26,11 +24,11 @@ module Wukong
|
|
26
24
|
def hadoop_commandline
|
27
25
|
[
|
28
26
|
hadoop_runner,
|
29
|
-
"jar #{
|
27
|
+
"jar #{hadoop_streaming_jar}",
|
30
28
|
hadoop_jobconf_options,
|
31
29
|
"-D mapred.job.name='#{job_name}'",
|
32
|
-
hadoop_other_args,
|
33
30
|
hadoop_files,
|
31
|
+
hadoop_other_args,
|
34
32
|
"-mapper '#{mapper_commandline}'",
|
35
33
|
"-reducer '#{reducer_commandline}'",
|
36
34
|
"-input '#{input_paths}'",
|
@@ -84,7 +82,17 @@ module Wukong
|
|
84
82
|
#
|
85
83
|
# @return [String]
|
86
84
|
def hadoop_runner
|
87
|
-
settings[:hadoop_runner] ||
|
85
|
+
settings[:hadoop_runner] || 'hadoop'
|
86
|
+
end
|
87
|
+
|
88
|
+
# The path (glob) to the Hadoop streaming jar.
|
89
|
+
#
|
90
|
+
# Respects the value of <tt>--hadoop_streaming_jar</tt> if
|
91
|
+
# given. Otherwise uses the default CDH4 location.
|
92
|
+
#
|
93
|
+
# @return [String]
|
94
|
+
def hadoop_streaming_jar
|
95
|
+
settings[:hadoop_streaming_jar] || '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh*.jar'
|
88
96
|
end
|
89
97
|
|
90
98
|
# Return an array of jobconf (-D) options that will be passed to Hadoop.
|
@@ -99,10 +107,6 @@ module Wukong
|
|
99
107
|
settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
|
100
108
|
# If no reducer and no reduce_command, then skip the reduce phase
|
101
109
|
settings[:reduce_tasks] ||= 0 unless reduce?
|
102
|
-
# Fields hadoop should use to distribute records to reducers
|
103
|
-
unless settings[:partition_fields].blank?
|
104
|
-
jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
|
105
|
-
end
|
106
110
|
jobconf_options += [
|
107
111
|
:io_sort_mb, :io_sort_record_percent,
|
108
112
|
:map_speculative, :map_tasks,
|
data/spec/spec_helper.rb
CHANGED
@@ -69,13 +69,13 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
69
69
|
context "removing existing output paths" do
|
70
70
|
|
71
71
|
it "will not remove the output path by default" do
|
72
|
-
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path
|
72
|
+
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path) }
|
73
73
|
end
|
74
74
|
it "will remove the output path when given the --rm option" do
|
75
|
-
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path
|
75
|
+
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path) }
|
76
76
|
end
|
77
77
|
it "will not remove the output path when given the --rm option AND the --dry_run option" do
|
78
|
-
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path
|
78
|
+
hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path) }
|
79
79
|
end
|
80
80
|
end
|
81
81
|
|
@@ -4,7 +4,7 @@ describe Wukong::Hadoop::HadoopRunner do
|
|
4
4
|
|
5
5
|
context "handling errors" do
|
6
6
|
it "raises an error when it can't find a file" do
|
7
|
-
expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /
|
7
|
+
expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /cannot load such file/)
|
8
8
|
end
|
9
9
|
|
10
10
|
it "raises an error in Hadoop mode when called without input and output paths" do
|
data/wukong-hadoop.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong-hadoop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date:
|
14
|
+
date: 2014-03-19 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: wukong
|
@@ -20,7 +20,7 @@ dependencies:
|
|
20
20
|
requirements:
|
21
21
|
- - '='
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 4.0.0
|
24
24
|
type: :runtime
|
25
25
|
prerelease: false
|
26
26
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -28,7 +28,7 @@ dependencies:
|
|
28
28
|
requirements:
|
29
29
|
- - '='
|
30
30
|
- !ruby/object:Gem::Version
|
31
|
-
version:
|
31
|
+
version: 4.0.0
|
32
32
|
description: ! " Treat your dataset like a:\n\n * stream of lines when it's
|
33
33
|
efficient to process by lines\n * stream of field arrays when it's efficient
|
34
34
|
to deal directly with fields\n * stream of lightweight objects when it's efficient
|
@@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
113
113
|
version: '0'
|
114
114
|
segments:
|
115
115
|
- 0
|
116
|
-
hash: -
|
116
|
+
hash: -3970581689600624425
|
117
117
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
118
118
|
none: false
|
119
119
|
requirements:
|
@@ -122,10 +122,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
122
122
|
version: '0'
|
123
123
|
segments:
|
124
124
|
- 0
|
125
|
-
hash: -
|
125
|
+
hash: -3970581689600624425
|
126
126
|
requirements: []
|
127
127
|
rubyforge_project:
|
128
|
-
rubygems_version: 1.8.
|
128
|
+
rubygems_version: 1.8.23
|
129
129
|
signing_key:
|
130
130
|
specification_version: 3
|
131
131
|
summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
|