wukong-hadoop 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +5 -0
- data/lib/wukong-hadoop/configuration.rb +5 -1
- data/lib/wukong-hadoop/driver.rb +4 -3
- data/lib/wukong-hadoop/driver/hadoop_invocation.rb +22 -6
- data/lib/wukong-hadoop/version.rb +1 -1
- data/spec/spec_helper.rb +1 -0
- data/spec/support/integration_helper.rb +7 -7
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +36 -6
- data/wukong-hadoop.gemspec +1 -5
- metadata +4 -36
data/Gemfile
CHANGED
[diff body for this file was not captured]
data/lib/wukong-hadoop/configuration.rb
CHANGED
@@ -16,6 +16,7 @@ module Wukong
|
|
16
16
|
settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
|
17
17
|
settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
|
18
18
|
settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
|
19
|
+
settings.define :reduce_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
|
19
20
|
settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
|
20
21
|
settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
|
21
22
|
settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
|
@@ -39,7 +40,10 @@ module Wukong
|
|
39
40
|
settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
|
40
41
|
settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
|
41
42
|
settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
|
42
|
-
settings.define :java_opts, wukong_hadoop: true, description: 'Additional
|
43
|
+
settings.define :java_opts, wukong_hadoop: true, description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
|
44
|
+
settings.define :files, wukong_hadoop: true, description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
|
45
|
+
settings.define :jars, wukong_hadoop: true, description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
|
46
|
+
settings.define :archives, wukong_hadoop: true, description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []
|
43
47
|
|
44
48
|
# Options given on the command-line
|
45
49
|
settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
|
data/lib/wukong-hadoop/driver.rb
CHANGED
@@ -24,6 +24,7 @@ module Wukong
|
|
24
24
|
include ReduceLogic
|
25
25
|
include HadoopInvocation
|
26
26
|
include LocalInvocation
|
27
|
+
include Logging
|
27
28
|
|
28
29
|
# The settings used by this driver.
|
29
30
|
#
|
@@ -55,12 +56,12 @@ module Wukong
|
|
55
56
|
# Run this driver.
|
56
57
|
def run!
|
57
58
|
if mode == :local
|
58
|
-
#
|
59
|
+
# log.info "Launching local!"
|
59
60
|
execute_command!(local_commandline)
|
60
61
|
else
|
61
62
|
ensure_input_and_output!
|
62
63
|
remove_output_path! if settings[:rm] || settings[:overwrite]
|
63
|
-
|
64
|
+
log.info "Launching Hadoop!"
|
64
65
|
execute_command!(hadoop_commandline)
|
65
66
|
end
|
66
67
|
end
|
@@ -177,7 +178,7 @@ module Wukong
|
|
177
178
|
def execute_command!(*args)
|
178
179
|
command = args.flatten.reject(&:blank?).join(" \\\n ")
|
179
180
|
if settings[:dry_run]
|
180
|
-
|
181
|
+
log.info("Dry run:")
|
181
182
|
puts command
|
182
183
|
else
|
183
184
|
puts `#{command}`
|
data/lib/wukong-hadoop/driver/hadoop_invocation.rb
CHANGED
@@ -16,8 +16,8 @@ module Wukong
|
|
16
16
|
# Will not actually do anything if the <tt>--dry_run</tt> option
|
17
17
|
# is also given.
|
18
18
|
def remove_output_path!
|
19
|
-
cmd = %Q{#{
|
20
|
-
|
19
|
+
cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
|
20
|
+
log.info "Removing output file #{output_path}: #{cmd}"
|
21
21
|
puts `#{cmd}` unless settings[:dry_run]
|
22
22
|
end
|
23
23
|
|
@@ -35,11 +35,11 @@ module Wukong
|
|
35
35
|
hadoop_jobconf_options,
|
36
36
|
"-D mapred.job.name='#{job_name}'",
|
37
37
|
hadoop_other_args,
|
38
|
+
hadoop_files,
|
38
39
|
"-mapper '#{mapper_commandline}'",
|
39
40
|
"-reducer '#{reducer_commandline}'",
|
40
41
|
"-input '#{input_paths}'",
|
41
42
|
"-output '#{output_path}'",
|
42
|
-
hadoop_files,
|
43
43
|
io_formats,
|
44
44
|
hadoop_recycle_env,
|
45
45
|
].flatten.compact.join(" \t\\\n ")
|
@@ -103,7 +103,7 @@ module Wukong
|
|
103
103
|
settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
|
104
104
|
settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
|
105
105
|
# If no reducer and no reduce_command, then skip the reduce phase
|
106
|
-
settings[:reduce_tasks]
|
106
|
+
settings[:reduce_tasks] ||= 0 unless reduce?
|
107
107
|
# Fields hadoop should use to distribute records to reducers
|
108
108
|
unless settings[:partition_fields].blank?
|
109
109
|
jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
|
@@ -146,8 +146,24 @@ module Wukong
|
|
146
146
|
#
|
147
147
|
# http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
|
148
148
|
def hadoop_files
|
149
|
-
args.find_all { |arg| arg.to_s =~ /\.rb$/ }.
|
150
|
-
|
149
|
+
args.find_all { |arg| arg.to_s =~ /\.rb$/ }.each do |arg|
|
150
|
+
settings[:files] << arg
|
151
|
+
end
|
152
|
+
|
153
|
+
[].tap do |files_options|
|
154
|
+
{
|
155
|
+
:files => '-files ',
|
156
|
+
:jars => '-libjars ',
|
157
|
+
:archives => '-archives '
|
158
|
+
}.each_pair do |file_type_name, file_option_name|
|
159
|
+
unless settings[file_type_name].nil? || settings[file_type_name].empty?
|
160
|
+
files = settings[file_type_name].map do |file_name_or_glob|
|
161
|
+
# Don't glob on the HDFS
|
162
|
+
file_type_name == :archives ? file_name_or_glob : [Dir[file_name_or_glob], file_name_or_glob]
|
163
|
+
end.flatten.compact.uniq.join(',')
|
164
|
+
files_options << "#{file_option_name}'#{files}'"
|
165
|
+
end
|
166
|
+
end
|
151
167
|
end
|
152
168
|
end
|
153
169
|
|
data/spec/spec_helper.rb
CHANGED
[diff body for this file was not captured]
data/spec/support/integration_helper.rb
CHANGED
@@ -6,16 +6,16 @@ module Wukong
|
|
6
6
|
@root ||= Pathname.new(File.expand_path('../../..', __FILE__))
|
7
7
|
end
|
8
8
|
|
9
|
-
def lib_dir
|
10
|
-
root.join('lib')
|
9
|
+
def lib_dir *args
|
10
|
+
root.join('lib', *args)
|
11
11
|
end
|
12
12
|
|
13
|
-
def bin_dir
|
14
|
-
root.join('bin')
|
13
|
+
def bin_dir *args
|
14
|
+
root.join('bin', *args)
|
15
15
|
end
|
16
16
|
|
17
|
-
def examples_dir
|
18
|
-
root.join('examples')
|
17
|
+
def examples_dir *args
|
18
|
+
root.join('examples', *args)
|
19
19
|
end
|
20
20
|
|
21
21
|
def integration_env
|
@@ -30,7 +30,7 @@ module Wukong
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def example_script *args
|
33
|
-
examples_dir
|
33
|
+
examples_dir(*args)
|
34
34
|
end
|
35
35
|
|
36
36
|
end
|
data/spec/wukong-hadoop/hadoop_mode_spec.rb
CHANGED
@@ -4,8 +4,9 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
4
4
|
|
5
5
|
let(:map_only) { driver('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
|
6
6
|
let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
|
7
|
-
let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
|
8
|
-
let(:
|
7
|
+
let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye'], :reduce_tasks => 20) }
|
8
|
+
let(:custom_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
|
9
|
+
let(:many_files) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', files: %w[/file/1 /file/2], archives: %w[/archive/1 /archive/2], jars: %w[/jar/1 /jar/2])}
|
9
10
|
|
10
11
|
context "defining input paths" do
|
11
12
|
it "raises an error unless given an --input option" do
|
@@ -15,7 +16,7 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
15
16
|
map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
|
16
17
|
end
|
17
18
|
it "sets its input format given the --input_format option" do
|
18
|
-
|
19
|
+
custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
|
19
20
|
end
|
20
21
|
end
|
21
22
|
|
@@ -27,7 +28,7 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
27
28
|
map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
|
28
29
|
end
|
29
30
|
it "sets its output format given the --output_format option" do
|
30
|
-
|
31
|
+
custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
@@ -43,6 +44,18 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
43
44
|
end
|
44
45
|
end
|
45
46
|
|
47
|
+
context "setting the number of reduce tasks" do
|
48
|
+
it "does nothing on a map/reduce job" do
|
49
|
+
map_reduce.hadoop_commandline.should_not match(%r{-D mapred.reduce.tasks})
|
50
|
+
end
|
51
|
+
it "respects the option when given" do
|
52
|
+
complex.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=20})
|
53
|
+
end
|
54
|
+
it "sets reduce tasks to 0 for a map-only job" do
|
55
|
+
map_only.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=0})
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
46
59
|
context "defining Hadoop JobConf options" do
|
47
60
|
it "translates friendly names into native ones" do
|
48
61
|
complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
|
@@ -54,7 +67,7 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
54
67
|
end
|
55
68
|
|
56
69
|
context "removing existing output paths" do
|
57
|
-
|
70
|
+
|
58
71
|
it "will not remove the output path by default" do
|
59
72
|
map_reduce.should_not_receive(:remove_output_path!)
|
60
73
|
map_reduce.should_receive(:execute_command!)
|
@@ -72,7 +85,24 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
72
85
|
d.should_receive(:execute_command!)
|
73
86
|
d.run!
|
74
87
|
end
|
75
|
-
|
88
|
+
end
|
89
|
+
|
90
|
+
context "handle files, jars, and archives" do
|
91
|
+
it "does not include any files, jars, or archives when no files were passed" do
|
92
|
+
map_reduce.hadoop_commandline.should_not match(%r{-(files|archives|libjars)})
|
93
|
+
end
|
94
|
+
it "should include files when asked" do
|
95
|
+
many_files.hadoop_commandline.should match(%r{-files\s+'/file/1,/file/2'})
|
96
|
+
end
|
97
|
+
it "should include jars when asked" do
|
98
|
+
many_files.hadoop_commandline.should match(%r{-libjars\s+'/jar/1,/jar/2'})
|
99
|
+
end
|
100
|
+
it "should include archives when asked" do
|
101
|
+
many_files.hadoop_commandline.should match(%r{-archives\s+'/archive/1,/archive/2'})
|
102
|
+
end
|
103
|
+
it "should include files when passed files as arguments" do
|
104
|
+
driver(examples_dir('tokenizer.rb'), examples_dir('counter.rb'), input: '/tmp/input1,/tmp/input2', output: '/tmp/output').hadoop_commandline.should match(%r{-files.+tokenizer\.rb,.*counter\.rb})
|
105
|
+
end
|
76
106
|
end
|
77
107
|
|
78
108
|
end
|
data/wukong-hadoop.gemspec
CHANGED
@@ -25,9 +25,5 @@ EOF
|
|
25
25
|
gem.test_files = gem.files.grep(/^spec/)
|
26
26
|
gem.require_paths = ['lib']
|
27
27
|
|
28
|
-
gem.add_dependency('wukong', '3.0.0.
|
29
|
-
|
30
|
-
gem.add_development_dependency 'rake', '~> 0.9'
|
31
|
-
gem.add_development_dependency 'rspec', '~> 2'
|
32
|
-
|
28
|
+
gem.add_dependency('wukong', '3.0.0.pre3')
|
33
29
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong-hadoop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-12-
|
14
|
+
date: 2012-12-17 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: wukong
|
@@ -20,7 +20,7 @@ dependencies:
|
|
20
20
|
requirements:
|
21
21
|
- - '='
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version: 3.0.0.
|
23
|
+
version: 3.0.0.pre3
|
24
24
|
type: :runtime
|
25
25
|
prerelease: false
|
26
26
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -28,39 +28,7 @@ dependencies:
|
|
28
28
|
requirements:
|
29
29
|
- - '='
|
30
30
|
- !ruby/object:Gem::Version
|
31
|
-
version: 3.0.0.
|
32
|
-
- !ruby/object:Gem::Dependency
|
33
|
-
name: rake
|
34
|
-
requirement: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
|
-
requirements:
|
37
|
-
- - ~>
|
38
|
-
- !ruby/object:Gem::Version
|
39
|
-
version: '0.9'
|
40
|
-
type: :development
|
41
|
-
prerelease: false
|
42
|
-
version_requirements: !ruby/object:Gem::Requirement
|
43
|
-
none: false
|
44
|
-
requirements:
|
45
|
-
- - ~>
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0.9'
|
48
|
-
- !ruby/object:Gem::Dependency
|
49
|
-
name: rspec
|
50
|
-
requirement: !ruby/object:Gem::Requirement
|
51
|
-
none: false
|
52
|
-
requirements:
|
53
|
-
- - ~>
|
54
|
-
- !ruby/object:Gem::Version
|
55
|
-
version: '2'
|
56
|
-
type: :development
|
57
|
-
prerelease: false
|
58
|
-
version_requirements: !ruby/object:Gem::Requirement
|
59
|
-
none: false
|
60
|
-
requirements:
|
61
|
-
- - ~>
|
62
|
-
- !ruby/object:Gem::Version
|
63
|
-
version: '2'
|
31
|
+
version: 3.0.0.pre3
|
64
32
|
description: ! " Treat your dataset like a:\n\n * stream of lines when it's
|
65
33
|
efficient to process by lines\n * stream of field arrays when it's efficient
|
66
34
|
to deal directly with fields\n * stream of lightweight objects when it's efficient
|