wukong-hadoop 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +5 -0
- data/lib/wukong-hadoop/configuration.rb +5 -1
- data/lib/wukong-hadoop/driver.rb +4 -3
- data/lib/wukong-hadoop/driver/hadoop_invocation.rb +22 -6
- data/lib/wukong-hadoop/version.rb +1 -1
- data/spec/spec_helper.rb +1 -0
- data/spec/support/integration_helper.rb +7 -7
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +36 -6
- data/wukong-hadoop.gemspec +1 -5
- metadata +4 -36
data/Gemfile
CHANGED
@@ -16,6 +16,7 @@ module Wukong
|
|
16
16
|
settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
|
17
17
|
settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
|
18
18
|
settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
|
19
|
+
settings.define :reduce_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
|
19
20
|
settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
|
20
21
|
settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
|
21
22
|
settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
|
@@ -39,7 +40,10 @@ module Wukong
|
|
39
40
|
settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
|
40
41
|
settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
|
41
42
|
settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
|
42
|
-
settings.define :java_opts, wukong_hadoop: true, description: 'Additional
|
43
|
+
settings.define :java_opts, wukong_hadoop: true, description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
|
44
|
+
settings.define :files, wukong_hadoop: true, description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
|
45
|
+
settings.define :jars, wukong_hadoop: true, description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
|
46
|
+
settings.define :archives, wukong_hadoop: true, description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []
|
43
47
|
|
44
48
|
# Options given on the command-line
|
45
49
|
settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
|
data/lib/wukong-hadoop/driver.rb
CHANGED
@@ -24,6 +24,7 @@ module Wukong
|
|
24
24
|
include ReduceLogic
|
25
25
|
include HadoopInvocation
|
26
26
|
include LocalInvocation
|
27
|
+
include Logging
|
27
28
|
|
28
29
|
# The settings used by this driver.
|
29
30
|
#
|
@@ -55,12 +56,12 @@ module Wukong
|
|
55
56
|
# Run this driver.
|
56
57
|
def run!
|
57
58
|
if mode == :local
|
58
|
-
#
|
59
|
+
# log.info "Launching local!"
|
59
60
|
execute_command!(local_commandline)
|
60
61
|
else
|
61
62
|
ensure_input_and_output!
|
62
63
|
remove_output_path! if settings[:rm] || settings[:overwrite]
|
63
|
-
|
64
|
+
log.info "Launching Hadoop!"
|
64
65
|
execute_command!(hadoop_commandline)
|
65
66
|
end
|
66
67
|
end
|
@@ -177,7 +178,7 @@ module Wukong
|
|
177
178
|
def execute_command!(*args)
|
178
179
|
command = args.flatten.reject(&:blank?).join(" \\\n ")
|
179
180
|
if settings[:dry_run]
|
180
|
-
|
181
|
+
log.info("Dry run:")
|
181
182
|
puts command
|
182
183
|
else
|
183
184
|
puts `#{command}`
|
@@ -16,8 +16,8 @@ module Wukong
|
|
16
16
|
# Will not actually do anything if the <tt>--dry_run</tt> option
|
17
17
|
# is also given.
|
18
18
|
def remove_output_path!
|
19
|
-
cmd = %Q{#{
|
20
|
-
|
19
|
+
cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
|
20
|
+
log.info "Removing output file #{output_path}: #{cmd}"
|
21
21
|
puts `#{cmd}` unless settings[:dry_run]
|
22
22
|
end
|
23
23
|
|
@@ -35,11 +35,11 @@ module Wukong
|
|
35
35
|
hadoop_jobconf_options,
|
36
36
|
"-D mapred.job.name='#{job_name}'",
|
37
37
|
hadoop_other_args,
|
38
|
+
hadoop_files,
|
38
39
|
"-mapper '#{mapper_commandline}'",
|
39
40
|
"-reducer '#{reducer_commandline}'",
|
40
41
|
"-input '#{input_paths}'",
|
41
42
|
"-output '#{output_path}'",
|
42
|
-
hadoop_files,
|
43
43
|
io_formats,
|
44
44
|
hadoop_recycle_env,
|
45
45
|
].flatten.compact.join(" \t\\\n ")
|
@@ -103,7 +103,7 @@ module Wukong
|
|
103
103
|
settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
|
104
104
|
settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
|
105
105
|
# If no reducer and no reduce_command, then skip the reduce phase
|
106
|
-
settings[:reduce_tasks]
|
106
|
+
settings[:reduce_tasks] ||= 0 unless reduce?
|
107
107
|
# Fields hadoop should use to distribute records to reducers
|
108
108
|
unless settings[:partition_fields].blank?
|
109
109
|
jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
|
@@ -146,8 +146,24 @@ module Wukong
|
|
146
146
|
#
|
147
147
|
# http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
|
148
148
|
def hadoop_files
|
149
|
-
args.find_all { |arg| arg.to_s =~ /\.rb$/ }.
|
150
|
-
|
149
|
+
args.find_all { |arg| arg.to_s =~ /\.rb$/ }.each do |arg|
|
150
|
+
settings[:files] << arg
|
151
|
+
end
|
152
|
+
|
153
|
+
[].tap do |files_options|
|
154
|
+
{
|
155
|
+
:files => '-files ',
|
156
|
+
:jars => '-libjars ',
|
157
|
+
:archives => '-archives '
|
158
|
+
}.each_pair do |file_type_name, file_option_name|
|
159
|
+
unless settings[file_type_name].nil? || settings[file_type_name].empty?
|
160
|
+
files = settings[file_type_name].map do |file_name_or_glob|
|
161
|
+
# Don't glob on the HDFS
|
162
|
+
file_type_name == :archives ? file_name_or_glob : [Dir[file_name_or_glob], file_name_or_glob]
|
163
|
+
end.flatten.compact.uniq.join(',')
|
164
|
+
files_options << "#{file_option_name}'#{files}'"
|
165
|
+
end
|
166
|
+
end
|
151
167
|
end
|
152
168
|
end
|
153
169
|
|
data/spec/spec_helper.rb
CHANGED
@@ -6,16 +6,16 @@ module Wukong
|
|
6
6
|
@root ||= Pathname.new(File.expand_path('../../..', __FILE__))
|
7
7
|
end
|
8
8
|
|
9
|
-
def lib_dir
|
10
|
-
root.join('lib')
|
9
|
+
def lib_dir *args
|
10
|
+
root.join('lib', *args)
|
11
11
|
end
|
12
12
|
|
13
|
-
def bin_dir
|
14
|
-
root.join('bin')
|
13
|
+
def bin_dir *args
|
14
|
+
root.join('bin', *args)
|
15
15
|
end
|
16
16
|
|
17
|
-
def examples_dir
|
18
|
-
root.join('examples')
|
17
|
+
def examples_dir *args
|
18
|
+
root.join('examples', *args)
|
19
19
|
end
|
20
20
|
|
21
21
|
def integration_env
|
@@ -30,7 +30,7 @@ module Wukong
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def example_script *args
|
33
|
-
examples_dir
|
33
|
+
examples_dir(*args)
|
34
34
|
end
|
35
35
|
|
36
36
|
end
|
@@ -4,8 +4,9 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
4
4
|
|
5
5
|
let(:map_only) { driver('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
|
6
6
|
let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
|
7
|
-
let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
|
8
|
-
let(:
|
7
|
+
let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye'], :reduce_tasks => 20) }
|
8
|
+
let(:custom_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
|
9
|
+
let(:many_files) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', files: %w[/file/1 /file/2], archives: %w[/archive/1 /archive/2], jars: %w[/jar/1 /jar/2])}
|
9
10
|
|
10
11
|
context "defining input paths" do
|
11
12
|
it "raises an error unless given an --input option" do
|
@@ -15,7 +16,7 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
15
16
|
map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
|
16
17
|
end
|
17
18
|
it "sets its input format given the --input_format option" do
|
18
|
-
|
19
|
+
custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
|
19
20
|
end
|
20
21
|
end
|
21
22
|
|
@@ -27,7 +28,7 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
27
28
|
map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
|
28
29
|
end
|
29
30
|
it "sets its output format given the --output_format option" do
|
30
|
-
|
31
|
+
custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
|
31
32
|
end
|
32
33
|
end
|
33
34
|
|
@@ -43,6 +44,18 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
43
44
|
end
|
44
45
|
end
|
45
46
|
|
47
|
+
context "setting the number of reduce tasks" do
|
48
|
+
it "does nothing on a map/reduce job" do
|
49
|
+
map_reduce.hadoop_commandline.should_not match(%r{-D mapred.reduce.tasks})
|
50
|
+
end
|
51
|
+
it "respects the option when given" do
|
52
|
+
complex.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=20})
|
53
|
+
end
|
54
|
+
it "sets reduce tasks to 0 for a map-only job" do
|
55
|
+
map_only.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=0})
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
46
59
|
context "defining Hadoop JobConf options" do
|
47
60
|
it "translates friendly names into native ones" do
|
48
61
|
complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
|
@@ -54,7 +67,7 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
54
67
|
end
|
55
68
|
|
56
69
|
context "removing existing output paths" do
|
57
|
-
|
70
|
+
|
58
71
|
it "will not remove the output path by default" do
|
59
72
|
map_reduce.should_not_receive(:remove_output_path!)
|
60
73
|
map_reduce.should_receive(:execute_command!)
|
@@ -72,7 +85,24 @@ describe Wukong::Hadoop::HadoopInvocation do
|
|
72
85
|
d.should_receive(:execute_command!)
|
73
86
|
d.run!
|
74
87
|
end
|
75
|
-
|
88
|
+
end
|
89
|
+
|
90
|
+
context "handle files, jars, and archives" do
|
91
|
+
it "does not include any files, jars, or archives when no files were passed" do
|
92
|
+
map_reduce.hadoop_commandline.should_not match(%r{-(files|archives|libjars)})
|
93
|
+
end
|
94
|
+
it "should include files when asked" do
|
95
|
+
many_files.hadoop_commandline.should match(%r{-files\s+'/file/1,/file/2'})
|
96
|
+
end
|
97
|
+
it "should include jars when asked" do
|
98
|
+
many_files.hadoop_commandline.should match(%r{-libjars\s+'/jar/1,/jar/2'})
|
99
|
+
end
|
100
|
+
it "should include archives when asked" do
|
101
|
+
many_files.hadoop_commandline.should match(%r{-archives\s+'/archive/1,/archive/2'})
|
102
|
+
end
|
103
|
+
it "should include files when passed files as arguments" do
|
104
|
+
driver(examples_dir('tokenizer.rb'), examples_dir('counter.rb'), input: '/tmp/input1,/tmp/input2', output: '/tmp/output').hadoop_commandline.should match(%r{-files.+tokenizer\.rb,.*counter\.rb})
|
105
|
+
end
|
76
106
|
end
|
77
107
|
|
78
108
|
end
|
data/wukong-hadoop.gemspec
CHANGED
@@ -25,9 +25,5 @@ EOF
|
|
25
25
|
gem.test_files = gem.files.grep(/^spec/)
|
26
26
|
gem.require_paths = ['lib']
|
27
27
|
|
28
|
-
gem.add_dependency('wukong', '3.0.0.
|
29
|
-
|
30
|
-
gem.add_development_dependency 'rake', '~> 0.9'
|
31
|
-
gem.add_development_dependency 'rspec', '~> 2'
|
32
|
-
|
28
|
+
gem.add_dependency('wukong', '3.0.0.pre3')
|
33
29
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong-hadoop
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,7 +11,7 @@ authors:
|
|
11
11
|
autorequire:
|
12
12
|
bindir: bin
|
13
13
|
cert_chain: []
|
14
|
-
date: 2012-12-
|
14
|
+
date: 2012-12-17 00:00:00.000000000 Z
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: wukong
|
@@ -20,7 +20,7 @@ dependencies:
|
|
20
20
|
requirements:
|
21
21
|
- - '='
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version: 3.0.0.
|
23
|
+
version: 3.0.0.pre3
|
24
24
|
type: :runtime
|
25
25
|
prerelease: false
|
26
26
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -28,39 +28,7 @@ dependencies:
|
|
28
28
|
requirements:
|
29
29
|
- - '='
|
30
30
|
- !ruby/object:Gem::Version
|
31
|
-
version: 3.0.0.
|
32
|
-
- !ruby/object:Gem::Dependency
|
33
|
-
name: rake
|
34
|
-
requirement: !ruby/object:Gem::Requirement
|
35
|
-
none: false
|
36
|
-
requirements:
|
37
|
-
- - ~>
|
38
|
-
- !ruby/object:Gem::Version
|
39
|
-
version: '0.9'
|
40
|
-
type: :development
|
41
|
-
prerelease: false
|
42
|
-
version_requirements: !ruby/object:Gem::Requirement
|
43
|
-
none: false
|
44
|
-
requirements:
|
45
|
-
- - ~>
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0.9'
|
48
|
-
- !ruby/object:Gem::Dependency
|
49
|
-
name: rspec
|
50
|
-
requirement: !ruby/object:Gem::Requirement
|
51
|
-
none: false
|
52
|
-
requirements:
|
53
|
-
- - ~>
|
54
|
-
- !ruby/object:Gem::Version
|
55
|
-
version: '2'
|
56
|
-
type: :development
|
57
|
-
prerelease: false
|
58
|
-
version_requirements: !ruby/object:Gem::Requirement
|
59
|
-
none: false
|
60
|
-
requirements:
|
61
|
-
- - ~>
|
62
|
-
- !ruby/object:Gem::Version
|
63
|
-
version: '2'
|
31
|
+
version: 3.0.0.pre3
|
64
32
|
description: ! " Treat your dataset like a:\n\n * stream of lines when it's
|
65
33
|
efficient to process by lines\n * stream of field arrays when it's efficient
|
66
34
|
to deal directly with fields\n * stream of lightweight objects when it's efficient
|