wukong-hadoop 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +59 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/README.md +339 -0
- data/Rakefile +13 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-bzip +23 -0
- data/bin/hdp-cat +3 -0
- data/bin/hdp-catd +3 -0
- data/bin/hdp-cp +3 -0
- data/bin/hdp-du +86 -0
- data/bin/hdp-get +3 -0
- data/bin/hdp-kill +3 -0
- data/bin/hdp-kill-task +3 -0
- data/bin/hdp-ls +11 -0
- data/bin/hdp-mkdir +2 -0
- data/bin/hdp-mkdirp +12 -0
- data/bin/hdp-mv +3 -0
- data/bin/hdp-parts_to_keys.rb +77 -0
- data/bin/hdp-ps +3 -0
- data/bin/hdp-put +3 -0
- data/bin/hdp-rm +32 -0
- data/bin/hdp-sort +40 -0
- data/bin/hdp-stream +40 -0
- data/bin/hdp-stream-flat +22 -0
- data/bin/hdp-stream2 +39 -0
- data/bin/hdp-sync +17 -0
- data/bin/hdp-wc +67 -0
- data/bin/wu-hadoop +14 -0
- data/examples/counter.rb +17 -0
- data/examples/map_only.rb +28 -0
- data/examples/processors.rb +4 -0
- data/examples/sonnet_18.txt +14 -0
- data/examples/tokenizer.rb +28 -0
- data/examples/word_count.rb +44 -0
- data/features/step_definitions/wu_hadoop_steps.rb +4 -0
- data/features/support/env.rb +1 -0
- data/features/wu_hadoop.feature +113 -0
- data/lib/wukong-hadoop.rb +21 -0
- data/lib/wukong-hadoop/configuration.rb +133 -0
- data/lib/wukong-hadoop/driver.rb +190 -0
- data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
- data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
- data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
- data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
- data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
- data/lib/wukong-hadoop/extensions.rb +2 -0
- data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
- data/lib/wukong-hadoop/version.rb +6 -0
- data/spec/spec_helper.rb +21 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +39 -0
- data/spec/wukong-hadoop/driver_spec.rb +117 -0
- data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
- data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
- data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
- data/wukong-hadoop.gemspec +33 -0
- metadata +168 -0
#!/usr/bin/env ruby
# Sync a local directory up to an HDFS directory: files absent on the
# HDFS side are copied with `hadoop dfs -put`; files present on both
# sides but with differing listings are reported (never overwritten).
require 'wukong'

local_dir, hdfs_dir = ARGV[0..1]
local_files = Dir[local_dir + '/*']
hdfs_files  = Wukong::Dfs.list_files hdfs_dir

Wukong::Dfs.compare_listings(local_files, hdfs_files) do |status, src_file, dest_file|
  case status
  when :missing
    # Not on the HDFS side yet -- copy it up.
    target = "%s/%s" % [hdfs_dir, dest_file]
    puts "Copying #{src_file} #{target}"
    puts `hadoop dfs -put #{src_file} #{target}`
  when :differ
    # Exists on both sides but the listings disagree -- just report it.
    local_listing = `ls -l #{src_file}`.split(/\s+/).join("\t")
    puts "Differ: #{local_listing} \n#{dest_file}"
  end
end
|
#!/usr/bin/env ruby
require 'wukong'
NEWLINE_LENGTH = $/.length # KLUDGE

# A wc(1)-alike as a Wukong streaming job: each mapper tallies lines,
# tab-separated fields, words, characters and bytes for its input split;
# a single reducer sums the per-mapper tallies.
#
# !! The +words+ count comes out higher than that of +wc+ -- don't know
# why. (It's close: a 10GB, 1M line dataset it showed 367833839 vs. 367713271)
#
class WcMapper < Wukong::Streamer::LineStreamer
  attr_accessor :lines, :fields, :words, :chars, :bytes

  # Zero every counter before streaming begins.
  def before_stream
    @lines  = 0
    @fields = 0
    @words  = 0
    @chars  = 0
    @bytes  = 0
  end

  # Fold one input line into the running tallies.
  def process line
    return unless line
    @lines  += 1
    @fields += line.count("\t") + 1
    @words  += line.strip.scan(/\s+/).length + 1 unless line.blank?
    @chars  += line.chars.to_a.length + NEWLINE_LENGTH
    @bytes  += line.bytesize + NEWLINE_LENGTH
    # Char vs byte count disagreement flags multibyte input for inspection.
    $stderr.puts line if (line.chars.to_a.length != line.bytesize)
  end

  # Emit this mapper's totals as a single record.
  def after_stream
    emit [lines, fields, words, chars, bytes]
  end
end

# Sums the per-mapper tallies emitted by WcMapper into grand totals.
class WcReducer < Wukong::Streamer::Base
  attr_accessor :lines, :fields, :words, :chars, :bytes

  # Zero every accumulator before streaming begins.
  def before_stream
    @lines, @fields, @words, @chars, @bytes = 0, 0, 0, 0, 0
  end

  # Add one mapper's tallies (arrive as strings) to the grand totals.
  def process m_lines, m_fields, m_words, m_chars, m_bytes
    @lines  += m_lines.to_i
    @fields += m_fields.to_i
    @words  += m_words.to_i
    @chars  += m_chars.to_i
    @bytes  += m_bytes.to_i
  end

  # Emit the grand totals as a single record.
  def after_stream
    emit [lines, fields, words, chars, bytes]
  end
end

# One reducer so all tallies funnel into a single set of totals.
Wukong::Script.new(WcMapper, WcReducer, :reduce_tasks => 1).run

# class FooScript < Wukong::Script
#   def map_command
#     '/usr/bin/wc'
#   end
#   def reduce_command
#     '/bin/cat'
#   end
# end
# FooScript.new(nil, nil, :reduce_tasks => 1).run
#
# ruby -ne 'wc_v = `echo "#{$_.chomp}" | wc`; gr_v=($_.strip.empty? ? 0 : $_.strip.scan(/\s+/).length + 1 ) ; puts [wc_v.chomp, " ", gr_v, $_.chomp].join("\t")'
|
#!/usr/bin/env ruby
# Command-line entry point: boots Wukong with the Hadoop plugin's
# settings and hands the remaining arguments off to the driver.

require 'wukong-hadoop'
settings = Wukong::Hadoop::Configuration
require 'wukong/boot'
Wukong.boot!(settings)

# Inside a deploy pack, every spawned command must go through Bundler.
settings[:command_prefix] = 'bundle exec' if Wukong.in_deploy_pack?

# Nothing left on the command line to run: print usage and bail.
if settings.rest.empty?
  settings.dump_help
  exit(1)
end

Wukong::Hadoop::Driver.run(settings, *settings.rest)
|
# An accumulator that tallies the records sharing a key and emits
# "<key>\t<count>" once each group is exhausted.
Wukong.processor(:counter, Wukong::Processor::Accumulator) do

  attr_accessor :count

  # A new key begins: reset the tally.
  def start record
    @count = 0
  end

  # One more record observed for the current key.
  def accumulate record
    @count += 1
  end

  # Emit the group's key and its final count, tab-separated.
  def finalize
    yield [key, count].join("\t")
  end

end
|
# A map-only tokenizer: splits each input line into tokens, scrubs
# unwanted characters, and yields the tokens of acceptable length.
Wukong.processor(:mapper) do

  field :min_length, Integer,  :default => 1
  field :max_length, Integer,  :default => 256
  field :split_on,   Regexp,   :default => /\s+/
  field :remove,     Regexp,   :default => /[^a-zA-Z0-9\']+/
  field :fold_case,  :boolean, :default => false

  # Yield each acceptable token found in +string+.
  def process string
    tokenize(string).each { |token| yield(token) if acceptable?(token) }
  end

  private

  # Split on +split_on+, delete characters matching +remove+, and
  # downcase the result when +fold_case+ is set.
  def tokenize string
    string.split(split_on).map do |raw|
      cleaned = raw.gsub(remove, '')
      fold_case ? cleaned.downcase : cleaned
    end
  end

  # True when the token's length lies within [min_length, max_length].
  def acceptable? token
    token.length >= min_length && token.length <= max_length
  end

end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
Shall I compare thee to a summer's day?
|
2
|
+
Thou art more lovely and more temperate:
|
3
|
+
Rough winds do shake the darling buds of May,
|
4
|
+
And summer's lease hath all too short a date:
|
5
|
+
Sometime too hot the eye of heaven shines,
|
6
|
+
And often is his gold complexion dimm'd;
|
7
|
+
And every fair from fair sometime declines,
|
8
|
+
By chance or nature's changing course untrimm'd;
|
9
|
+
But thy eternal summer shall not fade
|
10
|
+
Nor lose possession of that fair thou owest;
|
11
|
+
Nor shall Death brag thou wander'st in his shade,
|
12
|
+
When in eternal lines to time thou growest:
|
13
|
+
So long as men can breathe or eyes can see,
|
14
|
+
So long lives this and this gives life to thee.
|
# Splits each input line into tokens, scrubs unwanted characters, and
# yields the tokens whose length is acceptable.
Wukong.processor(:tokenizer) do

  field :min_length, Integer,  :default => 1
  field :max_length, Integer,  :default => 256
  field :split_on,   Regexp,   :default => /\s+/
  field :remove,     Regexp,   :default => /[^a-zA-Z0-9\']+/
  field :fold_case,  :boolean, :default => false

  # Yield each acceptable token found in +string+.
  def process string
    tokenize(string).each do |candidate|
      yield candidate if acceptable?(candidate)
    end
  end

  private

  # Break the line apart on +split_on+, strip characters matching
  # +remove+, and optionally normalize case.
  def tokenize string
    string.split(split_on).map do |piece|
      scrubbed = piece.gsub(remove, '')
      fold_case ? scrubbed.downcase : scrubbed
    end
  end

  # A token is acceptable when its length falls between the bounds.
  def acceptable? token
    token.length.between?(min_length, max_length)
  end

end
|
# Word count as a classic map/reduce pair: the mapper tokenizes each
# input line; the reducer tallies occurrences per token.
Wukong.processor(:mapper) do

  field :min_length, Integer,  :default => 1
  field :max_length, Integer,  :default => 256
  field :split_on,   Regexp,   :default => /\s+/
  field :remove,     Regexp,   :default => /[^a-zA-Z0-9\']+/
  field :fold_case,  :boolean, :default => false

  # Yield each acceptable token found in +string+.
  def process string
    tokenize(string).each do |token|
      yield token if acceptable?(token)
    end
  end

  private

  # Split the line on +split_on+, scrub characters matching +remove+,
  # and downcase when +fold_case+ is set.
  def tokenize string
    string.split(split_on).map do |piece|
      scrubbed = piece.gsub(remove, '')
      fold_case ? scrubbed.downcase : scrubbed
    end
  end

  # True when the token's length falls within [min_length, max_length].
  def acceptable? token
    token.length.between?(min_length, max_length)
  end
end

# Accumulates the records sharing a key; emits "<key>\t<count>" per group.
Wukong.processor(:reducer, Wukong::Processor::Accumulator) do

  attr_accessor :count

  # A new key begins: reset the tally.
  def start record
    @count = 0
  end

  # One more record for the current key.
  def accumulate record
    @count += 1
  end

  # Emit the key and its count, tab-separated.
  def finalize
    yield [key, count].join("\t")
  end
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'aruba/cucumber'
|
@@ -0,0 +1,113 @@
|
|
1
|
+
Feature: Run wu-hadoop from the command line
|
2
|
+
In order to execute hadoop streaming commands
|
3
|
+
As a user of wu-hadoop
|
4
|
+
I should be able run wu-hadoop with wukong processors
|
5
|
+
|
6
|
+
Scenario: Simple wu-hadoop command
|
7
|
+
Given a wukong script "examples/word_count.rb"
|
8
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar `
|
9
|
+
Then the output should contain:
|
10
|
+
"""
|
11
|
+
/usr/lib/hadoop/bin/hadoop \
|
12
|
+
jar /usr/lib/hadoop/contrib/streaming/hadoop-*streaming*.jar \
|
13
|
+
-D mapred.job.name='word_count.rb---/foo---/bar' \
|
14
|
+
"""
|
15
|
+
And the output should match:
|
16
|
+
"""
|
17
|
+
-mapper '.*ruby bundle exec wu-local .*word_count.rb --run=mapper ' \\
|
18
|
+
-reducer '.*ruby bundle exec wu-local .*word_count.rb --run=reducer ' \\
|
19
|
+
"""
|
20
|
+
And the output should contain:
|
21
|
+
"""
|
22
|
+
-input '/foo' \
|
23
|
+
-output '/bar' \
|
24
|
+
"""
|
25
|
+
And the output should match:
|
26
|
+
"""
|
27
|
+
-file '.*word_count.rb' \\
|
28
|
+
-cmdenv 'BUNDLE_GEMFILE=.*wukong-hadoop/Gemfile'
|
29
|
+
"""
|
30
|
+
|
31
|
+
Scenario: A wu-hadoop command without an input or output
|
32
|
+
Given a wukong script "examples/word_count.rb"
|
33
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run`
|
34
|
+
Then the output should contain:
|
35
|
+
"""
|
36
|
+
Missing values for: input (Comma-separated list of input paths.), output (Output directory for the hdfs.)
|
37
|
+
"""
|
38
|
+
|
39
|
+
Scenario: Specifying an alternative gemfile
|
40
|
+
Given a wukong script "examples/word_count.rb"
|
41
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --gemfile=alt/Gemfile`
|
42
|
+
Then the output should contain:
|
43
|
+
"""
|
44
|
+
-cmdenv 'BUNDLE_GEMFILE=alt/Gemfile'
|
45
|
+
"""
|
46
|
+
|
47
|
+
Scenario: Skipping the reduce step
|
48
|
+
Given a file named "wukong_script.rb" with:
|
49
|
+
"""
|
50
|
+
Wukong.processor(:mapper) do
|
51
|
+
|
52
|
+
end
|
53
|
+
"""
|
54
|
+
When I run `bundle exec wu-hadoop wukong_script.rb --dry_run --input=/foo --output=/bar`
|
55
|
+
Then the output should contain:
|
56
|
+
"""
|
57
|
+
-D mapred.reduce.tasks=0 \
|
58
|
+
"""
|
59
|
+
|
60
|
+
Scenario: A processor without a mapper
|
61
|
+
Given a file named "wukong_script.rb" with:
|
62
|
+
"""
|
63
|
+
Wukong.processor(:reducer) do
|
64
|
+
|
65
|
+
end
|
66
|
+
"""
|
67
|
+
When I run `bundle exec wu-hadoop wukong_script.rb --dry_run --input=/foo --output=/bar`
|
68
|
+
Then the output should match:
|
69
|
+
"""
|
70
|
+
No :mapper definition found in .*wukong_script.rb
|
71
|
+
"""
|
72
|
+
|
73
|
+
Scenario: Translating hadoop jobconf options
|
74
|
+
Given a wukong script "examples/word_count.rb"
|
75
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --max_tracker_failures=12`
|
76
|
+
Then the output should match:
|
77
|
+
"""
|
78
|
+
-D mapred.max.tracker.failures=12 \\
|
79
|
+
"""
|
80
|
+
|
81
|
+
Scenario: Passing along extra configuration options
|
82
|
+
Given a wukong script "examples/word_count.rb"
|
83
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --foo=bar`
|
84
|
+
Then the output should match:
|
85
|
+
"""
|
86
|
+
-mapper '.* --foo=bar' \\
|
87
|
+
-reducer '.* --foo=bar' \\
|
88
|
+
"""
|
89
|
+
|
90
|
+
Scenario: Specifying input and output formats
|
91
|
+
Given a wukong script "examples/word_count.rb"
|
92
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --input_format=com.foo.BarInputFormat`
|
93
|
+
Then the output should contain:
|
94
|
+
"""
|
95
|
+
-inputformat 'com.foo.BarInputFormat' \
|
96
|
+
"""
|
97
|
+
|
98
|
+
Scenario: Specifying additional java options
|
99
|
+
Given a wukong script "examples/word_count.rb"
|
100
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --dry_run --input=/foo --output=/bar --java_opts=-Dfoo.bar=baz,-Dother.opts=cool`
|
101
|
+
Then the output should contain:
|
102
|
+
"""
|
103
|
+
-D foo.bar=baz \
|
104
|
+
-D other.opts=cool \
|
105
|
+
"""
|
106
|
+
|
107
|
+
Scenario: Failed hadoop job
|
108
|
+
Given a wukong script "examples/word_count.rb"
|
109
|
+
When I run `bundle exec wu-hadoop examples/word_count.rb --input=/foo --output=/bar`
|
110
|
+
Then the output should contain:
|
111
|
+
"""
|
112
|
+
Streaming command failed!
|
113
|
+
"""
|
require 'configliere'
require 'pathname'
require 'rbconfig'
require 'wukong'

module Wukong
  # Wukong-Hadoop is a plugin for Wukong that lets you develop, test,
  # and run map/reduce type workflows both locally and in the context
  # of a Hadoop cluster.
  #
  # Its bundled <tt>wu-hadoop</tt> binary executes Ruby files
  # containing Wukong processors as well as built-in Wukong widgets.
  module Hadoop
  end
end

require 'wukong-hadoop/configuration'
require 'wukong-hadoop/driver'
require 'wukong-hadoop/extensions'
@@ -0,0 +1,133 @@
|
|
1
|
+
module Wukong
|
2
|
+
module Hadoop
|
3
|
+
|
4
|
+
# Configure the given settings object for use with Wukong::Hadoop.
|
5
|
+
#
|
6
|
+
# @param [Configliere::Param] settings the settings to configure
|
7
|
+
# @return [Configliere::Param the configured settings
|
8
|
+
def self.configure settings
|
9
|
+
# Hadoop Options
|
10
|
+
settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
|
11
|
+
settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
|
12
|
+
|
13
|
+
# Translate simplified args to their hairy hadoop equivalents
|
14
|
+
settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
|
15
|
+
settings.define :io_sort_record_percent, wukong_hadoop: true, jobconf: true, description: 'io.sort.record.percent'
|
16
|
+
settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
|
17
|
+
settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
|
18
|
+
settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
|
19
|
+
settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
|
20
|
+
settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
|
21
|
+
settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
|
22
|
+
settings.define :max_node_map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.map.tasks.maximum'
|
23
|
+
settings.define :max_node_reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.reduce.tasks.maximum'
|
24
|
+
settings.define :max_record_length, wukong_hadoop: true, jobconf: true, description: 'mapred.linerecordreader.maxlength'
|
25
|
+
settings.define :max_reduces_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.cluster'
|
26
|
+
settings.define :max_reduces_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.node'
|
27
|
+
settings.define :max_tracker_failures, wukong_hadoop: true, jobconf: true, description: 'mapred.max.tracker.failures'
|
28
|
+
settings.define :max_map_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.map.max.attempts'
|
29
|
+
settings.define :max_reduce_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.max.attempts'
|
30
|
+
settings.define :min_split_size, wukong_hadoop: true, jobconf: true, description: 'mapred.min.split.size'
|
31
|
+
settings.define :output_field_separator, wukong_hadoop: true, jobconf: true, description: 'stream.map.output.field.separator'
|
32
|
+
settings.define :partition_fields, wukong_hadoop: true, jobconf: true, description: 'num.key.fields.for.partition'
|
33
|
+
settings.define :reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks'
|
34
|
+
settings.define :respect_exit_status, wukong_hadoop: true, jobconf: true, description: 'stream.non.zero.exit.is.failure'
|
35
|
+
settings.define :reuse_jvms, wukong_hadoop: true, jobconf: true, description: 'mapred.job.reuse.jvm.num.tasks'
|
36
|
+
settings.define :sort_fields, wukong_hadoop: true, jobconf: true, description: 'stream.num.map.output.key.fields'
|
37
|
+
settings.define :timeout, wukong_hadoop: true, jobconf: true, description: 'mapred.task.timeout'
|
38
|
+
settings.define :noempty, wukong_hadoop: true, description: "Don't create zero-byte reduce files"
|
39
|
+
settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
|
40
|
+
settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
|
41
|
+
settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
|
42
|
+
settings.define :java_opts, wukong_hadoop: true, description: 'Additional java options to be passed to hadoop streaming.', :type => Array, :default => []
|
43
|
+
|
44
|
+
# Options given on the command-line
|
45
|
+
settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
|
46
|
+
settings.define :map_command, description: "Shell command to run as mapper, in place of a constructed wu-local command", wukong_hadoop: true
|
47
|
+
settings.define :reduce_command, description: "Shell command to run as reducer, in place of a constructed wu-local command", wukong_hadoop: true
|
48
|
+
settings.define :sort_command, description: "Shell command to run as sorter (only in `local' mode)", wukong_hadoop: true, :default => 'sort'
|
49
|
+
settings.define :command_prefix, description: "Prefex to insert before all Wukong commands", wukong_hadoop: true
|
50
|
+
settings.define :mapper, description: "Name of processor to use as a mapper", wukong_hadoop: true
|
51
|
+
settings.define :reducer, description: "Name of processor to use as a reducer", wukong_hadoop: true
|
52
|
+
settings.define :gemfile, description: "Specify an alternative Gemfile to execute this wukong script with", wukong_hadoop: true
|
53
|
+
settings.define :dry_run, description: "Echo the command that will be run, but don't run it", wukong_hadoop: true, :type => :boolean, :default => false
|
54
|
+
settings.define :rm, description: "Recursively remove the destination directory.", wukong_hadoop: true, :type => :boolean, :default => false
|
55
|
+
settings.define :input, description: "Comma-separated list of input paths", wukong_hadoop: true
|
56
|
+
settings.define :output, description: "Output path.", wukong_hadoop: true
|
57
|
+
|
58
|
+
settings.use(:commandline)
|
59
|
+
|
60
|
+
def settings.usage()
|
61
|
+
"usage: #{File.basename($0)} PROCESSOR|FLOW [PROCESSOR|FLOW] [ --param=value | -p value | --param | -p]"
|
62
|
+
end
|
63
|
+
|
64
|
+
settings.description = <<EOF
|
65
|
+
wu-hadoop is a tool to model and launch Wukong processors as
|
66
|
+
map/reduce workflows within the Hadoop framework.
|
67
|
+
|
68
|
+
Use wu-hadoop with existing processors in `local' mode to test the
|
69
|
+
logic of your job, reading from the specified --input and printing to
|
70
|
+
STDOUT:
|
71
|
+
|
72
|
+
$ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt
|
73
|
+
a 2
|
74
|
+
all 1
|
75
|
+
and 2
|
76
|
+
...
|
77
|
+
|
78
|
+
where it is assumed that your mapper is called 'mapper' and your
|
79
|
+
reducer 'reducer'. You can also cat in data:
|
80
|
+
|
81
|
+
$ cat examples/sonnet_18.txt | wu-hadoop examples/word_count.rb --mode=local
|
82
|
+
|
83
|
+
Or pass options directly:
|
84
|
+
|
85
|
+
$ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt --fold_case --min_length=3
|
86
|
+
all 1
|
87
|
+
and 5
|
88
|
+
art 1
|
89
|
+
brag 1
|
90
|
+
...
|
91
|
+
|
92
|
+
Or define both processors in separate files:
|
93
|
+
|
94
|
+
$ wu-hadoop examples/tokenizer.rb examples/counter.rb --mode=local --input=examples/sonnet_18.txt
|
95
|
+
|
96
|
+
Or by name:
|
97
|
+
|
98
|
+
$ wu-hadoop examples/processors.rb --mode=local --input=examples/sonnet_18.txt --mapper=tokenizer --reducer=counter
|
99
|
+
|
100
|
+
Or just by command:
|
101
|
+
|
102
|
+
$ wu-hadoop processors.rb --mapper=tokenizer --reduce_command='uniq -c' ...
|
103
|
+
$ wu-hadoop processors.rb --map_command='cut -f3' --reducer=counter ...
|
104
|
+
$ wu-hadoop --map_command='cut -f3' --reduce_command='uniq -c' ...
|
105
|
+
|
106
|
+
If you don't specify a --reducer explicitly, and you didn't give two
|
107
|
+
separate arguments, and no processor named :reducer exists in the
|
108
|
+
environment, then we assume you are launching a map-only job and
|
109
|
+
'mapred.tasktracker.reduce.tasks.maximum' will correspondingly be set
|
110
|
+
to 0:
|
111
|
+
|
112
|
+
$ wu-hadoop examples/tokenizer.rb --mode=local --input=examples/sonnet_18.txt
|
113
|
+
Shall
|
114
|
+
I
|
115
|
+
compare
|
116
|
+
thee
|
117
|
+
...
|
118
|
+
|
119
|
+
You can achieve this directly with the --reduce_tasks=0 option.
|
120
|
+
|
121
|
+
Many other Hadoop options have been wrapped with similarly friendly
|
122
|
+
names below. These are ignored when running in `local' mode.
|
123
|
+
|
124
|
+
Some options (like `--sort_command') only make sense in `local' mode.
|
125
|
+
These are ignored in `hadoop' mode.
|
126
|
+
EOF
|
127
|
+
settings
|
128
|
+
end
|
129
|
+
|
130
|
+
# All Hadoop configuration for Wukong lives within this object.
|
131
|
+
Configuration = configure(Configliere::Param.new) unless defined? Configuration
|
132
|
+
end
|
133
|
+
end
|