wukong 1.4.7 → 1.4.9
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.textile +9 -0
- data/README.textile +1 -1
- data/bin/hdp-bzip +28 -0
- data/bin/hdp-mkdir +1 -1
- data/bin/hdp-stream-flat +3 -2
- data/bin/wu-lign +32 -18
- data/docpages/pig/cookbook.html +481 -0
- data/docpages/pig/images/hadoop-logo.jpg +0 -0
- data/docpages/pig/images/instruction_arrow.png +0 -0
- data/docpages/pig/images/pig-logo.gif +0 -0
- data/docpages/pig/piglatin_ref1.html +1103 -0
- data/docpages/pig/piglatin_ref2.html +14340 -0
- data/docpages/pig/setup.html +505 -0
- data/docpages/pig/skin/basic.css +166 -0
- data/docpages/pig/skin/breadcrumbs.js +237 -0
- data/docpages/pig/skin/fontsize.js +166 -0
- data/docpages/pig/skin/getBlank.js +40 -0
- data/docpages/pig/skin/getMenu.js +45 -0
- data/docpages/pig/skin/images/chapter.gif +0 -0
- data/docpages/pig/skin/images/chapter_open.gif +0 -0
- data/docpages/pig/skin/images/current.gif +0 -0
- data/docpages/pig/skin/images/external-link.gif +0 -0
- data/docpages/pig/skin/images/header_white_line.gif +0 -0
- data/docpages/pig/skin/images/page.gif +0 -0
- data/docpages/pig/skin/images/pdfdoc.gif +0 -0
- data/docpages/pig/skin/images/rc-b-l-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-15-1body-2menu-3menu.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png +0 -0
- data/docpages/pig/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png +0 -0
- data/docpages/pig/skin/print.css +54 -0
- data/docpages/pig/skin/profile.css +181 -0
- data/docpages/pig/skin/screen.css +587 -0
- data/docpages/pig/tutorial.html +1059 -0
- data/docpages/pig/udf.html +1509 -0
- data/examples/keystore/conditional_outputter_example.rb +70 -0
- data/examples/{graph → network_graph}/adjacency_list.rb +0 -0
- data/examples/{graph → network_graph}/breadth_first_search.rb +0 -0
- data/examples/{graph → network_graph}/gen_2paths.rb +0 -0
- data/examples/{graph → network_graph}/gen_multi_edge.rb +0 -0
- data/examples/{graph → network_graph}/gen_symmetric_links.rb +0 -0
- data/examples/pagerank/run_pagerank.sh +10 -8
- data/examples/{apache_log_parser.rb → server_logs/apache_log_parser.rb} +0 -0
- data/examples/stupidly_simple_filter.rb +43 -0
- data/lib/wukong/extensions/hash.rb +13 -0
- data/lib/wukong/extensions/hash_like.rb +7 -0
- data/lib/wukong/keystore/cassandra_conditional_outputter.rb +122 -0
- data/lib/wukong/script.rb +27 -22
- data/lib/wukong/script/hadoop_command.rb +5 -3
- data/lib/wukong/streamer/accumulating_reducer.rb +2 -1
- data/wukong.gemspec +64 -26
- metadata +89 -31
- data/docpages/pig/PigLatinReferenceManual.html +0 -19134
- data/examples/foo.rb +0 -9
- data/examples/package-local.rb +0 -100
- data/examples/package.rb +0 -96
- data/examples/run_all.sh +0 -47
@@ -0,0 +1,70 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'cassandra'
|
4
|
+
require 'wukong'
|
5
|
+
require 'wukong/encoding'
|
6
|
+
require 'wukong/keystore/cassandra_conditional_outputter'
|
7
|
+
|
8
|
+
#
|
9
|
+
# Usage:
|
10
|
+
# echo -e "bob has boobs ha ha ha" | ./examples/keystore/conditional_outputter_example.rb --map
|
11
|
+
#
|
12
|
+
|
13
|
+
CASSANDRA_KEYSPACE = 'CorpusAnalysis'
|
14
|
+
|
15
|
+
#
|
16
|
+
# This demonstrates the CassandraConditionalOutputter module.
|
17
|
+
#
|
18
|
+
# CassandraConditionalOutputter uses a cassandra key-value store to
|
19
|
+
# track unique IDs and prevent output of any record already present in the
|
20
|
+
# database.
|
21
|
+
#
|
22
|
+
# For this example, it takes an input stream, generates all letter pairs for
|
23
|
+
# each line, and emits each distinct pair only once.
|
24
|
+
#
|
25
|
+
#
|
26
|
+
class LetterPairMapper < Wukong::Streamer::LineStreamer
|
27
|
+
include CassandraConditionalOutputter
|
28
|
+
|
29
|
+
#
|
30
|
+
# A unique key for the given record. If an object with
|
31
|
+
# that key has been seen, it won't be re-emitted.
|
32
|
+
#
|
33
|
+
# In this example, we'll just encode the letter pair
|
34
|
+
#
|
35
|
+
def conditional_output_key record
|
36
|
+
record.to_s.wukong_encode(:url)
|
37
|
+
end
|
38
|
+
|
39
|
+
#
|
40
|
+
# Emit each letter pair in the line.
|
41
|
+
# The CassandraConditionalOutputter will swallow all duplicate pairs.
|
42
|
+
#
|
43
|
+
def process line, &block
|
44
|
+
letter_pairs(line).each do |pair|
|
45
|
+
yield(pair)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
# turn a string into the pairs of adjacent letters
|
50
|
+
#
|
51
|
+
# @example
|
52
|
+
# letter_pairs('abracadabra')
|
53
|
+
# # => ['ab', 'br', 'ra', 'ac', 'ca', 'ad', 'da', 'ab', 'br', 'ra']
|
54
|
+
def letter_pairs str, &block
|
55
|
+
chars = str.chars.to_a
|
56
|
+
chars[0..-2].zip(chars[1..-1]).map(&:join)
|
57
|
+
end
|
58
|
+
|
59
|
+
# Clear the entire cached keys column at the end of the run.
|
60
|
+
#
|
61
|
+
# You almost certainly don't want to do this in a real script.
|
62
|
+
#
|
63
|
+
def after_stream
|
64
|
+
$stderr.puts 'Clearing conditional_output_key cache...'
|
65
|
+
@key_cache.clear_column_family!(conditional_output_key_column)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
# Execute the script
|
70
|
+
Wukong::Script.new( LetterPairMapper, nil ).run
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,19 +1,21 @@
|
|
1
1
|
#!/usr/bin/env bash
|
2
2
|
|
3
3
|
# Directory to pagerank on.
|
4
|
-
work_dir=$1
|
5
|
-
if [ "$work_dir" == '' ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank" ; exit ; fi
|
4
|
+
work_dir=$1 ; shift
|
5
|
+
if [ "$work_dir" == '' ] ; then echo "Please specify the parent of the directory made by gen_initial_pagerank: $0 initial_dir [number_of_iterations] [start_iteration]" ; exit ; fi
|
6
|
+
# How many rounds to run: default 10
|
7
|
+
n_iters=${1-10} ; shift
|
8
|
+
# the iteration to start with: default 0
|
9
|
+
start_i=${1-0} ; shift
|
6
10
|
|
7
|
-
|
8
|
-
# How many rounds to run
|
9
|
-
max_iter=10
|
10
11
|
# this directory
|
11
12
|
script_dir="`dirname $0`"
|
12
13
|
|
13
|
-
for ((
|
14
|
-
curr_str=`printf "%03d" $
|
15
|
-
next_str=`printf "%03d" $
|
14
|
+
for (( iter=0 ; "$iter" < "$n_iters" ; iter++ )) ; do
|
15
|
+
curr_str=`printf "%03d" $(( $start_i + $iter ))`
|
16
|
+
next_str=`printf "%03d" $(( $start_i + $iter + 1 ))`
|
16
17
|
curr_dir=$work_dir/pagerank_graph_${curr_str}
|
17
18
|
next_dir=$work_dir/pagerank_graph_${next_str}
|
19
|
+
echo -e "Iteration $(( $iter + 1 )) / $n_iters:\t `basename $curr_dir` => `basename $next_dir`"
|
18
20
|
$script_dir/pagerank.rb --rm --run $curr_dir $next_dir
|
19
21
|
done
|
File without changes
|
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'wukong'
|
4
|
+
|
5
|
+
# Run as (local mode)
|
6
|
+
#
|
7
|
+
# ./examples/stupidly_simple_filter.rb --run=local input.tsv output.tsv
|
8
|
+
#
|
9
|
+
# for hadoop mode,
|
10
|
+
#
|
11
|
+
# ./examples/stupidly_simple_filter.rb --run=hadoop input.tsv output.tsv
|
12
|
+
#
|
13
|
+
# For debugging, run
|
14
|
+
#
|
15
|
+
# cat input.tsv | ./examples/stupidly_simple_filter.rb --map input.tsv | more
|
16
|
+
#
|
17
|
+
|
18
|
+
#
|
19
|
+
# A very simple mapper -- looks for a regex match in one field,
|
20
|
+
# and emits the whole record if the field matches
|
21
|
+
#
|
22
|
+
class GrepMapper < Wukong::Streamer::RecordStreamer
|
23
|
+
|
24
|
+
MATCHER = %r{(ford|mercury|saab|mazda|isuzu)}
|
25
|
+
|
26
|
+
#
|
27
|
+
# Given a series of records like:
|
28
|
+
#
|
29
|
+
# tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
|
30
|
+
# tweet 123456789 20100102030405 @jerry, I'm having your baby
|
31
|
+
#
|
32
|
+
# emits only the lines matching that regex
|
33
|
+
#
|
34
|
+
def process rsrc, id, timestamp, text, *rest
|
35
|
+
yield [rsrc, id, timestamp, text, *rest] if line =~ MATCHER
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
# Execute the script
|
40
|
+
Wukong::Script.new(
|
41
|
+
GrepMapper,
|
42
|
+
nil
|
43
|
+
).run
|
@@ -141,6 +141,19 @@ class Hash
|
|
141
141
|
replace(compact)
|
142
142
|
end
|
143
143
|
|
144
|
+
#
|
145
|
+
# remove all key-value pairs where the value is blank
|
146
|
+
#
|
147
|
+
def compact_blank
|
148
|
+
reject{|key,val| val.blank? }
|
149
|
+
end
|
150
|
+
#
|
151
|
+
# Replace the hash with its compact_blank'ed self
|
152
|
+
#
|
153
|
+
def compact_blank!
|
154
|
+
replace(compact_blank)
|
155
|
+
end
|
156
|
+
|
144
157
|
# Stolen from ActiveSupport::CoreExtensions::Hash::ReverseMerge.
|
145
158
|
def reverse_merge(other_hash)
|
146
159
|
other_hash.merge(self)
|
@@ -88,6 +88,13 @@ module Wukong
|
|
88
88
|
merge hsh2, &Hash::DEEP_MERGER
|
89
89
|
end
|
90
90
|
|
91
|
+
#
|
92
|
+
# remove all key-value pairs where the value is blank
|
93
|
+
#
|
94
|
+
def compact_blank
|
95
|
+
to_hash.compact_blank!
|
96
|
+
end
|
97
|
+
|
91
98
|
module ClassMethods
|
92
99
|
#
|
93
100
|
# Instantiate an instance of the struct from a hash
|
@@ -0,0 +1,122 @@
|
|
1
|
+
|
2
|
+
#
|
3
|
+
# For a stream process that sees a significant number of duplicated heavyweight
|
4
|
+
# objects, it may be better to deduplicate them midflight (rather than, say,
|
5
|
+
# using a reducer to effectively `cat | sort | uniq` the data).
|
6
|
+
#
|
7
|
+
# This uses a cassandra key-value store to track unique IDs and prevent output
|
8
|
+
# of any record already present in the database. (Why cassandra? Because we use
|
9
|
+
# it in production. Might be nice to rewrite this example against redis or
|
10
|
+
# TokyoTyrant or something less demanding.)
|
11
|
+
#
|
12
|
+
# Things you have to do:
|
13
|
+
#
|
14
|
+
# * Override the conditional_output_key method to distinguish identical records
|
15
|
+
# * Define a constant CASSANDRA_KEYSPACE giving the Cassandra keyspace you're working in
|
16
|
+
# * (Optionally) override conditional_output_key_column
|
17
|
+
#
|
18
|
+
# * In your cassandra storage-conf.xml, add a column family to your keyspace:
|
19
|
+
#
|
20
|
+
# <Keyspace Name="CorpusAnalysis">
|
21
|
+
# <KeysCachedFraction>0.01</KeysCachedFraction>
|
22
|
+
#
|
23
|
+
# <!-- Added for CassandraConditionalOutputter -->
|
24
|
+
# <ColumnFamily CompareWith="UTF8Type" Name="LetterPairMapperKeys" />
|
25
|
+
#
|
26
|
+
# <ReplicaPlacementStrategy>org.apache.cassandra.locator.RackUnawareStrategy</ReplicaPlacementStrategy>
|
27
|
+
# <ReplicationFactor>1</ReplicationFactor>
|
28
|
+
# <EndPointSnitch>org.apache.cassandra.locator.EndPointSnitch</EndPointSnitch>
|
29
|
+
# </Keyspace>
|
30
|
+
#
|
31
|
+
# In this example, the CASSANDRA_KEYSPACE is 'CorpusAnalysis' and the
|
32
|
+
# conditional_output_key_column is 'LetterPairMapperKeys'
|
33
|
+
#
|
34
|
+
# @example
|
35
|
+
# Given
|
36
|
+
# tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
|
37
|
+
# tweet 24601 20100104136526 @jerry, I'm having your baby
|
38
|
+
# tweet 8675309 20100102030405 I find pastrami to be the most sensual of the salted, cured meats.
|
39
|
+
# tweet 24601 20100104136526 @jerry, I'm having your baby
|
40
|
+
# tweet 1137 20100119234532 These pretzels are making me thirsty
|
41
|
+
# ....
|
42
|
+
# will emit:
|
43
|
+
# tweet 123456789 20100102030405 @frank: I'm having a bacon sandwich
|
44
|
+
# tweet 24601 20100104136526 @jerry, I'm having your baby
|
45
|
+
# tweet 8675309 20100102030405 I find pastrami to be the most sensual of the salted, cured meats.
|
46
|
+
# (the second copy of tweet 24601 is swallowed as a duplicate and not emitted)
|
47
|
+
# tweet 1137 20100119234532 These pretzels are making me thirsty
|
48
|
+
# ....
|
49
|
+
#
|
50
|
+
module CassandraConditionalOutputter
|
51
|
+
|
52
|
+
#
|
53
|
+
# A unique key for the given record. If an object with
|
54
|
+
# that key has been seen, it won't be re-emitted.
|
55
|
+
#
|
56
|
+
# You will almost certainly want to override this method in your subclass. Be
|
57
|
+
# sure that the key is a string, and is encoded properly (Cassandra likes to
|
58
|
+
# strip whitespace from keys, for instance).
|
59
|
+
#
|
60
|
+
def conditional_output_key record
|
61
|
+
record.to_s
|
62
|
+
end
|
63
|
+
|
64
|
+
#
|
65
|
+
# Checks each record against the key cache
|
66
|
+
# Swallows records already there; registers and emits the rest.
|
67
|
+
#
|
68
|
+
#
|
69
|
+
def emit record, &block
|
70
|
+
key = conditional_output_key(record)
|
71
|
+
if should_emit?(record)
|
72
|
+
set_key(key, {'t' => record.timestamp})
|
73
|
+
super record
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
# Default. Emit record if its key is not already contained
|
78
|
+
# in the key-value store. Override this as necessary.
|
79
|
+
def should_emit? record
|
80
|
+
key = conditional_output_key(record)
|
81
|
+
!has_key?(key)
|
82
|
+
end
|
83
|
+
|
84
|
+
# Check for presence of key in the cache
|
85
|
+
def has_key? key
|
86
|
+
not key_cache.get(conditional_output_key_column, key).blank?
|
87
|
+
end
|
88
|
+
|
89
|
+
# register key in the key_cache
|
90
|
+
def set_key key, data={'t' => '0'}
|
91
|
+
key_cache.insert(conditional_output_key_column, key, data)
|
92
|
+
end
|
93
|
+
|
94
|
+
# nuke key from the key_cache
|
95
|
+
def remove_key key
|
96
|
+
key_cache.remove(conditional_output_key_column, key)
|
97
|
+
end
|
98
|
+
|
99
|
+
#
|
100
|
+
# Key cache implementation in Cassandra
|
101
|
+
#
|
102
|
+
|
103
|
+
# The cache
|
104
|
+
def key_cache
|
105
|
+
@key_cache ||= Cassandra.new(CASSANDRA_KEYSPACE)
|
106
|
+
end
|
107
|
+
|
108
|
+
# The column family to use for the key cache. By default, the class name plus 'Keys',
|
109
|
+
# but feel free to override.
|
110
|
+
#
|
111
|
+
# @example
|
112
|
+
#
|
113
|
+
# class FooMapper < Wukong::Streamer::RecordStreamer
|
114
|
+
# include CassandraConditionalOutputter
|
115
|
+
# end
|
116
|
+
# FooMapper.new.conditional_output_key_column
|
117
|
+
# # => 'FooMapperKeys'
|
118
|
+
#
|
119
|
+
def conditional_output_key_column
|
120
|
+
self.class.to_s+'Keys'
|
121
|
+
end
|
122
|
+
end
|
data/lib/wukong/script.rb
CHANGED
@@ -82,6 +82,7 @@ module Wukong
|
|
82
82
|
Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run as local or as hadoop?', :wukong => true, :hide_help => false
|
83
83
|
Settings.define :default_mapper, :default => '/bin/cat', :description => 'The command to run when a nil mapper is given.', :wukong => true, :hide_help => true
|
84
84
|
Settings.define :default_reducer, :default => '/bin/cat', :description => 'The command to run when a nil reducer is given.', :wukong => true, :hide_help => true
|
85
|
+
Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
|
85
86
|
Settings.define :hadoop_home, :default => '/usr/lib/hadoop', :env_var => 'HADOOP_HOME', :description => "Path to hadoop installation; :hadoop_home/bin/hadoop should run hadoop.", :wukong => true
|
86
87
|
Settings.define :hadoop_runner, :description => "Path to hadoop script; usually, set :hadoop_home instead of this.", :wukong => true
|
87
88
|
Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
|
@@ -118,11 +119,11 @@ module Wukong
|
|
118
119
|
# end
|
119
120
|
# MyScript.new(MyMapper, nil).run
|
120
121
|
#
|
121
|
-
def initialize mapper_klass, reducer_klass, extra_options={}
|
122
|
+
def initialize mapper_klass, reducer_klass=nil, extra_options={}
|
122
123
|
self.options = Settings.dup
|
123
|
-
options.resolve!
|
124
|
-
options.merge! self.default_options
|
125
|
-
options.merge! extra_options
|
124
|
+
self.options.resolve!
|
125
|
+
self.options.merge! self.default_options
|
126
|
+
self.options.merge! extra_options
|
126
127
|
self.mapper_klass = mapper_klass
|
127
128
|
self.reducer_klass = reducer_klass
|
128
129
|
# If no reducer_klass and no reduce_command, then skip the reduce phase
|
@@ -141,24 +142,29 @@ module Wukong
|
|
141
142
|
end
|
142
143
|
|
143
144
|
#
|
144
|
-
#
|
145
|
+
# Shell command for map phase. By default, calls the script in --map mode
|
146
|
+
# In hadoop mode, this is given to the hadoop streaming command.
|
147
|
+
# In local mode, it's given to the system() call
|
145
148
|
#
|
146
149
|
def map_command
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
150
|
+
if mapper_klass
|
151
|
+
"#{ruby_interpreter_path} #{this_script_filename} --map " + non_wukong_params
|
152
|
+
else
|
153
|
+
options[:map_command] || options[:default_mapper]
|
154
|
+
end
|
151
155
|
end
|
152
156
|
|
153
157
|
#
|
154
|
-
# Shell command for reduce phase
|
155
|
-
#
|
158
|
+
# Shell command for reduce phase. By default, calls the script in --reduce mode
|
159
|
+
# In hadoop mode, this is given to the hadoop streaming command.
|
160
|
+
# In local mode, it's given to the system() call
|
156
161
|
#
|
157
162
|
def reduce_command
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
163
|
+
if reducer_klass
|
164
|
+
"#{ruby_interpreter_path} #{this_script_filename} --reduce " + non_wukong_params
|
165
|
+
else
|
166
|
+
options[:reduce_command]
|
167
|
+
end
|
162
168
|
end
|
163
169
|
|
164
170
|
#
|
@@ -187,10 +193,10 @@ module Wukong
|
|
187
193
|
end
|
188
194
|
|
189
195
|
def input_output_paths
|
190
|
-
|
191
|
-
|
192
|
-
raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (
|
193
|
-
[
|
196
|
+
output_path = options.rest.pop
|
197
|
+
input_paths = options.rest.reject(&:blank?)
|
198
|
+
raise "You need to specify a parsed input directory and a directory for output. Got #{ARGV.inspect}" if (! options[:dry_run]) && (input_paths.blank? || output_path.blank?)
|
199
|
+
[input_paths, output_path]
|
194
200
|
end
|
195
201
|
|
196
202
|
def maybe_overwrite_output_paths! output_path
|
@@ -218,8 +224,7 @@ module Wukong
|
|
218
224
|
def ruby_interpreter_path
|
219
225
|
Pathname.new(
|
220
226
|
File.join(Config::CONFIG["bindir"],
|
221
|
-
Config::CONFIG["RUBY_INSTALL_NAME"]+
|
222
|
-
Config::CONFIG["EXEEXT"])
|
227
|
+
Config::CONFIG["RUBY_INSTALL_NAME"]+Config::CONFIG["EXEEXT"])
|
223
228
|
).realpath
|
224
229
|
end
|
225
230
|
|
@@ -229,10 +234,10 @@ module Wukong
|
|
229
234
|
def exec_hadoop_streaming
|
230
235
|
$stderr.puts "Streaming on self"
|
231
236
|
input_path, output_path = input_output_paths
|
232
|
-
maybe_overwrite_output_paths! output_path
|
233
237
|
command = runner_command(input_path, output_path)
|
234
238
|
$stderr.puts command
|
235
239
|
unless options[:dry_run]
|
240
|
+
maybe_overwrite_output_paths! output_path
|
236
241
|
$stdout.puts `#{command}`
|
237
242
|
end
|
238
243
|
end
|
@@ -28,6 +28,7 @@ module Wukong
|
|
28
28
|
Settings.define :reuse_jvms, :jobconf => true, :description => 'mapred.job.reuse.jvm.num.tasks', :wukong => true
|
29
29
|
Settings.define :respect_exit_status, :jobconf => true, :description => 'stream.non.zero.exit.is.failure', :wukong => true
|
30
30
|
Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
|
31
|
+
Settings.define :job_name, :jobconf => true, :description => 'mapred.job.name', :wukong => true
|
31
32
|
# mapred.linerecordreader.maxlength :description => "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
32
33
|
|
33
34
|
# emit a -jobconf hadoop option if the simplified command line arg is present
|
@@ -67,12 +68,13 @@ module Wukong
|
|
67
68
|
]
|
68
69
|
end
|
69
70
|
|
70
|
-
def hadoop_other_args
|
71
|
+
def hadoop_other_args input_path, output_path
|
71
72
|
extra_str_args = [ options[:extra_args] ]
|
72
73
|
extra_str_args += ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
|
73
74
|
options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
|
74
75
|
options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
|
75
|
-
|
76
|
+
options[:job_name] ||= "#{File.basename(this_script_filename)}---#{input_path}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
|
77
|
+
extra_hsh_args = [:job_name, :map_speculative, :timeout, :reuse_jvms, :respect_exit_status].map{|opt| jobconf(opt) }
|
76
78
|
extra_str_args + extra_hsh_args
|
77
79
|
end
|
78
80
|
|
@@ -105,7 +107,7 @@ module Wukong
|
|
105
107
|
"-input '#{input_path}'",
|
106
108
|
"-output '#{output_path}'",
|
107
109
|
hadoop_recycle_env,
|
108
|
-
hadoop_other_args,
|
110
|
+
hadoop_other_args(input_path, output_path),
|
109
111
|
].flatten.compact.join(" \t\\\n ")
|
110
112
|
end
|
111
113
|
|