wukong 1.5.3 → 1.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.textile +4 -0
- data/bin/hdp-bin +44 -0
- data/bin/hdp-ls +2 -1
- data/docpages/avro/performance.textile +36 -0
- data/examples/cassandra_streaming/avromapper.rb +85 -0
- data/examples/cassandra_streaming/berlitz_for_cassandra.textile +22 -0
- data/examples/cassandra_streaming/cassandra.avpr +468 -0
- data/examples/cassandra_streaming/cassandra_random_partitioner.rb +62 -0
- data/examples/cassandra_streaming/catter.sh +45 -0
- data/examples/cassandra_streaming/client_interface_notes.textile +200 -0
- data/examples/cassandra_streaming/client_schema.avpr +211 -0
- data/examples/cassandra_streaming/client_schema.textile +318 -0
- data/examples/cassandra_streaming/foofile.avr +0 -0
- data/examples/cassandra_streaming/pymap.sh +1 -0
- data/examples/cassandra_streaming/pyreduce.sh +1 -0
- data/examples/cassandra_streaming/smutation.avpr +188 -0
- data/examples/cassandra_streaming/streamer.sh +51 -0
- data/examples/cassandra_streaming/struct_loader.rb +24 -0
- data/examples/cassandra_streaming/tuning.textile +73 -0
- data/examples/emr/README-elastic_map_reduce.textile +26 -0
- data/examples/emr/dot_wukong_dir/credentials.json +7 -0
- data/examples/emr/{emr.yaml → dot_wukong_dir/emr.yaml} +33 -16
- data/{bin/bootstrap.sh → examples/emr/dot_wukong_dir/emr_bootstrap.sh} +1 -1
- data/examples/emr/elastic_mapreduce_example.rb +1 -0
- data/lib/wukong/encoding/asciize.rb +108 -0
- data/lib/wukong/extensions/date_time.rb +33 -7
- data/lib/wukong/extensions/emittable.rb +12 -25
- data/lib/wukong/extensions/hash_like.rb +13 -6
- data/lib/wukong/filename_pattern.rb +8 -7
- data/lib/wukong/schema.rb +47 -0
- data/lib/wukong/script.rb +7 -0
- data/lib/wukong/script/cassandra_loader_script.rb +40 -0
- data/lib/wukong/script/emr_command.rb +74 -43
- data/lib/wukong/script/hadoop_command.rb +89 -72
- data/lib/wukong/store.rb +2 -7
- data/lib/wukong/store/cassandra.rb +10 -0
- data/lib/wukong/store/cassandra/streaming.rb +75 -0
- data/lib/wukong/store/cassandra/struct_loader.rb +21 -0
- data/lib/wukong/store/cassandra_model.rb +90 -0
- data/lib/wukong/store/chh_chunked_flat_file_store.rb +1 -1
- data/lib/wukong/store/chunked_flat_file_store.rb +24 -20
- data/wukong.gemspec +32 -4
- metadata +33 -14
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
Object.class_eval do
|
3
2
|
def to_flat() [to_s] end
|
4
3
|
end
|
@@ -54,29 +53,17 @@ Hash.class_eval do
|
|
54
53
|
end
|
55
54
|
end
|
56
55
|
|
57
|
-
class
|
58
|
-
#
|
59
|
-
|
60
|
-
#
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
# Flatten
|
70
|
-
def to_flat
|
71
|
-
strftime(FLAT_FORMAT)
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
class DateTime < Date
|
76
|
-
# strftime() format to flatten a date
|
77
|
-
FLAT_FORMAT = "%Y%m%d%H%M%S"
|
78
|
-
# Flatten
|
79
|
-
def to_flat
|
80
|
-
strftime(FLAT_FORMAT)
|
56
|
+
class Integer
|
57
|
+
#
|
58
|
+
# Express boolean as 1 (true) or 0 (false). In contravention of typical ruby
|
59
|
+
# semantics (but in a way that is more robust for wukong-like batch
|
60
|
+
# processing), the number 0, the string '0', nil and false are all considered
|
61
|
+
# false. (This also makes the method idempotent: repeated calls give same result.)
|
62
|
+
#
|
63
|
+
def self.unbooleanize bool
|
64
|
+
case bool
|
65
|
+
when 0, '0', false, nil then 0
|
66
|
+
else 1
|
67
|
+
end
|
81
68
|
end
|
82
69
|
end
|
@@ -103,16 +103,23 @@ module Wukong
|
|
103
103
|
# otherwise they must be uniformly strings
|
104
104
|
#
|
105
105
|
def from_hash(hsh, has_symbol_keys=false)
|
106
|
-
|
107
|
-
|
108
|
-
self.new *hsh.values_of(*keys)
|
106
|
+
extract_keys = has_symbol_keys ? self.keys.map(&:to_sym) : self.keys.map(&:to_s)
|
107
|
+
self.new *hsh.values_of(*extract_keys)
|
109
108
|
end
|
110
109
|
#
|
111
110
|
# The last portion of the class in underscored form
|
112
|
-
#
|
111
|
+
# memoized
|
113
112
|
#
|
114
|
-
def
|
115
|
-
@resource_name ||= self.
|
113
|
+
def resource_name
|
114
|
+
@resource_name ||= self.class_basename.underscore.to_sym
|
115
|
+
end
|
116
|
+
# The last portion of the class name
|
117
|
+
# memoized
|
118
|
+
#
|
119
|
+
# @example
|
120
|
+
# This::That::TheOther.new.class_basename # => TheOther
|
121
|
+
def class_basename
|
122
|
+
@class_basename ||= self.to_s.gsub(%r{.*::}, '')
|
116
123
|
end
|
117
124
|
end
|
118
125
|
|
@@ -16,12 +16,12 @@ module Wukong
|
|
16
16
|
# walk through pattern, replacing tokens (eg :time or :pid) with the
|
17
17
|
# corresponding value.
|
18
18
|
#
|
19
|
+
# Don't use ':' in a pattern except to introduce a token
|
20
|
+
# and separate tokens with '-', '+' '/' or '.'
|
21
|
+
#
|
19
22
|
def make token_vals={}
|
20
23
|
token_vals = token_val_defaults.merge token_vals
|
21
24
|
token_vals[:timestamp] ||= Time.now.utc.strftime("%Y%m%d%H%M%S")
|
22
|
-
# CHH_NOTE: The following is broken for patterns that need a ":" or
|
23
|
-
# patterns that need text following a token with no special chars in
|
24
|
-
# between.
|
25
25
|
val = pattern.gsub(/:(\w+)/){ replace($1, token_vals) }
|
26
26
|
val
|
27
27
|
end
|
@@ -39,7 +39,7 @@ module Wukong
|
|
39
39
|
case token
|
40
40
|
when :pid then pid
|
41
41
|
when :hostname then hostname
|
42
|
-
when :handle then token_vals[:handle]
|
42
|
+
when :handle then token_vals[:handle]
|
43
43
|
when :handle_prefix then token_vals[:handle].to_s[0..5]
|
44
44
|
when :timestamp then token_vals[:timestamp]
|
45
45
|
when :date then token_vals[:timestamp][ 0..7]
|
@@ -56,7 +56,7 @@ module Wukong
|
|
56
56
|
|
57
57
|
# Memoized: the hostname for the machine running this script.
|
58
58
|
def hostname
|
59
|
-
@hostname ||= ENV['HOSTNAME'] || `hostname`.
|
59
|
+
@hostname ||= ENV['HOSTNAME'] || `hostname`.chomp
|
60
60
|
end
|
61
61
|
# Memoized: the Process ID for this invocation.
|
62
62
|
def pid
|
@@ -64,9 +64,10 @@ module Wukong
|
|
64
64
|
end
|
65
65
|
|
66
66
|
# Characters deemed safe in a filename;
|
67
|
-
SAFE_CHARS = 'a-zA-Z0-9_
|
67
|
+
SAFE_CHARS = 'a-zA-Z0-9_\-\.\+\/'
|
68
|
+
RE_SAFE_FILENAME = %r{[^#{SAFE_CHARS}]+}moxi
|
68
69
|
def self.sanitize str
|
69
|
-
str.gsub(
|
70
|
+
str.gsub(RE_SAFE_FILENAME, '-')
|
70
71
|
end
|
71
72
|
|
72
73
|
end
|
data/lib/wukong/schema.rb
CHANGED
@@ -50,6 +50,35 @@ class << Yaml ; def to_pig() 'chararray' end ; end if defined?(Yaml)
|
|
50
50
|
class << Json ; def to_pig() 'chararray' end ; end if defined?(Json)
|
51
51
|
class << Regex ; def to_pig() 'chararray' end ; end if defined?(Regex)
|
52
52
|
|
53
|
+
|
54
|
+
#
|
55
|
+
# Basic types: Avro conversion
|
56
|
+
#
|
57
|
+
class << Integer ; def to_avro() 'int' end ; end
|
58
|
+
class << Bignum ; def to_avro() 'long' end ; end
|
59
|
+
class << Float ; def to_avro() 'float' end ; end
|
60
|
+
class << Symbol ; def to_avro() 'string' end ; end
|
61
|
+
class << Date ; def to_avro() 'long' end ; end
|
62
|
+
class << Time ; def to_avro() 'long' end ; end
|
63
|
+
class << DateTime ; def to_avro() 'long' end ; end
|
64
|
+
class << String ; def to_avro() 'string' end ; end
|
65
|
+
class << Text ; def to_avro() 'string' end ; end if defined?(Text)
|
66
|
+
class << Blob ; def to_avro() 'bytearray' end ; end if defined?(Blob)
|
67
|
+
class << Boolean ; def to_avro() 'bytearray' end ; end if defined?(Boolean)
|
68
|
+
class String ; def to_avro() self.to_s ; end ; end
|
69
|
+
class Symbol ; def to_avro() self.to_s ; end ; end
|
70
|
+
|
71
|
+
class << BigDecimal ; def to_avro() 'long' end ; end if defined?(BigDecimal)
|
72
|
+
class << EpochTime ; def to_avro() 'integer' end ; end if defined?(EpochTime)
|
73
|
+
class << FilePath ; def to_avro() 'string' end ; end if defined?(FilePath)
|
74
|
+
class << Flag ; def to_avro() 'string' end ; end if defined?(Flag)
|
75
|
+
class << IPAddress ; def to_avro() 'string' end ; end if defined?(IPAddress)
|
76
|
+
class << URI ; def to_avro() 'string' end ; end if defined?(URI)
|
77
|
+
class << Csv ; def to_avro() 'string' end ; end if defined?(Csv)
|
78
|
+
class << Yaml ; def to_avro() 'string' end ; end if defined?(Yaml)
|
79
|
+
class << Json ; def to_avro() 'string' end ; end if defined?(Json)
|
80
|
+
class << Regex ; def to_avro() 'string' end ; end if defined?(Regex)
|
81
|
+
|
53
82
|
module Wukong
|
54
83
|
#
|
55
84
|
# Export model's structure for loading and manipulating in other frameworks,
|
@@ -208,6 +237,24 @@ module Wukong
|
|
208
237
|
str.join("\n")
|
209
238
|
end
|
210
239
|
|
240
|
+
|
241
|
+
|
242
|
+
|
243
|
+
#
|
244
|
+
# Avro
|
245
|
+
#
|
246
|
+
def to_avro
|
247
|
+
require 'json' # yikes
|
248
|
+
h = {}
|
249
|
+
h[:name] = self.name
|
250
|
+
h[:type] = "record"
|
251
|
+
h[:fields] = []
|
252
|
+
members.zip(mtypes).each do |member, type|
|
253
|
+
h[:fields] << {:name => member.to_s, :type => type.to_avro}
|
254
|
+
end
|
255
|
+
h.to_json
|
256
|
+
end
|
257
|
+
|
211
258
|
end
|
212
259
|
# standard stanza for making methods appear on the class itself on include
|
213
260
|
def self.included base
|
data/lib/wukong/script.rb
CHANGED
@@ -145,6 +145,7 @@ module Wukong
|
|
145
145
|
when 'map' then mapper_klass.new(self.options).stream
|
146
146
|
when 'reduce' then reducer_klass.new(self.options).stream
|
147
147
|
when 'local' then execute_local_workflow
|
148
|
+
when 'cassandra' then execute_hadoop_workflow
|
148
149
|
when 'hadoop', 'mapred' then execute_hadoop_workflow
|
149
150
|
when 'emr'
|
150
151
|
require 'wukong/script/emr_command'
|
@@ -196,6 +197,12 @@ module Wukong
|
|
196
197
|
"#{File.basename(this_script_filename)}---#{input_paths}---#{output_path}".gsub(%r{[^\w/\.\-\+]+}, '')
|
197
198
|
end
|
198
199
|
|
200
|
+
# Wrapper for dangerous operations to catch errors
|
201
|
+
def safely action, &block
|
202
|
+
begin
|
203
|
+
block.call
|
204
|
+
rescue StandardError => e ; handle_error(action, e); end
|
205
|
+
end
|
199
206
|
|
200
207
|
protected
|
201
208
|
|
@@ -0,0 +1,40 @@
|
|
1
|
+
Settings.define :cassandra_keyspace, :required => true, :description => "The keyspace to bulk load"
|
2
|
+
Settings.define :cassandra_col_family, :required => true, :description => "The column family to bulk load"
|
3
|
+
Settings.define :cassandra_home, :env_var => 'CASSANDRA_HOME', :default => '/usr/local/share/cassandra'
|
4
|
+
|
5
|
+
module Wukong
|
6
|
+
class CassandraScript < Wukong::Script
|
7
|
+
def hadoop_other_args *args
|
8
|
+
opts = super(*args)
|
9
|
+
opts << "-D stream.map.output=\'cassandra_avro_output\'"
|
10
|
+
opts << "-D stream.io.identifier.resolver.class=\'org.apache.cassandra.hadoop.streaming.AvroResolver\'"
|
11
|
+
opts << "-D cassandra.output.keyspace=\'#{Settings.cassandra_keyspace}\'"
|
12
|
+
opts << "-D cassandra.output.columnfamily=\'#{Settings.cassandra_col_family}\'"
|
13
|
+
opts << "-D cassandra.partitioner.class=\'org.apache.cassandra.dht.RandomPartitioner\'"
|
14
|
+
opts << "-D cassandra.thrift.address=\'#{[Settings.cassandra_hosts].flatten.map{|s| s.gsub(/:.*/, '')}.join(",")}\'"
|
15
|
+
opts << "-D cassandra.thrift.port=\'9160\'"
|
16
|
+
# opts << "-D mapreduce.output.columnfamilyoutputformat.batch.threshold=\'1024\'"
|
17
|
+
# ORDER MATTERS
|
18
|
+
opts << "-libjars \'#{cassandra_jars}\'"
|
19
|
+
opts << "-file \'#{avro_schema}\'"
|
20
|
+
opts << "-outputformat \'org.apache.cassandra.hadoop.ColumnFamilyOutputFormat\'"
|
21
|
+
opts
|
22
|
+
end
|
23
|
+
|
24
|
+
#
|
25
|
+
# Return paths to cassandra jars as a string
|
26
|
+
#
|
27
|
+
def cassandra_jars
|
28
|
+
jars = []
|
29
|
+
Dir["#{Settings.cassandra_home}/build/apache-cassandra*.jar", "#{Settings.cassandra_home}/build/lib/jars/*.jar", "#{Settings.cassandra_home}/lib/*.jar"].each do |jar|
|
30
|
+
jars << jar
|
31
|
+
end
|
32
|
+
jars.join(',')
|
33
|
+
end
|
34
|
+
|
35
|
+
def avro_schema
|
36
|
+
File.join(Settings.cassandra_home, "interface/avro/cassandra.avpr")
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
end
|
@@ -1,16 +1,26 @@
|
|
1
1
|
require 'right_aws'
|
2
2
|
require 'configliere/config_block'
|
3
|
-
|
3
|
+
#
|
4
|
+
EMR_CONFIG_DIR = '~/.wukong' unless defined?(EMR_CONFIG_DIR)
|
5
|
+
#
|
4
6
|
Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
|
5
7
|
Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
|
6
8
|
Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
|
7
9
|
Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
|
8
|
-
Settings.define :emr_root, :description => 'S3
|
9
|
-
Settings.define :
|
10
|
-
Settings.define :
|
11
|
-
Settings.define :
|
10
|
+
Settings.define :emr_root, :description => 'S3 bucket and path to use as the base for Elastic MapReduce storage, organized by job name'
|
11
|
+
Settings.define :emr_data_root, :description => 'Optional '
|
12
|
+
Settings.define :emr_bootstrap_script, :description => 'Bootstrap actions for Elastic Map Reduce machine provisioning', :default => EMR_CONFIG_DIR+'/emr_bootstrap.sh', :type => :filename, :finally => lambda{ Settings.emr_bootstrap_script = File.expand_path(Settings.emr_bootstrap_script) }
|
13
|
+
Settings.define :emr_extra_args, :description => 'kludge: allows you to stuff extra args into the elastic-mapreduce invocation', :type => Array, :wukong => true
|
14
|
+
Settings.define :alive, :description => 'Whether to keep machine running after job invocation', :type => :boolean
|
15
|
+
#
|
16
|
+
Settings.define :keypair_file, :description => 'AWS Key pair file', :type => :filename
|
17
|
+
Settings.define :keypair, :description => "AWS Key pair name. If not specified, it's taken from keypair_file's basename", :finally => lambda{ Settings.keypair ||= File.basename(Settings.keypair_file.to_s, '.pem') if Settings.keypair_file }
|
18
|
+
Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
|
12
19
|
Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
|
13
|
-
Settings.define :jobflow
|
20
|
+
Settings.define :jobflow, :description => "ID of an existing EMR job flow. Wukong will create a new job flow"
|
21
|
+
#
|
22
|
+
Settings.read(File.expand_path(EMR_CONFIG_DIR+'/emr.yaml'))
|
23
|
+
|
14
24
|
module Wukong
|
15
25
|
#
|
16
26
|
# EMR Options
|
@@ -26,39 +36,46 @@ module Wukong
|
|
26
36
|
Log.info " Copying this script to the cloud."
|
27
37
|
S3Util.store(this_script_filename, mapper_s3_uri)
|
28
38
|
S3Util.store(this_script_filename, reducer_s3_uri)
|
29
|
-
S3Util.store(File.expand_path(
|
39
|
+
S3Util.store(File.expand_path(Settings.emr_bootstrap_script), bootstrap_s3_uri)
|
40
|
+
end
|
41
|
+
|
42
|
+
def copy_jars_to_cloud
|
43
|
+
S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
|
44
|
+
# "--cache-archive=#{wukong_libs_s3_uri}#vendor",
|
45
|
+
end
|
46
|
+
|
47
|
+
def hadoop_options_for_emr_runner
|
48
|
+
[hadoop_jobconf_options, hadoop_other_args].flatten.compact.map{|hdp_opt| "--arg '#{hdp_opt}'"}
|
30
49
|
end
|
31
50
|
|
32
51
|
def execute_emr_runner
|
33
52
|
command_args = []
|
34
|
-
command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
|
35
|
-
command_args += emr_credentials
|
36
53
|
if Settings.jobflow
|
37
54
|
command_args << Settings.dashed_flag_for(:jobflow)
|
38
55
|
else
|
39
|
-
command_args << Settings.dashed_flag_for(:alive)
|
40
56
|
command_args << "--create --name=#{job_name}"
|
41
|
-
command_args << Settings.
|
57
|
+
command_args << Settings.dashed_flag_for(:alive)
|
58
|
+
command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type, :hadoop_version).join(' ')
|
59
|
+
command_args << Settings.dashed_flags(:availability_zone, :keypair, :keypair_file).join(' ')
|
60
|
+
command_args << "--bootstrap-action=#{bootstrap_s3_uri}"
|
42
61
|
end
|
62
|
+
command_args << Settings.dashed_flags(:enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
|
63
|
+
command_args += emr_credentials
|
43
64
|
command_args += [
|
44
|
-
"--bootstrap-action=#{bootstrap_s3_uri}",
|
45
65
|
"--log-uri=#{log_s3_uri}",
|
46
66
|
"--stream",
|
47
67
|
"--mapper=#{mapper_s3_uri} ",
|
48
68
|
"--reducer=#{reducer_s3_uri} ",
|
49
|
-
"--input=#{input_paths} --output=#{output_path}",
|
50
|
-
# to specify zero reducers:
|
51
|
-
# "--arg '-D mapred.reduce.tasks=0'"
|
69
|
+
"--input=#{input_paths.join(",")} --output=#{output_path}",
|
52
70
|
]
|
71
|
+
# eg to specify zero reducers:
|
72
|
+
# Settings[:emr_extra_args] = "--arg '-D mapred.reduce.tasks=0'"
|
73
|
+
command_args += Settings[:emr_extra_args] unless Settings[:emr_extra_args].blank?
|
74
|
+
command_args += hadoop_options_for_emr_runner
|
53
75
|
Log.info 'Follow along at http://localhost:9000/job'
|
54
76
|
execute_command!( File.expand_path(Settings.emr_runner), *command_args )
|
55
77
|
end
|
56
78
|
|
57
|
-
def emr_ship_jars
|
58
|
-
S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
|
59
|
-
# "--cache-archive=#{wukong_libs_s3_uri}#vendor",
|
60
|
-
end
|
61
|
-
|
62
79
|
def emr_credentials
|
63
80
|
command_args = []
|
64
81
|
if Settings.emr_credentials_file
|
@@ -66,7 +83,6 @@ module Wukong
|
|
66
83
|
else
|
67
84
|
command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
|
68
85
|
end
|
69
|
-
command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
|
70
86
|
command_args
|
71
87
|
end
|
72
88
|
|
@@ -75,58 +91,73 @@ module Wukong
|
|
75
91
|
File.basename($0,'.rb')
|
76
92
|
end
|
77
93
|
|
94
|
+
# Produces an s3 URI within the Wukong emr sandbox from a set of path
|
95
|
+
# segments
|
96
|
+
#
|
97
|
+
# @example
|
98
|
+
# Settings.emr_root = 's3://emr.yourmom.com/wukong'
|
99
|
+
# emr_s3_path('log', 'my_happy_job', 'run-97.log')
|
100
|
+
# # => "s3://emr.yourmom.com/wukong/log/my_happy_job/run-97.log"
|
101
|
+
#
|
102
|
+
def emr_s3_path *path_segs
|
103
|
+
File.join(Settings.emr_root, path_segs.flatten.compact)
|
104
|
+
end
|
105
|
+
|
78
106
|
def mapper_s3_uri
|
79
|
-
emr_s3_path(job_handle+'-mapper.rb')
|
107
|
+
emr_s3_path(job_handle, 'code', job_handle+'-mapper.rb')
|
80
108
|
end
|
81
109
|
def reducer_s3_uri
|
82
|
-
emr_s3_path(job_handle+'-reducer.rb')
|
110
|
+
emr_s3_path(job_handle, 'code', job_handle+'-reducer.rb')
|
83
111
|
end
|
84
112
|
def log_s3_uri
|
85
|
-
emr_s3_path('log',
|
113
|
+
emr_s3_path(job_handle, 'log', 'emr_jobs')
|
86
114
|
end
|
87
115
|
def bootstrap_s3_uri
|
88
|
-
emr_s3_path('bin', "
|
116
|
+
emr_s3_path(job_handle, 'bin', "emr_bootstrap.sh")
|
89
117
|
end
|
90
118
|
def wukong_libs_s3_uri
|
91
|
-
emr_s3_path('
|
92
|
-
end
|
93
|
-
|
94
|
-
def emr_s3_path *path_segs
|
95
|
-
File.join(Settings.emr_root, path_segs.flatten.compact)
|
119
|
+
emr_s3_path(job_handle, 'code', "wukong-libs.jar")
|
96
120
|
end
|
97
121
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
122
|
+
ABSOLUTE_URI = %r{^/|^\w+://}
|
123
|
+
#
|
124
|
+
# Walk through the input paths and the output path. Prepends
|
125
|
+
# Settings.emr_data_root to any that does NOT look like
|
126
|
+
# an absolute path ("/foo") or a URI ("s3://yourmom/data")
|
127
|
+
#
|
128
|
+
def fix_paths!
|
129
|
+
return if Settings.emr_data_root.blank?
|
130
|
+
unless input_paths.blank?
|
131
|
+
@input_paths = input_paths.map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
|
132
|
+
end
|
133
|
+
unless output_path.blank?
|
134
|
+
@output_path = [output_path].map{|path| (path =~ ABSOLUTE_URI) ? path : File.join(Settings.emr_data_root, path) }
|
105
135
|
end
|
106
136
|
end
|
107
137
|
|
138
|
+
#
|
139
|
+
# Simple class to coordinate s3 operations
|
140
|
+
#
|
108
141
|
class S3Util
|
109
142
|
# class methods
|
110
143
|
class << self
|
111
144
|
def s3
|
112
145
|
@s3 ||= RightAws::S3Interface.new(
|
113
146
|
Settings.access_key, Settings.secret_access_key,
|
114
|
-
{:multi_thread => true, :logger => Log})
|
147
|
+
{:multi_thread => true, :logger => Log, :port => 80, :protocol => 'http' })
|
115
148
|
end
|
116
|
-
|
117
149
|
def bucket_and_path_from_uri uri
|
118
150
|
uri =~ %r{^s3\w*://([\w\.\-]+)\W*(.*)} and return([$1, $2])
|
119
151
|
end
|
120
|
-
|
121
152
|
def store filename, uri
|
122
|
-
Log.debug " #{filename} => #{uri}"
|
123
153
|
dest_bucket, dest_key = bucket_and_path_from_uri(uri)
|
124
|
-
|
154
|
+
Log.debug " #{filename} => #{dest_bucket} / #{dest_key}"
|
155
|
+
contents = File.read(filename)
|
125
156
|
s3.store_object(:bucket => dest_bucket, :key => dest_key, :data => contents)
|
126
157
|
end
|
127
|
-
|
128
158
|
end
|
129
159
|
end
|
160
|
+
|
130
161
|
end
|
131
162
|
Script.class_eval do
|
132
163
|
include EmrCommand
|
@@ -32,16 +32,28 @@ module Wukong
|
|
32
32
|
Settings.define :max_maps_per_node, :jobconf => true, :description => 'mapred.max.maps.per.node', :wukong => true
|
33
33
|
Settings.define :max_maps_per_cluster, :jobconf => true, :description => 'mapred.max.maps.per.cluster', :wukong => true
|
34
34
|
Settings.define :max_record_length, :jobconf => true, :description => 'mapred.linerecordreader.maxlength', :wukong => true # "Safeguards against corrupted data: lines longer than this (in bytes) are treated as bad records."
|
35
|
-
Settings.define :
|
35
|
+
Settings.define :min_split_size, :jobconf => true, :description => 'mapred.min.split.size', :wukong => true
|
36
36
|
Settings.define :noempty, :description => "don't create zero-byte reduce files (hadoop mode only)", :wukong => true
|
37
|
+
Settings.define :split_on_xml_tag, :description => "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'", :wukong => true
|
38
|
+
|
39
|
+
# emit a -jobconf hadoop option if the simplified command line arg is present
|
40
|
+
# if not, the resulting nil will be elided later
|
41
|
+
def jobconf option
|
42
|
+
if options[option]
|
43
|
+
# "-jobconf %s=%s" % [options.description_for(option), options[option]]
|
44
|
+
"-D %s=%s" % [options.description_for(option), options[option]]
|
45
|
+
end
|
46
|
+
end
|
37
47
|
|
38
48
|
#
|
39
49
|
# Assemble the hadoop command to execute
|
40
50
|
# and launch the hadoop runner to execute the script across all tasktrackers
|
41
51
|
#
|
52
|
+
# FIXME: Should add some simple logic to ensure that commands are in the
|
53
|
+
# right order or hadoop will complain. ie. -D options MUST come before
|
54
|
+
# others
|
55
|
+
#
|
42
56
|
def execute_hadoop_workflow
|
43
|
-
# If no reducer_klass and no reduce_command, then skip the reduce phase
|
44
|
-
options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
|
45
57
|
# Input paths join by ','
|
46
58
|
input_paths = @input_paths.join(',')
|
47
59
|
#
|
@@ -49,14 +61,14 @@ module Wukong
|
|
49
61
|
hadoop_commandline = [
|
50
62
|
hadoop_runner,
|
51
63
|
"jar #{Settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
|
64
|
+
hadoop_jobconf_options,
|
65
|
+
"-D mapred.job.name='#{job_name}'",
|
66
|
+
hadoop_other_args,
|
52
67
|
"-mapper '#{mapper_commandline}'",
|
53
68
|
"-reducer '#{reducer_commandline}'",
|
54
69
|
"-input '#{input_paths}'",
|
55
70
|
"-output '#{output_path}'",
|
56
|
-
hadoop_jobconf_options,
|
57
|
-
"-jobconf mapred.job.name='#{job_name}'",
|
58
71
|
hadoop_recycle_env,
|
59
|
-
hadoop_other_args,
|
60
72
|
].flatten.compact.join(" \t\\\n ")
|
61
73
|
Log.info " Launching hadoop!"
|
62
74
|
execute_command!(hadoop_commandline)
|
@@ -64,48 +76,40 @@ module Wukong
|
|
64
76
|
|
65
77
|
def hadoop_jobconf_options
|
66
78
|
jobconf_options = []
|
67
|
-
#
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
]
|
79
|
+
# Fixup these options
|
80
|
+
options[:reuse_jvms] = '-1' if (options[:reuse_jvms] == true)
|
81
|
+
options[:respect_exit_status] = 'false' if (options[:ignore_exit_status] == true)
|
82
|
+
# If no reducer_klass and no reduce_command, then skip the reduce phase
|
83
|
+
options[:reduce_tasks] = 0 if (! reducer_klass) && (! options[:reduce_command]) && (! options[:reduce_tasks])
|
72
84
|
# Fields hadoop should use to distribute records to reducers
|
73
85
|
unless options[:partition_fields].blank?
|
74
86
|
jobconf_options += [
|
75
|
-
'-partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner',
|
76
|
-
jobconf(:output_field_separator),
|
77
87
|
jobconf(:partition_fields),
|
88
|
+
jobconf(:output_field_separator),
|
78
89
|
]
|
79
90
|
end
|
80
|
-
# Setting the number of mappers and reducers.
|
81
91
|
jobconf_options += [
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
92
|
+
:key_field_separator, :sort_fields,
|
93
|
+
:map_tasks, :reduce_tasks,
|
94
|
+
:max_node_map_tasks, :max_node_reduce_tasks,
|
95
|
+
:max_reduces_per_node, :max_reduces_per_cluster,
|
96
|
+
:max_maps_per_node, :max_maps_per_cluster,
|
97
|
+
:min_split_size,
|
98
|
+
:map_speculative,
|
99
|
+
:timeout,
|
100
|
+
:reuse_jvms, :respect_exit_status
|
101
|
+
].map{|opt| jobconf(opt)}
|
91
102
|
jobconf_options.flatten.compact
|
92
103
|
end
|
93
104
|
|
94
|
-
# emit a -jobconf hadoop option if the simplified command line arg is present
|
95
|
-
# if not, the resulting nil will be elided later
|
96
|
-
def jobconf option
|
97
|
-
if options[option]
|
98
|
-
"-jobconf %s=%s" % [options.description_for(option), options[option]]
|
99
|
-
end
|
100
|
-
end
|
101
|
-
|
102
105
|
def hadoop_other_args
|
103
106
|
extra_str_args = [ options[:extra_args] ]
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
extra_str_args
|
107
|
+
if Settings.split_on_xml_tag
|
108
|
+
extra_str_args << %Q{-inputreader 'StreamXmlRecordReader,begin=<#{Settings.split_on_xml_tag}>,end=</#{Settings.split_on_xml_tag}>'}
|
109
|
+
end
|
110
|
+
extra_str_args << ' -lazyOutput' if options[:noempty] # don't create reduce file if no records
|
111
|
+
extra_str_args << ' -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner' unless options[:partition_fields].blank?
|
112
|
+
extra_str_args
|
109
113
|
end
|
110
114
|
|
111
115
|
def hadoop_recycle_env
|
@@ -135,42 +139,6 @@ module Wukong
|
|
135
139
|
# Thanks to Todd Lipcon for directing me to that hack.
|
136
140
|
#
|
137
141
|
|
138
|
-
# "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
|
139
|
-
# "HADOOP_IDENT_STRING" =>"hadoop",
|
140
|
-
# "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
|
141
|
-
# "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
|
142
|
-
# "HOME" =>"/var/run/hadoop-0.20",
|
143
|
-
# "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
|
144
|
-
# "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
|
145
|
-
# "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
|
146
|
-
# "USER" =>"hadoop",
|
147
|
-
#
|
148
|
-
# "dfs_block_size" =>"134217728",
|
149
|
-
# "map_input_start" =>"0",
|
150
|
-
# "map_input_length" =>"125726898",
|
151
|
-
# "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
|
152
|
-
# "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
|
153
|
-
# "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
|
154
|
-
# "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
|
155
|
-
# "mapred_output_compression_type" =>"BLOCK",
|
156
|
-
# "mapred_task_partition" =>"0",
|
157
|
-
# "mapred_tasktracker_map_tasks_maximum" =>"4",
|
158
|
-
# "mapred_tasktracker_reduce_tasks_maximum" =>"2",
|
159
|
-
# "mapred_tip_id" =>"task_200910221152_0023_m_000000",
|
160
|
-
# "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
|
161
|
-
# "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
|
162
|
-
#
|
163
|
-
# "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
|
164
|
-
# "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
|
165
|
-
# "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
|
166
|
-
# "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
|
167
|
-
# "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
|
168
|
-
# "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
|
169
|
-
# "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
|
170
|
-
# "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
|
171
|
-
# "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
|
172
|
-
# "user_name" =>"flip",
|
173
|
-
|
174
142
|
# HDFS pathname to the input file currently being processed.
|
175
143
|
def input_file
|
176
144
|
ENV['map_input_file']
|
@@ -211,3 +179,52 @@ module Wukong
|
|
211
179
|
end
|
212
180
|
end
|
213
181
|
end
|
182
|
+
|
183
|
+
# -partitioner org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner \
|
184
|
+
# -D mapred.output.key.comparator.class=org.apache.hadoop.mapred.lib.KeyFieldBasedComparator \
|
185
|
+
# -D mapred.text.key.comparator.options=-k2,2nr\
|
186
|
+
# -D mapred.text.key.partitioner.options=-k1,2\
|
187
|
+
# -D mapred.text.key.partitioner.options=\"-k1,$partfields\"
|
188
|
+
# -D stream.num.map.output.key.fields=\"$sortfields\"
|
189
|
+
#
|
190
|
+
# -D stream.map.output.field.separator=\"'/t'\"
|
191
|
+
# -D map.output.key.field.separator=. \
|
192
|
+
# -D mapred.data.field.separator=. \
|
193
|
+
# -D map.output.key.value.fields.spec=6,5,1-3:0- \
|
194
|
+
# -D reduce.output.key.value.fields.spec=0-2:5- \
|
195
|
+
|
196
|
+
# "HADOOP_HOME" =>"/usr/lib/hadoop-0.20/bin/..",
|
197
|
+
# "HADOOP_IDENT_STRING" =>"hadoop",
|
198
|
+
# "HADOOP_LOGFILE" =>"hadoop-hadoop-tasktracker-ip-10-242-14-223.log",
|
199
|
+
# "HADOOP_LOG_DIR" =>"/usr/lib/hadoop-0.20/bin/../logs",
|
200
|
+
# "HOME" =>"/var/run/hadoop-0.20",
|
201
|
+
# "JAVA_HOME" =>"/usr/lib/jvm/java-6-sun",
|
202
|
+
# "LD_LIBRARY_PATH" =>"/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386:/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386/client:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/lib/i386:/usr/lib/jvm/java-6-sun-1.6.0.10/jre/../lib/i386",
|
203
|
+
# "PATH" =>"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games",
|
204
|
+
# "USER" =>"hadoop",
|
205
|
+
#
|
206
|
+
# "dfs_block_size" =>"134217728",
|
207
|
+
# "map_input_start" =>"0",
|
208
|
+
# "map_input_length" =>"125726898",
|
209
|
+
# "mapred_output_key_class" =>"org.apache.hadoop.io.Text",
|
210
|
+
# "mapred_output_value_class" =>"org.apache.hadoop.io.Text",
|
211
|
+
# "mapred_output_format_class" =>"org.apache.hadoop.mapred.TextOutputFormat",
|
212
|
+
# "mapred_output_compression_codec" =>"org.apache.hadoop.io.compress.DefaultCodec",
|
213
|
+
# "mapred_output_compression_type" =>"BLOCK",
|
214
|
+
# "mapred_task_partition" =>"0",
|
215
|
+
# "mapred_tasktracker_map_tasks_maximum" =>"4",
|
216
|
+
# "mapred_tasktracker_reduce_tasks_maximum" =>"2",
|
217
|
+
# "mapred_tip_id" =>"task_200910221152_0023_m_000000",
|
218
|
+
# "mapred_task_id" =>"attempt_200910221152_0023_m_000000_0",
|
219
|
+
# "mapred_job_tracker" =>"ec2-174-129-141-78.compute-1.amazonaws.com:8021",
|
220
|
+
#
|
221
|
+
# "mapred_input_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809",
|
222
|
+
# "map_input_file" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/ripd/com.tw/com.twitter.search/20090809/com.twitter.search+20090809233441-56735-womper.tsv.bz2",
|
223
|
+
# "mapred_working_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip",
|
224
|
+
# "mapred_work_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809/_temporary/_attempt_200910221152_0023_m_000000_0",
|
225
|
+
# "mapred_output_dir" =>"hdfs://ec2-174-129-141-78.compute-1.amazonaws.com/user/flip/tmp/twsearch-20090809",
|
226
|
+
# "mapred_temp_dir" =>"/mnt/tmp/hadoop-hadoop/mapred/temp",
|
227
|
+
# "PWD" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work",
|
228
|
+
# "TMPDIR" =>"/mnt/hadoop/mapred/local/taskTracker/jobcache/job_200910221152_0023/attempt_200910221152_0023_m_000000_0/work/tmp",
|
229
|
+
# "stream_map_streamprocessor" =>"%2Fusr%2Fbin%2Fruby1.8+%2Fmnt%2Fhome%2Fflip%2Fics%2Fwuclan%2Fexamples%2Ftwitter%2Fparse%2Fparse_twitter_search_requests.rb+--map+--rm",
|
230
|
+
# "user_name" =>"flip",
|