wukong 1.5.0 → 1.5.1
Sign up to get free protection for your applications and to get access to all the features.
@@ -1,17 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
$stderr.puts `tar xvjf *.tar.bz2 `
|
4
|
-
$stderr.puts `ls -lR . /mnt/var/lib/hadoop/mapred/taskTracker/archive `
|
5
|
-
|
6
|
-
Dir['/mnt/var/lib/hadoop/mapred/taskTracker/archive/**/lib'].each{|dir| $: << dir }
|
7
|
-
Dir['./**/lib'].each{|dir| $: << dir }
|
2
|
+
Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
|
8
3
|
require 'rubygems'
|
9
4
|
require 'wukong'
|
10
|
-
begin
|
11
|
-
require 'wukong/script/emr_command'
|
12
|
-
rescue
|
13
|
-
nil
|
14
|
-
end
|
15
5
|
|
16
6
|
class FooStreamer < Wukong::Streamer::LineStreamer
|
17
7
|
def initialize *args
|
@@ -20,16 +10,9 @@ class FooStreamer < Wukong::Streamer::LineStreamer
|
|
20
10
|
end
|
21
11
|
|
22
12
|
def process *args
|
23
|
-
yield [@line_no, *args]
|
13
|
+
yield ["%5d" % @line_no, *args]
|
24
14
|
@line_no += 1
|
25
15
|
end
|
26
16
|
end
|
27
17
|
|
28
|
-
case
|
29
|
-
when ($0 =~ /mapper\.rb/) then Settings[:map] = true
|
30
|
-
when ($0 =~ /reducer\.rb/) then Settings[:reduce] = true
|
31
|
-
end
|
32
|
-
|
33
18
|
Wukong::Script.new(FooStreamer, FooStreamer).run
|
34
|
-
puts 'done!'
|
35
|
-
puts $0
|
data/lib/wukong/script.rb
CHANGED
@@ -86,14 +86,14 @@ module Wukong
|
|
86
86
|
# thus, requiring a working hadoop install), or to run in local mode
|
87
87
|
# (script --map | sort | script --reduce)
|
88
88
|
#
|
89
|
-
Settings.define :default_run_mode,
|
90
|
-
Settings.define :map_command,
|
91
|
-
Settings.define :reduce_command,
|
92
|
-
Settings.define :run,
|
93
|
-
Settings.define :map,
|
94
|
-
Settings.define :reduce,
|
95
|
-
Settings.define :dry_run,
|
96
|
-
Settings.define :rm,
|
89
|
+
Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
|
90
|
+
Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
|
91
|
+
Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
|
92
|
+
Settings.define :run, :env_var => 'WUKONG_RUN_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'map' or 'reduce' to run that phase.", :wukong => true
|
93
|
+
Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
|
94
|
+
Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
|
95
|
+
Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
|
96
|
+
Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
|
97
97
|
|
98
98
|
#
|
99
99
|
# Instantiate the Script with the Mapper and the Reducer class (each a
|
@@ -204,7 +204,7 @@ module Wukong
|
|
204
204
|
# use the running framework to relaunch the script in map and in reduce mode
|
205
205
|
#
|
206
206
|
def execute_command! *args
|
207
|
-
command = args.flatten.
|
207
|
+
command = args.flatten.reject(&:blank?).join(" \\\n ")
|
208
208
|
Log.info "Running\n\n#{command}\n"
|
209
209
|
if options[:dry_run]
|
210
210
|
Log.info '== [Not running preceding command: dry run] =='
|
@@ -1,13 +1,14 @@
|
|
1
1
|
require 'right_aws'
|
2
2
|
require 'configliere/config_block'
|
3
3
|
Settings.read(File.expand_path('~/.wukong/emr.yaml'))
|
4
|
-
Settings.define :
|
5
|
-
Settings.define :
|
6
|
-
Settings.define :
|
7
|
-
Settings.define :
|
8
|
-
Settings.define :
|
9
|
-
Settings.define :
|
10
|
-
Settings.define :
|
4
|
+
Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
|
5
|
+
Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
|
6
|
+
Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
|
7
|
+
Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
|
8
|
+
Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
|
9
|
+
Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) if Settings.key_pair_file }
|
10
|
+
Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
|
11
|
+
Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
|
11
12
|
Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
|
12
13
|
Settings.define :jobflow
|
13
14
|
module Wukong
|
@@ -26,57 +27,71 @@ module Wukong
|
|
26
27
|
S3Util.store(this_script_filename, mapper_s3_uri)
|
27
28
|
S3Util.store(this_script_filename, reducer_s3_uri)
|
28
29
|
S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
|
29
|
-
S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
|
30
|
-
S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
|
31
30
|
end
|
32
31
|
|
33
32
|
def execute_emr_runner
|
34
|
-
command_args = [
|
35
|
-
|
36
|
-
|
33
|
+
command_args = []
|
34
|
+
command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
|
35
|
+
command_args += emr_credentials
|
36
|
+
if Settings.jobflow
|
37
|
+
command_args << Settings.dashed_flag_for(:jobflow)
|
38
|
+
else
|
39
|
+
command_args << Settings.dashed_flag_for(:alive)
|
40
|
+
command_args << "--create --name=#{job_name}"
|
41
|
+
command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type).join(' ')
|
42
|
+
end
|
37
43
|
command_args += [
|
38
|
-
%Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} },
|
39
|
-
"--stream",
|
40
|
-
"--mapper=#{mapper_s3_uri}",
|
41
|
-
"--reducer=#{reducer_s3_uri}",
|
42
|
-
"--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
|
43
|
-
#"--enable-debugging --log-uri=#{log_s3_uri}",
|
44
|
-
"--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
|
45
|
-
"--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
|
46
44
|
"--bootstrap-action=#{bootstrap_s3_uri}",
|
45
|
+
"--log-uri=#{log_s3_uri}",
|
46
|
+
"--stream",
|
47
|
+
"--mapper=#{mapper_s3_uri} ",
|
48
|
+
"--reducer=#{reducer_s3_uri} ",
|
49
|
+
"--input=#{input_paths} --output=#{output_path}",
|
50
|
+
# to specify zero reducers:
|
51
|
+
# "--arg '-D mapred.reduce.tasks=0'"
|
47
52
|
]
|
48
|
-
|
49
|
-
command_args << "--jobflow=#{Settings[:jobflow]}"
|
50
|
-
else
|
51
|
-
command_args << '--alive --create'
|
52
|
-
command_args << "--name=#{job_name}"
|
53
|
-
command_args += [ [:instance_type, :slave_instance_type] , :master_instance_type, :num_instances, ].map{|args| Settings.dashed_flag_for(*args) }
|
54
|
-
end
|
53
|
+
Log.info 'Follow along at http://localhost:9000/job'
|
55
54
|
execute_command!( File.expand_path(Settings.emr_runner), *command_args )
|
56
55
|
end
|
57
56
|
|
57
|
+
def emr_ship_jars
|
58
|
+
S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
|
59
|
+
# "--cache-archive=#{wukong_libs_s3_uri}#vendor",
|
60
|
+
end
|
61
|
+
|
62
|
+
def emr_credentials
|
63
|
+
command_args = []
|
64
|
+
if Settings.emr_credentials_file
|
65
|
+
command_args << "--credentials #{File.expand_path(Settings.emr_credentials_file)}"
|
66
|
+
else
|
67
|
+
command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
|
68
|
+
end
|
69
|
+
command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
|
70
|
+
command_args
|
71
|
+
end
|
72
|
+
|
58
73
|
# A short name for this job
|
59
74
|
def job_handle
|
60
75
|
File.basename($0,'.rb')
|
61
76
|
end
|
62
77
|
|
63
78
|
def mapper_s3_uri
|
64
|
-
|
79
|
+
emr_s3_path(job_handle+'-mapper.rb')
|
65
80
|
end
|
66
81
|
def reducer_s3_uri
|
67
|
-
|
82
|
+
emr_s3_path(job_handle+'-reducer.rb')
|
68
83
|
end
|
69
84
|
def log_s3_uri
|
70
|
-
|
85
|
+
emr_s3_path('log', job_handle)
|
71
86
|
end
|
72
87
|
def bootstrap_s3_uri
|
73
|
-
|
88
|
+
emr_s3_path('bin', "bootstrap-#{job_handle}.sh")
|
74
89
|
end
|
75
90
|
def wukong_libs_s3_uri
|
76
|
-
|
91
|
+
emr_s3_path('bin', "wukong-libs.jar")
|
77
92
|
end
|
78
93
|
|
79
|
-
def s3_path *path_segs
|
94
|
+
def emr_s3_path *path_segs
|
80
95
|
File.join(Settings.emr_root, path_segs.flatten.compact)
|
81
96
|
end
|
82
97
|
|
@@ -76,14 +76,10 @@
|
|
76
76
|
raise "override the finalize method in your subclass"
|
77
77
|
end
|
78
78
|
|
79
|
-
#
|
80
|
-
|
81
|
-
#
|
82
|
-
def stream
|
83
|
-
super
|
84
|
-
# don't finalize if we never saw any field at all
|
79
|
+
# Finalize the last-seen group.
|
80
|
+
def after_stream *args
|
85
81
|
finalize(){|record| emit record } unless (self.key == :__first_pass__)
|
86
|
-
|
82
|
+
super *args
|
87
83
|
end
|
88
84
|
end
|
89
85
|
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "1.5.0"
|
8
|
+
s.version = "1.5.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2010-08-
|
12
|
+
s.date = %q{2010-08-11}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it's efficient to process by lines
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 1
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 5
|
9
|
-
- 0
|
10
|
-
version: 1.5.0
|
9
|
+
- 1
|
10
|
+
version: 1.5.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Philip (flip) Kromer
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-11 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|