wukong 1.5.0 → 1.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
$stderr.puts `tar xvjf *.tar.bz2 `
|
4
|
-
$stderr.puts `ls -lR . /mnt/var/lib/hadoop/mapred/taskTracker/archive `
|
5
|
-
|
6
|
-
Dir['/mnt/var/lib/hadoop/mapred/taskTracker/archive/**/lib'].each{|dir| $: << dir }
|
7
|
-
Dir['./**/lib'].each{|dir| $: << dir }
|
2
|
+
Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
|
8
3
|
require 'rubygems'
|
9
4
|
require 'wukong'
|
10
|
-
begin
|
11
|
-
require 'wukong/script/emr_command'
|
12
|
-
rescue
|
13
|
-
nil
|
14
|
-
end
|
15
5
|
|
16
6
|
class FooStreamer < Wukong::Streamer::LineStreamer
|
17
7
|
def initialize *args
|
@@ -20,16 +10,9 @@ class FooStreamer < Wukong::Streamer::LineStreamer
|
|
20
10
|
end
|
21
11
|
|
22
12
|
def process *args
|
23
|
-
yield [@line_no, *args]
|
13
|
+
yield ["%5d" % @line_no, *args]
|
24
14
|
@line_no += 1
|
25
15
|
end
|
26
16
|
end
|
27
17
|
|
28
|
-
case
|
29
|
-
when ($0 =~ /mapper\.rb/) then Settings[:map] = true
|
30
|
-
when ($0 =~ /reducer\.rb/) then Settings[:reduce] = true
|
31
|
-
end
|
32
|
-
|
33
18
|
Wukong::Script.new(FooStreamer, FooStreamer).run
|
34
|
-
puts 'done!'
|
35
|
-
puts $0
|
data/lib/wukong/script.rb
CHANGED
@@ -86,14 +86,14 @@ module Wukong
|
|
86
86
|
# thus, requiring a working hadoop install), or to run in local mode
|
87
87
|
# (script --map | sort | script --reduce)
|
88
88
|
#
|
89
|
-
Settings.define :default_run_mode,
|
90
|
-
Settings.define :map_command,
|
91
|
-
Settings.define :reduce_command,
|
92
|
-
Settings.define :run,
|
93
|
-
Settings.define :map,
|
94
|
-
Settings.define :reduce,
|
95
|
-
Settings.define :dry_run,
|
96
|
-
Settings.define :rm,
|
89
|
+
Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
|
90
|
+
Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
|
91
|
+
Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
|
92
|
+
Settings.define :run, :env_var => 'WUKONG_RUN_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'map' or 'reduce' to run that phase.", :wukong => true
|
93
|
+
Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
|
94
|
+
Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
|
95
|
+
Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
|
96
|
+
Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
|
97
97
|
|
98
98
|
#
|
99
99
|
# Instantiate the Script with the Mapper and the Reducer class (each a
|
@@ -204,7 +204,7 @@ module Wukong
|
|
204
204
|
# use the running framework to relaunch the script in map and in reduce mode
|
205
205
|
#
|
206
206
|
def execute_command! *args
|
207
|
-
command = args.flatten.
|
207
|
+
command = args.flatten.reject(&:blank?).join(" \\\n ")
|
208
208
|
Log.info "Running\n\n#{command}\n"
|
209
209
|
if options[:dry_run]
|
210
210
|
Log.info '== [Not running preceding command: dry run] =='
|
@@ -1,13 +1,14 @@
|
|
1
1
|
require 'right_aws'
|
2
2
|
require 'configliere/config_block'
|
3
3
|
Settings.read(File.expand_path('~/.wukong/emr.yaml'))
|
4
|
-
Settings.define :
|
5
|
-
Settings.define :
|
6
|
-
Settings.define :
|
7
|
-
Settings.define :
|
8
|
-
Settings.define :
|
9
|
-
Settings.define :
|
10
|
-
Settings.define :
|
4
|
+
Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
|
5
|
+
Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
|
6
|
+
Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
|
7
|
+
Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
|
8
|
+
Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
|
9
|
+
Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) if Settings.key_pair_file }
|
10
|
+
Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
|
11
|
+
Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
|
11
12
|
Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
|
12
13
|
Settings.define :jobflow
|
13
14
|
module Wukong
|
@@ -26,57 +27,71 @@ module Wukong
|
|
26
27
|
S3Util.store(this_script_filename, mapper_s3_uri)
|
27
28
|
S3Util.store(this_script_filename, reducer_s3_uri)
|
28
29
|
S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
|
29
|
-
S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
|
30
|
-
S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
|
31
30
|
end
|
32
31
|
|
33
32
|
def execute_emr_runner
|
34
|
-
command_args = [
|
35
|
-
|
36
|
-
|
33
|
+
command_args = []
|
34
|
+
command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
|
35
|
+
command_args += emr_credentials
|
36
|
+
if Settings.jobflow
|
37
|
+
command_args << Settings.dashed_flag_for(:jobflow)
|
38
|
+
else
|
39
|
+
command_args << Settings.dashed_flag_for(:alive)
|
40
|
+
command_args << "--create --name=#{job_name}"
|
41
|
+
command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type).join(' ')
|
42
|
+
end
|
37
43
|
command_args += [
|
38
|
-
%Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} },
|
39
|
-
"--stream",
|
40
|
-
"--mapper=#{mapper_s3_uri}",
|
41
|
-
"--reducer=#{reducer_s3_uri}",
|
42
|
-
"--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
|
43
|
-
#"--enable-debugging --log-uri=#{log_s3_uri}",
|
44
|
-
"--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
|
45
|
-
"--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
|
46
44
|
"--bootstrap-action=#{bootstrap_s3_uri}",
|
45
|
+
"--log-uri=#{log_s3_uri}",
|
46
|
+
"--stream",
|
47
|
+
"--mapper=#{mapper_s3_uri} ",
|
48
|
+
"--reducer=#{reducer_s3_uri} ",
|
49
|
+
"--input=#{input_paths} --output=#{output_path}",
|
50
|
+
# to specify zero reducers:
|
51
|
+
# "--arg '-D mapred.reduce.tasks=0'"
|
47
52
|
]
|
48
|
-
|
49
|
-
command_args << "--jobflow=#{Settings[:jobflow]}"
|
50
|
-
else
|
51
|
-
command_args << '--alive --create'
|
52
|
-
command_args << "--name=#{job_name}"
|
53
|
-
command_args += [ [:instance_type, :slave_instance_type] , :master_instance_type, :num_instances, ].map{|args| Settings.dashed_flag_for(*args) }
|
54
|
-
end
|
53
|
+
Log.info 'Follow along at http://localhost:9000/job'
|
55
54
|
execute_command!( File.expand_path(Settings.emr_runner), *command_args )
|
56
55
|
end
|
57
56
|
|
57
|
+
def emr_ship_jars
|
58
|
+
S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
|
59
|
+
# "--cache-archive=#{wukong_libs_s3_uri}#vendor",
|
60
|
+
end
|
61
|
+
|
62
|
+
def emr_credentials
|
63
|
+
command_args = []
|
64
|
+
if Settings.emr_credentials_file
|
65
|
+
command_args << "--credentials #{File.expand_path(Settings.emr_credentials_file)}"
|
66
|
+
else
|
67
|
+
command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
|
68
|
+
end
|
69
|
+
command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
|
70
|
+
command_args
|
71
|
+
end
|
72
|
+
|
58
73
|
# A short name for this job
|
59
74
|
def job_handle
|
60
75
|
File.basename($0,'.rb')
|
61
76
|
end
|
62
77
|
|
63
78
|
def mapper_s3_uri
|
64
|
-
|
79
|
+
emr_s3_path(job_handle+'-mapper.rb')
|
65
80
|
end
|
66
81
|
def reducer_s3_uri
|
67
|
-
|
82
|
+
emr_s3_path(job_handle+'-reducer.rb')
|
68
83
|
end
|
69
84
|
def log_s3_uri
|
70
|
-
|
85
|
+
emr_s3_path('log', job_handle)
|
71
86
|
end
|
72
87
|
def bootstrap_s3_uri
|
73
|
-
|
88
|
+
emr_s3_path('bin', "bootstrap-#{job_handle}.sh")
|
74
89
|
end
|
75
90
|
def wukong_libs_s3_uri
|
76
|
-
|
91
|
+
emr_s3_path('bin', "wukong-libs.jar")
|
77
92
|
end
|
78
93
|
|
79
|
-
def
|
94
|
+
def emr_s3_path *path_segs
|
80
95
|
File.join(Settings.emr_root, path_segs.flatten.compact)
|
81
96
|
end
|
82
97
|
|
@@ -76,14 +76,10 @@
|
|
76
76
|
raise "override the finalize method in your subclass"
|
77
77
|
end
|
78
78
|
|
79
|
-
#
|
80
|
-
|
81
|
-
#
|
82
|
-
def stream
|
83
|
-
super
|
84
|
-
# don't finalize if we never saw any field at all
|
79
|
+
# Finalize the last-seen group.
|
80
|
+
def after_stream *args
|
85
81
|
finalize(){|record| emit record } unless (self.key == :__first_pass__)
|
86
|
-
|
82
|
+
super *args
|
87
83
|
end
|
88
84
|
end
|
89
85
|
|
data/wukong.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{wukong}
|
8
|
-
s.version = "1.5.
|
8
|
+
s.version = "1.5.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2010-08-
|
12
|
+
s.date = %q{2010-08-11}
|
13
13
|
s.description = %q{ Treat your dataset like a:
|
14
14
|
|
15
15
|
* stream of lines when it's efficient to process by lines
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wukong
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 1
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 5
|
9
|
-
-
|
10
|
-
version: 1.5.
|
9
|
+
- 1
|
10
|
+
version: 1.5.1
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Philip (flip) Kromer
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2010-08-
|
18
|
+
date: 2010-08-11 00:00:00 -05:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|