wukong 1.5.0 → 1.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,17 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- $stderr.puts `jar xvf *.jar `
3
- $stderr.puts `tar xvjf *.tar.bz2 `
4
- $stderr.puts `ls -lR . /mnt/var/lib/hadoop/mapred/taskTracker/archive `
5
-
6
- Dir['/mnt/var/lib/hadoop/mapred/taskTracker/archive/**/lib'].each{|dir| $: << dir }
7
- Dir['./**/lib'].each{|dir| $: << dir }
2
+ Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
8
3
  require 'rubygems'
9
4
  require 'wukong'
10
- begin
11
- require 'wukong/script/emr_command'
12
- rescue
13
- nil
14
- end
15
5
 
16
6
  class FooStreamer < Wukong::Streamer::LineStreamer
17
7
  def initialize *args
@@ -20,16 +10,9 @@ class FooStreamer < Wukong::Streamer::LineStreamer
20
10
  end
21
11
 
22
12
  def process *args
23
- yield [@line_no, *args]
13
+ yield ["%5d" % @line_no, *args]
24
14
  @line_no += 1
25
15
  end
26
16
  end
27
17
 
28
- case
29
- when ($0 =~ /mapper\.rb/) then Settings[:map] = true
30
- when ($0 =~ /reducer\.rb/) then Settings[:reduce] = true
31
- end
32
-
33
18
  Wukong::Script.new(FooStreamer, FooStreamer).run
34
- puts 'done!'
35
- puts $0
@@ -86,14 +86,14 @@ module Wukong
86
86
  # thus, requiring a working hadoop install), or to run in local mode
87
87
  # (script --map | sort | script --reduce)
88
88
  #
89
- Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
90
- Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
91
- Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
92
- Settings.define :run, :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud.", :wukong => true
93
- Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
94
- Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
95
- Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
96
- Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
89
+ Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
90
+ Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
91
+ Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
92
+ Settings.define :run, :env_var => 'WUKONG_RUN_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'map' or 'reduce' to run that phase.", :wukong => true
93
+ Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
94
+ Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
95
+ Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
96
+ Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
97
97
 
98
98
  #
99
99
  # Instantiate the Script with the Mapper and the Reducer class (each a
@@ -204,7 +204,7 @@ module Wukong
204
204
  # use the running framework to relaunch the script in map and in reduce mode
205
205
  #
206
206
  def execute_command! *args
207
- command = args.flatten.compact.join(" \\\n ")
207
+ command = args.flatten.reject(&:blank?).join(" \\\n ")
208
208
  Log.info "Running\n\n#{command}\n"
209
209
  if options[:dry_run]
210
210
  Log.info '== [Not running preceding command: dry run] =='
@@ -1,13 +1,14 @@
1
1
  require 'right_aws'
2
2
  require 'configliere/config_block'
3
3
  Settings.read(File.expand_path('~/.wukong/emr.yaml'))
4
- Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
5
- Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
6
- Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
7
- Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
8
- Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) }
9
- Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') }
10
- Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
4
+ Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
5
+ Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
6
+ Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
7
+ Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
8
+ Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
9
+ Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) if Settings.key_pair_file }
10
+ Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
11
+ Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
11
12
  Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
12
13
  Settings.define :jobflow
13
14
  module Wukong
@@ -26,57 +27,71 @@ module Wukong
26
27
  S3Util.store(this_script_filename, mapper_s3_uri)
27
28
  S3Util.store(this_script_filename, reducer_s3_uri)
28
29
  S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
29
- S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
30
- S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
31
30
  end
32
31
 
33
32
  def execute_emr_runner
34
- command_args = [
35
- :hadoop_version, :availability_zone, :key_pair, :key_pair_file,
36
- ].map{|args| Settings.dashed_flag_for(*args) }
33
+ command_args = []
34
+ command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
35
+ command_args += emr_credentials
36
+ if Settings.jobflow
37
+ command_args << Settings.dashed_flag_for(:jobflow)
38
+ else
39
+ command_args << Settings.dashed_flag_for(:alive)
40
+ command_args << "--create --name=#{job_name}"
41
+ command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type).join(' ')
42
+ end
37
43
  command_args += [
38
- %Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} },
39
- "--stream",
40
- "--mapper=#{mapper_s3_uri}",
41
- "--reducer=#{reducer_s3_uri}",
42
- "--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
43
- #"--enable-debugging --log-uri=#{log_s3_uri}",
44
- "--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
45
- "--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
46
44
  "--bootstrap-action=#{bootstrap_s3_uri}",
45
+ "--log-uri=#{log_s3_uri}",
46
+ "--stream",
47
+ "--mapper=#{mapper_s3_uri} ",
48
+ "--reducer=#{reducer_s3_uri} ",
49
+ "--input=#{input_paths} --output=#{output_path}",
50
+ # to specify zero reducers:
51
+ # "--arg '-D mapred.reduce.tasks=0'"
47
52
  ]
48
- if Settings.jobflow
49
- command_args << "--jobflow=#{Settings[:jobflow]}"
50
- else
51
- command_args << '--alive --create'
52
- command_args << "--name=#{job_name}"
53
- command_args += [ [:instance_type, :slave_instance_type] , :master_instance_type, :num_instances, ].map{|args| Settings.dashed_flag_for(*args) }
54
- end
53
+ Log.info 'Follow along at http://localhost:9000/job'
55
54
  execute_command!( File.expand_path(Settings.emr_runner), *command_args )
56
55
  end
57
56
 
57
+ def emr_ship_jars
58
+ S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
59
+ # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
60
+ end
61
+
62
+ def emr_credentials
63
+ command_args = []
64
+ if Settings.emr_credentials_file
65
+ command_args << "--credentials #{File.expand_path(Settings.emr_credentials_file)}"
66
+ else
67
+ command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
68
+ end
69
+ command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
70
+ command_args
71
+ end
72
+
58
73
  # A short name for this job
59
74
  def job_handle
60
75
  File.basename($0,'.rb')
61
76
  end
62
77
 
63
78
  def mapper_s3_uri
64
- s3_path(job_handle+'-mapper.rb')
79
+ emr_s3_path(job_handle+'-mapper.rb')
65
80
  end
66
81
  def reducer_s3_uri
67
- s3_path(job_handle+'-reducer.rb')
82
+ emr_s3_path(job_handle+'-reducer.rb')
68
83
  end
69
84
  def log_s3_uri
70
- s3_path('log', job_handle)
85
+ emr_s3_path('log', job_handle)
71
86
  end
72
87
  def bootstrap_s3_uri
73
- s3_path('bin', "bootstrap-#{job_handle}.sh")
88
+ emr_s3_path('bin', "bootstrap-#{job_handle}.sh")
74
89
  end
75
90
  def wukong_libs_s3_uri
76
- s3_path('bin', "wukong-libs.tar.bz2")
91
+ emr_s3_path('bin', "wukong-libs.jar")
77
92
  end
78
93
 
79
- def s3_path *path_segs
94
+ def emr_s3_path *path_segs
80
95
  File.join(Settings.emr_root, path_segs.flatten.compact)
81
96
  end
82
97
 
@@ -76,14 +76,10 @@
76
76
  raise "override the finalize method in your subclass"
77
77
  end
78
78
 
79
- #
80
- # Must make sure to finalize the last-seen accumulation.
81
- #
82
- def stream
83
- super
84
- # don't finalize if we never saw any field at all
79
+ # Finalize the last-seen group.
80
+ def after_stream *args
85
81
  finalize(){|record| emit record } unless (self.key == :__first_pass__)
86
- after_stream
82
+ super *args
87
83
  end
88
84
  end
89
85
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "1.5.0"
8
+ s.version = "1.5.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2010-08-10}
12
+ s.date = %q{2010-08-11}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- hash: 3
4
+ hash: 1
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 5
9
- - 0
10
- version: 1.5.0
9
+ - 1
10
+ version: 1.5.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Philip (flip) Kromer
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-10 00:00:00 -05:00
18
+ date: 2010-08-11 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency