wukong 1.5.0 → 1.5.1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,17 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
- $stderr.puts `jar xvf *.jar `
3
- $stderr.puts `tar xvjf *.tar.bz2 `
4
- $stderr.puts `ls -lR . /mnt/var/lib/hadoop/mapred/taskTracker/archive `
5
-
6
- Dir['/mnt/var/lib/hadoop/mapred/taskTracker/archive/**/lib'].each{|dir| $: << dir }
7
- Dir['./**/lib'].each{|dir| $: << dir }
2
+ Dir[File.dirname(__FILE__)+'/vendor/**/lib'].each{|dir| $: << dir }
8
3
  require 'rubygems'
9
4
  require 'wukong'
10
- begin
11
- require 'wukong/script/emr_command'
12
- rescue
13
- nil
14
- end
15
5
 
16
6
  class FooStreamer < Wukong::Streamer::LineStreamer
17
7
  def initialize *args
@@ -20,16 +10,9 @@ class FooStreamer < Wukong::Streamer::LineStreamer
20
10
  end
21
11
 
22
12
  def process *args
23
- yield [@line_no, *args]
13
+ yield ["%5d" % @line_no, *args]
24
14
  @line_no += 1
25
15
  end
26
16
  end
27
17
 
28
- case
29
- when ($0 =~ /mapper\.rb/) then Settings[:map] = true
30
- when ($0 =~ /reducer\.rb/) then Settings[:reduce] = true
31
- end
32
-
33
18
  Wukong::Script.new(FooStreamer, FooStreamer).run
34
- puts 'done!'
35
- puts $0
@@ -86,14 +86,14 @@ module Wukong
86
86
  # thus, requiring a working hadoop install), or to run in local mode
87
87
  # (script --map | sort | script --reduce)
88
88
  #
89
- Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
90
- Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
91
- Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
92
- Settings.define :run, :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud.", :wukong => true
93
- Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
94
- Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
95
- Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
96
- Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
89
+ Settings.define :default_run_mode, :default => 'hadoop', :description => 'Run mode: local, hadoop, emr (elastic mapreduce)', :wukong => true, :hide_help => false
90
+ Settings.define :map_command, :description => "shell command to run as mapper, in place of this wukong script", :wukong => true
91
+ Settings.define :reduce_command, :description => "shell command to run as reducer, in place of this wukong script", :wukong => true
92
+ Settings.define :run, :env_var => 'WUKONG_RUN_MODE', :description => "run the script's workflow: Specify 'hadoop' to use hadoop streaming; 'local' to run your_script.rb --map | sort | your_script.rb --reduce; 'emr' to launch on the amazon cloud; 'map' or 'reduce' to run that phase.", :wukong => true
93
+ Settings.define :map, :description => "run the script's map phase. Reads/writes to STDIN/STDOUT.", :wukong => true
94
+ Settings.define :reduce, :description => "run the script's reduce phase. Reads/writes to STDIN/STDOUT. You can only choose one of --run, --map or --reduce.", :wukong => true
95
+ Settings.define :dry_run, :description => "echo the command that will be run, but don't run it", :wukong => true
96
+ Settings.define :rm, :description => "Recursively remove the destination directory. Only used in hadoop mode.", :wukong => true
97
97
 
98
98
  #
99
99
  # Instantiate the Script with the Mapper and the Reducer class (each a
@@ -204,7 +204,7 @@ module Wukong
204
204
  # use the running framework to relaunch the script in map and in reduce mode
205
205
  #
206
206
  def execute_command! *args
207
- command = args.flatten.compact.join(" \\\n ")
207
+ command = args.flatten.reject(&:blank?).join(" \\\n ")
208
208
  Log.info "Running\n\n#{command}\n"
209
209
  if options[:dry_run]
210
210
  Log.info '== [Not running preceding command: dry run] =='
@@ -1,13 +1,14 @@
1
1
  require 'right_aws'
2
2
  require 'configliere/config_block'
3
3
  Settings.read(File.expand_path('~/.wukong/emr.yaml'))
4
- Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
5
- Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
6
- Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
7
- Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
8
- Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) }
9
- Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') }
10
- Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
4
+ Settings.define :emr_credentials_file, :description => 'A .json file holding your AWS access credentials. See http://bit.ly/emr_credentials_file for format'
5
+ Settings.define :access_key, :description => 'AWS Access key', :env_var => 'AWS_ACCESS_KEY_ID'
6
+ Settings.define :secret_access_key, :description => 'AWS Secret Access key', :env_var => 'AWS_SECRET_ACCESS_KEY'
7
+ Settings.define :emr_runner, :description => 'Path to the elastic-mapreduce command (~ etc will be expanded)'
8
+ Settings.define :emr_root, :description => 'S3 url to use as the base for Elastic MapReduce storage'
9
+ Settings.define :key_pair_file, :description => 'AWS Key pair file', :finally => lambda{ Settings.key_pair_file = File.expand_path(Settings.key_pair_file.to_s) if Settings.key_pair_file }
10
+ Settings.define :key_pair, :description => "AWS Key pair name. If not specified, it's taken from key_pair_file's basename", :finally => lambda{ Settings.key_pair ||= File.basename(Settings.key_pair_file.to_s, '.pem') if Settings.key_pair_file }
11
+ Settings.define :instance_type, :description => 'AWS instance type to use', :default => 'm1.small'
11
12
  Settings.define :master_instance_type, :description => 'Overrides the instance type for the master node', :finally => lambda{ Settings.master_instance_type ||= Settings.instance_type }
12
13
  Settings.define :jobflow
13
14
  module Wukong
@@ -26,57 +27,71 @@ module Wukong
26
27
  S3Util.store(this_script_filename, mapper_s3_uri)
27
28
  S3Util.store(this_script_filename, reducer_s3_uri)
28
29
  S3Util.store(File.expand_path('~/ics/wukong/bin/bootstrap.sh'), bootstrap_s3_uri)
29
- S3Util.store(File.expand_path('/tmp/wukong-libs.tar.bz2'), wukong_libs_s3_uri)
30
- S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), s3_path('bin', "wukong-libs.jar"))
31
30
  end
32
31
 
33
32
  def execute_emr_runner
34
- command_args = [
35
- :hadoop_version, :availability_zone, :key_pair, :key_pair_file,
36
- ].map{|args| Settings.dashed_flag_for(*args) }
33
+ command_args = []
34
+ command_args << Settings.dashed_flags(:hadoop_version, :enable_debugging, :step_action, [:emr_runner_verbose, :verbose], [:emr_runner_debug, :debug]).join(' ')
35
+ command_args += emr_credentials
36
+ if Settings.jobflow
37
+ command_args << Settings.dashed_flag_for(:jobflow)
38
+ else
39
+ command_args << Settings.dashed_flag_for(:alive)
40
+ command_args << "--create --name=#{job_name}"
41
+ command_args << Settings.dashed_flags(:num_instances, [:instance_type, :slave_instance_type], :master_instance_type).join(' ')
42
+ end
37
43
  command_args += [
38
- %Q{--verbose --debug --access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} },
39
- "--stream",
40
- "--mapper=#{mapper_s3_uri}",
41
- "--reducer=#{reducer_s3_uri}",
42
- "--input=#{mapper_s3_uri} --output=#{Settings.emr_root+'/foo-out.tsv'}",
43
- #"--enable-debugging --log-uri=#{log_s3_uri}",
44
- "--cache-archive=#{s3_path('bin', "wukong-libs.jar")}#wukong-libs.jar",
45
- "--cache=#{wukong_libs_s3_uri}##{File.basename wukong_libs_s3_uri}",
46
44
  "--bootstrap-action=#{bootstrap_s3_uri}",
45
+ "--log-uri=#{log_s3_uri}",
46
+ "--stream",
47
+ "--mapper=#{mapper_s3_uri} ",
48
+ "--reducer=#{reducer_s3_uri} ",
49
+ "--input=#{input_paths} --output=#{output_path}",
50
+ # to specify zero reducers:
51
+ # "--arg '-D mapred.reduce.tasks=0'"
47
52
  ]
48
- if Settings.jobflow
49
- command_args << "--jobflow=#{Settings[:jobflow]}"
50
- else
51
- command_args << '--alive --create'
52
- command_args << "--name=#{job_name}"
53
- command_args += [ [:instance_type, :slave_instance_type] , :master_instance_type, :num_instances, ].map{|args| Settings.dashed_flag_for(*args) }
54
- end
53
+ Log.info 'Follow along at http://localhost:9000/job'
55
54
  execute_command!( File.expand_path(Settings.emr_runner), *command_args )
56
55
  end
57
56
 
57
+ def emr_ship_jars
58
+ S3Util.store(File.expand_path('/tmp/wukong-libs.jar'), wukong_libs_s3_uri)
59
+ # "--cache-archive=#{wukong_libs_s3_uri}#vendor",
60
+ end
61
+
62
+ def emr_credentials
63
+ command_args = []
64
+ if Settings.emr_credentials_file
65
+ command_args << "--credentials #{File.expand_path(Settings.emr_credentials_file)}"
66
+ else
67
+ command_args << %Q{--access-id #{Settings.access_key} --private-key #{Settings.secret_access_key} }
68
+ end
69
+ command_args << Settings.dashed_flags(:availability_zone, :key_pair, :key_pair_file).join(' ')
70
+ command_args
71
+ end
72
+
58
73
  # A short name for this job
59
74
  def job_handle
60
75
  File.basename($0,'.rb')
61
76
  end
62
77
 
63
78
  def mapper_s3_uri
64
- s3_path(job_handle+'-mapper.rb')
79
+ emr_s3_path(job_handle+'-mapper.rb')
65
80
  end
66
81
  def reducer_s3_uri
67
- s3_path(job_handle+'-reducer.rb')
82
+ emr_s3_path(job_handle+'-reducer.rb')
68
83
  end
69
84
  def log_s3_uri
70
- s3_path('log', job_handle)
85
+ emr_s3_path('log', job_handle)
71
86
  end
72
87
  def bootstrap_s3_uri
73
- s3_path('bin', "bootstrap-#{job_handle}.sh")
88
+ emr_s3_path('bin', "bootstrap-#{job_handle}.sh")
74
89
  end
75
90
  def wukong_libs_s3_uri
76
- s3_path('bin', "wukong-libs.tar.bz2")
91
+ emr_s3_path('bin', "wukong-libs.jar")
77
92
  end
78
93
 
79
- def s3_path *path_segs
94
+ def emr_s3_path *path_segs
80
95
  File.join(Settings.emr_root, path_segs.flatten.compact)
81
96
  end
82
97
 
@@ -76,14 +76,10 @@
76
76
  raise "override the finalize method in your subclass"
77
77
  end
78
78
 
79
- #
80
- # Must make sure to finalize the last-seen accumulation.
81
- #
82
- def stream
83
- super
84
- # don't finalize if we never saw any field at all
79
+ # Finalize the last-seen group.
80
+ def after_stream *args
85
81
  finalize(){|record| emit record } unless (self.key == :__first_pass__)
86
- after_stream
82
+ super *args
87
83
  end
88
84
  end
89
85
 
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "1.5.0"
8
+ s.version = "1.5.1"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2010-08-10}
12
+ s.date = %q{2010-08-11}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- hash: 3
4
+ hash: 1
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 5
9
- - 0
10
- version: 1.5.0
9
+ - 1
10
+ version: 1.5.1
11
11
  platform: ruby
12
12
  authors:
13
13
  - Philip (flip) Kromer
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-08-10 00:00:00 -05:00
18
+ date: 2010-08-11 00:00:00 -05:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency