wukong-hadoop 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,3 +1,8 @@
1
1
  source :rubygems
2
2
 
3
3
  gemspec
4
+
5
+ group :development do
6
+ gem 'rake', '~> 0.9'
7
+ gem 'rspec', '~> 2'
8
+ end
@@ -16,6 +16,7 @@ module Wukong
16
16
  settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
17
17
  settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
18
18
  settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
19
+ settings.define :reduce_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
19
20
  settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
20
21
  settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
21
22
  settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
@@ -39,7 +40,10 @@ module Wukong
39
40
  settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
40
41
  settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
41
42
  settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
42
- settings.define :java_opts, wukong_hadoop: true, description: 'Additional java options to be passed to hadoop streaming.', :type => Array, :default => []
43
+ settings.define :java_opts, wukong_hadoop: true, description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
44
+ settings.define :files, wukong_hadoop: true, description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
45
+ settings.define :jars, wukong_hadoop: true, description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
46
+ settings.define :archives, wukong_hadoop: true, description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []
43
47
 
44
48
  # Options given on the command-line
45
49
  settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
@@ -24,6 +24,7 @@ module Wukong
24
24
  include ReduceLogic
25
25
  include HadoopInvocation
26
26
  include LocalInvocation
27
+ include Logging
27
28
 
28
29
  # The settings used by this driver.
29
30
  #
@@ -55,12 +56,12 @@ module Wukong
55
56
  # Run this driver.
56
57
  def run!
57
58
  if mode == :local
58
- # Log.info "Launching local!"
59
+ # log.info "Launching local!"
59
60
  execute_command!(local_commandline)
60
61
  else
61
62
  ensure_input_and_output!
62
63
  remove_output_path! if settings[:rm] || settings[:overwrite]
63
- Log.info "Launching Hadoop!"
64
+ log.info "Launching Hadoop!"
64
65
  execute_command!(hadoop_commandline)
65
66
  end
66
67
  end
@@ -177,7 +178,7 @@ module Wukong
177
178
  def execute_command!(*args)
178
179
  command = args.flatten.reject(&:blank?).join(" \\\n ")
179
180
  if settings[:dry_run]
180
- Log.info("Dry run:")
181
+ log.info("Dry run:")
181
182
  puts command
182
183
  else
183
184
  puts `#{command}`
@@ -16,8 +16,8 @@ module Wukong
16
16
  # Will not actually do anything if the <tt>--dry_run</tt> option
17
17
  # is also given.
18
18
  def remove_output_path!
19
- cmd = %Q{#{settings[:hadoop_runner]} fs -rmr '#{output_path}'}
20
- Log.info "Removing output file #{output_path}: #{cmd}"
19
+ cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
20
+ log.info "Removing output file #{output_path}: #{cmd}"
21
21
  puts `#{cmd}` unless settings[:dry_run]
22
22
  end
23
23
 
@@ -35,11 +35,11 @@ module Wukong
35
35
  hadoop_jobconf_options,
36
36
  "-D mapred.job.name='#{job_name}'",
37
37
  hadoop_other_args,
38
+ hadoop_files,
38
39
  "-mapper '#{mapper_commandline}'",
39
40
  "-reducer '#{reducer_commandline}'",
40
41
  "-input '#{input_paths}'",
41
42
  "-output '#{output_path}'",
42
- hadoop_files,
43
43
  io_formats,
44
44
  hadoop_recycle_env,
45
45
  ].flatten.compact.join(" \t\\\n ")
@@ -103,7 +103,7 @@ module Wukong
103
103
  settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
104
104
  settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
105
105
  # If no reducer and no reduce_command, then skip the reduce phase
106
- settings[:reduce_tasks] = 0 unless (reduce? || settings[:reduce_tasks].nil?)
106
+ settings[:reduce_tasks] ||= 0 unless reduce?
107
107
  # Fields hadoop should use to distribute records to reducers
108
108
  unless settings[:partition_fields].blank?
109
109
  jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
@@ -146,8 +146,24 @@ module Wukong
146
146
  #
147
147
  # http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
148
148
  def hadoop_files
149
- args.find_all { |arg| arg.to_s =~ /\.rb$/ }.map do |arg|
150
- "-file '#{arg}'"
149
+ args.find_all { |arg| arg.to_s =~ /\.rb$/ }.each do |arg|
150
+ settings[:files] << arg
151
+ end
152
+
153
+ [].tap do |files_options|
154
+ {
155
+ :files => '-files ',
156
+ :jars => '-libjars ',
157
+ :archives => '-archives '
158
+ }.each_pair do |file_type_name, file_option_name|
159
+ unless settings[file_type_name].nil? || settings[file_type_name].empty?
160
+ files = settings[file_type_name].map do |file_name_or_glob|
161
+ # Don't glob on the HDFS
162
+ file_type_name == :archives ? file_name_or_glob : [Dir[file_name_or_glob], file_name_or_glob]
163
+ end.flatten.compact.uniq.join(',')
164
+ files_options << "#{file_option_name}'#{files}'"
165
+ end
166
+ end
151
167
  end
152
168
  end
153
169
 
@@ -1,6 +1,6 @@
1
1
  module Wukong
2
2
  module Hadoop
3
3
  # The current version of Wukong-Hadoop.
4
- VERSION = '0.0.1'
4
+ VERSION = '0.0.2'
5
5
  end
6
6
  end
@@ -6,6 +6,7 @@ require 'wukong/spec_helpers'
6
6
  RSpec.configure do |config|
7
7
 
8
8
  config.before(:each) do
9
+ Wukong::Log.level = Log4r::OFF
9
10
  @orig_reg = Wukong.registry.show
10
11
  end
11
12
 
@@ -6,16 +6,16 @@ module Wukong
6
6
  @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
7
7
  end
8
8
 
9
- def lib_dir
10
- root.join('lib')
9
+ def lib_dir *args
10
+ root.join('lib', *args)
11
11
  end
12
12
 
13
- def bin_dir
14
- root.join('bin')
13
+ def bin_dir *args
14
+ root.join('bin', *args)
15
15
  end
16
16
 
17
- def examples_dir
18
- root.join('examples')
17
+ def examples_dir *args
18
+ root.join('examples', *args)
19
19
  end
20
20
 
21
21
  def integration_env
@@ -30,7 +30,7 @@ module Wukong
30
30
  end
31
31
 
32
32
  def example_script *args
33
- examples_dir.join(*args)
33
+ examples_dir(*args)
34
34
  end
35
35
 
36
36
  end
@@ -4,8 +4,9 @@ describe Wukong::Hadoop::HadoopInvocation do
4
4
 
5
5
  let(:map_only) { driver('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
6
6
  let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
7
- let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
8
- let(:custum_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
7
+ let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye'], :reduce_tasks => 20) }
8
+ let(:custom_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
9
+ let(:many_files) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', files: %w[/file/1 /file/2], archives: %w[/archive/1 /archive/2], jars: %w[/jar/1 /jar/2])}
9
10
 
10
11
  context "defining input paths" do
11
12
  it "raises an error unless given an --input option" do
@@ -15,7 +16,7 @@ describe Wukong::Hadoop::HadoopInvocation do
15
16
  map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
16
17
  end
17
18
  it "sets its input format given the --input_format option" do
18
- custum_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
19
+ custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
19
20
  end
20
21
  end
21
22
 
@@ -27,7 +28,7 @@ describe Wukong::Hadoop::HadoopInvocation do
27
28
  map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
28
29
  end
29
30
  it "sets its output format given the --output_format option" do
30
- custum_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
31
+ custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
31
32
  end
32
33
  end
33
34
 
@@ -43,6 +44,18 @@ describe Wukong::Hadoop::HadoopInvocation do
43
44
  end
44
45
  end
45
46
 
47
+ context "setting the number of reduce tasks" do
48
+ it "does nothing on a map/reduce job" do
49
+ map_reduce.hadoop_commandline.should_not match(%r{-D mapred.reduce.tasks})
50
+ end
51
+ it "respects the option when given" do
52
+ complex.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=20})
53
+ end
54
+ it "sets reduce tasks to 0 for a map-only job" do
55
+ map_only.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=0})
56
+ end
57
+ end
58
+
46
59
  context "defining Hadoop JobConf options" do
47
60
  it "translates friendly names into native ones" do
48
61
  complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
@@ -54,7 +67,7 @@ describe Wukong::Hadoop::HadoopInvocation do
54
67
  end
55
68
 
56
69
  context "removing existing output paths" do
57
- before { Log.stub!(:info) }
70
+
58
71
  it "will not remove the output path by default" do
59
72
  map_reduce.should_not_receive(:remove_output_path!)
60
73
  map_reduce.should_receive(:execute_command!)
@@ -72,7 +85,24 @@ describe Wukong::Hadoop::HadoopInvocation do
72
85
  d.should_receive(:execute_command!)
73
86
  d.run!
74
87
  end
75
-
88
+ end
89
+
90
+ context "handle files, jars, and archives" do
91
+ it "does not include any files, jars, or archives when no files were passed" do
92
+ map_reduce.hadoop_commandline.should_not match(%r{-(files|archives|libjars)})
93
+ end
94
+ it "should include files when asked" do
95
+ many_files.hadoop_commandline.should match(%r{-files\s+'/file/1,/file/2'})
96
+ end
97
+ it "should include jars when asked" do
98
+ many_files.hadoop_commandline.should match(%r{-libjars\s+'/jar/1,/jar/2'})
99
+ end
100
+ it "should include archives when asked" do
101
+ many_files.hadoop_commandline.should match(%r{-archives\s+'/archive/1,/archive/2'})
102
+ end
103
+ it "should include files when passed files as arguments" do
104
+ driver(examples_dir('tokenizer.rb'), examples_dir('counter.rb'), input: '/tmp/input1,/tmp/input2', output: '/tmp/output').hadoop_commandline.should match(%r{-files.+tokenizer\.rb,.*counter\.rb})
105
+ end
76
106
  end
77
107
 
78
108
  end
@@ -25,9 +25,5 @@ EOF
25
25
  gem.test_files = gem.files.grep(/^spec/)
26
26
  gem.require_paths = ['lib']
27
27
 
28
- gem.add_dependency('wukong', '3.0.0.pre2')
29
-
30
- gem.add_development_dependency 'rake', '~> 0.9'
31
- gem.add_development_dependency 'rspec', '~> 2'
32
-
28
+ gem.add_dependency('wukong', '3.0.0.pre3')
33
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong-hadoop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2012-12-01 00:00:00.000000000 Z
14
+ date: 2012-12-17 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: wukong
@@ -20,7 +20,7 @@ dependencies:
20
20
  requirements:
21
21
  - - '='
22
22
  - !ruby/object:Gem::Version
23
- version: 3.0.0.pre2
23
+ version: 3.0.0.pre3
24
24
  type: :runtime
25
25
  prerelease: false
26
26
  version_requirements: !ruby/object:Gem::Requirement
@@ -28,39 +28,7 @@ dependencies:
28
28
  requirements:
29
29
  - - '='
30
30
  - !ruby/object:Gem::Version
31
- version: 3.0.0.pre2
32
- - !ruby/object:Gem::Dependency
33
- name: rake
34
- requirement: !ruby/object:Gem::Requirement
35
- none: false
36
- requirements:
37
- - - ~>
38
- - !ruby/object:Gem::Version
39
- version: '0.9'
40
- type: :development
41
- prerelease: false
42
- version_requirements: !ruby/object:Gem::Requirement
43
- none: false
44
- requirements:
45
- - - ~>
46
- - !ruby/object:Gem::Version
47
- version: '0.9'
48
- - !ruby/object:Gem::Dependency
49
- name: rspec
50
- requirement: !ruby/object:Gem::Requirement
51
- none: false
52
- requirements:
53
- - - ~>
54
- - !ruby/object:Gem::Version
55
- version: '2'
56
- type: :development
57
- prerelease: false
58
- version_requirements: !ruby/object:Gem::Requirement
59
- none: false
60
- requirements:
61
- - - ~>
62
- - !ruby/object:Gem::Version
63
- version: '2'
31
+ version: 3.0.0.pre3
64
32
  description: ! " Treat your dataset like a:\n\n * stream of lines when it's
65
33
  efficient to process by lines\n * stream of field arrays when it's efficient
66
34
  to deal directly with fields\n * stream of lightweight objects when it's efficient