wukong-hadoop 0.0.1 → 0.0.2

data/Gemfile CHANGED
@@ -1,3 +1,8 @@
 source :rubygems
 
 gemspec
+
+group :development do
+  gem 'rake', '~> 0.9'
+  gem 'rspec', '~> 2'
+end
@@ -16,6 +16,7 @@ module Wukong
     settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
     settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
     settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
+    settings.define :reduce_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
     settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
     settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
     settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
@@ -39,7 +40,10 @@ module Wukong
     settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
     settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
     settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
-    settings.define :java_opts, wukong_hadoop: true, description: 'Additional java options to be passed to hadoop streaming.', :type => Array, :default => []
+    settings.define :java_opts, wukong_hadoop: true, description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
+    settings.define :files, wukong_hadoop: true, description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
+    settings.define :jars, wukong_hadoop: true, description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
+    settings.define :archives, wukong_hadoop: true, description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []
 
     # Options given on the command-line
     settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
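Settings flagged jobconf: true pair a friendly option name with the native Hadoop property named in description; at invocation time, every such setting that has a value becomes a -D property=value pair on the streaming command line (the spec further down checks for -D mapred.reduce.tasks=20, for instance). A minimal sketch of that mapping, using a plain hash and a hypothetical jobconf_options helper rather than the gem's own code:

    # Friendly setting name => native Hadoop property, as in the definitions above.
    JOBCONF = {
      :map_speculative    => 'mapred.map.tasks.speculative.execution',
      :reduce_speculative => 'mapred.reduce.tasks.speculative.execution',
      :map_tasks          => 'mapred.map.tasks',
      :reduce_tasks       => 'mapred.reduce.tasks',
    }

    # Emit a -D option for every setting that actually has a value.
    def jobconf_options(settings)
      JOBCONF.map do |name, property|
        "-D #{property}=#{settings[name]}" unless settings[name].nil?
      end.compact
    end

    puts jobconf_options(:reduce_speculative => false, :map_tasks => 100)
    # => -D mapred.reduce.tasks.speculative.execution=false
    #    -D mapred.map.tasks=100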
@@ -24,6 +24,7 @@ module Wukong
     include ReduceLogic
     include HadoopInvocation
     include LocalInvocation
+    include Logging
 
     # The settings used by this driver.
     #
@@ -55,12 +56,12 @@ module Wukong
     # Run this driver.
     def run!
       if mode == :local
-        # Log.info "Launching local!"
+        # log.info "Launching local!"
         execute_command!(local_commandline)
       else
         ensure_input_and_output!
         remove_output_path! if settings[:rm] || settings[:overwrite]
-        Log.info "Launching Hadoop!"
+        log.info "Launching Hadoop!"
         execute_command!(hadoop_commandline)
       end
     end
@@ -177,7 +178,7 @@ module Wukong
     def execute_command!(*args)
       command = args.flatten.reject(&:blank?).join(" \\\n ")
       if settings[:dry_run]
-        Log.info("Dry run:")
+        log.info("Dry run:")
         puts command
       else
         puts `#{command}`
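Throughout the driver, module-level Log.info calls give way to the instance-level log helper provided by the newly included Logging mixin. A minimal sketch of the pattern with a stand-in Logging module (Wukong's real logging is backed by Log4r, as the spec helper hunk further down shows; the names below are illustrative):

    require 'logger'

    module Logging
      # Stand-in mixin: every including object gets a memoized logger via #log.
      def log
        @log ||= Logger.new($stderr)
      end
    end

    class Driver
      include Logging

      def run!(dry_run = false)
        if dry_run
          log.info 'Dry run:'
        else
          log.info 'Launching Hadoop!'
        end
      end
    end

    Driver.new.run!         # logs "Launching Hadoop!" to stderr
    Driver.new.run!(true)   # logs "Dry run:"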
@@ -16,8 +16,8 @@ module Wukong
     # Will not actually do anything if the <tt>--dry_run</tt> option
     # is also given.
     def remove_output_path!
-      cmd = %Q{#{settings[:hadoop_runner]} fs -rmr '#{output_path}'}
-      Log.info "Removing output file #{output_path}: #{cmd}"
+      cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
+      log.info "Removing output file #{output_path}: #{cmd}"
       puts `#{cmd}` unless settings[:dry_run]
     end
 
@@ -35,11 +35,11 @@ module Wukong
         hadoop_jobconf_options,
         "-D mapred.job.name='#{job_name}'",
         hadoop_other_args,
+        hadoop_files,
         "-mapper '#{mapper_commandline}'",
         "-reducer '#{reducer_commandline}'",
         "-input '#{input_paths}'",
         "-output '#{output_path}'",
-        hadoop_files,
         io_formats,
         hadoop_recycle_env,
       ].flatten.compact.join(" \t\\\n ")
@@ -103,7 +103,7 @@ module Wukong
       settings[:reuse_jvms] = '-1' if (settings[:reuse_jvms] == true)
       settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
       # If no reducer and no reduce_command, then skip the reduce phase
-      settings[:reduce_tasks] = 0 unless (reduce? || settings[:reduce_tasks].nil?)
+      settings[:reduce_tasks] ||= 0 unless reduce?
       # Fields hadoop should use to distribute records to reducers
       unless settings[:partition_fields].blank?
         jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
@@ -146,8 +146,24 @@ module Wukong
     #
     # http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
     def hadoop_files
-      args.find_all { |arg| arg.to_s =~ /\.rb$/ }.map do |arg|
-        "-file '#{arg}'"
+      args.find_all { |arg| arg.to_s =~ /\.rb$/ }.each do |arg|
+        settings[:files] << arg
+      end
+
+      [].tap do |files_options|
+        {
+          :files    => '-files ',
+          :jars     => '-libjars ',
+          :archives => '-archives '
+        }.each_pair do |file_type_name, file_option_name|
+          unless settings[file_type_name].nil? || settings[file_type_name].empty?
+            files = settings[file_type_name].map do |file_name_or_glob|
+              # Don't glob on the HDFS
+              file_type_name == :archives ? file_name_or_glob : [Dir[file_name_or_glob], file_name_or_glob]
+            end.flatten.compact.uniq.join(',')
+            files_options << "#{file_option_name}'#{files}'"
+          end
+        end
       end
     end
 
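The rewritten hadoop_files first folds any .rb arguments into settings[:files], then emits at most one streaming flag per non-empty collection, expanding local globs while passing archive names through untouched since they may already live on the HDFS. A standalone sketch of the same expansion step, taking a plain hash in place of the driver's settings (illustrative only; the expanded list depends on what the globs match on disk):

    # Build -files / -libjars / -archives fragments from Array-valued settings.
    def file_options(settings)
      { :files => '-files ', :jars => '-libjars ', :archives => '-archives ' }.map do |key, flag|
        next if settings[key].nil? || settings[key].empty?
        listed = settings[key].map do |name_or_glob|
          # Archives are passed through as-is; everything else is also globbed locally.
          key == :archives ? name_or_glob : [Dir[name_or_glob], name_or_glob]
        end.flatten.compact.uniq.join(',')
        "#{flag}'#{listed}'"
      end.compact
    end

    puts file_options(:files => ['lib/*.rb'], :jars => ['build/custom.jar'], :archives => [])
    # e.g. => -files 'lib/a.rb,lib/b.rb,lib/*.rb'
    #         -libjars 'build/custom.jar'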
@@ -1,6 +1,6 @@
 module Wukong
   module Hadoop
     # The current version of Wukong-Hadoop.
-    VERSION = '0.0.1'
+    VERSION = '0.0.2'
   end
 end
@@ -6,6 +6,7 @@ require 'wukong/spec_helpers'
 RSpec.configure do |config|
 
   config.before(:each) do
+    Wukong::Log.level = Log4r::OFF
     @orig_reg = Wukong.registry.show
   end
 
@@ -6,16 +6,16 @@ module Wukong
       @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
     end
 
-    def lib_dir
-      root.join('lib')
+    def lib_dir *args
+      root.join('lib', *args)
     end
 
-    def bin_dir
-      root.join('bin')
+    def bin_dir *args
+      root.join('bin', *args)
     end
 
-    def examples_dir
-      root.join('examples')
+    def examples_dir *args
+      root.join('examples', *args)
     end
 
     def integration_env
@@ -30,7 +30,7 @@ module Wukong
     end
 
     def example_script *args
-      examples_dir.join(*args)
+      examples_dir(*args)
     end
 
   end
@@ -4,8 +4,9 @@ describe Wukong::Hadoop::HadoopInvocation do
 
   let(:map_only) { driver('regexp', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
   let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
-  let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
-  let(:custum_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
+  let(:complex) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye'], :reduce_tasks => 20) }
+  let(:custom_io) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
+  let(:many_files) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', files: %w[/file/1 /file/2], archives: %w[/archive/1 /archive/2], jars: %w[/jar/1 /jar/2]) }
 
   context "defining input paths" do
     it "raises an error unless given an --input option" do
@@ -15,7 +16,7 @@ describe Wukong::Hadoop::HadoopInvocation do
       map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
     end
     it "sets its input format given the --input_format option" do
-      custum_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
+      custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
     end
   end
 
@@ -27,7 +28,7 @@ describe Wukong::Hadoop::HadoopInvocation do
       map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
     end
     it "sets its output format given the --output_format option" do
-      custum_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
+      custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
     end
   end
 
@@ -43,6 +44,18 @@ describe Wukong::Hadoop::HadoopInvocation do
     end
   end
 
+  context "setting the number of reduce tasks" do
+    it "does nothing on a map/reduce job" do
+      map_reduce.hadoop_commandline.should_not match(%r{-D mapred.reduce.tasks})
+    end
+    it "respects the option when given" do
+      complex.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=20})
+    end
+    it "sets reduce tasks to 0 for a map-only job" do
+      map_only.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=0})
+    end
+  end
+
   context "defining Hadoop JobConf options" do
     it "translates friendly names into native ones" do
       complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
@@ -54,7 +67,7 @@ describe Wukong::Hadoop::HadoopInvocation do
   end
 
   context "removing existing output paths" do
-    before { Log.stub!(:info) }
+
     it "will not remove the output path by default" do
       map_reduce.should_not_receive(:remove_output_path!)
       map_reduce.should_receive(:execute_command!)
@@ -72,7 +85,24 @@ describe Wukong::Hadoop::HadoopInvocation do
       d.should_receive(:execute_command!)
      d.run!
     end
-
+  end
+
+  context "handle files, jars, and archives" do
+    it "does not include any files, jars, or archives when no files were passed" do
+      map_reduce.hadoop_commandline.should_not match(%r{-(files|archives|libjars)})
+    end
+    it "should include files when asked" do
+      many_files.hadoop_commandline.should match(%r{-files\s+'/file/1,/file/2'})
+    end
+    it "should include jars when asked" do
+      many_files.hadoop_commandline.should match(%r{-libjars\s+'/jar/1,/jar/2'})
+    end
+    it "should include archives when asked" do
+      many_files.hadoop_commandline.should match(%r{-archives\s+'/archive/1,/archive/2'})
+    end
+    it "should include files when passed files as arguments" do
+      driver(examples_dir('tokenizer.rb'), examples_dir('counter.rb'), input: '/tmp/input1,/tmp/input2', output: '/tmp/output').hadoop_commandline.should match(%r{-files.+tokenizer\.rb,.*counter\.rb})
+    end
   end
 
 end
@@ -25,9 +25,5 @@ EOF
   gem.test_files = gem.files.grep(/^spec/)
   gem.require_paths = ['lib']
 
-  gem.add_dependency('wukong', '3.0.0.pre2')
-
-  gem.add_development_dependency 'rake', '~> 0.9'
-  gem.add_development_dependency 'rspec', '~> 2'
-
+  gem.add_dependency('wukong', '3.0.0.pre3')
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wukong-hadoop
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 prerelease:
 platform: ruby
 authors:
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-01 00:00:00.000000000 Z
+date: 2012-12-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: wukong
@@ -20,7 +20,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre2
+        version: 3.0.0.pre3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -28,39 +28,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre2
-- !ruby/object:Gem::Dependency
-  name: rake
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '0.9'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '0.9'
-- !ruby/object:Gem::Dependency
-  name: rspec
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '2'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '2'
+        version: 3.0.0.pre3
 description: ! " Treat your dataset like a:\n\n  * stream of lines when it's
   efficient to process by lines\n  * stream of field arrays when it's efficient
   to deal directly with fields\n  * stream of lightweight objects when it's efficient