wukong-hadoop 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source :rubygems
1
+ source 'https://rubygems.org'
2
2
 
3
3
  gemspec
4
4
 
@@ -21,8 +21,8 @@ module Wukong
21
21
  return unless program_name == 'wu-hadoop'
22
22
 
23
23
  # Hadoop Options
24
- settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
25
24
  settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
25
+ settings.define :hadoop_streaming_jar, wukong_hadoop: true, description: 'Path to hadoop streaming jar. Use this for non-standard hadoop installations.'
26
26
 
27
27
  # Translate simplified args to their hairy hadoop equivalents
28
28
  settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
@@ -111,7 +111,7 @@ EOF
111
111
  log.info "Launching local!"
112
112
  execute_command!(local_commandline)
113
113
  else
114
- remove_output_path! if settings[:rm] || settings[:overwrite]
114
+ remove_output_path if settings[:rm] || settings[:overwrite]
115
115
  hadoop_commandline
116
116
  log.info "Launching Hadoop!"
117
117
  execute_command!(hadoop_commandline)
@@ -187,23 +187,6 @@ EOF
187
187
  "--#{param}=#{Shellwords.escape(val.to_s)}"
188
188
  end.join(" ")
189
189
  end
190
-
191
- # Execute a command composed of the given parts.
192
- #
193
- # Will print the command instead of the <tt>--dry_run</tt>
194
- # option was given.
195
- #
196
- # @param [Array<String>] argv
197
- def execute_command!(*argv)
198
- command = argv.flatten.reject(&:blank?).join(" \\\n ")
199
- if settings[:dry_run]
200
- log.info("Dry run:")
201
- puts command
202
- else
203
- puts `#{command}`
204
- raise Error.new("Command failed!") unless $?.success?
205
- end
206
- end
207
190
 
208
191
  end
209
192
  end
@@ -10,10 +10,8 @@ module Wukong
10
10
  #
11
11
  # Will not actually do anything if the <tt>--dry_run</tt> option
12
12
  # is also given.
13
- def remove_output_path!
14
- cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
15
- log.info "Removing output file #{output_path}: #{cmd}"
16
- puts `#{cmd}` unless settings[:dry_run]
13
+ def remove_output_path
14
+ execute_command("#{hadoop_runner} fs -rmr '#{output_path}'")
17
15
  end
18
16
 
19
17
  # Return the Hadoop command used to launch this job in a Hadoop
@@ -26,11 +24,11 @@ module Wukong
26
24
  def hadoop_commandline
27
25
  [
28
26
  hadoop_runner,
29
- "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
27
+ "jar #{hadoop_streaming_jar}",
30
28
  hadoop_jobconf_options,
31
29
  "-D mapred.job.name='#{job_name}'",
32
- hadoop_other_args,
33
30
  hadoop_files,
31
+ hadoop_other_args,
34
32
  "-mapper '#{mapper_commandline}'",
35
33
  "-reducer '#{reducer_commandline}'",
36
34
  "-input '#{input_paths}'",
@@ -84,7 +82,17 @@ module Wukong
84
82
  #
85
83
  # @return [String]
86
84
  def hadoop_runner
87
- settings[:hadoop_runner] || File.join(settings[:hadoop_home], 'bin/hadoop')
85
+ settings[:hadoop_runner] || 'hadoop'
86
+ end
87
+
88
+ # The path (glob) to the Hadoop streaming jar.
89
+ #
90
+ # Respects the value of <tt>--hadoop_streaming_jar</tt> if
91
+ # given. Otherwise uses the default CDH4 location.
92
+ #
93
+ # @return [String]
94
+ def hadoop_streaming_jar
95
+ settings[:hadoop_streaming_jar] || '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh*.jar'
88
96
  end
89
97
 
90
98
  # Return an array of jobconf (-D) options that will be passed to Hadoop.
@@ -99,10 +107,6 @@ module Wukong
99
107
  settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
100
108
  # If no reducer and no reduce_command, then skip the reduce phase
101
109
  settings[:reduce_tasks] ||= 0 unless reduce?
102
- # Fields hadoop should use to distribute records to reducers
103
- unless settings[:partition_fields].blank?
104
- jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
105
- end
106
110
  jobconf_options += [
107
111
  :io_sort_mb, :io_sort_record_percent,
108
112
  :map_speculative, :map_tasks,
@@ -1,5 +1,5 @@
1
1
  module Wukong
2
2
  module Hadoop
3
- VERSION = '0.1.1'
3
+ VERSION = '0.2.0'
4
4
  end
5
5
  end
@@ -21,7 +21,7 @@ RSpec.configure do |config|
21
21
 
22
22
  def hadoop_runner *args, &block
23
23
  runner(Wukong::Hadoop::HadoopRunner, 'wu-hadoop', *args) do
24
- stub!(:execute_command!)
24
+ stub(:execute_command)
25
25
  instance_eval(&block) if block_given?
26
26
  end
27
27
  end
@@ -69,13 +69,13 @@ describe Wukong::Hadoop::HadoopInvocation do
69
69
  context "removing existing output paths" do
70
70
 
71
71
  it "will not remove the output path by default" do
72
- hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path!) }
72
+ hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path) }
73
73
  end
74
74
  it "will remove the output path when given the --rm option" do
75
- hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path!) }
75
+ hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path) }
76
76
  end
77
77
  it "will not remove the output path when given the --rm option AND the --dry_run option" do
78
- hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path!) }
78
+ hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path) }
79
79
  end
80
80
  end
81
81
 
@@ -4,7 +4,7 @@ describe Wukong::Hadoop::HadoopRunner do
4
4
 
5
5
  context "handling errors" do
6
6
  it "raises an error when it can't find a file" do
7
- expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /no such file/)
7
+ expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /cannot load such file/)
8
8
  end
9
9
 
10
10
  it "raises an error in Hadoop mode when called without input and output paths" do
@@ -25,5 +25,5 @@ EOF
25
25
  gem.test_files = gem.files.grep(/^spec/)
26
26
  gem.require_paths = ['lib']
27
27
 
28
- gem.add_dependency('wukong', '3.0.1')
28
+ gem.add_dependency('wukong', '4.0.0')
29
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong-hadoop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-03-07 00:00:00.000000000 Z
14
+ date: 2014-03-19 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: wukong
@@ -20,7 +20,7 @@ dependencies:
20
20
  requirements:
21
21
  - - '='
22
22
  - !ruby/object:Gem::Version
23
- version: 3.0.1
23
+ version: 4.0.0
24
24
  type: :runtime
25
25
  prerelease: false
26
26
  version_requirements: !ruby/object:Gem::Requirement
@@ -28,7 +28,7 @@ dependencies:
28
28
  requirements:
29
29
  - - '='
30
30
  - !ruby/object:Gem::Version
31
- version: 3.0.1
31
+ version: 4.0.0
32
32
  description: ! " Treat your dataset like a:\n\n * stream of lines when it's
33
33
  efficient to process by lines\n * stream of field arrays when it's efficient
34
34
  to deal directly with fields\n * stream of lightweight objects when it's efficient
@@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
113
113
  version: '0'
114
114
  segments:
115
115
  - 0
116
- hash: -3850081407008684305
116
+ hash: -3970581689600624425
117
117
  required_rubygems_version: !ruby/object:Gem::Requirement
118
118
  none: false
119
119
  requirements:
@@ -122,10 +122,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
122
122
  version: '0'
123
123
  segments:
124
124
  - 0
125
- hash: -3850081407008684305
125
+ hash: -3970581689600624425
126
126
  requirements: []
127
127
  rubyforge_project:
128
- rubygems_version: 1.8.24
128
+ rubygems_version: 1.8.23
129
129
  signing_key:
130
130
  specification_version: 3
131
131
  summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use