wukong-hadoop 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,4 +1,4 @@
1
- source :rubygems
1
+ source 'https://rubygems.org'
2
2
 
3
3
  gemspec
4
4
 
@@ -21,8 +21,8 @@ module Wukong
21
21
  return unless program_name == 'wu-hadoop'
22
22
 
23
23
  # Hadoop Options
24
- settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
25
24
  settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
25
+ settings.define :hadoop_streaming_jar, wukong_hadoop: true, description: 'Path to hadoop streaming jar. Use this for non-standard hadoop installations.'
26
26
 
27
27
  # Translate simplified args to their hairy hadoop equivalents
28
28
  settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
@@ -111,7 +111,7 @@ EOF
111
111
  log.info "Launching local!"
112
112
  execute_command!(local_commandline)
113
113
  else
114
- remove_output_path! if settings[:rm] || settings[:overwrite]
114
+ remove_output_path if settings[:rm] || settings[:overwrite]
115
115
  hadoop_commandline
116
116
  log.info "Launching Hadoop!"
117
117
  execute_command!(hadoop_commandline)
@@ -187,23 +187,6 @@ EOF
187
187
  "--#{param}=#{Shellwords.escape(val.to_s)}"
188
188
  end.join(" ")
189
189
  end
190
-
191
- # Execute a command composed of the given parts.
192
- #
193
- # Will print the command instead if the <tt>--dry_run</tt>
194
- # option was given.
195
- #
196
- # @param [Array<String>] argv
197
- def execute_command!(*argv)
198
- command = argv.flatten.reject(&:blank?).join(" \\\n ")
199
- if settings[:dry_run]
200
- log.info("Dry run:")
201
- puts command
202
- else
203
- puts `#{command}`
204
- raise Error.new("Command failed!") unless $?.success?
205
- end
206
- end
207
190
 
208
191
  end
209
192
  end
@@ -10,10 +10,8 @@ module Wukong
10
10
  #
11
11
  # Will not actually do anything if the <tt>--dry_run</tt> option
12
12
  # is also given.
13
- def remove_output_path!
14
- cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
15
- log.info "Removing output file #{output_path}: #{cmd}"
16
- puts `#{cmd}` unless settings[:dry_run]
13
+ def remove_output_path
14
+ execute_command("#{hadoop_runner} fs -rmr '#{output_path}'")
17
15
  end
18
16
 
19
17
  # Return the Hadoop command used to launch this job in a Hadoop
@@ -26,11 +24,11 @@ module Wukong
26
24
  def hadoop_commandline
27
25
  [
28
26
  hadoop_runner,
29
- "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
27
+ "jar #{hadoop_streaming_jar}",
30
28
  hadoop_jobconf_options,
31
29
  "-D mapred.job.name='#{job_name}'",
32
- hadoop_other_args,
33
30
  hadoop_files,
31
+ hadoop_other_args,
34
32
  "-mapper '#{mapper_commandline}'",
35
33
  "-reducer '#{reducer_commandline}'",
36
34
  "-input '#{input_paths}'",
@@ -84,7 +82,17 @@ module Wukong
84
82
  #
85
83
  # @return [String]
86
84
  def hadoop_runner
87
- settings[:hadoop_runner] || File.join(settings[:hadoop_home], 'bin/hadoop')
85
+ settings[:hadoop_runner] || 'hadoop'
86
+ end
87
+
88
+ # The path (glob) to the Hadoop streaming jar.
89
+ #
90
+ # Respects the value of <tt>--hadoop_streaming_jar</tt> if
91
+ # given. Otherwise uses the default CDH4 location.
92
+ #
93
+ # @return [String]
94
+ def hadoop_streaming_jar
95
+ settings[:hadoop_streaming_jar] || '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh*.jar'
88
96
  end
89
97
 
90
98
  # Return an array of jobconf (-D) options that will be passed to Hadoop.
@@ -99,10 +107,6 @@ module Wukong
99
107
  settings[:respect_exit_status] = 'false' if (settings[:ignore_exit_status] == true)
100
108
  # If no reducer and no reduce_command, then skip the reduce phase
101
109
  settings[:reduce_tasks] ||= 0 unless reduce?
102
- # Fields hadoop should use to distribute records to reducers
103
- unless settings[:partition_fields].blank?
104
- jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
105
- end
106
110
  jobconf_options += [
107
111
  :io_sort_mb, :io_sort_record_percent,
108
112
  :map_speculative, :map_tasks,
@@ -1,5 +1,5 @@
1
1
  module Wukong
2
2
  module Hadoop
3
- VERSION = '0.1.1'
3
+ VERSION = '0.2.0'
4
4
  end
5
5
  end
@@ -21,7 +21,7 @@ RSpec.configure do |config|
21
21
 
22
22
  def hadoop_runner *args, &block
23
23
  runner(Wukong::Hadoop::HadoopRunner, 'wu-hadoop', *args) do
24
- stub!(:execute_command!)
24
+ stub(:execute_command)
25
25
  instance_eval(&block) if block_given?
26
26
  end
27
27
  end
@@ -69,13 +69,13 @@ describe Wukong::Hadoop::HadoopInvocation do
69
69
  context "removing existing output paths" do
70
70
 
71
71
  it "will not remove the output path by default" do
72
- hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path!) }
72
+ hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path) }
73
73
  end
74
74
  it "will remove the output path when given the --rm option" do
75
- hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path!) }
75
+ hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path) }
76
76
  end
77
77
  it "will not remove the output path when given the --rm option AND the --dry_run option" do
78
- hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path!) }
78
+ hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path) }
79
79
  end
80
80
  end
81
81
 
@@ -4,7 +4,7 @@ describe Wukong::Hadoop::HadoopRunner do
4
4
 
5
5
  context "handling errors" do
6
6
  it "raises an error when it can't find a file" do
7
- expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /no such file/)
7
+ expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /cannot load such file/)
8
8
  end
9
9
 
10
10
  it "raises an error in Hadoop mode when called without input and output paths" do
@@ -25,5 +25,5 @@ EOF
25
25
  gem.test_files = gem.files.grep(/^spec/)
26
26
  gem.require_paths = ['lib']
27
27
 
28
- gem.add_dependency('wukong', '3.0.1')
28
+ gem.add_dependency('wukong', '4.0.0')
29
29
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong-hadoop
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -11,7 +11,7 @@ authors:
11
11
  autorequire:
12
12
  bindir: bin
13
13
  cert_chain: []
14
- date: 2013-03-07 00:00:00.000000000 Z
14
+ date: 2014-03-19 00:00:00.000000000 Z
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: wukong
@@ -20,7 +20,7 @@ dependencies:
20
20
  requirements:
21
21
  - - '='
22
22
  - !ruby/object:Gem::Version
23
- version: 3.0.1
23
+ version: 4.0.0
24
24
  type: :runtime
25
25
  prerelease: false
26
26
  version_requirements: !ruby/object:Gem::Requirement
@@ -28,7 +28,7 @@ dependencies:
28
28
  requirements:
29
29
  - - '='
30
30
  - !ruby/object:Gem::Version
31
- version: 3.0.1
31
+ version: 4.0.0
32
32
  description: ! " Treat your dataset like a:\n\n * stream of lines when it's
33
33
  efficient to process by lines\n * stream of field arrays when it's efficient
34
34
  to deal directly with fields\n * stream of lightweight objects when it's efficient
@@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
113
113
  version: '0'
114
114
  segments:
115
115
  - 0
116
- hash: -3850081407008684305
116
+ hash: -3970581689600624425
117
117
  required_rubygems_version: !ruby/object:Gem::Requirement
118
118
  none: false
119
119
  requirements:
@@ -122,10 +122,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
122
122
  version: '0'
123
123
  segments:
124
124
  - 0
125
- hash: -3850081407008684305
125
+ hash: -3970581689600624425
126
126
  requirements: []
127
127
  rubyforge_project:
128
- rubygems_version: 1.8.24
128
+ rubygems_version: 1.8.23
129
129
  signing_key:
130
130
  specification_version: 3
131
131
  summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use