RubyGems - wukong-hadoop - Versions diffs - 0.1.1 → 0.2.0 - Mend

wukong-hadoop 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

data/Gemfile +1 -1
data/lib/wukong-hadoop.rb +1 -1
data/lib/wukong-hadoop/runner.rb +1 -18
data/lib/wukong-hadoop/runner/hadoop_invocation.rb +15 -11
data/lib/wukong-hadoop/version.rb +1 -1
data/spec/spec_helper.rb +1 -1
data/spec/wukong-hadoop/hadoop_mode_spec.rb +3 -3
data/spec/wukong-hadoop/runner_spec.rb +1 -1
data/wukong-hadoop.gemspec +1 -1
metadata +7 -7

data/Gemfile CHANGED

@@ -1,4 +1,4 @@
-source :rubygems
+source 'https://rubygems.org'
 gemspec

data/lib/wukong-hadoop.rb CHANGED

@@ -21,8 +21,8 @@ module Wukong
       return unless program_name == 'wu-hadoop'
       # Hadoop Options
-      settings.define :hadoop_home,             wukong_hadoop: true,                description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
       settings.define :hadoop_runner,           wukong_hadoop: true,                description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
+      settings.define :hadoop_streaming_jar,    wukong_hadoop: true,                description: 'Path to hadoop streaming jar.  Use this for non-standard hadoop installations.'
       # Translate simplified args to their hairy hadoop equivalents
       settings.define :io_sort_mb,              wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'

data/lib/wukong-hadoop/runner.rb CHANGED

@@ -111,7 +111,7 @@ EOF
           log.info "Launching local!"
           execute_command!(local_commandline)
         else
-          remove_output_path! if settings[:rm] || settings[:overwrite]
+          remove_output_path if settings[:rm] || settings[:overwrite]
           hadoop_commandline
           log.info "Launching Hadoop!"
           execute_command!(hadoop_commandline)
@@ -187,23 +187,6 @@ EOF
           "--#{param}=#{Shellwords.escape(val.to_s)}"
         end.join(" ")
       end
-      # Execute a command composed of the given parts.
-      #
-      # Will print the command instead of the <tt>--dry_run</tt>
-      # option was given.
-      #
-      # @param [Array<String>] argv
-      def execute_command!(*argv)
-        command = argv.flatten.reject(&:blank?).join(" \\\n    ")
-        if settings[:dry_run]
-          log.info("Dry run:")
-          puts command
-        else
-          puts `#{command}`
-          raise Error.new("Command failed!") unless $?.success?
-        end
-      end
     end
   end

data/lib/wukong-hadoop/runner/hadoop_invocation.rb CHANGED

@@ -10,10 +10,8 @@ module Wukong
       #
       # Will not actually do anything if the <tt>--dry_run</tt> option
       # is also given.
-      def remove_output_path!
-        cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
-        log.info "Removing output file #{output_path}: #{cmd}"
-        puts `#{cmd}` unless settings[:dry_run]
+      def remove_output_path
+        execute_command("#{hadoop_runner} fs -rmr '#{output_path}'")
       end
       # Return the Hadoop command used to launch this job in a Hadoop
@@ -26,11 +24,11 @@ module Wukong
       def hadoop_commandline
         [
          hadoop_runner,
-         "jar #{settings[:hadoop_home]}/contrib/streaming/hadoop-*streaming*.jar",
+         "jar #{hadoop_streaming_jar}",
          hadoop_jobconf_options,
          "-D mapred.job.name='#{job_name}'",
-         hadoop_other_args,
          hadoop_files,
+         hadoop_other_args,
          "-mapper       '#{mapper_commandline}'",
          "-reducer      '#{reducer_commandline}'",
          "-input        '#{input_paths}'",
@@ -84,7 +82,17 @@ module Wukong
       #
       # @return [String]
       def hadoop_runner
-        settings[:hadoop_runner] || File.join(settings[:hadoop_home], 'bin/hadoop')
+        settings[:hadoop_runner] || 'hadoop'
+      end
+      # The path (glob) to the Hadoop streaming jar.
+      #
+      # Respects the value of <tt>--hadoop_streaming_jar</tt> if
+      # given.  Otherwise uses the default CDH4 location.
+      #
+      # @return [String]
+      def hadoop_streaming_jar
+        settings[:hadoop_streaming_jar] || '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh*.jar'
       end
       # Return an array of jobconf (-D) options that will be passed to Hadoop.
@@ -99,10 +107,6 @@ module Wukong
         settings[:respect_exit_status] = 'false' if     (settings[:ignore_exit_status] == true)
         # If no reducer and no reduce_command, then skip the reduce phase
         settings[:reduce_tasks]      ||= 0       unless reduce?
-        # Fields hadoop should use to distribute records to reducers
-        unless settings[:partition_fields].blank?
-          jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
-        end
         jobconf_options += [
                             :io_sort_mb,               :io_sort_record_percent,
                             :map_speculative,          :map_tasks,

data/lib/wukong-hadoop/version.rb CHANGED

@@ -1,5 +1,5 @@
 module Wukong
   module Hadoop
-    VERSION = '0.1.1'
+    VERSION = '0.2.0'
   end
 end

data/spec/spec_helper.rb CHANGED

@@ -21,7 +21,7 @@ RSpec.configure do |config|
   def hadoop_runner *args, &block
     runner(Wukong::Hadoop::HadoopRunner, 'wu-hadoop', *args) do
-      stub!(:execute_command!)
+      stub(:execute_command)
       instance_eval(&block) if block_given?
     end
   end

data/spec/wukong-hadoop/hadoop_mode_spec.rb CHANGED

@@ -69,13 +69,13 @@ describe Wukong::Hadoop::HadoopInvocation do
   context "removing existing output paths" do
     it "will not remove the output path by default" do
-      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path!) }
+      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path) }
     end
     it "will remove the output path when given the --rm option" do
-      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path!) }
+      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path) }
     end
     it "will not remove the output path when given the --rm option AND the --dry_run option" do
-      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path!) }
+      hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path) }
     end
   end

data/spec/wukong-hadoop/runner_spec.rb CHANGED

@@ -4,7 +4,7 @@ describe Wukong::Hadoop::HadoopRunner do
   context "handling errors" do
     it "raises an error when it can't find a file" do
-      expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /no such file/)
+      expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /cannot load such file/)
     end
     it "raises an error in Hadoop mode when called without input and output paths" do

data/wukong-hadoop.gemspec CHANGED

@@ -25,5 +25,5 @@ EOF
   gem.test_files    = gem.files.grep(/^spec/)
   gem.require_paths = ['lib']
-  gem.add_dependency('wukong', '3.0.1')
+  gem.add_dependency('wukong', '4.0.0')
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wukong-hadoop
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
   prerelease:
 platform: ruby
 authors:
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-03-07 00:00:00.000000000 Z
+date: 2014-03-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: wukong
@@ -20,7 +20,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.1
+        version: 4.0.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -28,7 +28,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.1
+        version: 4.0.0
 description: ! "  Treat your dataset like a:\n\n      * stream of lines when it's
   efficient to process by lines\n      * stream of field arrays when it's efficient
   to deal directly with fields\n      * stream of lightweight objects when it's efficient
@@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -3850081407008684305
+      hash: -3970581689600624425
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -122,10 +122,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
       segments:
       - 0
-      hash: -3850081407008684305
+      hash: -3970581689600624425
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.24
+rubygems_version: 1.8.23
 signing_key:
 specification_version: 3
 summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use