RubyGems - wukong-hadoop - Versions diffs - 0.0.1 → 0.0.2 - Mend

wukong-hadoop 0.0.1 → 0.0.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

data/Gemfile +5 -0
data/lib/wukong-hadoop/configuration.rb +5 -1
data/lib/wukong-hadoop/driver.rb +4 -3
data/lib/wukong-hadoop/driver/hadoop_invocation.rb +22 -6
data/lib/wukong-hadoop/version.rb +1 -1
data/spec/spec_helper.rb +1 -0
data/spec/support/integration_helper.rb +7 -7
data/spec/wukong-hadoop/hadoop_mode_spec.rb +36 -6
data/wukong-hadoop.gemspec +1 -5
metadata +4 -36

data/Gemfile CHANGED

@@ -1,3 +1,8 @@
 source :rubygems
 gemspec
+group :development do
+  gem 'rake',     '~> 0.9'
+  gem 'rspec',    '~> 2'
+end

data/lib/wukong-hadoop/configuration.rb CHANGED

@@ -16,6 +16,7 @@ module Wukong
       settings.define :job_name,                wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
       settings.define :key_field_separator,     wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
       settings.define :map_speculative,         wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
+      settings.define :reduce_speculative,      wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
       settings.define :map_tasks,               wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
       settings.define :max_maps_per_cluster,    wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
       settings.define :max_maps_per_node,       wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
@@ -39,7 +40,10 @@ module Wukong
       settings.define :split_on_xml_tag,        wukong_hadoop: true,                description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
       settings.define :input_format,            wukong_hadoop: true,                description: 'Fully qualified Java class name defining an alternative InputFormat.'
       settings.define :output_format,           wukong_hadoop: true,                description: 'Fully qualified Java class name defining an alternative OutputFormat.'
-      settings.define :java_opts,               wukong_hadoop: true,                description: 'Additional java options to be passed to hadoop streaming.', :type => Array, :default => []
+      settings.define :java_opts,               wukong_hadoop: true,                description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
+      settings.define :files,                   wukong_hadoop: true,                description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
+      settings.define :jars,                    wukong_hadoop: true,                description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
+      settings.define :archives,                wukong_hadoop: true,                description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []
       # Options given on the command-line
       settings.define :mode,           description: "Run in either 'hadoop' or 'local' mode",                                        wukong_hadoop: true, :default => 'hadoop'

data/lib/wukong-hadoop/driver.rb CHANGED

@@ -24,6 +24,7 @@ module Wukong
       include ReduceLogic
       include HadoopInvocation
       include LocalInvocation
+      include Logging
       # The settings used by this driver.
       #
@@ -55,12 +56,12 @@ module Wukong
       # Run this driver.
       def run!
         if mode == :local
-          # Log.info "Launching local!"
+          # log.info "Launching local!"
           execute_command!(local_commandline)
         else
           ensure_input_and_output!
           remove_output_path! if settings[:rm] || settings[:overwrite]
-          Log.info "Launching Hadoop!"
+          log.info "Launching Hadoop!"
           execute_command!(hadoop_commandline)
         end
       end
@@ -177,7 +178,7 @@ module Wukong
       def execute_command!(*args)
         command = args.flatten.reject(&:blank?).join(" \\\n    ")
         if settings[:dry_run]
-          Log.info("Dry run:")
+          log.info("Dry run:")
           puts command
         else
           puts `#{command}`

data/lib/wukong-hadoop/driver/hadoop_invocation.rb CHANGED

@@ -16,8 +16,8 @@ module Wukong
       # Will not actually do anything if the <tt>--dry_run</tt> option
       # is also given.
       def remove_output_path!
-        cmd = %Q{#{settings[:hadoop_runner]} fs -rmr '#{output_path}'}
-        Log.info "Removing output file #{output_path}: #{cmd}"
+        cmd = %Q{#{hadoop_runner} fs -rmr '#{output_path}'}
+        log.info "Removing output file #{output_path}: #{cmd}"
         puts `#{cmd}` unless settings[:dry_run]
       end
@@ -35,11 +35,11 @@ module Wukong
          hadoop_jobconf_options,
          "-D mapred.job.name='#{job_name}'",
          hadoop_other_args,
+         hadoop_files,
          "-mapper       '#{mapper_commandline}'",
          "-reducer      '#{reducer_commandline}'",
          "-input        '#{input_paths}'",
          "-output       '#{output_path}'",
-         hadoop_files,
          io_formats,
          hadoop_recycle_env,
         ].flatten.compact.join(" \t\\\n  ")
@@ -103,7 +103,7 @@ module Wukong
         settings[:reuse_jvms]          = '-1'    if     (settings[:reuse_jvms] == true)
         settings[:respect_exit_status] = 'false' if     (settings[:ignore_exit_status] == true)
         # If no reducer and no reduce_command, then skip the reduce phase
-        settings[:reduce_tasks]        = 0       unless (reduce? || settings[:reduce_tasks].nil?)
+        settings[:reduce_tasks]      ||= 0       unless reduce?
         # Fields hadoop should use to distribute records to reducers
         unless settings[:partition_fields].blank?
           jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
@@ -146,8 +146,24 @@ module Wukong
       #
       # http://hadoop.apache.org/docs/r0.20.2/streaming.html#Package+Files+With+Job+Submissions
       def hadoop_files
-        args.find_all { |arg| arg.to_s =~ /\.rb$/ }.map do |arg|
-          "-file         '#{arg}'"
+        args.find_all { |arg| arg.to_s =~ /\.rb$/ }.each do |arg|
+          settings[:files] << arg
+        end
+        [].tap do |files_options|
+          {
+            :files    => '-files        ',
+            :jars     => '-libjars      ',
+            :archives => '-archives     '
+          }.each_pair do |file_type_name, file_option_name|
+            unless settings[file_type_name].nil? || settings[file_type_name].empty?
+              files = settings[file_type_name].map do |file_name_or_glob|
+                # Don't glob on the HDFS
+                file_type_name == :archives ? file_name_or_glob : [Dir[file_name_or_glob], file_name_or_glob]
+              end.flatten.compact.uniq.join(',')
+              files_options << "#{file_option_name}'#{files}'"
+            end
+          end
         end
       end

data/lib/wukong-hadoop/version.rb CHANGED

@@ -1,6 +1,6 @@
 module Wukong
   module Hadoop
     # The current version of Wukong-Hadoop.
-    VERSION = '0.0.1'
+    VERSION = '0.0.2'
   end
 end

data/spec/spec_helper.rb CHANGED

@@ -6,6 +6,7 @@ require 'wukong/spec_helpers'
 RSpec.configure do |config|
   config.before(:each) do
+    Wukong::Log.level = Log4r::OFF
     @orig_reg = Wukong.registry.show
   end

data/spec/support/integration_helper.rb CHANGED

@@ -6,16 +6,16 @@ module Wukong
         @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
       end
-      def lib_dir
-        root.join('lib')
+      def lib_dir *args
+        root.join('lib', *args)
       end
-      def bin_dir
-        root.join('bin')
+      def bin_dir *args
+        root.join('bin', *args)
       end
-      def examples_dir
-        root.join('examples')
+      def examples_dir *args
+        root.join('examples', *args)
       end
       def integration_env
@@ -30,7 +30,7 @@ module Wukong
       end
       def example_script *args
-        examples_dir.join(*args)
+        examples_dir(*args)
       end
     end

data/spec/wukong-hadoop/hadoop_mode_spec.rb CHANGED

@@ -4,8 +4,9 @@ describe Wukong::Hadoop::HadoopInvocation do
   let(:map_only)   { driver('regexp',          input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
   let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
-  let(:complex)    { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
-  let(:custum_io)  { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
+  let(:complex)    { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye'], :reduce_tasks => 20) }
+  let(:custom_io)  { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
+  let(:many_files) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', files: %w[/file/1 /file/2], archives: %w[/archive/1 /archive/2], jars: %w[/jar/1 /jar/2])}
   context "defining input paths" do
     it "raises an error unless given an --input option" do
@@ -15,7 +16,7 @@ describe Wukong::Hadoop::HadoopInvocation do
       map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
     end
     it "sets its input format given the --input_format option" do
-      custum_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
+      custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com.example.InputFormat'})
     end
   end
@@ -27,7 +28,7 @@ describe Wukong::Hadoop::HadoopInvocation do
       map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
     end
     it "sets its output format given the --output_format option" do
-      custum_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
+      custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com.example.OutputFormat'})
     end
   end
@@ -43,6 +44,18 @@ describe Wukong::Hadoop::HadoopInvocation do
     end
   end
+  context "setting the number of reduce tasks" do
+    it "does nothing on a map/reduce job" do
+      map_reduce.hadoop_commandline.should_not match(%r{-D mapred.reduce.tasks})
+    end
+    it "respects the option when given" do
+      complex.hadoop_commandline.should  match(%r{-D mapred.reduce.tasks=20})
+    end
+    it "sets reduce tasks to 0 for a map-only job" do
+      map_only.hadoop_commandline.should match(%r{-D mapred.reduce.tasks=0})
+    end
+  end
   context "defining Hadoop JobConf options" do
     it "translates friendly names into native ones" do
       complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
@@ -54,7 +67,7 @@ describe Wukong::Hadoop::HadoopInvocation do
   end
   context "removing existing output paths" do
-    before { Log.stub!(:info) }
     it "will not remove the output path by default" do
       map_reduce.should_not_receive(:remove_output_path!)
       map_reduce.should_receive(:execute_command!)
@@ -72,7 +85,24 @@ describe Wukong::Hadoop::HadoopInvocation do
       d.should_receive(:execute_command!)
       d.run!
     end
+  end
+  context "handle files, jars, and archives" do
+    it "does not include any files, jars, or archives when no files were passed" do
+      map_reduce.hadoop_commandline.should_not match(%r{-(files|archives|libjars)})
+    end
+    it "should include files when asked" do
+      many_files.hadoop_commandline.should match(%r{-files\s+'/file/1,/file/2'})
+    end
+    it "should include jars when asked" do
+      many_files.hadoop_commandline.should match(%r{-libjars\s+'/jar/1,/jar/2'})
+    end
+    it "should include archives when asked" do
+      many_files.hadoop_commandline.should match(%r{-archives\s+'/archive/1,/archive/2'})
+    end
+    it "should include files when passed files as arguments" do
+      driver(examples_dir('tokenizer.rb'), examples_dir('counter.rb'), input: '/tmp/input1,/tmp/input2', output: '/tmp/output').hadoop_commandline.should match(%r{-files.+tokenizer\.rb,.*counter\.rb})
+    end
   end
 end

data/wukong-hadoop.gemspec CHANGED

@@ -25,9 +25,5 @@ EOF
   gem.test_files    = gem.files.grep(/^spec/)
   gem.require_paths = ['lib']
-  gem.add_dependency('wukong',      '3.0.0.pre2')
-  gem.add_development_dependency 'rake',     '~> 0.9'
-  gem.add_development_dependency 'rspec',    '~> 2'
+  gem.add_dependency('wukong',      '3.0.0.pre3')
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wukong-hadoop
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors:
@@ -11,7 +11,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-01 00:00:00.000000000 Z
+date: 2012-12-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: wukong
@@ -20,7 +20,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre2
+        version: 3.0.0.pre3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -28,39 +28,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre2
-- !ruby/object:Gem::Dependency
-  name: rake
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '0.9'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '0.9'
-- !ruby/object:Gem::Dependency
-  name: rspec
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '2'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        version: '2'
+        version: 3.0.0.pre3
 description: ! "  Treat your dataset like a:\n\n      * stream of lines when it's
   efficient to process by lines\n      * stream of field arrays when it's efficient
   to deal directly with fields\n      * stream of lightweight objects when it's efficient