wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. data/.gitignore +59 -0
  2. data/.rspec +2 -0
  3. data/Gemfile +3 -0
  4. data/README.md +339 -0
  5. data/Rakefile +13 -0
  6. data/bin/hdp-bin +44 -0
  7. data/bin/hdp-bzip +23 -0
  8. data/bin/hdp-cat +3 -0
  9. data/bin/hdp-catd +3 -0
  10. data/bin/hdp-cp +3 -0
  11. data/bin/hdp-du +86 -0
  12. data/bin/hdp-get +3 -0
  13. data/bin/hdp-kill +3 -0
  14. data/bin/hdp-kill-task +3 -0
  15. data/bin/hdp-ls +11 -0
  16. data/bin/hdp-mkdir +2 -0
  17. data/bin/hdp-mkdirp +12 -0
  18. data/bin/hdp-mv +3 -0
  19. data/bin/hdp-parts_to_keys.rb +77 -0
  20. data/bin/hdp-ps +3 -0
  21. data/bin/hdp-put +3 -0
  22. data/bin/hdp-rm +32 -0
  23. data/bin/hdp-sort +40 -0
  24. data/bin/hdp-stream +40 -0
  25. data/bin/hdp-stream-flat +22 -0
  26. data/bin/hdp-stream2 +39 -0
  27. data/bin/hdp-sync +17 -0
  28. data/bin/hdp-wc +67 -0
  29. data/bin/wu-hadoop +14 -0
  30. data/examples/counter.rb +17 -0
  31. data/examples/map_only.rb +28 -0
  32. data/examples/processors.rb +4 -0
  33. data/examples/sonnet_18.txt +14 -0
  34. data/examples/tokenizer.rb +28 -0
  35. data/examples/word_count.rb +44 -0
  36. data/features/step_definitions/wu_hadoop_steps.rb +4 -0
  37. data/features/support/env.rb +1 -0
  38. data/features/wu_hadoop.feature +113 -0
  39. data/lib/wukong-hadoop.rb +21 -0
  40. data/lib/wukong-hadoop/configuration.rb +133 -0
  41. data/lib/wukong-hadoop/driver.rb +190 -0
  42. data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
  43. data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
  44. data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
  45. data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
  46. data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
  47. data/lib/wukong-hadoop/extensions.rb +2 -0
  48. data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
  49. data/lib/wukong-hadoop/version.rb +6 -0
  50. data/spec/spec_helper.rb +21 -0
  51. data/spec/support/driver_helper.rb +15 -0
  52. data/spec/support/integration_helper.rb +39 -0
  53. data/spec/wukong-hadoop/driver_spec.rb +117 -0
  54. data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
  55. data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
  56. data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
  57. data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
  58. data/wukong-hadoop.gemspec +33 -0
  59. metadata +168 -0

data/lib/wukong-hadoop/driver/map_logic.rb
@@ -0,0 +1,104 @@
+ module Wukong
+   module Hadoop
+
+     # Implements logic for figuring out the correct mapper commandline
+     # given wu-hadoop's arguments.
+     module MapLogic
+
+       # Return the actual commandline used by the mapper, whether
+       # running in local or Hadoop mode.
+       #
+       # You should be able to copy, paste, and run this command
+       # unmodified to debug the mapper.
+       #
+       # @return [String]
+       def mapper_commandline
+         return settings[:map_command] if explicit_map_command?
+         [command_prefix, 'wu-local', mapper_arg].tap do |cmd|
+           cmd << "--run=#{mapper_name}" if mapper_needs_run_arg?
+           cmd << params_to_pass
+         end.compact.map(&:to_s).reject(&:empty?).join(' ')
+       end
+
+       # Were we given an explicit map command (like 'cut -f 1') or are
+       # we to introspect and construct the command?
+       #
+       # @return [true, false]
+       def explicit_map_command?
+         settings[:map_command]
+       end
+
+       # Were we given a processor to use as our mapper explicitly by
+       # name or are we to introspect to discover the correct
+       # processor?
+       #
+       # @return [true, false]
+       def explicit_map_processor?
+         settings[:mapper]
+       end
+
+       # Were we given an explicit mapper (either as a command or as a
+       # processor) or should we introspect to find one?
+       #
+       # @return [true, false]
+       def explicit_mapper?
+         explicit_map_processor? || explicit_map_command?
+       end
+
+       # The argument that we should introspect on to turn into our
+       # mapper.
+       #
+       # @return [String]
+       def mapper_arg
+         args.first
+       end
+
+       # Does the mapper commandline need an explicit --run argument?
+       #
+       # Will not be used if the processor name is the same as the name
+       # of the script.
+       #
+       # @return [true, false]
+       def mapper_needs_run_arg?
+         return false if mapper_arg.to_s == mapper_name.to_s
+         return false if File.basename(mapper_arg.to_s, '.rb') == mapper_name.to_s
+         true
+       end
+
+       # Return the name of the processor to use as the mapper.
+       #
+       # Will raise a <tt>Wukong::Error</tt> if a given mapper is
+       # invalid or if none can be guessed.
+       #
+       # Most of the logic that examines explicit command line
+       # arguments and checks for the existence of named processors or
+       # files is here.
+       #
+       # @return [String]
+       def mapper_name
+         case
+         when explicit_mapper?
+           if processor_registered?(settings[:mapper])
+             settings[:mapper]
+           else
+             raise Error.new("No such processor: '#{settings[:mapper]}'")
+           end
+         when map_only? && processor_registered?(mapper_arg)
+           mapper_arg
+         when map_only? && file_is_processor?(mapper_arg)
+           processor_name_from_file(mapper_arg)
+         when single_job_arg? && explicit_reducer? && processor_registered?(mapper_arg)
+           mapper_arg
+         when separate_map_and_reduce_args? && processor_registered?(mapper_arg)
+           mapper_arg
+         when separate_map_and_reduce_args? && file_is_processor?(mapper_arg)
+           processor_name_from_file(mapper_arg)
+         when processor_registered?('mapper')
+           'mapper'
+         else
+           raise Error.new("Could not find a processor to use as a mapper")
+         end
+       end
+     end
+   end
+ end
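
Taken together, the methods above produce either the user's explicit --map_command or a wu-local invocation. As a rough standalone sketch (not the gem's actual code; command_prefix, params_to_pass, and the settings hash below are illustrative stand-ins), the assembly boils down to:

    # Standalone sketch of the commandline assembly above; all values are
    # illustrative stand-ins, not the gem's real settings object.
    settings       = { map_command: nil }      # set to 'cut -f 1' to bypass wu-local entirely
    command_prefix = 'bundle exec'             # mirrors the driver's --command_prefix option
    mapper_arg     = 'examples/tokenizer.rb'   # first positional argument given to wu-hadoop
    params_to_pass = '--foo=bar'               # hypothetical pass-through flag

    mapper_commandline =
      if settings[:map_command]
        settings[:map_command]
      else
        [command_prefix, 'wu-local', mapper_arg, params_to_pass]
          .compact.map(&:to_s).reject(&:empty?).join(' ')
      end

    puts mapper_commandline
    # => bundle exec wu-local examples/tokenizer.rb --foo=bar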

data/lib/wukong-hadoop/driver/reduce_logic.rb
@@ -0,0 +1,129 @@
+ module Wukong
+   module Hadoop
+
+     # Implements logic for figuring out the correct reducer
+     # commandline given wu-hadoop's arguments and whether or not to
+     # run a map-only (no-reduce) job.
+     module ReduceLogic
+
+       # Return the actual commandline used by the reducer, whether
+       # running in local or Hadoop mode.
+       #
+       # You should be able to copy, paste, and run this command
+       # unmodified to debug the reducer.
+       #
+       # @return [String]
+       def reducer_commandline
+         return '' unless reduce?
+         return settings[:reduce_command] if explicit_reduce_command?
+         [command_prefix, 'wu-local', reducer_arg].tap do |cmd|
+           cmd << "--run=#{reducer_name}" if reducer_needs_run_arg?
+           cmd << params_to_pass
+         end.compact.map(&:to_s).reject(&:empty?).join(' ')
+       end
+
+       # Were we given an explicit reduce command (like 'uniq -c') or
+       # are we to introspect and construct the command?
+       #
+       # @return [true, false]
+       def explicit_reduce_command?
+         settings[:reduce_command]
+       end
+
+       # Were we given a processor to use as our reducer explicitly by
+       # name or are we to introspect to discover the correct
+       # processor?
+       #
+       # @return [true, false]
+       def explicit_reduce_processor?
+         settings[:reducer]
+       end
+
+       # Were we given an explicit reducer (either as a command or as a
+       # processor) or should we introspect to find one?
+       #
+       # @return [true, false]
+       def explicit_reducer?
+         explicit_reduce_processor? || explicit_reduce_command?
+       end
+
+       # The argument that we should introspect on to turn into our
+       # reducer.
+       #
+       # @return [String]
+       def reducer_arg
+         args.last
+       end
+
+       # Should we perform a reduce or is this a map-only job?
+       #
+       # We will definitely reduce if
+       #
+       # - given an explicit <tt>--reduce_command</tt>
+       # - we discovered a reducer
+       #
+       # We will not reduce if:
+       #
+       # - <tt>--reduce_tasks</tt> was explicitly set to 0
+       #
+       # @return [true, false]
+       def reduce?
+         return false if settings[:reduce_tasks] && settings[:reduce_tasks].to_i == 0
+         return true if settings[:reduce_command]
+         return true if reducer_name
+         false
+       end
+
+       # Is this a map-only job?
+       #
+       # @see #reduce?
+       #
+       # @return [true, false]
+       def map_only?
+         (! reduce?)
+       end
+
+       # Does the reducer commandline need an explicit --run argument?
+       #
+       # Will not be used if the processor name is the same as the name
+       # of the script.
+       #
+       # @return [true, false]
+       def reducer_needs_run_arg?
+         return false if reducer_arg.to_s == reducer_name.to_s
+         return false if File.basename(reducer_arg.to_s, '.rb') == reducer_name
+         true
+       end
+
+       # Return the name of the processor to use as the reducer.
+       #
+       # Will raise a <tt>Wukong::Error</tt> if a given reducer is
+       # invalid. Will return nil if no reducer can be guessed.
+       #
+       # Most of the logic that examines explicit command line
+       # arguments and checks for the existence of named processors or
+       # files is here.
+       #
+       # @return [String]
+       def reducer_name
+         case
+         when explicit_reducer?
+           if processor_registered?(settings[:reducer])
+             settings[:reducer]
+           else
+             raise Error.new("No such processor: '#{settings[:reducer]}'")
+           end
+         when single_job_arg? && explicit_mapper? && processor_registered?(reducer_arg)
+           reducer_arg
+         when separate_map_and_reduce_args? && processor_registered?(reducer_arg)
+           reducer_arg
+         when separate_map_and_reduce_args? && file_is_processor?(reducer_arg)
+           processor_name_from_file(reducer_arg)
+         when processor_registered?('reducer')
+           'reducer'
+         end
+       end
+
+     end
+   end
+ end
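
The reduce-or-not decision documented above has a clear precedence. A minimal standalone sketch (an assumption-level illustration, not the gem's code, with the settings hash and discovered reducer passed in explicitly) shows it:

    # Sketch of the reduce? precedence described above: an explicit
    # --reduce_tasks=0 wins, then an explicit --reduce_command, then
    # whether any reducer processor could be discovered.
    def reduce_job?(settings, discovered_reducer)
      return false if settings[:reduce_tasks] && settings[:reduce_tasks].to_i == 0
      return true  if settings[:reduce_command]
      return true  if discovered_reducer
      false
    end

    puts reduce_job?({ reduce_tasks: 0, reduce_command: 'uniq -c' }, nil)  # => false
    puts reduce_job?({ reduce_command: 'uniq -c' }, nil)                   # => true
    puts reduce_job?({}, 'counter')                                        # => true
    puts reduce_job?({}, nil)                                              # => false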

data/lib/wukong-hadoop/extensions.rb
@@ -0,0 +1,2 @@
+ require_relative("hadoop_env_methods")
+

data/lib/wukong-hadoop/hadoop_env_methods.rb
@@ -0,0 +1,80 @@
+ module Wukong
+   module Hadoop
+
+     # Hadoop streaming exposes several environment variables to
+     # scripts it executes. This module contains methods that make
+     # these variables easily accessed from within a processor.
+     #
+     # Since these environment variables are ultimately set by Hadoop's
+     # streaming jar when executing inside Hadoop, you'll have to set
+     # them manually when testing locally.
+     #
+     # Via @pskomoroch via @tlipcon:
+     #
+     #   "there is a little known Hadoop Streaming trick buried in this Python
+     #   script. You will notice that the date is not actually in the raw log
+     #   data itself, but is part of the filename. It turns out that Hadoop makes
+     #   job parameters you would fetch in Java with something like
+     #   job.get("mapred.input.file") available as environment variables for
+     #   streaming jobs, with periods replaced with underscores:
+     #
+     #     filepath = os.environ["map_input_file"]
+     #     filename = os.path.split(filepath)[-1]
+     module EnvMethods
+
+       # Fetch a parameter set by Hadoop streaming in the environment
+       # of the currently executing process.
+       #
+       # @param [String] name the '.' separated parameter name to fetch
+       # @return [String] the value from the process' environment
+       def hadoop_streaming_parameter name
+         ENV[name.gsub('.', '_')]
+       end
+
+       # Path of the (data) file currently being processed.
+       #
+       # @return [String]
+       def input_file
+         ENV['map_input_file']
+       end
+
+       # Directory of the (data) file currently being processed.
+       #
+       # @return [String]
+       def input_dir
+         ENV['mapred_input_dir']
+       end
+
+       # Offset of the chunk currently being processed within the current input file.
+       #
+       # @return [String]
+       def map_input_start_offset
+         ENV['map_input_start']
+       end
+
+       # Length of the chunk currently being processed within the current input file.
+       #
+       # @return [String]
+       def map_input_length
+         ENV['map_input_length']
+       end
+
+       # ID of the current map/reduce attempt.
+       #
+       # @return [String]
+       def attempt_id
+         ENV['mapred_task_id']
+       end
+
+       # ID of the current map/reduce task.
+       #
+       # @return [String]
+       def curr_task_id
+         ENV['mapred_tip_id']
+       end
+
+     end
+   end
+
+   Processor.class_eval{ include Hadoop::EnvMethods }
+ end
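
Because EnvMethods is mixed into every Processor (last line above), these helpers can be called directly inside a processor; outside Hadoop you have to fake the environment yourself, as the comment notes. A hedged usage sketch follows (the processor name, paths, and the assumption that the wukong processor DSL is loaded by require 'wukong-hadoop' are all illustrative):

    # When not running under Hadoop streaming, set the variables by hand.
    ENV['map_input_file']   = '/data/logs/2012-11-01.log'   # made-up path
    ENV['mapred_input_dir'] = '/data/logs'

    require 'wukong-hadoop'

    Wukong.processor(:log_dater) do
      def process(line)
        # The date lives in the filename, not the record -- the trick quoted above.
        date = File.basename(input_file, '.log')
        yield [date, line].join("\t")
      end
    end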

data/lib/wukong-hadoop/version.rb
@@ -0,0 +1,6 @@
+ module Wukong
+   module Hadoop
+     # The current version of Wukong-Hadoop.
+     VERSION = '0.0.1'
+   end
+ end

data/spec/spec_helper.rb
@@ -0,0 +1,21 @@
+ require 'wukong-hadoop'
+ require_relative('support/integration_helper')
+ require_relative('support/driver_helper')
+ require 'wukong/spec_helpers'
+
+ RSpec.configure do |config|
+
+   config.before(:each) do
+     @orig_reg = Wukong.registry.show
+   end
+
+   config.after(:each) do
+     Wukong.registry.clear!
+     Wukong.registry.merge!(@orig_reg)
+   end
+
+   include Wukong::SpecHelpers
+   include Wukong::Hadoop::IntegrationHelper
+   include Wukong::Hadoop::DriverHelper
+ end
+

data/spec/support/driver_helper.rb
@@ -0,0 +1,15 @@
+ module Wukong
+   module Hadoop
+     module DriverHelper
+
+       def driver *args
+         params = ::Wukong::Hadoop.configure(Configliere::Param.new)
+         params.resolve!
+         params.merge!(args.pop) if args.last.is_a?(Hash)
+         Wukong::Hadoop::Driver.new(params, *args)
+       end
+
+     end
+   end
+ end
+
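
The helper above is also a reasonable template for building a Wukong::Hadoop::Driver by hand, for example in an irb session. A hedged sketch (same calls as the helper; the override hash and example path are illustrative and assume the examples directory is reachable from the working directory):

    require 'wukong-hadoop'

    # Same recipe as the spec helper above: default settings, resolve, then
    # overrides; positional args are the map/reduce scripts or widget names.
    params = ::Wukong::Hadoop.configure(Configliere::Param.new)
    params.resolve!
    params.merge!(reduce_tasks: 0)

    driver = Wukong::Hadoop::Driver.new(params, 'examples/tokenizer.rb')
    puts driver.mapper_commandline   # e.g. "wu-local .../examples/tokenizer.rb"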

data/spec/support/integration_helper.rb
@@ -0,0 +1,39 @@
+ module Wukong
+   module Hadoop
+     module IntegrationHelper
+
+       def root
+         @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
+       end
+
+       def lib_dir
+         root.join('lib')
+       end
+
+       def bin_dir
+         root.join('bin')
+       end
+
+       def examples_dir
+         root.join('examples')
+       end
+
+       def integration_env
+         {
+           "PATH" => [bin_dir.to_s, ENV["PATH"]].compact.join(':'),
+           "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
+         }
+       end
+
+       def integration_cwd
+         root.to_s
+       end
+
+       def example_script *args
+         examples_dir.join(*args)
+       end
+
+     end
+   end
+ end
+
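
integration_env above prepends the gem's own bin/ and lib/ so that spawned wu-hadoop processes pick up the development tree. A hedged sketch of how such an env hash is typically consumed (the spawned command and flags are illustrative; the real integration specs are not shown in this diff):

    # Kernel#system accepts an env hash as its first argument, so an
    # integration spec can run the development copy of wu-hadoop like this.
    env = {
      "PATH"    => [File.expand_path("bin"), ENV["PATH"]].compact.join(":"),
      "RUBYLIB" => [File.expand_path("lib"), ENV["RUBYLIB"]].compact.join(":")
    }
    system(env, "wu-hadoop examples/word_count.rb --dry_run")  # illustrative invocation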

data/spec/wukong-hadoop/driver_spec.rb
@@ -0,0 +1,117 @@
+ require 'spec_helper'
+
+ describe Wukong::Hadoop::Driver do
+
+   context "processing its arguments" do
+     it "raises an error when it can't find a file" do
+       lambda { driver(example_script('processors.rb'), example_script('doesnt_exist.rb')) }.should raise_error(Wukong::Error, /No such processor or file/)
+     end
+     it "raises an error when it can't find a widget" do
+       lambda { driver('regexp', 'doesnt_exist') }.should raise_error(Wukong::Error, /No such processor or file/)
+     end
+     it "raises an error when given more than two arguments" do
+       lambda { driver('regexp', example_script('counter.rb'), 'extra') }.should raise_error(Wukong::Error, /more than two/)
+     end
+   end
+
+   context "will execute a map-only job" do
+     context "with an explicit map command" do
+       let(:subject) { driver(:map_command => 'cut -f 1') }
+       its(:reduce?) { should be_false }
+       its(:mapper_commandline) { should match /^cut -f 1$/ }
+     end
+     context "with a single widget" do
+       let(:subject) { driver('regexp') }
+       its(:reduce?) { should be_false }
+       its(:mapper_commandline) { should match /^wu-local regexp$/ }
+     end
+     context "with a single file" do
+       context "defining a processor named 'mapper'" do
+         let(:subject) { driver(example_script('map_only.rb')) }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*map_only.rb --run=mapper$/ }
+       end
+       context "defining a processor named after the file" do
+         let(:subject) { driver(example_script('tokenizer.rb')) }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+       end
+       context "using the given --mapper option " do
+         let(:subject) { driver(example_script('processors.rb'), :mapper => 'tokenizer') }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*processors.rb --run=tokenizer$/ }
+       end
+       context "defining a processor named 'reducer' but with --reduce_tasks=0" do
+         let(:subject) { driver(example_script('word_count.rb'), :reduce_tasks => 0) }
+         its(:reduce?) { should be_false }
+         its(:mapper_commandline) { should match /^wu-local .*word_count.rb --run=mapper$/ }
+       end
+     end
+     context "with two files but with --reduce_tasks=0" do
+       let(:subject) { driver(example_script('tokenizer.rb'), example_script('counter.rb'), :reduce_tasks => 0) }
+       its(:reduce?) { should be_false }
+       its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+     end
+   end
+
+   context "will execute a map-reduce job" do
+     context "with explicit map and reduce commands" do
+       let(:subject) { driver(:map_command => 'cut -f 1', :reduce_command => 'uniq -c') }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should == 'cut -f 1' }
+       its(:reducer_commandline) { should == 'uniq -c' }
+     end
+     context "with two widgets" do
+       let(:subject) { driver('regexp', 'count') }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local regexp$/ }
+       its(:reducer_commandline) { should match /^wu-local count$/ }
+     end
+     context "with a single file" do
+       context "defining processors named 'mapper' and 'reducer'" do
+         let(:subject) { driver(example_script('word_count.rb')) }
+         its(:reduce?) { should be_true }
+         its(:mapper_commandline) { should match /^wu-local .*word_count.rb --run=mapper$/ }
+         its(:reducer_commandline) { should match /^wu-local .*word_count.rb --run=reducer$/ }
+       end
+     end
+     context "with two files" do
+       let(:subject) { driver(example_script('tokenizer.rb'), example_script('counter.rb')) }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+       its(:reducer_commandline) { should match /^wu-local .*counter.rb$/ }
+     end
+     context "with a widget and a file" do
+       let(:subject) { driver('regexp', example_script('counter.rb')) }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local regexp$/ }
+       its(:reducer_commandline) { should match /^wu-local .*counter.rb$/ }
+     end
+     context "with a file and a widget" do
+       let(:subject) { driver(example_script('tokenizer.rb'), 'count') }
+       its(:reduce?) { should be_true }
+       its(:mapper_commandline) { should match /^wu-local .*tokenizer.rb$/ }
+       its(:reducer_commandline) { should match /^wu-local count$/ }
+     end
+   end
+
+   context "handling arguments" do
+     let(:subject) { driver('regexp', :clean => 'hi', :messy => 'hi "there"', :reduce_tasks => 0, :dry_run => true, :rm => true) }
+     it "passes arguments it doesn't know about to wu-local" do
+       subject.mapper_commandline.should include('--clean=hi')
+     end
+     it "correctly passes messy arguments" do
+       subject.mapper_commandline.should include('--messy=hi\\ \\"there\\"')
+     end
+     it "does not pass arguments that are internal to wukong-hadoop" do
+       subject.mapper_commandline.should_not include('--reduce_tasks', '--dry_run', '--rm')
+     end
+   end
+
+   context "given the --command_prefix option" do
+     let(:subject) { driver('regexp', 'count', :command_prefix => 'bundle exec') }
+     its(:mapper_commandline) { should match(/^bundle exec wu-local/) }
+     its(:reducer_commandline) { should match(/^bundle exec wu-local/) }
+   end
+
+ end