wukong-hadoop 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,137 +0,0 @@
1
module Wukong
  module Hadoop

    # Configure the given settings object for use with Wukong::Hadoop.
    #
    # Every setting defined here carries the +wukong_hadoop: true+ flag
    # so that it can later be filtered out of the parameters passed
    # through to wu-local (see Driver#params_to_pass).  Settings also
    # flagged +jobconf: true+ are friendly aliases for the Hadoop
    # property named in their description.
    #
    # @param [Configliere::Param] settings the settings to configure
    # @return [Configliere::Param] the configured settings
    def self.configure settings
      # Hadoop Options
      settings.define :hadoop_home, wukong_hadoop: true, description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
      settings.define :hadoop_runner, wukong_hadoop: true, description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'

      # Translate simplified args to their hairy hadoop equivalents
      settings.define :io_sort_mb, wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
      settings.define :io_sort_record_percent, wukong_hadoop: true, jobconf: true, description: 'io.sort.record.percent'
      settings.define :job_name, wukong_hadoop: true, jobconf: true, description: 'mapred.job.name'
      settings.define :key_field_separator, wukong_hadoop: true, jobconf: true, description: 'map.output.key.field.separator'
      settings.define :map_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks.speculative.execution'
      settings.define :reduce_speculative, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks.speculative.execution'
      settings.define :map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.map.tasks'
      settings.define :max_maps_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.cluster'
      settings.define :max_maps_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.maps.per.node'
      settings.define :max_node_map_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.map.tasks.maximum'
      settings.define :max_node_reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.tasktracker.reduce.tasks.maximum'
      settings.define :max_record_length, wukong_hadoop: true, jobconf: true, description: 'mapred.linerecordreader.maxlength'
      settings.define :max_reduces_per_cluster, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.cluster'
      settings.define :max_reduces_per_node, wukong_hadoop: true, jobconf: true, description: 'mapred.max.reduces.per.node'
      settings.define :max_tracker_failures, wukong_hadoop: true, jobconf: true, description: 'mapred.max.tracker.failures'
      settings.define :max_map_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.map.max.attempts'
      settings.define :max_reduce_attempts, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.max.attempts'
      settings.define :min_split_size, wukong_hadoop: true, jobconf: true, description: 'mapred.min.split.size'
      settings.define :output_field_separator, wukong_hadoop: true, jobconf: true, description: 'stream.map.output.field.separator'
      settings.define :partition_fields, wukong_hadoop: true, jobconf: true, description: 'num.key.fields.for.partition'
      settings.define :reduce_tasks, wukong_hadoop: true, jobconf: true, description: 'mapred.reduce.tasks'
      settings.define :respect_exit_status, wukong_hadoop: true, jobconf: true, description: 'stream.non.zero.exit.is.failure'
      settings.define :reuse_jvms, wukong_hadoop: true, jobconf: true, description: 'mapred.job.reuse.jvm.num.tasks'
      settings.define :sort_fields, wukong_hadoop: true, jobconf: true, description: 'stream.num.map.output.key.fields'
      settings.define :timeout, wukong_hadoop: true, jobconf: true, description: 'mapred.task.timeout'
      settings.define :noempty, wukong_hadoop: true, description: "Don't create zero-byte reduce files"
      settings.define :split_on_xml_tag, wukong_hadoop: true, description: "Parse XML document by specifying the tag name: 'anything found between <tag> and </tag> will be treated as one record for map tasks'"
      settings.define :input_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative InputFormat.'
      settings.define :output_format, wukong_hadoop: true, description: 'Fully qualified Java class name defining an alternative OutputFormat.'
      settings.define :java_opts, wukong_hadoop: true, description: 'Additional Java options to be passed to hadoop streaming.', :type => Array, :default => []
      settings.define :files, wukong_hadoop: true, description: "Comma-separated list of files (or globs) to be copied to the MapReduce cluster (-files).", :type => Array, :default => []
      settings.define :jars, wukong_hadoop: true, description: "Comma-separated list of jars (or globs) to include on the Hadoop CLASSPATH (-libjars).", :type => Array, :default => []
      settings.define :archives, wukong_hadoop: true, description: "Comma-separated list of archives to be unarchived on each worker (-archives).", :type => Array, :default => []

      # Options given on the command-line
      settings.define :mode, description: "Run in either 'hadoop' or 'local' mode", wukong_hadoop: true, :default => 'hadoop'
      settings.define :map_command, description: "Shell command to run as mapper, in place of a constructed wu-local command", wukong_hadoop: true
      settings.define :reduce_command, description: "Shell command to run as reducer, in place of a constructed wu-local command", wukong_hadoop: true
      settings.define :sort_command, description: "Shell command to run as sorter (only in `local' mode)", wukong_hadoop: true, :default => 'sort'
      # FIX: description previously read "Prefex"
      settings.define :command_prefix, description: "Prefix to insert before all Wukong commands", wukong_hadoop: true
      settings.define :mapper, description: "Name of processor to use as a mapper", wukong_hadoop: true
      settings.define :reducer, description: "Name of processor to use as a reducer", wukong_hadoop: true
      settings.define :gemfile, description: "Specify an alternative Gemfile to execute this wukong script with", wukong_hadoop: true
      settings.define :dry_run, description: "Echo the command that will be run, but don't run it", wukong_hadoop: true, :type => :boolean, :default => false
      settings.define :rm, description: "Recursively remove the destination directory.", wukong_hadoop: true, :type => :boolean, :default => false
      settings.define :input, description: "Comma-separated list of input paths", wukong_hadoop: true
      settings.define :output, description: "Output path.", wukong_hadoop: true

      settings.use(:commandline)

      # Singleton usage banner for this settings object.
      def settings.usage
        "usage: #{File.basename($0)} PROCESSOR|FLOW [PROCESSOR|FLOW] [ --param=value | -p value | --param | -p]"
      end

      settings.description = <<EOF
wu-hadoop is a tool to model and launch Wukong processors as
map/reduce workflows within the Hadoop framework.

Use wu-hadoop with existing processors in `local' mode to test the
logic of your job, reading from the specified --input and printing to
STDOUT:

  $ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt
  a	2
  all	1
  and	2
  ...

where it is assumed that your mapper is called 'mapper' and your
reducer 'reducer'.  You can also cat in data:

  $ cat examples/sonnet_18.txt | wu-hadoop examples/word_count.rb --mode=local

Or pass options directly:

  $ wu-hadoop examples/word_count.rb --mode=local --input=examples/sonnet_18.txt --fold_case --min_length=3
  all	1
  and	5
  art	1
  brag	1
  ...

Or define both processors in separate files:

  $ wu-hadoop examples/tokenizer.rb examples/counter.rb --mode=local --input=examples/sonnet_18.txt

Or by name:

  $ wu-hadoop examples/processors.rb --mode=local --input=examples/sonnet_18.txt --mapper=tokenizer --reducer=counter

Or just by command:

  $ wu-hadoop processors.rb --mapper=tokenizer --reduce_command='uniq -c' ...
  $ wu-hadoop processors.rb --map_command='cut -f3' --reducer=counter ...
  $ wu-hadoop --map_command='cut -f3' --reduce_command='uniq -c' ...

If you don't specify a --reducer explicitly, and you didn't give two
separate arguments, and no processor named :reducer exists in the
environment, then we assume you are launching a map-only job and
'mapred.tasktracker.reduce.tasks.maximum' will correspondingly be set
to 0:

  $ wu-hadoop examples/tokenizer.rb --mode=local --input=examples/sonnet_18.txt
  Shall
  I
  compare
  thee
  ...

You can achieve this directly with the --reduce_tasks=0 option.

Many other Hadoop options have been wrapped with similarly friendly
names below.  These are ignored when running in `local' mode.

Some options (like `--sort_command') only make sense in `local' mode.
These are ignored in `hadoop' mode.
EOF
      settings
    end

    # All Hadoop configuration for Wukong lives within this object.
    Configuration = configure(Configliere::Param.new) unless defined? Configuration
  end
end
@@ -1,191 +0,0 @@
1
- require 'shellwords'
2
- require_relative("driver/inputs_and_outputs")
3
- require_relative("driver/map_logic")
4
- require_relative("driver/reduce_logic")
5
- require_relative("driver/local_invocation")
6
- require_relative("driver/hadoop_invocation")
7
-
8
module Wukong
  module Hadoop

    # The <tt>Hadoop::Driver</tt> class contains the logic to examine
    # arguments and construct command lines which it will execute to
    # create the desired behavior.
    #
    # The Hadoop::Driver will introspect on its arguments to guess (if
    # not given) the processors to use as mapper and reducer in a
    # map/reduce job.  It will also decide whether to run that job in
    # local or Hadoop mode.  These decisions result in a command which
    # it will ultimately execute.
    class Driver < Wukong::Driver

      include InputsAndOutputs
      include MapLogic
      include ReduceLogic
      include HadoopInvocation
      include LocalInvocation
      include Logging

      # The settings used by this driver.
      #
      # @return [Configliere::Param]
      attr_accessor :settings

      # The (processed) arguments for this driver.
      #
      # @return [Array<String, Pathname>]
      attr_reader :args

      # Initialize and run a new Wukong::Hadoop::Driver for the given
      # +settings+.
      #
      # Will rescue all Wukong::Error exceptions by printing a nice
      # message to STDERR and exiting.
      #
      # @param [Configliere::Param] settings
      # @param [Array<String>] extra_args
      def self.run(settings, *extra_args)
        begin
          new(settings, *extra_args).run!
        rescue Wukong::Error => e
          $stderr.puts e.message
          # 127 mirrors the shell's "command not found" convention so
          # wrapping scripts can distinguish driver errors from job failures.
          exit(127)
        end
      end

      # Run this driver.
      #
      # In local mode the command is executed immediately; in Hadoop
      # mode input/output paths are validated (and the output path
      # optionally removed) before launching.
      def run!
        if mode == :local
          # log.info "Launching local!"
          execute_command!(local_commandline)
        else
          ensure_input_and_output!
          # --overwrite is honored here as an alias for --rm, though only
          # --rm is defined by Wukong::Hadoop.configure.
          remove_output_path! if settings[:rm] || settings[:overwrite]
          log.info "Launching Hadoop!"
          execute_command!(hadoop_commandline)
        end
      end

      # Initialize a new driver with the given +settings+ and +args+.
      #
      # @param [Configliere::Param] settings
      # @param [Array<String>] args
      def initialize(settings, *args)
        @settings = settings
        # Use the writer so args are validated and loaded (see #args=).
        self.args = args
      end

      # Set the +args+ for this driver.
      #
      # Arguments can be either (registered) processor names or files.
      #
      # An error will be raised on missing files or those which
      # couldn't be loaded.
      #
      # An error will be raised if more than two arguments (mapper and
      # reducer) are passed.
      #
      # @param [Array<String>] args
      def args= args
        raise Error.new("Cannot provide more than two arguments") if args.length > 2
        @args = args.map do |arg|
          if processor_registered?(arg)
            arg
          else
            # Not a known processor: treat it as a Ruby file.  Loading it
            # may register new processors as a side effect.
            begin
              rp = Pathname.new(arg).realpath
              load rp
              rp
            rescue => e
              # Any failure (missing path, load error) is reported uniformly.
              raise Error.new("No such processor or file: #{arg}")
            end
          end
        end
      end

      # What mode is this driver in?
      #
      # Anything other than the literal string 'local' counts as Hadoop.
      #
      # @return [:hadoop, :local]
      def mode
        settings[:mode].to_s == 'local' ? :local : :hadoop
      end

      # Were mapper and/or reducer named by a single argument?
      #
      # @return [true, false]
      def single_job_arg?
        args.size == 1
      end

      # Were mapper and/or reducer named by separate arguments?
      #
      # @return [true, false]
      def separate_map_and_reduce_args?
        args.size == 2
      end

      # Is there a processor registered with the given +name+?
      #
      # @param [#to_s] name
      # @return [true, false]
      def processor_registered? name
        Wukong.registry.registered?(name.to_s.to_sym)
      end

      # Return the guessed name of a processor at the given +path+.
      #
      # Simply the file's basename without its .rb extension.
      #
      # @param [String] path
      # @return [String]
      def processor_name_from_file(path)
        File.basename(path, '.rb')
      end

      # Does the given +path+ contain a processor named after itself?
      #
      # @param [String] path
      # @return [true, false]
      def file_is_processor?(path)
        processor_registered?(processor_name_from_file(path))
      end

      # The prefix to insert before all invocations of the
      # <tt>wu-local</tt> runner.
      #
      # @return [String]
      def command_prefix
        settings[:command_prefix]
      end

      # Returns parameters to pass to an invocation of
      # <tt>wu-local</tt>.
      #
      # Parameters like <tt>--reduce_tasks</tt> which are relevant to
      # Wukong-Hadoop will be interpreted and *not* passed.  Others
      # will be passed unmodified.
      #
      # @return [String]
      def params_to_pass
        # In a deploy pack, pass the pre-deploy settings instead of our own.
        s = (Wukong.loaded_deploy_pack? ? Deploy.pre_deploy_settings : settings)
        # Settings defined with the wukong_hadoop flag are ours; everything
        # else is forwarded, shell-escaped, as --param=value pairs.
        s.reject{ |param, val| s.definition_of(param, :wukong_hadoop) }.map{ |param,val| "--#{param}=#{Shellwords.escape(val.to_s)}" }.join(" ")
      end

      # Execute a command composed of the given parts.
      #
      # Will print the command instead of running it if the
      # <tt>--dry_run</tt> option was given.
      #
      # @param [Array<String>] args
      def execute_command!(*args)
        # Join the non-blank parts with backslash-newlines so the printed
        # command is a readable, copy-pasteable multi-line shell command.
        command = args.flatten.reject(&:blank?).join(" \\\n ")
        if settings[:dry_run]
          log.info("Dry run:")
          puts command
        else
          # Backticks both run the command and capture its STDOUT, which
          # is echoed; $? holds the child's exit status.
          puts `#{command}`
          raise "Streaming command failed!" unless $?.success?
        end
      end

    end
  end
end
@@ -1,27 +0,0 @@
1
module Wukong
  module Hadoop

    # Provides methods for determining input and output paths.
    # Written as a separate module to allow easy overriding from other
    # plugins.
    #
    # Host classes must provide a +settings+ method.
    module InputsAndOutputs

      # The input paths to read from.
      #
      # @return [String, Array] whatever the +input+ setting holds, or
      #   an empty Array when it is unset
      def input_paths
        configured = settings[:input]
        configured || []
      end

      # The output path to write to.
      #
      # @return [String, nil] the +output+ setting, if any
      def output_path
        settings[:output]
      end

    end
  end
end
26
-
27
-
@@ -1,15 +0,0 @@
1
module Wukong
  module Hadoop
    # Convenience mixin for constructing a configured Driver instance
    # (used by specs and scripts).
    module DriverHelper

      # Build a Wukong::Hadoop::Driver from freshly-configured,
      # resolved settings.
      #
      # A trailing Hash argument, if given, is merged into the settings
      # as overrides; all remaining arguments are passed to the Driver.
      def driver *args
        overrides = args.last.is_a?(Hash) ? args.pop : nil
        params = ::Wukong::Hadoop.configure(Configliere::Param.new)
        params.resolve!
        params.merge!(overrides) if overrides
        Wukong::Hadoop::Driver.new(params, *args)
      end

    end
  end
end
15
-
@@ -1,39 +0,0 @@
1
module Wukong
  module Hadoop
    # Path and environment helpers for integration tests: locates the
    # project root and its lib/, bin/ and examples/ subdirectories, and
    # builds the PATH/RUBYLIB environment used to spawn wu-hadoop.
    module IntegrationHelper

      # Project root, three directory levels above this file (memoized).
      #
      # @return [Pathname]
      def root
        @root ||= Pathname.new(__FILE__).expand_path.dirname.parent.parent
      end

      # Path under the project's lib/ directory.
      def lib_dir *segments
        root.join('lib', *segments)
      end

      # Path under the project's bin/ directory.
      def bin_dir *segments
        root.join('bin', *segments)
      end

      # Path under the project's examples/ directory.
      def examples_dir *segments
        root.join('examples', *segments)
      end

      # Environment hash prepending this project's bin/ to PATH and
      # lib/ to RUBYLIB (existing values, when present, are kept after).
      def integration_env
        path_entries    = [bin_dir.to_s, ENV["PATH"]].compact
        rubylib_entries = [lib_dir.to_s, ENV["RUBYLIB"]].compact
        {
          "PATH"    => path_entries.join(':'),
          "RUBYLIB" => rubylib_entries.join(':')
        }
      end

      # Working directory for spawned integration commands.
      def integration_cwd
        root.to_s
      end

      # Path to an example script; alias-style wrapper around #examples_dir.
      def example_script *segments
        examples_dir(*segments)
      end

    end
  end
end
39
-