wukong-hadoop 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +1 -1
- data/lib/wukong-hadoop.rb +1 -1
- data/lib/wukong-hadoop/runner.rb +1 -18
- data/lib/wukong-hadoop/runner/hadoop_invocation.rb +15 -11
- data/lib/wukong-hadoop/version.rb +1 -1
- data/spec/spec_helper.rb +1 -1
- data/spec/wukong-hadoop/hadoop_mode_spec.rb +3 -3
- data/spec/wukong-hadoop/runner_spec.rb +1 -1
- data/wukong-hadoop.gemspec +1 -1
- metadata +7 -7
    
        data/Gemfile
    CHANGED
    
    
    
        data/lib/wukong-hadoop.rb
    CHANGED
    
    | @@ -21,8 +21,8 @@ module Wukong | |
| 21 21 | 
             
                  return unless program_name == 'wu-hadoop'
         | 
| 22 22 |  | 
| 23 23 | 
             
                  # Hadoop Options
         | 
| 24 | 
            -
                  settings.define :hadoop_home,             wukong_hadoop: true,                description: 'Path to hadoop installation. HADOOP_HOME/bin/hadoop is used to run hadoop.', env_var: 'HADOOP_HOME', default: '/usr/lib/hadoop'
         | 
| 25 24 | 
             
                  settings.define :hadoop_runner,           wukong_hadoop: true,                description: 'Path to hadoop executable. Use this for non-standard hadoop installations.'
         | 
| 25 | 
            +
                  settings.define :hadoop_streaming_jar,    wukong_hadoop: true,                description: 'Path to hadoop streaming jar.  Use this for non-standard hadoop installations.'
         | 
| 26 26 |  | 
| 27 27 | 
             
                  # Translate simplified args to their hairy hadoop equivalents
         | 
| 28 28 | 
             
                  settings.define :io_sort_mb,              wukong_hadoop: true, jobconf: true, description: 'io.sort.mb'
         | 
    
        data/lib/wukong-hadoop/runner.rb
    CHANGED
    
    | @@ -111,7 +111,7 @@ EOF | |
| 111 111 | 
             
                      log.info "Launching local!"
         | 
| 112 112 | 
             
                      execute_command!(local_commandline)
         | 
| 113 113 | 
             
                    else
         | 
| 114 | 
            -
                      remove_output_path | 
| 114 | 
            +
                      remove_output_path if settings[:rm] || settings[:overwrite]
         | 
| 115 115 | 
             
                      hadoop_commandline
         | 
| 116 116 | 
             
                      log.info "Launching Hadoop!"
         | 
| 117 117 | 
             
                      execute_command!(hadoop_commandline)
         | 
| @@ -187,23 +187,6 @@ EOF | |
| 187 187 | 
             
                      "--#{param}=#{Shellwords.escape(val.to_s)}"
         | 
| 188 188 | 
             
                    end.join(" ")
         | 
| 189 189 | 
             
                  end
         | 
| 190 | 
            -
             | 
| 191 | 
            -
                  # Execute a command composed of the given parts.
         | 
| 192 | 
            -
                  #
         | 
| 193 | 
            -
                  # Will print the command instead of the <tt>--dry_run</tt>
         | 
| 194 | 
            -
                  # option was given.
         | 
| 195 | 
            -
                  #
         | 
| 196 | 
            -
                  # @param [Array<String>] argv
         | 
| 197 | 
            -
                  def execute_command!(*argv)
         | 
| 198 | 
            -
                    command = argv.flatten.reject(&:blank?).join(" \\\n    ")
         | 
| 199 | 
            -
                    if settings[:dry_run]
         | 
| 200 | 
            -
                      log.info("Dry run:")
         | 
| 201 | 
            -
                      puts command
         | 
| 202 | 
            -
                    else
         | 
| 203 | 
            -
                      puts `#{command}`
         | 
| 204 | 
            -
                      raise Error.new("Command failed!") unless $?.success?
         | 
| 205 | 
            -
                    end
         | 
| 206 | 
            -
                  end
         | 
| 207 190 |  | 
| 208 191 | 
             
                end
         | 
| 209 192 | 
             
              end
         | 
| @@ -10,10 +10,8 @@ module Wukong | |
| 10 10 | 
             
                  #
         | 
| 11 11 | 
             
                  # Will not actually do anything if the <tt>--dry_run</tt> option
         | 
| 12 12 | 
             
                  # is also given.
         | 
| 13 | 
            -
                  def remove_output_path | 
| 14 | 
            -
                     | 
| 15 | 
            -
                    log.info "Removing output file #{output_path}: #{cmd}"
         | 
| 16 | 
            -
                    puts `#{cmd}` unless settings[:dry_run]
         | 
| 13 | 
            +
                  def remove_output_path
         | 
| 14 | 
            +
                    execute_command("#{hadoop_runner} fs -rmr '#{output_path}'")
         | 
| 17 15 | 
             
                  end
         | 
| 18 16 |  | 
| 19 17 | 
             
                  # Return the Hadoop command used to launch this job in a Hadoop
         | 
| @@ -26,11 +24,11 @@ module Wukong | |
| 26 24 | 
             
                  def hadoop_commandline
         | 
| 27 25 | 
             
                    [
         | 
| 28 26 | 
             
                     hadoop_runner,
         | 
| 29 | 
            -
                     "jar #{ | 
| 27 | 
            +
                     "jar #{hadoop_streaming_jar}",
         | 
| 30 28 | 
             
                     hadoop_jobconf_options,
         | 
| 31 29 | 
             
                     "-D mapred.job.name='#{job_name}'",
         | 
| 32 | 
            -
                     hadoop_other_args,
         | 
| 33 30 | 
             
                     hadoop_files,
         | 
| 31 | 
            +
                     hadoop_other_args,
         | 
| 34 32 | 
             
                     "-mapper       '#{mapper_commandline}'",
         | 
| 35 33 | 
             
                     "-reducer      '#{reducer_commandline}'",
         | 
| 36 34 | 
             
                     "-input        '#{input_paths}'",
         | 
| @@ -84,7 +82,17 @@ module Wukong | |
| 84 82 | 
             
                  #
         | 
| 85 83 | 
             
                  # @return [String]
         | 
| 86 84 | 
             
                  def hadoop_runner
         | 
| 87 | 
            -
                    settings[:hadoop_runner] ||  | 
| 85 | 
            +
                    settings[:hadoop_runner] || 'hadoop'
         | 
| 86 | 
            +
                  end
         | 
| 87 | 
            +
             | 
| 88 | 
            +
                  # The path (glob) to the Hadoop streaming jar.
         | 
| 89 | 
            +
                  #
         | 
| 90 | 
            +
                  # Respects the value of <tt>--hadoop_streaming_jar</tt> if
         | 
| 91 | 
            +
                  # given.  Otherwise uses the default CDH4 location.
         | 
| 92 | 
            +
                  #
         | 
| 93 | 
            +
                  # @return [String]
         | 
| 94 | 
            +
                  def hadoop_streaming_jar
         | 
| 95 | 
            +
                    settings[:hadoop_streaming_jar] || '/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh*.jar'
         | 
| 88 96 | 
             
                  end
         | 
| 89 97 |  | 
| 90 98 | 
             
                  # Return an array of jobconf (-D) options that will be passed to Hadoop.
         | 
| @@ -99,10 +107,6 @@ module Wukong | |
| 99 107 | 
             
                    settings[:respect_exit_status] = 'false' if     (settings[:ignore_exit_status] == true)
         | 
| 100 108 | 
             
                    # If no reducer and no reduce_command, then skip the reduce phase
         | 
| 101 109 | 
             
                    settings[:reduce_tasks]      ||= 0       unless reduce?
         | 
| 102 | 
            -
                    # Fields hadoop should use to distribute records to reducers
         | 
| 103 | 
            -
                    unless settings[:partition_fields].blank?
         | 
| 104 | 
            -
                      jobconf_options += [jobconf(:partition_fields), jobconf(:output_field_separator)]
         | 
| 105 | 
            -
                    end
         | 
| 106 110 | 
             
                    jobconf_options += [
         | 
| 107 111 | 
             
                                        :io_sort_mb,               :io_sort_record_percent,
         | 
| 108 112 | 
             
                                        :map_speculative,          :map_tasks,
         | 
    
        data/spec/spec_helper.rb
    CHANGED
    
    
| @@ -69,13 +69,13 @@ describe Wukong::Hadoop::HadoopInvocation do | |
| 69 69 | 
             
              context "removing existing output paths" do
         | 
| 70 70 |  | 
| 71 71 | 
             
                it "will not remove the output path by default" do
         | 
| 72 | 
            -
                  hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path | 
| 72 | 
            +
                  hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') { should_not_receive(:remove_output_path) }
         | 
| 73 73 | 
             
                end
         | 
| 74 74 | 
             
                it "will remove the output path when given the --rm option" do
         | 
| 75 | 
            -
                  hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path | 
| 75 | 
            +
                  hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true) { should_receive(:remove_output_path) }
         | 
| 76 76 | 
             
                end
         | 
| 77 77 | 
             
                it "will not remove the output path when given the --rm option AND the --dry_run option" do
         | 
| 78 | 
            -
                  hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path | 
| 78 | 
            +
                  hadoop_runner('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true) { should_receive(:remove_output_path) }
         | 
| 79 79 | 
             
                end
         | 
| 80 80 | 
             
              end
         | 
| 81 81 |  | 
| @@ -4,7 +4,7 @@ describe Wukong::Hadoop::HadoopRunner do | |
| 4 4 |  | 
| 5 5 | 
             
              context "handling errors" do
         | 
| 6 6 | 
             
                it "raises an error when it can't find a file" do
         | 
| 7 | 
            -
                  expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, / | 
| 7 | 
            +
                  expect { hadoop_runner(examples_dir('processors.rb'), examples_dir('doesnt_exist.rb'), :input => 'foo', :output => 'bar') }.to raise_error(Wukong::Error, /cannot load such file/)
         | 
| 8 8 | 
             
                end
         | 
| 9 9 |  | 
| 10 10 | 
             
                it "raises an error in Hadoop mode when called without input and output paths" do
         | 
    
        data/wukong-hadoop.gemspec
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: wukong-hadoop
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.2.0
         | 
| 5 5 | 
             
              prerelease: 
         | 
| 6 6 | 
             
            platform: ruby
         | 
| 7 7 | 
             
            authors:
         | 
| @@ -11,7 +11,7 @@ authors: | |
| 11 11 | 
             
            autorequire: 
         | 
| 12 12 | 
             
            bindir: bin
         | 
| 13 13 | 
             
            cert_chain: []
         | 
| 14 | 
            -
            date:  | 
| 14 | 
            +
            date: 2014-03-19 00:00:00.000000000 Z
         | 
| 15 15 | 
             
            dependencies:
         | 
| 16 16 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 17 17 | 
             
              name: wukong
         | 
| @@ -20,7 +20,7 @@ dependencies: | |
| 20 20 | 
             
                requirements:
         | 
| 21 21 | 
             
                - - '='
         | 
| 22 22 | 
             
                  - !ruby/object:Gem::Version
         | 
| 23 | 
            -
                    version:  | 
| 23 | 
            +
                    version: 4.0.0
         | 
| 24 24 | 
             
              type: :runtime
         | 
| 25 25 | 
             
              prerelease: false
         | 
| 26 26 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| @@ -28,7 +28,7 @@ dependencies: | |
| 28 28 | 
             
                requirements:
         | 
| 29 29 | 
             
                - - '='
         | 
| 30 30 | 
             
                  - !ruby/object:Gem::Version
         | 
| 31 | 
            -
                    version:  | 
| 31 | 
            +
                    version: 4.0.0
         | 
| 32 32 | 
             
            description: ! "  Treat your dataset like a:\n\n      * stream of lines when it's
         | 
| 33 33 | 
             
              efficient to process by lines\n      * stream of field arrays when it's efficient
         | 
| 34 34 | 
             
              to deal directly with fields\n      * stream of lightweight objects when it's efficient
         | 
| @@ -113,7 +113,7 @@ required_ruby_version: !ruby/object:Gem::Requirement | |
| 113 113 | 
             
                  version: '0'
         | 
| 114 114 | 
             
                  segments:
         | 
| 115 115 | 
             
                  - 0
         | 
| 116 | 
            -
                  hash: - | 
| 116 | 
            +
                  hash: -3970581689600624425
         | 
| 117 117 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 118 118 | 
             
              none: false
         | 
| 119 119 | 
             
              requirements:
         | 
| @@ -122,10 +122,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 122 122 | 
             
                  version: '0'
         | 
| 123 123 | 
             
                  segments:
         | 
| 124 124 | 
             
                  - 0
         | 
| 125 | 
            -
                  hash: - | 
| 125 | 
            +
                  hash: -3970581689600624425
         | 
| 126 126 | 
             
            requirements: []
         | 
| 127 127 | 
             
            rubyforge_project: 
         | 
| 128 | 
            -
            rubygems_version: 1.8. | 
| 128 | 
            +
            rubygems_version: 1.8.23
         | 
| 129 129 | 
             
            signing_key: 
         | 
| 130 130 | 
             
            specification_version: 3
         | 
| 131 131 | 
             
            summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
         |