RubyGems - wukong-hadoop - Versions diffs - 0.0.1 - Mend

wukong-hadoop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

data/.gitignore +59 -0
data/.rspec +2 -0
data/Gemfile +3 -0
data/README.md +339 -0
data/Rakefile +13 -0
data/bin/hdp-bin +44 -0
data/bin/hdp-bzip +23 -0
data/bin/hdp-cat +3 -0
data/bin/hdp-catd +3 -0
data/bin/hdp-cp +3 -0
data/bin/hdp-du +86 -0
data/bin/hdp-get +3 -0
data/bin/hdp-kill +3 -0
data/bin/hdp-kill-task +3 -0
data/bin/hdp-ls +11 -0
data/bin/hdp-mkdir +2 -0
data/bin/hdp-mkdirp +12 -0
data/bin/hdp-mv +3 -0
data/bin/hdp-parts_to_keys.rb +77 -0
data/bin/hdp-ps +3 -0
data/bin/hdp-put +3 -0
data/bin/hdp-rm +32 -0
data/bin/hdp-sort +40 -0
data/bin/hdp-stream +40 -0
data/bin/hdp-stream-flat +22 -0
data/bin/hdp-stream2 +39 -0
data/bin/hdp-sync +17 -0
data/bin/hdp-wc +67 -0
data/bin/wu-hadoop +14 -0
data/examples/counter.rb +17 -0
data/examples/map_only.rb +28 -0
data/examples/processors.rb +4 -0
data/examples/sonnet_18.txt +14 -0
data/examples/tokenizer.rb +28 -0
data/examples/word_count.rb +44 -0
data/features/step_definitions/wu_hadoop_steps.rb +4 -0
data/features/support/env.rb +1 -0
data/features/wu_hadoop.feature +113 -0
data/lib/wukong-hadoop.rb +21 -0
data/lib/wukong-hadoop/configuration.rb +133 -0
data/lib/wukong-hadoop/driver.rb +190 -0
data/lib/wukong-hadoop/driver/hadoop_invocation.rb +184 -0
data/lib/wukong-hadoop/driver/inputs_and_outputs.rb +27 -0
data/lib/wukong-hadoop/driver/local_invocation.rb +48 -0
data/lib/wukong-hadoop/driver/map_logic.rb +104 -0
data/lib/wukong-hadoop/driver/reduce_logic.rb +129 -0
data/lib/wukong-hadoop/extensions.rb +2 -0
data/lib/wukong-hadoop/hadoop_env_methods.rb +80 -0
data/lib/wukong-hadoop/version.rb +6 -0
data/spec/spec_helper.rb +21 -0
data/spec/support/driver_helper.rb +15 -0
data/spec/support/integration_helper.rb +39 -0
data/spec/wukong-hadoop/driver_spec.rb +117 -0
data/spec/wukong-hadoop/hadoop_env_methods_spec.rb +14 -0
data/spec/wukong-hadoop/hadoop_mode_spec.rb +78 -0
data/spec/wukong-hadoop/local_mode_spec.rb +22 -0
data/spec/wukong-hadoop/wu_hadoop_spec.rb +34 -0
data/wukong-hadoop.gemspec +33 -0
metadata +168 -0

data/spec/wukong-hadoop/hadoop_env_methods_spec.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require 'spec_helper'
+describe Wukong::Hadoop::EnvMethods do
+  subject{ Wukong::Processor.new }
+  it{ should respond_to(:input_file)              }
+  it{ should respond_to(:input_dir)               }
+  it{ should respond_to(:map_input_start_offset)  }
+  it{ should respond_to(:map_input_length)        }
+  it{ should respond_to(:attempt_id)              }
+  it{ should respond_to(:curr_task_id)            }
+end

data/spec/wukong-hadoop/hadoop_mode_spec.rb ADDED Viewed

@@ -0,0 +1,78 @@
+require 'spec_helper'
+# Specs for the Hadoop streaming command line and for output-path removal.
+# NOTE(review): the `driver` helper is defined outside this file (presumably in
+# spec/support/driver_helper.rb -- confirm). From its use here, the leading
+# strings look like the mapper and optional reducer names, followed by a hash
+# of settings.
+describe Wukong::Hadoop::HadoopInvocation do
+  # Shared driver fixtures, from the simplest job to one with extra options.
+  let(:map_only)   { driver('regexp',          input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
+  let(:map_reduce) { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output') }
+  let(:complex)    { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', map_tasks: '100', job_name: 'testy', java_opts: ['-D foo.bar=3 -D baz.booz=hello', '-D hi.there=bye']) }
+  let(:custom_io)  { driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', input_format: 'com.example.InputFormat', output_format: 'com.example.OutputFormat') }
+  context "defining input paths" do
+    it "raises an error unless given an --input option" do
+      lambda { driver('regexp', output: '/tmp/output').run! }.should raise_error(Wukong::Error, /--input.*required/)
+    end
+    it "sets its input paths correctly" do
+      map_reduce.hadoop_commandline.should match(%r{-input\s+'/tmp/input1,/tmp/input2'})
+    end
+    it "sets its input format given the --input_format option" do
+      # Dots are escaped so the class name is matched literally.
+      custom_io.hadoop_commandline.should match(%r{-inputformat\s+'com\.example\.InputFormat'})
+    end
+  end
+  context "defining its output path" do
+    it "raises an error unless given an --output option" do
+      lambda { driver('regexp', input: '/tmp/output').run! }.should raise_error(Wukong::Error, /--output.*required/)
+    end
+    it "sets its output path correctly" do
+      map_reduce.hadoop_commandline.should match(%r{-output\s+'/tmp/output'})
+    end
+    it "sets its output format given the --output_format option" do
+      # Dots are escaped so the class name is matched literally.
+      custom_io.hadoop_commandline.should match(%r{-outputformat\s+'com\.example\.OutputFormat'})
+    end
+  end
+  context "defining its mapper and reducer" do
+    it "sets its mapper correctly" do
+      map_reduce.hadoop_commandline.should match(%r{-mapper\s+'wu-local regexp'})
+    end
+    it "sets its reducer correctly" do
+      map_reduce.hadoop_commandline.should match(%r{-reducer\s+'wu-local count'})
+    end
+    it "uses a blank reducer for a map-only job" do
+      map_only.hadoop_commandline.should match(%r{-reducer\s+''})
+    end
+  end
+  context "defining Hadoop JobConf options" do
+    it "translates friendly names into native ones" do
+      complex.hadoop_commandline.should include("-D mapred.job.name='testy'")
+      complex.hadoop_commandline.should include("-D mapred.map.tasks=100")
+    end
+    it "passes options in the given --java_opts option" do
+      complex.hadoop_commandline.should include('-D foo.bar=3','-D baz.booz=hello','-D hi.there=bye')
+    end
+  end
+  context "removing existing output paths" do
+    # Silence informational logging while run! is exercised.
+    before { Log.stub!(:info) }
+    it "will not remove the output path by default" do
+      map_reduce.should_not_receive(:remove_output_path!)
+      map_reduce.should_receive(:execute_command!)
+      map_reduce.run!
+    end
+    it "will remove the output path when given the --rm option" do
+      d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true)
+      d.should_receive(:remove_output_path!)
+      d.should_receive(:execute_command!)
+      d.run!
+    end
+    it "will not remove the output path when given the --rm option AND the --dry_run option" do
+      d = driver('regexp', 'count', input: '/tmp/input1,/tmp/input2', output: '/tmp/output', rm: true, dry_run: true)
+      # The expectation now matches the example's description: with --dry_run
+      # the removal step must be skipped even though --rm was given.
+      d.should_not_receive(:remove_output_path!)
+      # NOTE(review): execute_command! is still expected under --dry_run, as
+      # before -- confirm against the driver implementation.
+      d.should_receive(:execute_command!)
+      d.run!
+    end
+  end
+end

data/spec/wukong-hadoop/local_mode_spec.rb ADDED Viewed

@@ -0,0 +1,22 @@
+require 'spec_helper'
+describe Wukong::Hadoop::LocalInvocation do
+  it "reads from STDIN and writes to STDOUT by default" do
+    driver('regexp').local_commandline.should == 'wu-local regexp'
+  end
+  it "reads from from multiple input paths given the --input option" do
+    driver('regexp', :input => '/some/file.tsv,something_else.dat').local_commandline.should == 'cat /some/file.tsv something_else.dat | wu-local regexp'
+  end
+  it "writes to a file given the --output option" do
+    driver('regexp', :output => '/tmp/output.json').local_commandline.should == 'wu-local regexp > /tmp/output.json'
+  end
+  it "will not perform a sort on a map-only job" do
+    driver('regexp').local_commandline.should_not include('sort')
+  end
+  it "will perform a sort on a map-reduce job" do
+    driver('regexp', 'count').local_commandline.should == 'wu-local regexp | sort | wu-local count'
+  end
+  it "will accept a custom sort command" do
+    driver('regexp', 'count', :sort_command => 'sort -n').local_commandline.should == 'wu-local regexp | sort -n | wu-local count'
+  end
+end

data/spec/wukong-hadoop/wu_hadoop_spec.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'spec_helper'
+describe 'wu-hadoop' do
+  context "without any arguments" do
+    let(:subject) { command('wu-hadoop') }
+    it {should exit_with(:non_zero) }
+    it "displays help on STDERR" do
+      should have_stderr("usage: wu-hadoop")
+    end
+  end
+  context "in local mode" do
+    context "on a map-only job" do
+      let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
+      it { should exit_with(0) }
+      it { should have_stdout('Shall', 'I', 'compare', 'thee', 'to', 'a', "summer's", 'day') }
+    end
+    context "on a map-reduce job" do
+      let(:subject) { command('wu-hadoop', example_script('word_count.rb'), "--mode=local", "--input=#{example_script('sonnet_18.txt')}") }
+      it { should exit_with(0) }
+      it { should have_stdout(/complexion\s+1/, /Death\s+1/, /temperate\s+1/) }
+    end
+  end
+  context "in Hadoop mode" do
+    context "on a map-only job" do
+      let(:subject) { command('wu-hadoop', example_script('tokenizer.rb'), "--mode=hadoop", "--input=/data/in", "--output=/data/out", "--dry_run") }
+      it { should exit_with(0) }
+      it { should have_stdout(%r{jar.*hadoop.*streaming.*\.jar}, %r{-mapper.+tokenizer\.rb}, %r{-input.*/data/in}, %r{-output.*/data/out}) }
+    end
+  end
+end

data/wukong-hadoop.gemspec ADDED Viewed

@@ -0,0 +1,33 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/wukong-hadoop/version', __FILE__)
+Gem::Specification.new do |gem|
+  gem.name        = 'wukong-hadoop'
+  gem.homepage    = 'https://github.com/infochimps-labs/wukong-hadoop'
+  gem.licenses    = ["Apache 2.0"]
+  gem.email       = 'coders@infochimps.org'
+  gem.authors     = ['Infochimps', 'Philip (flip) Kromer', 'Travis Dempsey']
+  gem.version     = Wukong::Hadoop::VERSION
+  gem.summary     = 'Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use it, yet handles terabyte-scale computation with ease.'
+  gem.description = <<-EOF
+  Treat your dataset like a:
+      * stream of lines when it's efficient to process by lines
+      * stream of field arrays when it's efficient to deal directly with fields
+      * stream of lightweight objects when it's efficient to deal with objects
+  Wukong is friends with Hadoop the elephant, Pig the query language, and the cat on your command line.
+EOF
+  gem.files         = `git ls-files`.split("\n")
+  gem.executables   = ['wu-hadoop']
+  gem.test_files    = gem.files.grep(/^spec/)
+  gem.require_paths = ['lib']
+  gem.add_dependency('wukong',      '3.0.0.pre2')
+  gem.add_development_dependency 'rake',     '~> 0.9'
+  gem.add_development_dependency 'rspec',    '~> 2'
+end

metadata ADDED Viewed

@@ -0,0 +1,168 @@
+--- !ruby/object:Gem::Specification
+name: wukong-hadoop
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- Infochimps
+- Philip (flip) Kromer
+- Travis Dempsey
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-12-01 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: wukong
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 3.0.0.pre2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - '='
+      - !ruby/object:Gem::Version
+        version: 3.0.0.pre2
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.9'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '0.9'
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '2'
+description: ! "  Treat your dataset like a:\n\n      * stream of lines when it's
+  efficient to process by lines\n      * stream of field arrays when it's efficient
+  to deal directly with fields\n      * stream of lightweight objects when it's efficient
+  to deal with objects\n\n  Wukong is friends with Hadoop the elephant, Pig the query
+  language, and the cat on your command line.\n"
+email: coders@infochimps.org
+executables:
+- wu-hadoop
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- .rspec
+- Gemfile
+- README.md
+- Rakefile
+- bin/hdp-bin
+- bin/hdp-bzip
+- bin/hdp-cat
+- bin/hdp-catd
+- bin/hdp-cp
+- bin/hdp-du
+- bin/hdp-get
+- bin/hdp-kill
+- bin/hdp-kill-task
+- bin/hdp-ls
+- bin/hdp-mkdir
+- bin/hdp-mkdirp
+- bin/hdp-mv
+- bin/hdp-parts_to_keys.rb
+- bin/hdp-ps
+- bin/hdp-put
+- bin/hdp-rm
+- bin/hdp-sort
+- bin/hdp-stream
+- bin/hdp-stream-flat
+- bin/hdp-stream2
+- bin/hdp-sync
+- bin/hdp-wc
+- bin/wu-hadoop
+- examples/counter.rb
+- examples/map_only.rb
+- examples/processors.rb
+- examples/sonnet_18.txt
+- examples/tokenizer.rb
+- examples/word_count.rb
+- features/step_definitions/wu_hadoop_steps.rb
+- features/support/env.rb
+- features/wu_hadoop.feature
+- lib/wukong-hadoop.rb
+- lib/wukong-hadoop/configuration.rb
+- lib/wukong-hadoop/driver.rb
+- lib/wukong-hadoop/driver/hadoop_invocation.rb
+- lib/wukong-hadoop/driver/inputs_and_outputs.rb
+- lib/wukong-hadoop/driver/local_invocation.rb
+- lib/wukong-hadoop/driver/map_logic.rb
+- lib/wukong-hadoop/driver/reduce_logic.rb
+- lib/wukong-hadoop/extensions.rb
+- lib/wukong-hadoop/hadoop_env_methods.rb
+- lib/wukong-hadoop/version.rb
+- spec/spec_helper.rb
+- spec/support/driver_helper.rb
+- spec/support/integration_helper.rb
+- spec/wukong-hadoop/driver_spec.rb
+- spec/wukong-hadoop/hadoop_env_methods_spec.rb
+- spec/wukong-hadoop/hadoop_mode_spec.rb
+- spec/wukong-hadoop/local_mode_spec.rb
+- spec/wukong-hadoop/wu_hadoop_spec.rb
+- wukong-hadoop.gemspec
+homepage: https://github.com/infochimps-labs/wukong-hadoop
+licenses:
+- Apache 2.0
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: Hadoop Streaming for Ruby. Wukong makes Hadoop so easy a chimpanzee can use
+  it, yet handles terabyte-scale computation with ease.
+test_files:
+- spec/spec_helper.rb
+- spec/support/driver_helper.rb
+- spec/support/integration_helper.rb
+- spec/wukong-hadoop/driver_spec.rb
+- spec/wukong-hadoop/hadoop_env_methods_spec.rb
+- spec/wukong-hadoop/hadoop_mode_spec.rb
+- spec/wukong-hadoop/local_mode_spec.rb
+- spec/wukong-hadoop/wu_hadoop_spec.rb
+has_rdoc: