RubyGems - wonderdog - Versions diffs - 0.0.2 → 0.1.0 - Mend

wonderdog 0.0.2 → 0.1.0

Files changed (20) hide show

data/.gitignore +31 -29
data/.yardopts +6 -0
data/Gemfile +2 -0
data/examples/no_wonderdog.rb +2 -0
data/examples/wonderdog.rb +4 -0
data/lib/wonderdog.rb +29 -1
data/lib/wonderdog/hadoop_invocation_override.rb +3 -3
data/lib/wonderdog/index_and_mapping.rb +1 -1
data/lib/wonderdog/version.rb +1 -1
data/spec/spec_helper.rb +14 -7
data/spec/support/.gitkeep +0 -0
data/spec/wonderdog/hadoop_invocation_override_spec.rb +24 -8
data/spec/wonderdog/wu-hadoop_spec.rb +18 -0
data/spec/wonderdog_spec.rb +5 -0
data/wonderdog.gemspec +1 -2
metadata +14 -27
data/Gemfile.lock +0 -57
data/lib/wonderdog/configuration.rb +0 -26
data/spec/support/driver_helper.rb +0 -15
data/spec/support/integration_helper.rb +0 -30

data/.gitignore CHANGED

@@ -1,16 +1,11 @@
-\#*
-.\#*
-*~
-.DS_Store
-Icon?
-REVISION
-TAGS*
-nohup.out
-.bzr
-.hg
-.svn
-a.out
+*.log
 *.o
 *.pyc
 *.so
@@ -18,32 +13,39 @@ a.out
 *.sw?
 *.tmproj
 *_flymake.*
+*private*
+*~
+.DS_Store
+.\#*
+.bzr
+.hg
 .project
 .pydevproject
 .settings
+.svn
 .tasks-cache
 .yardoc
-*.log
-*private*
-/log/*
-/pkg/*
-/tmp/*
-/coverage
-/db/*.sqlite3
+/Gemfile.lock
+/config/apeyeye.yaml
 /config/database.yml
 /config/private.yml
+/config/routes.rb
 /config/settings.yml
 /config/sphinx.yml
+/coverage
+/db/*.sqlite3
+/log/*
+/pkg/*
 /public/stylesheets/compiled/*
-/webrat.log
+/target
+/tmp/*
 /vendor/webrat/vendor
+/webrat.log
+Gemfile.lock
+Icon?
+REVISION
+TAGS*
+\#*
+a.out
 doc
-/config/apeyeye.yaml
-/config/routes.rb
-/target
+nohup.out

data/.yardopts ADDED

@@ -0,0 +1,6 @@
+--readme   README.md
+--markup   markdown
+-
+CHANGELOG.md
+LICENSE.md
+README.md

data/Gemfile CHANGED

@@ -5,4 +5,6 @@ gemspec
 group :development do
   gem 'rake',     '~> 0.9'
   gem 'rspec',    '~> 2'
+  gem 'yard'
+  gem 'redcarpet'
 end

data/examples/no_wonderdog.rb ADDED

@@ -0,0 +1,2 @@
+Wukong.dataflow(:mapper) { identity }
+Wukong.dataflow(:reducer) { identity }

data/examples/wonderdog.rb ADDED

@@ -0,0 +1,4 @@
+require 'wonderdog'
+Wukong.dataflow(:mapper)  { identity }
+Wukong.dataflow(:reducer) { identity }

data/lib/wonderdog.rb CHANGED

@@ -6,9 +6,37 @@ module Wukong
   # Wukong.  This module adds some overrides which enables the
   # <tt>wu-hadoop</tt> program to leverage this code.
   module Elasticsearch
+    include Plugin
+    # Configure the given `settings` to be able to work with
+    # Elasticsearch.
+    #
+    # @param [Configliere::Param] settings
+    # @return [Configliere::Param] the newly configured settings
+    def self.configure settings, program
+      return unless program == 'wu-hadoop'
+      settings.define(:es_tmp_dir,        :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
+      settings.define(:es_lib_dir,        :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
+      settings.define(:es_config,         :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
+      settings.define(:es_input_splits,   :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
+      settings.define(:es_request_size,   :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
+      settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
+      settings.define(:es_index_field,    :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
+      settings.define(:es_mapping_field,  :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
+      settings.define(:es_id_field,       :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
+      settings.define(:es_bulk_size,      :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
+      settings.define(:es_query,          :description => "Query to use when defining input splits for ElasticSearch input",    :wukong_hadoop => true)
+    end
+    # Boot Wonderdog with the given `settings` in the given `dir`.
+    #
+    # @param [Configliere::Param] settings
+    # @param [String] root
+    def self.boot settings, root
+    end
   end
 end
-require 'wonderdog/configuration'
 require 'wonderdog/hadoop_invocation_override'
 require 'wonderdog/timestamp'

data/lib/wonderdog/hadoop_invocation_override.rb CHANGED

@@ -99,8 +99,8 @@ module Wukong
       # @return [Array<String>]
       def hadoop_jobconf_options
         if reads_from_elasticsearch? || writes_to_elasticsearch?
-          settings[:map_speculative]    = false if settings[:map_speculative].nil?
-          settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
+          settings[:map_speculative]    = 'false' if settings[:map_speculative].nil?
+          settings[:reduce_speculative] = 'false' if settings[:reduce_speculative].nil?
         end
         super() + [].tap do |o|
@@ -164,5 +164,5 @@ module Wukong
     end
   end
-  Hadoop::Driver.class_eval { include Elasticsearch::HadoopInvocationOverride }
+  Hadoop::HadoopRunner.class_eval { include Elasticsearch::HadoopInvocationOverride }
 end

data/lib/wonderdog/index_and_mapping.rb CHANGED

@@ -26,7 +26,7 @@ module Wukong
       # @param [String]
       attr_reader :mapping
-      # Does the given +string+ look like a possible Elasticsearch
+      # Does the given `string` look like a possible Elasticsearch
       # /index/mapping specification?
       #
       # @param [String] string

data/lib/wonderdog/version.rb CHANGED

@@ -1,4 +1,4 @@
 module Wonderdog
   # The currently running Wonderdog version
-  VERSION = '0.0.2'
+  VERSION = '0.1.0'
 end

data/spec/spec_helper.rb CHANGED

@@ -1,12 +1,10 @@
 require 'wonderdog'
 require 'wukong/spec_helpers'
-require_relative('support/integration_helper')
-require_relative('support/driver_helper')
 RSpec.configure do |config|
   config.before(:each) do
+    Wukong::Log.level = Log4r::OFF
     @orig_reg = Wukong.registry.show
   end
@@ -14,9 +12,18 @@ RSpec.configure do |config|
     Wukong.registry.clear!
     Wukong.registry.merge!(@orig_reg)
   end
   include Wukong::SpecHelpers
-  include Wukong::Elasticsearch::IntegrationHelper
-  include Wukong::Elasticsearch::DriverHelper
-end
+  def root
+    @root ||= Pathname.new(File.expand_path('../..', __FILE__))
+  end
+  def hadoop_runner *args, &block
+    runner(Wukong::Hadoop::HadoopRunner, 'wu-hadoop', *args) do
+      stub!(:execute_command!)
+      instance_eval(&block) if block_given?
+    end
+  end
+end

data/spec/support/.gitkeep ADDED

File without changes

data/spec/wonderdog/hadoop_invocation_override_spec.rb CHANGED

@@ -2,10 +2,10 @@ require 'spec_helper'
 describe Wukong::Elasticsearch::HadoopInvocationOverride do
-  let(:no_es)      { driver('regexp',  'count', input: '/tmp/input_file',        output: '/tmp/output_file')         }
-  let(:es_reader)  { driver('regexp',  'count', input: 'es://the_index/the_map', output: '/tmp/output_file')         }
-  let(:es_writer)  { driver('regexp',  'count', input: '/tmp/input_file',        output: 'es:///the_index/the_map')  }
-  let(:es_complex) { driver('regexp',  'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
+  let(:no_es)      { hadoop_runner('regexp',  'count', input: '/tmp/input_file',        output: '/tmp/output_file')         }
+  let(:es_reader)  { hadoop_runner('regexp',  'count', input: 'es://the_index/the_map', output: '/tmp/output_file')         }
+  let(:es_writer)  { hadoop_runner('regexp',  'count', input: '/tmp/input_file',        output: 'es:///the_index/the_map')  }
+  let(:es_complex) { hadoop_runner('regexp',  'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID', map_speculative: true, reduce_speculative: true) }
   context "passing necessary jars to Hadoop streaming" do
     before  { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
@@ -36,16 +36,32 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
   context "setting speculative execution" do
     context "when not given speculative options" do
       context "and not interacting with Elasticsearch" do
-        it "doesn't add jars" do
+        it "doesn't add any speculative options" do
           no_es.hadoop_commandline.should_not match('speculative')
         end
       end
       context "and reading from Elasticsearch" do
-        it "adds default jars it finds on the local filesystem" do
-          es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
-          es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
+        it "disables speculative execution in the mapper" do
+          es_reader.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*false/)
+        end
+        it "disables speculative execution in the reducer" do
+          es_reader.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*false/)
         end
       end
+      context "and reading from Elasticsearch" do
+        it "disables speculative execution in the mapper" do
+          es_writer.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*false/)
+        end
+        it "disables speculative execution in the reducer" do
+          es_writer.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*false/)
+        end
+      end
+    end
+    context "when given speculative options" do
+      it "does not change them" do
+        es_complex.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*true/)
+        es_complex.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*true/)
+      end
     end
   end

data/spec/wonderdog/wu-hadoop_spec.rb ADDED

@@ -0,0 +1,18 @@
+require 'spec_helper'
+describe 'wu-hadoop' do
+  context "when wonderdog hasn't been required" do
+    let(:script) { examples_dir('no_wonderdog.rb') }
+    it "doesn't recognize Elasticsearch URIs" do
+      command('wu-hadoop', script, '--input=es://foo/bar', '--output=/some/path', '--dry_run').should_not have_stdout('elasticsearch')
+    end
+  end
+  context "when wonderdog hasn't been required" do
+    let(:script) { examples_dir('wonderdog.rb') }
+    it "recognizes Elasticsearch URIs" do
+      command('wu-hadoop', script, '--input=es://foo/bar', '--output=/some/path', '--dry_run').should have_stdout('elasticsearch')
+    end
+  end
+end

data/spec/wonderdog_spec.rb ADDED

@@ -0,0 +1,5 @@
+require 'spec_helper'
+describe Wukong::Elasticsearch do
+  it_behaves_like 'a plugin'
+end

data/wonderdog.gemspec CHANGED

@@ -28,6 +28,5 @@ EOF
   gem.test_files    = gem.files.grep(/^spec/)
   gem.require_paths = ['lib']
-  gem.add_dependency('wukong',        '3.0.0.pre3')
-  gem.add_dependency('wukong-hadoop', '>= 0.0.2')
+  gem.add_dependency('wukong-hadoop', '0.1.0')
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wonderdog
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.1.0
   prerelease:
 platform: ruby
 authors:
@@ -13,16 +13,16 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-17 00:00:00.000000000 Z
+date: 2013-02-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: wukong
+  name: wukong-hadoop
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre3
+        version: 0.1.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -30,23 +30,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre3
-- !ruby/object:Gem::Dependency
-  name: wukong-hadoop
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: 0.0.2
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: 0.0.2
+        version: 0.1.0
 description: ! "  Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
   \ a more fully-fledged member of both the Hadoop and Wukong\n  ecosystems.\n\n  For
   the Java side, Wonderdog provides InputFormat and OutputFormat\n  classes for use
@@ -60,9 +44,9 @@ extra_rdoc_files: []
 files:
 - .gitignore
 - .rspec
+- .yardopts
 - CHANGELOG.md
 - Gemfile
-- Gemfile.lock
 - LICENSE.md
 - README.md
 - Rakefile
@@ -75,8 +59,9 @@ files:
 - config/more_settings.yml
 - config/run_elasticsearch-2.sh
 - config/ufo_config.json
+- examples/no_wonderdog.rb
+- examples/wonderdog.rb
 - lib/wonderdog.rb
-- lib/wonderdog/configuration.rb
 - lib/wonderdog/hadoop_invocation_override.rb
 - lib/wonderdog/index_and_mapping.rb
 - lib/wonderdog/timestamp.rb
@@ -89,10 +74,11 @@ files:
 - notes/pigstorefunc.pig
 - pom.xml
 - spec/spec_helper.rb
-- spec/support/driver_helper.rb
-- spec/support/integration_helper.rb
+- spec/support/.gitkeep
 - spec/wonderdog/hadoop_invocation_override_spec.rb
 - spec/wonderdog/index_and_type_spec.rb
+- spec/wonderdog/wu-hadoop_spec.rb
+- spec/wonderdog_spec.rb
 - src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
 - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
 - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
@@ -141,8 +127,9 @@ specification_version: 3
 summary: Make Hadoop and ElasticSearch play together nicely.
 test_files:
 - spec/spec_helper.rb
-- spec/support/driver_helper.rb
-- spec/support/integration_helper.rb
+- spec/support/.gitkeep
 - spec/wonderdog/hadoop_invocation_override_spec.rb
 - spec/wonderdog/index_and_type_spec.rb
+- spec/wonderdog/wu-hadoop_spec.rb
+- spec/wonderdog_spec.rb
 has_rdoc:

data/Gemfile.lock DELETED

@@ -1,57 +0,0 @@
-PATH
-  remote: .
-  specs:
-    wonderdog (0.0.1)
-      wukong (= 3.0.0.pre3)
-      wukong-hadoop (>= 0.0.2)
-GEM
-  remote: http://rubygems.org/
-  specs:
-    configliere (0.4.18)
-      highline (>= 1.5.2)
-      multi_json (>= 1.1)
-    diff-lcs (1.1.3)
-    eventmachine (1.0.0)
-    forgery (0.5.0)
-    gorillib (0.4.2)
-      configliere (>= 0.4.13)
-      json
-      multi_json (>= 1.1)
-    highline (1.6.15)
-    json (1.7.5)
-    log4r (1.1.10)
-    multi_json (1.5.0)
-    rake (0.9.6)
-    rspec (2.12.0)
-      rspec-core (~> 2.12.0)
-      rspec-expectations (~> 2.12.0)
-      rspec-mocks (~> 2.12.0)
-    rspec-core (2.12.2)
-    rspec-expectations (2.12.1)
-      diff-lcs (~> 1.1.3)
-    rspec-mocks (2.12.0)
-    uuidtools (2.1.3)
-    vayacondios-client (0.1.2)
-      configliere (>= 0.4.16)
-      gorillib (~> 0.4.2)
-      multi_json (~> 1.1)
-    wukong (3.0.0.pre3)
-      configliere (>= 0.4.18)
-      eventmachine
-      forgery
-      gorillib (>= 0.4.2)
-      log4r
-      multi_json (>= 1.3.6)
-      uuidtools
-      vayacondios-client (>= 0.1.2)
-    wukong-hadoop (0.0.2)
-      wukong (= 3.0.0.pre3)
-PLATFORMS
-  ruby
-DEPENDENCIES
-  rake (~> 0.9)
-  rspec (~> 2)
-  wonderdog!

data/lib/wonderdog/configuration.rb DELETED

@@ -1,26 +0,0 @@
-module Wukong
-  module Elasticsearch
-    # Configure the given +settings+ to be able to work with
-    # Elasticsearch.
-    #
-    # @param [Configliere::Param] settings
-    # @return [Configliere::Param] the newly configured settings
-    def self.configure settings
-      settings.define(:es_tmp_dir,        :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
-      settings.define(:es_lib_dir,        :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
-      settings.define(:es_config,         :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
-      settings.define(:es_input_splits,   :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
-      settings.define(:es_request_size,   :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
-      settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
-      settings.define(:es_index_field,    :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
-      settings.define(:es_mapping_field,  :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
-      settings.define(:es_id_field,       :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
-      settings.define(:es_bulk_size,      :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
-      settings.define(:es_query,          :description => "Query to use when defining input splits for ElasticSearch input",    :wukong_hadoop => true)
-      settings
-    end
-  end
-end

data/spec/support/driver_helper.rb DELETED

@@ -1,15 +0,0 @@
-module Wukong
-  module Elasticsearch
-    module DriverHelper
-      def driver *args
-        params   = Elasticsearch.configure(Hadoop.configure(Configliere::Param.new))
-        params.resolve!
-        params.merge!(args.pop) if args.last.is_a?(Hash)
-        Hadoop::Driver.new(params, *args)
-      end
-    end
-  end
-end

data/spec/support/integration_helper.rb DELETED

@@ -1,30 +0,0 @@
-module Wukong
-  module Elasticsearch
-    module IntegrationHelper
-      def root
-        @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
-      end
-      def lib_dir
-        root.join('lib')
-      end
-      def bin_dir
-        root.join('bin')
-      end
-      def integration_env
-        {
-          "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
-        }
-      end
-      def integration_cwd
-        root.to_s
-      end
-    end
-  end
-end