RubyGems - wonderdog - Versions diffs - 0.0.1 → 0.0.2 - Mend

wonderdog 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

data/Gemfile +8 -0
data/Gemfile.lock +57 -0
data/lib/wonderdog/configuration.rb +1 -0
data/lib/wonderdog/hadoop_invocation_override.rb +31 -2
data/lib/wonderdog/version.rb +2 -1
data/spec/wonderdog/hadoop_invocation_override_spec.rb +104 -59
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +1 -1
data/wonderdog.gemspec +2 -1
metadata +22 -4

data/Gemfile ADDED

@@ -0,0 +1,8 @@
+source :rubygems
+gemspec
+group :development do
+  gem 'rake',     '~> 0.9'
+  gem 'rspec',    '~> 2'
+end

data/Gemfile.lock ADDED

@@ -0,0 +1,57 @@
+PATH
+  remote: .
+  specs:
+    wonderdog (0.0.1)
+      wukong (= 3.0.0.pre3)
+      wukong-hadoop (>= 0.0.2)
+GEM
+  remote: http://rubygems.org/
+  specs:
+    configliere (0.4.18)
+      highline (>= 1.5.2)
+      multi_json (>= 1.1)
+    diff-lcs (1.1.3)
+    eventmachine (1.0.0)
+    forgery (0.5.0)
+    gorillib (0.4.2)
+      configliere (>= 0.4.13)
+      json
+      multi_json (>= 1.1)
+    highline (1.6.15)
+    json (1.7.5)
+    log4r (1.1.10)
+    multi_json (1.5.0)
+    rake (0.9.6)
+    rspec (2.12.0)
+      rspec-core (~> 2.12.0)
+      rspec-expectations (~> 2.12.0)
+      rspec-mocks (~> 2.12.0)
+    rspec-core (2.12.2)
+    rspec-expectations (2.12.1)
+      diff-lcs (~> 1.1.3)
+    rspec-mocks (2.12.0)
+    uuidtools (2.1.3)
+    vayacondios-client (0.1.2)
+      configliere (>= 0.4.16)
+      gorillib (~> 0.4.2)
+      multi_json (~> 1.1)
+    wukong (3.0.0.pre3)
+      configliere (>= 0.4.18)
+      eventmachine
+      forgery
+      gorillib (>= 0.4.2)
+      log4r
+      multi_json (>= 1.3.6)
+      uuidtools
+      vayacondios-client (>= 0.1.2)
+    wukong-hadoop (0.0.2)
+      wukong (= 3.0.0.pre3)
+PLATFORMS
+  ruby
+DEPENDENCIES
+  rake (~> 0.9)
+  rspec (~> 2)
+  wonderdog!

data/lib/wonderdog/configuration.rb CHANGED

@@ -8,6 +8,7 @@ module Wukong
     # @return [Configliere::Param] the newly configured settings
     def self.configure settings
       settings.define(:es_tmp_dir,        :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
+      settings.define(:es_lib_dir,        :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
       settings.define(:es_config,         :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
       settings.define(:es_input_splits,   :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
       settings.define(:es_request_size,   :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)

data/lib/wonderdog/hadoop_invocation_override.rb CHANGED

@@ -98,8 +98,15 @@ module Wukong
       #
       # @return [Array<String>]
       def hadoop_jobconf_options
+        if reads_from_elasticsearch? || writes_to_elasticsearch?
+          settings[:map_speculative]    = false if settings[:map_speculative].nil?
+          settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
+        end
         super() + [].tap do |o|
-          o << java_opt('es.config', settings[:es_config]) if (reads_from_elasticsearch? || writes_to_elasticsearch?)
+          if (reads_from_elasticsearch? || writes_to_elasticsearch?)
+            o << java_opt('es.config', settings[:es_config])
+          end
           if reads_from_elasticsearch?
             o << java_opt('elasticsearch.input.index',          input_index.index)
@@ -121,6 +128,28 @@ module Wukong
         end.flatten.compact
       end
+      # :nodoc:
+      #
+      # Munge the settings object to add necessary jars if
+      # reading/writing to/from Elasticsearch, then call super().
+      def hadoop_files
+        if reads_from_elasticsearch? || writes_to_elasticsearch?
+          settings[:jars] = elasticsearch_jars if settings[:jars].empty?
+        end
+        super()
+      end
+      # All Elasticsearch, Wonderdog, and other support jars needed to
+      # connect Hadoop streaming with the
+      # ElasticSearchStreamingInputFormat and
+      # ElasticSearchStreamingOutputFormat provided by the Wonderdog
+      # Java code.
+      #
+      # @return [Array<String>]
+      def elasticsearch_jars
+        Dir[File.join(settings[:es_lib_dir] || '/usr/lib/hadoop/lib', '{elasticsearch,lucene,jna,wonderdog}*.jar')].compact.uniq
+      end
       # Returns a temporary path on the HDFS in which to store log
       # data while the Hadoop job runs.
       #
@@ -129,7 +158,7 @@ module Wukong
       def elasticsearch_hdfs_tmp_dir io
         cleaner  = %r{[^\w/\.\-\+]+}
         io_part  = [io.index, io.mapping].compact.map { |s| s.gsub(cleaner, '') }.join('/')
-        File.join(settings[:es_tmp_dir], io_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
+        File.join(settings[:es_tmp_dir] || '/', io_part || '', Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
       end
     end

data/lib/wonderdog/version.rb CHANGED

@@ -1,3 +1,4 @@
 module Wonderdog
-  VERSION = '0.0.1'
+  # The currently running Wonderdog version
+  VERSION = '0.0.2'
 end

data/spec/wonderdog/hadoop_invocation_override_spec.rb CHANGED

@@ -7,75 +7,120 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
   let(:es_writer)  { driver('regexp',  'count', input: '/tmp/input_file',        output: 'es:///the_index/the_map')  }
   let(:es_complex) { driver('regexp',  'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
-  context "not interacting with Elasticsearch" do
-    subject                  { no_es                                            }
-    # input
-    its(:input_paths)        { should == '/tmp/input_file'                      }
-    its(:hadoop_commandline) { should     match(%r{-input.*/tmp/input_file}i)   }
-    # output
-    its(:output_path)        { should == '/tmp/output_file'                     }
-    its(:hadoop_commandline) { should     match(%r{-output.*/tmp/output_file}i) }
-    # no elasticsearch anything
-    its(:hadoop_commandline) { should_not match(/elasticsearch/i)               }
+  context "passing necessary jars to Hadoop streaming" do
+    before  { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
+    context "when not given explicit jars" do
+      context "and not interacting with Elasticsearch" do
+        it "doesn't add jars" do
+          no_es.hadoop_commandline.should_not match('-libjars')
+        end
+      end
+      context "and reading from Elasticsearch" do
+        it "adds default jars it finds on the local filesystem" do
+          es_reader.hadoop_commandline.should match('-libjars.*elasticsearch')
+        end
+      end
+      context "and writing to Elasticsearch" do
+        it "adds default jars it finds on the local filesystem" do
+          es_writer.hadoop_commandline.should match('-libjars.*elasticsearch')
+        end
+      end
+      context "and reading and writing to Elasticsearch" do
+        it "adds default jars it finds on the local filesystem" do
+          es_complex.hadoop_commandline.should match('-libjars.*elasticsearch')
+        end
+      end
+    end
   end
-  context "reading from Elasticsearch" do
-    subject                  { es_reader                                                          }
-    # input
-    its(:input_paths)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
-    its(:hadoop_commandline) { should     match(/-inputformat.*elasticsearch/i)                   }
-    its(:hadoop_commandline) { should     match(%r{-input.*/user.*wukong.*the_index.*the_map}i)   }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.index.*the_index/i)   }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.map.*the_map/i)       }
-    # output
-    its(:output_path)        { should == '/tmp/output_file'                                       }
-    its(:hadoop_commandline) { should_not match(/-outputformat/i)                                 }
-    its(:hadoop_commandline) { should     match(%r{-output.*/tmp/output_file}i)                   }
-    its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i)                    }
+  context "setting speculative execution" do
+    context "when not given speculative options" do
+      context "and not interacting with Elasticsearch" do
+        it "doesn't add jars" do
+          no_es.hadoop_commandline.should_not match('speculative')
+        end
+      end
+      context "and reading from Elasticsearch" do
+        it "adds default jars it finds on the local filesystem" do
+          es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
+          es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
+        end
+      end
+    end
   end
+  context "handling input and output paths, formats, and options when" do
-  context "writing to Elasticsearch" do
-    subject                  { es_writer                                                          }
+    context "not interacting with Elasticsearch" do
+      subject                  { no_es                                            }
+      # input
+      its(:input_paths)        { should == '/tmp/input_file'                      }
+      its(:hadoop_commandline) { should     match(%r{-input.*/tmp/input_file}i)   }
-    # input
-    its(:input_paths)        { should == '/tmp/input_file'                                        }
-    its(:hadoop_commandline) { should_not match(/-inputformat/i)                                  }
-    its(:hadoop_commandline) { should     match(%r{-input.*/tmp/input_file}i)                     }
-    its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i)                     }
+      # output
+      its(:output_path)        { should == '/tmp/output_file'                     }
+      its(:hadoop_commandline) { should     match(%r{-output.*/tmp/output_file}i) }
-    # output
-    its(:output_path)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
-    its(:hadoop_commandline) { should     match(/-outputformat.*elasticsearch/i)                  }
-    its(:hadoop_commandline) { should     match(%r{-output.*/user.*wukong.*the_index.*the_map}i)  }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.index.*the_index/i)  }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.map.*the_map/i)      }
-  end
+      # no elasticsearch anything
+      its(:hadoop_commandline) { should_not match(/elasticsearch/i)               }
+    end
+    context "reading from Elasticsearch" do
+      subject                  { es_reader                                                          }
+      # input
+      its(:input_paths)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
+      its(:hadoop_commandline) { should     match(/-inputformat.*elasticsearch/i)                   }
+      its(:hadoop_commandline) { should     match(%r{-input.*/user.*wukong.*the_index.*the_map}i)   }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.index.*the_index/i)   }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.map.*the_map/i)       }
+      # output
+      its(:output_path)        { should == '/tmp/output_file'                                       }
+      its(:hadoop_commandline) { should_not match(/-outputformat/i)                                 }
+      its(:hadoop_commandline) { should     match(%r{-output.*/tmp/output_file}i)                   }
+      its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i)                    }
+    end
+    context "writing to Elasticsearch" do
+      subject                  { es_writer                                                          }
+      # input
+      its(:input_paths)        { should == '/tmp/input_file'                                        }
+      its(:hadoop_commandline) { should_not match(/-inputformat/i)                                  }
+      its(:hadoop_commandline) { should     match(%r{-input.*/tmp/input_file}i)                     }
+      its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i)                     }
+      # output
+      its(:output_path)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
+      its(:hadoop_commandline) { should     match(/-outputformat.*elasticsearch/i)                  }
+      its(:hadoop_commandline) { should     match(%r{-output.*/user.*wukong.*the_index.*the_map}i)  }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.index.*the_index/i)  }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.map.*the_map/i)      }
+    end
-  context "reading and writing with many options" do
-    subject                  { es_complex                                                         }
+    context "reading and writing with many options" do
+      subject                  { es_complex                                                         }
-    # input
-    its(:input_paths)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
-    its(:hadoop_commandline) { should     match(/-inputformat.*elasticsearch/i)                   }
-    its(:hadoop_commandline) { should     match(%r{-input.*/user.*wukong.*the_index.*the_map}i)   }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.index.*the_index/i)   }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.map.*the_map/i)       }
+      # input
+      its(:input_paths)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
+      its(:hadoop_commandline) { should     match(/-inputformat.*elasticsearch/i)                   }
+      its(:hadoop_commandline) { should     match(%r{-input.*/user.*wukong.*the_index.*the_map}i)   }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.index.*the_index/i)   }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.map.*the_map/i)       }
-    # output
-    its(:output_path)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
-    its(:hadoop_commandline) { should     match(/-outputformat.*elasticsearch/i)                  }
-    its(:hadoop_commandline) { should     match(%r{-output.*/user.*wukong.*the_index.*the_map}i)  }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.index.*the_index/i)  }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.map.*the_map/i)      }
+      # output
+      its(:output_path)        { should     match(%r{/user.*wukong.*the_index.*the_map})            }
+      its(:hadoop_commandline) { should     match(/-outputformat.*elasticsearch/i)                  }
+      its(:hadoop_commandline) { should     match(%r{-output.*/user.*wukong.*the_index.*the_map}i)  }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.index.*the_index/i)  }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.map.*the_map/i)      }
-    # options
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.query.*hi.*there/i)   }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
-    its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.index\.field.*ID/i)  }
+      # options
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.query.*hi.*there/i)   }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
+      its(:hadoop_commandline) { should     match(/-D\s+elasticsearch\.output\.index\.field.*ID/i)  }
+    end
   end
 end

data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java CHANGED

@@ -48,7 +48,7 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
     private              String idFieldName;
     private static final String ES_BULK_SIZE_OPT     = "elasticsearch.output.bulk_size";
-    private static final String ES_BULK_SIZE         = "100";
+    private static final String ES_BULK_SIZE         = "1000";
     private              int    bulkSize;

data/wonderdog.gemspec CHANGED

@@ -28,5 +28,6 @@ EOF
   gem.test_files    = gem.files.grep(/^spec/)
   gem.require_paths = ['lib']
-  gem.add_dependency('wukong', '3.0.0.pre2')
+  gem.add_dependency('wukong',        '3.0.0.pre3')
+  gem.add_dependency('wukong-hadoop', '>= 0.0.2')
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: wonderdog
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
   prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-01 00:00:00.000000000 Z
+date: 2012-12-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: wukong
@@ -22,7 +22,7 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre2
+        version: 3.0.0.pre3
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -30,7 +30,23 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 3.0.0.pre2
+        version: 3.0.0.pre3
+- !ruby/object:Gem::Dependency
+  name: wukong-hadoop
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.0.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.0.2
 description: ! "  Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
   \ a more fully-fledged member of both the Hadoop and Wukong\n  ecosystems.\n\n  For
   the Java side, Wonderdog provides InputFormat and OutputFormat\n  classes for use
@@ -45,6 +61,8 @@ files:
 - .gitignore
 - .rspec
 - CHANGELOG.md
+- Gemfile
+- Gemfile.lock
 - LICENSE.md
 - README.md
 - Rakefile