wonderdog 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
@@ -0,0 +1,52 @@
1
+ export ES_CONF_DIR=${ES_CONF_DIR-/etc/elasticsearch}
2
+ export ES_WORK_DIR=${ES_WORK_DIR-/mnt/elasticsearch/work}
3
+ export ES_DATA_DIR=${ES_DATA_DIR-/mnt/elasticsearch/data}
4
+
5
+ export CLASSPATH=$ES_HOME/plugins/cloud-aws.zip
6
+ CLASSPATH=$CLASSPATH:$ES_HOME/lib/elasticsearch-0.11.0.jar:$ES_HOME/lib/*:$ES_HOME/lib/sigar/*
7
+
8
+ # bump the # of open files way way up
9
+ ulimit -n 65536
10
+ # allow elasticsearch to lock itself into memory if JNA is installed
11
+ ulimit -l unlimited
12
+
13
+ if [ "x$ES_MIN_MEM" = "x" ]; then
14
+ ES_MIN_MEM=256m
15
+ fi
16
+ if [ "x$ES_MAX_MEM" = "x" ]; then
17
+ ES_MAX_MEM=1500m
18
+ fi
19
+
20
+ # Arguments to pass to the JVM
21
+ JAVA_OPTS="$JAVA_OPTS -Xms${ES_MIN_MEM}"
22
+ JAVA_OPTS="$JAVA_OPTS -Xmx${ES_MAX_MEM}"
23
+ JAVA_OPTS="$JAVA_OPTS -Xss128k"
24
+
25
+ JAVA_OPTS="$JAVA_OPTS -Djline.enabled=true"
26
+
27
+ JAVA_OPTS="$JAVA_OPTS -XX:+AggressiveOpts"
28
+
29
+ JAVA_OPTS="$JAVA_OPTS -XX:+UseParNewGC"
30
+ JAVA_OPTS="$JAVA_OPTS -XX:+UseConcMarkSweepGC"
31
+ JAVA_OPTS="$JAVA_OPTS -XX:+CMSParallelRemarkEnabled"
32
+ JAVA_OPTS="$JAVA_OPTS -XX:SurvivorRatio=8"
33
+ JAVA_OPTS="$JAVA_OPTS -XX:MaxTenuringThreshold=1"
34
+ JAVA_OPTS="$JAVA_OPTS -XX:+HeapDumpOnOutOfMemoryError"
35
+ JAVA_OPTS="$JAVA_OPTS -XX:HeapDumpPath=$ES_WORK_DIR/heap"
36
+ JAVA_OPTS="$JAVA_OPTS -XX:+PrintGCTimeStamps -XX:+PrintTenuringDistribution -XX:+TraceClassUnloading -XX:+PrintGCDetails -verbose:gc -Xloggc:/var/log/elasticsearch/elasticsearch-gc.log"
37
+
38
+ JAVA_OPTS="$JAVA_OPTS -XX:+UseCompressedOops" # avoid this on sun java < 1.6.0_20
39
+
40
+ # ensures JMX accessible from outside world
41
+ JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Djava.rmi.server.hostname=ec2-184-73-69-18.compute-1.amazonaws.com "
42
+
43
+ # More options to consider LATER
44
+ # java.net.preferIPv4Stack=true: Better OOTB experience, especially with jgroups
45
+ # -XX:CMSInitiatingOccupancyFraction=88
46
+
47
+ ES_JAVA_OPTS="$ES_JAVA_OPTS -Des.path.data=$ES_DATA_DIR -Des.path.work=$ES_WORK_DIR"
48
+
49
+ echo JAVA_OPTS="'$JAVA_OPTS'"
50
+ echo ES_JAVA_OPTS="'$ES_JAVA_OPTS'"
51
+
52
+ export JAVA_OPTS ES_JAVA_OPTS ES_MAX_MEM ES_MIN_MEM
@@ -0,0 +1,43 @@
1
+ rootLogger: DEBUG, console, file
2
+
3
+ #
4
+ # Put the name of any module -- using its config path -- in the section below.
5
+ #
6
+ logger:
7
+ # log action execution errors for easier debugging
8
+ action : DEBUG
9
+
10
+ index:
11
+ shard:
12
+ recovery: DEBUG
13
+ store: INFO
14
+ gateway: DEBUG
15
+ engine: DEBUG
16
+ merge: DEBUG
17
+ translog: DEBUG
18
+ cluster:
19
+ service: DEBUG
20
+ action:
21
+ shard: DEBUG
22
+ gateway: DEBUG
23
+ discovery: DEBUG
24
+ jmx: DEBUG
25
+ httpclient: INFO
26
+ node: DEBUG
27
+ plugins: DEBUG
28
+
29
+ appender:
30
+ console:
31
+ type: console
32
+ layout:
33
+ type: consolePattern
34
+ conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
35
+
36
+ file:
37
+ type: dailyRollingFile
38
+ file: ${path.logs}/${cluster.name}.log
39
+ datePattern: "'.'yyyy-MM-dd"
40
+ layout:
41
+ type: pattern
42
+ conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
43
+
@@ -0,0 +1,60 @@
1
+ #
2
+ # This file isn't read for any reason -- it's
3
+ # a dumping ground for annotated config sections
4
+ #
5
+
6
+
7
+ gateway:
8
+ # Settings for gateway.type = s3
9
+ s3:
10
+ bucket: infochimps-elasticsearch
11
+
12
+ gateway:
13
+ fs:
14
+ # By default, uses the 'path.work' directory. Note, the work directory is
15
+ # considered a temporal directory with ElasticSearch (meaning it is safe
16
+ # to rm -rf it), so the default location of the persistent gateway in the
17
+ # work directory is unintentional; it should be changed.
18
+ #
19
+ # When explicitly specifying the gateway.fs.location, each node will
20
+ # append its cluster.name to the provided location. It means that the
21
+ # location provided can safely support several clusters.
22
+ #
23
+ # The file system gateway automatically sets for each index created to use
24
+ # an fs index gateway. The location specified using gateway.fs.location
25
+ # will automatically be used in this case to store index level data
26
+ # (appended by the index name).
27
+ location: /mnt2/elasticsearch/fs
28
+
29
+ discovery:
30
+
31
+ zen:
32
+ # == How should gossip be conducted?
33
+ ping:
34
+ multicast:
35
+ enabled: false
36
+ # group: 224.2.2.4
37
+ # port: 54328
38
+ # ttl: 3
39
+ # address: null
40
+ unicast:
41
+ # # Either a YAML array or a comma delimited string.
42
+ # # Each value is either in the form of host:port, or in the form of host[port1-port2].
43
+ # hosts:
44
+ # == Zen master election:
45
+ # As part of the initial ping process a master of the cluster is either
46
+ # elected or joined to. This is done automatically. The
47
+ # discovery.zen.initial_ping_timeout (which defaults to 3s) allows to
48
+ # configure the election to handle cases of slow or congested networks
49
+ # (higher values assure less chance of failure).
50
+ initial_ping_timeout: 3s
51
+ # # Allow node to become master? Note, once a node is a client node
52
+ # # (node.client = true), it will not be allowed to become a master
53
+ # # (zen.master is automatically set to false).
54
+ # master: ~
55
+ # == Zen Fault detection:
56
+ fd:
57
+ ping_interval: 1s
58
+ ping_timeout: 30s
59
+ ping_retries: 3
60
+
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env bash
2
+
3
+ #
4
+ # This lets you run multiple daemons on the same machine. It points each
5
+ # daemon's data to /mnt$node/elasticsearch -- so running it with node='' will
6
+ # write to /mnt/elasticsearch, node=3 will write to /mnt3/elasticsearch.
7
+ #
8
+ # Usage:
9
+ #
10
+ # sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh
11
+ #
12
+ # To run multiple nodes:
13
+ #
14
+ # for node in '' 2 3 ; do sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh ; done
15
+ #
16
+
17
+ # Which node?
18
+ node=${node-''}
19
+ echo "Running elasticsearch with node=$node"
20
+
21
+ # Where does elasticsearch live?
22
+ export ES_HOME=/usr/local/share/elasticsearch
23
+ export ES_CONF_DIR=/etc/elasticsearch
24
+ export ES_INCLUDE=$ES_CONF_DIR/elasticsearch.in.sh
25
+
26
+ # Where does data live?
27
+ ES_DATA_ROOT=/mnt$node/elasticsearch
28
+ export ES_DATA_DIR=$ES_DATA_ROOT/data
29
+ export ES_WORK_DIR=$ES_DATA_ROOT/work
30
+
31
+ # bump the # of open files way way up
32
+ ulimit -n 65536
33
+ # allow elasticsearch to lock itself into memory if JNA is installed
34
+ ulimit -l unlimited
35
+
36
+ # Force the heap size
37
+ export ES_MAX_MEM=${ES_MAX_MEM-1800m}
38
+ export ES_MIN_MEM=$ES_MAX_MEM
39
+
40
+ exec chpst -u elasticsearch $ES_HOME/bin/elasticsearch \
41
+ -Des.config=/etc/elasticsearch/elasticsearch.yml \
42
+ -p /var/run/elasticsearch/es-$node.pid
@@ -0,0 +1,12 @@
1
+ {
2
+ "ufo_sighting" : {
3
+ "properties" : {
4
+ "sighted_at" : {"type" : "string", "store" : "yes"},
5
+ "reported_at" : {"type" : "string", "store" : "yes"},
6
+ "location" : {"type" : "string", "store" : "yes"},
7
+ "shape" : {"type" : "string", "store" : "yes"},
8
+ "duration" : {"type" : "string", "store" : "yes"},
9
+ "description" : {"type" : "string", "store" : "yes"}
10
+ }
11
+ }
12
+ }
@@ -0,0 +1,14 @@
1
+ require 'wukong-hadoop'
2
+
3
+ module Wukong
4
+
5
+ # Wonderdog provides Java code that couples Hadoop streaming to
6
+ # Wukong. This module adds some overrides which enables the
7
+ # <tt>wu-hadoop</tt> program to leverage this code.
8
+ module Elasticsearch
9
+ end
10
+ end
11
+
12
+ require 'wonderdog/configuration'
13
+ require 'wonderdog/hadoop_invocation_override'
14
+ require 'wonderdog/timestamp'
@@ -0,0 +1,25 @@
1
+ module Wukong
2
+ module Elasticsearch
3
+
4
+ # Configure the given +settings+ to be able to work with
5
+ # Elasticsearch.
6
+ #
7
+ # @param [Configliere::Param] settings
8
+ # @return [Configliere::Param] the newly configured settings
9
+ def self.configure settings
10
+ settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
11
+ settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
12
+ settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
13
+ settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
14
+ settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
15
+ settings.define(:es_index_field, :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
16
+ settings.define(:es_mapping_field, :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
17
+ settings.define(:es_id_field, :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
18
+ settings.define(:es_bulk_size, :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
19
+ settings.define(:es_query, :description => "Query to use when defining input splits for ElasticSearch input", :wukong_hadoop => true)
20
+
21
+ settings
22
+ end
23
+ end
24
+
25
+ end
@@ -0,0 +1,139 @@
1
+ require_relative("index_and_mapping")
2
+
3
+ module Wukong
4
+ module Elasticsearch
5
+
6
+ # This module overrides some methods defined in
7
+ # Wukong::Hadoop::HadoopInvocation. The overrides will only come
8
+ # into play if the job's input or output paths are URIs beginning
9
+ # with 'es://', implying reading or writing to/from Elasticsearch
10
+ # indices.
11
+ module HadoopInvocationOverride
12
+
13
+ # The input format when reading from Elasticsearch as defined in
14
+ # the Java code accompanying Wonderdog.
15
+ #
16
+ # @param [String]
17
+ ES_STREAMING_INPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat"
18
+
19
+ # The output format when writing to Elasticsearch as defined in
20
+ # the Java code accompanying Wonderdog.
21
+ #
22
+ # @param [String]
23
+ ES_STREAMING_OUTPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat"
24
+
25
+ # Does this job read from Elasticsearch?
26
+ #
27
+ # @return [true, false]
28
+ def reads_from_elasticsearch?
29
+ IndexAndMapping.matches?(settings[:input])
30
+ end
31
+
32
+ # The input format to use for this job.
33
+ #
34
+ # Will override the default value to ES_STREAMING_INPUT_FORMAT if
35
+ # reading from Elasticsearch.
36
+ #
37
+ # @return [String]
38
+ def input_format
39
+ reads_from_elasticsearch? ? ES_STREAMING_INPUT_FORMAT : super()
40
+ end
41
+
42
+ # The input index to use.
43
+ #
44
+ # @return [IndexAndMapping]
45
+ def input_index
46
+ @input_index ||= IndexAndMapping.new(settings[:input])
47
+ end
48
+
49
+ # The input paths to use for this job.
50
+ #
51
+ # Will override the default value with a temporary HDFS path
52
+ # when reading from Elasticsearch.
53
+ #
54
+ # @return [String]
55
+ def input_paths
56
+ reads_from_elasticsearch? ? elasticsearch_hdfs_tmp_dir(input_index) : super()
57
+ end
58
+
59
+ # Does this write to Elasticsearch?
60
+ #
61
+ # @return [true, false]
62
+ def writes_to_elasticsearch?
63
+ IndexAndMapping.matches?(settings[:output])
64
+ end
65
+
66
+ # The output format to use for this job.
67
+ #
68
+ # Will override the default value to ES_STREAMING_OUTPUT_FORMAT if
69
+ # writing to Elasticsearch.
70
+ #
71
+ # @return [String]
72
+ def output_format
73
+ writes_to_elasticsearch? ? ES_STREAMING_OUTPUT_FORMAT : super()
74
+ end
75
+
76
+ # The output index to use.
77
+ #
78
+ # @return [IndexAndMapping]
79
+ def output_index
80
+ @output_index ||= IndexAndMapping.new(settings[:output])
81
+ end
82
+
83
+ # The output path to use for this job.
84
+ #
85
+ # Will override the default value with a temporary HDFS path
86
+ # when writing to Elasticsearch.
87
+ #
88
+ # @return [String]
89
+ def output_path
90
+ writes_to_elasticsearch? ? elasticsearch_hdfs_tmp_dir(output_index) : super()
91
+ end
92
+
93
+ # Adds Java options required to interact with the input/output
94
+ # formats defined by the Java code accompanying Wonderdog.
95
+ #
96
+ # Will not change the default Hadoop jobconf options unless it
97
+ # has to.
98
+ #
99
+ # @return [Array<String>]
100
+ def hadoop_jobconf_options
101
+ super() + [].tap do |o|
102
+ o << java_opt('es.config', settings[:es_config]) if (reads_from_elasticsearch? || writes_to_elasticsearch?)
103
+
104
+ if reads_from_elasticsearch?
105
+ o << java_opt('elasticsearch.input.index', input_index.index)
106
+ o << java_opt('elasticsearch.input.mapping', input_index.mapping)
107
+ o << java_opt('elasticsearch.input.splits', settings[:es_input_splits])
108
+ o << java_opt('elasticsearch.input.query', settings[:es_query])
109
+ o << java_opt('elasticsearch.input.request_size', settings[:es_request_size])
110
+ o << java_opt('elasticsearch.input.scroll_timeout', settings[:es_scroll_timeout])
111
+ end
112
+
113
+ if writes_to_elasticsearch?
114
+ o << java_opt('elasticsearch.output.index', output_index.index)
115
+ o << java_opt('elasticsearch.output.mapping', output_index.mapping)
116
+ o << java_opt('elasticsearch.output.index.field', settings[:es_index_field])
117
+ o << java_opt('elasticsearch.output.mapping.field', settings[:es_mapping_field])
118
+ o << java_opt('elasticsearch.output.id.field', settings[:es_id_field])
119
+ o << java_opt('elasticsearch.output.bulk_size', settings[:es_bulk_size])
120
+ end
121
+ end.flatten.compact
122
+ end
123
+
124
+ # Returns a temporary path on the HDFS in which to store log
125
+ # data while the Hadoop job runs.
126
+ #
127
+ # @param [IndexAndMapping] io
128
+ # @return [String]
129
+ def elasticsearch_hdfs_tmp_dir io
130
+ cleaner = %r{[^\w/\.\-\+]+}
131
+ io_part = [io.index, io.mapping].compact.map { |s| s.gsub(cleaner, '') }.join('/')
132
+ File.join(settings[:es_tmp_dir], io_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
133
+ end
134
+
135
+ end
136
+ end
137
+
138
+ Hadoop::Driver.class_eval { include Elasticsearch::HadoopInvocationOverride }
139
+ end
@@ -0,0 +1,67 @@
1
+ module Wukong
2
+ module Elasticsearch
3
+
4
+ # A convenient class for parsing Elasticsearch index and mapping URIs
5
+ # like
6
+ #
7
+ # - es://my_index
8
+ # - es://my_index/my_mapping
9
+ # - es://first_index,second_index,third_index
10
+ # - es://my_index/first_mapping,second_mapping,third_mapping
11
+ class IndexAndMapping
12
+
13
+ # A regular expression that matches URIs describing an
14
+ # Elasticsearch index and/or mapping to read/write from/to.
15
+ #
16
+ # @param [Regexp]
17
+ ES_SCHEME_REGEXP = %r{^es://}
18
+
19
+ # The Elasticsearch index.
20
+ #
21
+ # @param [String]
22
+ attr_reader :index
23
+
24
+ # The Elasticsearch mapping.
25
+ #
26
+ # @param [String]
27
+ attr_reader :mapping
28
+
29
+ # Does the given +string+ look like a possible Elasticsearch
30
+ # /index/mapping specification?
31
+ #
32
+ # @param [String] string
33
+ # @return [true, false]
34
+ def self.matches? string
35
+ return false unless string
36
+ string =~ ES_SCHEME_REGEXP
37
+ end
38
+
39
+ # Create a new index and mapping specification from the given
40
+ # +uri+.
41
+ #
42
+ # @param [String] uri
43
+ def initialize uri
44
+ self.uri = uri
45
+ end
46
+
47
+ # Set the URI of this index and mapping specification, parsing it
48
+ # for an index and mapping.
49
+ #
50
+ # Will raise an error if the given URI is malformed.
51
+ #
52
+ # @param [String] uri
53
+ def uri= uri
54
+ raise Wukong::Error.new("'#{uri}' is not an ElasticSearch es://index/mapping specification") unless self.class.matches?(uri)
55
+ parts = uri.gsub(ES_SCHEME_REGEXP, '').gsub(/^\/+/,'').gsub(/\/+$/,'').split('/')
56
+
57
+ raise Wukong::Error.new("'#{uri}' is not an ElasticSearch es://index/mapping specification") unless parts.size.between?(1,2)
58
+
59
+ @index = parts[0]
60
+ @mapping = parts[1]
61
+ end
62
+ end
63
+ end
64
+ end
65
+
66
+
67
+