wonderdog 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/bin/estool
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
#!/usr/bin/env ruby

# estool -- a thin command-line wrapper around the ElasticSearch REST API.
#
# Parses command-line flags into an OpenStruct of options; the first
# non-flag argument is the command, dispatched by ESTool below.

require 'rubygems'
require 'ostruct'   # FIX: OpenStruct is used below but was never required
require 'json'
require 'socket'
require 'optparse'
require 'open3'
require 'rake'

options = OpenStruct.new
OptionParser.new do |opts|

  opts.banner = <<EOS
Usage: estool <command> [options..]

Commands include:
  status          Returns the status of INDEX
  list            Returns a list of all indices
  health          Returns the health of the shards
  flush           Performs a full flush of the INDEX
  create          Create the specified INDEX
  delete          Delete the specified INDEX. Requires confirmation.
  refresh         Refresh the specified INDEX
  optimize        Optimizes the specified INDEX to (-s) number of segments
  snapshot        Snapshots the specified INDEX to the gateway
  segments        Returns the segment information. Requires ElasticSearch v
  mapping
  set_replication
  search
  obj_types

Options include:
EOS

  # Defaults; options.usage is stashed so ESTool#method_missing can print it.
  options.host     = Socket.gethostname
  options.port     = 9200
  options.index    = "_all"
  options.segments = 3
  options.query    = "foo"
  options.raw      = false
  options.usage    = opts

  opts.on('-c', '--host HOSTNAME', 'Connect to ElasticSearch on HOSTNAME', 'Defaults to localhost') do |host|
    options.host = host
  end

  opts.on('-p', '--port PORT', 'Connect to ElasticSearch using PORT', 'Defaults to 9200') do |port|
    options.port = port
  end

  opts.on('-i','--index NAME','Name of index to query against', 'Defaults to _all') do |index|
    options.index = index
  end

  opts.on('-s', '--segments INT', 'Number of segments to optimize to', 'Defaults to 3. Use with <optimize>') do |num|
    options.segments = num
  end

  opts.on('-r','--raw', 'Return raw JSON for parsing by another program') do
    options.raw = true
  end

  opts.on('-q', '--query STRING', 'Query INDEX with STRING.', 'Defaults to foo. Use with <search>') do |str|
    options.query = str
  end

  opts.on('-h', '--help', 'Display this screen and exit'){ puts opts ; exit }
end.parse!
|
68
|
+
|
69
|
+
# Dispatches estool commands as curl requests against an ElasticSearch
# node, parsing each JSON response. Unknown commands fall through to
# method_missing, which prints usage and exits.
class ESTool

  # @options [OpenStruct] carries host, port, index, segments, query, raw, usage
  attr_reader :options

  def initialize(options)
    @options = options
  end

  # Base URL of the ElasticSearch HTTP endpoint.
  def connection() "http://#{options.host}:#{options.port}" ; end

  # Shells out to curl (quietly) and parses the JSON body of the response.
  # req is the curl method flag, e.g. "-XGET" (default), "-XPUT", "-XPOST".
  def shell_response(cmd, req="-XGET")
    url = File.join(connection, cmd)
    Open3.popen3('curl','-s',req, url){ |stdin, stdout, stderr, thread| JSON.parse(stdout.read, :max_nesting => 100) }
  end

  # Runs the named command and prints its result: raw JSON with --raw,
  # pretty-printed otherwise.
  def display cmd
    result = self.send(cmd.to_sym)
    display = options.raw ? result.to_json : JSON.pretty_generate(result, :max_nesting => 100)
    puts display
  end

  def status() shell_response(File.join(options.index, "_status?")) ; end

  def list() status["indices"].keys ; end

  def health() shell_response("_cluster/health?") ; end

  def flush() shell_response(File.join(options.index, "_flush?full=true")) ; end

  def create() shell_response(options.index, "-XPUT") ; end

  # Destructive: asks on STDIN before deleting the index.
  def delete()
    require_confirmation!("delete", options.index)
    shell_response(options.index, "-XDELETE")
  end

  def refresh() shell_response(File.join(options.index, "_refresh"), "-XPOST") ; end

  # FIX: the parameter was misspelled "max_num_segements", so ElasticSearch
  # ignored it and the -s flag had no effect.
  def optimize() shell_response(File.join(options.index, "_optimize?max_num_segments=#{options.segments}"), "-XPOST") ; end

  def snapshot() shell_response(File.join(options.index, "_gateway/snapshot"), "-XPOST") ; end

  def segments() shell_response(File.join(options.index, "_segments")) ; end

  def mapping() shell_response(File.join(options.index, "_mapping")) ; end

  # curl -s -XPUT http://host:port/index/_settings -d '{"index":{"number_of_replicas":num}}'
  def set_replication() { "error" => "method not yet implemented" }; end

  def search() shell_response(File.join(options.index, "_search?q=#{options.query}")) ; end

  def obj_types() mapping[options.index].keys ; end

  # Prompts on STDIN; any answer not matching /y/i cancels and exits.
  def require_confirmation!(meth, *args)
    print "#{meth.capitalize} method with args #{args} requires confirmation! [yN]?"
    response = STDIN.gets.chomp
    if response =~ /y/i
      print "#{meth.capitalize} method with args #{args} confirmed!"
    else
      print "#{meth.capitalize} method with args #{args} cancelled!"
      exit
    end
  end

  # Unknown commands print a message plus usage and exit (no exception).
  def method_missing meth, *args
    puts "invalid command: #{meth}", options.usage
    exit
  end

end
|
139
|
+
|
140
|
+
command = ARGV.first
# FIX: with no command, `display(nil)` crashed on `nil.to_sym`; print usage instead.
if command.nil?
  puts options.usage
  exit 1
end
ESTool.new(options).display(command)
|
data/bin/estrus.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'rubygems'
require 'rubberband'
require 'fileutils'
require 'configliere'

Settings.use :commandline, :env_var

#
# Estrus -- an alluringly primitive Elasticsearch stress-testing tool
#
# Example usage:
#
#   ~/ics/backend/wonderdog/ruby/estrus.rb --queries=100 --output_dir=~/ics/backend/estrus_data
#
# Output columns:
#
#   idx  datetime  secs  msec/query  hits  shards_successful  index  nodename  query_term
#
# Setup:
#
#   sudo apt-get install -y libcurl4-dev wamerican-large
#   sudo gem install rubberband configliere
#

# ,tweet-2010q1
Settings.define :words_file,   :default => "/usr/share/dict/words", :description => "Flat file with words to use"
Settings.define :offset_start, :default => 50_000, :description => "Where to start reading words", :type => Integer
Settings.define :offset_scale, :default => 100,    :description => "How far in the file to range", :type => Integer
Settings.define :queries,      :default => 10,     :description => "Number of queries to run",     :type => Integer
Settings.define :es_indexes,   :default => 'tweet-2009q3pre,tweet-2009q4,tweet-2010q1,tweet-201004,tweet-201005,tweet-201005,tweet-201006,tweet-201007,tweet-201008,tweet-201009,tweet-201010,tweet-201011', :description => "Elasticsearch index to query against", :type => Array
Settings.define :output_dir,   :default => nil, :description => "If given, the output is directed to a file named :output_dir/{date}/es-{datetime}-{comment}-{hostname}.tsv"
Settings.define :comment,      :default => nil, :description => "If given, it is included in the filename"
Settings.define :host,         :default => `hostname`.chomp, :description => "Host of ES query server"
Settings.define :port,         :default => '9200', :description => "Port for ES query server"
Settings.resolve!

# Node name: prefer /etc/node_name, falling back to the shell hostname.
NODENAME = File.read('/etc/node_name').chomp rescue `hostname`.chomp

# One rubberband client per index; entries are [index_name, client] pairs.
# NOTE(review): 'tweet-201005' appears twice in the default list -- confirm intentional.
CLIENTS = Settings.es_indexes.inject([]){|clients, index| clients << [index, ElasticSearch.new("#{Settings.host}:#{Settings.port}", :index => index, :type => "tweet")] ; clients }
|
39
|
+
|
40
|
+
# Drives the stress test: walks words out of a dictionary file and records
# one timing row per query, either to $stdout or to a dated .tsv file.
class StressTester
  attr_accessor :started_at

  def initialize
    self.started_at = Time.now.utc
  end

  # Opens the configured dictionary file, yielding the handle to the block.
  def words_file(&block)
    File.open(Settings.words_file, &block)
  end

  # Random number of lines to skip before reading words; nil on error.
  def random_offset
    Settings.offset_start + rand(1000) * Settings.offset_scale rescue nil
  end

  # Lazily opens (and memoizes) the destination for result rows.
  # Falls back to $stdout when no --output_dir was given.
  def output_file
    return @output_file if @output_file
    return $stdout if Settings.output_dir.blank?
    hour_stamp = started_at.strftime("%Y%m%d%H")
    full_stamp = started_at.to_flat
    basename   = ["es", full_stamp, NODENAME, Settings.comment].compact.join('-') + ".tsv"
    path       = File.expand_path(File.join(Settings.output_dir, hour_stamp, basename))
    FileUtils.mkdir_p(File.dirname(path))
    @output_file = File.open(path, "a")
  end

  # Appends one tab-separated row to the output.
  def dump(*fields)
    output_file << fields.join("\t") + "\n"
  end

  # Skips a random number of lines, then yields each subsequent line that
  # contains only word characters, stopping at end of file.
  def each_word
    words_file do |fh|
      random_offset.times { fh.readline }
      loop do
        word = fh.readline.chomp rescue nil
        break unless word
        next if word =~ /\W/
        yield word
      end
    end
  end
end
|
81
|
+
|
82
|
+
# Compact timestamp used in filenames and log rows, e.g. "20101123045959".
class Time
  def to_flat
    strftime("%Y%m%d%H%M%S")
  end
end

# Uniformly-random element of the array.
class Array
  def random
    self[rand(length)]
  end
end
|
84
|
+
|
85
|
+
# Main loop: one query per dictionary word against a randomly chosen index,
# logging a timing row each time, until --queries queries have run.
tester      = StressTester.new
query_count = 0
tester.each_word do |term|
  index_name, client = CLIENTS.random

  result  = client.search "text:#{term}"
  elapsed = Time.now.utc - tester.started_at
  query_count += 1
  tester.dump(query_count, Time.now.utc.to_flat, "%7.1f" % elapsed,
              "%7.1f" % (1000 * elapsed / query_count.to_f),
              result.total_entries, result._shards['successful'],
              index_name, NODENAME,
              term)
  # Progress tick on stderr every 20 queries.
  $stderr.puts(query_count) if (query_count % 20).zero?
  break if query_count >= Settings.queries
end
|
102
|
+
|
103
|
+
# query_string = 'verizon'
|
104
|
+
# CLIENTS.each do |index,client|
|
105
|
+
# result = client.search "text:#{query_string}"
|
106
|
+
# elapsed = Time.now.utc - tester.started_at
|
107
|
+
# n_queries_executed += 1
|
108
|
+
# tester.dump(
|
109
|
+
# n_queries_executed, Time.now.utc.to_flat, "%7.1f"%elapsed,
|
110
|
+
# "%7.1f"%( 1000 * elapsed / n_queries_executed.to_f ),
|
111
|
+
# result.total_entries, result._shards['successful'],
|
112
|
+
# index, NODENAME,
|
113
|
+
# query_string)
|
114
|
+
#
|
115
|
+
# end
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
#
|
121
|
+
# TODO: monkeypatch rubberband to use keepalives:
|
122
|
+
#
|
123
|
+
# def connect!
|
124
|
+
# unless defined?(@@patron_session)
|
125
|
+
# @@patron_session = Patron::Session.new
|
126
|
+
# @session = @@patron_session
|
127
|
+
# @session.base_url = @server
|
128
|
+
# @session.timeout = @options[:timeout]
|
129
|
+
# @session.headers['User-Agent'] = 'ElasticSearch.rb v0.1'
|
130
|
+
# @session.headers['Connection'] = 'Keep-Alive'
|
131
|
+
# else
|
132
|
+
# @session = @@patron_session
|
133
|
+
# end
|
134
|
+
# @request_count = 1
|
135
|
+
# end
|
136
|
+
|
data/bin/wonderdog
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'rubygems'
require 'configliere' ; Configliere.use(:commandline, :env_var, :define)

# Command-line settings for driving the wonderdog Hadoop bulk-load job.
Settings.define :index_name,     :required => true, :description => "Index to write data to"
Settings.define :object_type,    :default => "tweet", :description => "Type of object we're indexing"
Settings.define :field_names,    :default => "rsrc,tweet_id,created_at,user_id,screen_name,search_id,in_reply_to_user_id,in_reply_to_screen_name,in_reply_to_search_id,in_reply_to_status_id,text,source,lang,lat,lng,retweeted_count,rt_of_user_id,rt_of_screen_name,rt_of_tweet_id,contributors", :description => "Comma separated list of field names"
Settings.define :id_field,       :default => "1", :description => "Index of field to use as object id (counting from 0; default 1), use -1 if no id field"
Settings.define :bulk_size,      :default => "1000", :description => "Number of records per bulk request"
Settings.define :es_home,        :default => "/usr/local/share/elasticsearch", :description => "Path to elasticsearch installation", :env_var => "ES_HOME"
Settings.define :es_config,      :default => "/etc/elasticsearch/elasticsearch.yml", :description => "Path to elasticsearch config"
Settings.define :rm,             :default => false, :description => "Remove existing output?"
Settings.define :hadoop_home,    :default => "/usr/lib/hadoop", :description => "Path to hadoop installation", :env_var => "HADOOP_HOME"
Settings.define :min_split_size, :default => "5000000000", :description => "Min split size for maps"
Settings.define :test_outputfmt, :default => false, :description => "Use this flag to run job that test the ElasticSearchOutputFormat"
Settings.resolve!

# The first and last bare arguments are the job's input and output paths.
raise "No input file specified." if Settings.rest.first.blank?
raise "No output file specified." if Settings.rest.last.blank?
|
22
|
+
|
23
|
+
# Assembles and launches the `hadoop jar` invocation that bulk-loads data
# into ElasticSearch via the wonderdog jar.
class Wonderdog
  attr_accessor :options

  def initialize
    @options = Settings.dup
  end

  # Echoes the full command line, then runs it.
  def execute
    output = options.rest.last
    remove_output(output) if options.rm
    system %Q{ echo #{hdp_cmd} }
    system %Q{ #{hdp_cmd} }
  end

  # The complete hadoop command, pieces joined with escaped line breaks.
  def hdp_cmd
    pieces = [
      "HADOOP_CLASSPATH=#{hadoop_classpath}",
      "#{options.hadoop_home}/bin/hadoop jar #{run_jar}",
      mainclass,
      "-Dmapred.map.tasks.speculative.execution=false",
      "-Dmapred.min.split.size=#{options.min_split_size}",
      "-Dwonderdog.index.name=#{options.index_name}",
      "-Dwonderdog.object.type=#{options.object_type}",
      "-Dwonderdog.id.field=#{options.id_field}",
      "-Dwonderdog.field.names=#{options.field_names}",
      "-Dwonderdog.bulk.size=#{options.bulk_size}",
      "-Dwonderdog.config=#{options.es_config}",
      "-Dwonderdog.plugins.dir=#{options.es_home}/plugins",
      "-libjars #{libjars}",
      "#{options.rest.first}",
      "#{options.rest.last}"
    ]
    pieces.flatten.compact.join(" \t\\\n ")
  end

  # Main class to run: the output-format test harness, or the real loader.
  def mainclass
    return "com.infochimps.elasticsearch.ElasticTest" if Settings.test_outputfmt
    "com.infochimps.elasticsearch.wonderdog.WonderDog"
  end

  # "." plus the ES config file and every plugin/lib/sigar jar, colon-joined.
  def hadoop_classpath
    entries = ["."]
    entries.concat(Dir[
      "/etc/elasticsearch/elasticsearch.yml",
      "#{options.es_home}/plugins/*/*.jar",
      "#{options.es_home}/lib/*.jar",
      "#{options.es_home}/lib/sigar/*.jar"
    ])
    entries.join(':')
  end

  # The jar is expected next to this script, at ../build/wonderdog.jar.
  def run_jar
    File.dirname(File.expand_path(__FILE__)) + '/../build/wonderdog.jar'
  end

  # Comma-separated jar list handed to hadoop's -libjars flag.
  def libjars
    Dir[
      "/etc/elasticsearch/elasticsearch.yml",
      "#{options.es_home}/plugins/*/*.jar",
      "#{options.es_home}/lib/*.jar"
    ].join(',')
  end

  # Recursively removes a previous output path with hdp-rm.
  def remove_output(output)
    system %Q{ hdp-rm -r #{output} }
  end

end
|
91
|
+
|
92
|
+
# Kick off the job.
Wonderdog.new.execute
|
@@ -0,0 +1,227 @@
|
|
1
|
+
#
|
2
|
+
# ElasticSearch config file
|
3
|
+
#
|
4
|
+
|
5
|
+
cluster:
|
6
|
+
name: hoolock
|
7
|
+
|
8
|
+
# http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/439afb06f3e85aa7/431a8543811d7848?lnk=gst&q=configuration#431a8543811d7848
|
9
|
+
routing:
|
10
|
+
allocation:
|
11
|
+
concurrent_recoveries: 1
|
12
|
+
|
13
|
+
# File paths
|
14
|
+
path:
|
15
|
+
home: /usr/local/share/elasticsearch
|
16
|
+
conf: /etc/elasticsearch
|
17
|
+
logs: /var/log/elasticsearch
|
18
|
+
# data: /mnt/elasticsearch/data
|
19
|
+
# work: /mnt/elasticsearch/work
|
20
|
+
|
21
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/node/
|
22
|
+
node:
|
23
|
+
# # node.data: is this a data esnode (stores, indexes data)? default true
|
24
|
+
data: true
|
25
|
+
|
26
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/http/
|
27
|
+
http:
|
28
|
+
# # http.enabled: is this a query esnode (has http interface, dispatches/gathers queries)? Default true
|
29
|
+
enabled: true
|
30
|
+
port: 9200-9300
|
31
|
+
max_content_length: 100mb
|
32
|
+
|
33
|
+
gateway:
|
34
|
+
# The gateway set on the node level will automatically control the index
|
35
|
+
# gateway to use. For example, if the fs gateway is used, then automatically,
|
36
|
+
# each index created on the node will also use its own respective index level
|
37
|
+
# fs gateway. In this case, if in an index should not persist its state, it
|
38
|
+
# should be explicitly set to none.
|
39
|
+
#
|
40
|
+
# Set gateway.type to one of: [none, local, fs, hadoop, s3]
|
41
|
+
#
|
42
|
+
type: local
|
43
|
+
#
|
44
|
+
# recovery begins when recover_after_nodes are present and then either
|
45
|
+
# recover_after_time has passed *or* expected_nodes have shown up.
|
46
|
+
recover_after_nodes: 24
|
47
|
+
recover_after_time: 10m # 5m (key fixed: was misspelled "recovery_after_time", which ES ignores)
|
48
|
+
expected_nodes: 24 # 2
|
49
|
+
#
|
50
|
+
# # use with type: s3
|
51
|
+
# s3:
|
52
|
+
# bucket: infochimps-search
|
53
|
+
|
54
|
+
# http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/1f3001f43266879a/06d62ea3ceb4db30?lnk=gst&q=translog#06d62ea3ceb4db30
|
55
|
+
indices:
|
56
|
+
memory:
|
57
|
+
# Increase if you are bulk loading
|
58
|
+
# A number ('512m') or percent ('10%'). You can set limits on a percentage
|
59
|
+
# with max_index_buffer_size and min_index_buffer_size. 10% by default.
|
60
|
+
index_buffer_size: 512m
|
61
|
+
|
62
|
+
cache:
|
63
|
+
memory:
|
64
|
+
# buffer_size: 100k
|
65
|
+
# cache_size: 50m
|
66
|
+
# direct: true
|
67
|
+
# warm_cache: false
|
68
|
+
|
69
|
+
index:
|
70
|
+
number_of_shards: 24
|
71
|
+
number_of_replicas: 0
|
72
|
+
translog:
|
73
|
+
# A shard is flushed to local disk (the lucene index is committed) once this
|
74
|
+
# number of operations accumulate in the translog. defaults to 5000
|
75
|
+
#
|
76
|
+
# If you have
|
77
|
+
flush_threshold: 200000 # 5000
|
78
|
+
merge:
|
79
|
+
policy:
|
80
|
+
# Determines how often segment indices are merged by index operation. With
|
81
|
+
# smaller values, less RAM is used while indexing, and searches on
|
82
|
+
# unoptimized indices are faster, but indexing speed is slower. With
|
83
|
+
# larger values, more RAM is used during indexing, and while searches on
|
84
|
+
# unoptimized indices are slower, indexing is faster. Thus larger values
|
85
|
+
# (greater than 10) are best for batch index creation, and smaller values
|
86
|
+
# (lower than 10) for indices that are interactively maintained. Defaults
|
87
|
+
# to 10.
|
88
|
+
merge_factor: 30
|
89
|
+
# Use the compound file format. If not set, controlled by the actually
|
90
|
+
# store used, this is because the compound format was created to reduce
|
91
|
+
# the number of open file handles when using file based storage. The file
|
92
|
+
# system based ones default to true which others default to false. Even
|
93
|
+
# with file system based ones, consider increasing the number of open file
|
94
|
+
# handles and setting this to false for better performance
|
95
|
+
use_compound_file: false
|
96
|
+
# A size setting type which sets the minimum size for the lowest level
|
97
|
+
# segments. Any segments below this size are considered to be on the same
|
98
|
+
# level (even if they vary drastically in size) and will be merged
|
99
|
+
# whenever there are mergeFactor of them. This effectively truncates the
|
100
|
+
# “long tail” of small segments that would otherwise be created into a
|
101
|
+
# single level. If you set this too large, it could greatly increase the
|
102
|
+
# merging cost during indexing (if you flush many small
|
103
|
+
# segments). Defaults to 1.6mb
|
104
|
+
min_merge_size: 2.7mb
|
105
|
+
# Largest segment (by total byte size) that may be merged with other
|
106
|
+
# segments. Defaults to unbounded.
|
107
|
+
# max_merge_size:
|
108
|
+
# Largest segment (by document count) that may be merged with other
|
109
|
+
# segments. Defaults to unbounded
|
110
|
+
# max_merge_docs
|
111
|
+
scheduler:
|
112
|
+
max_thread_count: 64
|
113
|
+
# deletionpolicy: keep_only_last
|
114
|
+
|
115
|
+
engine:
|
116
|
+
robin:
|
117
|
+
# How often to schedule the refresh operation (the same one the Refresh
|
118
|
+
# API, which enables near real time search). Default '1s'; set to -1 to
|
119
|
+
# disable automatic refresh (you must instead initiate refresh via API)
|
120
|
+
refresh_interval: -1
|
121
|
+
# Set the interval between indexed terms. Large values cause less memory
|
122
|
+
# to be used by a reader / searcher, but slow random-access to
|
123
|
+
# terms. Small values cause more memory to be used by a reader / searcher,
|
124
|
+
# and speed random-access to terms. Defaults to 128.
|
125
|
+
term_index_interval: 1024
|
126
|
+
|
127
|
+
gateway:
|
128
|
+
# The index.gateway.snapshot_interval is a time setting allowing to
|
129
|
+
# configure the interval at which snapshotting of the index shard to the
|
130
|
+
# gateway will take place. Note, only primary shards start this scheduled
|
131
|
+
# snapshotting process. It defaults to 10s, and can be disabled by setting
|
132
|
+
# it to -1.
|
133
|
+
snapshot_interval: -1
|
134
|
+
# When a primary shard is shut down explicitly (not relocated), the
|
135
|
+
# index.gateway.snapshot_on_close flag can control if while shutting down, a
|
136
|
+
# gateway snapshot should be performed. It defaults to true.
|
137
|
+
snapshot_on_close: true
|
138
|
+
|
139
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/node/network/
|
140
|
+
network:
|
141
|
+
bind_host: _local_
|
142
|
+
publish_host: _local_
|
143
|
+
#
|
144
|
+
# tcp:
|
145
|
+
# no_delay: true
|
146
|
+
# keep_alive: ~
|
147
|
+
# reuse_address true
|
148
|
+
# send_buffer_size ~
|
149
|
+
# receive_buffer_size: ~
|
150
|
+
|
151
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/transport/
|
152
|
+
transport:
|
153
|
+
tcp:
|
154
|
+
port: 9300-9400
|
155
|
+
connect_timeout: 1s
|
156
|
+
# # enable lzf compression in esnode-esnode communication?
|
157
|
+
compress: false
|
158
|
+
|
159
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/jmx/
|
160
|
+
jmx:
|
161
|
+
# Create an RMI connector?
|
162
|
+
create_connector: true
|
163
|
+
port: 9400-9500
|
164
|
+
domain: elasticsearch
|
165
|
+
|
166
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/threadpool/
|
167
|
+
threadpool:
|
168
|
+
# #
|
169
|
+
# # threadpool.type should be one of [cached, scaling, blocking]:
|
170
|
+
# #
|
171
|
+
# # * Cached: An unbounded thread pool that reuses previously constructed threads.
|
172
|
+
# # * Scaling: A bounded thread pool that reuses previously created free threads.
|
173
|
+
# # * Blocking: A bounded thread pool that reuses previously created free
|
174
|
+
# # threads. Pending requests block for an available thread (different than
|
175
|
+
# # the scaling one, where the request is added to a queue and does not
|
176
|
+
# # block).
|
177
|
+
# #
|
178
|
+
# type: cached
|
179
|
+
|
180
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/discovery/
|
181
|
+
discovery:
|
182
|
+
# set to 'zen' or 'ec2'
|
183
|
+
type: zen
|
184
|
+
zen:
|
185
|
+
ping:
|
186
|
+
multicast:
|
187
|
+
enabled: false
|
188
|
+
unicast:
|
189
|
+
hosts: 10.195.215.175:9300,10.243.57.219:9300,10.194.218.143:9300,10.204.223.175:9300,10.242.89.235:9300,10.212.226.127:9300
|
190
|
+
# There are two fault detection processes running. The first is by the
|
191
|
+
# master, to ping all the other nodes in the cluster and verify that they
|
192
|
+
# are alive. And on the other end, each node pings to master to verify if
|
193
|
+
# its still alive or an election process needs to be initiated.
|
194
|
+
fd:
|
195
|
+
# How often a node gets pinged. Defaults to "1s".
|
196
|
+
ping_interval: 3s
|
197
|
+
# How long to wait for a ping response, defaults to "30s".
|
198
|
+
ping_timeout: 10s
|
199
|
+
# How many ping failures / timeouts cause a node to be considered failed. Defaults to 3.
|
200
|
+
ping_retries: 3
|
201
|
+
#
|
202
|
+
# # ec2 discovery can cause big trouble with the hadoop loader:
|
203
|
+
# # discovery churn can hit API usage limits
|
204
|
+
# # Be sure to set your cloud keys if you're using ec2
|
205
|
+
# #
|
206
|
+
# ec2:
|
207
|
+
# # security groups used for discovery
|
208
|
+
# groups: hoolock-data_esnode
|
209
|
+
# # require *all* (false) or *any* (true) of those groups?
|
210
|
+
# any_group: true
|
211
|
+
# # private_ip, public_ip, private_dns, public_dns
|
212
|
+
# host_type: private_ip
|
213
|
+
# availability_zones: us-east-1d
|
214
|
+
|
215
|
+
# Necessary if you will use either of
|
216
|
+
# * the ec2 discovery module: for finding peers
|
217
|
+
# * the s3 gateway module, for pushing indices to an s3 mirror.
|
218
|
+
# Read more: http://www.elasticsearch.com/docs/elasticsearch/cloud/
|
219
|
+
#
|
220
|
+
cloud:
|
221
|
+
aws:
|
222
|
+
access_key: <%= @aws['aws_access_key_id'] %>
|
223
|
+
secret_key: <%= @aws['aws_secret_access_key'] %>
|
224
|
+
|
225
|
+
# monitor.jvm: gc_threshold, interval, enabled
|
226
|
+
# thrift:
|
227
|
+
# # port:
|