wonderdog 0.0.1

Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/bin/estool
@@ -0,0 +1,141 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'json'
+ require 'socket'
+ require 'optparse'
+ require 'ostruct'   # provides OpenStruct, used for the options object below
+ require 'open3'
+ require 'rake'
+
+ options = OpenStruct.new
+ OptionParser.new do |opts|
+
+   opts.banner = <<EOS
+ Usage: estool <command> [options..]
+
+ Commands include:
+   status          Returns the status of INDEX
+   list            Returns a list of all indices
+   health          Returns the health of the shards
+   flush           Performs a full flush of the INDEX
+   create          Create the specified INDEX
+   delete          Delete the specified INDEX. Requires confirmation.
+   refresh         Refresh the specified INDEX
+   optimize        Optimizes the specified INDEX to (-s) number of segments
+   snapshot        Snapshots the specified INDEX to the gateway
+   segments        Returns the segment information. Requires ElasticSearch v
+   mapping
+   set_replication
+   search
+   obj_types
+
+ Options include:
+ EOS
+
+   options.host     = Socket.gethostname
+   options.port     = 9200
+   options.index    = "_all"
+   options.segments = 3
+   options.query    = "foo"
+   options.raw      = false
+   options.usage    = opts
+
+   opts.on('-c', '--host HOSTNAME', 'Connect to ElasticSearch on HOSTNAME', 'Defaults to the local hostname') do |host|
+     options.host = host
+   end
+
+   opts.on('-p', '--port PORT', 'Connect to ElasticSearch using PORT', 'Defaults to 9200') do |port|
+     options.port = port
+   end
+
+   opts.on('-i', '--index NAME', 'Name of index to query against', 'Defaults to _all') do |index|
+     options.index = index
+   end
+
+   opts.on('-s', '--segments INT', 'Number of segments to optimize to', 'Defaults to 3. Use with <optimize>') do |num|
+     options.segments = num
+   end
+
+   opts.on('-r', '--raw', 'Return raw JSON for parsing by another program') do
+     options.raw = true
+   end
+
+   opts.on('-q', '--query STRING', 'Query INDEX with STRING.', 'Defaults to foo. Use with <search>') do |str|
+     options.query = str
+   end
+
+   opts.on('-h', '--help', 'Display this screen and exit'){ puts opts ; exit }
+ end.parse!
+
+ class ESTool
+
+   attr_reader :options
+
+   def initialize(options)
+     @options = options
+   end
+
+   def connection() "http://#{options.host}:#{options.port}" ; end
+
+   def shell_response(cmd, req="-XGET")
+     url = File.join(connection, cmd)
+     Open3.popen3('curl', '-s', req, url){ |stdin, stdout, stderr, thread| JSON.parse(stdout.read, :max_nesting => 100) }
+   end
+
+   def display cmd
+     result  = self.send(cmd.to_sym)
+     display = options.raw ? result.to_json : JSON.pretty_generate(result, :max_nesting => 100)
+     puts display
+   end
+
+   def status() shell_response(File.join(options.index, "_status?")) ; end
+
+   def list() status["indices"].keys ; end
+
+   def health() shell_response("_cluster/health?") ; end
+
+   def flush() shell_response(File.join(options.index, "_flush?full=true")) ; end
+
+   def create() shell_response(options.index, "-XPUT") ; end
+
+   def delete()
+     require_confirmation!("delete", options.index)
+     shell_response(options.index, "-XDELETE")
+   end
+
+   def refresh() shell_response(File.join(options.index, "_refresh"), "-XPOST") ; end
+
+   def optimize() shell_response(File.join(options.index, "_optimize?max_num_segments=#{options.segments}"), "-XPOST") ; end
+
+   def snapshot() shell_response(File.join(options.index, "_gateway/snapshot"), "-XPOST") ; end
+
+   def segments() shell_response(File.join(options.index, "_segments")) ; end
+
+   def mapping() shell_response(File.join(options.index, "_mapping")) ; end
+
+   # curl -s -XPUT http://host:port/index/_settings -d '{"index":{"number_of_replicas":num}}'
+   def set_replication() { "error" => "method not yet implemented" }; end
+
+   def search() shell_response(File.join(options.index, "_search?q=#{options.query}")) ; end
+
+   def obj_types() mapping[options.index].keys ; end
+
+   def require_confirmation!(meth, *args)
+     print "#{meth.capitalize} method with args #{args} requires confirmation! [yN]? "
+     response = STDIN.gets.chomp
+     if response =~ /y/i
+       puts "#{meth.capitalize} method with args #{args} confirmed!"
+     else
+       puts "#{meth.capitalize} method with args #{args} cancelled!"
+       exit
+     end
+   end
+
+   def method_missing meth, *args
+     puts "invalid command: #{meth}", options.usage
+     exit
+   end
+
+ end
+
+ command = ARGV.first or abort("Please specify a command.\n\n#{options.usage}")
+ ESTool.new(options).display(command)
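Taken together, the commands and options above compose into one-line invocations like the following (host and index names here are illustrative):

    # Cluster health as raw JSON, for piping into another program:
    estool health --host es-query-0 --raw
    # Merge the named index down to three segments:
    estool optimize --index tweet-2010q1 --segments 3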
data/bin/estrus.rb
@@ -0,0 +1,136 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'rubberband'
+ require 'fileutils'
+ require 'configliere'
+ Settings.use :commandline, :env_var
+
+ #
+ # Estrus -- an alluringly primitive Elasticsearch stress-testing tool
+ #
+ # Example usage:
+ #
+ #   ~/ics/backend/wonderdog/ruby/estrus.rb --queries=100 --output_dir=~/ics/backend/estrus_data
+ #
+ # Output:
+ #
+ #   idx  datetime  secs  msec/query  hits  shards_successful  index  nodename  query_term
+ #
+ # Setup:
+ #
+ #   sudo apt-get install -y libcurl4-dev wamerican-large
+ #   sudo gem install rubberband configliere
+
+ # ,tweet-2010q1
+ Settings.define :words_file,   :default => "/usr/share/dict/words", :description => "Flat file with words to use"
+ Settings.define :offset_start, :default => 50_000, :description => "Where to start reading words", :type => Integer
+ Settings.define :offset_scale, :default => 100,    :description => "How far into the file to range", :type => Integer
+ Settings.define :queries,      :default => 10,     :description => "Number of queries to run", :type => Integer
+ Settings.define :es_indexes,   :default => 'tweet-2009q3pre,tweet-2009q4,tweet-2010q1,tweet-201004,tweet-201005,tweet-201005,tweet-201006,tweet-201007,tweet-201008,tweet-201009,tweet-201010,tweet-201011', :description => "Elasticsearch indices to query against", :type => Array
+ Settings.define :output_dir,   :default => nil,    :description => "If given, the output is directed to a file named :output_dir/{datehr}/es-{datetime}-{nodename}-{comment}.tsv"
+ Settings.define :comment,      :default => nil,    :description => "If given, it is included in the filename"
+ Settings.define :host,         :default => `hostname`.chomp, :description => "Host of ES query server"
+ Settings.define :port,         :default => '9200', :description => "Port for ES query server"
+ Settings.resolve!
+
+ NODENAME = File.read('/etc/node_name').chomp rescue `hostname`.chomp
+ CLIENTS  = Settings.es_indexes.inject([]){|clients, index| clients << [index, ElasticSearch.new("#{Settings.host}:#{Settings.port}", :index => index, :type => "tweet")] ; clients }
+
+ class StressTester
+   attr_accessor :started_at
+   def initialize
+     self.started_at = Time.now.utc
+   end
+
+   def words_file &block
+     File.open(Settings.words_file, &block)
+   end
+
+   def random_offset
+     Settings.offset_start + rand(1000)*Settings.offset_scale rescue nil
+   end
+
+   def output_file
+     return @output_file if @output_file
+     return $stdout if Settings.output_dir.to_s.empty?
+     datehr   = started_at.strftime("%Y%m%d%H")
+     datetime = started_at.to_flat
+     output_filename = File.expand_path(File.join(Settings.output_dir, datehr,
+         ["es", datetime, NODENAME, Settings.comment].compact.join('-')+".tsv"))
+     FileUtils.mkdir_p(File.dirname(output_filename))
+     @output_file = File.open(output_filename, "a")
+   end
+
+   def dump *args
+     output_file << args.join("\t")+"\n"
+   end
+
+   def each_word &block
+     words_file do |words_file|
+       random_offset.times{ words_file.readline }
+       loop do
+         word = words_file.readline.chomp rescue nil
+         break unless word
+         next if word =~ /\W/
+         yield word
+       end
+     end
+   end
+ end
+
+ class Time  ; def to_flat() strftime("%Y%m%d%H%M%S"); end ; end
+ class Array ; def random() self[rand(length)] ; end ; end
+
+ tester = StressTester.new
+ n_queries_executed = 0
+ tester.each_word do |query_string|
+   index, client = CLIENTS.random
+
+   result  = client.search "text:#{query_string}"
+   elapsed = Time.now.utc - tester.started_at
+   n_queries_executed += 1
+   tester.dump(
+     n_queries_executed, Time.now.utc.to_flat, "%7.1f"%elapsed,
+     "%7.1f"%( 1000 * elapsed / n_queries_executed.to_f ),
+     result.total_entries, result._shards['successful'],
+     index, NODENAME,
+     query_string)
+   $stderr.puts(n_queries_executed) if n_queries_executed % 20 == 0
+   break if n_queries_executed >= Settings.queries
+ end
+
+ # query_string = 'verizon'
+ # CLIENTS.each do |index,client|
+ #   result  = client.search "text:#{query_string}"
+ #   elapsed = Time.now.utc - tester.started_at
+ #   n_queries_executed += 1
+ #   tester.dump(
+ #     n_queries_executed, Time.now.utc.to_flat, "%7.1f"%elapsed,
+ #     "%7.1f"%( 1000 * elapsed / n_queries_executed.to_f ),
+ #     result.total_entries, result._shards['successful'],
+ #     index, NODENAME,
+ #     query_string)
+ # end
+
+ #
+ # TODO: monkeypatch rubberband to use keepalives:
+ #
+ #   def connect!
+ #     unless defined?(@@patron_session)
+ #       @@patron_session = Patron::Session.new
+ #       @session = @@patron_session
+ #       @session.base_url = @server
+ #       @session.timeout  = @options[:timeout]
+ #       @session.headers['User-Agent'] = 'ElasticSearch.rb v0.1'
+ #       @session.headers['Connection'] = 'Keep-Alive'
+ #     else
+ #       @session = @@patron_session
+ #     end
+ #     @request_count = 1
+ #   end
+
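Since each row repeats the cumulative run statistics, the last line of a finished run summarizes it. A quick sketch for inspecting a results file (the file name here is illustrative):

    # Final cumulative msec/query is column 4 of the last row:
    tail -1 es-20101115031500-node1-run1.tsv | cut -f4
    # Mean hits per query across the run (hits are column 5):
    awk -F'\t' '{ hits += $5 } END { print hits / NR }' es-20101115031500-node1-run1.tsv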
data/bin/wonderdog
@@ -0,0 +1,93 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
+
+ Settings.define :index_name,     :required => true,   :description => "Index to write data to"
+ Settings.define :object_type,    :default => "tweet", :description => "Type of object we're indexing"
+ Settings.define :field_names,    :default => "rsrc,tweet_id,created_at,user_id,screen_name,search_id,in_reply_to_user_id,in_reply_to_screen_name,in_reply_to_search_id,in_reply_to_status_id,text,source,lang,lat,lng,retweeted_count,rt_of_user_id,rt_of_screen_name,rt_of_tweet_id,contributors", :description => "Comma-separated list of field names"
+ Settings.define :id_field,       :default => "1",     :description => "Index of field to use as object id (counting from 0; default 1); use -1 if no id field"
+ Settings.define :bulk_size,      :default => "1000",  :description => "Number of records per bulk request"
+ Settings.define :es_home,        :default => "/usr/local/share/elasticsearch", :description => "Path to elasticsearch installation", :env_var => "ES_HOME"
+ Settings.define :es_config,      :default => "/etc/elasticsearch/elasticsearch.yml", :description => "Path to elasticsearch config"
+ Settings.define :rm,             :default => false,   :description => "Remove existing output?"
+ Settings.define :hadoop_home,    :default => "/usr/lib/hadoop", :description => "Path to hadoop installation", :env_var => "HADOOP_HOME"
+ Settings.define :min_split_size, :default => "5000000000", :description => "Min split size for maps"
+ Settings.define :test_outputfmt, :default => false,   :description => "Use this flag to run a job that tests the ElasticSearchOutputFormat"
+ Settings.resolve!
+
+ raise "No input file specified."  if Settings.rest.empty?
+ raise "No output file specified." if Settings.rest.length < 2
+
+ class Wonderdog
+   attr_accessor :options
+   def initialize
+     @options = Settings.dup
+   end
+
+   def execute
+     output = options.rest.last
+     remove_output(output) if options.rm
+     system %Q{ echo #{hdp_cmd} }
+     system %Q{ #{hdp_cmd} }
+   end
+
+   def hdp_cmd
+     [
+       "HADOOP_CLASSPATH=#{hadoop_classpath}",
+       "#{options.hadoop_home}/bin/hadoop jar #{run_jar}",
+       mainclass,
+       "-Dmapred.map.tasks.speculative.execution=false",
+       "-Dmapred.min.split.size=#{options.min_split_size}",
+       "-Dwonderdog.index.name=#{options.index_name}",
+       "-Dwonderdog.object.type=#{options.object_type}",
+       "-Dwonderdog.id.field=#{options.id_field}",
+       "-Dwonderdog.field.names=#{options.field_names}",
+       "-Dwonderdog.bulk.size=#{options.bulk_size}",
+       "-Dwonderdog.config=#{options.es_config}",
+       "-Dwonderdog.plugins.dir=#{options.es_home}/plugins",
+       "-libjars #{libjars}",
+       "#{options.rest.first}",
+       "#{options.rest.last}"
+     ].flatten.compact.join(" \t\\\n ")
+   end
+
+   def mainclass
+     return "com.infochimps.elasticsearch.ElasticTest" if Settings.test_outputfmt
+     "com.infochimps.elasticsearch.wonderdog.WonderDog"
+   end
+
+   def hadoop_classpath
+     cp = ["."]
+     Dir[
+       "/etc/elasticsearch/elasticsearch.yml",
+       "#{options.es_home}/plugins/*/*.jar",
+       "#{options.es_home}/lib/*.jar",
+       "#{options.es_home}/lib/sigar/*.jar"
+     ].each{|jar| cp << jar}
+     cp.join(':')
+   end
+
+   def run_jar
+     File.dirname(File.expand_path(__FILE__))+'/../build/wonderdog.jar'
+   end
+
+   def libjars
+     libjars = []
+     Dir[
+       "/etc/elasticsearch/elasticsearch.yml",
+       "#{options.es_home}/plugins/*/*.jar",
+       "#{options.es_home}/lib/*.jar"
+     ].each{|jar| libjars << jar}
+     libjars.join(',')
+   end
+
+   def remove_output output
+     system %Q{ hdp-rm -r #{output} }
+   end
+
+ end
+
+ runner = Wonderdog.new
+ runner.execute
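Since execute echoes the assembled command before running it, the launch is easy to inspect; with the defaults above it comes out roughly as follows (the jar path, classpath elisions, HDFS paths, and my_index are all illustrative stand-ins):

    HADOOP_CLASSPATH=.:/etc/elasticsearch/elasticsearch.yml:... \
    /usr/lib/hadoop/bin/hadoop jar .../build/wonderdog.jar \
      com.infochimps.elasticsearch.wonderdog.WonderDog \
      -Dmapred.map.tasks.speculative.execution=false \
      -Dmapred.min.split.size=5000000000 \
      -Dwonderdog.index.name=my_index \
      -Dwonderdog.object.type=tweet \
      -Dwonderdog.id.field=1 \
      -Dwonderdog.field.names=rsrc,tweet_id,created_at,... \
      -Dwonderdog.bulk.size=1000 \
      -Dwonderdog.config=/etc/elasticsearch/elasticsearch.yml \
      -Dwonderdog.plugins.dir=/usr/local/share/elasticsearch/plugins \
      -libjars /etc/elasticsearch/elasticsearch.yml,... \
      input_path output_path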
data/config/elasticsearch-example.yml
@@ -0,0 +1,227 @@
+ #
+ # ElasticSearch config file
+ #
+
+ cluster:
+   name: hoolock
+
+ # http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/439afb06f3e85aa7/431a8543811d7848?lnk=gst&q=configuration#431a8543811d7848
+ routing:
+   allocation:
+     concurrent_recoveries: 1
+
+ # File paths
+ path:
+   home: /usr/local/share/elasticsearch
+   conf: /etc/elasticsearch
+   logs: /var/log/elasticsearch
+   # data: /mnt/elasticsearch/data
+   # work: /mnt/elasticsearch/work
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/node/
+ node:
+   # # node.data: is this a data esnode (stores, indexes data)? default true
+   data: true
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/http/
+ http:
+   # # http.enabled: is this a query esnode (has http interface, dispatches/gathers queries)? Default true
+   enabled: true
+   port: 9200-9300
+   max_content_length: 100mb
+
+ gateway:
+   # The gateway set on the node level will automatically control the index
+   # gateway to use. For example, if the fs gateway is used, then each index
+   # created on the node will automatically use its own respective index-level
+   # fs gateway. In this case, if an index should not persist its state, its
+   # gateway type should be explicitly set to none.
+   #
+   # Set gateway.type to one of: [none, local, fs, hadoop, s3]
+   #
+   type: local
+   #
+   # Recovery begins when recover_after_nodes are present and then either
+   # recover_after_time has passed *or* expected_nodes have shown up.
+   recover_after_nodes: 24
+   recover_after_time: 10m # 5m
+   expected_nodes: 24 # 2
+   #
+   # # use with type: s3
+   # s3:
+   #   bucket: infochimps-search
+
+ # http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/1f3001f43266879a/06d62ea3ceb4db30?lnk=gst&q=translog#06d62ea3ceb4db30
+ indices:
+   memory:
+     # Increase if you are bulk loading.
+     # A number ('512m') or percent ('10%'). You can set limits on a percentage
+     # with max_index_buffer_size and min_index_buffer_size. 10% by default.
+     index_buffer_size: 512m
+
+ cache:
+   memory:
+     # buffer_size: 100k
+     # cache_size: 50m
+     # direct: true
+     # warm_cache: false
+
+ index:
+   number_of_shards: 24
+   number_of_replicas: 0
+   translog:
+     # A shard is flushed to local disk (the lucene index is committed) once this
+     # number of operations accumulates in the translog. Defaults to 5000.
+     #
+     # If you have
+     flush_threshold: 200000 # 5000
+   merge:
+     policy:
+       # Determines how often segment indices are merged by index operation. With
+       # smaller values, less RAM is used while indexing, and searches on
+       # unoptimized indices are faster, but indexing speed is slower. With
+       # larger values, more RAM is used during indexing, and while searches on
+       # unoptimized indices are slower, indexing is faster. Thus larger values
+       # (greater than 10) are best for batch index creation, and smaller values
+       # (lower than 10) for indices that are interactively maintained. Defaults
+       # to 10.
+       merge_factor: 30
+       # Use the compound file format. If not set, it is controlled by the store
+       # actually used; this is because the compound format was created to reduce
+       # the number of open file handles when using file-based storage. The
+       # file-system-based stores default to true while others default to false.
+       # Even with file-system-based stores, consider increasing the number of
+       # open file handles and setting this to false for better performance.
+       use_compound_file: false
+       # A size setting type which sets the minimum size for the lowest level
+       # segments. Any segments below this size are considered to be on the same
+       # level (even if they vary drastically in size) and will be merged
+       # whenever there are mergeFactor of them. This effectively truncates the
+       # “long tail” of small segments that would otherwise be created into a
+       # single level. If you set this too large, it could greatly increase the
+       # merging cost during indexing (if you flush many small
+       # segments). Defaults to 1.6mb.
+       min_merge_size: 2.7mb
+       # Largest segment (by total byte size) that may be merged with other
+       # segments. Defaults to unbounded.
+       # max_merge_size:
+       # Largest segment (by document count) that may be merged with other
+       # segments. Defaults to unbounded.
+       # max_merge_docs:
+     scheduler:
+       max_thread_count: 64
+     # deletionpolicy: keep_only_last
+
+   engine:
+     robin:
+       # How often to schedule the refresh operation (the same one the Refresh
+       # API exposes, which enables near-real-time search). Default '1s'; set to
+       # -1 to disable automatic refresh (you must instead initiate refresh via
+       # the API).
+       refresh_interval: -1
+       # Set the interval between indexed terms. Large values cause less memory
+       # to be used by a reader / searcher, but slow random-access to
+       # terms. Small values cause more memory to be used by a reader / searcher,
+       # and speed random-access to terms. Defaults to 128.
+       term_index_interval: 1024
+
+   gateway:
+     # The index.gateway.snapshot_interval is a time setting allowing to
+     # configure the interval at which snapshotting of the index shard to the
+     # gateway will take place. Note, only primary shards start this scheduled
+     # snapshotting process. It defaults to 10s, and can be disabled by setting
+     # it to -1.
+     snapshot_interval: -1
+     # When a primary shard is shut down explicitly (not relocated), the
+     # index.gateway.snapshot_on_close flag can control whether a gateway
+     # snapshot should be performed while shutting down. It defaults to true.
+     snapshot_on_close: true
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/node/network/
+ network:
+   bind_host: _local_
+   publish_host: _local_
+   #
+   # tcp:
+   #   no_delay: true
+   #   keep_alive: ~
+   #   reuse_address: true
+   #   send_buffer_size: ~
+   #   receive_buffer_size: ~
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/transport/
+ transport:
+   tcp:
+     port: 9300-9400
+     connect_timeout: 1s
+     # # enable lzf compression in esnode-esnode communication?
+     compress: false
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/jmx/
+ jmx:
+   # Create an RMI connector?
+   create_connector: true
+   port: 9400-9500
+   domain: elasticsearch
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/threadpool/
+ threadpool:
+   # #
+   # # threadpool.type should be one of [cached, scaling, blocking]:
+   # #
+   # # * Cached: An unbounded thread pool that reuses previously constructed threads.
+   # # * Scaling: A bounded thread pool that reuses previously created free threads.
+   # # * Blocking: A bounded thread pool that reuses previously created free
+   # #   threads. Pending requests block for an available thread (different than
+   # #   the scaling one, where the request is added to a queue and does not
+   # #   block).
+   # #
+   # type: cached
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/discovery/
+ discovery:
+   # set to 'zen' or 'ec2'
+   type: zen
+   zen:
+     ping:
+       multicast:
+         enabled: false
+       unicast:
+         hosts: 10.195.215.175:9300,10.243.57.219:9300,10.194.218.143:9300,10.204.223.175:9300,10.242.89.235:9300,10.212.226.127:9300
+     # There are two fault detection processes running. The first is by the
+     # master, to ping all the other nodes in the cluster and verify that they
+     # are alive. On the other end, each node pings the master to verify that it
+     # is still alive, or whether an election process needs to be initiated.
+     fd:
+       # How often a node gets pinged. Defaults to "1s".
+       ping_interval: 3s
+       # How long to wait for a ping response. Defaults to "30s".
+       ping_timeout: 10s
+       # How many ping failures / timeouts cause a node to be considered failed. Defaults to 3.
+       ping_retries: 3
+   #
+   # # ec2 discovery can cause big trouble with the hadoop loader:
+   # # discovery churn can hit API usage limits.
+   # # Be sure to set your cloud keys if you're using ec2.
+   # #
+   # ec2:
+   #   # security groups used for discovery
+   #   groups: hoolock-data_esnode
+   #   # require *all* (false) or *any* (true) of those groups?
+   #   any_group: true
+   #   # private_ip, public_ip, private_dns, public_dns
+   #   host_type: private_ip
+   #   availability_zones: us-east-1d
+
+ # Necessary if you will use either of
+ # * the ec2 discovery module, for finding peers
+ # * the s3 gateway module, for pushing indices to an s3 mirror.
+ # Read more: http://www.elasticsearch.com/docs/elasticsearch/cloud/
+ #
+ cloud:
+   aws:
+     access_key: <%= @aws['aws_access_key_id'] %>
+     secret_key: <%= @aws['aws_secret_access_key'] %>
+
+ # monitor.jvm: gc_threshold, interval, enabled
+ # thrift:
+ #   # port:
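Once a node comes up with this file, the effective settings can be spot-checked over the HTTP port configured above; a minimal sketch, assuming a node reachable on localhost:9200:

    curl -s http://localhost:9200/_cluster/health?pretty=true
    curl -s http://localhost:9200/_cluster/state?pretty=true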