wonderdog 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/bin/estool
@@ -0,0 +1,141 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'json'
+ require 'socket'
+ require 'ostruct'
+ require 'optparse'
+ require 'open3'
+ require 'rake'
+
+ options = OpenStruct.new
+ OptionParser.new do |opts|
+
+   opts.banner = <<EOS
+ Usage: estool <command> [options..]
+
+ Commands include:
+     status           Returns the status of INDEX
+     list             Returns a list of all indices
+     health           Returns the health of the shards
+     flush            Performs a full flush of the INDEX
+     create           Create the specified INDEX
+     delete           Delete the specified INDEX. Requires confirmation.
+     refresh          Refresh the specified INDEX
+     optimize         Optimizes the specified INDEX to (-s) number of segments
+     snapshot         Snapshots the specified INDEX to the gateway
+     segments         Returns the segment information. Requires ElasticSearch v
+     mapping
+     set_replication
+     search
+     obj_types
+
+ Options include:
+ EOS
+
+   options.host     = Socket.gethostname
+   options.port     = 9200
+   options.index    = "_all"
+   options.segments = 3
+   options.query    = "foo"
+   options.raw      = false
+   options.usage    = opts
+
+   opts.on('-c', '--host HOSTNAME', 'Connect to ElasticSearch on HOSTNAME', 'Defaults to the local hostname') do |host|
+     options.host = host
+   end
+
+   opts.on('-p', '--port PORT', 'Connect to ElasticSearch using PORT', 'Defaults to 9200') do |port|
+     options.port = port
+   end
+
+   opts.on('-i', '--index NAME', 'Name of index to query against', 'Defaults to _all') do |index|
+     options.index = index
+   end
+
+   opts.on('-s', '--segments INT', 'Number of segments to optimize to', 'Defaults to 3. Use with <optimize>') do |num|
+     options.segments = num
+   end
+
+   opts.on('-r', '--raw', 'Return raw JSON for parsing by another program') do
+     options.raw = true
+   end
+
+   opts.on('-q', '--query STRING', 'Query INDEX with STRING.', 'Defaults to foo. Use with <search>') do |str|
+     options.query = str
+   end
+
+   opts.on('-h', '--help', 'Display this screen and exit'){ puts opts ; exit }
+ end.parse!
+
+ class ESTool
+
+   attr_reader :options
+
+   def initialize(options)
+     @options = options
+   end
+
+   def connection() "http://#{options.host}:#{options.port}" ; end
+
+   def shell_response(cmd, req="-XGET")
+     url = File.join(connection, cmd)
+     Open3.popen3('curl', '-s', req, url){ |stdin, stdout, stderr, thread| JSON.parse(stdout.read, :max_nesting => 100) }
+   end
+
+   def display cmd
+     result  = self.send(cmd.to_sym)
+     display = options.raw ? result.to_json : JSON.pretty_generate(result, :max_nesting => 100)
+     puts display
+   end
+
+   def status() shell_response(File.join(options.index, "_status?")) ; end
+
+   def list() status["indices"].keys ; end
+
+   def health() shell_response("_cluster/health?") ; end
+
+   def flush() shell_response(File.join(options.index, "_flush?full=true")) ; end
+
+   def create() shell_response(options.index, "-XPUT") ; end
+
+   def delete()
+     require_confirmation!("delete", options.index)
+     shell_response(options.index, "-XDELETE")
+   end
+
+   def refresh() shell_response(File.join(options.index, "_refresh"), "-XPOST") ; end
+
+   def optimize() shell_response(File.join(options.index, "_optimize?max_num_segments=#{options.segments}"), "-XPOST") ; end
+
+   def snapshot() shell_response(File.join(options.index, "_gateway/snapshot"), "-XPOST") ; end
+
+   def segments() shell_response(File.join(options.index, "_segments")) ; end
+
+   def mapping() shell_response(File.join(options.index, "_mapping")) ; end
+
+   # curl -s -XPUT http://host:port/index/_settings -d '{"index":{"number_of_replicas":num}}'
+   def set_replication() { "error" => "method not yet implemented" } ; end
+
+   def search() shell_response(File.join(options.index, "_search?q=#{options.query}")) ; end
+
+   def obj_types() mapping[options.index].keys ; end
+
+   def require_confirmation!(meth, *args)
+     print "#{meth.capitalize} method with args #{args} requires confirmation! [yN]? "
+     response = STDIN.gets.chomp
+     if response =~ /y/i
+       print "#{meth.capitalize} method with args #{args} confirmed!"
+     else
+       print "#{meth.capitalize} method with args #{args} cancelled!"
+       exit
+     end
+   end
+
+   def method_missing meth, *args
+     puts "invalid command: #{meth}", options.usage
+     exit
+   end
+
+ end
+
+ command = ARGV.first
+ ESTool.new(options).display(command)
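For orientation, here is a minimal sketch (not part of the package) of the HTTP round trip estool's status command performs, using Ruby's standard library in place of shelling out to curl. The host, port, and index values are assumptions standing in for the script's defaults:

    require 'json'
    require 'net/http'

    # Same GET that `estool status -i _all` issues via curl (host/port assumed).
    host, port, index = 'localhost', 9200, '_all'
    body = Net::HTTP.get(URI("http://#{host}:#{port}/#{index}/_status?"))
    puts JSON.pretty_generate(JSON.parse(body, :max_nesting => 100))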
data/bin/estrus.rb
@@ -0,0 +1,136 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'rubberband'
+ require 'fileutils'
+ require 'configliere'
+ Settings.use :commandline, :env_var
+
+ #
+ # Estrus -- an alluringly primitive Elasticsearch stress-testing tool
+ #
+ # Example usage:
+ #
+ #   ~/ics/backend/wonderdog/ruby/estrus.rb --queries=100 --output_dir=~/ics/backend/estrus_data
+ #
+ # Output:
+ #
+ #   idx  datetime  secs  msec/query  hits  shards_successful  index  nodename  query_term
+ #
+ # Setup
+ #
+ #   sudo apt-get install -y libcurl4-dev wamerican-large
+ #   sudo gem install rubberband configliere
+
+ # ,tweet-2010q1
+ Settings.define :words_file,   :default => "/usr/share/dict/words", :description => "Flat file with words to use"
+ Settings.define :offset_start, :default => 50_000, :description => "Where to start reading words", :type => Integer
+ Settings.define :offset_scale, :default => 100,    :description => "How far in the file to range", :type => Integer
+ Settings.define :queries,      :default => 10,     :description => "Number of queries to run", :type => Integer
+ Settings.define :es_indexes,   :default => 'tweet-2009q3pre,tweet-2009q4,tweet-2010q1,tweet-201004,tweet-201005,tweet-201005,tweet-201006,tweet-201007,tweet-201008,tweet-201009,tweet-201010,tweet-201011', :description => "Elasticsearch index to query against", :type => Array
+ Settings.define :output_dir,   :default => nil, :description => "If given, the output is directed to a file named :output_dir/{date}/es-{datetime}-{comment}-{hostname}.tsv"
+ Settings.define :comment,      :default => nil, :description => "If given, it is included in the filename"
+ Settings.define :host,         :default => `hostname`.chomp, :description => "Host of ES query server"
+ Settings.define :port,         :default => '9200', :description => "Port for ES query server"
+ Settings.resolve!
+
+ NODENAME = File.read('/etc/node_name').chomp rescue `hostname`.chomp
+ CLIENTS  = Settings.es_indexes.inject([]){|clients, index| clients << [index, ElasticSearch.new("#{Settings.host}:#{Settings.port}", :index => index, :type => "tweet")] ; clients }
+
+ class StressTester
+   attr_accessor :started_at
+   def initialize
+     self.started_at = Time.now.utc
+   end
+
+   def words_file &block
+     File.open(Settings.words_file, &block)
+   end
+
+   def random_offset
+     Settings.offset_start + rand(1000)*Settings.offset_scale rescue nil
+   end
+
+   def output_file
+     return @output_file if @output_file
+     return $stdout      if Settings.output_dir.blank?
+     datehr   = started_at.strftime("%Y%m%d%H")
+     datetime = started_at.to_flat
+     output_filename = File.expand_path(File.join(Settings.output_dir, datehr,
+         ["es", datetime, NODENAME, Settings.comment].compact.join('-')+".tsv"))
+     FileUtils.mkdir_p(File.dirname(output_filename))
+     @output_file = File.open(output_filename, "a")
+   end
+
+   def dump *args
+     output_file << args.join("\t")+"\n"
+   end
+
+   def each_word &block
+     words_file do |words_file|
+       random_offset.times{ words_file.readline }
+       loop do
+         word = words_file.readline.chomp rescue nil
+         break unless word
+         next if word =~ /\W/
+         yield word
+       end
+     end
+   end
+ end
+
+ class Time  ; def to_flat() strftime("%Y%m%d%H%M%S") ; end ; end
+ class Array ; def random() self[rand(length)] ; end ; end
+
+ tester = StressTester.new
+ n_queries_executed = 0
+ tester.each_word do |query_string|
+   index, client = CLIENTS.random
+
+   result  = client.search "text:#{query_string}"
+   elapsed = Time.now.utc - tester.started_at
+   n_queries_executed += 1
+   tester.dump(
+     n_queries_executed, Time.now.utc.to_flat, "%7.1f"%elapsed,
+     "%7.1f"%( 1000 * elapsed / n_queries_executed.to_f ),
+     result.total_entries, result._shards['successful'],
+     index, NODENAME,
+     query_string)
+   $stderr.puts(n_queries_executed) if n_queries_executed % 20 == 0
+   break if n_queries_executed >= Settings.queries
+ end
+
+ # query_string = 'verizon'
+ # CLIENTS.each do |index,client|
+ #   result  = client.search "text:#{query_string}"
+ #   elapsed = Time.now.utc - tester.started_at
+ #   n_queries_executed += 1
+ #   tester.dump(
+ #     n_queries_executed, Time.now.utc.to_flat, "%7.1f"%elapsed,
+ #     "%7.1f"%( 1000 * elapsed / n_queries_executed.to_f ),
+ #     result.total_entries, result._shards['successful'],
+ #     index, NODENAME,
+ #     query_string)
+ #
+ # end
+
+ #
+ # TODO: monkeypatch rubberband to use keepalives:
+ #
+ # def connect!
+ #   unless defined?(@@patron_session)
+ #     @@patron_session = Patron::Session.new
+ #     @session = @@patron_session
+ #     @session.base_url = @server
+ #     @session.timeout  = @options[:timeout]
+ #     @session.headers['User-Agent'] = 'ElasticSearch.rb v0.1'
+ #     @session.headers['Connection'] = 'Keep-Alive'
+ #   else
+ #     @session = @@patron_session
+ #   end
+ #   @request_count = 1
+ # end
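The msec/query column estrus emits is a running average: cumulative wall-clock seconds since the tester started, divided by queries completed so far, scaled to milliseconds. A toy illustration with hypothetical numbers:

    elapsed            = 12.4   # cumulative seconds since StressTester#started_at
    n_queries_executed = 80
    puts "%7.1f" % (1000 * elapsed / n_queries_executed.to_f)   # => "  155.0"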
data/bin/wonderdog
@@ -0,0 +1,93 @@
+ #!/usr/bin/env ruby
+
+ require 'rubygems'
+ require 'configliere' ; Configliere.use(:commandline, :env_var, :define)
+
+ Settings.define :index_name,     :required => true,    :description => "Index to write data to"
+ Settings.define :object_type,    :default  => "tweet", :description => "Type of object we're indexing"
+ Settings.define :field_names,    :default  => "rsrc,tweet_id,created_at,user_id,screen_name,search_id,in_reply_to_user_id,in_reply_to_screen_name,in_reply_to_search_id,in_reply_to_status_id,text,source,lang,lat,lng,retweeted_count,rt_of_user_id,rt_of_screen_name,rt_of_tweet_id,contributors", :description => "Comma separated list of field names"
+ Settings.define :id_field,       :default  => "1",     :description => "Index of field to use as object id (counting from 0; default 1); use -1 if there is no id field"
+ Settings.define :bulk_size,      :default  => "1000",  :description => "Number of records per bulk request"
+ Settings.define :es_home,        :default  => "/usr/local/share/elasticsearch", :description => "Path to elasticsearch installation", :env_var => "ES_HOME"
+ Settings.define :es_config,      :default  => "/etc/elasticsearch/elasticsearch.yml", :description => "Path to elasticsearch config"
+ Settings.define :rm,             :default  => false,   :description => "Remove existing output?"
+ Settings.define :hadoop_home,    :default  => "/usr/lib/hadoop", :description => "Path to hadoop installation", :env_var => "HADOOP_HOME"
+ Settings.define :min_split_size, :default  => "5000000000", :description => "Min split size for maps"
+ Settings.define :test_outputfmt, :default  => false,   :description => "Use this flag to run a job that tests the ElasticSearchOutputFormat"
+ Settings.resolve!
+
+ raise "No input file specified."  if Settings.rest.first.blank?
+ raise "No output file specified." if Settings.rest.last.blank?
+
+ class Wonderdog
+   attr_accessor :options
+   def initialize
+     @options = Settings.dup
+   end
+
+   def execute
+     output = options.rest.last
+     remove_output(output) if options.rm
+     system %Q{ echo #{hdp_cmd} }
+     system %Q{ #{hdp_cmd} }
+   end
+
+   def hdp_cmd
+     [
+       "HADOOP_CLASSPATH=#{hadoop_classpath}",
+       "#{options.hadoop_home}/bin/hadoop jar #{run_jar}",
+       mainclass,
+       "-Dmapred.map.tasks.speculative.execution=false",
+       "-Dmapred.min.split.size=#{options.min_split_size}",
+       "-Dwonderdog.index.name=#{options.index_name}",
+       "-Dwonderdog.object.type=#{options.object_type}",
+       "-Dwonderdog.id.field=#{options.id_field}",
+       "-Dwonderdog.field.names=#{options.field_names}",
+       "-Dwonderdog.bulk.size=#{options.bulk_size}",
+       "-Dwonderdog.config=#{options.es_config}",
+       "-Dwonderdog.plugins.dir=#{options.es_home}/plugins",
+       "-libjars #{libjars}",
+       "#{options.rest.first}",
+       "#{options.rest.last}"
+     ].flatten.compact.join(" \t\\\n ")
+   end
+
+   def mainclass
+     return "com.infochimps.elasticsearch.ElasticTest" if Settings.test_outputfmt
+     "com.infochimps.elasticsearch.wonderdog.WonderDog"
+   end
+
+   def hadoop_classpath
+     cp = ["."]
+     Dir[
+       "/etc/elasticsearch/elasticsearch.yml",
+       "#{options.es_home}/plugins/*/*.jar",
+       "#{options.es_home}/lib/*.jar",
+       "#{options.es_home}/lib/sigar/*.jar"
+     ].each{|jar| cp << jar}
+     cp.join(':')
+   end
+
+   def run_jar
+     File.dirname(File.expand_path(__FILE__))+'/../build/wonderdog.jar'
+   end
+
+   def libjars
+     libjars = []
+     Dir[
+       "/etc/elasticsearch/elasticsearch.yml",
+       "#{options.es_home}/plugins/*/*.jar",
+       "#{options.es_home}/lib/*.jar"
+     ].each{|jar| libjars << jar}
+     libjars.join(',')
+   end
+
+   def remove_output output
+     system %Q{ hdp-rm -r #{output} }
+   end
+
+ end
+
+ runner = Wonderdog.new
+ runner.execute
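Because execute echoes hdp_cmd before running it, the assembled invocation is visible in the job logs. With the defaults above, it looks roughly like the following sketch; the index name, jar location, classpath entries, and HDFS paths here are hypothetical stand-ins:

    HADOOP_CLASSPATH=.:/etc/elasticsearch/elasticsearch.yml:... \
      /usr/lib/hadoop/bin/hadoop jar /path/to/wonderdog/build/wonderdog.jar \
      com.infochimps.elasticsearch.wonderdog.WonderDog \
      -Dmapred.map.tasks.speculative.execution=false \
      -Dmapred.min.split.size=5000000000 \
      -Dwonderdog.index.name=my_index \
      -Dwonderdog.object.type=tweet \
      ... \
      -libjars /etc/elasticsearch/elasticsearch.yml,... \
      /hdfs/input/path /hdfs/output/path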
data/config/elasticsearch-example.yml
@@ -0,0 +1,227 @@
+ #
+ # ElasticSearch config file
+ #
+
+ cluster:
+   name: hoolock
+
+   # http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/439afb06f3e85aa7/431a8543811d7848?lnk=gst&q=configuration#431a8543811d7848
+   routing:
+     allocation:
+       concurrent_recoveries: 1
+
+ # File paths
+ path:
+   home: /usr/local/share/elasticsearch
+   conf: /etc/elasticsearch
+   logs: /var/log/elasticsearch
+   # data: /mnt/elasticsearch/data
+   # work: /mnt/elasticsearch/work
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/node/
+ node:
+   # # node.data: is this a data esnode (stores, indexes data)? default true
+   data: true
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/http/
+ http:
+   # # http.enabled: is this a query esnode (has http interface, dispatches/gathers queries)? Default true
+   enabled: true
+   port: 9200-9300
+   max_content_length: 100mb
+
+ gateway:
+   # The gateway set on the node level automatically controls the index
+   # gateway to use. For example, if the fs gateway is used, then each index
+   # created on the node automatically uses its own respective index-level
+   # fs gateway. In this case, if an index should not persist its state, it
+   # should be explicitly set to none.
+   #
+   # Set gateway.type to one of: [none, local, fs, hadoop, s3]
+   #
+   type: local
+   #
+   # recovery begins when recover_after_nodes are present and then either
+   # recover_after_time has passed *or* expected_nodes have shown up.
+   recover_after_nodes: 24
+   recover_after_time:  10m # 5m
+   expected_nodes:      24  # 2
+   #
+   # # use with type: s3
+   # s3:
+   #   bucket: infochimps-search
+
+ # http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/1f3001f43266879a/06d62ea3ceb4db30?lnk=gst&q=translog#06d62ea3ceb4db30
+ indices:
+   memory:
+     # Increase if you are bulk loading.
+     # A number ('512m') or percent ('10%'). You can set limits on a percentage
+     # with max_index_buffer_size and min_index_buffer_size. 10% by default.
+     index_buffer_size: 512m
+
+   cache:
+     memory:
+       # buffer_size: 100k
+       # cache_size: 50m
+       # direct: true
+       # warm_cache: false
+
+ index:
+   number_of_shards:   24
+   number_of_replicas: 0
+   translog:
+     # A shard is flushed to local disk (the lucene index is committed) once this
+     # number of operations accumulate in the translog. Defaults to 5000.
+     #
+     # If you have
+     flush_threshold: 200000 # 5000
+   merge:
+     policy:
+       # Determines how often segment indices are merged by index operation. With
+       # smaller values, less RAM is used while indexing, and searches on
+       # unoptimized indices are faster, but indexing speed is slower. With
+       # larger values, more RAM is used during indexing, and while searches on
+       # unoptimized indices are slower, indexing is faster. Thus larger values
+       # (greater than 10) are best for batch index creation, and smaller values
+       # (lower than 10) for indices that are interactively maintained. Defaults
+       # to 10.
+       merge_factor: 30
+       # Use the compound file format. If not set, this is controlled by the
+       # actual store used; the compound format was created to reduce the number
+       # of open file handles when using file-based storage. The file-system-based
+       # stores default to true while others default to false. Even with
+       # file-system-based stores, consider increasing the number of open file
+       # handles and setting this to false for better performance.
+       use_compound_file: false
+       # A size setting type which sets the minimum size for the lowest level
+       # segments. Any segments below this size are considered to be on the same
+       # level (even if they vary drastically in size) and will be merged
+       # whenever there are mergeFactor of them. This effectively truncates the
+       # “long tail” of small segments that would otherwise be created into a
+       # single level. If you set this too large, it could greatly increase the
+       # merging cost during indexing (if you flush many small
+       # segments). Defaults to 1.6mb.
+       min_merge_size: 2.7mb
+       # Largest segment (by total byte size) that may be merged with other
+       # segments. Defaults to unbounded.
+       # max_merge_size:
+       # Largest segment (by document count) that may be merged with other
+       # segments. Defaults to unbounded.
+       # max_merge_docs:
+     scheduler:
+       max_thread_count: 64
+   # deletionpolicy: keep_only_last
+
+   engine:
+     robin:
+       # How often to schedule the refresh operation (the same one as the Refresh
+       # API, which enables near-real-time search). Default '1s'; set to -1 to
+       # disable automatic refresh (you must instead initiate refresh via API).
+       refresh_interval: -1
+       # Set the interval between indexed terms. Large values cause less memory
+       # to be used by a reader / searcher, but slow random access to
+       # terms. Small values cause more memory to be used by a reader / searcher,
+       # and speed random access to terms. Defaults to 128.
+       term_index_interval: 1024
+
+   gateway:
+     # The index.gateway.snapshot_interval is a time setting allowing you to
+     # configure the interval at which snapshotting of the index shard to the
+     # gateway will take place. Note, only primary shards start this scheduled
+     # snapshotting process. It defaults to 10s, and can be disabled by setting
+     # it to -1.
+     snapshot_interval: -1
+     # When a primary shard is shut down explicitly (not relocated), the
+     # index.gateway.snapshot_on_close flag controls whether a gateway snapshot
+     # should be performed while shutting down. It defaults to true.
+     snapshot_on_close: true
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/node/network/
+ network:
+   bind_host:    _local_
+   publish_host: _local_
+   #
+   # tcp:
+   #   no_delay: true
+   #   keep_alive: ~
+   #   reuse_address: true
+   #   send_buffer_size: ~
+   #   receive_buffer_size: ~
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/transport/
+ transport:
+   tcp:
+     port: 9300-9400
+     connect_timeout: 1s
+     # # enable lzf compression in esnode-esnode communication?
+     compress: false
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/jmx/
+ jmx:
+   # Create an RMI connector?
+   create_connector: true
+   port: 9400-9500
+   domain: elasticsearch
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/threadpool/
+ threadpool:
+   # #
+   # # threadpool.type should be one of [cached, scaling, blocking]:
+   # #
+   # # * Cached:   An unbounded thread pool that reuses previously constructed threads.
+   # # * Scaling:  A bounded thread pool that reuses previously created free threads.
+   # # * Blocking: A bounded thread pool that reuses previously created free
+   # #   threads. Pending requests block for an available thread (different from
+   # #   the scaling one, where the request is added to a queue and does not
+   # #   block).
+   # #
+   # type: cached
+
+ # http://www.elasticsearch.com/docs/elasticsearch/modules/discovery/
+ discovery:
+   # set to 'zen' or 'ec2'
+   type: zen
+   zen:
+     ping:
+       multicast:
+         enabled: false
+       unicast:
+         hosts: 10.195.215.175:9300,10.243.57.219:9300,10.194.218.143:9300,10.204.223.175:9300,10.242.89.235:9300,10.212.226.127:9300
+     # There are two fault detection processes running. The first is by the
+     # master, to ping all the other nodes in the cluster and verify that they
+     # are alive. And on the other end, each node pings the master to verify that
+     # it is still alive, or else an election process needs to be initiated.
+     fd:
+       # How often a node gets pinged. Defaults to "1s".
+       ping_interval: 3s
+       # How long to wait for a ping response. Defaults to "30s".
+       ping_timeout: 10s
+       # How many ping failures / timeouts cause a node to be considered failed. Defaults to 3.
+       ping_retries: 3
+   #
+   # # ec2 discovery can cause big trouble with the hadoop loader:
+   # # discovery churn can hit API usage limits.
+   # # Be sure to set your cloud keys if you're using ec2.
+   # #
+   # ec2:
+   #   # security groups used for discovery
+   #   groups: hoolock-data_esnode
+   #   # require *all* (false) or *any* (true) of those groups?
+   #   any_group: true
+   #   # private_ip, public_ip, private_dns, public_dns
+   #   host_type: private_ip
+   #   availability_zones: us-east-1d
+
+ # Necessary if you will use either of
+ # * the ec2 discovery module, for finding peers
+ # * the s3 gateway module, for pushing indices to an s3 mirror.
+ # Read more: http://www.elasticsearch.com/docs/elasticsearch/cloud/
+ #
+ cloud:
+   aws:
+     access_key: <%= @aws['aws_access_key_id'] %>
+     secret_key: <%= @aws['aws_secret_access_key'] %>
+
+ # monitor.jvm: gc_threshold, interval, enabled
+ # thrift:
+ #   # port:
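The <%= @aws[...] %> tags mean this file is an ERB template rather than literal YAML; something has to render it before ElasticSearch can read it. A minimal sketch of rendering it by hand, with placeholder credentials and an assumed output path:

    require 'erb'

    # Placeholder credentials; real values would come from the provisioning tool.
    @aws = { 'aws_access_key_id'     => 'AKIA-PLACEHOLDER',
             'aws_secret_access_key' => 'SECRET-PLACEHOLDER' }
    yaml = ERB.new(File.read('config/elasticsearch-example.yml')).result(binding)
    File.write('/etc/elasticsearch/elasticsearch.yml', yaml)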