wonderdog 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/bin/estool
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
#!/usr/bin/env ruby

# estool -- a thin command-line wrapper around the ElasticSearch REST API.
#
# Parses command-line flags into an OpenStruct of options; the first
# non-flag argument is the command, dispatched by ESTool below.

require 'rubygems'
require 'ostruct'   # FIX: OpenStruct is used below but was never required
require 'json'
require 'socket'
require 'optparse'
require 'open3'
require 'rake'

options = OpenStruct.new
OptionParser.new do |opts|

  opts.banner = <<EOS
Usage: estool <command> [options..]

Commands include:
  status          Returns the status of INDEX
  list            Returns a list of all indices
  health          Returns the health of the shards
  flush           Performs a full flush of the INDEX
  create          Create the specified INDEX
  delete          Delete the specified INDEX. Requires confirmation.
  refresh         Refresh the specified INDEX
  optimize        Optimizes the specified INDEX to (-s) number of segments
  snapshot        Snapshots the specified INDEX to the gateway
  segments        Returns the segment information. Requires ElasticSearch v
  mapping
  set_replication
  search
  obj_types

Options include:
EOS

  # Defaults; options.usage is stashed so ESTool#method_missing can print it.
  options.host     = Socket.gethostname
  options.port     = 9200
  options.index    = "_all"
  options.segments = 3
  options.query    = "foo"
  options.raw      = false
  options.usage    = opts

  opts.on('-c', '--host HOSTNAME', 'Connect to ElasticSearch on HOSTNAME', 'Defaults to localhost') do |host|
    options.host = host
  end

  opts.on('-p', '--port PORT', 'Connect to ElasticSearch using PORT', 'Defaults to 9200') do |port|
    options.port = port
  end

  opts.on('-i','--index NAME','Name of index to query against', 'Defaults to _all') do |index|
    options.index = index
  end

  opts.on('-s', '--segments INT', 'Number of segments to optimize to', 'Defaults to 3. Use with <optimize>') do |num|
    options.segments = num
  end

  opts.on('-r','--raw', 'Return raw JSON for parsing by another program') do
    options.raw = true
  end

  opts.on('-q', '--query STRING', 'Query INDEX with STRING.', 'Defaults to foo. Use with <search>') do |str|
    options.query = str
  end

  opts.on('-h', '--help', 'Display this screen and exit'){ puts opts ; exit }
end.parse!
|
68
|
+
|
69
|
+
# Dispatches estool commands as curl requests against an ElasticSearch
# node, parsing each JSON response. Unknown commands fall through to
# method_missing, which prints usage and exits.
class ESTool

  # @options [OpenStruct] carries host, port, index, segments, query, raw, usage
  attr_reader :options

  def initialize(options)
    @options = options
  end

  # Base URL of the ElasticSearch HTTP endpoint.
  def connection() "http://#{options.host}:#{options.port}" ; end

  # Shells out to curl (quietly) and parses the JSON body of the response.
  # req is the curl method flag, e.g. "-XGET" (default), "-XPUT", "-XPOST".
  def shell_response(cmd, req="-XGET")
    url = File.join(connection, cmd)
    Open3.popen3('curl','-s',req, url){ |stdin, stdout, stderr, thread| JSON.parse(stdout.read, :max_nesting => 100) }
  end

  # Runs the named command and prints its result: raw JSON with --raw,
  # pretty-printed otherwise.
  def display cmd
    result = self.send(cmd.to_sym)
    display = options.raw ? result.to_json : JSON.pretty_generate(result, :max_nesting => 100)
    puts display
  end

  def status() shell_response(File.join(options.index, "_status?")) ; end

  def list() status["indices"].keys ; end

  def health() shell_response("_cluster/health?") ; end

  def flush() shell_response(File.join(options.index, "_flush?full=true")) ; end

  def create() shell_response(options.index, "-XPUT") ; end

  # Destructive: asks on STDIN before deleting the index.
  def delete()
    require_confirmation!("delete", options.index)
    shell_response(options.index, "-XDELETE")
  end

  def refresh() shell_response(File.join(options.index, "_refresh"), "-XPOST") ; end

  # FIX: the parameter was misspelled "max_num_segements", so ElasticSearch
  # ignored it and the -s flag had no effect.
  def optimize() shell_response(File.join(options.index, "_optimize?max_num_segments=#{options.segments}"), "-XPOST") ; end

  def snapshot() shell_response(File.join(options.index, "_gateway/snapshot"), "-XPOST") ; end

  def segments() shell_response(File.join(options.index, "_segments")) ; end

  def mapping() shell_response(File.join(options.index, "_mapping")) ; end

  # curl -s -XPUT http://host:port/index/_settings -d '{"index":{"number_of_replicas":num}}'
  def set_replication() { "error" => "method not yet implemented" }; end

  def search() shell_response(File.join(options.index, "_search?q=#{options.query}")) ; end

  def obj_types() mapping[options.index].keys ; end

  # Prompts on STDIN; any answer not matching /y/i cancels and exits.
  def require_confirmation!(meth, *args)
    print "#{meth.capitalize} method with args #{args} requires confirmation! [yN]?"
    response = STDIN.gets.chomp
    if response =~ /y/i
      print "#{meth.capitalize} method with args #{args} confirmed!"
    else
      print "#{meth.capitalize} method with args #{args} cancelled!"
      exit
    end
  end

  # Unknown commands print a message plus usage and exit (no exception).
  def method_missing meth, *args
    puts "invalid command: #{meth}", options.usage
    exit
  end

end
|
139
|
+
|
140
|
+
command = ARGV.first
# FIX: with no command, `display(nil)` crashed on `nil.to_sym`; print usage instead.
if command.nil?
  puts options.usage
  exit 1
end
ESTool.new(options).display(command)
|
data/bin/estrus.rb
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'rubygems'
require 'rubberband'
require 'fileutils'
require 'configliere'

Settings.use :commandline, :env_var

#
# Estrus -- an alluringly primitive Elasticsearch stress-testing tool
#
# Example usage:
#
#   ~/ics/backend/wonderdog/ruby/estrus.rb --queries=100 --output_dir=~/ics/backend/estrus_data
#
# Output columns:
#
#   idx  datetime  secs  msec/query  hits  shards_successful  index  nodename  query_term
#
# Setup:
#
#   sudo apt-get install -y libcurl4-dev wamerican-large
#   sudo gem install rubberband configliere
#

# ,tweet-2010q1
Settings.define :words_file,   :default => "/usr/share/dict/words", :description => "Flat file with words to use"
Settings.define :offset_start, :default => 50_000, :description => "Where to start reading words", :type => Integer
Settings.define :offset_scale, :default => 100,    :description => "How far in the file to range", :type => Integer
Settings.define :queries,      :default => 10,     :description => "Number of queries to run",     :type => Integer
Settings.define :es_indexes,   :default => 'tweet-2009q3pre,tweet-2009q4,tweet-2010q1,tweet-201004,tweet-201005,tweet-201005,tweet-201006,tweet-201007,tweet-201008,tweet-201009,tweet-201010,tweet-201011', :description => "Elasticsearch index to query against", :type => Array
Settings.define :output_dir,   :default => nil, :description => "If given, the output is directed to a file named :output_dir/{date}/es-{datetime}-{comment}-{hostname}.tsv"
Settings.define :comment,      :default => nil, :description => "If given, it is included in the filename"
Settings.define :host,         :default => `hostname`.chomp, :description => "Host of ES query server"
Settings.define :port,         :default => '9200', :description => "Port for ES query server"
Settings.resolve!

# Node name: prefer /etc/node_name, falling back to the shell hostname.
NODENAME = File.read('/etc/node_name').chomp rescue `hostname`.chomp

# One rubberband client per index; entries are [index_name, client] pairs.
# NOTE(review): 'tweet-201005' appears twice in the default list -- confirm intentional.
CLIENTS = Settings.es_indexes.inject([]){|clients, index| clients << [index, ElasticSearch.new("#{Settings.host}:#{Settings.port}", :index => index, :type => "tweet")] ; clients }
|
39
|
+
|
40
|
+
# Drives the stress test: walks words out of a dictionary file and records
# one timing row per query, either to $stdout or to a dated .tsv file.
class StressTester
  attr_accessor :started_at

  def initialize
    self.started_at = Time.now.utc
  end

  # Opens the configured dictionary file, yielding the handle to the block.
  def words_file(&block)
    File.open(Settings.words_file, &block)
  end

  # Random number of lines to skip before reading words; nil on error.
  def random_offset
    Settings.offset_start + rand(1000) * Settings.offset_scale rescue nil
  end

  # Lazily opens (and memoizes) the destination for result rows.
  # Falls back to $stdout when no --output_dir was given.
  def output_file
    return @output_file if @output_file
    return $stdout if Settings.output_dir.blank?
    hour_stamp = started_at.strftime("%Y%m%d%H")
    full_stamp = started_at.to_flat
    basename   = ["es", full_stamp, NODENAME, Settings.comment].compact.join('-') + ".tsv"
    path       = File.expand_path(File.join(Settings.output_dir, hour_stamp, basename))
    FileUtils.mkdir_p(File.dirname(path))
    @output_file = File.open(path, "a")
  end

  # Appends one tab-separated row to the output.
  def dump(*fields)
    output_file << fields.join("\t") + "\n"
  end

  # Skips a random number of lines, then yields each subsequent line that
  # contains only word characters, stopping at end of file.
  def each_word
    words_file do |fh|
      random_offset.times { fh.readline }
      loop do
        word = fh.readline.chomp rescue nil
        break unless word
        next if word =~ /\W/
        yield word
      end
    end
  end
end
|
81
|
+
|
82
|
+
# Compact timestamp used in filenames and log rows, e.g. "20101123045959".
class Time
  def to_flat
    strftime("%Y%m%d%H%M%S")
  end
end

# Uniformly-random element of the array.
class Array
  def random
    self[rand(length)]
  end
end
|
84
|
+
|
85
|
+
# Main loop: one query per dictionary word against a randomly chosen index,
# logging a timing row each time, until --queries queries have run.
tester      = StressTester.new
query_count = 0
tester.each_word do |term|
  index_name, client = CLIENTS.random

  result  = client.search "text:#{term}"
  elapsed = Time.now.utc - tester.started_at
  query_count += 1
  tester.dump(query_count, Time.now.utc.to_flat, "%7.1f" % elapsed,
              "%7.1f" % (1000 * elapsed / query_count.to_f),
              result.total_entries, result._shards['successful'],
              index_name, NODENAME,
              term)
  # Progress tick on stderr every 20 queries.
  $stderr.puts(query_count) if (query_count % 20).zero?
  break if query_count >= Settings.queries
end
|
102
|
+
|
103
|
+
# query_string = 'verizon'
|
104
|
+
# CLIENTS.each do |index,client|
|
105
|
+
# result = client.search "text:#{query_string}"
|
106
|
+
# elapsed = Time.now.utc - tester.started_at
|
107
|
+
# n_queries_executed += 1
|
108
|
+
# tester.dump(
|
109
|
+
# n_queries_executed, Time.now.utc.to_flat, "%7.1f"%elapsed,
|
110
|
+
# "%7.1f"%( 1000 * elapsed / n_queries_executed.to_f ),
|
111
|
+
# result.total_entries, result._shards['successful'],
|
112
|
+
# index, NODENAME,
|
113
|
+
# query_string)
|
114
|
+
#
|
115
|
+
# end
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
#
|
121
|
+
# TODO: monkeypatch rubberband to use keepalives:
|
122
|
+
#
|
123
|
+
# def connect!
|
124
|
+
# unless defined?(@@patron_session)
|
125
|
+
# @@patron_session = Patron::Session.new
|
126
|
+
# @session = @@patron_session
|
127
|
+
# @session.base_url = @server
|
128
|
+
# @session.timeout = @options[:timeout]
|
129
|
+
# @session.headers['User-Agent'] = 'ElasticSearch.rb v0.1'
|
130
|
+
# @session.headers['Connection'] = 'Keep-Alive'
|
131
|
+
# else
|
132
|
+
# @session = @@patron_session
|
133
|
+
# end
|
134
|
+
# @request_count = 1
|
135
|
+
# end
|
136
|
+
|
data/bin/wonderdog
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env ruby

require 'rubygems'
require 'configliere' ; Configliere.use(:commandline, :env_var, :define)

# Command-line settings for driving the wonderdog Hadoop bulk-load job.
Settings.define :index_name,     :required => true, :description => "Index to write data to"
Settings.define :object_type,    :default => "tweet", :description => "Type of object we're indexing"
Settings.define :field_names,    :default => "rsrc,tweet_id,created_at,user_id,screen_name,search_id,in_reply_to_user_id,in_reply_to_screen_name,in_reply_to_search_id,in_reply_to_status_id,text,source,lang,lat,lng,retweeted_count,rt_of_user_id,rt_of_screen_name,rt_of_tweet_id,contributors", :description => "Comma separated list of field names"
Settings.define :id_field,       :default => "1", :description => "Index of field to use as object id (counting from 0; default 1), use -1 if no id field"
Settings.define :bulk_size,      :default => "1000", :description => "Number of records per bulk request"
Settings.define :es_home,        :default => "/usr/local/share/elasticsearch", :description => "Path to elasticsearch installation", :env_var => "ES_HOME"
Settings.define :es_config,      :default => "/etc/elasticsearch/elasticsearch.yml", :description => "Path to elasticsearch config"
Settings.define :rm,             :default => false, :description => "Remove existing output?"
Settings.define :hadoop_home,    :default => "/usr/lib/hadoop", :description => "Path to hadoop installation", :env_var => "HADOOP_HOME"
Settings.define :min_split_size, :default => "5000000000", :description => "Min split size for maps"
Settings.define :test_outputfmt, :default => false, :description => "Use this flag to run job that test the ElasticSearchOutputFormat"
Settings.resolve!

# The first and last bare arguments are the job's input and output paths.
raise "No input file specified." if Settings.rest.first.blank?
raise "No output file specified." if Settings.rest.last.blank?
|
22
|
+
|
23
|
+
# Assembles and launches the `hadoop jar` invocation that bulk-loads data
# into ElasticSearch via the wonderdog jar.
class Wonderdog
  attr_accessor :options

  def initialize
    @options = Settings.dup
  end

  # Echoes the full command line, then runs it.
  def execute
    output = options.rest.last
    remove_output(output) if options.rm
    system %Q{ echo #{hdp_cmd} }
    system %Q{ #{hdp_cmd} }
  end

  # The complete hadoop command, pieces joined with escaped line breaks.
  def hdp_cmd
    pieces = [
      "HADOOP_CLASSPATH=#{hadoop_classpath}",
      "#{options.hadoop_home}/bin/hadoop jar #{run_jar}",
      mainclass,
      "-Dmapred.map.tasks.speculative.execution=false",
      "-Dmapred.min.split.size=#{options.min_split_size}",
      "-Dwonderdog.index.name=#{options.index_name}",
      "-Dwonderdog.object.type=#{options.object_type}",
      "-Dwonderdog.id.field=#{options.id_field}",
      "-Dwonderdog.field.names=#{options.field_names}",
      "-Dwonderdog.bulk.size=#{options.bulk_size}",
      "-Dwonderdog.config=#{options.es_config}",
      "-Dwonderdog.plugins.dir=#{options.es_home}/plugins",
      "-libjars #{libjars}",
      "#{options.rest.first}",
      "#{options.rest.last}"
    ]
    pieces.flatten.compact.join(" \t\\\n ")
  end

  # Main class to run: the output-format test harness, or the real loader.
  def mainclass
    return "com.infochimps.elasticsearch.ElasticTest" if Settings.test_outputfmt
    "com.infochimps.elasticsearch.wonderdog.WonderDog"
  end

  # "." plus the ES config file and every plugin/lib/sigar jar, colon-joined.
  def hadoop_classpath
    entries = ["."]
    entries.concat(Dir[
      "/etc/elasticsearch/elasticsearch.yml",
      "#{options.es_home}/plugins/*/*.jar",
      "#{options.es_home}/lib/*.jar",
      "#{options.es_home}/lib/sigar/*.jar"
    ])
    entries.join(':')
  end

  # The jar is expected next to this script, at ../build/wonderdog.jar.
  def run_jar
    File.dirname(File.expand_path(__FILE__)) + '/../build/wonderdog.jar'
  end

  # Comma-separated jar list handed to hadoop's -libjars flag.
  def libjars
    Dir[
      "/etc/elasticsearch/elasticsearch.yml",
      "#{options.es_home}/plugins/*/*.jar",
      "#{options.es_home}/lib/*.jar"
    ].join(',')
  end

  # Recursively removes a previous output path with hdp-rm.
  def remove_output(output)
    system %Q{ hdp-rm -r #{output} }
  end

end
|
91
|
+
|
92
|
+
# Kick off the job.
Wonderdog.new.execute
|
@@ -0,0 +1,227 @@
|
|
1
|
+
#
|
2
|
+
# ElasticSearch config file
|
3
|
+
#
|
4
|
+
|
5
|
+
cluster:
|
6
|
+
name: hoolock
|
7
|
+
|
8
|
+
# http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/439afb06f3e85aa7/431a8543811d7848?lnk=gst&q=configuration#431a8543811d7848
|
9
|
+
routing:
|
10
|
+
allocation:
|
11
|
+
concurrent_recoveries: 1
|
12
|
+
|
13
|
+
# File paths
|
14
|
+
path:
|
15
|
+
home: /usr/local/share/elasticsearch
|
16
|
+
conf: /etc/elasticsearch
|
17
|
+
logs: /var/log/elasticsearch
|
18
|
+
# data: /mnt/elasticsearch/data
|
19
|
+
# work: /mnt/elasticsearch/work
|
20
|
+
|
21
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/node/
|
22
|
+
node:
|
23
|
+
# # node.data: is this a data esnode (stores, indexes data)? default true
|
24
|
+
data: true
|
25
|
+
|
26
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/http/
|
27
|
+
http:
|
28
|
+
# # http.enabled: is this a query esnode (has http interface, dispatches/gathers queries)? Default true
|
29
|
+
enabled: true
|
30
|
+
port: 9200-9300
|
31
|
+
max_content_length: 100mb
|
32
|
+
|
33
|
+
gateway:
|
34
|
+
# The gateway set on the node level will automatically control the index
|
35
|
+
# gateway to use. For example, if the fs gateway is used, then automatically,
|
36
|
+
# each index created on the node will also use its own respective index level
|
37
|
+
# fs gateway. In this case, if in an index should not persist its state, it
|
38
|
+
# should be explicitly set to none.
|
39
|
+
#
|
40
|
+
# Set gateway.type to one of: [none, local, fs, hadoop, s3]
|
41
|
+
#
|
42
|
+
type: local
|
43
|
+
#
|
44
|
+
# recovery begins when recover_after_nodes are present and then either
|
45
|
+
# recover_after_time has passed *or* expected_nodes have shown up.
|
46
|
+
recover_after_nodes: 24
|
47
|
+
recover_after_time: 10m # 5m (key fixed: was misspelled "recovery_after_time", which ES ignores)
|
48
|
+
expected_nodes: 24 # 2
|
49
|
+
#
|
50
|
+
# # use with type: s3
|
51
|
+
# s3:
|
52
|
+
# bucket: infochimps-search
|
53
|
+
|
54
|
+
# http://groups.google.com/a/elasticsearch.com/group/users/browse_thread/thread/1f3001f43266879a/06d62ea3ceb4db30?lnk=gst&q=translog#06d62ea3ceb4db30
|
55
|
+
indices:
|
56
|
+
memory:
|
57
|
+
# Increase if you are bulk loading
|
58
|
+
# A number ('512m') or percent ('10%'). You can set limits on a percentage
|
59
|
+
# with max_index_buffer_size and min_index_buffer_size. 10% by default.
|
60
|
+
index_buffer_size: 512m
|
61
|
+
|
62
|
+
cache:
|
63
|
+
memory:
|
64
|
+
# buffer_size: 100k
|
65
|
+
# cache_size: 50m
|
66
|
+
# direct: true
|
67
|
+
# warm_cache: false
|
68
|
+
|
69
|
+
index:
|
70
|
+
number_of_shards: 24
|
71
|
+
number_of_replicas: 0
|
72
|
+
translog:
|
73
|
+
# A shard is flushed to local disk (the lucene index is committed) once this
|
74
|
+
# number of operations accumulate in the translog. defaults to 5000
|
75
|
+
#
|
76
|
+
# If you have
|
77
|
+
flush_threshold: 200000 # 5000
|
78
|
+
merge:
|
79
|
+
policy:
|
80
|
+
# Determines how often segment indices are merged by index operation. With
|
81
|
+
# smaller values, less RAM is used while indexing, and searches on
|
82
|
+
# unoptimized indices are faster, but indexing speed is slower. With
|
83
|
+
# larger values, more RAM is used during indexing, and while searches on
|
84
|
+
# unoptimized indices are slower, indexing is faster. Thus larger values
|
85
|
+
# (greater than 10) are best for batch index creation, and smaller values
|
86
|
+
# (lower than 10) for indices that are interactively maintained. Defaults
|
87
|
+
# to 10.
|
88
|
+
merge_factor: 30
|
89
|
+
# Use the compound file format. If not set, controlled by the actually
|
90
|
+
# store used, this is because the compound format was created to reduce
|
91
|
+
# the number of open file handles when using file based storage. The file
|
92
|
+
# system based ones default to true which others default to false. Even
|
93
|
+
# with file system based ones, consider increasing the number of open file
|
94
|
+
# handles and setting this to false for better performance
|
95
|
+
use_compound_file: false
|
96
|
+
# A size setting type which sets the minimum size for the lowest level
|
97
|
+
# segments. Any segments below this size are considered to be on the same
|
98
|
+
# level (even if they vary drastically in size) and will be merged
|
99
|
+
# whenever there are mergeFactor of them. This effectively truncates the
|
100
|
+
# “long tail” of small segments that would otherwise be created into a
|
101
|
+
# single level. If you set this too large, it could greatly increase the
|
102
|
+
# merging cost during indexing (if you flush many small
|
103
|
+
# segments). Defaults to 1.6mb
|
104
|
+
min_merge_size: 2.7mb
|
105
|
+
# Largest segment (by total byte size) that may be merged with other
|
106
|
+
# segments. Defaults to unbounded.
|
107
|
+
# max_merge_size:
|
108
|
+
# Largest segment (by document count) that may be merged with other
|
109
|
+
# segments. Defaults to unbounded
|
110
|
+
# max_merge_docs
|
111
|
+
scheduler:
|
112
|
+
max_thread_count: 64
|
113
|
+
# deletionpolicy: keep_only_last
|
114
|
+
|
115
|
+
engine:
|
116
|
+
robin:
|
117
|
+
# How often to schedule the refresh operation (the same one the Refresh
|
118
|
+
# API, which enables near real time search). Default '1s'; set to -1 to
|
119
|
+
# disable automatic refresh (you must instead initiate refresh via API)
|
120
|
+
refresh_interval: -1
|
121
|
+
# Set the interval between indexed terms. Large values cause less memory
|
122
|
+
# to be used by a reader / searcher, but slow random-access to
|
123
|
+
# terms. Small values cause more memory to be used by a reader / searcher,
|
124
|
+
# and speed random-access to terms. Defaults to 128.
|
125
|
+
term_index_interval: 1024
|
126
|
+
|
127
|
+
gateway:
|
128
|
+
# The index.gateway.snapshot_interval is a time setting allowing to
|
129
|
+
# configure the interval at which snapshotting of the index shard to the
|
130
|
+
# gateway will take place. Note, only primary shards start this scheduled
|
131
|
+
# snapshotting process. It defaults to 10s, and can be disabled by setting
|
132
|
+
# it to -1.
|
133
|
+
snapshot_interval: -1
|
134
|
+
# When a primary shard is shut down explicitly (not relocated), the
|
135
|
+
# index.gateway.snapshot_on_close flag can control if while shutting down, a
|
136
|
+
# gateway snapshot should be performed. It defaults to true.
|
137
|
+
snapshot_on_close: true
|
138
|
+
|
139
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/node/network/
|
140
|
+
network:
|
141
|
+
bind_host: _local_
|
142
|
+
publish_host: _local_
|
143
|
+
#
|
144
|
+
# tcp:
|
145
|
+
# no_delay: true
|
146
|
+
# keep_alive: ~
|
147
|
+
# reuse_address true
|
148
|
+
# send_buffer_size ~
|
149
|
+
# receive_buffer_size: ~
|
150
|
+
|
151
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/transport/
|
152
|
+
transport:
|
153
|
+
tcp:
|
154
|
+
port: 9300-9400
|
155
|
+
connect_timeout: 1s
|
156
|
+
# # enable lzf compression in esnode-esnode communication?
|
157
|
+
compress: false
|
158
|
+
|
159
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/jmx/
|
160
|
+
jmx:
|
161
|
+
# Create an RMI connector?
|
162
|
+
create_connector: true
|
163
|
+
port: 9400-9500
|
164
|
+
domain: elasticsearch
|
165
|
+
|
166
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/threadpool/
|
167
|
+
threadpool:
|
168
|
+
# #
|
169
|
+
# # threadpool.type should be one of [cached, scaling, blocking]:
|
170
|
+
# #
|
171
|
+
# # * Cached: An unbounded thread pool that reuses previously constructed threads.
|
172
|
+
# # * Scaling: A bounded thread pool that reuses previously created free threads.
|
173
|
+
# # * Blocking: A bounded thread pool that reuses previously created free
|
174
|
+
# # threads. Pending requests block for an available thread (different than
|
175
|
+
# # the scaling one, where the request is added to a queue and does not
|
176
|
+
# # block).
|
177
|
+
# #
|
178
|
+
# type: cached
|
179
|
+
|
180
|
+
# http://www.elasticsearch.com/docs/elasticsearch/modules/discovery/
|
181
|
+
discovery:
|
182
|
+
# set to 'zen' or 'ec2'
|
183
|
+
type: zen
|
184
|
+
zen:
|
185
|
+
ping:
|
186
|
+
multicast:
|
187
|
+
enabled: false
|
188
|
+
unicast:
|
189
|
+
hosts: 10.195.215.175:9300,10.243.57.219:9300,10.194.218.143:9300,10.204.223.175:9300,10.242.89.235:9300,10.212.226.127:9300
|
190
|
+
# There are two fault detection processes running. The first is by the
|
191
|
+
# master, to ping all the other nodes in the cluster and verify that they
|
192
|
+
# are alive. And on the other end, each node pings to master to verify if
|
193
|
+
# its still alive or an election process needs to be initiated.
|
194
|
+
fd:
|
195
|
+
# How often a node gets pinged. Defaults to "1s".
|
196
|
+
ping_interval: 3s
|
197
|
+
# How long to wait for a ping response, defaults to "30s".
|
198
|
+
ping_timeout: 10s
|
199
|
+
# How many ping failures / timeouts cause a node to be considered failed. Defaults to 3.
|
200
|
+
ping_retries: 3
|
201
|
+
#
|
202
|
+
# # ec2 discovery can cause big trouble with the hadoop loader:
|
203
|
+
# # discovery churn can hit API usage limits
|
204
|
+
# # Be sure to set your cloud keys if you're using ec2
|
205
|
+
# #
|
206
|
+
# ec2:
|
207
|
+
# # security groups used for discovery
|
208
|
+
# groups: hoolock-data_esnode
|
209
|
+
# # require *all* (false) or *any* (true) of those groups?
|
210
|
+
# any_group: true
|
211
|
+
# # private_ip, public_ip, private_dns, public_dns
|
212
|
+
# host_type: private_ip
|
213
|
+
# availability_zones: us-east-1d
|
214
|
+
|
215
|
+
# Necessary if you will use either of
|
216
|
+
# * the ec2 discovery module: for finding peers
|
217
|
+
# * the s3 gateway module, for pushing indices to an s3 mirror.
|
218
|
+
# Read more: http://www.elasticsearch.com/docs/elasticsearch/cloud/
|
219
|
+
#
|
220
|
+
cloud:
|
221
|
+
aws:
|
222
|
+
access_key: <%= @aws['aws_access_key_id'] %>
|
223
|
+
secret_key: <%= @aws['aws_secret_access_key'] %>
|
224
|
+
|
225
|
+
# monitor.jvm: gc_threshold, interval, enabled
|
226
|
+
# thrift:
|
227
|
+
# # port:
|