wonderdog 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +2 -0
- data/.idea/encodings.xml +5 -0
- data/.idea/misc.xml +5 -0
- data/.idea/modules.xml +9 -0
- data/.idea/scopes/scope_settings.xml +5 -0
- data/.idea/vcs.xml +7 -0
- data/.idea/wonderdog.iml +41 -0
- data/Gemfile +1 -1
- data/bin/estool +22 -1
- data/bin/squirrel.rb +108 -0
- data/lib/wonderdog.rb +3 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +4 -1
- data/lib/wonderdog/version.rb +1 -1
- data/pom.xml +1 -1
- data/spec/spec_helper.rb +1 -1
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +1 -1
- data/squirrel/all_facets.rb +95 -0
- data/squirrel/change_es_index_settings.rb +19 -0
- data/squirrel/clear_es_caches.rb +30 -0
- data/squirrel/esbackup.rb +184 -0
- data/squirrel/esbackup_stripped.rb +153 -0
- data/squirrel/fields.sh +5 -0
- data/squirrel/getFields.rb +19 -0
- data/squirrel/replay.rb +219 -0
- data/squirrel/squirrel.rb +95 -0
- data/squirrel/warmer_interface.rb +59 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +2 -2
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +14 -2
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +20 -5
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +55 -26
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +59 -22
- data/test/cardinality.rb +43 -0
- data/test/change_es_index_settings.rb +19 -0
- data/test/clear_es_caches.rb +30 -0
- data/test/config/mapping.yml +327 -0
- data/test/config/mappings.yml +328 -0
- data/test/count_check.txt +0 -0
- data/test/esbackup_stripped.rb +153 -0
- data/test/mapping.yml +327 -0
- data/test/medium_slow_queries +41 -0
- data/test/queries.txt +0 -0
- data/test/quick_test_slow_queries +4 -0
- data/test/run_pry.rb +3 -0
- data/test/some_slow_queries +53 -0
- data/test/warmer_interface.rb +64 -0
- data/test/warmindices.rb +65 -0
- data/wonderdog.gemspec +1 -1
- metadata +40 -7
@@ -0,0 +1,95 @@
|
|
1
|
+
require "multi_json"
|
2
|
+
require_relative "../squirrel/esbackup_stripped.rb"
|
3
|
+
require_relative "../squirrel/replay.rb"
|
4
|
+
require_relative "../squirrel/warmer_interface.rb"
|
5
|
+
require_relative "../squirrel/clear_es_caches.rb"
|
6
|
+
require_relative "../squirrel/change_es_index_settings.rb"
|
7
|
+
|
8
|
+
# Command dispatcher for the squirrel Elasticsearch maintenance tool.
#
# A Squirrel is built from a command symbol and an options hash; #task_caller
# routes the command to the matching helper (ESRestore, ESBackup, ESDup,
# Replay, WarmerInterface, ClearESCaches or ChangeESIndexSettings).
class Squirrel

  # @param command [Symbol] the task to run (see #task_caller for the list)
  # @param options [Hash] symbol-keyed settings shared by all tasks
  def initialize(command, options = {})
    @command = command
    @options = options
  end

  # Translates the warmer-related flags into an :action for WarmerInterface
  # and runs it. Mutates +options+ in place (:index, :action, :warmer_name,
  # :query).
  #
  # :remove_warmer takes precedence over everything else. Otherwise
  # :warmers (true/false) selects enable/disable, and :new_warmers_name,
  # when present, overrides that with "add_warmer".
  #
  # @param options [Hash]
  def determine_warmer_action(options = {})
    options[:index] = options[:warmers_index]
    if options[:remove_warmer].nil?
      if options[:warmers]
        puts "enabling warmers"
        options[:action] = "enable_warmer"
      elsif options[:warmers] == false
        # Explicit false only: a missing (nil) :warmers flag selects nothing.
        puts "disabling warmers"
        options[:action] = "disable_warmer"
      end
      unless options[:new_warmers_name].nil?
        puts "adding warmer"
        options[:action] = "add_warmer"
        options[:warmer_name] = options[:new_warmers_name]
        options[:query] = options[:create_warmer]
      end
    else
      puts "removing warmer"
      options[:action] = "remove_warmer"
      options[:warmer_name] = options[:remove_warmer]
    end
    WarmerInterface.new(options).determine_interaction
  end

  # Runs ClearESCaches once per requested cache type, in the order
  # all, filter, fielddata. Sets options[:type] before each run.
  #
  # @param options [Hash] honours :clear_all_cache, :clear_filter_cache and
  #   :clear_fielddata
  def determine_cache_clear(options = {})
    if options[:clear_all_cache]
      options[:type] = "all"
      ClearESCaches.new(options).run
    end
    if options[:clear_filter_cache]
      options[:type] = "filter"
      ClearESCaches.new(options).run
    end
    if options[:clear_fielddata]
      options[:type] = "fielddata"
      ClearESCaches.new(options).run
    end
  end

  # Prints, for every field in options[:cardinality], the number of distinct
  # lines that getFields.rb extracts for that field from options[:card_file].
  #
  # Shells out to `ruby getFields.rb` (resolved against the current working
  # directory) and appends its output to "<field>.txt", so results from an
  # earlier run for the same field are counted again.
  #
  # @param options [Hash] requires :cardinality (Array) and :card_file
  def cardinality(options)
    options[:cardinality].each do |field|
      output = `ruby getFields.rb --dump=#{options[:card_file]} --field=#{field} >> #{field}.txt ;
        cat #{field}.txt |sort | uniq -c |sort -n | wc -l;`
      # The original referenced an undefined `ooptions` here, which raised
      # NameError right after the shell command had finished.
      puts "The number of values in #{field} from file #{options[:card_file]} is #{output}"
    end
  end

  # Runs the task selected by the command given to #initialize.
  # Aborts with the Settings help text when the command is not recognised.
  def task_caller
    puts "Running #{@command}"
    case @command
    when :restore
      @options[:index] = @options[:restore_index]
      @options[:mappings] = @options[:restore_mapping]
      ESRestore.new(@options[:restore_file], @options).run
    when :backup
      @options[:index] = @options[:dump_index]
      @options[:mappings] = @options[:dump_mapping]
      ESBackup.new(@options[:output_dir], @options).run
    when :duplicate
      @options[:index] = @options[:duplicate_index]
      @options[:mappings] = @options[:duplicate_mapping]
      ESDup.new(@options[:duplicate_file], @options).run
    when :cardinality
      cardinality(@options)
    when :warmer
      determine_warmer_action(@options)
    when :replay
      Replay.new(@options[:execute_slow_queries], @options[:host], @options[:port], @options[:preference], @options[:routing]).run
    when :cache
      determine_cache_clear(@options)
    when :index_settings
      if @options[:es_index_settings].nil? || @options[:es_index_settings_values].nil?
        puts "both --es_index_settings and --es_index_settings_values are required to change index settings"
      else
        # Pair each setting name with the value at the same position.
        @options[:settings_and_values] = @options[:es_index_settings].zip(@options[:es_index_settings_values])
        ChangeESIndexSettings.new(@options).run
      end
    else abort Settings.help("Must specify either backup, restore, duplicate, cardinality, warmer, replay, cache or index_settings. Got <#{@command}> UPDATE THIS LINE!")
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'multi_json'
|
2
|
+
require 'httparty'
|
3
|
+
|
4
|
+
# Small client for the Elasticsearch index-warmer endpoints.
#
# Connection details and the requested action come from an options hash;
# #determine_interaction dispatches on the action. Adding a warmer goes
# through HTTParty, everything else shells out to curl.
class WarmerInterface
  # @param options [Hash] reads :host, :port, :index, :warmer_name, :action
  #   and :query (serialised to JSON with MultiJson)
  def initialize(options = {})
    puts options.inspect
    @host = options[:host]
    @port = options[:port]
    @query = MultiJson.dump(options[:query])
    @warmer_name = options[:warmer_name]
    @index = options[:index]
    @action = options[:action]
    @warmer_state = nil
  end

  # PUTs the JSON query to /<index>/_warmer/<warmer_name>.
  def add_warmer
    url = "http://#{@host}:#{@port}/#{@index}/_warmer/#{@warmer_name}"
    puts url
    puts @query
    HTTParty.put(url, {:body => @query})
  end

  # DELETEs /<index>/_warmer/<warmer_name> via curl.
  def remove_warmer
    puts "removing warmer #{@warmer_name}"
    # URL is quoted like the other curl calls so the shell does not
    # interpret characters in the index or warmer name.
    `curl -s -XDELETE '#{@host}:#{@port}/#{@index}/_warmer/#{@warmer_name}'`
  end

  # Closes the index, sets index.warmer.enabled to "true", reopens the index.
  def enable_warmer
    toggle_warmers(true)
  end

  # Closes the index, sets index.warmer.enabled to "false", reopens the index.
  def disable_warmer
    toggle_warmers(false)
  end

  # Runs the action selected at construction time.
  #
  # Prints a notice and returns when index, host or port is missing.
  # Aborts when no action was supplied or the action is not recognised.
  def determine_interaction
    if @index.nil? || @host.nil? || @port.nil?
      puts "index, host and port are required to interact with the warmers"
      return
    end
    # Without this guard a missing action crashed with NoMethodError on
    # nil.to_sym instead of reporting the problem.
    if @action.nil?
      abort "an action (add_warmer, remove_warmer, enable_warmer or disable_warmer) is required for determine_interaction from warmers_interface"
    end
    case command = @action.to_sym
    when :add_warmer then add_warmer
    when :remove_warmer then remove_warmer
    when :enable_warmer then enable_warmer
    when :disable_warmer then disable_warmer
    else abort "#{command} is not a recognized action for determine_interaction from warmers_interface"
    end
  end

  private

  # Shared close / update-setting / open sequence behind enable_warmer and
  # disable_warmer. Returns the output of the final curl call.
  def toggle_warmers(enabled)
    puts "closing #{@index}"
    `curl -s -XPOST '#{@host}:#{@port}/#{@index}/_close'`
    puts(enabled ? "enabling warmer" : "disabling warmer")
    `curl -s -XPUT '#{@host}:#{@port}/#{@index}/_settings?pretty=true' -d '{"index.warmer.enabled":"#{enabled}"}'`
    puts "opening #{@index}"
    `curl -s -XPOST '#{@host}:#{@port}/#{@index}/_open'`
  end
end
|
@@ -133,7 +133,7 @@ public class ElasticSearchInputFormat extends InputFormat<Text, Text> implements
|
|
133
133
|
.setSize(requestSize)
|
134
134
|
.execute()
|
135
135
|
.actionGet();
|
136
|
-
this.numHits = response.
|
136
|
+
this.numHits = response.getHits().totalHits();
|
137
137
|
if(numSplits > numHits) numSplits = numHits; // This could be bad
|
138
138
|
this.numSplitRecords = (numHits/numSplits);
|
139
139
|
}
|
@@ -206,7 +206,7 @@ public class ElasticSearchInputFormat extends InputFormat<Text, Text> implements
|
|
206
206
|
.setQuery(QueryBuilders.queryString(queryString))
|
207
207
|
.execute()
|
208
208
|
.actionGet();
|
209
|
-
return response.
|
209
|
+
return response.getHits().iterator();
|
210
210
|
}
|
211
211
|
|
212
212
|
@Override
|
@@ -63,12 +63,24 @@ public class ElasticSearchStreamingInputFormat<K, V> implements InputFormat<K, V
|
|
63
63
|
private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
|
64
64
|
|
65
65
|
private static final String ES_UNICAST_HOSTS_NAME = "discovery.zen.ping.unicast.hosts";
|
66
|
+
|
67
|
+
private static final String ES_TRANSPORT_OPT = "elasticsearch.transport";
|
68
|
+
private static final String ES_TRANSPORT = "false";
|
69
|
+
|
70
|
+
private static final String ES_TRANSPORT_HOST_OPT = "elasticsearch.transport.host";
|
71
|
+
private static final String ES_TRANSPORT_HOST = "localhost";
|
72
|
+
|
73
|
+
private static final String ES_TRANSPORT_PORT_OPT = "elasticsearch.transport.port";
|
74
|
+
private static final String ES_TRANSPORT_PORT = "9300";
|
66
75
|
|
67
76
|
private TransportClient client;
|
68
77
|
|
69
78
|
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) {
|
70
79
|
setLocalElasticSearchInstallation(conf);
|
71
|
-
|
80
|
+
boolean esTransport = new Boolean(conf.get(ES_TRANSPORT_OPT, ES_TRANSPORT));
|
81
|
+
String esTransportHost = conf.get(ES_TRANSPORT_HOST_OPT, ES_TRANSPORT_HOST);
|
82
|
+
Integer esTransportPort = Integer.parseInt(conf.get(ES_TRANSPORT_PORT_OPT, ES_TRANSPORT_PORT));
|
83
|
+
return (RecordReader) new ElasticSearchStreamingRecordReader(split, conf, esTransport, esTransportHost, esTransportPort);
|
72
84
|
}
|
73
85
|
|
74
86
|
public InputSplit[] getSplits(JobConf conf, int requestedNumSplits) {
|
@@ -185,7 +197,7 @@ public class ElasticSearchStreamingInputFormat<K, V> implements InputFormat<K, V
|
|
185
197
|
request.setQuery(queryJSON);
|
186
198
|
}
|
187
199
|
SearchResponse response = request.execute().actionGet();
|
188
|
-
this.numHits = response.
|
200
|
+
this.numHits = response.getHits().totalHits();
|
189
201
|
|
190
202
|
LOG.info("Ran query: "+String.valueOf(numHits)+" hits");
|
191
203
|
}
|
@@ -37,21 +37,28 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
|
|
37
37
|
|
38
38
|
private static final String ES_INDEX_FIELD_OPT = "elasticsearch.output.index.field";
|
39
39
|
private static final String ES_INDEX_FIELD = "_index";
|
40
|
-
private String indexFieldName;
|
41
40
|
|
42
41
|
private static final String ES_MAPPING_FIELD_OPT = "elasticsearch.output.mapping.field";
|
43
42
|
private static final String ES_MAPPING_FIELD = "_mapping";
|
44
|
-
private String mappingFieldName;
|
45
43
|
|
46
44
|
private static final String ES_ID_FIELD_OPT = "elasticsearch.output.id.field";
|
47
45
|
private static final String ES_ID_FIELD = "_id";
|
48
|
-
|
46
|
+
|
47
|
+
private static final String ES_ROUTING_FIELD_OPT = "elasticsearch.output.routing.field";
|
48
|
+
private static final String ES_ROUTING_FIELD = "_routing";
|
49
49
|
|
50
50
|
private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
|
51
51
|
private static final String ES_BULK_SIZE = "1000";
|
52
|
-
private int bulkSize;
|
53
52
|
|
53
|
+
private static final String ES_TRANSPORT_OPT = "elasticsearch.transport";
|
54
|
+
private static final String ES_TRANSPORT = "false";
|
55
|
+
|
56
|
+
private static final String ES_TRANSPORT_HOST_OPT = "elasticsearch.transport.host";
|
57
|
+
private static final String ES_TRANSPORT_HOST = "localhost";
|
54
58
|
|
59
|
+
private static final String ES_TRANSPORT_PORT_OPT = "elasticsearch.transport.port";
|
60
|
+
private static final String ES_TRANSPORT_PORT = "9300";
|
61
|
+
|
55
62
|
// Elasticsearch internal settings required to make a client
|
56
63
|
// connection.
|
57
64
|
private static final String ES_CONFIG_OPT = "es.config";
|
@@ -67,8 +74,16 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
|
|
67
74
|
String indexFieldName = conf.get(ES_INDEX_FIELD_OPT, ES_INDEX_FIELD);
|
68
75
|
String mappingFieldName = conf.get(ES_MAPPING_FIELD_OPT, ES_MAPPING_FIELD);
|
69
76
|
String idFieldName = conf.get(ES_ID_FIELD_OPT, ES_ID_FIELD);
|
77
|
+
String routingFieldName = conf.get(ES_ROUTING_FIELD_OPT, ES_ROUTING_FIELD);
|
70
78
|
Integer bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE_OPT, ES_BULK_SIZE));
|
71
|
-
|
79
|
+
boolean esTransport = new Boolean(conf.get(ES_TRANSPORT_OPT, ES_TRANSPORT));
|
80
|
+
String esTransportHost = conf.get(ES_TRANSPORT_HOST_OPT, ES_TRANSPORT_HOST);
|
81
|
+
Integer esTransportPort = Integer.parseInt(conf.get(ES_TRANSPORT_PORT_OPT, ES_TRANSPORT_PORT));
|
82
|
+
|
83
|
+
return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName,
|
84
|
+
indexFieldName, mappingFieldName, idFieldName, routingFieldName,
|
85
|
+
bulkSize,
|
86
|
+
esTransport, esTransportHost, esTransportPort);
|
72
87
|
}
|
73
88
|
|
74
89
|
public void setLocalElasticSearchInstallation(JobConf conf) {
|
@@ -17,6 +17,10 @@ import org.elasticsearch.common.unit.TimeValue;
|
|
17
17
|
import org.elasticsearch.node.Node;
|
18
18
|
import org.elasticsearch.node.NodeBuilder;
|
19
19
|
import org.elasticsearch.client.Client;
|
20
|
+
import org.elasticsearch.common.transport.InetSocketTransportAddress;
|
21
|
+
import org.elasticsearch.common.settings.Settings;
|
22
|
+
import org.elasticsearch.common.settings.ImmutableSettings;
|
23
|
+
import org.elasticsearch.client.transport.TransportClient;
|
20
24
|
import org.elasticsearch.action.search.SearchRequestBuilder;
|
21
25
|
import org.elasticsearch.action.search.SearchScrollRequestBuilder;
|
22
26
|
|
@@ -42,29 +46,66 @@ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
|
|
42
46
|
private Node node;
|
43
47
|
private Client client;
|
44
48
|
private ElasticSearchStreamingSplit split;
|
45
|
-
|
49
|
+
private boolean transport;
|
50
|
+
private String transportHost;
|
51
|
+
private Integer transportPort;
|
52
|
+
|
46
53
|
private String scrollId;
|
47
54
|
private Integer recordsRead;
|
48
55
|
private Iterator<SearchHit> hitsItr = null;
|
49
56
|
|
50
|
-
public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf) {
|
57
|
+
/**
 * Creates a reader for one split and immediately issues the first fetch.
 *
 * @param split         the split to read; must be an ElasticSearchStreamingSplit
 * @param conf          job configuration supplying request size and scroll timeout
 * @param transport     true to connect with a transport client, false to
 *                      start an embedded node and use its client
 * @param transportHost host used when transport is true
 * @param transportPort port used when transport is true
 */
public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf, boolean transport, String transportHost, Integer transportPort) {
    // Connection settings.
    this.transport = transport;
    this.transportHost = transportHost;
    this.transportPort = transportPort;

    // Split and scroll bookkeeping.
    this.split = (ElasticSearchStreamingSplit) split;
    this.recordsRead = 0;
    this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE_OPT, ES_REQUEST_SIZE));
    this.scrollTimeout = conf.get(ES_SCROLL_TIMEOUT_OPT, ES_SCROLL_TIMEOUT);
    this.scroll = new Scroll(TimeValue.parseTimeValue(this.scrollTimeout, defaultScrollTimeout));

    LOG.info("Initializing "+this.split.getSummary());

    if (!this.transport) {
        // Embedded mode: the client is obtained from the started node.
        startNode();
        this.client = this.node.client();
    } else {
        this.client = buildTransportClient();
    }

    fetchNextHits();
}
|
61
77
|
|
78
|
+
/**
|
79
|
+
Build a transport client that will connect to some
|
80
|
+
Elasticsearch node.
|
81
|
+
|
82
|
+
*/
|
83
|
+
private Client buildTransportClient() {
|
84
|
+
LOG.info("Connecting transport client to "+transportHost+":"+Integer.toString(transportPort));
|
85
|
+
Settings settings = ImmutableSettings.settingsBuilder().put("client.transport.ignore_cluster_name", "true").build();
|
86
|
+
return new TransportClient(settings).addTransportAddress(new InetSocketTransportAddress(transportHost, transportPort));
|
87
|
+
}
|
88
|
+
|
89
|
+
/**
|
90
|
+
Start an embedded Elasticsearch node.
|
91
|
+
|
92
|
+
The node will not store any data locally (non-datanode) but
|
93
|
+
will connect to a cluster using the default Elasticsearch
|
94
|
+
settings (those available in
|
95
|
+
/etc/elasticsearch/elasticsearch.yml).
|
96
|
+
*/
|
97
|
+
private void startNode() {
|
98
|
+
LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
|
99
|
+
this.node = NodeBuilder.nodeBuilder().client(true).node();
|
100
|
+
LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
|
101
|
+
}
|
102
|
+
|
62
103
|
private void fetchNextHits() {
|
63
104
|
if (scrollId == null) {
|
64
105
|
LOG.info("Running initial scroll with timeout "+scrollTimeout);
|
65
106
|
SearchRequestBuilder request = split.initialScrollRequest(client, scroll, requestSize);
|
66
107
|
SearchResponse response = request.execute().actionGet();
|
67
|
-
this.scrollId = response.
|
108
|
+
this.scrollId = response.getScrollId();
|
68
109
|
LOG.info("Got scroll ID "+scrollId);
|
69
110
|
// Do we need to call fetchNextHits() again here? Or does
|
70
111
|
// the initial request also itself contain the first set
|
@@ -75,9 +116,9 @@ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
|
|
75
116
|
// LOG.info("Running query for scroll ID "+scrollId+" with timeout "+scrollTimeout);
|
76
117
|
SearchScrollRequestBuilder request = split.scrollRequest(client, scroll, scrollId);
|
77
118
|
SearchResponse response = request.execute().actionGet();
|
78
|
-
this.scrollId = response.
|
119
|
+
this.scrollId = response.getScrollId();
|
79
120
|
// LOG.info("Got scroll ID "+scrollId);
|
80
|
-
this.hitsItr = response.
|
121
|
+
this.hitsItr = response.getHits().iterator();
|
81
122
|
}
|
82
123
|
}
|
83
124
|
|
@@ -151,26 +192,14 @@ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
|
|
151
192
|
|
152
193
|
@Override
|
153
194
|
public void close() throws IOException {
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
|
163
|
-
this.node = NodeBuilder.nodeBuilder().client(true).node();
|
164
|
-
this.client = node.client();
|
165
|
-
LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
|
166
|
-
}
|
167
|
-
|
168
|
-
private void stopEmbeddedClient() {
|
169
|
-
LOG.info("Stopping embedded Elasticsearch client...");
|
170
|
-
if (client != null) client.close();
|
171
|
-
if (node != null) node.close();
|
172
|
-
LOG.info("Left Elasticsearch cluster");
|
195
|
+
if (client != null) {
|
196
|
+
LOG.info("Shutting down Elasticsearch client...");
|
197
|
+
client.close();
|
198
|
+
}
|
199
|
+
if (node != null) {
|
200
|
+
LOG.info("Shutting down Elasticsearch node...");
|
201
|
+
node.close();
|
202
|
+
}
|
173
203
|
}
|
174
204
|
|
175
|
-
|
176
205
|
}
|
@@ -20,8 +20,13 @@ import org.elasticsearch.node.Node;
|
|
20
20
|
import org.elasticsearch.node.NodeBuilder;
|
21
21
|
import org.elasticsearch.client.Client;
|
22
22
|
import org.elasticsearch.client.Requests;
|
23
|
+
import org.elasticsearch.common.transport.InetSocketTransportAddress;
|
24
|
+
import org.elasticsearch.common.settings.Settings;
|
25
|
+
import org.elasticsearch.common.settings.ImmutableSettings;
|
26
|
+
import org.elasticsearch.client.transport.TransportClient;
|
23
27
|
import org.elasticsearch.action.bulk.BulkRequestBuilder;
|
24
28
|
import org.elasticsearch.action.bulk.BulkResponse;
|
29
|
+
import org.elasticsearch.action.index.IndexRequest;
|
25
30
|
import org.elasticsearch.ExceptionsHelper;
|
26
31
|
|
27
32
|
import org.codehaus.jackson.map.ObjectMapper;
|
@@ -36,6 +41,7 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
|
|
36
41
|
private String indexFieldName;
|
37
42
|
private String mappingFieldName;
|
38
43
|
private String idFieldName;
|
44
|
+
private String routingFieldName;
|
39
45
|
private Integer bulkSize;
|
40
46
|
|
41
47
|
// Bookkeeping
|
@@ -48,6 +54,9 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
|
|
48
54
|
private Node node;
|
49
55
|
private Client client;
|
50
56
|
private volatile BulkRequestBuilder currentRequest;
|
57
|
+
private boolean transport;
|
58
|
+
private String transportHost;
|
59
|
+
private Integer transportPort;
|
51
60
|
|
52
61
|
// JSON parsing
|
53
62
|
private ObjectMapper mapper;
|
@@ -56,35 +65,53 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
|
|
56
65
|
// == Lifecycle ==
|
57
66
|
//
|
58
67
|
|
59
|
-
public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, Integer bulkSize) {
|
60
|
-
this.defaultIndexName
|
61
|
-
this.defaultMappingName
|
62
|
-
this.indexFieldName
|
63
|
-
this.mappingFieldName
|
64
|
-
this.idFieldName
|
65
|
-
this.
|
68
|
+
/**
 * Creates a writer, connects to Elasticsearch and prepares the first
 * bulk request.
 *
 * @param defaultIndexName   index used when a record does not name one
 * @param defaultMappingName mapping used when a record does not name one
 * @param indexFieldName     record field that overrides the index
 * @param mappingFieldName   record field that overrides the mapping
 * @param idFieldName        record field holding the document ID
 * @param routingFieldName   record field holding the routing value
 * @param bulkSize           number of records per bulk request
 * @param transport          true to connect with a transport client, false
 *                           to start an embedded node and use its client
 * @param transportHost      host used when transport is true
 * @param transportPort      port used when transport is true
 */
public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, String routingFieldName, Integer bulkSize, boolean transport, String transportHost, Integer transportPort) {
    this.defaultIndexName = defaultIndexName;
    this.defaultMappingName = defaultMappingName;
    this.indexFieldName = indexFieldName;
    this.mappingFieldName = mappingFieldName;
    this.idFieldName = idFieldName;
    this.routingFieldName = routingFieldName;
    this.bulkSize = bulkSize;
    this.transport = transport;
    this.transportHost = transportHost;
    this.transportPort = transportPort;

    LOG.info("Writing "+Integer.toString(bulkSize)+" records per batch");
    LOG.info("Using default target /"+defaultIndexName+"/"+defaultMappingName);
    // The message previously left the quote after the ID field unclosed
    // and did not mention the routing field.
    LOG.info("Records override default target with index field '"+indexFieldName+"', mapping field '"+mappingFieldName+"', ID field '"+idFieldName+"', and routing field '"+routingFieldName+"'");
    if (transport) {
        this.client = buildTransportClient();
    } else {
        // Embedded mode: the client is obtained from the started node.
        startNode();
        this.client = node.client();
    }
    this.currentRequest = client.prepareBulk();
    this.mapper = new ObjectMapper();
}
|
75
92
|
|
76
93
|
/**
|
77
|
-
|
78
|
-
|
94
|
+
Build a transport client that will connect to some
|
95
|
+
Elasticsearch node.
|
96
|
+
|
97
|
+
*/
|
98
|
+
private Client buildTransportClient() {
|
99
|
+
LOG.info("Connecting transport client to "+transportHost+":"+Integer.toString(transportPort));
|
100
|
+
Settings settings = ImmutableSettings.settingsBuilder().put("client.transport.ignore_cluster_name", "true").build();
|
101
|
+
return new TransportClient(settings).addTransportAddress(new InetSocketTransportAddress(transportHost, transportPort));
|
102
|
+
}
|
103
|
+
|
104
|
+
/**
|
105
|
+
Start an embedded Elasticsearch node.
|
79
106
|
|
80
|
-
The
|
81
|
-
|
82
|
-
|
107
|
+
The node will not store any data locally (non-datanode) but
|
108
|
+
will connect to a cluster using the default Elasticsearch
|
109
|
+
settings (those available in
|
110
|
+
/etc/elasticsearch/elasticsearch.yml).
|
83
111
|
*/
|
84
|
-
private void
|
112
|
+
private void startNode() {
|
85
113
|
LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
|
86
114
|
this.node = NodeBuilder.nodeBuilder().client(true).node();
|
87
|
-
this.client = node.client();
|
88
115
|
LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
|
89
116
|
}
|
90
117
|
|
@@ -95,10 +122,14 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
|
|
95
122
|
*/
|
96
123
|
public void close(Reporter reporter) throws IOException {
|
97
124
|
sendBulkRequestIfMoreThan(0);
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
125
|
+
if (client != null) {
|
126
|
+
LOG.info("Shutting down Elasticsearch client...");
|
127
|
+
client.close();
|
128
|
+
}
|
129
|
+
if (node != null) {
|
130
|
+
LOG.info("Shutting down Elasticsearch node...");
|
131
|
+
node.close();
|
132
|
+
}
|
102
133
|
}
|
103
134
|
|
104
135
|
//
|
@@ -122,12 +153,18 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
|
|
122
153
|
|
123
154
|
private void index(String json) throws IOException {
|
124
155
|
Map<String, Object> record = mapper.readValue(json, Map.class);
|
156
|
+
IndexRequest request = null;
|
125
157
|
if (record.containsKey(idFieldName)) {
|
126
158
|
Object idValue = record.get(idFieldName);
|
127
|
-
|
159
|
+
request = Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json);
|
128
160
|
} else {
|
129
|
-
|
161
|
+
request = Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json);
|
162
|
+
}
|
163
|
+
if (record.containsKey(routingFieldName)) {
|
164
|
+
Object routingValue = record.get(routingFieldName);
|
165
|
+
request.routing(String.valueOf(routingValue));
|
130
166
|
}
|
167
|
+
currentRequest.add(request);
|
131
168
|
}
|
132
169
|
|
133
170
|
private String indexNameForRecord(Map<String, Object> record) {
|