wonderdog 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. data/.gitignore +2 -0
  2. data/.idea/encodings.xml +5 -0
  3. data/.idea/misc.xml +5 -0
  4. data/.idea/modules.xml +9 -0
  5. data/.idea/scopes/scope_settings.xml +5 -0
  6. data/.idea/vcs.xml +7 -0
  7. data/.idea/wonderdog.iml +41 -0
  8. data/Gemfile +1 -1
  9. data/bin/estool +22 -1
  10. data/bin/squirrel.rb +108 -0
  11. data/lib/wonderdog.rb +3 -0
  12. data/lib/wonderdog/hadoop_invocation_override.rb +4 -1
  13. data/lib/wonderdog/version.rb +1 -1
  14. data/pom.xml +1 -1
  15. data/spec/spec_helper.rb +1 -1
  16. data/spec/wonderdog/hadoop_invocation_override_spec.rb +1 -1
  17. data/squirrel/all_facets.rb +95 -0
  18. data/squirrel/change_es_index_settings.rb +19 -0
  19. data/squirrel/clear_es_caches.rb +30 -0
  20. data/squirrel/esbackup.rb +184 -0
  21. data/squirrel/esbackup_stripped.rb +153 -0
  22. data/squirrel/fields.sh +5 -0
  23. data/squirrel/getFields.rb +19 -0
  24. data/squirrel/replay.rb +219 -0
  25. data/squirrel/squirrel.rb +95 -0
  26. data/squirrel/warmer_interface.rb +59 -0
  27. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +2 -2
  28. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +14 -2
  29. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +20 -5
  30. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +55 -26
  31. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +59 -22
  32. data/test/cardinality.rb +43 -0
  33. data/test/change_es_index_settings.rb +19 -0
  34. data/test/clear_es_caches.rb +30 -0
  35. data/test/config/mapping.yml +327 -0
  36. data/test/config/mappings.yml +328 -0
  37. data/test/count_check.txt +0 -0
  38. data/test/esbackup_stripped.rb +153 -0
  39. data/test/mapping.yml +327 -0
  40. data/test/medium_slow_queries +41 -0
  41. data/test/queries.txt +0 -0
  42. data/test/quick_test_slow_queries +4 -0
  43. data/test/run_pry.rb +3 -0
  44. data/test/some_slow_queries +53 -0
  45. data/test/warmer_interface.rb +64 -0
  46. data/test/warmindices.rb +65 -0
  47. data/wonderdog.gemspec +1 -1
  48. metadata +40 -7
data/squirrel/squirrel.rb (new file)
@@ -0,0 +1,95 @@
+ require "multi_json"
+ require_relative "../squirrel/esbackup_stripped.rb"
+ require_relative "../squirrel/replay.rb"
+ require_relative "../squirrel/warmer_interface.rb"
+ require_relative "../squirrel/clear_es_caches.rb"
+ require_relative "../squirrel/change_es_index_settings.rb"
+
+ class Squirrel
+
+   def initialize(command, options = {})
+     @command = command
+     @options = options
+   end
+
+   def determine_warmer_action(options = {})
+     options[:index] = options[:warmers_index]
+     unless options[:remove_warmer].nil?
+       puts "removing warmer"
+       options[:action] = "remove_warmer"
+       options[:warmer_name] = options[:remove_warmer]
+     else
+       if options[:warmers]
+         puts "enabling warmers"
+         options[:action] = "enable_warmer"
+       elsif options[:warmers] == false
+         puts "disabling warmers"
+         options[:action] = "disable_warmer"
+       end
+       unless options[:new_warmers_name].nil?
+         puts "adding warmer"
+         options[:action] = "add_warmer"
+         options[:warmer_name] = options[:new_warmers_name]
+         options[:query] = options[:create_warmer]
+       end
+     end
+     WarmerInterface.new(options).determine_interaction
+   end
+
+   def determine_cache_clear(options = {})
+     if options[:clear_all_cache]
+       options[:type] = "all"
+       ClearESCaches.new(options).run
+     end
+     if options[:clear_filter_cache]
+       options[:type] = "filter"
+       ClearESCaches.new(options).run
+     end
+     if options[:clear_fielddata]
+       options[:type] = "fielddata"
+       ClearESCaches.new(options).run
+     end
+   end
+
+   def cardinality(options)
+     options[:cardinality].each do |field|
+       output = `ruby getFields.rb --dump=#{options[:card_file]} --field=#{field} >> #{field}.txt ;
+       cat #{field}.txt |sort | uniq -c |sort -n | wc -l;`
+ puts "The number of values in #{field} form file #{ooptions[:card_file]} is #{output}"
+     end
+   end
+
+   def task_caller
+     puts "Running #{@command}"
+     case @command
+     when :restore
+       @options[:index] = @options[:restore_index]
+       @options[:mappings] = @options[:restore_mapping]
+       ESRestore.new(@options[:restore_file], @options).run
+     when :backup
+       @options[:index] = @options[:dump_index]
+       @options[:mappings] = @options[:dump_mapping]
+       ESBackup.new(@options[:output_dir], @options).run
+     when :duplicate
+       @options[:index] = @options[:duplicate_index]
+       @options[:mappings] = @options[:duplicate_mapping]
+       ESDup.new(@options[:duplicate_file], @options).run
+     when :cardinality
+       cardinality(@options)
+     when :warmer
+       determine_warmer_action(@options)
+     when :replay
+       Replay.new(@options[:execute_slow_queries], @options[:host], @options[:port], @options[:preference], @options[:routing]).run
+     when :cache
+       determine_cache_clear(@options)
+     when :index_settings
+       unless @options[:es_index_settings].nil? || @options[:es_index_settings_values].nil?
+         @options[:settings_and_values] = @options[:es_index_settings].zip(@options[:es_index_settings_values])
+         ChangeESIndexSettings.new(@options).run
+       else
+         puts "both --es_index_settings and --es_index_settings_values are required to change index settings"
+       end
+     else abort Settings.help("Must specify either backup, restore, duplicate, cardinality, warmer, replay, cache or index_settings. Got <#{@command}> UPDATE THIS LINE!")
+     end
+   end
+ end
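
A minimal usage sketch for the Squirrel class above (the require path, index, mapping, and output directory are assumptions; the option keys are the ones task_caller reads for the :backup command):

    require_relative "squirrel/squirrel.rb"   # hypothetical path to the file shown above

    # Dump an index and its mapping to a local directory via ESBackup.
    options = {
      :dump_index   => "my_index",       # copied into options[:index] by task_caller
      :dump_mapping => "my_mapping",     # copied into options[:mappings]
      :output_dir   => "/tmp/es_backup"  # passed to ESBackup.new
    }
    Squirrel.new(:backup, options).task_caller
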
data/squirrel/warmer_interface.rb (new file)
@@ -0,0 +1,59 @@
+ require 'multi_json'
+ require 'httparty'
+
+ class WarmerInterface
+   def initialize(options = {})
+     puts options.inspect
+     @host = options[:host]
+     @port = options[:port]
+     @query = MultiJson.dump(options[:query])
+     @warmer_name = options[:warmer_name]
+     @index = options[:index]
+     @action = options[:action]
+     @warmer_state = nil
+   end
+
+   def add_warmer
+     url = "http://#{@host}:#{@port}/#{@index}/_warmer/#{@warmer_name}"
+     puts url
+     puts @query
+     HTTParty.put(url, {:body => @query})
+   end
+
+   def remove_warmer
+     puts "removing warmer #{@warmer_name}"
+     `curl -s -XDELETE #{@host}:#{@port}/#{@index}/_warmer/#{@warmer_name}`
+   end
+
+   def enable_warmer
+     puts "closing #{@index}"
+     `curl -s -XPOST '#{@host}:#{@port}/#{@index}/_close'`
+     puts "enabling warmer"
+     `curl -s -XPUT '#{@host}:#{@port}/#{@index}/_settings?pretty=true' -d '{"index.warmer.enabled":"true"}'`
+     puts "opening #{@index}"
+     `curl -s -XPOST '#{@host}:#{@port}/#{@index}/_open'`
+   end
+
+   def disable_warmer
+     puts "closing #{@index}"
+     `curl -s -XPOST '#{@host}:#{@port}/#{@index}/_close'`
+     puts "disabling warmer"
+     `curl -s -XPUT '#{@host}:#{@port}/#{@index}/_settings?pretty=true' -d '{"index.warmer.enabled":"false"}'`
+     puts "opening #{@index}"
+     `curl -s -XPOST '#{@host}:#{@port}/#{@index}/_open'`
+   end
+
+   def determine_interaction
+     unless @index.nil? || @host.nil? || @port.nil?
+       case command = @action.to_sym
+       when :add_warmer then add_warmer
+       when :remove_warmer then remove_warmer
+       when :enable_warmer then enable_warmer
+       when :disable_warmer then disable_warmer
+       else abort "#{command} is not a recognized action for determine_interaction from warmers_interface"
+       end
+     else
+       puts "index, host and port are required to interact with the warmers"
+     end
+   end
+ end
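
A hedged usage sketch for WarmerInterface (host, port, index, warmer name, and query are placeholder values; the option keys mirror what the constructor reads):

    require_relative "squirrel/warmer_interface.rb"   # hypothetical path

    # Register a warmer query on an index, dispatched through determine_interaction.
    WarmerInterface.new(
      :host        => "localhost",
      :port        => 9200,
      :index       => "my_index",
      :action      => "add_warmer",
      :warmer_name => "recent_docs",
      :query       => { "query" => { "match_all" => {} } }   # serialized with MultiJson.dump
    ).determine_interaction
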
@@ -133,7 +133,7 @@ public class ElasticSearchInputFormat extends InputFormat<Text, Text> implements
  .setSize(requestSize)
  .execute()
  .actionGet();
- this.numHits = response.hits().totalHits();
+ this.numHits = response.getHits().totalHits();
  if(numSplits > numHits) numSplits = numHits; // This could be bad
  this.numSplitRecords = (numHits/numSplits);
  }
@@ -206,7 +206,7 @@ public class ElasticSearchInputFormat extends InputFormat<Text, Text> implements
  .setQuery(QueryBuilders.queryString(queryString))
  .execute()
  .actionGet();
- return response.hits().iterator();
+ return response.getHits().iterator();
  }

  @Override
@@ -63,12 +63,24 @@ public class ElasticSearchStreamingInputFormat<K, V> implements InputFormat<K, V
  private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";

  private static final String ES_UNICAST_HOSTS_NAME = "discovery.zen.ping.unicast.hosts";
+
+ private static final String ES_TRANSPORT_OPT = "elasticsearch.transport";
+ private static final String ES_TRANSPORT = "false";
+
+ private static final String ES_TRANSPORT_HOST_OPT = "elasticsearch.transport.host";
+ private static final String ES_TRANSPORT_HOST = "localhost";
+
+ private static final String ES_TRANSPORT_PORT_OPT = "elasticsearch.transport.port";
+ private static final String ES_TRANSPORT_PORT = "9300";

  private TransportClient client;

  public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) {
  setLocalElasticSearchInstallation(conf);
- return (RecordReader) new ElasticSearchStreamingRecordReader(split, conf);
+ boolean esTransport = new Boolean(conf.get(ES_TRANSPORT_OPT, ES_TRANSPORT));
+ String esTransportHost = conf.get(ES_TRANSPORT_HOST_OPT, ES_TRANSPORT_HOST);
+ Integer esTransportPort = Integer.parseInt(conf.get(ES_TRANSPORT_PORT_OPT, ES_TRANSPORT_PORT));
+ return (RecordReader) new ElasticSearchStreamingRecordReader(split, conf, esTransport, esTransportHost, esTransportPort);
  }

  public InputSplit[] getSplits(JobConf conf, int requestedNumSplits) {
@@ -185,7 +197,7 @@ public class ElasticSearchStreamingInputFormat<K, V> implements InputFormat<K, V
  request.setQuery(queryJSON);
  }
  SearchResponse response = request.execute().actionGet();
- this.numHits = response.hits().totalHits();
+ this.numHits = response.getHits().totalHits();

  LOG.info("Ran query: "+String.valueOf(numHits)+" hits");
  }
@@ -37,21 +37,28 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,

  private static final String ES_INDEX_FIELD_OPT = "elasticsearch.output.index.field";
  private static final String ES_INDEX_FIELD = "_index";
- private String indexFieldName;

  private static final String ES_MAPPING_FIELD_OPT = "elasticsearch.output.mapping.field";
  private static final String ES_MAPPING_FIELD = "_mapping";
- private String mappingFieldName;

  private static final String ES_ID_FIELD_OPT = "elasticsearch.output.id.field";
  private static final String ES_ID_FIELD = "_id";
- private String idFieldName;
+
+ private static final String ES_ROUTING_FIELD_OPT = "elasticsearch.output.routing.field";
+ private static final String ES_ROUTING_FIELD = "_routing";

  private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
  private static final String ES_BULK_SIZE = "1000";
- private int bulkSize;

+ private static final String ES_TRANSPORT_OPT = "elasticsearch.transport";
+ private static final String ES_TRANSPORT = "false";
+
+ private static final String ES_TRANSPORT_HOST_OPT = "elasticsearch.transport.host";
+ private static final String ES_TRANSPORT_HOST = "localhost";

+ private static final String ES_TRANSPORT_PORT_OPT = "elasticsearch.transport.port";
+ private static final String ES_TRANSPORT_PORT = "9300";
+
  // Elasticsearch internal settings required to make a client
  // connection.
  private static final String ES_CONFIG_OPT = "es.config";
@@ -67,8 +74,16 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
  String indexFieldName = conf.get(ES_INDEX_FIELD_OPT, ES_INDEX_FIELD);
  String mappingFieldName = conf.get(ES_MAPPING_FIELD_OPT, ES_MAPPING_FIELD);
  String idFieldName = conf.get(ES_ID_FIELD_OPT, ES_ID_FIELD);
+ String routingFieldName = conf.get(ES_ROUTING_FIELD_OPT, ES_ROUTING_FIELD);
  Integer bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE_OPT, ES_BULK_SIZE));
- return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName, indexFieldName, mappingFieldName, idFieldName, bulkSize);
+ boolean esTransport = new Boolean(conf.get(ES_TRANSPORT_OPT, ES_TRANSPORT));
+ String esTransportHost = conf.get(ES_TRANSPORT_HOST_OPT, ES_TRANSPORT_HOST);
+ Integer esTransportPort = Integer.parseInt(conf.get(ES_TRANSPORT_PORT_OPT, ES_TRANSPORT_PORT));
+
+ return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName,
+ indexFieldName, mappingFieldName, idFieldName, routingFieldName,
+ bulkSize,
+ esTransport, esTransportHost, esTransportPort);
  }

  public void setLocalElasticSearchInstallation(JobConf conf) {
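
The new elasticsearch.transport* options above let both the streaming input and output formats connect through a TransportClient instead of starting an embedded node. For reference, a sketch of the keys and their defaults as read from the JobConf (how you pass them, e.g. as -D properties on the streaming command line, depends on your job launcher and is an assumption):

    # Configuration keys added in this release, with defaults taken from the Java sources above.
    TRANSPORT_OPTIONS = {
      "elasticsearch.transport"      => "false",     # "true" switches to a TransportClient
      "elasticsearch.transport.host" => "localhost", # host the TransportClient connects to
      "elasticsearch.transport.port" => "9300",      # Elasticsearch transport port (not the 9200 HTTP port)
    }
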
@@ -17,6 +17,10 @@ import org.elasticsearch.common.unit.TimeValue;
  import org.elasticsearch.node.Node;
  import org.elasticsearch.node.NodeBuilder;
  import org.elasticsearch.client.Client;
+ import org.elasticsearch.common.transport.InetSocketTransportAddress;
+ import org.elasticsearch.common.settings.Settings;
+ import org.elasticsearch.common.settings.ImmutableSettings;
+ import org.elasticsearch.client.transport.TransportClient;
  import org.elasticsearch.action.search.SearchRequestBuilder;
  import org.elasticsearch.action.search.SearchScrollRequestBuilder;

@@ -42,29 +46,66 @@ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
  private Node node;
  private Client client;
  private ElasticSearchStreamingSplit split;
-
+ private boolean transport;
+ private String transportHost;
+ private Integer transportPort;
+
  private String scrollId;
  private Integer recordsRead;
  private Iterator<SearchHit> hitsItr = null;

- public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf) {
+ public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf, boolean transport, String transportHost, Integer transportPort) {
  this.split = (ElasticSearchStreamingSplit) split;
  this.recordsRead = 0;
  this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE_OPT, ES_REQUEST_SIZE));
  this.scrollTimeout = conf.get(ES_SCROLL_TIMEOUT_OPT, ES_SCROLL_TIMEOUT);
  this.scroll = new Scroll(TimeValue.parseTimeValue(this.scrollTimeout, defaultScrollTimeout));
+
+ this.transport = transport;
+ this.transportHost = transportHost;
+ this.transportPort = transportPort;

  LOG.info("Initializing "+this.split.getSummary());
- startEmbeddedClient();
+ if (transport) {
+ this.client = buildTransportClient();
+ } else {
+ startNode();
+ this.client = node.client();
+ }
  fetchNextHits();
  }

+ /**
+ Build a transport client that will connect to some
+ Elasticsearch node.
+
+ */
+ private Client buildTransportClient() {
+ LOG.info("Connecting transport client to "+transportHost+":"+Integer.toString(transportPort));
+ Settings settings = ImmutableSettings.settingsBuilder().put("client.transport.ignore_cluster_name", "true").build();
+ return new TransportClient(settings).addTransportAddress(new InetSocketTransportAddress(transportHost, transportPort));
+ }
+
+ /**
+ Start an embedded Elasticsearch node.
+
+ The node will not store any data locally (non-datanode) but
+ will connect to a cluster using the default Elasticsearch
+ settings (those available in
+ /etc/elasticsearch/elasticsearch.yml).
+ */
+ private void startNode() {
+ LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+ this.node = NodeBuilder.nodeBuilder().client(true).node();
+ LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
+ }
+
  private void fetchNextHits() {
  if (scrollId == null) {
  LOG.info("Running initial scroll with timeout "+scrollTimeout);
  SearchRequestBuilder request = split.initialScrollRequest(client, scroll, requestSize);
  SearchResponse response = request.execute().actionGet();
- this.scrollId = response.scrollId();
+ this.scrollId = response.getScrollId();
  LOG.info("Got scroll ID "+scrollId);
  // Do we need to call fetchNextHits() again here? Or does
  // the initial request also itself contain the first set
@@ -75,9 +116,9 @@ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
  // LOG.info("Running query for scroll ID "+scrollId+" with timeout "+scrollTimeout);
  SearchScrollRequestBuilder request = split.scrollRequest(client, scroll, scrollId);
  SearchResponse response = request.execute().actionGet();
- this.scrollId = response.scrollId();
+ this.scrollId = response.getScrollId();
  // LOG.info("Got scroll ID "+scrollId);
- this.hitsItr = response.hits().iterator();
+ this.hitsItr = response.getHits().iterator();
  }
  }

@@ -151,26 +192,14 @@ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {

  @Override
  public void close() throws IOException {
- stopEmbeddedClient();
- }
-
- //
- // == Connecting to Elasticsearch ==
- //
-
- private void startEmbeddedClient() {
- LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
- this.node = NodeBuilder.nodeBuilder().client(true).node();
- this.client = node.client();
- LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
- }
-
- private void stopEmbeddedClient() {
- LOG.info("Stopping embedded Elasticsearch client...");
- if (client != null) client.close();
- if (node != null) node.close();
- LOG.info("Left Elasticsearch cluster");
+ if (client != null) {
+ LOG.info("Shutting down Elasticsearch client...");
+ client.close();
+ }
+ if (node != null) {
+ LOG.info("Shutting down Elasticsearch node...");
+ node.close();
+ }
  }

-
  }
@@ -20,8 +20,13 @@ import org.elasticsearch.node.Node;
  import org.elasticsearch.node.NodeBuilder;
  import org.elasticsearch.client.Client;
  import org.elasticsearch.client.Requests;
+ import org.elasticsearch.common.transport.InetSocketTransportAddress;
+ import org.elasticsearch.common.settings.Settings;
+ import org.elasticsearch.common.settings.ImmutableSettings;
+ import org.elasticsearch.client.transport.TransportClient;
  import org.elasticsearch.action.bulk.BulkRequestBuilder;
  import org.elasticsearch.action.bulk.BulkResponse;
+ import org.elasticsearch.action.index.IndexRequest;
  import org.elasticsearch.ExceptionsHelper;

  import org.codehaus.jackson.map.ObjectMapper;
@@ -36,6 +41,7 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
  private String indexFieldName;
  private String mappingFieldName;
  private String idFieldName;
+ private String routingFieldName;
  private Integer bulkSize;

  // Bookkeeping
@@ -48,6 +54,9 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
  private Node node;
  private Client client;
  private volatile BulkRequestBuilder currentRequest;
+ private boolean transport;
+ private String transportHost;
+ private Integer transportPort;

  // JSON parsing
  private ObjectMapper mapper;
@@ -56,35 +65,53 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
  // == Lifecycle ==
  //

- public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, Integer bulkSize) {
- this.defaultIndexName = defaultIndexName;
- this.defaultMappingName = defaultMappingName;
- this.indexFieldName = indexFieldName;
- this.mappingFieldName = mappingFieldName;
- this.idFieldName = idFieldName;
- this.bulkSize = bulkSize;
+ public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, String routingFieldName, Integer bulkSize, boolean transport, String transportHost, Integer transportPort) {
+ this.defaultIndexName = defaultIndexName;
+ this.defaultMappingName = defaultMappingName;
+ this.indexFieldName = indexFieldName;
+ this.mappingFieldName = mappingFieldName;
+ this.idFieldName = idFieldName;
+ this.routingFieldName = routingFieldName;
+ this.bulkSize = bulkSize;
+ this.transport = transport;
+ this.transportHost = transportHost;
+ this.transportPort = transportPort;

  LOG.info("Writing "+Integer.toString(bulkSize)+" records per batch");
  LOG.info("Using default target /"+defaultIndexName+"/"+defaultMappingName);
  LOG.info("Records override default target with index field '"+indexFieldName+"', mapping field '"+mappingFieldName+"', and ID field '"+idFieldName);
-
- startEmbeddedClient();
+ if (transport) {
+ this.client = buildTransportClient();
+ } else {
+ startNode();
+ this.client = node.client();
+ }
  this.currentRequest = client.prepareBulk();
  this.mapper = new ObjectMapper();
  }

  /**
- Start an embedded Elasticsearch client. The client will not be
- a data node and will not store data locally.
+ Build a transport client that will connect to some
+ Elasticsearch node.
+
+ */
+ private Client buildTransportClient() {
+ LOG.info("Connecting transport client to "+transportHost+":"+Integer.toString(transportPort));
+ Settings settings = ImmutableSettings.settingsBuilder().put("client.transport.ignore_cluster_name", "true").build();
+ return new TransportClient(settings).addTransportAddress(new InetSocketTransportAddress(transportHost, transportPort));
+ }
+
+ /**
+ Start an embedded Elasticsearch node.

- The client will connect to the target Elasticsearch cluster as
- a client node, enabling one-hop writes for all data. See
- http://www.elasticsearch.org/guide/reference/java-api/client.html
+ The node will not store any data locally (non-datanode) but
+ will connect to a cluster using the default Elasticsearch
+ settings (those available in
+ /etc/elasticsearch/elasticsearch.yml).
  */
- private void startEmbeddedClient() {
+ private void startNode() {
  LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
  this.node = NodeBuilder.nodeBuilder().client(true).node();
- this.client = node.client();
  LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
  }

@@ -95,10 +122,14 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
  */
  public void close(Reporter reporter) throws IOException {
  sendBulkRequestIfMoreThan(0);
- LOG.info("Shutting down Elasticsearch client...");
- if (client != null) client.close();
- if (node != null) node.close();
- LOG.info("Successfully shut down Elasticsearch client");
+ if (client != null) {
+ LOG.info("Shutting down Elasticsearch client...");
+ client.close();
+ }
+ if (node != null) {
+ LOG.info("Shutting down Elasticsearch node...");
+ node.close();
+ }
  }

  //
@@ -122,12 +153,18 @@ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {

  private void index(String json) throws IOException {
  Map<String, Object> record = mapper.readValue(json, Map.class);
+ IndexRequest request = null;
  if (record.containsKey(idFieldName)) {
  Object idValue = record.get(idFieldName);
- currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json));
+ request = Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json);
  } else {
- currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json));
+ request = Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json);
+ }
+ if (record.containsKey(routingFieldName)) {
+ Object routingValue = record.get(routingFieldName);
+ request.routing(String.valueOf(routingValue));
  }
+ currentRequest.add(request);
  }

  private String indexNameForRecord(Map<String, Object> record) {
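
With the routing support above, a record whose JSON contains the configured routing field (default "_routing") has that value applied to its IndexRequest before it is added to the bulk request. A sketch of such a record as a streaming job might emit it (field values are invented; the field names are the defaults from this diff):

    require "multi_json"

    record = {
      "_index"   => "tweets",    # overrides the default index (elasticsearch.output.index.field)
      "_id"      => "123",       # becomes the document ID (elasticsearch.output.id.field)
      "_routing" => "user-42",   # new: applied via request.routing(...) (elasticsearch.output.routing.field)
      "text"     => "hello world"
    }
    puts MultiJson.dump(record)  # one JSON record per line, as parsed by index(String json)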