wonderdog 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
@@ -0,0 +1,39 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.mapreduce.JobContext;
+ import org.apache.hadoop.mapreduce.OutputCommitter;
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+ /**
+  * Small committer class that does not do anything.
+  */
+ public class ElasticSearchOutputCommitter extends OutputCommitter {
+
+     @Override
+     public void abortTask(TaskAttemptContext arg0) throws IOException {
+     }
+
+     @Override
+     public void cleanupJob(JobContext arg0) throws IOException {
+     }
+
+     @Override
+     public void commitTask(TaskAttemptContext arg0) throws IOException {
+     }
+
+     @Override
+     public boolean needsTaskCommit(TaskAttemptContext arg0) throws IOException {
+         return false;
+     }
+
+     @Override
+     public void setupJob(JobContext arg0) throws IOException {
+     }
+
+     @Override
+     public void setupTask(TaskAttemptContext arg0) throws IOException {
+     }
+
+ }
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
@@ -0,0 +1,283 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.util.Map;
+ import java.util.HashMap;
+ import java.util.ArrayList;
+ import java.util.List;
+ import java.util.concurrent.atomic.AtomicLong;
+ import java.util.Random;
+ import java.net.URI;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.conf.Configurable;
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
+ import org.apache.hadoop.mapreduce.JobContext;
+ import org.apache.hadoop.mapreduce.RecordWriter;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.mapreduce.Counter;
+ import org.apache.hadoop.mapreduce.OutputFormat;
+ import org.apache.hadoop.mapreduce.OutputCommitter;
+ import org.apache.hadoop.filecache.DistributedCache;
+
+ import org.elasticsearch.common.xcontent.XContentBuilder;
+ import org.elasticsearch.common.xcontent.XContentFactory;
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.client.Requests;
+ import org.elasticsearch.action.bulk.BulkRequestBuilder;
+ import org.elasticsearch.indices.IndexAlreadyExistsException;
+ import org.elasticsearch.action.bulk.BulkResponse;
+ import org.elasticsearch.ExceptionsHelper;
+
+ import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+ /**
+    Hadoop OutputFormat for writing arbitrary MapWritables (essentially HashMaps) into Elasticsearch. Records are batched up and sent
+    in a one-hop manner to the Elasticsearch data nodes that will index them.
+  */
+ public class ElasticSearchOutputFormat extends OutputFormat<NullWritable, MapWritable> implements Configurable {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchOutputFormat.class);
+     private Configuration conf = null;
+
+     protected class ElasticSearchRecordWriter extends RecordWriter<NullWritable, MapWritable> {
+
+         private Node node;
+         private Client client;
+         private String indexName;
+         private int bulkSize;
+         private int idField;
+         private String idFieldName;
+         private String objType;
+         private String[] fieldNames;
+
+         // Used for bookkeeping purposes
+         private AtomicLong totalBulkTime = new AtomicLong();
+         private AtomicLong totalBulkItems = new AtomicLong();
+         private Random randgen = new Random();
+         private long runStartTime = System.currentTimeMillis();
+
+         // For hadoop configuration
+         private static final String ES_CONFIG_NAME = "elasticsearch.yml";
+         private static final String ES_PLUGINS_NAME = "plugins";
+         private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+         private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
+         private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
+         private static final String ES_ID_FIELD = "elasticsearch.id.field";
+         private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+         private static final String ES_CONFIG = "es.config";
+         private static final String ES_PLUGINS = "es.path.plugins";
+
+         // Other string constants
+         private static final String COMMA = ",";
+         private static final String SLASH = "/";
+         private static final String NO_ID_FIELD = "-1";
+
+         private volatile BulkRequestBuilder currentRequest;
+
+         /**
+            Instantiates a new RecordWriter for Elasticsearch
+            <p>
+            The properties that <b>MUST</b> be set in the hadoop Configuration object
+            are as follows:
+            <ul>
+            <li><b>elasticsearch.index.name</b> - The name of the elasticsearch index data will be written to. It does not have to exist ahead of time</li>
+            <li><b>elasticsearch.bulk.size</b> - The number of records to be accumulated into a bulk request before writing to elasticsearch.</li>
+            <li><b>elasticsearch.is_json</b> - A boolean indicating whether the records to be indexed are json records. If false the records are assumed to be tsv, in which case <b>elasticsearch.field.names</b> must be set and contain a comma separated list of field names</li>
+            <li><b>elasticsearch.object.type</b> - The type of objects being indexed</li>
+            <li><b>elasticsearch.config</b> - The full path to elasticsearch.yml. It is a local path and must exist on all machines in the hadoop cluster.</li>
+            <li><b>elasticsearch.plugins.dir</b> - The full path to the elasticsearch plugins directory. It is a local path and must exist on all machines in the hadoop cluster.</li>
+            </ul>
+            <p>
+            The following fields depend on whether <b>elasticsearch.is_json</b> is true or false.
+            <ul>
+            <li><b>elasticsearch.id.field.name</b> - When <b>elasticsearch.is_json</b> is true, this is the name of a field in the json document that contains the document's id. If -1 is used then the document is assumed to have no id and one is assigned to it by elasticsearch.</li>
+            <li><b>elasticsearch.field.names</b> - When <b>elasticsearch.is_json</b> is false, this is a comma separated list of field names.</li>
+            <li><b>elasticsearch.id.field</b> - When <b>elasticsearch.is_json</b> is false, this is the numeric index of the field to use as the document id. If -1 is used the document is assumed to have no id and one is assigned to it by elasticsearch.</li>
+            </ul>
+          */
+         public ElasticSearchRecordWriter(TaskAttemptContext context) {
+             Configuration conf = context.getConfiguration();
+             this.indexName = conf.get(ES_INDEX_NAME);
+             this.bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE));
+             this.idFieldName = conf.get(ES_ID_FIELD_NAME);
+             if (idFieldName.equals(NO_ID_FIELD)) {
+                 LOG.info("Documents will be assigned ids by elasticsearch");
+                 this.idField = -1;
+             } else {
+                 LOG.info("Using field:["+idFieldName+"] for document ids");
+             }
+             this.objType = conf.get(ES_OBJECT_TYPE);
+
+             //
+             // Fetches elasticsearch.yml and the plugins directory from the distributed cache, or
+             // from the local config.
+             //
+             try {
+                 String taskConfigPath = HadoopUtils.fetchFileFromCache(ES_CONFIG_NAME, conf);
+                 LOG.info("Using ["+taskConfigPath+"] as es.config");
+                 String taskPluginsPath = HadoopUtils.fetchArchiveFromCache(ES_PLUGINS_NAME, conf);
+                 LOG.info("Using ["+taskPluginsPath+"] as es.plugins.dir");
+                 System.setProperty(ES_CONFIG, taskConfigPath);
+                 System.setProperty(ES_PLUGINS, taskPluginsPath+SLASH+ES_PLUGINS_NAME);
+             } catch (Exception e) {
+                 System.setProperty(ES_CONFIG, conf.get(ES_CONFIG));
+                 System.setProperty(ES_PLUGINS, conf.get(ES_PLUGINS));
+             }
+
+             start_embedded_client();
+             initialize_index(indexName);
+             currentRequest = client.prepareBulk();
+         }
+
+         /**
+            Closes the connection to elasticsearch. Any documents remaining in the bulkRequest object are indexed.
+          */
+         public void close(TaskAttemptContext context) throws IOException {
+             if (currentRequest.numberOfActions() > 0) {
+                 try {
+                     BulkResponse response = currentRequest.execute().actionGet();
+                 } catch (Exception e) {
+                     LOG.warn("Bulk request failed: " + e.getMessage());
+                     throw new RuntimeException(e);
+                 }
+             }
+             LOG.info("Closing record writer");
+             client.close();
+             LOG.info("Client is closed");
+             if (node != null) {
+                 node.close();
+             }
+             LOG.info("Record writer closed.");
+         }
+
+         /**
+            Writes a single MapWritable record to the bulkRequest object. Once <b>elasticsearch.bulk.size</b> records are accumulated
+            they are written to elasticsearch.
+          */
+         public void write(NullWritable key, MapWritable fields) throws IOException {
+             XContentBuilder builder = XContentFactory.jsonBuilder();
+             buildContent(builder, fields);
+             if (idField == -1) {
+                 // Document has no inherent id
+                 currentRequest.add(Requests.indexRequest(indexName).type(objType).source(builder));
+             } else {
+                 try {
+                     Text mapKey = new Text(idFieldName);
+                     String record_id = fields.get(mapKey).toString();
+                     currentRequest.add(Requests.indexRequest(indexName).id(record_id).type(objType).create(false).source(builder));
+                 } catch (Exception e) {
+                     LOG.warn("Encountered malformed record");
+                 }
+             }
+             processBulkIfNeeded();
+         }
+
+         /**
+            Recursively untangles the MapWritable and writes the fields into elasticsearch's XContentBuilder builder.
+          */
+         private void buildContent(XContentBuilder builder, Writable value) throws IOException {
+             if (value instanceof Text) {
+                 builder.value(((Text)value).toString());
+             } else if (value instanceof LongWritable) {
+                 builder.value(((LongWritable)value).get());
+             } else if (value instanceof IntWritable) {
+                 builder.value(((IntWritable)value).get());
+             } else if (value instanceof DoubleWritable) {
+                 builder.value(((DoubleWritable)value).get());
+             } else if (value instanceof FloatWritable) {
+                 builder.value(((FloatWritable)value).get());
+             } else if (value instanceof BooleanWritable) {
+                 builder.value(((BooleanWritable)value).get());
+             } else if (value instanceof MapWritable) {
+                 builder.startObject();
+                 for (Map.Entry<Writable,Writable> entry : ((MapWritable)value).entrySet()) {
+                     if (!(entry.getValue() instanceof NullWritable)) {
+                         builder.field(entry.getKey().toString());
+                         buildContent(builder, entry.getValue());
+                     }
+                 }
+                 builder.endObject();
+             } else if (value instanceof ArrayWritable) {
+                 builder.startArray();
+                 Writable[] arrayOfThings = ((ArrayWritable)value).get();
+                 for (int i = 0; i < arrayOfThings.length; i++) {
+                     buildContent(builder, arrayOfThings[i]);
+                 }
+                 builder.endArray();
+             }
+         }
+
+         /**
+            Indexes content to elasticsearch when <b>elasticsearch.bulk.size</b> records have been accumulated.
+          */
+         private void processBulkIfNeeded() {
+             totalBulkItems.incrementAndGet();
+             if (currentRequest.numberOfActions() >= bulkSize) {
+                 try {
+                     long startTime = System.currentTimeMillis();
+                     BulkResponse response = currentRequest.execute().actionGet();
+                     totalBulkTime.addAndGet(System.currentTimeMillis() - startTime);
+                     if (randgen.nextDouble() < 0.1) {
+                         LOG.info("Indexed [" + totalBulkItems.get() + "] in [" + (totalBulkTime.get()/1000) + "s] of indexing"+"[" + ((System.currentTimeMillis() - runStartTime)/1000) + "s] of wall clock"+" for ["+ (float)(1000.0*totalBulkItems.get())/(System.currentTimeMillis() - runStartTime) + "rec/s]");
+                     }
+                 } catch (Exception e) {
+                     LOG.warn("Bulk request failed: " + e.getMessage());
+                     throw new RuntimeException(e);
+                 }
+                 currentRequest = client.prepareBulk();
+             }
+         }
+
+         private void initialize_index(String indexName) {
+             LOG.info("Initializing index");
+             try {
+                 client.admin().indices().prepareCreate(indexName).execute().actionGet();
+             } catch (Exception e) {
+                 if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException) {
+                     LOG.warn("Index ["+indexName+"] already exists");
+                 }
+             }
+         }
+
+         //
+         // Starts an embedded elasticsearch client (ie. data = false)
+         //
+         private void start_embedded_client() {
+             LOG.info("Starting embedded elasticsearch client ...");
+             this.node = NodeBuilder.nodeBuilder().client(true).node();
+             this.client = node.client();
+         }
+     }
+
+     public RecordWriter<NullWritable, MapWritable> getRecordWriter(final TaskAttemptContext context) throws IOException, InterruptedException {
+         return new ElasticSearchRecordWriter(context);
+     }
+
+     public void setConf(Configuration conf) {
+     }
+
+     public Configuration getConf() {
+         return conf;
+     }
+
+     @Override
+     public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
+         // TODO Check if the object exists?
+     }
+
+     @Override
+     public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+         return new ElasticSearchOutputCommitter();
+     }
+ }
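
The Javadoc above lists the Configuration keys the record writer reads. As a rough illustration only, the sketch below shows how a driver job might set those keys before writing MapWritable records through ElasticSearchOutputFormat. The index name, type, bulk size, and paths are hypothetical values, and the mapper and input format wiring is omitted; it is a sketch against the documented keys, not part of the gem.

// Hypothetical driver sketch: supplies the Configuration keys documented in the
// ElasticSearchOutputFormat Javadoc, then registers the output format on the job.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;

import com.infochimps.elasticsearch.ElasticSearchOutputFormat;

public class ExampleEsIndexingJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("elasticsearch.index.name", "tweets");       // created on the fly if missing
        conf.set("elasticsearch.bulk.size", "1000");          // records accumulated per bulk request
        conf.set("elasticsearch.object.type", "tweet");       // type the documents are indexed under
        conf.set("elasticsearch.id.field.name", "-1");        // -1: let Elasticsearch assign ids
        // Local fallbacks used when elasticsearch.yml and the plugins dir
        // are not shipped via the distributed cache (hypothetical paths).
        conf.set("es.config", "/etc/elasticsearch/elasticsearch.yml");
        conf.set("es.path.plugins", "/usr/local/share/elasticsearch/plugins");

        Job job = new Job(conf, "index-into-elasticsearch");
        job.setJarByClass(ExampleEsIndexingJob.class);
        job.setOutputFormatClass(ElasticSearchOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(MapWritable.class);
        // ... set the mapper class and the input format for the source data here ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}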
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java
@@ -0,0 +1,60 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+ import java.io.DataInput;
+ import java.io.DataOutput;
+
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.io.Writable;
+ import org.apache.hadoop.mapreduce.InputSplit;
+
+ public class ElasticSearchSplit extends InputSplit implements Writable {
+
+     private String queryString;
+     private long from;
+     private long size;
+
+     public ElasticSearchSplit() {}
+
+     public ElasticSearchSplit(String queryString, long from, long size) {
+         this.queryString = queryString;
+         this.from = from;
+         this.size = size;
+     }
+
+     public String getQueryString() {
+         return queryString;
+     }
+
+     public long getFrom() {
+         return from;
+     }
+
+     public long getSize() {
+         return size;
+     }
+
+     @Override
+     public String[] getLocations() {
+         return new String[] {};
+     }
+
+     @Override
+     public long getLength() {
+         return 0;
+     }
+
+     @Override
+     public void readFields(DataInput in) throws IOException {
+         queryString = Text.readString(in);
+         from = in.readLong();
+         size = in.readLong();
+     }
+
+     @Override
+     public void write(DataOutput out) throws IOException {
+         Text.writeString(out, queryString);
+         out.writeLong(from);
+         out.writeLong(size);
+     }
+ }
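
ElasticSearchSplit carries its query string, offset, and size between the job client and the tasks purely through the Writable methods above. A minimal round-trip sketch (assumed example, not part of the gem; it only relies on Hadoop's DataOutputBuffer/DataInputBuffer helpers) shows how a split serialized with write() is rebuilt with readFields() on the task side:

// Serialize an ElasticSearchSplit and read it back, mirroring what the framework
// does when shipping the split to a task. Query and sizes are hypothetical.
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

import com.infochimps.elasticsearch.ElasticSearchSplit;

public class SplitRoundTrip {
    public static void main(String[] args) throws Exception {
        ElasticSearchSplit original = new ElasticSearchSplit("{\"match_all\": {}}", 0L, 1000L);

        // Job-client side: serialize the split.
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // Task side: deserialize into a fresh instance via the no-arg constructor.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        ElasticSearchSplit copy = new ElasticSearchSplit();
        copy.readFields(in);

        System.out.println(copy.getQueryString() + " from=" + copy.getFrom() + " size=" + copy.getSize());
    }
}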
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java
@@ -0,0 +1,231 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+ import java.io.BufferedReader;
+ import java.io.FileReader;
+ import java.util.List;
+ import java.util.ArrayList;
+ import java.util.Map;
+ import java.util.HashMap;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.InputFormat;
+ import org.apache.hadoop.mapred.RecordReader;
+ import org.apache.hadoop.mapred.InputSplit;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.Reporter;
+
+ import org.elasticsearch.common.settings.loader.YamlSettingsLoader;
+ import org.elasticsearch.common.transport.InetSocketTransportAddress;
+
+ import org.elasticsearch.client.transport.TransportClient;
+ import org.elasticsearch.action.search.SearchRequestBuilder;
+
+ import org.elasticsearch.action.search.SearchResponse;
+ import org.elasticsearch.action.search.SearchType;
+ import org.elasticsearch.index.query.FilterBuilders.*;
+ import org.elasticsearch.cluster.ClusterName;
+
+ public class ElasticSearchStreamingInputFormat<K, V> implements InputFormat<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingInputFormat.class);
+
+     // Job settings we need to control directly from Java options.
+     private static final String ES_INDEX_OPT = "elasticsearch.input.index";
+     private static final String ES_DEFAULT_INDEX = "hadoop";
+     private String indexName;
+
+     private static final String ES_MAPPING_OPT = "elasticsearch.input.mapping";
+     private static final String ES_DEFAULT_MAPPING = "streaming_record";
+     private String mappingName;
+
+     private static final String ES_NUM_SPLITS_OPT = "elasticsearch.input.splits";
+     private static final String ES_NUM_SPLITS = "1";
+     private Integer numSplits;
+
+     private static final String ES_QUERY_OPT = "elasticsearch.input.query";
+     private static final String ES_QUERY = "{\"match_all\": {}}";
+     private String queryJSON;
+
+     // Calculated after the first query.
+     private long numHits;
+     private Integer recordsPerSplit;
+
+     // Elasticsearch internal settings required to make a client
+     // connection.
+     private static final String ES_CONFIG_OPT = "es.config";
+     private static final String ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+
+     private static final String ES_PLUGINS_OPT = "es.path.plugins";
+     private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+
+     private static final String ES_UNICAST_HOSTS_NAME = "discovery.zen.ping.unicast.hosts";
+
+     private TransportClient client;
+
+     public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) {
+         setLocalElasticSearchInstallation(conf);
+         return (RecordReader) new ElasticSearchStreamingRecordReader(split, conf);
+     }
+
+     public InputSplit[] getSplits(JobConf conf, int requestedNumSplits) {
+         this.numSplits = requestedNumSplits;
+
+         setLocalElasticSearchInstallation(conf);
+         parseInput(conf);
+
+         startTransportClient(conf);
+         findNumHits();
+         stopTransportClient();
+
+         return createSplits();
+     }
+
+
+     //
+     // == Setup ==
+     //
+
+     public void setLocalElasticSearchInstallation(JobConf conf) {
+         String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+         String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+         System.setProperty(ES_CONFIG_OPT, esConfigPath);
+         System.setProperty(ES_PLUGINS_OPT, esPluginsPath);
+         LOG.info("Using Elasticsearch configuration file at "+esConfigPath+" and plugin directory "+esPluginsPath);
+     }
+
+     private void parseInput(JobConf conf) {
+         this.indexName = conf.get(ES_INDEX_OPT, ES_DEFAULT_INDEX);
+         this.mappingName = conf.get(ES_MAPPING_OPT, ES_DEFAULT_MAPPING);
+         // this.numSplits = Integer.parseInt(conf.get(ES_NUM_SPLITS_OPT, ES_NUM_SPLITS));
+         this.queryJSON = conf.get(ES_QUERY_OPT, ES_QUERY);
+         String message = "Using input /"+indexName;
+         if (mappingName != null && mappingName.length() > 0) {
+             message += "/"+mappingName;
+         }
+         if (queryJSON != null && queryJSON.length() > 0) {
+             message += " with query: "+queryJSON;
+         }
+         LOG.info(message);
+     }
+
+     //
+     // == Connecting to Elasticsearch and Querying ==
+     //
+
+     private void startTransportClient(JobConf conf) {
+         this.client = new TransportClient();
+         Map<String,String> settings = parsedSettings(conf);
+         String host = hostname(settings);
+         if (host.toString().length() == 0) {
+             System.exit(1);
+         }
+         LOG.info("Attempting to connect to Elasticsearch node at " + host + ":9300");
+         this.client = new TransportClient().addTransportAddress(new InetSocketTransportAddress(host, 9300));
+         LOG.info("Connected to Elasticsearch cluster");
+     }
+
+     private Map<String,String> parsedSettings(JobConf conf) {
+         String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+         String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+
+         try {
+             BufferedReader reader = new BufferedReader( new FileReader(esConfigPath));
+             String line = null;
+             StringBuilder stringBuilder = new StringBuilder();
+             String ls = System.getProperty("line.separator");
+             while( ( line = reader.readLine() ) != null ) {
+                 stringBuilder.append( line );
+                 stringBuilder.append( ls );
+             }
+             return new YamlSettingsLoader().load(stringBuilder.toString());
+         } catch (IOException e) {
+             LOG.error("Could not find or read the configuration file " + esConfigPath + ".");
+             return new HashMap<String,String>();
+         }
+     }
+
+     private String hostname(Map<String,String> settings) {
+         String hostsString = settings.get(ES_UNICAST_HOSTS_NAME);
+         if (hostsString.toString().length() == 0) {
+             LOG.error("Could not find hosts. Did you set the '" + ES_UNICAST_HOSTS_NAME + "' key?");
+             return "";
+         }
+
+         String[] hosts = hostsString.split(",");
+         if (hosts.length > 0) {
+             String host = hosts[0];
+             if (host.toString().length() == 0) {
+                 LOG.error("Could not parse hosts from '" + ES_UNICAST_HOSTS_NAME + "' key.");
+                 return "";
+             } else {
+                 return host;
+             }
+         } else {
+             LOG.error("Could not find any hosts in the '" + ES_UNICAST_HOSTS_NAME + "' key.");
+             return "";
+         }
+     }
+
+     private void stopTransportClient() {
+         if (client != null) client.close();
+         LOG.info("Disconnected from Elasticsearch cluster");
+     }
+
+     private void findNumHits() {
+         SearchRequestBuilder request = client.prepareSearch(indexName);
+         if (mappingName != null && mappingName.length() > 0) {
+             request.setTypes(mappingName);
+         }
+         request.setSearchType(SearchType.COUNT);
+         if (queryJSON != null && queryJSON.length() > 0) {
+             request.setQuery(queryJSON);
+         }
+         SearchResponse response = request.execute().actionGet();
+         this.numHits = response.hits().totalHits();
+
+         LOG.info("Ran query: "+String.valueOf(numHits)+" hits");
+     }
+
+     //
+     // == Setting splits ==
+     //
+
+     private void readjustSplitsByHits() {
+
+     }
+
+     private InputSplit[] createSplits() {
+         // Say that
+         //
+         //   numHits   = 7
+         //   numSplits = 2
+         if((long) numSplits > numHits) {
+             numSplits = (int) numHits;
+         }
+
+         this.recordsPerSplit = (int) (numHits/((long)numSplits)); // == 3 records/split
+
+         List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
+
+         // i == 0, 1
+         for(int i = 0; i < numSplits; i++) {
+             Integer from = i * recordsPerSplit;
+             splits.add(new ElasticSearchStreamingSplit(indexName, mappingName, numSplits, queryJSON, numHits, from, recordsPerSplit));
+         }
+         // 7 is > (2 * 3) == 6
+         if (numHits > ((long) (numSplits * recordsPerSplit))) {
+             Integer from = numSplits * recordsPerSplit;
+             Integer size = (int) (numHits - ((long) from));
+             splits.add(new ElasticSearchStreamingSplit(indexName, mappingName, numSplits, queryJSON, numHits, from, size));
+         }
+
+         LOG.info("Splitting "+String.valueOf(numHits)+" hits across "+String.valueOf(splits.size())+" splits ("+String.valueOf(recordsPerSplit)+" hits/split)");
+
+         return splits.toArray(new InputSplit[splits.size()]);
+     }
+
+ }
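
The inline comments in createSplits() above walk through the numHits = 7, numSplits = 2 case. The standalone sketch below (plain Java with hypothetical values, no Hadoop or Elasticsearch dependencies) reproduces just that arithmetic: two splits of 3 hits each, plus a trailing split that picks up the 1 leftover hit.

// Split arithmetic only: prints from/size for each split the input format would create.
public class SplitMath {
    public static void main(String[] args) {
        long numHits = 7;     // hits reported by the count query (hypothetical)
        int numSplits = 2;    // splits requested by the framework (hypothetical)

        // Never create more splits than there are hits.
        if ((long) numSplits > numHits) numSplits = (int) numHits;
        int recordsPerSplit = (int) (numHits / (long) numSplits);   // 7 / 2 == 3

        for (int i = 0; i < numSplits; i++) {
            int from = i * recordsPerSplit;
            System.out.println("split " + i + ": from=" + from + " size=" + recordsPerSplit);
        }
        // 7 > 2 * 3, so one extra split holds the remaining hit.
        if (numHits > (long) (numSplits * recordsPerSplit)) {
            int from = numSplits * recordsPerSplit;
            long size = numHits - from;
            System.out.println("split " + numSplits + ": from=" + from + " size=" + size);
        }
        // Output: split 0: from=0 size=3, split 1: from=3 size=3, split 2: from=6 size=1
    }
}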