wonderdog 0.0.1

Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
@@ -0,0 +1,39 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.mapreduce.JobContext;
+ import org.apache.hadoop.mapreduce.OutputCommitter;
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+ /**
+  * Small committer class that does not do anything.
+  */
+ public class ElasticSearchOutputCommitter extends OutputCommitter {
+
+     @Override
+     public void abortTask(TaskAttemptContext arg0) throws IOException {
+     }
+
+     @Override
+     public void cleanupJob(JobContext arg0) throws IOException {
+     }
+
+     @Override
+     public void commitTask(TaskAttemptContext arg0) throws IOException {
+     }
+
+     @Override
+     public boolean needsTaskCommit(TaskAttemptContext arg0) throws IOException {
+         return false;
+     }
+
+     @Override
+     public void setupJob(JobContext arg0) throws IOException {
+     }
+
+     @Override
+     public void setupTask(TaskAttemptContext arg0) throws IOException {
+     }
+
+ }
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
@@ -0,0 +1,283 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.util.Map;
+ import java.util.HashMap;
+ import java.util.ArrayList;
+ import java.util.List;
+ import java.util.concurrent.atomic.AtomicLong;
+ import java.util.Random;
+ import java.net.URI;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.conf.Configurable;
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
+ import org.apache.hadoop.mapreduce.JobContext;
+ import org.apache.hadoop.mapreduce.RecordWriter;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.mapreduce.Counter;
+ import org.apache.hadoop.mapreduce.OutputFormat;
+ import org.apache.hadoop.mapreduce.OutputCommitter;
+ import org.apache.hadoop.filecache.DistributedCache;
+
+ import org.elasticsearch.common.xcontent.XContentBuilder;
+ import org.elasticsearch.common.xcontent.XContentFactory;
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.client.Requests;
+ import org.elasticsearch.action.bulk.BulkRequestBuilder;
+ import org.elasticsearch.indices.IndexAlreadyExistsException;
+ import org.elasticsearch.action.bulk.BulkResponse;
+ import org.elasticsearch.ExceptionsHelper;
+
+ import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+ /**
+
+    Hadoop OutputFormat for writing arbitrary MapWritables (essentially HashMaps) into Elasticsearch. Records are batched up and sent
+    in a one-hop manner to the Elasticsearch data nodes that will index them.
+
+ */
+ public class ElasticSearchOutputFormat extends OutputFormat<NullWritable, MapWritable> implements Configurable {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchOutputFormat.class);
+     private Configuration conf = null;
+
+     protected class ElasticSearchRecordWriter extends RecordWriter<NullWritable, MapWritable> {
+
+         private Node node;
+         private Client client;
+         private String indexName;
+         private int bulkSize;
+         private int idField;
+         private String idFieldName;
+         private String objType;
+         private String[] fieldNames;
+
+         // Used for bookkeeping purposes
+         private AtomicLong totalBulkTime = new AtomicLong();
+         private AtomicLong totalBulkItems = new AtomicLong();
+         private Random randgen = new Random();
+         private long runStartTime = System.currentTimeMillis();
+
+         // For hadoop configuration
+         private static final String ES_CONFIG_NAME = "elasticsearch.yml";
+         private static final String ES_PLUGINS_NAME = "plugins";
+         private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+         private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
+         private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
+         private static final String ES_ID_FIELD = "elasticsearch.id.field";
+         private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+         private static final String ES_CONFIG = "es.config";
+         private static final String ES_PLUGINS = "es.path.plugins";
+
+         // Other string constants
+         private static final String COMMA = ",";
+         private static final String SLASH = "/";
+         private static final String NO_ID_FIELD = "-1";
+
+         private volatile BulkRequestBuilder currentRequest;
+
+         /**
+            Instantiates a new RecordWriter for Elasticsearch.
+            <p>
+            The properties that <b>MUST</b> be set in the hadoop Configuration object
+            are as follows:
+            <ul>
+            <li><b>elasticsearch.index.name</b> - The name of the elasticsearch index that data will be written to. It does not have to exist ahead of time.</li>
+            <li><b>elasticsearch.bulk.size</b> - The number of records to be accumulated into a bulk request before writing to elasticsearch.</li>
+            <li><b>elasticsearch.is_json</b> - A boolean indicating whether the records to be indexed are json records. If false the records are assumed to be tsv, in which case <b>elasticsearch.field.names</b> must be set and contain a comma separated list of field names.</li>
+            <li><b>elasticsearch.object.type</b> - The type of objects being indexed.</li>
+            <li><b>elasticsearch.config</b> - The full path to the elasticsearch.yml file. It is a local path and must exist on all machines in the hadoop cluster.</li>
+            <li><b>elasticsearch.plugins.dir</b> - The full path to the elasticsearch plugins directory. It is a local path and must exist on all machines in the hadoop cluster.</li>
+            </ul>
+            <p>
+            The following fields depend on whether <b>elasticsearch.is_json</b> is true or false.
+            <ul>
+            <li><b>elasticsearch.id.field.name</b> - When <b>elasticsearch.is_json</b> is true, this is the name of a field in the json document that contains the document's id. If -1 is used then the document is assumed to have no id and one is assigned to it by elasticsearch.</li>
+            <li><b>elasticsearch.field.names</b> - When <b>elasticsearch.is_json</b> is false, this is a comma separated list of field names.</li>
+            <li><b>elasticsearch.id.field</b> - When <b>elasticsearch.is_json</b> is false, this is the numeric index of the field to use as the document id. If -1 is used the document is assumed to have no id and one is assigned to it by elasticsearch.</li>
+            </ul>
+          */
+         public ElasticSearchRecordWriter(TaskAttemptContext context) {
+             Configuration conf = context.getConfiguration();
+             this.indexName = conf.get(ES_INDEX_NAME);
+             this.bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE));
+             this.idFieldName = conf.get(ES_ID_FIELD_NAME);
+             if (idFieldName.equals(NO_ID_FIELD)) {
+                 LOG.info("Documents will be assigned ids by elasticsearch");
+                 this.idField = -1;
+             } else {
+                 LOG.info("Using field:["+idFieldName+"] for document ids");
+             }
+             this.objType = conf.get(ES_OBJECT_TYPE);
+
+             //
+             // Fetches elasticsearch.yml and the plugins directory from the distributed cache, or
+             // from the local config.
+             //
+             try {
+                 String taskConfigPath = HadoopUtils.fetchFileFromCache(ES_CONFIG_NAME, conf);
+                 LOG.info("Using ["+taskConfigPath+"] as es.config");
+                 String taskPluginsPath = HadoopUtils.fetchArchiveFromCache(ES_PLUGINS_NAME, conf);
+                 LOG.info("Using ["+taskPluginsPath+"] as es.plugins.dir");
+                 System.setProperty(ES_CONFIG, taskConfigPath);
+                 System.setProperty(ES_PLUGINS, taskPluginsPath+SLASH+ES_PLUGINS_NAME);
+             } catch (Exception e) {
+                 System.setProperty(ES_CONFIG, conf.get(ES_CONFIG));
+                 System.setProperty(ES_PLUGINS, conf.get(ES_PLUGINS));
+             }
+
+             start_embedded_client();
+             initialize_index(indexName);
+             currentRequest = client.prepareBulk();
+         }
+
+         /**
+            Closes the connection to elasticsearch. Any documents remaining in the bulkRequest object are indexed.
+          */
+         public void close(TaskAttemptContext context) throws IOException {
+             if (currentRequest.numberOfActions() > 0) {
+                 try {
+                     BulkResponse response = currentRequest.execute().actionGet();
+                 } catch (Exception e) {
+                     LOG.warn("Bulk request failed: " + e.getMessage());
+                     throw new RuntimeException(e);
+                 }
+             }
+             LOG.info("Closing record writer");
+             client.close();
+             LOG.info("Client is closed");
+             if (node != null) {
+                 node.close();
+             }
+             LOG.info("Record writer closed.");
+         }
+
+         /**
+            Writes a single MapWritable record to the bulkRequest object. Once <b>elasticsearch.bulk.size</b> records have been accumulated,
+            they are written to elasticsearch.
+          */
+         public void write(NullWritable key, MapWritable fields) throws IOException {
+             XContentBuilder builder = XContentFactory.jsonBuilder();
+             buildContent(builder, fields);
+             if (idField == -1) {
+                 // Document has no inherent id
+                 currentRequest.add(Requests.indexRequest(indexName).type(objType).source(builder));
+             } else {
+                 try {
+                     Text mapKey = new Text(idFieldName);
+                     String record_id = fields.get(mapKey).toString();
+                     currentRequest.add(Requests.indexRequest(indexName).id(record_id).type(objType).create(false).source(builder));
+                 } catch (Exception e) {
+                     LOG.warn("Encountered malformed record");
+                 }
+             }
+             processBulkIfNeeded();
+         }
+
+         /**
+            Recursively untangles the MapWritable and writes the fields into elasticsearch's XContentBuilder builder.
+          */
+         private void buildContent(XContentBuilder builder, Writable value) throws IOException {
+             if (value instanceof Text) {
+                 builder.value(((Text)value).toString());
+             } else if (value instanceof LongWritable) {
+                 builder.value(((LongWritable)value).get());
+             } else if (value instanceof IntWritable) {
+                 builder.value(((IntWritable)value).get());
+             } else if (value instanceof DoubleWritable) {
+                 builder.value(((DoubleWritable)value).get());
+             } else if (value instanceof FloatWritable) {
+                 builder.value(((FloatWritable)value).get());
+             } else if (value instanceof BooleanWritable) {
+                 builder.value(((BooleanWritable)value).get());
+             } else if (value instanceof MapWritable) {
+                 builder.startObject();
+                 for (Map.Entry<Writable,Writable> entry : ((MapWritable)value).entrySet()) {
+                     if (!(entry.getValue() instanceof NullWritable)) {
+                         builder.field(entry.getKey().toString());
+                         buildContent(builder, entry.getValue());
+                     }
+                 }
+                 builder.endObject();
+             } else if (value instanceof ArrayWritable) {
+                 builder.startArray();
+                 Writable[] arrayOfThings = ((ArrayWritable)value).get();
+                 for (int i = 0; i < arrayOfThings.length; i++) {
+                     buildContent(builder, arrayOfThings[i]);
+                 }
+                 builder.endArray();
+             }
+         }
+
+         /**
+            Indexes content to elasticsearch when <b>elasticsearch.bulk.size</b> records have been accumulated.
+          */
+         private void processBulkIfNeeded() {
+             totalBulkItems.incrementAndGet();
+             if (currentRequest.numberOfActions() >= bulkSize) {
+                 try {
+                     long startTime = System.currentTimeMillis();
+                     BulkResponse response = currentRequest.execute().actionGet();
+                     totalBulkTime.addAndGet(System.currentTimeMillis() - startTime);
+                     if (randgen.nextDouble() < 0.1) {
+                         LOG.info("Indexed [" + totalBulkItems.get() + "] in [" + (totalBulkTime.get()/1000) + "s] of indexing"+"[" + ((System.currentTimeMillis() - runStartTime)/1000) + "s] of wall clock"+" for ["+ (float)(1000.0*totalBulkItems.get())/(System.currentTimeMillis() - runStartTime) + "rec/s]");
+                     }
+                 } catch (Exception e) {
+                     LOG.warn("Bulk request failed: " + e.getMessage());
+                     throw new RuntimeException(e);
+                 }
+                 currentRequest = client.prepareBulk();
+             }
+         }
+
+         private void initialize_index(String indexName) {
+             LOG.info("Initializing index");
+             try {
+                 client.admin().indices().prepareCreate(indexName).execute().actionGet();
+             } catch (Exception e) {
+                 if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException) {
+                     LOG.warn("Index ["+indexName+"] already exists");
+                 }
+             }
+         }
+
+         //
+         // Starts an embedded elasticsearch client (ie. data = false)
+         //
+         private void start_embedded_client() {
+             LOG.info("Starting embedded elasticsearch client ...");
+             this.node = NodeBuilder.nodeBuilder().client(true).node();
+             this.client = node.client();
+         }
+     }
+
+     public RecordWriter<NullWritable, MapWritable> getRecordWriter(final TaskAttemptContext context) throws IOException, InterruptedException {
+         return new ElasticSearchRecordWriter(context);
+     }
+
+     public void setConf(Configuration conf) {
+     }
+
+     public Configuration getConf() {
+         return conf;
+     }
+
+     @Override
+     public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
+         // TODO Check if the object exists?
+     }
+
+     @Override
+     public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+         return new ElasticSearchOutputCommitter();
+     }
+ }
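
Note: the keys documented in the RecordWriter javadoc above are ordinary Hadoop Configuration properties. A minimal driver sketch follows; the job class, index name, object type, bulk size, and local paths are hypothetical examples chosen for illustration (and the mapper/input wiring is elided), not values shipped with wonderdog.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;

import com.infochimps.elasticsearch.ElasticSearchOutputFormat;

public class ExampleIndexingJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Keys read by ElasticSearchRecordWriter (see the javadoc above); values are examples.
        conf.set("elasticsearch.index.name", "tweets");       // created on the fly if missing
        conf.set("elasticsearch.bulk.size", "1000");          // records per bulk request
        conf.set("elasticsearch.object.type", "tweet");       // type to index into
        conf.set("elasticsearch.id.field.name", "-1");        // "-1" => Elasticsearch assigns ids
        // Fallback local paths, used when elasticsearch.yml and the plugins dir are not in the distributed cache.
        conf.set("es.config", "/etc/elasticsearch/elasticsearch.yml");
        conf.set("es.path.plugins", "/usr/local/share/elasticsearch/plugins");

        Job job = new Job(conf, "bulk-index-into-elasticsearch");
        job.setJarByClass(ExampleIndexingJob.class);
        job.setOutputFormatClass(ElasticSearchOutputFormat.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(MapWritable.class);
        // ... configure a mapper and input path so the job emits <NullWritable, MapWritable> pairs ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
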
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java
@@ -0,0 +1,60 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+ import java.io.DataInput;
+ import java.io.DataOutput;
+
+ import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.io.Writable;
+ import org.apache.hadoop.mapreduce.InputSplit;
+
+ public class ElasticSearchSplit extends InputSplit implements Writable {
+
+     private String queryString;
+     private long from;
+     private long size;
+
+     public ElasticSearchSplit() {}
+
+     public ElasticSearchSplit(String queryString, long from, long size) {
+         this.queryString = queryString;
+         this.from = from;
+         this.size = size;
+     }
+
+     public String getQueryString() {
+         return queryString;
+     }
+
+     public long getFrom() {
+         return from;
+     }
+
+     public long getSize() {
+         return size;
+     }
+
+     @Override
+     public String[] getLocations() {
+         return new String[] {};
+     }
+
+     @Override
+     public long getLength() {
+         return 0;
+     }
+
+     @Override
+     public void readFields(DataInput in) throws IOException {
+         queryString = Text.readString(in);
+         from = in.readLong();
+         size = in.readLong();
+     }
+
+     @Override
+     public void write(DataOutput out) throws IOException {
+         Text.writeString(out, queryString);
+         out.writeLong(from);
+         out.writeLong(size);
+     }
+ }
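
ElasticSearchSplit carries its query string and from/size window to each task through the Hadoop Writable contract shown above: the query is written with Text.writeString, followed by the two longs. A quick round-trip sketch (the query and window values are made up for illustration):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import com.infochimps.elasticsearch.ElasticSearchSplit;

public class SplitRoundTrip {
    public static void main(String[] args) throws Exception {
        ElasticSearchSplit original = new ElasticSearchSplit("{\"match_all\": {}}", 0L, 100L);

        // Serialize the split the same way Hadoop does when shipping it to a task.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance, as the task side would.
        ElasticSearchSplit copy = new ElasticSearchSplit();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy.getQueryString()); // {"match_all": {}}
        System.out.println(copy.getFrom());        // 0
        System.out.println(copy.getSize());        // 100
    }
}
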
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java
@@ -0,0 +1,231 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+ import java.io.BufferedReader;
+ import java.io.FileReader;
+ import java.util.List;
+ import java.util.ArrayList;
+ import java.util.Map;
+ import java.util.HashMap;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.InputFormat;
+ import org.apache.hadoop.mapred.RecordReader;
+ import org.apache.hadoop.mapred.InputSplit;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.Reporter;
+
+ import org.elasticsearch.common.settings.loader.YamlSettingsLoader;
+ import org.elasticsearch.common.transport.InetSocketTransportAddress;
+
+ import org.elasticsearch.client.transport.TransportClient;
+ import org.elasticsearch.action.search.SearchRequestBuilder;
+
+ import org.elasticsearch.action.search.SearchResponse;
+ import org.elasticsearch.action.search.SearchType;
+ import org.elasticsearch.index.query.FilterBuilders.*;
+ import org.elasticsearch.cluster.ClusterName;
+
+ public class ElasticSearchStreamingInputFormat<K, V> implements InputFormat<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingInputFormat.class);
+
+     // Job settings we need to control directly from Java options.
+     private static final String ES_INDEX_OPT = "elasticsearch.input.index";
+     private static final String ES_DEFAULT_INDEX = "hadoop";
+     private String indexName;
+
+     private static final String ES_MAPPING_OPT = "elasticsearch.input.mapping";
+     private static final String ES_DEFAULT_MAPPING = "streaming_record";
+     private String mappingName;
+
+     private static final String ES_NUM_SPLITS_OPT = "elasticsearch.input.splits";
+     private static final String ES_NUM_SPLITS = "1";
+     private Integer numSplits;
+
+     private static final String ES_QUERY_OPT = "elasticsearch.input.query";
+     private static final String ES_QUERY = "{\"match_all\": {}}";
+     private String queryJSON;
+
+     // Calculated after the first query.
+     private long numHits;
+     private Integer recordsPerSplit;
+
+     // Elasticsearch internal settings required to make a client
+     // connection.
+     private static final String ES_CONFIG_OPT = "es.config";
+     private static final String ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+
+     private static final String ES_PLUGINS_OPT = "es.path.plugins";
+     private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+
+     private static final String ES_UNICAST_HOSTS_NAME = "discovery.zen.ping.unicast.hosts";
+
+     private TransportClient client;
+
+     public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) {
+         setLocalElasticSearchInstallation(conf);
+         return (RecordReader) new ElasticSearchStreamingRecordReader(split, conf);
+     }
+
+     public InputSplit[] getSplits(JobConf conf, int requestedNumSplits) {
+         this.numSplits = requestedNumSplits;
+
+         setLocalElasticSearchInstallation(conf);
+         parseInput(conf);
+
+         startTransportClient(conf);
+         findNumHits();
+         stopTransportClient();
+
+         return createSplits();
+     }
+
+
+     //
+     // == Setup ==
+     //
+
+     public void setLocalElasticSearchInstallation(JobConf conf) {
+         String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+         String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+         System.setProperty(ES_CONFIG_OPT, esConfigPath);
+         System.setProperty(ES_PLUGINS_OPT, esPluginsPath);
+         LOG.info("Using Elasticsearch configuration file at "+esConfigPath+" and plugin directory "+esPluginsPath);
+     }
+
+     private void parseInput(JobConf conf) {
+         this.indexName = conf.get(ES_INDEX_OPT, ES_DEFAULT_INDEX);
+         this.mappingName = conf.get(ES_MAPPING_OPT, ES_DEFAULT_MAPPING);
+         // this.numSplits = Integer.parseInt(conf.get(ES_NUM_SPLITS_OPT, ES_NUM_SPLITS));
+         this.queryJSON = conf.get(ES_QUERY_OPT, ES_QUERY);
+         String message = "Using input /"+indexName;
+         if (mappingName != null && mappingName.length() > 0) {
+             message += "/"+mappingName;
+         }
+         if (queryJSON != null && queryJSON.length() > 0) {
+             message += " with query: "+queryJSON;
+         }
+         LOG.info(message);
+     }
+
+     //
+     // == Connecting to Elasticsearch and Querying ==
+     //
+
+     private void startTransportClient(JobConf conf) {
+         this.client = new TransportClient();
+         Map<String,String> settings = parsedSettings(conf);
+         String host = hostname(settings);
+         if (host.toString().length() == 0) {
+             System.exit(1);
+         }
+         LOG.info("Attempting to connect to Elasticsearch node at " + host + ":9300");
+         this.client = new TransportClient().addTransportAddress(new InetSocketTransportAddress(host, 9300));
+         LOG.info("Connected to Elasticsearch cluster");
+     }
+
+     private Map<String,String> parsedSettings(JobConf conf) {
+         String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+         String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+
+         try {
+             BufferedReader reader = new BufferedReader( new FileReader(esConfigPath));
+             String line = null;
+             StringBuilder stringBuilder = new StringBuilder();
+             String ls = System.getProperty("line.separator");
+             while( ( line = reader.readLine() ) != null ) {
+                 stringBuilder.append( line );
+                 stringBuilder.append( ls );
+             }
+             return new YamlSettingsLoader().load(stringBuilder.toString());
+         } catch (IOException e) {
+             LOG.error("Could not find or read the configuration file " + esConfigPath + ".");
+             return new HashMap<String,String>();
+         }
+     }
+
+     private String hostname(Map<String,String> settings) {
+         String hostsString = settings.get(ES_UNICAST_HOSTS_NAME);
+         if (hostsString.toString().length() == 0) {
+             LOG.error("Could not find hosts. Did you set the '" + ES_UNICAST_HOSTS_NAME + "' key?");
+             return "";
+         }
+
+         String[] hosts = hostsString.split(",");
+         if (hosts.length > 0) {
+             String host = hosts[0];
+             if (host.toString().length() == 0) {
+                 LOG.error("Could not parse hosts from '" + ES_UNICAST_HOSTS_NAME + "' key.");
+                 return "";
+             } else {
+                 return host;
+             }
+         } else {
+             LOG.error("Could not find any hosts in the '" + ES_UNICAST_HOSTS_NAME + "' key.");
+             return "";
+         }
+     }
+
+     private void stopTransportClient() {
+         if (client != null) client.close();
+         LOG.info("Disconnected from Elasticsearch cluster");
+     }
+
+     private void findNumHits() {
+         SearchRequestBuilder request = client.prepareSearch(indexName);
+         if (mappingName != null && mappingName.length() > 0) {
+             request.setTypes(mappingName);
+         }
+         request.setSearchType(SearchType.COUNT);
+         if (queryJSON != null && queryJSON.length() > 0) {
+             request.setQuery(queryJSON);
+         }
+         SearchResponse response = request.execute().actionGet();
+         this.numHits = response.hits().totalHits();
+
+         LOG.info("Ran query: "+String.valueOf(numHits)+" hits");
+     }
+
+     //
+     // == Setting splits ==
+     //
+
+     private void readjustSplitsByHits() {
+
+     }
+
+     private InputSplit[] createSplits() {
+         // Say that
+         //
+         // numHits = 7
+         // numSplits = 2
+         if((long) numSplits > numHits) {
+             numSplits = (int) numHits;
+         }
+
+         this.recordsPerSplit = (int) (numHits/((long)numSplits)); // == 3 records/split
+
+         List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
+
+         // i == 0, 1
+         for(int i = 0; i < numSplits; i++) {
+             Integer from = i * recordsPerSplit;
+             splits.add(new ElasticSearchStreamingSplit(indexName, mappingName, numSplits, queryJSON, numHits, from, recordsPerSplit));
+         }
+         // 7 is > (2 * 3) == 6
+         if (numHits > ((long) (numSplits * recordsPerSplit))) {
+             Integer from = numSplits * recordsPerSplit;
+             Integer size = (int) (numHits - ((long) from));
+             splits.add(new ElasticSearchStreamingSplit(indexName, mappingName, numSplits, queryJSON, numHits, from, size));
+         }
+
+         LOG.info("Splitting "+String.valueOf(numHits)+" hits across "+String.valueOf(splits.size())+" splits ("+String.valueOf(recordsPerSplit)+" hits/split)");
+
+         return splits.toArray(new InputSplit[splits.size()]);
+     }
+
+ }
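
The createSplits() arithmetic above carves numHits documents into numSplits evenly sized windows plus one remainder split when the division is not exact; with the sample numbers from the inline comments (7 hits, 2 requested splits) that yields windows 0-2, 3-5, and 6. A standalone sketch of the same calculation, using those sample numbers, not values read from a real cluster:

public class SplitMathExample {
    public static void main(String[] args) {
        long numHits = 7;    // hits reported by the COUNT query
        int numSplits = 2;   // splits requested by Hadoop

        if ((long) numSplits > numHits) numSplits = (int) numHits;
        int recordsPerSplit = (int) (numHits / (long) numSplits);   // 7 / 2 == 3 records per split

        // Full splits: from=0 size=3, from=3 size=3.
        for (int i = 0; i < numSplits; i++) {
            System.out.println("split " + i + ": from=" + (i * recordsPerSplit) + " size=" + recordsPerSplit);
        }
        // Remainder split, since 7 > 2 * 3 == 6: from=6 size=1.
        if (numHits > (long) (numSplits * recordsPerSplit)) {
            int from = numSplits * recordsPerSplit;
            System.out.println("remainder split: from=" + from + " size=" + (numHits - from));
        }
    }
}
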