wonderdog 0.0.1

Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java
@@ -0,0 +1,235 @@
+ package com.infochimps.elasticsearch.pig;
+
+ import java.io.ByteArrayOutputStream;
+ import java.io.DataOutputStream;
+ import java.io.IOException;
+ import java.util.Arrays;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.HashMap;
+ import java.util.Properties;
+ import java.net.URI;
+
+ import org.codehaus.jackson.map.ObjectMapper;
+ import org.codehaus.jackson.JsonParseException;
+ import org.codehaus.jackson.map.JsonMappingException;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapreduce.InputFormat;
+ import org.apache.hadoop.mapreduce.Job;
+ import org.apache.hadoop.mapreduce.OutputFormat;
+ import org.apache.hadoop.mapreduce.RecordReader;
+ import org.apache.hadoop.mapreduce.RecordWriter;
+ import org.apache.hadoop.filecache.DistributedCache;
+
+ import org.apache.pig.StoreFunc;
+ import org.apache.pig.ResourceSchema;
+ import org.apache.pig.ResourceSchema.ResourceFieldSchema;
+ import org.apache.pig.StoreFuncInterface;
+ import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
+ import org.apache.pig.builtin.Utf8StorageConverter;
+ import org.apache.pig.data.DataBag;
+ import org.apache.pig.data.DataByteArray;
+ import org.apache.pig.data.DataType;
+ import org.apache.pig.data.Tuple;
+ import org.apache.pig.data.TupleFactory;
+ import org.apache.pig.impl.logicalLayer.FrontendException;
+ import org.apache.pig.impl.util.Utils;
+ import org.apache.pig.impl.util.UDFContext;
+
+ import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+ import com.infochimps.elasticsearch.ElasticSearchOutputFormat;
+
+ /**
+    Pig storefunc for Elastic Search. Takes json data <b>only</b>.
+    <p>
+    USAGE:
+    <p>
+    STORE records INTO ElasticSearchJsonIndex();
+    STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize);
+    STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize, esConfig);
+    STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize, esConfig, esPlugins);
+
+    where:
+
+    idFieldName = Named field of the record to use as the record id. If none is passed in
+                  then the record is assumed to have no id.
+    bulkSize    = Number of records for ElasticSearchOutputFormat to batch up before sending
+                  a bulk index request to Elastic Search. Default: 1000.
+    esConfig    = Full path to local elasticsearch.yml. Default: /etc/elasticsearch/elasticsearch.yml
+    esPlugins   = Full path to local elastic search plugins dir. Default: /usr/local/share/elasticsearch/plugins
+
+ */
+ public class ElasticSearchJsonIndex extends StoreFunc implements StoreFuncInterface {
+
+     private static final Log LOG = LogFactory.getLog(ElasticSearchJsonIndex.class);
+
+     protected RecordWriter writer = null;
+     protected ObjectMapper mapper = new ObjectMapper();
+     protected String idFieldName;
+     protected String bulkSize;
+     protected String esConfig;
+     protected String esPlugins;
+
+     // For hadoop configuration
+     private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+     private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
+     private static final String ES_IS_JSON = "elasticsearch.is_json";
+     private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
+     private static final String ES_FIELD_NAMES = "elasticsearch.field.names";
+     private static final String ES_ID_FIELD = "elasticsearch.id.field";
+     private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+
+     // Other string constants
+     private static final String SLASH = "/";
+     private static final String NO_ID_FIELD = "-1";
+     private static final String LOCAL_SCHEME = "file://";
+     private static final String DEFAULT_BULK = "1000";
+     private static final String DEFAULT_ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+     private static final String DEFAULT_ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+     private static final String ES_CONFIG_HDFS_PATH = "/tmp/elasticsearch/elasticsearch.yml";
+     private static final String ES_PLUGINS_HDFS_PATH = "/tmp/elasticsearch/plugins";
+
+     public ElasticSearchJsonIndex() {
+         this(NO_ID_FIELD, DEFAULT_BULK);
+     }
+
+     public ElasticSearchJsonIndex(String idFieldName, String bulkSize) {
+         this(idFieldName, bulkSize, DEFAULT_ES_CONFIG);
+     }
+
+     public ElasticSearchJsonIndex(String idFieldName, String bulkSize, String esConfig) {
+         this(idFieldName, bulkSize, esConfig, DEFAULT_ES_PLUGINS);
+     }
+
+     public ElasticSearchJsonIndex(String idFieldName, String bulkSize, String esConfig, String esPlugins) {
+         this.idFieldName = idFieldName;
+         this.bulkSize = bulkSize;
+         this.esConfig = esConfig;
+         this.esPlugins = esPlugins;
+     }
+
+     @Override
+     public void checkSchema(ResourceSchema s) throws IOException {
+     }
+
+     /**
+        Look at the passed-in location and configuration and set options. Note that, since this
+        is called more than once, we need to make sure not to change anything we've already set.
+     */
+     @Override
+     public void setStoreLocation(String location, Job job) throws IOException {
+         String[] es_store = location.substring(5).split(SLASH);
+         if (es_store.length != 2) {
+             throw new RuntimeException("Please specify a valid elasticsearch index, eg. es://myindex/myobj");
+         }
+         Configuration conf = job.getConfiguration();
+         // Only set if we haven't already
+         if (conf.get(ES_INDEX_NAME) == null) {
+             try {
+                 job.getConfiguration().set(ES_INDEX_NAME, es_store[0]);
+                 job.getConfiguration().set(ES_OBJECT_TYPE, es_store[1]);
+             } catch (ArrayIndexOutOfBoundsException e) {
+                 throw new RuntimeException("You must specify both an index and an object type.");
+             }
+             job.getConfiguration().setBoolean(ES_IS_JSON, true);
+             job.getConfiguration().set(ES_BULK_SIZE, bulkSize);
+             job.getConfiguration().set(ES_ID_FIELD_NAME, idFieldName);
+
+             // Adds the elasticsearch.yml file (esConfig) to the distributed cache
+             try {
+                 Path hdfsConfigPath = new Path(ES_CONFIG_HDFS_PATH);
+                 Path hdfsPluginsPath = new Path(ES_PLUGINS_HDFS_PATH);
+
+                 HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esConfig), hdfsConfigPath, job.getConfiguration());
+                 HadoopUtils.shipFileIfNotShipped(hdfsConfigPath, job.getConfiguration());
+
+                 HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esPlugins), hdfsPluginsPath, job.getConfiguration());
+                 HadoopUtils.shipArchiveIfNotShipped(hdfsPluginsPath, job.getConfiguration());
+             } catch (Exception e) {
+                 throw new RuntimeException(e);
+             }
+         }
+     }
+
+     @Override
+     public OutputFormat getOutputFormat() throws IOException {
+         return new ElasticSearchOutputFormat();
+     }
+
+     // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
+     @Override
+     public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
+         this.writer = writer;
+     }
+
+     /**
+        Map a tuple object into a map-writable object for elasticsearch.
+     */
+     @SuppressWarnings("unchecked")
+     @Override
+     public void putNext(Tuple t) throws IOException {
+         if (!t.isNull(0)) {
+             MapWritable record = new MapWritable();
+             String jsonData = t.get(0).toString();
+
+             // parse json data and put into mapwritable record
+             try {
+                 HashMap<String,Object> data = mapper.readValue(jsonData, HashMap.class);
+                 record = (MapWritable)toWritable(data);
+             } catch (JsonParseException e) {
+                 e.printStackTrace();
+             } catch (JsonMappingException e) {
+                 e.printStackTrace();
+             }
+             try {
+                 writer.write(NullWritable.get(), record);
+             } catch (InterruptedException e) {
+                 throw new IOException(e);
+             }
+         }
+     }
+
+     /**
+        Recursively converts an arbitrary object into the appropriate writable. Please enlighten me if there is an existing
+        method for doing this.
+     */
+     private Writable toWritable(Object thing) {
+         if (thing instanceof String) {
+             return new Text((String)thing);
+         } else if (thing instanceof Long) {
+             return new LongWritable((Long)thing);
+         } else if (thing instanceof Integer) {
+             return new IntWritable((Integer)thing);
+         } else if (thing instanceof Double) {
+             return new DoubleWritable((Double)thing);
+         } else if (thing instanceof Float) {
+             return new FloatWritable((Float)thing);
+         } else if (thing instanceof Map) {
+             MapWritable result = new MapWritable();
+             for (Map.Entry<String,Object> entry : ((Map<String,Object>)thing).entrySet()) {
+                 result.put(new Text(entry.getKey().toString()), toWritable(entry.getValue()));
+             }
+             return result;
+         } else if (thing instanceof List) {
+             if (((List)thing).size() > 0) {
+                 Object first = ((List)thing).get(0);
+                 Writable[] listOfThings = new Writable[((List)thing).size()];
+                 for (int i = 0; i < listOfThings.length; i++) {
+                     listOfThings[i] = toWritable(((List)thing).get(i));
+                 }
+                 return new ArrayWritable(toWritable(first).getClass(), listOfThings);
+             }
+         }
+         return NullWritable.get();
+     }
+
+     @Override
+     public void cleanupOnFailure(String location, Job job) throws IOException {
+     }
+ }
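
For orientation, a minimal Pig script exercising this storefunc might look like the sketch below. The jar path and the index/type names are placeholders; the constructor arguments follow the Javadoc above ('-1' meaning no id field, '1000' for the bulk size), and the es://index/type location is the format expected by setStoreLocation.

    REGISTER /path/to/wonderdog.jar;  -- hypothetical path to the built wonderdog jar

    -- one JSON document per line, loaded as a single chararray field
    json_records = LOAD 'data/test/foo.json' AS (json_record:chararray);

    STORE json_records INTO 'es://myindex/myobj'  -- index "myindex", object type "myobj"
          USING com.infochimps.elasticsearch.pig.ElasticSearchJsonIndex('-1', '1000');
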
data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java
@@ -0,0 +1,355 @@
+ package com.infochimps.elasticsearch.pig;
+
+ import java.io.IOException;
+ import java.lang.InterruptedException;
+ import java.util.Properties;
+ import java.util.List;
+ import java.util.Map;
+ import java.util.HashMap;
+ import java.net.URI;
+ import java.net.URISyntaxException;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.codehaus.jackson.map.ObjectMapper;
+ import org.codehaus.jackson.JsonParseException;
+ import org.codehaus.jackson.map.JsonMappingException;
+
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.mapreduce.Job;
+ import org.apache.hadoop.mapreduce.RecordReader;
+ import org.apache.hadoop.mapreduce.RecordWriter;
+ import org.apache.hadoop.mapreduce.InputFormat;
+ import org.apache.hadoop.mapreduce.OutputFormat;
+ import org.apache.hadoop.io.*;
+
+ import org.apache.pig.LoadFunc;
+ import org.apache.pig.StoreFuncInterface;
+ import org.apache.pig.ResourceSchema;
+ import org.apache.pig.impl.util.UDFContext;
+ import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
+ import org.apache.pig.data.DataByteArray;
+ import org.apache.pig.data.Tuple;
+ import org.apache.pig.data.TupleFactory;
+
+ import com.infochimps.elasticsearch.ElasticSearchOutputFormat;
+ import com.infochimps.elasticsearch.ElasticSearchInputFormat;
+ import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+ public class ElasticSearchStorage extends LoadFunc implements StoreFuncInterface {
+
+     private String contextSignature = null;
+     private RecordReader reader;
+     protected RecordWriter writer = null;
+     protected ObjectMapper mapper = new ObjectMapper();
+     protected String esConfig;
+     protected String esPlugins;
+
+     // For hadoop configuration
+     private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+     private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
+     private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
+     private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+     private static final String ES_IS_JSON = "elasticsearch.is_json";
+     private static final String PIG_ES_FIELD_NAMES = "elasticsearch.pig.field.names";
+     private static final String ES_REQUEST_SIZE = "elasticsearch.request.size";
+     private static final String ES_NUM_SPLITS = "elasticsearch.num.input.splits";
+     private static final String ES_QUERY_STRING = "elasticsearch.query.string";
+
+     private static final String COMMA = ",";
+     private static final String LOCAL_SCHEME = "file://";
+     private static final String DEFAULT_BULK = "1000";
+     private static final String DEFAULT_ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+     private static final String DEFAULT_ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+     private static final String ES_CONFIG_HDFS_PATH = "/tmp/elasticsearch/elasticsearch.yml";
+     private static final String ES_PLUGINS_HDFS_PATH = "/tmp/elasticsearch/plugins";
+     private static final String ES_CONFIG = "es.config";
+     private static final String ES_PLUGINS = "es.path.plugins";
+
+     public ElasticSearchStorage() {
+         this(DEFAULT_ES_CONFIG, DEFAULT_ES_PLUGINS);
+     }
+
+     public ElasticSearchStorage(String esConfig) {
+         this(esConfig, DEFAULT_ES_PLUGINS);
+     }
+
+     public ElasticSearchStorage(String esConfig, String esPlugins) {
+         this.esConfig = esConfig;
+         this.esPlugins = esPlugins;
+     }
+
+     @Override
+     public Tuple getNext() throws IOException {
+         try {
+             Tuple tuple = TupleFactory.getInstance().newTuple(2);
+             if (reader.nextKeyValue()) {
+                 Text docId = (Text)reader.getCurrentKey();
+                 Text docContent = (Text)reader.getCurrentValue();
+                 tuple.set(0, new DataByteArray(docId.toString()));
+                 tuple.set(1, new DataByteArray(docContent.toString()));
+                 return tuple;
+             }
+         } catch (InterruptedException e) {
+             throw new IOException(e);
+         }
+         return null;
+     }
+
+     @Override
+     public InputFormat getInputFormat() {
+         return new ElasticSearchInputFormat();
+     }
+
+     @Override
+     public void prepareToRead(RecordReader reader, PigSplit split) {
+         this.reader = reader;
+     }
+
+     @Override
+     public void setUDFContextSignature(String signature) {
+         this.contextSignature = signature;
+     }
+
+     @Override
+     public void setLocation(String location, Job job) throws IOException {
+         elasticSearchSetup(location, job);
+     }
+
+     @Override
+     public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {
+         return location;
+     }
+
+     @Override
+     public String relativeToAbsolutePath(String location, Path curDir) throws IOException {
+         return location;
+     }
+
+     @Override
+     public OutputFormat getOutputFormat() throws IOException {
+         return new ElasticSearchOutputFormat();
+     }
+
+     /**
+        Record the field names for a given tuple schema in the UDF context properties so that
+        they are available to putNext when writing delimited records.
+     */
+     @Override
+     public void checkSchema(ResourceSchema s) throws IOException {
+         UDFContext context = UDFContext.getUDFContext();
+         Properties property = context.getUDFProperties(ResourceSchema.class);
+         String fieldNames = "";
+         for (String field : s.fieldNames()) {
+             fieldNames += field;
+             fieldNames += COMMA;
+         }
+         property.setProperty(PIG_ES_FIELD_NAMES, fieldNames);
+     }
+
+     // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
+     @Override
+     public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
+         this.writer = writer;
+     }
+
+     /**
+        Here we handle both the delimited record case and the json case.
+     */
+     @SuppressWarnings("unchecked")
+     @Override
+     public void putNext(Tuple t) throws IOException {
+
+         UDFContext context = UDFContext.getUDFContext();
+         Properties property = context.getUDFProperties(ResourceSchema.class);
+         MapWritable record = new MapWritable();
+
+         String isJson = property.getProperty(ES_IS_JSON);
+         // Handle delimited records (ie. isJson == false)
+         if (isJson != null && isJson.equals("false")) {
+             String[] fieldNames = property.getProperty(PIG_ES_FIELD_NAMES).split(COMMA);
+             for (int i = 0; i < t.size(); i++) {
+                 if (i < fieldNames.length) {
+                     try {
+                         record.put(new Text(fieldNames[i]), new Text(t.get(i).toString()));
+                     } catch (NullPointerException e) {
+                         //LOG.info("Increment null field counter.");
+                     }
+                 }
+             }
+         } else {
+             if (!t.isNull(0)) {
+                 String jsonData = t.get(0).toString();
+                 // parse json data and put into mapwritable record
+                 try {
+                     HashMap<String,Object> data = mapper.readValue(jsonData, HashMap.class);
+                     record = (MapWritable)toWritable(data);
+                 } catch (JsonParseException e) {
+                     e.printStackTrace();
+                 } catch (JsonMappingException e) {
+                     e.printStackTrace();
+                 }
+             }
+         }
+
+         try {
+             writer.write(NullWritable.get(), record);
+         } catch (InterruptedException e) {
+             throw new IOException(e);
+         }
+     }
+
+     @Override
+     public void setStoreFuncUDFContextSignature(String signature) {
+         this.contextSignature = signature;
+     }
+
+     /**
+        Pull out the elasticsearch setup code
+     */
+     private void elasticSearchSetup(String location, Job job) {
+         // Need to use the uri parsing library here to pull out everything
+         try {
+
+             // Parse the passed in location URI, pulling out the arguments as well
+             URI parsedLocation = new URI(location);
+             HashMap<String, String> query = parseURIQuery(parsedLocation.getQuery());
+
+             String esHost = location.substring(5).split("/")[0];
+             if (esHost==null) {
+                 throw new RuntimeException("Missing elasticsearch index name, URI must be formatted as es://<index_name>/<object_type>?<params>");
+             }
+
+             if (parsedLocation.getPath()==null) {
+                 throw new RuntimeException("Missing elasticsearch object type, URI must be formatted as es://<index_name>/<object_type>?<params>");
+             }
+
+             Configuration conf = job.getConfiguration();
+             if (conf.get(ES_INDEX_NAME) == null) {
+
+                 // Set elasticsearch index and object type in the Hadoop configuration
+                 job.getConfiguration().set(ES_INDEX_NAME, esHost);
+                 job.getConfiguration().set(ES_OBJECT_TYPE, parsedLocation.getPath().replaceAll("/", ""));
+
+                 // Set the request size in the Hadoop configuration
+                 String requestSize = query.get("size");
+                 if (requestSize == null) requestSize = DEFAULT_BULK;
+                 job.getConfiguration().set(ES_BULK_SIZE, requestSize);
+                 job.getConfiguration().set(ES_REQUEST_SIZE, requestSize);
+
+                 // Set the id field name in the Hadoop configuration
+                 String idFieldName = query.get("id");
+                 if (idFieldName == null) idFieldName = "-1";
+                 job.getConfiguration().set(ES_ID_FIELD_NAME, idFieldName);
+
+                 String queryString = query.get("q");
+                 if (queryString==null) queryString = "*";
+                 job.getConfiguration().set(ES_QUERY_STRING, queryString);
+
+                 String numTasks = query.get("tasks");
+                 if (numTasks==null) numTasks = "100";
+                 job.getConfiguration().set(ES_NUM_SPLITS, numTasks);
+
+                 // Adds the elasticsearch.yml file (esConfig) and the plugins directory (esPlugins) to the distributed cache
+                 try {
+                     Path hdfsConfigPath = new Path(ES_CONFIG_HDFS_PATH);
+                     Path hdfsPluginsPath = new Path(ES_PLUGINS_HDFS_PATH);
+
+                     HadoopUtils.uploadLocalFileIfChanged(new Path(LOCAL_SCHEME+esConfig), hdfsConfigPath, job.getConfiguration());
+                     HadoopUtils.shipFileIfNotShipped(hdfsConfigPath, job.getConfiguration());
+
+                     HadoopUtils.uploadLocalFileIfChanged(new Path(LOCAL_SCHEME+esPlugins), hdfsPluginsPath, job.getConfiguration());
+                     HadoopUtils.shipArchiveIfNotShipped(hdfsPluginsPath, job.getConfiguration());
+
+                 } catch (Exception e) {
+                     throw new RuntimeException(e);
+                 }
+
+                 //
+                 // This gets set even when loading data from elasticsearch
+                 //
+                 String isJson = query.get("json");
+                 if (isJson==null || isJson.equals("false")) {
+                     // We're dealing with delimited records
+                     UDFContext context = UDFContext.getUDFContext();
+                     Properties property = context.getUDFProperties(ResourceSchema.class);
+                     property.setProperty(ES_IS_JSON, "false");
+                 }
+
+                 // Need to set this to start the local instance of elasticsearch
+                 job.getConfiguration().set(ES_CONFIG, esConfig);
+                 job.getConfiguration().set(ES_PLUGINS, esPlugins);
+             }
+         } catch (URISyntaxException e) {
+             throw new RuntimeException(e);
+         }
+     }
+
+     /**
+        Look at the passed in uri and hadoop configuration and set options.
+        <p>
+        <b>WARNING</b> Note that, since this is called more than once, it is
+        critical to ensure that we do not change or reset anything we've already set.
+     */
+     @Override
+     public void setStoreLocation(String location, Job job) throws IOException {
+         elasticSearchSetup(location, job);
+     }
+
+     /**
+        Given a URI query string, eg. "foo=bar&happy=true" returns
+        a hashmap ({'foo' => 'bar', 'happy' => 'true'})
+     */
+     private HashMap<String, String> parseURIQuery(String query) {
+         HashMap<String, String> argMap = new HashMap<String, String>();
+         if (query != null) {
+             String[] pairs = query.split("&");
+             for (String pair : pairs) {
+                 String[] splitPair = pair.split("=");
+                 argMap.put(splitPair[0], splitPair[1]);
+             }
+         }
+         return argMap;
+     }
+
+     /**
+        Recursively converts an arbitrary object into the appropriate writable. Please enlighten me if there is an existing
+        method for doing this.
+     */
+     private Writable toWritable(Object thing) {
+         if (thing instanceof String) {
+             return new Text((String)thing);
+         } else if (thing instanceof Long) {
+             return new LongWritable((Long)thing);
+         } else if (thing instanceof Integer) {
+             return new IntWritable((Integer)thing);
+         } else if (thing instanceof Double) {
+             return new DoubleWritable((Double)thing);
+         } else if (thing instanceof Float) {
+             return new FloatWritable((Float)thing);
+         } else if (thing instanceof Boolean) {
+             return new BooleanWritable((Boolean)thing);
+         } else if (thing instanceof Map) {
+             MapWritable result = new MapWritable();
+             for (Map.Entry<String,Object> entry : ((Map<String,Object>)thing).entrySet()) {
+                 result.put(new Text(entry.getKey().toString()), toWritable(entry.getValue()));
+             }
+             return result;
+         } else if (thing instanceof List) {
+             if (((List)thing).size() > 0) {
+                 Object first = ((List)thing).get(0);
+                 Writable[] listOfThings = new Writable[((List)thing).size()];
+                 for (int i = 0; i < listOfThings.length; i++) {
+                     listOfThings[i] = toWritable(((List)thing).get(i));
+                 }
+                 return new ArrayWritable(toWritable(first).getClass(), listOfThings);
+             }
+         }
+         return NullWritable.get();
+     }
+
+     @Override
+     public void cleanupOnFailure(String location, Job job) throws IOException {
+     }
+ }
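
Likewise, a sketch of driving ElasticSearchStorage from Pig for both storing and loading. The index, type, field names, and paths are hypothetical; the query parameters (size, id, q, json, tasks) are the ones parsed by elasticSearchSetup above.

    REGISTER /path/to/wonderdog.jar;  -- hypothetical path to the built wonderdog jar

    -- Store delimited records: checkSchema ships the field names, json=false marks the records
    -- as delimited, size sets the bulk size, and id names the field used as the document id.
    users = LOAD 'data/test/foo.tsv' AS (user_id:chararray, name:chararray, score:int);
    STORE users INTO 'es://myindex/myobj?json=false&size=1000&id=user_id'
          USING com.infochimps.elasticsearch.pig.ElasticSearchStorage();

    -- Load documents back out: q is the elasticsearch query string, tasks the number of input splits.
    hits = LOAD 'es://myindex/myobj?q=name:bob&tasks=10'
           USING com.infochimps.elasticsearch.pig.ElasticSearchStorage();
    DUMP hits;  -- each tuple is (document_id, document_source)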