wonderdog 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java
@@ -0,0 +1,235 @@
+package com.infochimps.elasticsearch.pig;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Properties;
+import java.net.URI;
+
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.JsonParseException;
+import org.codehaus.jackson.map.JsonMappingException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.filecache.DistributedCache;
+
+import org.apache.pig.StoreFunc;
+import org.apache.pig.ResourceSchema;
+import org.apache.pig.ResourceSchema.ResourceFieldSchema;
+import org.apache.pig.StoreFuncInterface;
+import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
+import org.apache.pig.builtin.Utf8StorageConverter;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DataByteArray;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.util.Utils;
+import org.apache.pig.impl.util.UDFContext;
+
+import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+import com.infochimps.elasticsearch.ElasticSearchOutputFormat;
+
+/**
+   Pig storefunc for Elastic Search. Takes json data <b>only</b>.
+   <p>
+   USAGE:
+   <p>
+   STORE records INTO ElasticSearchJsonIndex();
+   STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize);
+   STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize, esConfig);
+   STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize, esConfig, esPlugins);
+
+   where:
+
+   idFieldName = Named field of the record to use as the record id. If none is passed in
+                 then the record is assumed to have no id.
+   bulkSize    = Number of records for ElasticSearchOutputFormat to batch up before sending
+                 a bulk index request to Elastic Search. Default: 1000.
+   esConfig    = Full path to local elasticsearch.yml. Default: /etc/elasticsearch/elasticsearch.yml
+   esPlugins   = Full path to local elastic search plugins dir. Default: /usr/local/share/elasticsearch/plugins
+
+*/
+public class ElasticSearchJsonIndex extends StoreFunc implements StoreFuncInterface {
+
+    private static final Log LOG = LogFactory.getLog(ElasticSearchJsonIndex.class);
+
+    protected RecordWriter writer = null;
+    protected ObjectMapper mapper = new ObjectMapper();
+    protected String idFieldName;
+    protected String bulkSize;
+    protected String esConfig;
+    protected String esPlugins;
+
+    // For hadoop configuration
+    private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+    private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
+    private static final String ES_IS_JSON = "elasticsearch.is_json";
+    private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
+    private static final String ES_FIELD_NAMES = "elasticsearch.field.names";
+    private static final String ES_ID_FIELD = "elasticsearch.id.field";
+    private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+
+    // Other string constants
+    private static final String SLASH = "/";
+    private static final String NO_ID_FIELD = "-1";
+    private static final String LOCAL_SCHEME = "file://";
+    private static final String DEFAULT_BULK = "1000";
+    private static final String DEFAULT_ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+    private static final String DEFAULT_ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+    private static final String ES_CONFIG_HDFS_PATH = "/tmp/elasticsearch/elasticsearch.yml";
+    private static final String ES_PLUGINS_HDFS_PATH = "/tmp/elasticsearch/plugins";
+
+    public ElasticSearchJsonIndex() {
+        this(NO_ID_FIELD, DEFAULT_BULK);
+    }
+
+    public ElasticSearchJsonIndex(String idFieldName, String bulkSize) {
+        this(idFieldName, bulkSize, DEFAULT_ES_CONFIG);
+    }
+
+    public ElasticSearchJsonIndex(String idFieldName, String bulkSize, String esConfig) {
+        this(idFieldName, bulkSize, esConfig, DEFAULT_ES_PLUGINS);
+    }
+
+    public ElasticSearchJsonIndex(String idFieldName, String bulkSize, String esConfig, String esPlugins) {
+        this.idFieldName = idFieldName;
+        this.bulkSize    = bulkSize;
+        this.esConfig    = esConfig;
+        this.esPlugins   = esPlugins;
+    }
+
+    @Override
+    public void checkSchema(ResourceSchema s) throws IOException {
+    }
+
+    /**
+       Look at passed in location and configuration and set options. Note that, since this
+       is called more than once, we need to make sure and not change anything we've already
+       set.
+    */
+    @Override
+    public void setStoreLocation(String location, Job job) throws IOException {
+        String[] es_store = location.substring(5).split(SLASH);
+        if (es_store.length != 2) {
+            throw new RuntimeException("Please specify a valid elasticsearch index, eg. es://myindex/myobj");
+        }
+        Configuration conf = job.getConfiguration();
+        // Only set if we haven't already
+        if (conf.get(ES_INDEX_NAME) == null) {
+            try {
+                job.getConfiguration().set(ES_INDEX_NAME, es_store[0]);
+                job.getConfiguration().set(ES_OBJECT_TYPE, es_store[1]);
+            } catch (ArrayIndexOutOfBoundsException e) {
+                throw new RuntimeException("You must specify both an index and an object type.");
+            }
+            job.getConfiguration().setBoolean(ES_IS_JSON, true);
+            job.getConfiguration().set(ES_BULK_SIZE, bulkSize);
+            job.getConfiguration().set(ES_ID_FIELD_NAME, idFieldName);
+
+            // Adds the elasticsearch.yml file (esConfig) to the distributed cache
+            try {
+                Path hdfsConfigPath = new Path(ES_CONFIG_HDFS_PATH);
+                Path hdfsPluginsPath = new Path(ES_PLUGINS_HDFS_PATH);
+
+                HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esConfig), hdfsConfigPath, job.getConfiguration());
+                HadoopUtils.shipFileIfNotShipped(hdfsConfigPath, job.getConfiguration());
+
+                HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esPlugins), hdfsPluginsPath, job.getConfiguration());
+                HadoopUtils.shipArchiveIfNotShipped(hdfsPluginsPath, job.getConfiguration());
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    @Override
+    public OutputFormat getOutputFormat() throws IOException {
+        return new ElasticSearchOutputFormat();
+    }
+
+    // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
+    @Override
+    public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
+        this.writer = writer;
+    }
+
+    /**
+       Map a tuple object into a map-writable object for elasticsearch.
+    */
+    @SuppressWarnings("unchecked")
+    @Override
+    public void putNext(Tuple t) throws IOException {
+        if (!t.isNull(0)) {
+            MapWritable record = new MapWritable();
+            String jsonData = t.get(0).toString();
+
+            // parse json data and put into mapwritable record
+            try {
+                HashMap<String,Object> data = mapper.readValue(jsonData, HashMap.class);
+                record = (MapWritable)toWritable(data);
+            } catch (JsonParseException e) {
+                e.printStackTrace();
+            } catch (JsonMappingException e) {
+                e.printStackTrace();
+            }
+            try {
+                writer.write(NullWritable.get(), record);
+            } catch (InterruptedException e) {
+                throw new IOException(e);
+            }
+        }
+    }
+
+    /**
+       Recursively converts an arbitrary object into the appropriate writable. Please enlighten me if there is an existing
+       method for doing this.
+    */
+    private Writable toWritable(Object thing) {
+        if (thing instanceof String) {
+            return new Text((String)thing);
+        } else if (thing instanceof Long) {
+            return new LongWritable((Long)thing);
+        } else if (thing instanceof Integer) {
+            return new IntWritable((Integer)thing);
+        } else if (thing instanceof Double) {
+            return new DoubleWritable((Double)thing);
+        } else if (thing instanceof Float) {
+            return new FloatWritable((Float)thing);
+        } else if (thing instanceof Map) {
+            MapWritable result = new MapWritable();
+            for (Map.Entry<String,Object> entry : ((Map<String,Object>)thing).entrySet()) {
+                result.put(new Text(entry.getKey().toString()), toWritable(entry.getValue()));
+            }
+            return result;
+        } else if (thing instanceof List) {
+            if (((List)thing).size() > 0) {
+                Object first = ((List)thing).get(0);
+                Writable[] listOfThings = new Writable[((List)thing).size()];
+                for (int i = 0; i < listOfThings.length; i++) {
+                    listOfThings[i] = toWritable(((List)thing).get(i));
+                }
+                return new ArrayWritable(toWritable(first).getClass(), listOfThings);
+            }
+        }
+        return NullWritable.get();
+    }
+
+    @Override
+    public void cleanupOnFailure(String location, Job job) throws IOException {
+    }
+}
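For orientation, here is a minimal Pig sketch of how ElasticSearchJsonIndex would be invoked, following the USAGE notes and the es://index/type location format in the javadoc above. The jar path, input file, index/type names, and id field are illustrative placeholders, not values shipped with the package:

    -- Placeholder path to the built wonderdog jar
    REGISTER /path/to/wonderdog.jar;

    -- Each record is a single chararray field holding one JSON document
    records = LOAD 'docs.json' AS (json:chararray);

    -- Index into index 'foo_index', object type 'foo_type', using each document's
    -- 'id' field as the record id and bulk batches of 1000 records
    STORE records INTO 'es://foo_index/foo_type'
          USING com.infochimps.elasticsearch.pig.ElasticSearchJsonIndex('id', '1000');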
data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java
@@ -0,0 +1,355 @@
+package com.infochimps.elasticsearch.pig;
+
+import java.io.IOException;
+import java.lang.InterruptedException;
+import java.util.Properties;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.JsonParseException;
+import org.codehaus.jackson.map.JsonMappingException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.io.*;
+
+import org.apache.pig.LoadFunc;
+import org.apache.pig.StoreFuncInterface;
+import org.apache.pig.ResourceSchema;
+import org.apache.pig.impl.util.UDFContext;
+import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
+import org.apache.pig.data.DataByteArray;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+
+import com.infochimps.elasticsearch.ElasticSearchOutputFormat;
+import com.infochimps.elasticsearch.ElasticSearchInputFormat;
+import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+public class ElasticSearchStorage extends LoadFunc implements StoreFuncInterface {
+
+    private String contextSignature = null;
+    private RecordReader reader;
+    protected RecordWriter writer = null;
+    protected ObjectMapper mapper = new ObjectMapper();
+    protected String esConfig;
+    protected String esPlugins;
+
+    // For hadoop configuration
+    private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+    private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
+    private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
+    private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+    private static final String ES_IS_JSON = "elasticsearch.is_json";
+    private static final String PIG_ES_FIELD_NAMES = "elasticsearch.pig.field.names";
+    private static final String ES_REQUEST_SIZE = "elasticsearch.request.size";
+    private static final String ES_NUM_SPLITS = "elasticsearch.num.input.splits";
+    private static final String ES_QUERY_STRING = "elasticsearch.query.string";
+
+    private static final String COMMA = ",";
+    private static final String LOCAL_SCHEME = "file://";
+    private static final String DEFAULT_BULK = "1000";
+    private static final String DEFAULT_ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+    private static final String DEFAULT_ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+    private static final String ES_CONFIG_HDFS_PATH = "/tmp/elasticsearch/elasticsearch.yml";
+    private static final String ES_PLUGINS_HDFS_PATH = "/tmp/elasticsearch/plugins";
+    private static final String ES_CONFIG = "es.config";
+    private static final String ES_PLUGINS = "es.path.plugins";
+
+    public ElasticSearchStorage() {
+        this(DEFAULT_ES_CONFIG, DEFAULT_ES_PLUGINS);
+    }
+
+    public ElasticSearchStorage(String esConfig) {
+        this(esConfig, DEFAULT_ES_PLUGINS);
+    }
+
+    public ElasticSearchStorage(String esConfig, String esPlugins) {
+        this.esConfig  = esConfig;
+        this.esPlugins = esPlugins;
+    }
+
+    @Override
+    public Tuple getNext() throws IOException {
+        try {
+            Tuple tuple = TupleFactory.getInstance().newTuple(2);
+            if (reader.nextKeyValue()) {
+                Text docId = (Text)reader.getCurrentKey();
+                Text docContent = (Text)reader.getCurrentValue();
+                tuple.set(0, new DataByteArray(docId.toString()));
+                tuple.set(1, new DataByteArray(docContent.toString()));
+                return tuple;
+            }
+        } catch (InterruptedException e) {
+            throw new IOException(e);
+        }
+        return null;
+    }
+
+    @Override
+    public InputFormat getInputFormat() {
+        return new ElasticSearchInputFormat();
+    }
+
+    @Override
+    public void prepareToRead(RecordReader reader, PigSplit split) {
+        this.reader = reader;
+    }
+
+    @Override
+    public void setUDFContextSignature(String signature) {
+        this.contextSignature = signature;
+    }
+
+    @Override
+    public void setLocation(String location, Job job) throws IOException {
+        elasticSearchSetup(location, job);
+    }
+
+    @Override
+    public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {
+        return location;
+    }
+
+    @Override
+    public String relativeToAbsolutePath(String location, Path curDir) throws IOException {
+        return location;
+    }
+
+    @Override
+    public OutputFormat getOutputFormat() throws IOException {
+        return new ElasticSearchOutputFormat();
+    }
+
+    /**
+       Here we set the field names for a given tuple even if we
+    */
+    @Override
+    public void checkSchema(ResourceSchema s) throws IOException {
+        UDFContext context = UDFContext.getUDFContext();
+        Properties property = context.getUDFProperties(ResourceSchema.class);
+        String fieldNames = "";
+        for (String field : s.fieldNames()) {
+            fieldNames += field;
+            fieldNames += COMMA;
+        }
+        property.setProperty(PIG_ES_FIELD_NAMES, fieldNames);
+    }
+
+    // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
+    @Override
+    public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
+        this.writer = writer;
+    }
+
+    /**
+       Here we handle both the delimited record case and the json case.
+    */
+    @SuppressWarnings("unchecked")
+    @Override
+    public void putNext(Tuple t) throws IOException {
+
+        UDFContext context = UDFContext.getUDFContext();
+        Properties property = context.getUDFProperties(ResourceSchema.class);
+        MapWritable record = new MapWritable();
+
+        String isJson = property.getProperty(ES_IS_JSON);
+        // Handle delimited records (ie. isJson == false)
+        if (isJson != null && isJson.equals("false")) {
+            String[] fieldNames = property.getProperty(PIG_ES_FIELD_NAMES).split(COMMA);
+            for (int i = 0; i < t.size(); i++) {
+                if (i < fieldNames.length) {
+                    try {
+                        record.put(new Text(fieldNames[i]), new Text(t.get(i).toString()));
+                    } catch (NullPointerException e) {
+                        //LOG.info("Increment null field counter.");
+                    }
+                }
+            }
+        } else {
+            if (!t.isNull(0)) {
+                String jsonData = t.get(0).toString();
+                // parse json data and put into mapwritable record
+                try {
+                    HashMap<String,Object> data = mapper.readValue(jsonData, HashMap.class);
+                    record = (MapWritable)toWritable(data);
+                } catch (JsonParseException e) {
+                    e.printStackTrace();
+                } catch (JsonMappingException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+
+        try {
+            writer.write(NullWritable.get(), record);
+        } catch (InterruptedException e) {
+            throw new IOException(e);
+        }
+    }
+
+    @Override
+    public void setStoreFuncUDFContextSignature(String signature) {
+        this.contextSignature = signature;
+    }
+
+    /**
+       Pull out the elasticsearch setup code
+    */
+    private void elasticSearchSetup(String location, Job job) {
+        // Need to use the uri parsing library here to pull out everything
+        try {
+
+            // Parse the passed in location URI, pulling out the arguments as well
+            URI parsedLocation = new URI(location);
+            HashMap<String, String> query = parseURIQuery(parsedLocation.getQuery());
+
+            String esHost = location.substring(5).split("/")[0];
+            if (esHost==null) {
+                throw new RuntimeException("Missing elasticsearch index name, URI must be formatted as es://<index_name>/<object_type>?<params>");
+            }
+
+            if (parsedLocation.getPath()==null) {
+                throw new RuntimeException("Missing elasticsearch object type, URI must be formatted as es://<index_name>/<object_type>?<params>");
+            }
+
+            Configuration conf = job.getConfiguration();
+            if (conf.get(ES_INDEX_NAME) == null) {
+
+                // Set elasticsearch index and object type in the Hadoop configuration
+                job.getConfiguration().set(ES_INDEX_NAME, esHost);
+                job.getConfiguration().set(ES_OBJECT_TYPE, parsedLocation.getPath().replaceAll("/", ""));
+
+                // Set the request size in the Hadoop configuration
+                String requestSize = query.get("size");
+                if (requestSize == null) requestSize = DEFAULT_BULK;
+                job.getConfiguration().set(ES_BULK_SIZE, requestSize);
+                job.getConfiguration().set(ES_REQUEST_SIZE, requestSize);
+
+                // Set the id field name in the Hadoop configuration
+                String idFieldName = query.get("id");
+                if (idFieldName == null) idFieldName = "-1";
+                job.getConfiguration().set(ES_ID_FIELD_NAME, idFieldName);
+
+                String queryString = query.get("q");
+                if (queryString==null) queryString = "*";
+                job.getConfiguration().set(ES_QUERY_STRING, queryString);
+
+                String numTasks = query.get("tasks");
+                if (numTasks==null) numTasks = "100";
+                job.getConfiguration().set(ES_NUM_SPLITS, numTasks);
+
+                // Adds the elasticsearch.yml file (esConfig) and the plugins directory (esPlugins) to the distributed cache
+                try {
+                    Path hdfsConfigPath = new Path(ES_CONFIG_HDFS_PATH);
+                    Path hdfsPluginsPath = new Path(ES_PLUGINS_HDFS_PATH);
+
+                    HadoopUtils.uploadLocalFileIfChanged(new Path(LOCAL_SCHEME+esConfig), hdfsConfigPath, job.getConfiguration());
+                    HadoopUtils.shipFileIfNotShipped(hdfsConfigPath, job.getConfiguration());
+
+                    HadoopUtils.uploadLocalFileIfChanged(new Path(LOCAL_SCHEME+esPlugins), hdfsPluginsPath, job.getConfiguration());
+                    HadoopUtils.shipArchiveIfNotShipped(hdfsPluginsPath, job.getConfiguration());
+
+                } catch (Exception e) {
+                    throw new RuntimeException(e);
+                }
+
+                //
+                // This gets set even when loading data from elasticsearch
+                //
+                String isJson = query.get("json");
+                if (isJson==null || isJson.equals("false")) {
+                    // We're dealing with delimited records
+                    UDFContext context = UDFContext.getUDFContext();
+                    Properties property = context.getUDFProperties(ResourceSchema.class);
+                    property.setProperty(ES_IS_JSON, "false");
+                }
+
+                // Need to set this to start the local instance of elasticsearch
+                job.getConfiguration().set(ES_CONFIG, esConfig);
+                job.getConfiguration().set(ES_PLUGINS, esPlugins);
+            }
+        } catch (URISyntaxException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+       Look at the passed in uri and hadoop configuration and set options.
+       <p>
+       <b>WARNING</b> Note that, since this is called more than once, it is
+       critical to ensure that we do not change or reset anything we've already set.
+    */
+    @Override
+    public void setStoreLocation(String location, Job job) throws IOException {
+        elasticSearchSetup(location, job);
+    }
+
+    /**
+       Given a URI query string, eg. "foo=bar&happy=true" returns
+       a hashmap ({'foo' => 'bar', 'happy' => 'true'})
+    */
+    private HashMap<String, String> parseURIQuery(String query) {
+        HashMap<String, String> argMap = new HashMap<String, String>();
+        if (query != null) {
+            String[] pairs = query.split("&");
+            for (String pair : pairs) {
+                String[] splitPair = pair.split("=");
+                argMap.put(splitPair[0], splitPair[1]);
+            }
+        }
+        return argMap;
+    }
+
+    /**
+       Recursively converts an arbitrary object into the appropriate writable. Please enlighten me if there is an existing
+       method for doing this.
+    */
+    private Writable toWritable(Object thing) {
+        if (thing instanceof String) {
+            return new Text((String)thing);
+        } else if (thing instanceof Long) {
+            return new LongWritable((Long)thing);
+        } else if (thing instanceof Integer) {
+            return new IntWritable((Integer)thing);
+        } else if (thing instanceof Double) {
+            return new DoubleWritable((Double)thing);
+        } else if (thing instanceof Float) {
+            return new FloatWritable((Float)thing);
+        } else if (thing instanceof Boolean) {
+            return new BooleanWritable((Boolean)thing);
+        } else if (thing instanceof Map) {
+            MapWritable result = new MapWritable();
+            for (Map.Entry<String,Object> entry : ((Map<String,Object>)thing).entrySet()) {
+                result.put(new Text(entry.getKey().toString()), toWritable(entry.getValue()));
+            }
+            return result;
+        } else if (thing instanceof List) {
+            if (((List)thing).size() > 0) {
+                Object first = ((List)thing).get(0);
+                Writable[] listOfThings = new Writable[((List)thing).size()];
+                for (int i = 0; i < listOfThings.length; i++) {
+                    listOfThings[i] = toWritable(((List)thing).get(i));
+                }
+                return new ArrayWritable(toWritable(first).getClass(), listOfThings);
+            }
+        }
+        return NullWritable.get();
+    }
+
+    @Override
+    public void cleanupOnFailure(String location, Job job) throws IOException {
+    }
+}
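And a companion Pig sketch for ElasticSearchStorage, which handles both the delimited and JSON cases. The query parameters shown (json, size, q, tasks) are the ones read in elasticSearchSetup above; the jar path, index/type names, input file, and field names are illustrative placeholders:

    -- Placeholder path to the built wonderdog jar
    REGISTER /path/to/wonderdog.jar;

    -- Store delimited (non-JSON) records; the field names written to elasticsearch
    -- come from the relation schema captured in checkSchema via the UDFContext
    records = LOAD 'records.tsv' AS (name:chararray, value:chararray);
    STORE records INTO 'es://foo_index/foo_type?json=false&size=1000'
          USING com.infochimps.elasticsearch.pig.ElasticSearchStorage();

    -- Load documents back out; q is the query string (default '*') and tasks the
    -- number of input splits (default 100). Each tuple is (docId, docContent)
    docs = LOAD 'es://foo_index/foo_type?q=*&tasks=100'
           USING com.infochimps.elasticsearch.pig.ElasticSearchStorage();
    DUMP docs;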