wonderdog 0.0.1
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java
@@ -0,0 +1,235 @@

package com.infochimps.elasticsearch.pig;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Properties;
import java.net.URI;

import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.filecache.DistributedCache;

import org.apache.pig.StoreFunc;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.Utf8StorageConverter;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.impl.util.UDFContext;

import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
import com.infochimps.elasticsearch.ElasticSearchOutputFormat;

/**
   Pig storefunc for Elastic Search. Takes json data <b>only</b>.
   <p>
   USAGE:
   <p>
   STORE records INTO ElasticSearchJsonIndex();
   STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize);
   STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize, esConfig);
   STORE records INTO ElasticSearchJsonIndex(idFieldName, bulkSize, esConfig, esPlugins);

   where:

   idFieldName = Named field of the record to use as the record id. If none is passed in
                 then the record is assumed to have no id.
   bulkSize    = Number of records for ElasticSearchOutputFormat to batch up before sending
                 a bulk index request to Elastic Search. Default: 1000.
   esConfig    = Full path to local elasticsearch.yml. Default: /etc/elasticsearch/elasticsearch.yml
   esPlugins   = Full path to local elastic search plugins dir. Default: /usr/local/share/elasticsearch/plugins

*/
public class ElasticSearchJsonIndex extends StoreFunc implements StoreFuncInterface {

    private static final Log LOG = LogFactory.getLog(ElasticSearchJsonIndex.class);

    protected RecordWriter writer = null;
    protected ObjectMapper mapper = new ObjectMapper();
    protected String idFieldName;
    protected String bulkSize;
    protected String esConfig;
    protected String esPlugins;

    // For hadoop configuration
    private static final String ES_INDEX_NAME = "elasticsearch.index.name";
    private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
    private static final String ES_IS_JSON = "elasticsearch.is_json";
    private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
    private static final String ES_FIELD_NAMES = "elasticsearch.field.names";
    private static final String ES_ID_FIELD = "elasticsearch.id.field";
    private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";

    // Other string constants
    private static final String SLASH = "/";
    private static final String NO_ID_FIELD = "-1";
    private static final String LOCAL_SCHEME = "file://";
    private static final String DEFAULT_BULK = "1000";
    private static final String DEFAULT_ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
    private static final String DEFAULT_ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
    private static final String ES_CONFIG_HDFS_PATH = "/tmp/elasticsearch/elasticsearch.yml";
    private static final String ES_PLUGINS_HDFS_PATH = "/tmp/elasticsearch/plugins";

    public ElasticSearchJsonIndex() {
        this(NO_ID_FIELD, DEFAULT_BULK);
    }

    public ElasticSearchJsonIndex(String idFieldName, String bulkSize) {
        this(idFieldName, bulkSize, DEFAULT_ES_CONFIG);
    }

    public ElasticSearchJsonIndex(String idFieldName, String bulkSize, String esConfig) {
        this(idFieldName, bulkSize, esConfig, DEFAULT_ES_PLUGINS);
    }

    public ElasticSearchJsonIndex(String idFieldName, String bulkSize, String esConfig, String esPlugins) {
        this.idFieldName = idFieldName;
        this.bulkSize    = bulkSize;
        this.esConfig    = esConfig;
        this.esPlugins   = esPlugins;
    }

    @Override
    public void checkSchema(ResourceSchema s) throws IOException {
    }

    /**
       Look at passed in location and configuration and set options. Note that, since this
       is called more than once, we need to make sure and not change anything we've already
       set.
    */
    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        String[] es_store = location.substring(5).split(SLASH);
        if (es_store.length != 2) {
            throw new RuntimeException("Please specify a valid elasticsearch index, eg. es://myindex/myobj");
        }
        Configuration conf = job.getConfiguration();
        // Only set if we haven't already
        if (conf.get(ES_INDEX_NAME) == null) {
            try {
                job.getConfiguration().set(ES_INDEX_NAME, es_store[0]);
                job.getConfiguration().set(ES_OBJECT_TYPE, es_store[1]);
            } catch (ArrayIndexOutOfBoundsException e) {
                throw new RuntimeException("You must specify both an index and an object type.");
            }
            job.getConfiguration().setBoolean(ES_IS_JSON, true);
            job.getConfiguration().set(ES_BULK_SIZE, bulkSize);
            job.getConfiguration().set(ES_ID_FIELD_NAME, idFieldName);

            // Adds the elasticsearch.yml file (esConfig) to the distributed cache
            try {
                Path hdfsConfigPath = new Path(ES_CONFIG_HDFS_PATH);
                Path hdfsPluginsPath = new Path(ES_PLUGINS_HDFS_PATH);

                HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esConfig), hdfsConfigPath, job.getConfiguration());
                HadoopUtils.shipFileIfNotShipped(hdfsConfigPath, job.getConfiguration());

                HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esPlugins), hdfsPluginsPath, job.getConfiguration());
                HadoopUtils.shipArchiveIfNotShipped(hdfsPluginsPath, job.getConfiguration());
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }
    }

    @Override
    public OutputFormat getOutputFormat() throws IOException {
        return new ElasticSearchOutputFormat();
    }

    // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
    @Override
    public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
        this.writer = writer;
    }

    /**
       Map a tuple object into a map-writable object for elasticsearch.
    */
    @SuppressWarnings("unchecked")
    @Override
    public void putNext(Tuple t) throws IOException {
        if (!t.isNull(0)) {
            MapWritable record = new MapWritable();
            String jsonData = t.get(0).toString();

            // parse json data and put into mapwritable record
            try {
                HashMap<String,Object> data = mapper.readValue(jsonData, HashMap.class);
                record = (MapWritable)toWritable(data);
            } catch (JsonParseException e) {
                e.printStackTrace();
            } catch (JsonMappingException e) {
                e.printStackTrace();
            }
            try {
                writer.write(NullWritable.get(), record);
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
        }
    }

    /**
       Recursively converts an arbitrary object into the appropriate writable. Please enlighten me if there is an existing
       method for doing this.
    */
    private Writable toWritable(Object thing) {
        if (thing instanceof String) {
            return new Text((String)thing);
        } else if (thing instanceof Long) {
            return new LongWritable((Long)thing);
        } else if (thing instanceof Integer) {
            return new IntWritable((Integer)thing);
        } else if (thing instanceof Double) {
            return new DoubleWritable((Double)thing);
        } else if (thing instanceof Float) {
            return new FloatWritable((Float)thing);
        } else if (thing instanceof Map) {
            MapWritable result = new MapWritable();
            for (Map.Entry<String,Object> entry : ((Map<String,Object>)thing).entrySet()) {
                result.put(new Text(entry.getKey().toString()), toWritable(entry.getValue()));
            }
            return result;
        } else if (thing instanceof List) {
            if (((List)thing).size() > 0) {
                Object first = ((List)thing).get(0);
                Writable[] listOfThings = new Writable[((List)thing).size()];
                for (int i = 0; i < listOfThings.length; i++) {
                    listOfThings[i] = toWritable(((List)thing).get(i));
                }
                return new ArrayWritable(toWritable(first).getClass(), listOfThings);
            }
        }
        return NullWritable.get();
    }

    @Override
    public void cleanupOnFailure(String location, Job job) throws IOException {
    }
}
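The Javadoc above lists the constructor arguments, and setStoreLocation expects the target index and object type to arrive as an es://<index>/<type> STORE location. A minimal Pig Latin sketch of that usage (the REGISTER path, input file, index/type names, and id field are hypothetical, not taken from the gem):

-- Register the wonderdog jar so Pig can resolve the storefunc (path is hypothetical)
REGISTER /path/to/wonderdog.jar;

-- Each input record is one JSON document held in a single chararray field
records = LOAD '/data/tweets.json' AS (json:chararray);

-- Index into the 'tweets' index under the 'tweet' type, using the 'user_id' field
-- of each document as the record id and bulking 1000 records per request
STORE records INTO 'es://tweets/tweet'
      USING com.infochimps.elasticsearch.pig.ElasticSearchJsonIndex('user_id', '1000');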
data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java
@@ -0,0 +1,355 @@

package com.infochimps.elasticsearch.pig;

import java.io.IOException;
import java.lang.InterruptedException;
import java.util.Properties;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.io.*;

import org.apache.pig.LoadFunc;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.ResourceSchema;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import com.infochimps.elasticsearch.ElasticSearchOutputFormat;
import com.infochimps.elasticsearch.ElasticSearchInputFormat;
import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;

public class ElasticSearchStorage extends LoadFunc implements StoreFuncInterface {

    private String contextSignature = null;
    private RecordReader reader;
    protected RecordWriter writer = null;
    protected ObjectMapper mapper = new ObjectMapper();
    protected String esConfig;
    protected String esPlugins;

    // For hadoop configuration
    private static final String ES_INDEX_NAME = "elasticsearch.index.name";
    private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
    private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
    private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
    private static final String ES_IS_JSON = "elasticsearch.is_json";
    private static final String PIG_ES_FIELD_NAMES = "elasticsearch.pig.field.names";
    private static final String ES_REQUEST_SIZE = "elasticsearch.request.size";
    private static final String ES_NUM_SPLITS = "elasticsearch.num.input.splits";
    private static final String ES_QUERY_STRING = "elasticsearch.query.string";

    private static final String COMMA = ",";
    private static final String LOCAL_SCHEME = "file://";
    private static final String DEFAULT_BULK = "1000";
    private static final String DEFAULT_ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
    private static final String DEFAULT_ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
    private static final String ES_CONFIG_HDFS_PATH = "/tmp/elasticsearch/elasticsearch.yml";
    private static final String ES_PLUGINS_HDFS_PATH = "/tmp/elasticsearch/plugins";
    private static final String ES_CONFIG = "es.config";
    private static final String ES_PLUGINS = "es.path.plugins";

    public ElasticSearchStorage() {
        this(DEFAULT_ES_CONFIG, DEFAULT_ES_PLUGINS);
    }

    public ElasticSearchStorage(String esConfig) {
        this(esConfig, DEFAULT_ES_PLUGINS);
    }

    public ElasticSearchStorage(String esConfig, String esPlugins) {
        this.esConfig  = esConfig;
        this.esPlugins = esPlugins;
    }

    @Override
    public Tuple getNext() throws IOException {
        try {
            Tuple tuple = TupleFactory.getInstance().newTuple(2);
            if (reader.nextKeyValue()) {
                Text docId = (Text)reader.getCurrentKey();
                Text docContent = (Text)reader.getCurrentValue();
                tuple.set(0, new DataByteArray(docId.toString()));
                tuple.set(1, new DataByteArray(docContent.toString()));
                return tuple;
            }
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
        return null;
    }

    @Override
    public InputFormat getInputFormat() {
        return new ElasticSearchInputFormat();
    }

    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        this.reader = reader;
    }

    @Override
    public void setUDFContextSignature(String signature) {
        this.contextSignature = signature;
    }

    @Override
    public void setLocation(String location, Job job) throws IOException {
        elasticSearchSetup(location, job);
    }

    @Override
    public String relToAbsPathForStoreLocation(String location, Path curDir) throws IOException {
        return location;
    }

    @Override
    public String relativeToAbsolutePath(String location, Path curDir) throws IOException {
        return location;
    }

    @Override
    public OutputFormat getOutputFormat() throws IOException {
        return new ElasticSearchOutputFormat();
    }

    /**
       Here we set the field names for a given tuple even if we
    */
    @Override
    public void checkSchema(ResourceSchema s) throws IOException {
        UDFContext context  = UDFContext.getUDFContext();
        Properties property = context.getUDFProperties(ResourceSchema.class);
        String fieldNames = "";
        for (String field : s.fieldNames()) {
            fieldNames += field;
            fieldNames += COMMA;
        }
        property.setProperty(PIG_ES_FIELD_NAMES, fieldNames);
    }

    // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
    @Override
    public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
        this.writer = writer;
    }

    /**
       Here we handle both the delimited record case and the json case.
    */
    @SuppressWarnings("unchecked")
    @Override
    public void putNext(Tuple t) throws IOException {

        UDFContext context  = UDFContext.getUDFContext();
        Properties property = context.getUDFProperties(ResourceSchema.class);
        MapWritable record  = new MapWritable();

        String isJson = property.getProperty(ES_IS_JSON);
        // Handle delimited records (ie. isJson == false)
        if (isJson != null && isJson.equals("false")) {
            String[] fieldNames = property.getProperty(PIG_ES_FIELD_NAMES).split(COMMA);
            for (int i = 0; i < t.size(); i++) {
                if (i < fieldNames.length) {
                    try {
                        record.put(new Text(fieldNames[i]), new Text(t.get(i).toString()));
                    } catch (NullPointerException e) {
                        //LOG.info("Increment null field counter.");
                    }
                }
            }
        } else {
            if (!t.isNull(0)) {
                String jsonData = t.get(0).toString();
                // parse json data and put into mapwritable record
                try {
                    HashMap<String,Object> data = mapper.readValue(jsonData, HashMap.class);
                    record = (MapWritable)toWritable(data);
                } catch (JsonParseException e) {
                    e.printStackTrace();
                } catch (JsonMappingException e) {
                    e.printStackTrace();
                }
            }
        }

        try {
            writer.write(NullWritable.get(), record);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
        this.contextSignature = signature;
    }

    /**
       Pull out the elasticsearch setup code
    */
    private void elasticSearchSetup(String location, Job job) {
        // Need to use the uri parsing library here to pull out everything
        try {

            // Parse the passed in location URI, pulling out the arguments as well
            URI parsedLocation = new URI(location);
            HashMap<String, String> query = parseURIQuery(parsedLocation.getQuery());

            String esHost = location.substring(5).split("/")[0];
            if (esHost==null) {
                throw new RuntimeException("Missing elasticsearch index name, URI must be formatted as es://<index_name>/<object_type>?<params>");
            }

            if (parsedLocation.getPath()==null) {
                throw new RuntimeException("Missing elasticsearch object type, URI must be formatted as es://<index_name>/<object_type>?<params>");
            }

            Configuration conf = job.getConfiguration();
            if (conf.get(ES_INDEX_NAME) == null) {

                // Set elasticsearch index and object type in the Hadoop configuration
                job.getConfiguration().set(ES_INDEX_NAME, esHost);
                job.getConfiguration().set(ES_OBJECT_TYPE, parsedLocation.getPath().replaceAll("/", ""));

                // Set the request size in the Hadoop configuration
                String requestSize = query.get("size");
                if (requestSize == null) requestSize = DEFAULT_BULK;
                job.getConfiguration().set(ES_BULK_SIZE, requestSize);
                job.getConfiguration().set(ES_REQUEST_SIZE, requestSize);

                // Set the id field name in the Hadoop configuration
                String idFieldName = query.get("id");
                if (idFieldName == null) idFieldName = "-1";
                job.getConfiguration().set(ES_ID_FIELD_NAME, idFieldName);

                String queryString = query.get("q");
                if (queryString==null) queryString = "*";
                job.getConfiguration().set(ES_QUERY_STRING, queryString);

                String numTasks = query.get("tasks");
                if (numTasks==null) numTasks = "100";
                job.getConfiguration().set(ES_NUM_SPLITS, numTasks);

                // Adds the elasticsearch.yml file (esConfig) and the plugins directory (esPlugins) to the distributed cache
                try {
                    Path hdfsConfigPath = new Path(ES_CONFIG_HDFS_PATH);
                    Path hdfsPluginsPath = new Path(ES_PLUGINS_HDFS_PATH);

                    HadoopUtils.uploadLocalFileIfChanged(new Path(LOCAL_SCHEME+esConfig), hdfsConfigPath, job.getConfiguration());
                    HadoopUtils.shipFileIfNotShipped(hdfsConfigPath, job.getConfiguration());

                    HadoopUtils.uploadLocalFileIfChanged(new Path(LOCAL_SCHEME+esPlugins), hdfsPluginsPath, job.getConfiguration());
                    HadoopUtils.shipArchiveIfNotShipped(hdfsPluginsPath, job.getConfiguration());

                } catch (Exception e) {
                    throw new RuntimeException(e);
                }

                //
                // This gets set even when loading data from elasticsearch
                //
                String isJson = query.get("json");
                if (isJson==null || isJson.equals("false")) {
                    // We're dealing with delimited records
                    UDFContext context  = UDFContext.getUDFContext();
                    Properties property = context.getUDFProperties(ResourceSchema.class);
                    property.setProperty(ES_IS_JSON, "false");
                }

                // Need to set this to start the local instance of elasticsearch
                job.getConfiguration().set(ES_CONFIG, esConfig);
                job.getConfiguration().set(ES_PLUGINS, esPlugins);
            }
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }
    }

    /**
       Look at the passed in uri and hadoop configuration and set options.
       <p>
       <b>WARNING</b> Note that, since this is called more than once, it is
       critical to ensure that we do not change or reset anything we've already set.
    */
    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        elasticSearchSetup(location, job);
    }

    /**
       Given a URI query string, eg. "foo=bar&happy=true" returns
       a hashmap ({'foo' => 'bar', 'happy' => 'true'})
    */
    private HashMap<String, String> parseURIQuery(String query) {
        HashMap<String, String> argMap = new HashMap<String, String>();
        if (query != null) {
            String[] pairs = query.split("&");
            for (String pair : pairs) {
                String[] splitPair = pair.split("=");
                argMap.put(splitPair[0], splitPair[1]);
            }
        }
        return argMap;
    }

    /**
       Recursively converts an arbitrary object into the appropriate writable. Please enlighten me if there is an existing
       method for doing this.
    */
    private Writable toWritable(Object thing) {
        if (thing instanceof String) {
            return new Text((String)thing);
        } else if (thing instanceof Long) {
            return new LongWritable((Long)thing);
        } else if (thing instanceof Integer) {
            return new IntWritable((Integer)thing);
        } else if (thing instanceof Double) {
            return new DoubleWritable((Double)thing);
        } else if (thing instanceof Float) {
            return new FloatWritable((Float)thing);
        } else if (thing instanceof Boolean) {
            return new BooleanWritable((Boolean)thing);
        } else if (thing instanceof Map) {
            MapWritable result = new MapWritable();
            for (Map.Entry<String,Object> entry : ((Map<String,Object>)thing).entrySet()) {
                result.put(new Text(entry.getKey().toString()), toWritable(entry.getValue()));
            }
            return result;
        } else if (thing instanceof List) {
            if (((List)thing).size() > 0) {
                Object first = ((List)thing).get(0);
                Writable[] listOfThings = new Writable[((List)thing).size()];
                for (int i = 0; i < listOfThings.length; i++) {
                    listOfThings[i] = toWritable(((List)thing).get(i));
                }
                return new ArrayWritable(toWritable(first).getClass(), listOfThings);
            }
        }
        return NullWritable.get();
    }

    @Override
    public void cleanupOnFailure(String location, Job job) throws IOException {
    }
}
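elasticSearchSetup defines the es:// URI contract for this class: the index name and object type come from the URI, and the optional query parameters size, id, q, tasks, and json are copied into the Hadoop configuration. A minimal Pig Latin sketch under those assumptions (the jar path, index/type names, field names, and parameter values are hypothetical, not taken from the gem):

-- Hypothetical jar location
REGISTER /path/to/wonderdog.jar;

-- Load documents out of Elasticsearch: getNext() yields (doc_id, doc_json) tuples;
-- 'q' is the query string (default '*') and 'tasks' the number of input splits (default 100)
docs = LOAD 'es://tweets/tweet?q=*&tasks=100'
       USING com.infochimps.elasticsearch.pig.ElasticSearchStorage()
       AS (doc_id:chararray, doc:chararray);

-- Store delimited (non-JSON) records: field names come from the Pig schema via checkSchema();
-- 'json=false' selects the delimited path, 'size' sets the bulk size, 'id' names the id field
users = LOAD '/data/users.tsv' AS (user_id:chararray, screen_name:chararray);
STORE users INTO 'es://users/user?json=false&size=1000&id=user_id'
      USING com.infochimps.elasticsearch.pig.ElasticSearchStorage();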