wonderdog 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
@@ -0,0 +1,37 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapred.JobContext;
+import org.apache.hadoop.mapred.OutputCommitter;
+import org.apache.hadoop.mapred.TaskAttemptContext;
+
+public class ElasticSearchStreamingOutputCommitter extends OutputCommitter {
+
+    @Override
+    public void setupJob(JobContext context) throws IOException {
+
+    }
+
+    @Override
+    public void cleanupJob(JobContext context) throws IOException {
+    }
+
+    @Override
+    public void setupTask(TaskAttemptContext context) throws IOException {
+    }
+
+    @Override
+    public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
+        return false;
+    }
+
+    @Override
+    public void commitTask(TaskAttemptContext context) throws IOException {
+    }
+
+    @Override
+    public void abortTask(TaskAttemptContext context) throws IOException {
+    }
+
+}
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
@@ -0,0 +1,88 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.TaskAttemptContext;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.mapred.OutputFormat;
+import org.apache.hadoop.util.*;
+import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+/**
+
+   Hadoop OutputFormat for writing arbitrary MapWritables (essentially
+   HashMaps) into Elasticsearch. Records are batched up and sent in a
+   one-hop manner to the elastic search data nodes that will index
+   them.
+
+ */
+public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K, V> {
+
+    static Log LOG = LogFactory.getLog(ElasticSearchStreamingOutputFormat.class);
+
+    // Job settings we need to control directly from Java options.
+    private static final String ES_INDEX_OPT = "elasticsearch.output.index";
+    private static final String ES_DEFAULT_INDEX = "hadoop";
+    private String defaultIndexName;
+
+    private static final String ES_MAPPING_OPT = "elasticsearch.output.mapping";
+    private static final String ES_DEFAULT_MAPPING = "streaming_record";
+    private String defaultMappingName;
+
+    private static final String ES_INDEX_FIELD_OPT = "elasticsearch.output.index.field";
+    private static final String ES_INDEX_FIELD = "_index";
+    private String indexFieldName;
+
+    private static final String ES_MAPPING_FIELD_OPT = "elasticsearch.output.mapping.field";
+    private static final String ES_MAPPING_FIELD = "_mapping";
+    private String mappingFieldName;
+
+    private static final String ES_ID_FIELD_OPT = "elasticsearch.output.id.field";
+    private static final String ES_ID_FIELD = "_id";
+    private String idFieldName;
+
+    private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
+    private static final String ES_BULK_SIZE = "100";
+    private int bulkSize;
+
+
+    // Elasticsearch internal settings required to make a client
+    // connection.
+    private static final String ES_CONFIG_OPT = "es.config";
+    private static final String ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+
+    private static final String ES_PLUGINS_OPT = "es.path.plugins";
+    private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+
+    public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf conf, String name, Progressable progress) throws IOException {
+        setLocalElasticSearchInstallation(conf);
+        String defaultIndexName = conf.get(ES_INDEX_OPT, ES_DEFAULT_INDEX);
+        String defaultMappingName = conf.get(ES_MAPPING_OPT, ES_DEFAULT_MAPPING);
+        String indexFieldName = conf.get(ES_INDEX_FIELD_OPT, ES_INDEX_FIELD);
+        String mappingFieldName = conf.get(ES_MAPPING_FIELD_OPT, ES_MAPPING_FIELD);
+        String idFieldName = conf.get(ES_ID_FIELD_OPT, ES_ID_FIELD);
+        Integer bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE_OPT, ES_BULK_SIZE));
+        return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName, indexFieldName, mappingFieldName, idFieldName, bulkSize);
+    }
+
+    public void setLocalElasticSearchInstallation(JobConf conf) {
+        String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+        String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+        System.setProperty(ES_CONFIG_OPT, esConfigPath);
+        System.setProperty(ES_PLUGINS_OPT, esPluginsPath);
+        LOG.info("Using Elasticsearch configuration file at "+esConfigPath+" and plugin directory "+esPluginsPath);
+    }
+
+    public ElasticSearchStreamingOutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+        return new ElasticSearchStreamingOutputCommitter();
+    }
+
+    public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
+    }
+}
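
The output format above is driven entirely by job configuration: every elasticsearch.output.* option read in getRecordWriter() has a default supplied by the constants at the top of the class. A minimal driver sketch follows; it is not part of this package, the class name and option values are illustrative, and it only shows how a mapred job might be pointed at this OutputFormat.

import org.apache.hadoop.mapred.JobConf;
import com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat;

// Hypothetical driver: wires a mapred job to the streaming output format
// using the option names defined in the class above.
public class EsOutputJobSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.set("elasticsearch.output.index", "hadoop");              // ES_INDEX_OPT
        conf.set("elasticsearch.output.mapping", "streaming_record");  // ES_MAPPING_OPT
        conf.set("elasticsearch.output.id.field", "_id");              // ES_ID_FIELD_OPT
        conf.set("elasticsearch.output.bulk_size", "1000");            // ES_BULK_SIZE_OPT
        conf.setOutputFormat(ElasticSearchStreamingOutputFormat.class);
        // ...set mapper, reducer, and input paths, then submit with JobClient.runJob(conf)
    }
}
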
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
@@ -0,0 +1,176 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.io.*;
+
+import org.elasticsearch.common.unit.TimeValue;
+
+import org.elasticsearch.node.Node;
+import org.elasticsearch.node.NodeBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.action.search.SearchRequestBuilder;
+import org.elasticsearch.action.search.SearchScrollRequestBuilder;
+
+import org.elasticsearch.search.SearchHit;
+import org.elasticsearch.search.Scroll;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.cluster.ClusterName;
+
+class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
+
+    static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordReader.class);
+
+    private static final String ES_REQUEST_SIZE_OPT = "elasticsearch.input.request_size";
+    private static final String ES_REQUEST_SIZE = "100";
+    private Integer requestSize;
+
+    private static final String ES_SCROLL_TIMEOUT_OPT = "elasticsearch.input.scroll_timeout";
+    private static final String ES_SCROLL_TIMEOUT = "5m";
+    private String scrollTimeout;
+    private static final TimeValue defaultScrollTimeout = new TimeValue((long) 300000); // 5 minutes
+    private Scroll scroll;
+
+    private Node node;
+    private Client client;
+    private ElasticSearchStreamingSplit split;
+
+    private String scrollId;
+    private Integer recordsRead;
+    private Iterator<SearchHit> hitsItr = null;
+
+    public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf) {
+        this.split = (ElasticSearchStreamingSplit) split;
+        this.recordsRead = 0;
+        this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE_OPT, ES_REQUEST_SIZE));
+        this.scrollTimeout = conf.get(ES_SCROLL_TIMEOUT_OPT, ES_SCROLL_TIMEOUT);
+        this.scroll = new Scroll(TimeValue.parseTimeValue(this.scrollTimeout, defaultScrollTimeout));
+
+        LOG.info("Initializing "+this.split.getSummary());
+        startEmbeddedClient();
+        fetchNextHits();
+    }
+
+    private void fetchNextHits() {
+        if (scrollId == null) {
+            LOG.info("Running initial scroll with timeout "+scrollTimeout);
+            SearchRequestBuilder request = split.initialScrollRequest(client, scroll, requestSize);
+            SearchResponse response = request.execute().actionGet();
+            this.scrollId = response.scrollId();
+            LOG.info("Got scroll ID "+scrollId);
+            // Do we need to call fetchNextHits() again here?  Or does
+            // the initial request also itself contain the first set
+            // of hits for the scroll?
+            //
+            // fetchNextHits();
+        } else {
+            // LOG.info("Running query for scroll ID "+scrollId+" with timeout "+scrollTimeout);
+            SearchScrollRequestBuilder request = split.scrollRequest(client, scroll, scrollId);
+            SearchResponse response = request.execute().actionGet();
+            this.scrollId = response.scrollId();
+            // LOG.info("Got scroll ID "+scrollId);
+            this.hitsItr = response.hits().iterator();
+        }
+    }
+
+    @Override
+    public boolean next(K key, V value) throws IOException {
+        if (shouldReadAnotherRecord()) {
+            // We should read more records because we haven't read as
+            // many as we know to be in this split yet.
+            if (hasAnotherRecord()) {
+                // We already have records stacked up ready to read.
+                readRecord(key, value);
+                return true;
+            } else {
+                // We don't have records stacked up so we might need
+                // to fetch some more hits.
+                fetchNextHits();
+                if (hasAnotherRecord()) {
+                    // Now if we have records we read one
+                    readRecord(key, value);
+                    return true;
+                } else {
+                    // But if no records are here this time, it's
+                    // because we know we're done reading the input.
+                    return false;
+                }
+            }
+        } else {
+            // Return false as we're done with this split.
+            return false;
+        }
+    }
+
+    private boolean shouldReadAnotherRecord() {
+        return recordsRead < split.getSize();
+    }
+
+    private boolean hasAnotherRecord() {
+        return hitsItr != null && hitsItr.hasNext();
+    }
+
+    private void readRecord(K key, V value) {
+        SearchHit hit = hitsItr.next();
+        if (hit != null) {
+            Text keyText = (Text) key;
+            Text valueText = (Text) value;
+            keyText.set(hit.sourceAsString());
+            valueText.set(hit.sourceAsString());
+            recordsRead += 1;
+        }
+    }
+
+    @Override
+    public K createKey() {
+        return (K) new Text();
+    }
+
+    @Override
+    public V createValue() {
+        return (V) new Text();
+    }
+
+    @Override
+    public long getPos() throws IOException {
+        return recordsRead;
+    }
+
+    @Override
+    public float getProgress() throws IOException {
+        return ((float) recordsRead) / ((float) split.getSize());
+    }
+
+    @Override
+    public void close() throws IOException {
+        stopEmbeddedClient();
+    }
+
+    //
+    // == Connecting to Elasticsearch ==
+    //
+
+    private void startEmbeddedClient() {
+        LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+        this.node = NodeBuilder.nodeBuilder().client(true).node();
+        this.client = node.client();
+        LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
+    }
+
+    private void stopEmbeddedClient() {
+        LOG.info("Stopping embedded Elasticsearch client...");
+        if (client != null) client.close();
+        if (node != null) node.close();
+        LOG.info("Left Elasticsearch cluster");
+    }
+
+
+}
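
On the input side, the record reader pages through an Elasticsearch scroll and fills both the key and the value of each record with the hit's source JSON. Two job options control the scroll behaviour. The configuration sketch below is not part of this package; the class name is hypothetical and the values are illustrative, with option names taken from the constants in the reader above.

import org.apache.hadoop.mapred.JobConf;

// Hypothetical input-side tuning for the streaming record reader.
public class EsInputTuningSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.set("elasticsearch.input.request_size", "500");   // hits fetched per scroll round trip (default "100")
        conf.set("elasticsearch.input.scroll_timeout", "10m"); // scroll keep-alive between requests (default "5m")
        // Each call to next() then delivers one document: key and value are
        // both set to hit.sourceAsString(), i.e. the raw JSON source of the hit.
    }
}
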
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
@@ -0,0 +1,171 @@
+package com.infochimps.elasticsearch;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.*;
+
+import org.elasticsearch.cluster.ClusterName;
+import org.elasticsearch.node.Node;
+import org.elasticsearch.node.NodeBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.client.Requests;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.ExceptionsHelper;
+
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.JsonParseException;
+
+class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
+
+    static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordWriter.class);
+
+    private String defaultIndexName;
+    private String defaultMappingName;
+    private String indexFieldName;
+    private String mappingFieldName;
+    private String idFieldName;
+    private Integer bulkSize;
+
+    // Bookkeeping
+    private AtomicLong totalBulkTime = new AtomicLong();
+    private AtomicLong totalBulkItems = new AtomicLong();
+    private Random randgen = new Random();
+    private long runStartTime = System.currentTimeMillis();
+
+    // Elasticsearch indexing
+    private Node node;
+    private Client client;
+    private volatile BulkRequestBuilder currentRequest;
+
+    // JSON parsing
+    private ObjectMapper mapper;
+
+    //
+    // == Lifecycle ==
+    //
+
+    public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, Integer bulkSize) {
+        this.defaultIndexName = defaultIndexName;
+        this.defaultMappingName = defaultMappingName;
+        this.indexFieldName = indexFieldName;
+        this.mappingFieldName = mappingFieldName;
+        this.idFieldName = idFieldName;
+        this.bulkSize = bulkSize;
+
+        LOG.info("Writing "+Integer.toString(bulkSize)+" records per batch");
+        LOG.info("Using default target /"+defaultIndexName+"/"+defaultMappingName);
+        LOG.info("Records override default target with index field '"+indexFieldName+"', mapping field '"+mappingFieldName+"', and ID field '"+idFieldName);
+
+        startEmbeddedClient();
+        this.currentRequest = client.prepareBulk();
+        this.mapper = new ObjectMapper();
+    }
+
+    /**
+       Start an embedded Elasticsearch client. The client will not be
+       a data node and will not store data locally.
+
+       The client will connect to the target Elasticsearch cluster as
+       a client node, enabling one-hop writes for all data. See
+       http://www.elasticsearch.org/guide/reference/java-api/client.html
+     */
+    private void startEmbeddedClient() {
+        LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+        this.node = NodeBuilder.nodeBuilder().client(true).node();
+        this.client = node.client();
+        LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+'"');
+    }
+
+
+    /**
+       Close the Elasticsearch client, sending out one last bulk write
+       if necessary.
+     */
+    public void close(Reporter reporter) throws IOException {
+        sendBulkRequestIfMoreThan(0);
+        LOG.info("Shutting down Elasticsearch client...");
+        if (client != null) client.close();
+        if (node != null) node.close();
+        LOG.info("Successfully shut down Elasticsearch client");
+    }
+
+    //
+    // == Writing records ==
+    //
+
+    public void write(K key, V value) throws IOException {
+        String json = ((Text) key).toString();
+        try {
+            index(json);
+            sendBulkRequestIfBigEnough();
+        } catch(Exception e) {
+            if (ExceptionsHelper.unwrapCause(e) instanceof JsonParseException) {
+                LOG.debug("Bad record: "+json);
+                return;
+            } else {
+                LOG.error("Could not write record: "+json, e);
+            }
+        }
+    }
+
+    private void index(String json) throws IOException {
+        Map<String, Object> record = mapper.readValue(json, Map.class);
+        if (record.containsKey(idFieldName)) {
+            Object idValue = record.get(idFieldName);
+            currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json));
+        } else {
+            currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json));
+        }
+    }
+
+    private String indexNameForRecord(Map<String, Object> record) {
+        if (record.containsKey(indexFieldName)) {
+            Object indexValue = record.get(indexFieldName);
+            return String.valueOf(indexValue);
+        } else {
+            return defaultIndexName;
+        }
+    }
+
+    private String mappingNameForRecord(Map<String, Object> record) {
+        if (record.containsKey(mappingFieldName)) {
+            Object mappingValue = record.get(mappingFieldName);
+            return String.valueOf(mappingValue);
+        } else {
+            return defaultMappingName;
+        }
+    }
+
+    //
+    // == Bulk request handling ==
+    //
+
+    private void sendBulkRequestIfBigEnough() {
+        sendBulkRequestIfMoreThan(bulkSize);
+    }
+
+    private void sendBulkRequestIfMoreThan(int size) {
+        totalBulkItems.incrementAndGet();
+        if (currentRequest.numberOfActions() > size) {
+            long startTime = System.currentTimeMillis();
+            BulkResponse response = currentRequest.execute().actionGet();
+            totalBulkTime.addAndGet(System.currentTimeMillis() - startTime);
+            if (randgen.nextDouble() < 0.1) {
+                LOG.info("Indexed [" + totalBulkItems.get() + "] in [" + (totalBulkTime.get()/1000) + "s] of indexing"+"[" + ((System.currentTimeMillis() - runStartTime)/1000) + "s] of wall clock"+" for ["+ (float)(1000.0*totalBulkItems.get())/(System.currentTimeMillis() - runStartTime) + "rec/s]");
+            }
+            currentRequest = client.prepareBulk();
+        }
+    }
+}
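
The writer's index() method routes each JSON record using optional per-record fields: a record may carry its own index, mapping, and document id under the configured field names ("_index", "_mapping", and "_id" by default), and anything lacking them falls back to the default index and mapping passed to the constructor. The sketch below is not part of this package; the class name is hypothetical and the data is illustrative of the two cases.

// Hypothetical examples of records as the writer interprets them.
public class EsRecordRoutingSketch {
    public static void main(String[] args) {
        // Routed explicitly: indexed into /logs-2012/event with id "42".
        String routed = "{\"_index\":\"logs-2012\",\"_mapping\":\"event\",\"_id\":\"42\",\"message\":\"hello\"}";
        // No override fields: indexed into the default /<index>/<mapping> target
        // with an Elasticsearch-generated id.
        String defaulted = "{\"message\":\"goodbye\"}";
        System.out.println(routed);
        System.out.println(defaulted);
    }
}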