wonderdog 0.0.1
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
@@ -0,0 +1,37 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+
+import org.apache.hadoop.mapred.JobContext;
+import org.apache.hadoop.mapred.OutputCommitter;
+import org.apache.hadoop.mapred.TaskAttemptContext;
+
+public class ElasticSearchStreamingOutputCommitter extends OutputCommitter {
+
+    @Override
+    public void setupJob(JobContext context) throws IOException {
+
+    }
+
+    @Override
+    public void cleanupJob(JobContext context) throws IOException {
+    }
+
+    @Override
+    public void setupTask(TaskAttemptContext context) throws IOException {
+    }
+
+    @Override
+    public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
+        return false;
+    }
+
+    @Override
+    public void commitTask(TaskAttemptContext context) throws IOException {
+    }
+
+    @Override
+    public void abortTask(TaskAttemptContext context) throws IOException {
+    }
+
+}
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
@@ -0,0 +1,88 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.TaskAttemptContext;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.mapred.OutputFormat;
+import org.apache.hadoop.util.*;
+import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+/**
+
+   Hadoop OutputFormat for writing JSON records into Elasticsearch.
+   Each record's JSON is passed as the Text key of the output pair.
+   Records are batched up and sent in a one-hop manner to the
+   Elasticsearch data nodes that will index them.
+
+ */
+public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K, V> {
+
+    static Log LOG = LogFactory.getLog(ElasticSearchStreamingOutputFormat.class);
+
+    // Job settings we need to control directly from Java options.
+    private static final String ES_INDEX_OPT = "elasticsearch.output.index";
+    private static final String ES_DEFAULT_INDEX = "hadoop";
+    private String defaultIndexName;
+
+    private static final String ES_MAPPING_OPT = "elasticsearch.output.mapping";
+    private static final String ES_DEFAULT_MAPPING = "streaming_record";
+    private String defaultMappingName;
+
+    private static final String ES_INDEX_FIELD_OPT = "elasticsearch.output.index.field";
+    private static final String ES_INDEX_FIELD = "_index";
+    private String indexFieldName;
+
+    private static final String ES_MAPPING_FIELD_OPT = "elasticsearch.output.mapping.field";
+    private static final String ES_MAPPING_FIELD = "_mapping";
+    private String mappingFieldName;
+
+    private static final String ES_ID_FIELD_OPT = "elasticsearch.output.id.field";
+    private static final String ES_ID_FIELD = "_id";
+    private String idFieldName;
+
+    private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
+    private static final String ES_BULK_SIZE = "100";
+    private int bulkSize;
+
+
+    // Elasticsearch internal settings required to make a client
+    // connection.
+    private static final String ES_CONFIG_OPT = "es.config";
+    private static final String ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+
+    private static final String ES_PLUGINS_OPT = "es.path.plugins";
+    private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+
+    public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf conf, String name, Progressable progress) throws IOException {
+        setLocalElasticSearchInstallation(conf);
+        String defaultIndexName = conf.get(ES_INDEX_OPT, ES_DEFAULT_INDEX);
+        String defaultMappingName = conf.get(ES_MAPPING_OPT, ES_DEFAULT_MAPPING);
+        String indexFieldName = conf.get(ES_INDEX_FIELD_OPT, ES_INDEX_FIELD);
+        String mappingFieldName = conf.get(ES_MAPPING_FIELD_OPT, ES_MAPPING_FIELD);
+        String idFieldName = conf.get(ES_ID_FIELD_OPT, ES_ID_FIELD);
+        Integer bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE_OPT, ES_BULK_SIZE));
+        return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName, indexFieldName, mappingFieldName, idFieldName, bulkSize);
+    }
+
+    public void setLocalElasticSearchInstallation(JobConf conf) {
+        String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+        String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+        System.setProperty(ES_CONFIG_OPT, esConfigPath);
+        System.setProperty(ES_PLUGINS_OPT, esPluginsPath);
+        LOG.info("Using Elasticsearch configuration file at "+esConfigPath+" and plugin directory "+esPluginsPath);
+    }
+
+    public ElasticSearchStreamingOutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+        return new ElasticSearchStreamingOutputCommitter();
+    }
+
+    public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
+    }
+}
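The options above are ordinary Hadoop job configuration keys, so they can be supplied with the usual -D generic options or set in code. Below is a minimal sketch, not part of the gem, of wiring this output format into an old-style mapred job; the option names mirror the constants defined above, while the index name, mapping name, and bulk size are hypothetical values.

import org.apache.hadoop.mapred.JobConf;

import com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat;

public class WonderdogOutputJobSketch {
    // Configure a JobConf so that task output is written straight into Elasticsearch.
    public static JobConf configure(JobConf conf) {
        conf.set("elasticsearch.output.index",     "tweets");  // default: "hadoop"
        conf.set("elasticsearch.output.mapping",   "tweet");   // default: "streaming_record"
        conf.set("elasticsearch.output.bulk_size", "500");     // default: 100 actions per bulk request
        conf.set("es.config",       "/etc/elasticsearch/elasticsearch.yml");   // same as the built-in default
        conf.set("es.path.plugins", "/usr/local/share/elasticsearch/plugins"); // same as the built-in default
        conf.setOutputFormat(ElasticSearchStreamingOutputFormat.class);
        return conf;
    }
}

Each map or reduce task gets its own record writer (and its own embedded client node), so the bulk size is a per-task batch size, not a job-wide one.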
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
@@ -0,0 +1,176 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.io.*;
+
+import org.elasticsearch.common.unit.TimeValue;
+
+import org.elasticsearch.node.Node;
+import org.elasticsearch.node.NodeBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.action.search.SearchRequestBuilder;
+import org.elasticsearch.action.search.SearchScrollRequestBuilder;
+
+import org.elasticsearch.search.SearchHit;
+import org.elasticsearch.search.Scroll;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.cluster.ClusterName;
+
+class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
+
+    static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordReader.class);
+
+    private static final String ES_REQUEST_SIZE_OPT = "elasticsearch.input.request_size";
+    private static final String ES_REQUEST_SIZE = "100";
+    private Integer requestSize;
+
+    private static final String ES_SCROLL_TIMEOUT_OPT = "elasticsearch.input.scroll_timeout";
+    private static final String ES_SCROLL_TIMEOUT = "5m";
+    private String scrollTimeout;
+    private static final TimeValue defaultScrollTimeout = new TimeValue((long) 300000); // 5 minutes
+    private Scroll scroll;
+
+    private Node node;
+    private Client client;
+    private ElasticSearchStreamingSplit split;
+
+    private String scrollId;
+    private Integer recordsRead;
+    private Iterator<SearchHit> hitsItr = null;
+
+    public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf) {
+        this.split = (ElasticSearchStreamingSplit) split;
+        this.recordsRead = 0;
+        this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE_OPT, ES_REQUEST_SIZE));
+        this.scrollTimeout = conf.get(ES_SCROLL_TIMEOUT_OPT, ES_SCROLL_TIMEOUT);
+        this.scroll = new Scroll(TimeValue.parseTimeValue(this.scrollTimeout, defaultScrollTimeout));
+
+        LOG.info("Initializing "+this.split.getSummary());
+        startEmbeddedClient();
+        fetchNextHits();
+    }
+
+    private void fetchNextHits() {
+        if (scrollId == null) {
+            LOG.info("Running initial scroll with timeout "+scrollTimeout);
+            SearchRequestBuilder request = split.initialScrollRequest(client, scroll, requestSize);
+            SearchResponse response = request.execute().actionGet();
+            this.scrollId = response.scrollId();
+            LOG.info("Got scroll ID "+scrollId);
+            // Do we need to call fetchNextHits() again here?  Or does
+            // the initial request also itself contain the first set
+            // of hits for the scroll?
+            //
+            // fetchNextHits();
+        } else {
+            // LOG.info("Running query for scroll ID "+scrollId+" with timeout "+scrollTimeout);
+            SearchScrollRequestBuilder request = split.scrollRequest(client, scroll, scrollId);
+            SearchResponse response = request.execute().actionGet();
+            this.scrollId = response.scrollId();
+            // LOG.info("Got scroll ID "+scrollId);
+            this.hitsItr = response.hits().iterator();
+        }
+    }
+
+    @Override
+    public boolean next(K key, V value) throws IOException {
+        if (shouldReadAnotherRecord()) {
+            // We should read more records because we haven't read as
+            // many as we know to be in this split yet.
+            if (hasAnotherRecord()) {
+                // We already have records stacked up ready to read.
+                readRecord(key, value);
+                return true;
+            } else {
+                // We don't have records stacked up so we might need
+                // to fetch some more hits.
+                fetchNextHits();
+                if (hasAnotherRecord()) {
+                    // Now if we have records we read one
+                    readRecord(key, value);
+                    return true;
+                } else {
+                    // But if no records are here this time, it's
+                    // because we know we're done reading the input.
+                    return false;
+                }
+            }
+        } else {
+            // Return false as we're done with this split.
+            return false;
+        }
+    }
+
+    private boolean shouldReadAnotherRecord() {
+        return recordsRead < split.getSize();
+    }
+
+    private boolean hasAnotherRecord() {
+        return hitsItr != null && hitsItr.hasNext();
+    }
+
+    private void readRecord(K key, V value) {
+        SearchHit hit = hitsItr.next();
+        if (hit != null) {
+            Text keyText = (Text) key;
+            Text valueText = (Text) value;
+            keyText.set(hit.sourceAsString());
+            valueText.set(hit.sourceAsString());
+            recordsRead += 1;
+        }
+    }
+
+    @Override
+    public K createKey() {
+        return (K) new Text();
+    }
+
+    @Override
+    public V createValue() {
+        return (V) new Text();
+    }
+
+    @Override
+    public long getPos() throws IOException {
+        return recordsRead;
+    }
+
+    @Override
+    public float getProgress() throws IOException {
+        return ((float) recordsRead) / ((float) split.getSize());
+    }
+
+    @Override
+    public void close() throws IOException {
+        stopEmbeddedClient();
+    }
+
+    //
+    // == Connecting to Elasticsearch ==
+    //
+
+    private void startEmbeddedClient() {
+        LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+        this.node = NodeBuilder.nodeBuilder().client(true).node();
+        this.client = node.client();
+        LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+    }
+
+    private void stopEmbeddedClient() {
+        LOG.info("Stopping embedded Elasticsearch client...");
+        if (client != null) client.close();
+        if (node != null) node.close();
+        LOG.info("Left Elasticsearch cluster");
+    }
+
+
+}
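Two job options govern how this reader pulls data: elasticsearch.input.request_size (how many hits each scroll round-trip asks for, default 100) and elasticsearch.input.scroll_timeout (how long Elasticsearch keeps the scroll cursor alive between round-trips, default "5m"). A small sketch, not part of the gem, with hypothetical values:

import org.apache.hadoop.mapred.JobConf;

public class WonderdogInputTuningSketch {
    // Tune the scroll-based read performed by ElasticSearchStreamingRecordReader.
    public static JobConf configure(JobConf conf) {
        conf.set("elasticsearch.input.request_size",   "500"); // hits fetched per scroll request (default 100)
        conf.set("elasticsearch.input.scroll_timeout", "10m"); // scroll keep-alive between requests (default 5m)
        return conf;
    }
}

Each call to next() hands back one hit, copying its _source JSON into both the key and the value as Text, so downstream streaming scripts see one JSON document per line.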
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
@@ -0,0 +1,171 @@
+package com.infochimps.elasticsearch;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.*;
+
+import org.elasticsearch.cluster.ClusterName;
+import org.elasticsearch.node.Node;
+import org.elasticsearch.node.NodeBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.client.Requests;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.ExceptionsHelper;
+
+import org.codehaus.jackson.map.ObjectMapper;
+import org.codehaus.jackson.JsonParseException;
+
+class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
+
+    static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordWriter.class);
+
+    private String defaultIndexName;
+    private String defaultMappingName;
+    private String indexFieldName;
+    private String mappingFieldName;
+    private String idFieldName;
+    private Integer bulkSize;
+
+    // Bookkeeping
+    private AtomicLong totalBulkTime = new AtomicLong();
+    private AtomicLong totalBulkItems = new AtomicLong();
+    private Random randgen = new Random();
+    private long runStartTime = System.currentTimeMillis();
+
+    // Elasticsearch indexing
+    private Node node;
+    private Client client;
+    private volatile BulkRequestBuilder currentRequest;
+
+    // JSON parsing
+    private ObjectMapper mapper;
+
+    //
+    // == Lifecycle ==
+    //
+
+    public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, Integer bulkSize) {
+        this.defaultIndexName = defaultIndexName;
+        this.defaultMappingName = defaultMappingName;
+        this.indexFieldName = indexFieldName;
+        this.mappingFieldName = mappingFieldName;
+        this.idFieldName = idFieldName;
+        this.bulkSize = bulkSize;
+
+        LOG.info("Writing "+Integer.toString(bulkSize)+" records per batch");
+        LOG.info("Using default target /"+defaultIndexName+"/"+defaultMappingName);
+        LOG.info("Records override default target with index field '"+indexFieldName+"', mapping field '"+mappingFieldName+"', and ID field '"+idFieldName+"'");
+
+        startEmbeddedClient();
+        this.currentRequest = client.prepareBulk();
+        this.mapper = new ObjectMapper();
+    }
+
+    /**
+       Start an embedded Elasticsearch client.  The client will not be
+       a data node and will not store data locally.
+
+       The client will connect to the target Elasticsearch cluster as
+       a client node, enabling one-hop writes for all data.  See
+       http://www.elasticsearch.org/guide/reference/java-api/client.html
+     */
+    private void startEmbeddedClient() {
+        LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+        this.node = NodeBuilder.nodeBuilder().client(true).node();
+        this.client = node.client();
+        LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+    }
+
+
+    /**
+       Close the Elasticsearch client, sending out one last bulk write
+       if necessary.
+     */
+    public void close(Reporter reporter) throws IOException {
+        sendBulkRequestIfMoreThan(0);
+        LOG.info("Shutting down Elasticsearch client...");
+        if (client != null) client.close();
+        if (node != null) node.close();
+        LOG.info("Successfully shut down Elasticsearch client");
+    }
+
+    //
+    // == Writing records ==
+    //
+
+    public void write(K key, V value) throws IOException {
+        String json = ((Text) key).toString();
+        try {
+            index(json);
+            sendBulkRequestIfBigEnough();
+        } catch (Exception e) {
+            if (ExceptionsHelper.unwrapCause(e) instanceof JsonParseException) {
+                LOG.debug("Bad record: "+json);
+                return;
+            } else {
+                LOG.error("Could not write record: "+json, e);
+            }
+        }
+    }
+
+    private void index(String json) throws IOException {
+        Map<String, Object> record = mapper.readValue(json, Map.class);
+        if (record.containsKey(idFieldName)) {
+            Object idValue = record.get(idFieldName);
+            currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json));
+        } else {
+            currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json));
+        }
+    }
+
+    private String indexNameForRecord(Map<String, Object> record) {
+        if (record.containsKey(indexFieldName)) {
+            Object indexValue = record.get(indexFieldName);
+            return String.valueOf(indexValue);
+        } else {
+            return defaultIndexName;
+        }
+    }
+
+    private String mappingNameForRecord(Map<String, Object> record) {
+        if (record.containsKey(mappingFieldName)) {
+            Object mappingValue = record.get(mappingFieldName);
+            return String.valueOf(mappingValue);
+        } else {
+            return defaultMappingName;
+        }
+    }
+
+    //
+    // == Bulk request handling ==
+    //
+
+    private void sendBulkRequestIfBigEnough() {
+        sendBulkRequestIfMoreThan(bulkSize);
+    }
+
+    private void sendBulkRequestIfMoreThan(int size) {
+        totalBulkItems.incrementAndGet();
+        if (currentRequest.numberOfActions() > size) {
+            long startTime = System.currentTimeMillis();
+            BulkResponse response = currentRequest.execute().actionGet();
+            totalBulkTime.addAndGet(System.currentTimeMillis() - startTime);
+            if (randgen.nextDouble() < 0.1) {
+                LOG.info("Indexed [" + totalBulkItems.get() + "] in [" + (totalBulkTime.get()/1000) + "s] of indexing, [" + ((System.currentTimeMillis() - runStartTime)/1000) + "s] of wall clock, for [" + (float) (1000.0*totalBulkItems.get())/(System.currentTimeMillis() - runStartTime) + " rec/s]");
+            }
+            currentRequest = client.prepareBulk();
+        }
+    }
+}
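The writer expects each record's key to be a single line of JSON. With the default field names, a document such as the following (all values hypothetical) would be indexed with ID 123 under mapping tweet in index tweets, overriding the job-wide defaults; the routing fields themselves remain part of the stored _source, since the raw JSON is sent unchanged:

    {"_index": "tweets", "_mapping": "tweet", "_id": "123", "text": "hello wonderdog"}

Records that fail JSON parsing are logged at DEBUG and dropped; any other indexing error is logged at ERROR but does not fail the task.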