wonderdog 0.0.1

Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
@@ -0,0 +1,37 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.mapred.JobContext;
+ import org.apache.hadoop.mapred.OutputCommitter;
+ import org.apache.hadoop.mapred.TaskAttemptContext;
+
+ public class ElasticSearchStreamingOutputCommitter extends OutputCommitter {
+
+     @Override
+     public void setupJob(JobContext context) throws IOException {
+
+     }
+
+     @Override
+     public void cleanupJob(JobContext context) throws IOException {
+     }
+
+     @Override
+     public void setupTask(TaskAttemptContext context) throws IOException {
+     }
+
+     @Override
+     public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
+         return false;
+     }
+
+     @Override
+     public void commitTask(TaskAttemptContext context) throws IOException {
+     }
+
+     @Override
+     public void abortTask(TaskAttemptContext context) throws IOException {
+     }
+
+ }
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
@@ -0,0 +1,88 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.TaskAttemptContext;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.RecordWriter;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.mapred.OutputFormat;
+ import org.apache.hadoop.util.*;
+ import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+ /**
+  * Hadoop OutputFormat for writing arbitrary MapWritables (essentially
+  * HashMaps) into Elasticsearch. Records are batched up and sent in a
+  * one-hop manner to the Elasticsearch data nodes that will index
+  * them.
+  */
+ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingOutputFormat.class);
+
+     // Job settings we need to control directly from Java options.
+     private static final String ES_INDEX_OPT = "elasticsearch.output.index";
+     private static final String ES_DEFAULT_INDEX = "hadoop";
+     private String defaultIndexName;
+
+     private static final String ES_MAPPING_OPT = "elasticsearch.output.mapping";
+     private static final String ES_DEFAULT_MAPPING = "streaming_record";
+     private String defaultMappingName;
+
+     private static final String ES_INDEX_FIELD_OPT = "elasticsearch.output.index.field";
+     private static final String ES_INDEX_FIELD = "_index";
+     private String indexFieldName;
+
+     private static final String ES_MAPPING_FIELD_OPT = "elasticsearch.output.mapping.field";
+     private static final String ES_MAPPING_FIELD = "_mapping";
+     private String mappingFieldName;
+
+     private static final String ES_ID_FIELD_OPT = "elasticsearch.output.id.field";
+     private static final String ES_ID_FIELD = "_id";
+     private String idFieldName;
+
+     private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
+     private static final String ES_BULK_SIZE = "100";
+     private int bulkSize;
+
+     // Elasticsearch internal settings required to make a client
+     // connection.
+     private static final String ES_CONFIG_OPT = "es.config";
+     private static final String ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+
+     private static final String ES_PLUGINS_OPT = "es.path.plugins";
+     private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+
+     public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf conf, String name, Progressable progress) throws IOException {
+         setLocalElasticSearchInstallation(conf);
+         String defaultIndexName = conf.get(ES_INDEX_OPT, ES_DEFAULT_INDEX);
+         String defaultMappingName = conf.get(ES_MAPPING_OPT, ES_DEFAULT_MAPPING);
+         String indexFieldName = conf.get(ES_INDEX_FIELD_OPT, ES_INDEX_FIELD);
+         String mappingFieldName = conf.get(ES_MAPPING_FIELD_OPT, ES_MAPPING_FIELD);
+         String idFieldName = conf.get(ES_ID_FIELD_OPT, ES_ID_FIELD);
+         Integer bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE_OPT, ES_BULK_SIZE));
+         return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName, indexFieldName, mappingFieldName, idFieldName, bulkSize);
+     }
+
+     public void setLocalElasticSearchInstallation(JobConf conf) {
+         String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+         String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+         System.setProperty(ES_CONFIG_OPT, esConfigPath);
+         System.setProperty(ES_PLUGINS_OPT, esPluginsPath);
+         LOG.info("Using Elasticsearch configuration file at "+esConfigPath+" and plugin directory "+esPluginsPath);
+     }
+
+     public ElasticSearchStreamingOutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+         return new ElasticSearchStreamingOutputCommitter();
+     }
+
+     public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
+     }
+ }
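
Taken together, the options above give a Hadoop job full control over where records land. Below is a minimal, hypothetical driver sketch (not part of wonderdog) that sets the elasticsearch.output.* keys this class reads from its JobConf; the index and mapping names are invented for illustration.

import org.apache.hadoop.mapred.JobConf;
import com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat;

public class WonderdogOutputJobSketch {
    // Returns a JobConf configured to write its output into Elasticsearch.
    public static JobConf configure(JobConf conf) {
        conf.set("elasticsearch.output.index",         "tweets");   // default: "hadoop"
        conf.set("elasticsearch.output.mapping",       "tweet");    // default: "streaming_record"
        conf.set("elasticsearch.output.index.field",   "_index");   // per-record index override field
        conf.set("elasticsearch.output.mapping.field", "_mapping"); // per-record mapping override field
        conf.set("elasticsearch.output.id.field",      "_id");      // per-record document ID field
        conf.set("elasticsearch.output.bulk_size",     "100");      // records per bulk request
        conf.set("es.config",       "/etc/elasticsearch/elasticsearch.yml");
        conf.set("es.path.plugins", "/usr/local/share/elasticsearch/plugins");
        conf.setOutputFormat(ElasticSearchStreamingOutputFormat.class);
        return conf;
    }
}

Because these are ordinary JobConf properties, a Hadoop Streaming job should be able to pass the same keys on the command line with -D instead of setting them in Java.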
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
@@ -0,0 +1,176 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import java.util.Iterator;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.mapred.RecordReader;
+ import org.apache.hadoop.mapred.InputSplit;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.io.*;
+
+ import org.elasticsearch.common.unit.TimeValue;
+
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.action.search.SearchRequestBuilder;
+ import org.elasticsearch.action.search.SearchScrollRequestBuilder;
+
+ import org.elasticsearch.search.SearchHit;
+ import org.elasticsearch.search.Scroll;
+ import org.elasticsearch.action.search.SearchResponse;
+ import org.elasticsearch.cluster.ClusterName;
+
+ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordReader.class);
+
+     private static final String ES_REQUEST_SIZE_OPT = "elasticsearch.input.request_size";
+     private static final String ES_REQUEST_SIZE = "100";
+     private Integer requestSize;
+
+     private static final String ES_SCROLL_TIMEOUT_OPT = "elasticsearch.input.scroll_timeout";
+     private static final String ES_SCROLL_TIMEOUT = "5m";
+     private String scrollTimeout;
+     private static final TimeValue defaultScrollTimeout = new TimeValue((long) 300000); // 5 minutes
+     private Scroll scroll;
+
+     private Node node;
+     private Client client;
+     private ElasticSearchStreamingSplit split;
+
+     private String scrollId;
+     private Integer recordsRead;
+     private Iterator<SearchHit> hitsItr = null;
+
+     public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf) {
+         this.split = (ElasticSearchStreamingSplit) split;
+         this.recordsRead = 0;
+         this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE_OPT, ES_REQUEST_SIZE));
+         this.scrollTimeout = conf.get(ES_SCROLL_TIMEOUT_OPT, ES_SCROLL_TIMEOUT);
+         this.scroll = new Scroll(TimeValue.parseTimeValue(this.scrollTimeout, defaultScrollTimeout));
+
+         LOG.info("Initializing "+this.split.getSummary());
+         startEmbeddedClient();
+         fetchNextHits();
+     }
+
+     private void fetchNextHits() {
+         if (scrollId == null) {
+             LOG.info("Running initial scroll with timeout "+scrollTimeout);
+             SearchRequestBuilder request = split.initialScrollRequest(client, scroll, requestSize);
+             SearchResponse response = request.execute().actionGet();
+             this.scrollId = response.scrollId();
+             LOG.info("Got scroll ID "+scrollId);
+             // Do we need to call fetchNextHits() again here? Or does
+             // the initial request also itself contain the first set
+             // of hits for the scroll?
+             //
+             // fetchNextHits();
+         } else {
+             // LOG.info("Running query for scroll ID "+scrollId+" with timeout "+scrollTimeout);
+             SearchScrollRequestBuilder request = split.scrollRequest(client, scroll, scrollId);
+             SearchResponse response = request.execute().actionGet();
+             this.scrollId = response.scrollId();
+             // LOG.info("Got scroll ID "+scrollId);
+             this.hitsItr = response.hits().iterator();
+         }
+     }
+
+     @Override
+     public boolean next(K key, V value) throws IOException {
+         if (shouldReadAnotherRecord()) {
+             // We should read more records because we haven't read as
+             // many as we know to be in this split yet.
+             if (hasAnotherRecord()) {
+                 // We already have records stacked up ready to read.
+                 readRecord(key, value);
+                 return true;
+             } else {
+                 // We don't have records stacked up so we might need
+                 // to fetch some more hits.
+                 fetchNextHits();
+                 if (hasAnotherRecord()) {
+                     // Now if we have records we read one.
+                     readRecord(key, value);
+                     return true;
+                 } else {
+                     // But if no records are here this time, it's
+                     // because we know we're done reading the input.
+                     return false;
+                 }
+             }
+         } else {
+             // Return false as we're done with this split.
+             return false;
+         }
+     }
+
+     private boolean shouldReadAnotherRecord() {
+         return recordsRead < split.getSize();
+     }
+
+     private boolean hasAnotherRecord() {
+         return hitsItr != null && hitsItr.hasNext();
+     }
+
+     private void readRecord(K key, V value) {
+         SearchHit hit = hitsItr.next();
+         if (hit != null) {
+             Text keyText = (Text) key;
+             Text valueText = (Text) value;
+             keyText.set(hit.sourceAsString());
+             valueText.set(hit.sourceAsString());
+             recordsRead += 1;
+         }
+     }
+
+     @Override
+     public K createKey() {
+         return (K) new Text();
+     }
+
+     @Override
+     public V createValue() {
+         return (V) new Text();
+     }
+
+     @Override
+     public long getPos() throws IOException {
+         return recordsRead;
+     }
+
+     @Override
+     public float getProgress() throws IOException {
+         return ((float) recordsRead) / ((float) split.getSize());
+     }
+
+     @Override
+     public void close() throws IOException {
+         stopEmbeddedClient();
+     }
+
+     //
+     // == Connecting to Elasticsearch ==
+     //
+
+     private void startEmbeddedClient() {
+         LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+         this.node = NodeBuilder.nodeBuilder().client(true).node();
+         this.client = node.client();
+         LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+     }
+
+     private void stopEmbeddedClient() {
+         LOG.info("Stopping embedded Elasticsearch client...");
+         if (client != null) client.close();
+         if (node != null) node.close();
+         LOG.info("Left Elasticsearch cluster");
+     }
+
+ }
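
This reader pages through a single split with Elasticsearch's scroll API, pulling requestSize hits per round trip until it has handed Hadoop split.getSize() records, with both key and value holding the hit's _source JSON. Below is a minimal sketch (not part of wonderdog) of the input-side keys it consults; pairing them with ElasticSearchStreamingInputFormat from the file list is an assumption about how the pieces are wired together.

import org.apache.hadoop.mapred.JobConf;
import com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat;

public class WonderdogInputJobSketch {
    // Returns a JobConf configured to read its input from Elasticsearch.
    public static JobConf configure(JobConf conf) {
        conf.set("elasticsearch.input.request_size",   "500"); // hits fetched per scroll request (default "100")
        conf.set("elasticsearch.input.scroll_timeout", "10m"); // how long Elasticsearch keeps the scroll alive (default "5m")
        conf.setInputFormat(ElasticSearchStreamingInputFormat.class);
        return conf;
    }
}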
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
@@ -0,0 +1,171 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.util.Map;
+ import java.util.concurrent.atomic.AtomicLong;
+ import java.util.Random;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.RecordWriter;
+ import org.apache.hadoop.mapred.Reporter;
+ import org.apache.hadoop.util.*;
+
+ import org.elasticsearch.cluster.ClusterName;
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.client.Requests;
+ import org.elasticsearch.action.bulk.BulkRequestBuilder;
+ import org.elasticsearch.action.bulk.BulkResponse;
+ import org.elasticsearch.ExceptionsHelper;
+
+ import org.codehaus.jackson.map.ObjectMapper;
+ import org.codehaus.jackson.JsonParseException;
+
+ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordWriter.class);
+
+     private String defaultIndexName;
+     private String defaultMappingName;
+     private String indexFieldName;
+     private String mappingFieldName;
+     private String idFieldName;
+     private Integer bulkSize;
+
+     // Bookkeeping
+     private AtomicLong totalBulkTime = new AtomicLong();
+     private AtomicLong totalBulkItems = new AtomicLong();
+     private Random randgen = new Random();
+     private long runStartTime = System.currentTimeMillis();
+
+     // Elasticsearch indexing
+     private Node node;
+     private Client client;
+     private volatile BulkRequestBuilder currentRequest;
+
+     // JSON parsing
+     private ObjectMapper mapper;
+
+     //
+     // == Lifecycle ==
+     //
+
+     public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, Integer bulkSize) {
+         this.defaultIndexName = defaultIndexName;
+         this.defaultMappingName = defaultMappingName;
+         this.indexFieldName = indexFieldName;
+         this.mappingFieldName = mappingFieldName;
+         this.idFieldName = idFieldName;
+         this.bulkSize = bulkSize;
+
+         LOG.info("Writing "+Integer.toString(bulkSize)+" records per batch");
+         LOG.info("Using default target /"+defaultIndexName+"/"+defaultMappingName);
+         LOG.info("Records override default target with index field '"+indexFieldName+"', mapping field '"+mappingFieldName+"', and ID field '"+idFieldName+"'");
+
+         startEmbeddedClient();
+         this.currentRequest = client.prepareBulk();
+         this.mapper = new ObjectMapper();
+     }
+
+     /**
+      * Start an embedded Elasticsearch client. The client will not be
+      * a data node and will not store data locally.
+      *
+      * The client will connect to the target Elasticsearch cluster as
+      * a client node, enabling one-hop writes for all data. See
+      * http://www.elasticsearch.org/guide/reference/java-api/client.html
+      */
+     private void startEmbeddedClient() {
+         LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+         this.node = NodeBuilder.nodeBuilder().client(true).node();
+         this.client = node.client();
+         LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+     }
+
+     /**
+      * Close the Elasticsearch client, sending out one last bulk write
+      * if necessary.
+      */
+     public void close(Reporter reporter) throws IOException {
+         sendBulkRequestIfMoreThan(0);
+         LOG.info("Shutting down Elasticsearch client...");
+         if (client != null) client.close();
+         if (node != null) node.close();
+         LOG.info("Successfully shut down Elasticsearch client");
+     }
+
+     //
+     // == Writing records ==
+     //
+
+     public void write(K key, V value) throws IOException {
+         String json = ((Text) key).toString();
+         try {
+             index(json);
+             sendBulkRequestIfBigEnough();
+         } catch(Exception e) {
+             if (ExceptionsHelper.unwrapCause(e) instanceof JsonParseException) {
+                 LOG.debug("Bad record: "+json);
+                 return;
+             } else {
+                 LOG.error("Could not write record: "+json, e);
+             }
+         }
+     }
+
+     private void index(String json) throws IOException {
+         Map<String, Object> record = mapper.readValue(json, Map.class);
+         if (record.containsKey(idFieldName)) {
+             Object idValue = record.get(idFieldName);
+             currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json));
+         } else {
+             currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json));
+         }
+     }
+
+     private String indexNameForRecord(Map<String, Object> record) {
+         if (record.containsKey(indexFieldName)) {
+             Object indexValue = record.get(indexFieldName);
+             return String.valueOf(indexValue);
+         } else {
+             return defaultIndexName;
+         }
+     }
+
+     private String mappingNameForRecord(Map<String, Object> record) {
+         if (record.containsKey(mappingFieldName)) {
+             Object mappingValue = record.get(mappingFieldName);
+             return String.valueOf(mappingValue);
+         } else {
+             return defaultMappingName;
+         }
+     }
+
+     //
+     // == Bulk request handling ==
+     //
+
+     private void sendBulkRequestIfBigEnough() {
+         sendBulkRequestIfMoreThan(bulkSize);
+     }
+
+     private void sendBulkRequestIfMoreThan(int size) {
+         totalBulkItems.incrementAndGet();
+         if (currentRequest.numberOfActions() > size) {
+             long startTime = System.currentTimeMillis();
+             BulkResponse response = currentRequest.execute().actionGet();
+             totalBulkTime.addAndGet(System.currentTimeMillis() - startTime);
+             if (randgen.nextDouble() < 0.1) {
+                 LOG.info("Indexed ["+totalBulkItems.get()+"] in ["+(totalBulkTime.get()/1000)+"s] of indexing, ["+((System.currentTimeMillis() - runStartTime)/1000)+"s] of wall clock, for ["+(float) (1000.0*totalBulkItems.get())/(System.currentTimeMillis() - runStartTime)+" rec/s]");
+             }
+             currentRequest = client.prepareBulk();
+         }
+     }
+ }
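
The writer parses each incoming record as JSON and routes it using the per-record index, mapping, and ID fields when present, falling back to the job-wide defaults otherwise; the accumulated bulk request is flushed once it holds more than bulkSize actions. Below is a minimal, self-contained sketch (not part of wonderdog) of that routing rule, assuming the default field names; the sample documents are invented.

import java.util.Map;
import org.codehaus.jackson.map.ObjectMapper;

public class RoutingSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        String[] records = {
            "{\"user\":\"joe\",\"text\":\"hello\"}",                                        // no overrides
            "{\"_index\":\"tweets\",\"_mapping\":\"tweet\",\"_id\":\"42\",\"text\":\"hi\"}" // full override
        };
        for (String json : records) {
            Map<String, Object> record = mapper.readValue(json, Map.class);
            // Mirror indexNameForRecord/mappingNameForRecord from the class above.
            String index   = record.containsKey("_index")   ? String.valueOf(record.get("_index"))   : "hadoop";
            String mapping = record.containsKey("_mapping") ? String.valueOf(record.get("_mapping")) : "streaming_record";
            String id      = record.containsKey("_id")      ? String.valueOf(record.get("_id"))      : "(auto)";
            System.out.println(json + " -> /" + index + "/" + mapping + " id=" + id);
        }
    }
}

The first record would be indexed into the default /hadoop/streaming_record target with an auto-generated ID; the second would land in /tweets/tweet as document 42.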