wonderdog 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
@@ -0,0 +1,37 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.mapred.JobContext;
+ import org.apache.hadoop.mapred.OutputCommitter;
+ import org.apache.hadoop.mapred.TaskAttemptContext;
+
+ public class ElasticSearchStreamingOutputCommitter extends OutputCommitter {
+
+     @Override
+     public void setupJob(JobContext context) throws IOException {
+
+     }
+
+     @Override
+     public void cleanupJob(JobContext context) throws IOException {
+     }
+
+     @Override
+     public void setupTask(TaskAttemptContext context) throws IOException {
+     }
+
+     @Override
+     public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
+         return false;
+     }
+
+     @Override
+     public void commitTask(TaskAttemptContext context) throws IOException {
+     }
+
+     @Override
+     public void abortTask(TaskAttemptContext context) throws IOException {
+     }
+
+ }
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
@@ -0,0 +1,88 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.TaskAttemptContext;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.RecordWriter;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.mapred.OutputFormat;
+ import org.apache.hadoop.util.*;
+ import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+ /**
+
+    Hadoop OutputFormat for writing arbitrary MapWritables (essentially
+    HashMaps) into Elasticsearch. Records are batched up and sent in a
+    one-hop manner to the elastic search data nodes that will index
+    them.
+
+ */
+ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingOutputFormat.class);
+
+     // Job settings we need to control directly from Java options.
+     private static final String ES_INDEX_OPT = "elasticsearch.output.index";
+     private static final String ES_DEFAULT_INDEX = "hadoop";
+     private String defaultIndexName;
+
+     private static final String ES_MAPPING_OPT = "elasticsearch.output.mapping";
+     private static final String ES_DEFAULT_MAPPING = "streaming_record";
+     private String defaultMappingName;
+
+     private static final String ES_INDEX_FIELD_OPT = "elasticsearch.output.index.field";
+     private static final String ES_INDEX_FIELD = "_index";
+     private String indexFieldName;
+
+     private static final String ES_MAPPING_FIELD_OPT = "elasticsearch.output.mapping.field";
+     private static final String ES_MAPPING_FIELD = "_mapping";
+     private String mappingFieldName;
+
+     private static final String ES_ID_FIELD_OPT = "elasticsearch.output.id.field";
+     private static final String ES_ID_FIELD = "_id";
+     private String idFieldName;
+
+     private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
+     private static final String ES_BULK_SIZE = "100";
+     private int bulkSize;
+
+
+     // Elasticsearch internal settings required to make a client
+     // connection.
+     private static final String ES_CONFIG_OPT = "es.config";
+     private static final String ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+
+     private static final String ES_PLUGINS_OPT = "es.path.plugins";
+     private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+
+     public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf conf, String name, Progressable progress) throws IOException {
+         setLocalElasticSearchInstallation(conf);
+         String defaultIndexName = conf.get(ES_INDEX_OPT, ES_DEFAULT_INDEX);
+         String defaultMappingName = conf.get(ES_MAPPING_OPT, ES_DEFAULT_MAPPING);
+         String indexFieldName = conf.get(ES_INDEX_FIELD_OPT, ES_INDEX_FIELD);
+         String mappingFieldName = conf.get(ES_MAPPING_FIELD_OPT, ES_MAPPING_FIELD);
+         String idFieldName = conf.get(ES_ID_FIELD_OPT, ES_ID_FIELD);
+         Integer bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE_OPT, ES_BULK_SIZE));
+         return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName, indexFieldName, mappingFieldName, idFieldName, bulkSize);
+     }
+
+     public void setLocalElasticSearchInstallation(JobConf conf) {
+         String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+         String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+         System.setProperty(ES_CONFIG_OPT, esConfigPath);
+         System.setProperty(ES_PLUGINS_OPT, esPluginsPath);
+         LOG.info("Using Elasticsearch configuration file at "+esConfigPath+" and plugin directory "+esPluginsPath);
+     }
+
+     public ElasticSearchStreamingOutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+         return new ElasticSearchStreamingOutputCommitter();
+     }
+
+     public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
+     }
+ }
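The output format above is configured entirely through job settings: the elasticsearch.output.* keys choose the default index, mapping, per-record override fields, and bulk size, while es.config and es.path.plugins point the embedded client at a local Elasticsearch installation. In this gem these classes appear to be driven from the Ruby side via Hadoop Streaming (see lib/wonderdog/hadoop_invocation_override.rb), but an illustrative sketch of a plain org.apache.hadoop.mapred driver shows the same knobs; the class name, index names, and paths below are hypothetical and not part of the package:

    // Illustrative sketch, not part of the gem: a classic mapred driver that
    // indexes one JSON document per input line. Class, index, and path names
    // here are hypothetical.
    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.TextInputFormat;

    import com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat;

    public class IndexJsonRecords {

        // The record writer reads the JSON document from the *key*, so the
        // mapper passes each input line through as a Text key.
        public static class PassJsonMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, NullWritable> {
            public void map(LongWritable offset, Text line,
                            OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
                output.collect(line, NullWritable.get());
            }
        }

        public static void main(String[] args) throws Exception {
            JobConf conf = new JobConf(IndexJsonRecords.class);

            // Routing and batching; unset keys fall back to the defaults baked
            // into the OutputFormat ("hadoop", "streaming_record", 100).
            conf.set("elasticsearch.output.index",     "tweets");
            conf.set("elasticsearch.output.mapping",   "tweet");
            conf.set("elasticsearch.output.bulk_size", "500");

            // Local Elasticsearch config and plugin directory for the embedded client.
            conf.set("es.config",       "/etc/elasticsearch/elasticsearch.yml");
            conf.set("es.path.plugins", "/usr/local/share/elasticsearch/plugins");

            conf.setMapperClass(PassJsonMapper.class);
            conf.setNumReduceTasks(0);  // map-only indexing job
            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(ElasticSearchStreamingOutputFormat.class);
            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(NullWritable.class);

            FileInputFormat.setInputPaths(conf, new Path(args[0]));
            JobClient.runJob(conf);
        }
    }

Because ElasticSearchStreamingRecordWriter reads the JSON document from the map output key, the sketch uses a map-only job whose mapper emits each input line unchanged as a Text key.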
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
@@ -0,0 +1,176 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import java.util.Iterator;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.mapred.RecordReader;
+ import org.apache.hadoop.mapred.InputSplit;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.io.*;
+
+ import org.elasticsearch.common.unit.TimeValue;
+
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.action.search.SearchRequestBuilder;
+ import org.elasticsearch.action.search.SearchScrollRequestBuilder;
+
+ import org.elasticsearch.search.SearchHit;
+ import org.elasticsearch.search.Scroll;
+ import org.elasticsearch.action.search.SearchResponse;
+ import org.elasticsearch.cluster.ClusterName;
+
+ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordReader.class);
+
+     private static final String ES_REQUEST_SIZE_OPT = "elasticsearch.input.request_size";
+     private static final String ES_REQUEST_SIZE = "100";
+     private Integer requestSize;
+
+     private static final String ES_SCROLL_TIMEOUT_OPT = "elasticsearch.input.scroll_timeout";
+     private static final String ES_SCROLL_TIMEOUT = "5m";
+     private String scrollTimeout;
+     private static final TimeValue defaultScrollTimeout = new TimeValue((long) 300000); // 5 minutes
+     private Scroll scroll;
+
+     private Node node;
+     private Client client;
+     private ElasticSearchStreamingSplit split;
+
+     private String scrollId;
+     private Integer recordsRead;
+     private Iterator<SearchHit> hitsItr = null;
+
+     public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf) {
+         this.split = (ElasticSearchStreamingSplit) split;
+         this.recordsRead = 0;
+         this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE_OPT, ES_REQUEST_SIZE));
+         this.scrollTimeout = conf.get(ES_SCROLL_TIMEOUT_OPT, ES_SCROLL_TIMEOUT);
+         this.scroll = new Scroll(TimeValue.parseTimeValue(this.scrollTimeout, defaultScrollTimeout));
+
+         LOG.info("Initializing "+this.split.getSummary());
+         startEmbeddedClient();
+         fetchNextHits();
+     }
+
+     private void fetchNextHits() {
+         if (scrollId == null) {
+             LOG.info("Running initial scroll with timeout "+scrollTimeout);
+             SearchRequestBuilder request = split.initialScrollRequest(client, scroll, requestSize);
+             SearchResponse response = request.execute().actionGet();
+             this.scrollId = response.scrollId();
+             LOG.info("Got scroll ID "+scrollId);
+             // Do we need to call fetchNextHits() again here? Or does
+             // the initial request also itself contain the first set
+             // of hits for the scroll?
+             //
+             // fetchNextHits();
+         } else {
+             // LOG.info("Running query for scroll ID "+scrollId+" with timeout "+scrollTimeout);
+             SearchScrollRequestBuilder request = split.scrollRequest(client, scroll, scrollId);
+             SearchResponse response = request.execute().actionGet();
+             this.scrollId = response.scrollId();
+             // LOG.info("Got scroll ID "+scrollId);
+             this.hitsItr = response.hits().iterator();
+         }
+     }
+
+     @Override
+     public boolean next(K key, V value) throws IOException {
+         if (shouldReadAnotherRecord()) {
+             // We should read more records because we haven't read as
+             // many as we know to be in this split yet.
+             if (hasAnotherRecord()) {
+                 // We already have records stacked up ready to read.
+                 readRecord(key, value);
+                 return true;
+             } else {
+                 // We don't have records stacked up so we might need
+                 // to fetch some more hits.
+                 fetchNextHits();
+                 if (hasAnotherRecord()) {
+                     // Now if we have records we read one
+                     readRecord(key, value);
+                     return true;
+                 } else {
+                     // But if no records are here this time, it's
+                     // because we know we're done reading the input.
+                     return false;
+                 }
+             }
+         } else {
+             // Return false as we're done with this split.
+             return false;
+         }
+     }
+
+     private boolean shouldReadAnotherRecord() {
+         return recordsRead < split.getSize();
+     }
+
+     private boolean hasAnotherRecord() {
+         return hitsItr != null && hitsItr.hasNext();
+     }
+
+     private void readRecord(K key, V value) {
+         SearchHit hit = hitsItr.next();
+         if (hit != null) {
+             Text keyText = (Text) key;
+             Text valueText = (Text) value;
+             keyText.set(hit.sourceAsString());
+             valueText.set(hit.sourceAsString());
+             recordsRead += 1;
+         }
+     }
+
+     @Override
+     public K createKey() {
+         return (K) new Text();
+     }
+
+     @Override
+     public V createValue() {
+         return (V) new Text();
+     }
+
+     @Override
+     public long getPos() throws IOException {
+         return recordsRead;
+     }
+
+     @Override
+     public float getProgress() throws IOException {
+         return ((float) recordsRead) / ((float) split.getSize());
+     }
+
+     @Override
+     public void close() throws IOException {
+         stopEmbeddedClient();
+     }
+
+     //
+     // == Connecting to Elasticsearch ==
+     //
+
+     private void startEmbeddedClient() {
+         LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+         this.node = NodeBuilder.nodeBuilder().client(true).node();
+         this.client = node.client();
+         LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+     }
+
+     private void stopEmbeddedClient() {
+         LOG.info("Stopping embedded Elasticsearch client...");
+         if (client != null) client.close();
+         if (node != null) node.close();
+         LOG.info("Left Elasticsearch cluster");
+     }
+
+
+ }
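On the input side, the record reader above pages through Elasticsearch with scan-and-scroll: the first request opens a scroll and records its scroll ID, and each later call fetches the next batch of hits until the split's record count is reached. Only two settings are consulted in this file; a minimal sketch of setting them on a mapred JobConf follows (illustrative only; it assumes ElasticSearchStreamingInputFormat implements the classic org.apache.hadoop.mapred.InputFormat interface, and the values are arbitrary):

    // Illustrative sketch only: the two knobs the record reader reads from the
    // job configuration. Which index/mapping/query to scan is handled by
    // ElasticSearchStreamingInputFormat, which is not shown in this section.
    import org.apache.hadoop.mapred.JobConf;

    import com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat;

    public class ScanConfigSketch {
        public static void configure(JobConf conf) {
            // Hits fetched per scroll request; the reader's default is "100".
            conf.set("elasticsearch.input.request_size", "500");
            // Scroll keep-alive between requests; the reader's default is "5m",
            // parsed via TimeValue.parseTimeValue.
            conf.set("elasticsearch.input.scroll_timeout", "10m");
            conf.setInputFormat(ElasticSearchStreamingInputFormat.class);
        }
    }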
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
@@ -0,0 +1,171 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.util.Map;
+ import java.util.concurrent.atomic.AtomicLong;
+ import java.util.Random;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.RecordWriter;
+ import org.apache.hadoop.mapred.Reporter;
+ import org.apache.hadoop.util.*;
+
+ import org.elasticsearch.cluster.ClusterName;
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.client.Requests;
+ import org.elasticsearch.action.bulk.BulkRequestBuilder;
+ import org.elasticsearch.action.bulk.BulkResponse;
+ import org.elasticsearch.ExceptionsHelper;
+
+ import org.codehaus.jackson.map.ObjectMapper;
+ import org.codehaus.jackson.JsonParseException;
+
+ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordWriter.class);
+
+     private String defaultIndexName;
+     private String defaultMappingName;
+     private String indexFieldName;
+     private String mappingFieldName;
+     private String idFieldName;
+     private Integer bulkSize;
+
+     // Bookkeeping
+     private AtomicLong totalBulkTime = new AtomicLong();
+     private AtomicLong totalBulkItems = new AtomicLong();
+     private Random randgen = new Random();
+     private long runStartTime = System.currentTimeMillis();
+
+     // Elasticsearch indexing
+     private Node node;
+     private Client client;
+     private volatile BulkRequestBuilder currentRequest;
+
+     // JSON parsing
+     private ObjectMapper mapper;
+
+     //
+     // == Lifecycle ==
+     //
+
+     public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, Integer bulkSize) {
+         this.defaultIndexName = defaultIndexName;
+         this.defaultMappingName = defaultMappingName;
+         this.indexFieldName = indexFieldName;
+         this.mappingFieldName = mappingFieldName;
+         this.idFieldName = idFieldName;
+         this.bulkSize = bulkSize;
+
+         LOG.info("Writing "+Integer.toString(bulkSize)+" records per batch");
+         LOG.info("Using default target /"+defaultIndexName+"/"+defaultMappingName);
+         LOG.info("Records override default target with index field '"+indexFieldName+"', mapping field '"+mappingFieldName+"', and ID field '"+idFieldName+"'");
+
+         startEmbeddedClient();
+         this.currentRequest = client.prepareBulk();
+         this.mapper = new ObjectMapper();
+     }
+
+     /**
+        Start an embedded Elasticsearch client. The client will not be
+        a data node and will not store data locally.
+
+        The client will connect to the target Elasticsearch cluster as
+        a client node, enabling one-hop writes for all data. See
+        http://www.elasticsearch.org/guide/reference/java-api/client.html
+     */
+     private void startEmbeddedClient() {
+         LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+         this.node = NodeBuilder.nodeBuilder().client(true).node();
+         this.client = node.client();
+         LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+     }
+
+
+     /**
+        Close the Elasticsearch client, sending out one last bulk write
+        if necessary.
+     */
+     public void close(Reporter reporter) throws IOException {
+         sendBulkRequestIfMoreThan(0);
+         LOG.info("Shutting down Elasticsearch client...");
+         if (client != null) client.close();
+         if (node != null) node.close();
+         LOG.info("Successfully shut down Elasticsearch client");
+     }
+
+     //
+     // == Writing records ==
+     //
+
+     public void write(K key, V value) throws IOException {
+         String json = ((Text) key).toString();
+         try {
+             index(json);
+             sendBulkRequestIfBigEnough();
+         } catch(Exception e) {
+             if (ExceptionsHelper.unwrapCause(e) instanceof JsonParseException) {
+                 LOG.debug("Bad record: "+json);
+                 return;
+             } else {
+                 LOG.error("Could not write record: "+json, e);
+             }
+         }
+     }
+
+     private void index(String json) throws IOException {
+         Map<String, Object> record = mapper.readValue(json, Map.class);
+         if (record.containsKey(idFieldName)) {
+             Object idValue = record.get(idFieldName);
+             currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json));
+         } else {
+             currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json));
+         }
+     }
+
+     private String indexNameForRecord(Map<String, Object> record) {
+         if (record.containsKey(indexFieldName)) {
+             Object indexValue = record.get(indexFieldName);
+             return String.valueOf(indexValue);
+         } else {
+             return defaultIndexName;
+         }
+     }
+
+     private String mappingNameForRecord(Map<String, Object> record) {
+         if (record.containsKey(mappingFieldName)) {
+             Object mappingValue = record.get(mappingFieldName);
+             return String.valueOf(mappingValue);
+         } else {
+             return defaultMappingName;
+         }
+     }
+
+     //
+     // == Bulk request handling ==
+     //
+
+     private void sendBulkRequestIfBigEnough() {
+         sendBulkRequestIfMoreThan(bulkSize);
+     }
+
+     private void sendBulkRequestIfMoreThan(int size) {
+         totalBulkItems.incrementAndGet();
+         if (currentRequest.numberOfActions() > size) {
+             long startTime = System.currentTimeMillis();
+             BulkResponse response = currentRequest.execute().actionGet();
+             totalBulkTime.addAndGet(System.currentTimeMillis() - startTime);
+             if (randgen.nextDouble() < 0.1) {
+                 LOG.info("Indexed [" + totalBulkItems.get() + "] in [" + (totalBulkTime.get()/1000) + "s] of indexing, [" + ((System.currentTimeMillis() - runStartTime)/1000) + "s] of wall clock, for [" + (float)(1000.0*totalBulkItems.get())/(System.currentTimeMillis() - runStartTime) + " rec/s]");
+             }
+             currentRequest = client.prepareBulk();
+         }
+     }
+ }
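The record writer treats each incoming record as a standalone JSON document. Per-record _index, _mapping, and _id fields (the names are configurable through the elasticsearch.output.*.field options) override the default index, mapping, and document ID; records that fail JSON parsing are dropped with a DEBUG log; and queued actions are flushed as a bulk request every bulkSize writes, plus once more on close. A minimal sketch of that routing behavior (illustrative only: it must live in the com.infochimps.elasticsearch package because the writer class is package-private, and it assumes a reachable Elasticsearch cluster, since the constructor immediately joins it as a client node):

    // Illustrative sketch only: demonstrates the per-record routing described
    // above, using the default field names "_index", "_mapping", and "_id".
    // The document contents are made up.
    package com.infochimps.elasticsearch;

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;

    public class RoutingSketch {
        public static void main(String[] args) throws Exception {
            ElasticSearchStreamingRecordWriter<Text, NullWritable> writer =
                new ElasticSearchStreamingRecordWriter<Text, NullWritable>(
                    "hadoop", "streaming_record",  // default index and mapping
                    "_index", "_mapping", "_id",   // per-record override fields
                    100);                          // bulk size

            // Indexed into the default target /hadoop/streaming_record
            // with an Elasticsearch-generated ID.
            writer.write(new Text("{\"user\":\"flip\",\"text\":\"hello\"}"),
                         NullWritable.get());

            // The _index/_mapping/_id fields override the defaults:
            // indexed into /tweets/tweet with ID "123".
            writer.write(new Text("{\"_index\":\"tweets\",\"_mapping\":\"tweet\",\"_id\":123,\"text\":\"hi\"}"),
                         NullWritable.get());

            writer.close(null); // flushes any remaining bulk actions
        }
    }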