wonderdog 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
@@ -0,0 +1,37 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.hadoop.mapred.JobContext;
+ import org.apache.hadoop.mapred.OutputCommitter;
+ import org.apache.hadoop.mapred.TaskAttemptContext;
+
+ public class ElasticSearchStreamingOutputCommitter extends OutputCommitter {
+
+     @Override
+     public void setupJob(JobContext context) throws IOException {
+
+     }
+
+     @Override
+     public void cleanupJob(JobContext context) throws IOException {
+     }
+
+     @Override
+     public void setupTask(TaskAttemptContext context) throws IOException {
+     }
+
+     @Override
+     public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
+         return false;
+     }
+
+     @Override
+     public void commitTask(TaskAttemptContext context) throws IOException {
+     }
+
+     @Override
+     public void abortTask(TaskAttemptContext context) throws IOException {
+     }
+
+ }
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
@@ -0,0 +1,88 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.TaskAttemptContext;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.RecordWriter;
+ import org.apache.hadoop.fs.FileSystem;
+ import org.apache.hadoop.mapred.OutputFormat;
+ import org.apache.hadoop.util.*;
+ import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+ /**
+
+    Hadoop OutputFormat for writing arbitrary MapWritables (essentially
+    HashMaps) into Elasticsearch. Records are batched up and sent in a
+    one-hop manner to the elastic search data nodes that will index
+    them.
+
+ */
+ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingOutputFormat.class);
+
+     // Job settings we need to control directly from Java options.
+     private static final String ES_INDEX_OPT = "elasticsearch.output.index";
+     private static final String ES_DEFAULT_INDEX = "hadoop";
+     private String defaultIndexName;
+
+     private static final String ES_MAPPING_OPT = "elasticsearch.output.mapping";
+     private static final String ES_DEFAULT_MAPPING = "streaming_record";
+     private String defaultMappingName;
+
+     private static final String ES_INDEX_FIELD_OPT = "elasticsearch.output.index.field";
+     private static final String ES_INDEX_FIELD = "_index";
+     private String indexFieldName;
+
+     private static final String ES_MAPPING_FIELD_OPT = "elasticsearch.output.mapping.field";
+     private static final String ES_MAPPING_FIELD = "_mapping";
+     private String mappingFieldName;
+
+     private static final String ES_ID_FIELD_OPT = "elasticsearch.output.id.field";
+     private static final String ES_ID_FIELD = "_id";
+     private String idFieldName;
+
+     private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
+     private static final String ES_BULK_SIZE = "100";
+     private int bulkSize;
+
+
+     // Elasticsearch internal settings required to make a client
+     // connection.
+     private static final String ES_CONFIG_OPT = "es.config";
+     private static final String ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+
+     private static final String ES_PLUGINS_OPT = "es.path.plugins";
+     private static final String ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+
+     public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf conf, String name, Progressable progress) throws IOException {
+         setLocalElasticSearchInstallation(conf);
+         String defaultIndexName = conf.get(ES_INDEX_OPT, ES_DEFAULT_INDEX);
+         String defaultMappingName = conf.get(ES_MAPPING_OPT, ES_DEFAULT_MAPPING);
+         String indexFieldName = conf.get(ES_INDEX_FIELD_OPT, ES_INDEX_FIELD);
+         String mappingFieldName = conf.get(ES_MAPPING_FIELD_OPT, ES_MAPPING_FIELD);
+         String idFieldName = conf.get(ES_ID_FIELD_OPT, ES_ID_FIELD);
+         Integer bulkSize = Integer.parseInt(conf.get(ES_BULK_SIZE_OPT, ES_BULK_SIZE));
+         return (RecordWriter) new ElasticSearchStreamingRecordWriter(defaultIndexName, defaultMappingName, indexFieldName, mappingFieldName, idFieldName, bulkSize);
+     }
+
+     public void setLocalElasticSearchInstallation(JobConf conf) {
+         String esConfigPath = conf.get(ES_CONFIG_OPT, ES_CONFIG);
+         String esPluginsPath = conf.get(ES_PLUGINS_OPT, ES_PLUGINS);
+         System.setProperty(ES_CONFIG_OPT, esConfigPath);
+         System.setProperty(ES_PLUGINS_OPT, esPluginsPath);
+         LOG.info("Using Elasticsearch configuration file at "+esConfigPath+" and plugin directory "+esPluginsPath);
+     }
+
+     public ElasticSearchStreamingOutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
+         return new ElasticSearchStreamingOutputCommitter();
+     }
+
+     public void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException {
+     }
+ }
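The output format above is configured entirely through job settings: the elasticsearch.output.* keys choose the default index, mapping, per-record override fields, and bulk size, while es.config and es.path.plugins point the embedded client at a local Elasticsearch installation. In this gem these classes appear to be driven from the Ruby side via Hadoop Streaming (see lib/wonderdog/hadoop_invocation_override.rb), but an illustrative sketch of a plain org.apache.hadoop.mapred driver shows the same knobs; the class name, index names, and paths below are hypothetical and not part of the package:

    // Illustrative sketch, not part of the gem: a classic mapred driver that
    // indexes one JSON document per input line. Class, index, and path names
    // here are hypothetical.
    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapred.FileInputFormat;
    import org.apache.hadoop.mapred.JobClient;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.mapred.MapReduceBase;
    import org.apache.hadoop.mapred.Mapper;
    import org.apache.hadoop.mapred.OutputCollector;
    import org.apache.hadoop.mapred.Reporter;
    import org.apache.hadoop.mapred.TextInputFormat;

    import com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat;

    public class IndexJsonRecords {

        // The record writer reads the JSON document from the *key*, so the
        // mapper passes each input line through as a Text key.
        public static class PassJsonMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, NullWritable> {
            public void map(LongWritable offset, Text line,
                            OutputCollector<Text, NullWritable> output, Reporter reporter)
                throws IOException {
                output.collect(line, NullWritable.get());
            }
        }

        public static void main(String[] args) throws Exception {
            JobConf conf = new JobConf(IndexJsonRecords.class);

            // Routing and batching; unset keys fall back to the defaults baked
            // into the OutputFormat ("hadoop", "streaming_record", 100).
            conf.set("elasticsearch.output.index",     "tweets");
            conf.set("elasticsearch.output.mapping",   "tweet");
            conf.set("elasticsearch.output.bulk_size", "500");

            // Local Elasticsearch config and plugin directory for the embedded client.
            conf.set("es.config",       "/etc/elasticsearch/elasticsearch.yml");
            conf.set("es.path.plugins", "/usr/local/share/elasticsearch/plugins");

            conf.setMapperClass(PassJsonMapper.class);
            conf.setNumReduceTasks(0);  // map-only indexing job
            conf.setInputFormat(TextInputFormat.class);
            conf.setOutputFormat(ElasticSearchStreamingOutputFormat.class);
            conf.setOutputKeyClass(Text.class);
            conf.setOutputValueClass(NullWritable.class);

            FileInputFormat.setInputPaths(conf, new Path(args[0]));
            JobClient.runJob(conf);
        }
    }

Because ElasticSearchStreamingRecordWriter reads the JSON document from the map output key, the sketch uses a map-only job whose mapper emits each input line unchanged as a Text key.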
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
@@ -0,0 +1,176 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.IOException;
+
+ import java.util.Iterator;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.mapred.RecordReader;
+ import org.apache.hadoop.mapred.InputSplit;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.io.*;
+
+ import org.elasticsearch.common.unit.TimeValue;
+
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.action.search.SearchRequestBuilder;
+ import org.elasticsearch.action.search.SearchScrollRequestBuilder;
+
+ import org.elasticsearch.search.SearchHit;
+ import org.elasticsearch.search.Scroll;
+ import org.elasticsearch.action.search.SearchResponse;
+ import org.elasticsearch.cluster.ClusterName;
+
+ class ElasticSearchStreamingRecordReader<K, V> implements RecordReader<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordReader.class);
+
+     private static final String ES_REQUEST_SIZE_OPT = "elasticsearch.input.request_size";
+     private static final String ES_REQUEST_SIZE = "100";
+     private Integer requestSize;
+
+     private static final String ES_SCROLL_TIMEOUT_OPT = "elasticsearch.input.scroll_timeout";
+     private static final String ES_SCROLL_TIMEOUT = "5m";
+     private String scrollTimeout;
+     private static final TimeValue defaultScrollTimeout = new TimeValue((long) 300000); // 5 minutes
+     private Scroll scroll;
+
+     private Node node;
+     private Client client;
+     private ElasticSearchStreamingSplit split;
+
+     private String scrollId;
+     private Integer recordsRead;
+     private Iterator<SearchHit> hitsItr = null;
+
+     public ElasticSearchStreamingRecordReader(InputSplit split, JobConf conf) {
+         this.split = (ElasticSearchStreamingSplit) split;
+         this.recordsRead = 0;
+         this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE_OPT, ES_REQUEST_SIZE));
+         this.scrollTimeout = conf.get(ES_SCROLL_TIMEOUT_OPT, ES_SCROLL_TIMEOUT);
+         this.scroll = new Scroll(TimeValue.parseTimeValue(this.scrollTimeout, defaultScrollTimeout));
+
+         LOG.info("Initializing "+this.split.getSummary());
+         startEmbeddedClient();
+         fetchNextHits();
+     }
+
+     private void fetchNextHits() {
+         if (scrollId == null) {
+             LOG.info("Running initial scroll with timeout "+scrollTimeout);
+             SearchRequestBuilder request = split.initialScrollRequest(client, scroll, requestSize);
+             SearchResponse response = request.execute().actionGet();
+             this.scrollId = response.scrollId();
+             LOG.info("Got scroll ID "+scrollId);
+             // Do we need to call fetchNextHits() again here? Or does
+             // the initial request also itself contain the first set
+             // of hits for the scroll?
+             //
+             // fetchNextHits();
+         } else {
+             // LOG.info("Running query for scroll ID "+scrollId+" with timeout "+scrollTimeout);
+             SearchScrollRequestBuilder request = split.scrollRequest(client, scroll, scrollId);
+             SearchResponse response = request.execute().actionGet();
+             this.scrollId = response.scrollId();
+             // LOG.info("Got scroll ID "+scrollId);
+             this.hitsItr = response.hits().iterator();
+         }
+     }
+
+     @Override
+     public boolean next(K key, V value) throws IOException {
+         if (shouldReadAnotherRecord()) {
+             // We should read more records because we haven't read as
+             // many as we know to be in this split yet.
+             if (hasAnotherRecord()) {
+                 // We already have records stacked up ready to read.
+                 readRecord(key, value);
+                 return true;
+             } else {
+                 // We don't have records stacked up so we might need
+                 // to fetch some more hits.
+                 fetchNextHits();
+                 if (hasAnotherRecord()) {
+                     // Now if we have records we read one
+                     readRecord(key, value);
+                     return true;
+                 } else {
+                     // But if no records are here this time, it's
+                     // because we know we're done reading the input.
+                     return false;
+                 }
+             }
+         } else {
+             // Return false as we're done with this split.
+             return false;
+         }
+     }
+
+     private boolean shouldReadAnotherRecord() {
+         return recordsRead < split.getSize();
+     }
+
+     private boolean hasAnotherRecord() {
+         return hitsItr != null && hitsItr.hasNext();
+     }
+
+     private void readRecord(K key, V value) {
+         SearchHit hit = hitsItr.next();
+         if (hit != null) {
+             Text keyText = (Text) key;
+             Text valueText = (Text) value;
+             keyText.set(hit.sourceAsString());
+             valueText.set(hit.sourceAsString());
+             recordsRead += 1;
+         }
+     }
+
+     @Override
+     public K createKey() {
+         return (K) new Text();
+     }
+
+     @Override
+     public V createValue() {
+         return (V) new Text();
+     }
+
+     @Override
+     public long getPos() throws IOException {
+         return recordsRead;
+     }
+
+     @Override
+     public float getProgress() throws IOException {
+         return ((float) recordsRead) / ((float) split.getSize());
+     }
+
+     @Override
+     public void close() throws IOException {
+         stopEmbeddedClient();
+     }
+
+     //
+     // == Connecting to Elasticsearch ==
+     //
+
+     private void startEmbeddedClient() {
+         LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+         this.node = NodeBuilder.nodeBuilder().client(true).node();
+         this.client = node.client();
+         LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+     }
+
+     private void stopEmbeddedClient() {
+         LOG.info("Stopping embedded Elasticsearch client...");
+         if (client != null) client.close();
+         if (node != null) node.close();
+         LOG.info("Left Elasticsearch cluster");
+     }
+
+
+ }
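On the input side, the record reader above pages through Elasticsearch with scan-and-scroll: the first request opens a scroll and records its scroll ID, and each later call fetches the next batch of hits until the split's record count is reached. Only two settings are consulted in this file; a minimal sketch of setting them on a mapred JobConf follows (illustrative only; it assumes ElasticSearchStreamingInputFormat implements the classic org.apache.hadoop.mapred.InputFormat interface, and the values are arbitrary):

    // Illustrative sketch only: the two knobs the record reader reads from the
    // job configuration. Which index/mapping/query to scan is handled by
    // ElasticSearchStreamingInputFormat, which is not shown in this section.
    import org.apache.hadoop.mapred.JobConf;

    import com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat;

    public class ScanConfigSketch {
        public static void configure(JobConf conf) {
            // Hits fetched per scroll request; the reader's default is "100".
            conf.set("elasticsearch.input.request_size", "500");
            // Scroll keep-alive between requests; the reader's default is "5m",
            // parsed via TimeValue.parseTimeValue.
            conf.set("elasticsearch.input.scroll_timeout", "10m");
            conf.setInputFormat(ElasticSearchStreamingInputFormat.class);
        }
    }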
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
@@ -0,0 +1,171 @@
+ package com.infochimps.elasticsearch;
+
+ import java.io.File;
+ import java.io.IOException;
+ import java.util.Map;
+ import java.util.concurrent.atomic.AtomicLong;
+ import java.util.Random;
+
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+
+ import org.apache.hadoop.io.*;
+ import org.apache.hadoop.mapred.JobConf;
+ import org.apache.hadoop.mapred.RecordWriter;
+ import org.apache.hadoop.mapred.Reporter;
+ import org.apache.hadoop.util.*;
+
+ import org.elasticsearch.cluster.ClusterName;
+ import org.elasticsearch.node.Node;
+ import org.elasticsearch.node.NodeBuilder;
+ import org.elasticsearch.client.Client;
+ import org.elasticsearch.client.Requests;
+ import org.elasticsearch.action.bulk.BulkRequestBuilder;
+ import org.elasticsearch.action.bulk.BulkResponse;
+ import org.elasticsearch.ExceptionsHelper;
+
+ import org.codehaus.jackson.map.ObjectMapper;
+ import org.codehaus.jackson.JsonParseException;
+
+ class ElasticSearchStreamingRecordWriter<K, V> implements RecordWriter<K, V> {
+
+     static Log LOG = LogFactory.getLog(ElasticSearchStreamingRecordWriter.class);
+
+     private String defaultIndexName;
+     private String defaultMappingName;
+     private String indexFieldName;
+     private String mappingFieldName;
+     private String idFieldName;
+     private Integer bulkSize;
+
+     // Bookkeeping
+     private AtomicLong totalBulkTime = new AtomicLong();
+     private AtomicLong totalBulkItems = new AtomicLong();
+     private Random randgen = new Random();
+     private long runStartTime = System.currentTimeMillis();
+
+     // Elasticsearch indexing
+     private Node node;
+     private Client client;
+     private volatile BulkRequestBuilder currentRequest;
+
+     // JSON parsing
+     private ObjectMapper mapper;
+
+     //
+     // == Lifecycle ==
+     //
+
+     public ElasticSearchStreamingRecordWriter(String defaultIndexName, String defaultMappingName, String indexFieldName, String mappingFieldName, String idFieldName, Integer bulkSize) {
+         this.defaultIndexName = defaultIndexName;
+         this.defaultMappingName = defaultMappingName;
+         this.indexFieldName = indexFieldName;
+         this.mappingFieldName = mappingFieldName;
+         this.idFieldName = idFieldName;
+         this.bulkSize = bulkSize;
+
+         LOG.info("Writing "+Integer.toString(bulkSize)+" records per batch");
+         LOG.info("Using default target /"+defaultIndexName+"/"+defaultMappingName);
+         LOG.info("Records override default target with index field '"+indexFieldName+"', mapping field '"+mappingFieldName+"', and ID field '"+idFieldName+"'");
+
+         startEmbeddedClient();
+         this.currentRequest = client.prepareBulk();
+         this.mapper = new ObjectMapper();
+     }
+
+     /**
+        Start an embedded Elasticsearch client. The client will not be
+        a data node and will not store data locally.
+
+        The client will connect to the target Elasticsearch cluster as
+        a client node, enabling one-hop writes for all data. See
+        http://www.elasticsearch.org/guide/reference/java-api/client.html
+     */
+     private void startEmbeddedClient() {
+         LOG.info("Starting embedded Elasticsearch client (non-datanode)...");
+         this.node = NodeBuilder.nodeBuilder().client(true).node();
+         this.client = node.client();
+         LOG.info("Successfully joined Elasticsearch cluster '"+ClusterName.clusterNameFromSettings(node.settings())+"'");
+     }
+
+
+     /**
+        Close the Elasticsearch client, sending out one last bulk write
+        if necessary.
+     */
+     public void close(Reporter reporter) throws IOException {
+         sendBulkRequestIfMoreThan(0);
+         LOG.info("Shutting down Elasticsearch client...");
+         if (client != null) client.close();
+         if (node != null) node.close();
+         LOG.info("Successfully shut down Elasticsearch client");
+     }
+
+     //
+     // == Writing records ==
+     //
+
+     public void write(K key, V value) throws IOException {
+         String json = ((Text) key).toString();
+         try {
+             index(json);
+             sendBulkRequestIfBigEnough();
+         } catch(Exception e) {
+             if (ExceptionsHelper.unwrapCause(e) instanceof JsonParseException) {
+                 LOG.debug("Bad record: "+json);
+                 return;
+             } else {
+                 LOG.error("Could not write record: "+json, e);
+             }
+         }
+     }
+
+     private void index(String json) throws IOException {
+         Map<String, Object> record = mapper.readValue(json, Map.class);
+         if (record.containsKey(idFieldName)) {
+             Object idValue = record.get(idFieldName);
+             currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).id(String.valueOf(idValue)).type(mappingNameForRecord(record)).create(false).source(json));
+         } else {
+             currentRequest.add(Requests.indexRequest(indexNameForRecord(record)).type(mappingNameForRecord(record)).source(json));
+         }
+     }
+
+     private String indexNameForRecord(Map<String, Object> record) {
+         if (record.containsKey(indexFieldName)) {
+             Object indexValue = record.get(indexFieldName);
+             return String.valueOf(indexValue);
+         } else {
+             return defaultIndexName;
+         }
+     }
+
+     private String mappingNameForRecord(Map<String, Object> record) {
+         if (record.containsKey(mappingFieldName)) {
+             Object mappingValue = record.get(mappingFieldName);
+             return String.valueOf(mappingValue);
+         } else {
+             return defaultMappingName;
+         }
+     }
+
+     //
+     // == Bulk request handling ==
+     //
+
+     private void sendBulkRequestIfBigEnough() {
+         sendBulkRequestIfMoreThan(bulkSize);
+     }
+
+     private void sendBulkRequestIfMoreThan(int size) {
+         totalBulkItems.incrementAndGet();
+         if (currentRequest.numberOfActions() > size) {
+             long startTime = System.currentTimeMillis();
+             BulkResponse response = currentRequest.execute().actionGet();
+             totalBulkTime.addAndGet(System.currentTimeMillis() - startTime);
+             if (randgen.nextDouble() < 0.1) {
+                 LOG.info("Indexed [" + totalBulkItems.get() + "] in [" + (totalBulkTime.get()/1000) + "s] of indexing, [" + ((System.currentTimeMillis() - runStartTime)/1000) + "s] of wall clock, for [" + (float)(1000.0*totalBulkItems.get())/(System.currentTimeMillis() - runStartTime) + " rec/s]");
+             }
+             currentRequest = client.prepareBulk();
+         }
+     }
+ }
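The record writer treats each incoming record as a standalone JSON document. Per-record _index, _mapping, and _id fields (the names are configurable through the elasticsearch.output.*.field options) override the default index, mapping, and document ID; records that fail JSON parsing are dropped with a DEBUG log; and queued actions are flushed as a bulk request every bulkSize writes, plus once more on close. A minimal sketch of that routing behavior (illustrative only: it must live in the com.infochimps.elasticsearch package because the writer class is package-private, and it assumes a reachable Elasticsearch cluster, since the constructor immediately joins it as a client node):

    // Illustrative sketch only: demonstrates the per-record routing described
    // above, using the default field names "_index", "_mapping", and "_id".
    // The document contents are made up.
    package com.infochimps.elasticsearch;

    import org.apache.hadoop.io.NullWritable;
    import org.apache.hadoop.io.Text;

    public class RoutingSketch {
        public static void main(String[] args) throws Exception {
            ElasticSearchStreamingRecordWriter<Text, NullWritable> writer =
                new ElasticSearchStreamingRecordWriter<Text, NullWritable>(
                    "hadoop", "streaming_record",  // default index and mapping
                    "_index", "_mapping", "_id",   // per-record override fields
                    100);                          // bulk size

            // Indexed into the default target /hadoop/streaming_record
            // with an Elasticsearch-generated ID.
            writer.write(new Text("{\"user\":\"flip\",\"text\":\"hello\"}"),
                         NullWritable.get());

            // The _index/_mapping/_id fields override the defaults:
            // indexed into /tweets/tweet with ID "123".
            writer.write(new Text("{\"_index\":\"tweets\",\"_mapping\":\"tweet\",\"_id\":123,\"text\":\"hi\"}"),
                         NullWritable.get());

            writer.close(null); // flushes any remaining bulk actions
        }
    }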