wonderdog 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java
@@ -0,0 +1,102 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+import java.io.DataInput;
+import java.io.DataOutput;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.InputSplit;
+
+import org.elasticsearch.search.Scroll;
+
+import org.elasticsearch.client.Client;
+import org.elasticsearch.action.search.SearchRequestBuilder;
+import org.elasticsearch.action.search.SearchScrollRequestBuilder;
+import org.elasticsearch.action.search.SearchType;
+
+public class ElasticSearchStreamingSplit implements InputSplit, Writable {
+
+    private String indexName;
+    private String mappingName;
+    private Integer numSplits;
+    private String queryJSON;
+    private Long numHits;
+    private Integer from;
+    private Integer size;
+
+    public ElasticSearchStreamingSplit() {
+    }
+
+    public ElasticSearchStreamingSplit(String indexName , String mappingName, Integer numSplits, String queryJSON, Long numHits, Integer from, Integer size) {
+        this.indexName = indexName;
+        this.mappingName = mappingName;
+        this.numSplits = numSplits;
+        this.queryJSON = queryJSON;
+        this.numHits = numHits;
+        this.from = from;
+        this.size = size;
+    }
+
+    public String getSummary() {
+        Integer thisSplitNum = (int) (((long) from) / (numHits / ((long) numSplits)));
+        return "ElasticSearch input split "+String.valueOf(thisSplitNum + 1)+"/"+String.valueOf(numSplits)+" with "+String.valueOf(size)+" records from /"+indexName+"/"+mappingName;
+    }
+
+    public Integer getSize() {
+        return size;
+    }
+
+    public boolean hasQuery() {
+        return queryJSON != null && queryJSON.length() > 0;
+    }
+
+    public SearchRequestBuilder initialScrollRequest(Client client, Scroll scroll, Integer requestSize) {
+        SearchRequestBuilder request = client.prepareSearch(indexName).setSearchType(SearchType.SCAN).setScroll(scroll);
+        if (mappingName != null && mappingName.length() > 0) {
+            request.setTypes(mappingName);
+        }
+        request.setFrom((int) from);
+        request.setSize(requestSize);
+        if (hasQuery()) {
+            request.setQuery(queryJSON);
+        }
+        return request;
+    }
+
+    public SearchScrollRequestBuilder scrollRequest(Client client, Scroll scroll, String scrollId) {
+        return client.prepareSearchScroll(scrollId).setScroll(scroll);
+    }
+
+    @Override
+    public String[] getLocations() {
+        return new String[] {};
+    }
+
+    @Override
+    public long getLength() {
+        return 0;
+    }
+
+    @Override
+    public void readFields(DataInput in) throws IOException {
+        this.indexName = Text.readString(in);
+        this.mappingName = Text.readString(in);
+        this.numSplits = in.readInt();
+        this.queryJSON = Text.readString(in);
+        this.numHits = in.readLong();
+        this.from = in.readInt();
+        this.size = in.readInt();
+    }
+
+    @Override
+    public void write(DataOutput out) throws IOException {
+        Text.writeString(out, indexName);
+        Text.writeString(out, mappingName);
+        out.writeInt(numSplits);
+        Text.writeString(out, queryJSON);
+        out.writeLong(numHits);
+        out.writeInt(from);
+        out.writeInt(size);
+    }
+}
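
A minimal sketch, not part of the released gem, of how the split's Writable contract behaves: write() serializes the seven fields in a fixed order and readFields() must consume them in the same order. DataOutputBuffer and DataInputBuffer are standard Hadoop I/O helpers; the index, type, and counts below are example values.

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

import com.infochimps.elasticsearch.ElasticSearchStreamingSplit;

public class SplitRoundTrip {
    public static void main(String[] args) throws Exception {
        // Example values: index "myindex", mapping "myobj", 4 splits, match_all query,
        // 1000 total hits, starting offset 0, 250 records in this split.
        ElasticSearchStreamingSplit original = new ElasticSearchStreamingSplit(
            "myindex", "myobj", 4, "{\"match_all\":{}}", 1000L, 0, 250);

        // Serialize with write() ...
        DataOutputBuffer out = new DataOutputBuffer();
        original.write(out);

        // ... and rebuild an equivalent split with readFields().
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        ElasticSearchStreamingSplit copy = new ElasticSearchStreamingSplit();
        copy.readFields(in);

        // Prints "ElasticSearch input split 1/4 with 250 records from /myindex/myobj"
        System.out.println(copy.getSummary());
    }
}
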
data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java
@@ -0,0 +1,108 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.Random;
+import java.util.Map;
+import java.util.HashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapWritable;
+
+import org.elasticsearch.common.xcontent.XContentBuilder;
+import org.elasticsearch.common.xcontent.XContentFactory;
+import org.elasticsearch.node.Node;
+import org.elasticsearch.node.NodeBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.client.Requests;
+import org.elasticsearch.action.bulk.BulkRequestBuilder;
+import org.elasticsearch.indices.IndexAlreadyExistsException;
+import org.elasticsearch.action.bulk.BulkResponse;
+import org.elasticsearch.ExceptionsHelper;
+
+
+//
+// Simple one-hop bulk indexing hadoop job for elasticsearch. It accepts
+// tsv documents, creates batch index requests, and sends records directly
+// to the elasticsearch data node that's going to actually index them.
+//
+public class ElasticTest extends Configured implements Tool {
+
+    private final static Log LOG = LogFactory.getLog(ElasticTest.class);
+
+    public static class IndexMapper extends Mapper<LongWritable, Text, NullWritable, MapWritable> {
+
+        private String[] fieldNames;
+        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+            String[] fields = value.toString().split("\t");
+            MapWritable record = new MapWritable();
+            for (int i = 0; i < fields.length; i++) {
+                if (i < fieldNames.length) {
+                    record.put(new Text(fieldNames[i]), new Text(fields[i]));
+                }
+            }
+            context.write(NullWritable.get(), record);
+        }
+
+        //
+        // Called once at the beginning of the map task. Sets up the indexing job.
+        //
+        protected void setup(org.apache.hadoop.mapreduce.Mapper.Context context) throws IOException, InterruptedException {
+            Configuration conf = context.getConfiguration();
+            this.fieldNames = conf.get("wonderdog.field.names").split(",");
+        }
+
+    }
+
+    public int run(String[] args) throws Exception {
+        Job job = new Job(getConf());
+        job.setJarByClass(ElasticTest.class);
+        job.setJobName("ElasticTest");
+        job.setMapperClass(IndexMapper.class);
+        job.setNumReduceTasks(0);
+        job.setOutputKeyClass(NullWritable.class);
+        job.setOutputValueClass(MapWritable.class);
+        job.setOutputFormatClass(ElasticSearchOutputFormat.class);
+
+        List<String> other_args = new ArrayList<String>();
+        for (int i=0; i < args.length; ++i) {
+            System.out.println(args[i]);
+            other_args.add(args[i]);
+        }
+        // Here we need _both_ an input path and an output path.
+        // Output stores failed records so they can be re-indexed
+        FileInputFormat.setInputPaths(job, new Path(other_args.get(0)));
+        FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
+
+        try {
+            job.waitForCompletion(true);
+        }
+        catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return 0;
+    }
+
+    public static void main(String[] args) throws Exception {
+        int res = ToolRunner.run(new Configuration(), new ElasticTest(), args);
+        System.exit(res);
+    }
+}
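
A hedged sketch of driving the ElasticTest job from code. The "wonderdog.field.names" key is the one read in the mapper's setup() above; the column names and HDFS paths are example values, and any settings required by ElasticSearchOutputFormat itself would have to be supplied on the same Configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import com.infochimps.elasticsearch.ElasticTest;

public class RunElasticTest {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Comma-separated column names, one per TSV field, in order (example schema).
        conf.set("wonderdog.field.names", "name,description,price");

        // First argument: input directory of TSV files; second: output directory
        // where failed records are written so they can be re-indexed.
        int exitCode = ToolRunner.run(conf, new ElasticTest(),
                new String[] { "/data/products_tsv", "/tmp/elastictest_failed" });
        System.exit(exitCode);
    }
}
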
data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java
@@ -0,0 +1,100 @@
+package com.infochimps.elasticsearch.hadoop.util;
+
+import java.io.File;
+
+import java.io.IOException;
+import java.io.FileNotFoundException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.filecache.DistributedCache;
+
+public class HadoopUtils {
+
+    /**
+       Upload a local file to the cluster
+     */
+    public static void uploadLocalFile(Path localsrc, Path hdfsdest, Configuration conf) throws IOException {
+        FileSystem fs = FileSystem.get(conf);
+        if (fs.exists(hdfsdest) && fs.getFileStatus(hdfsdest).isDir()) {
+            fs.delete(hdfsdest, true);
+        }
+        fs.copyFromLocalFile(false, true, localsrc, hdfsdest);
+    }
+
+
+    /**
+       Upload a local file to the cluster, if it's newer or nonexistent
+     */
+    public static void uploadLocalFileIfChanged(Path localsrc, Path hdfsdest, Configuration conf) throws IOException {
+        long l_time = new File(localsrc.toUri()).lastModified();
+        try {
+            long h_time = FileSystem.get(conf).getFileStatus(hdfsdest).getModificationTime();
+            if ( l_time > h_time ) {
+                uploadLocalFile(localsrc, hdfsdest, conf);
+            }
+        }
+        catch (FileNotFoundException e) {
+            uploadLocalFile(localsrc, hdfsdest, conf);
+        }
+    }
+
+
+    /**
+       Fetches a file with the basename specified from the distributed cache. Returns null if no file is found
+     */
+    public static String fetchFileFromCache(String basename, Configuration conf) throws IOException {
+        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
+        if (cacheFiles != null && cacheFiles.length > 0) {
+            for (Path cacheFile : cacheFiles) {
+                if (cacheFile.getName().equals(basename)) {
+                    return cacheFile.toString();
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+       Fetches a file with the basename specified from the distributed cache. Returns null if no file is found
+     */
+    public static String fetchArchiveFromCache(String basename, Configuration conf) throws IOException {
+        Path[] cacheArchives = DistributedCache.getLocalCacheArchives(conf);
+        if (cacheArchives != null && cacheArchives.length > 0) {
+            for (Path cacheArchive : cacheArchives) {
+                if (cacheArchive.getName().equals(basename)) {
+                    return cacheArchive.toString();
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+       Takes a path on the hdfs and ships it in the distributed cache if it is not already in the distributed cache
+     */
+    public static void shipFileIfNotShipped(Path hdfsPath, Configuration conf) throws IOException {
+        if (fetchFileFromCache(hdfsPath.getName(), conf) == null) {
+            try {
+                DistributedCache.addCacheFile(hdfsPath.toUri(), conf);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    /**
+       Takes a path on the hdfs and ships it in the distributed cache if it is not already in the distributed cache
+     */
+    public static void shipArchiveIfNotShipped(Path hdfsPath, Configuration conf) throws IOException {
+        if (fetchArchiveFromCache(hdfsPath.getName(), conf) == null) {
+            try {
+                DistributedCache.addCacheArchive(hdfsPath.toUri(), conf);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}
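
A short usage sketch, assumed rather than taken from the gem, for the helpers above: push a local elasticsearch.yml to HDFS only when it has changed, then add it to the distributed cache unless a file with that basename is already cached. The paths are examples; ElasticSearchIndex in the next file uses these methods the same way.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;

public class ShipEsConfig {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path local  = new Path("file:///etc/elasticsearch/elasticsearch.yml");
        Path onHdfs = new Path("/tmp/elasticsearch/elasticsearch.yml");

        // Upload only if the local copy is newer than, or missing from, HDFS.
        HadoopUtils.uploadLocalFileIfChanged(local, onHdfs, conf);

        // Ship to the distributed cache unless it has already been shipped.
        HadoopUtils.shipFileIfNotShipped(onHdfs, conf);
    }
}
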
data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java
@@ -0,0 +1,216 @@
+package com.infochimps.elasticsearch.pig;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Properties;
+import java.net.URI;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.OutputFormat;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.filecache.DistributedCache;
+
+import org.apache.pig.StoreFunc;
+import org.apache.pig.ResourceSchema;
+import org.apache.pig.ResourceSchema.ResourceFieldSchema;
+import org.apache.pig.StoreFuncInterface;
+import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
+import org.apache.pig.builtin.Utf8StorageConverter;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.DataByteArray;
+import org.apache.pig.data.DataType;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.util.Utils;
+import org.apache.pig.impl.util.UDFContext;
+
+import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+import com.infochimps.elasticsearch.ElasticSearchOutputFormat;
+
+/**
+   Pig storefunc for Elastic Search. Takes tuples of any primitive type, converts them
+   to strings, and indexes them.
+
+   USAGE:
+
+   STORE records INTO ElasticSearchIndex();
+   STORE records INTO ElasticSearchIndex(idField, bulkSize);
+   STORE records INTO ElasticSearchIndex(idField, bulkSize, esConfig);
+   STORE records INTO ElasticSearchIndex(idField, bulkSize, esConfig, esPlugins);
+
+   where:
+
+   idField   = Which field of the record to use as the record id. If none is passed in
+               then the record is assumed to have no id.
+   bulkSize  = Number of records for ElasticSearchOutputFormat to batch up before sending
+               a bulk index request to Elastic Search. Default: 1000.
+   esConfig  = Full path to local elasticsearch.yml. Default: /etc/elasticsearch/elasticsearch.yml
+   esPlugins = Full path to local elastic search plugins dir. Default: /usr/local/share/elasticsearch/plugins
+
+*/
+public class ElasticSearchIndex extends StoreFunc implements StoreFuncInterface {
+
+    private static final Log LOG = LogFactory.getLog(ElasticSearchIndex.class);
+
+    protected RecordWriter writer = null;
+    protected String idField;
+    protected String bulkSize;
+    protected String esConfig;
+    protected String esPlugins;
+
+    // For hadoop configuration
+    private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+    private static final String ES_BULK_SIZE = "elasticsearch.bulk.size";
+    private static final String ES_IS_JSON = "elasticsearch.is_json";
+    private static final String ES_ID_FIELD_NAME = "elasticsearch.id.field.name";
+    private static final String ES_FIELD_NAMES = "elasticsearch.field.names";
+    private static final String ES_ID_FIELD = "elasticsearch.id.field";
+    private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+    private static final String PIG_ES_FIELD_NAMES = "elasticsearch.pig.field.names";
+
+    // Other string constants
+    private static final String SLASH = "/";
+    private static final String COMMA = ",";
+    private static final String LOCAL_SCHEME = "file://";
+    private static final String NO_ID_FIELD = "-1";
+    private static final String DEFAULT_BULK = "1000";
+    private static final String DEFAULT_ES_CONFIG = "/etc/elasticsearch/elasticsearch.yml";
+    private static final String DEFAULT_ES_PLUGINS = "/usr/local/share/elasticsearch/plugins";
+    private static final String ES_CONFIG_HDFS_PATH = "/tmp/elasticsearch/elasticsearch.yml";
+    private static final String ES_PLUGINS_HDFS_PATH = "/tmp/elasticsearch/plugins";
+
+    public ElasticSearchIndex() {
+        this(NO_ID_FIELD, DEFAULT_BULK);
+    }
+
+    public ElasticSearchIndex(String idField, String bulkSize) {
+        this(idField, bulkSize, DEFAULT_ES_CONFIG);
+    }
+
+    public ElasticSearchIndex(String idField, String bulkSize, String esConfig) {
+        this(idField, bulkSize, esConfig, DEFAULT_ES_PLUGINS);
+    }
+
+    public ElasticSearchIndex(String idField, String bulkSize, String esConfig, String esPlugins) {
+        this.idField = idField;
+        this.bulkSize = bulkSize;
+        this.esConfig = esConfig;
+        this.esPlugins = esPlugins;
+    }
+
+    /**
+       Check that schema is reasonable and serialize the field names as a string for later use.
+     */
+    @Override
+    public void checkSchema(ResourceSchema s) throws IOException {
+        UDFContext context = UDFContext.getUDFContext();
+        Properties property = context.getUDFProperties(ResourceSchema.class);
+        String fieldNames = "";
+        for (String field : s.fieldNames()) {
+            fieldNames += field;
+            fieldNames += COMMA;
+        }
+        property.setProperty(PIG_ES_FIELD_NAMES, fieldNames);
+    }
+
+    /**
+       Look at passed in location and configuration and set options. Note that, since this
+       is called more than once, we need to make sure and not change anything we've already
+       set.
+     */
+    @Override
+    public void setStoreLocation(String location, Job job) throws IOException {
+        String[] es_store = location.substring(5).split(SLASH);
+        if (es_store.length != 2) {
+            throw new RuntimeException("Please specify a valid elasticsearch index, eg. es://myindex/myobj");
+        }
+        Configuration conf = job.getConfiguration();
+        // Only set if we haven't already
+        if (conf.get(ES_INDEX_NAME) == null) {
+            try {
+                job.getConfiguration().set(ES_INDEX_NAME, es_store[0]);
+                job.getConfiguration().set(ES_OBJECT_TYPE, es_store[1]);
+            } catch (ArrayIndexOutOfBoundsException e) {
+                throw new RuntimeException("You must specify both an index and an object type.");
+            }
+            job.getConfiguration().setBoolean(ES_IS_JSON, false);
+            job.getConfiguration().set(ES_BULK_SIZE, bulkSize);
+            job.getConfiguration().set(ES_ID_FIELD, idField);
+
+            // Adds the elasticsearch.yml file (esConfig) and the plugins directory (esPlugins) to the distributed cache
+            try {
+                Path hdfsConfigPath = new Path(ES_CONFIG_HDFS_PATH);
+                Path hdfsPluginsPath = new Path(ES_PLUGINS_HDFS_PATH);
+
+                HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esConfig), hdfsConfigPath, job.getConfiguration());
+                HadoopUtils.shipFileIfNotShipped(hdfsConfigPath, job.getConfiguration());
+
+                HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME+esPlugins), hdfsPluginsPath, job.getConfiguration());
+                HadoopUtils.shipArchiveIfNotShipped(hdfsPluginsPath, job.getConfiguration());
+
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+
+            UDFContext context = UDFContext.getUDFContext();
+            Properties property = context.getUDFProperties(ResourceSchema.class);
+            job.getConfiguration().set(ES_FIELD_NAMES, property.getProperty(PIG_ES_FIELD_NAMES));
+        }
+    }
+
+    @Override
+    public OutputFormat getOutputFormat() throws IOException {
+        return new ElasticSearchOutputFormat();
+    }
+
+    // Suppressing unchecked warnings for RecordWriter, which is not parameterized by StoreFuncInterface
+    @Override
+    public void prepareToWrite(@SuppressWarnings("rawtypes") RecordWriter writer) throws IOException {
+        this.writer = writer;
+    }
+
+    /**
+       Map a tuple object into a map-writable object for elasticsearch.
+     */
+    @SuppressWarnings("unchecked")
+    @Override
+    public void putNext(Tuple t) throws IOException {
+        UDFContext context = UDFContext.getUDFContext();
+        Properties property = context.getUDFProperties(ResourceSchema.class);
+        MapWritable record = new MapWritable();
+        String[] fieldNames = property.getProperty(PIG_ES_FIELD_NAMES).split(COMMA);
+        for (int i = 0; i < t.size(); i++) {
+            if (i < fieldNames.length) {
+                try {
+                    record.put(new Text(fieldNames[i]), new Text(t.get(i).toString()));
+                } catch (NullPointerException e) {
+                    //LOG.info("Increment null field counter.");
+                }
+            }
+        }
+        try {
+            writer.write(NullWritable.get(), record);
+        } catch (InterruptedException e) {
+            throw new IOException(e);
+        }
+    }
+
+    @Override
+    public void cleanupOnFailure(String location, Job job) throws IOException {
+    }
+}
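
The javadoc above lists the storefunc's constructor forms but not a complete STORE statement. Below is a hedged sketch of invoking it through Pig's Java API; the jar name, the es://index/type location, and the relation schema are assumptions, and the gem's README remains the authoritative usage reference.

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

public class StoreToElasticSearch {
    public static void main(String[] args) throws Exception {
        PigServer pig = new PigServer(ExecType.MAPREDUCE);
        pig.registerJar("wonderdog.jar"); // jar containing ElasticSearchIndex; name assumed

        pig.setBatchOn();
        pig.registerQuery(
            "records = LOAD '/data/products.tsv' AS (name:chararray, description:chararray, price:chararray);");
        // 'es://<index>/<type>' is what setStoreLocation() parses; the arguments are idField
        // ('-1' meaning no id field) and bulkSize, as described in the javadoc above.
        pig.registerQuery(
            "STORE records INTO 'es://products/product' " +
            "USING com.infochimps.elasticsearch.pig.ElasticSearchIndex('-1', '1000');");
        pig.executeBatch(); // runs the MapReduce job; each tuple becomes one indexed document
    }
}
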