embulk-executor-mapreduce 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3c12d59b65314dd94cf12e29b7caf8e83fd947bd
-  data.tar.gz: 478ea3a4e77c7a9e5395a4c4942ac9697772a10d
+  metadata.gz: b2abda7db750f6c161ab8867474fdccfa67eb265
+  data.tar.gz: 8cfc89242d0a57368b5803db9b55e6494b6916e6
 SHA512:
-  metadata.gz: e8c955d1e7a4e0b318bc21a36c51410045b4b4c9e62b954e8848f66693d26fe580a79afb8c8e1c1910f9ecd1731f78772b6b9af7f64e817b9666776da26b126c
-  data.tar.gz: 14fb4a34dbaf1b59b8d37ceac1356d4ab66014a559943f16da087d9f17800e46c80e9b98ae1583be5568ee8909034ba708bfa4ca3dab27ddde90b5c8e2ab8f59
+  metadata.gz: 0e8b2f14207ec85d1ba60b531cc5876c90689a2b259f76a5edb57d8b50d025bd5e7151307be0381d82e813b5f263d5326e39cbccaa56238d8aa3f1171e8a21fd
+  data.tar.gz: 645417db32cc29813fee20f2175a2858485596159fb80c31e77c1f45d1c715f40b239f7417ef0caae927f75428b8f93175eb477281d620fe38edae694e76f66a
EmbulkMapReduce.java CHANGED
@@ -61,6 +61,7 @@ public class EmbulkMapReduce
     private static final String CK_SYSTEM_CONFIG = "embulk.mapreduce.systemConfig";
     private static final String CK_STATE_DIRECTORY_PATH = "embulk.mapreduce.stateDirectorypath";
     private static final String CK_TASK_COUNT = "embulk.mapreduce.taskCount";
+    private static final String CK_RETRY_TASKS = "embulk.mapreduce.retryTasks";
     private static final String CK_TASK = "embulk.mapreduce.task";
     private static final String CK_PLUGIN_ARCHIVE_SPECS = "embulk.mapreduce.pluginArchive.specs";
 
@@ -94,6 +95,16 @@ public class EmbulkMapReduce
         return config.getInt(CK_TASK_COUNT, 0);
     }
 
+    public static void setRetryTasks(Configuration config, boolean enabled)
+    {
+        config.setBoolean(CK_RETRY_TASKS, enabled);
+    }
+
+    public static boolean getRetryTasks(Configuration config)
+    {
+        return config.getBoolean(CK_RETRY_TASKS, false);
+    }
+
     public static void setStateDirectoryPath(Configuration config, Path path)
     {
         config.set(CK_STATE_DIRECTORY_PATH, path.toString());
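Note: the sketch below is illustrative and not part of the released diff (the class name RetryFlagRoundTrip is made up). It shows how the new flag rides on the job's Hadoop Configuration under the key "embulk.mapreduce.retryTasks", and that getRetryTasks falls back to false when the key was never set, so jobs written against 0.2.3 keep the old behavior of never rethrowing.

    import org.apache.hadoop.conf.Configuration;
    import org.embulk.executor.mapreduce.EmbulkMapReduce;

    class RetryFlagRoundTrip {
        public static void main(String[] args) {
            Configuration conf = new Configuration();
            // key "embulk.mapreduce.retryTasks" is absent, so this prints false
            System.out.println(EmbulkMapReduce.getRetryTasks(conf));
            EmbulkMapReduce.setRetryTasks(conf, true);
            // now prints true
            System.out.println(EmbulkMapReduce.getRetryTasks(conf));
        }
    }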
@@ -406,8 +417,11 @@ public class EmbulkMapReduce
             this.session = ExecSession.builder(embed.getInjector()).fromExecConfig(task.getExecConfig()).build();
 
             try {
+                // LocalDirAllocator allocates a directory for a job. Here adds attempt id to the path
+                // so that attempts running on the same machine don't conflict each other.
                 LocalDirAllocator localDirAllocator = new LocalDirAllocator(MRConfig.LOCAL_DIR);
-                Path destPath = localDirAllocator.getLocalPathForWrite("gems", config);
+                String dirName = context.getTaskAttemptID().toString() + "/embulk_gems";
+                Path destPath = localDirAllocator.getLocalPathForWrite(dirName, config);
                 this.localGemPath = new File(destPath.toString());
             } catch (IOException ex) {
                 throw new RuntimeException(ex);
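Note: adding the task attempt ID to the directory name means a retried or speculative attempt on the same node no longer shares the old fixed "gems" directory with an earlier attempt. The sketch below is illustrative only (the class name AttemptLocalDirNames is made up); it prints the per-attempt names that would be passed to LocalDirAllocator, which places them under one of the node's mapreduce.cluster.local.dir directories.

    import org.apache.hadoop.mapreduce.TaskAttemptID;
    import org.apache.hadoop.mapreduce.TaskType;

    class AttemptLocalDirNames {
        public static void main(String[] args) {
            // two attempts of the same map task (job 0001, task 3) on one machine
            TaskAttemptID first  = new TaskAttemptID("201512210000", 1, TaskType.MAP, 3, 0);
            TaskAttemptID second = new TaskAttemptID("201512210000", 1, TaskType.MAP, 3, 1);
            // attempt_201512210000_0001_m_000003_0/embulk_gems
            System.out.println(first.toString() + "/embulk_gems");
            // attempt_201512210000_0001_m_000003_1/embulk_gems
            System.out.println(second.toString() + "/embulk_gems");
        }
    }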
@@ -527,12 +541,14 @@ public class EmbulkMapReduce
     {
         private Context context;
         private SessionRunner runner;
+        private boolean retryTasks;
 
         @Override
         public void setup(Context context) throws IOException, InterruptedException
         {
             this.context = context;
             this.runner = new SessionRunner(context);
+            this.retryTasks = EmbulkMapReduce.getRetryTasks(context.getConfiguration());
 
             runner.execSession(new ExecAction<Void>() { // for Exec.getLogger
                 public Void run() throws IOException
@@ -566,16 +582,17 @@ public class EmbulkMapReduce
 
             try {
                 Executors.process(runner.getExecSession(), task, taskIndex, handler);
-            } catch (Throwable ex) {
+            }
+            catch (Exception ex) {
                 try {
                     handler.setException(ex);
                 } catch (Throwable e) {
                     e.addSuppressed(ex);
                     throw e;
                 }
-                //if (task.getTaskRecovery()) {
-                //    throw ex;
-                //}
+                if (retryTasks) {
+                    throw ex;
+                }
             }
         }
     }
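Note: handler.setException(ex) records the failure in the attempt state either way; what the new retry_tasks flag changes is whether the exception is rethrown so that Hadoop marks the map attempt as failed and reschedules it, up to the cluster's mapreduce.map.maxattempts limit (4 by default). With the default retry_tasks: false the attempt still finishes successfully from Hadoop's point of view and only the Embulk task is reported as failed. The sketch below is a simplified, illustrative rendering of that control flow (stubbed types; the addSuppressed fallback from the diff is omitted); it is not the plugin's API.

    class RetryPatternSketch {
        interface AttemptStateHandler { void setException(Throwable ex); }

        static void runTask(Runnable task, AttemptStateHandler handler, boolean retryTasks) {
            try {
                task.run();
            }
            catch (RuntimeException ex) {
                handler.setException(ex);  // record the failure for this Embulk task
                if (retryTasks) {
                    throw ex;              // propagate so Hadoop fails and retries the attempt
                }
                // otherwise swallow: the Hadoop attempt succeeds, only the Embulk task failed
            }
        }

        public static void main(String[] args) {
            runTask(() -> { throw new RuntimeException("input error"); },
                    ex -> System.out.println("recorded: " + ex.getMessage()),
                    false); // flip to true to see the exception propagate
        }
    }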
EmbulkPartitioningMapReduce.java CHANGED
@@ -128,16 +128,16 @@ public class EmbulkPartitioningMapReduce
                     filterPlugins, task.getFilterSchemas(), task.getFilterTaskSources(),
                     outputPlugin, task.getOutputSchema(), task.getOutputTaskSource(),
                     handler);
-            } catch (Throwable ex) {
+            }
+            catch (Exception ex) {
                 try {
                     handler.setException(ex);
                 } catch (Throwable e) {
                     e.addSuppressed(ex);
                     throw e;
                 }
-                //if (task.getTaskRecovery()) {
-                //    throw ex;
-                //}
+                // always throw this exception to not start reducers when input fails
+                throw ex;
             }
         }
     }
@@ -147,6 +147,7 @@ public class EmbulkPartitioningMapReduce
     {
         private Context context;
         private SessionRunner runner;
+        private boolean retryTasks;
         private AttemptStateUpdateHandler handler;
         private TransactionalPageOutput output;
         private boolean failed = false;
@@ -156,6 +157,7 @@ public class EmbulkPartitioningMapReduce
         {
             this.context = context;
             this.runner = new SessionRunner(context);
+            this.retryTasks = EmbulkMapReduce.getRetryTasks(context.getConfiguration());
 
             runner.execSession(new ExecAction<Void>() {
                 public Void run() throws Exception
@@ -199,7 +201,8 @@ public class EmbulkPartitioningMapReduce
                 for (PageWritable value : values) {
                     output.add(value.get());
                 }
-            } catch (Throwable ex) {
+            }
+            catch (Exception ex) {
                 failed = true;
                 try {
                     handler.setException(ex);
@@ -207,6 +210,9 @@ public class EmbulkPartitioningMapReduce
                     e.addSuppressed(ex);
                     throw e;
                 }
+                if (retryTasks) {
+                    throw ex;
+                }
             }
         }
 
MapReduceExecutor.java CHANGED
@@ -223,6 +223,7 @@ public class MapReduceExecutor
         EmbulkMapReduce.setSystemConfig(conf, modelManager, systemConfig);
         EmbulkMapReduce.setExecutorTask(conf, modelManager, task);
         EmbulkMapReduce.setMapTaskCount(conf, mapTaskCount); // used by EmbulkInputFormat
+        EmbulkMapReduce.setRetryTasks(conf, task.getRetryTasks());
         EmbulkMapReduce.setStateDirectoryPath(conf, stateDir);
 
         // jar files
MapReduceExecutorTask.java CHANGED
@@ -43,6 +43,10 @@ public interface MapReduceExecutorTask
     @ConfigDefault("null")
     public Optional<Integer> getReducers();
 
+    @Config("retry_tasks")
+    @ConfigDefault("false")
+    public boolean getRetryTasks();
+
     @Config("partitioning")
     @ConfigDefault("null")
     public Optional<ConfigSource> getPartitioning();
TimestampPartitioning.java CHANGED
@@ -1,5 +1,7 @@
 package org.embulk.executor.mapreduce;
 
+import javax.validation.constraints.Min;
+import javax.validation.constraints.Max;
 import com.google.common.annotations.VisibleForTesting;
 import org.joda.time.DateTimeZone;
 import com.google.common.base.Optional;
@@ -37,6 +39,12 @@ public class TimestampPartitioning
         @ConfigDefault("\"sec\"")
         public String getUnixTimestamp();
 
+        @Config("map_side_partition_split")
+        @ConfigDefault("1")
+        @Min(1)
+        @Max(65535) // TimestampPartitioning.LongPartitionKey encodes split number in 16-bit buffer
+        public int getMapSidePartitionSplit();
+
         public Column getTargetColumn();
         public void setTargetColumn(Column column);
     }
@@ -159,10 +167,19 @@ public class TimestampPartitioning
 
         Column column = task.getTargetColumn();
         if (column.getType() instanceof TimestampType) {
-            return new TimestampPartitioner(column, Unit.of(task.getUnit()));
-        } else if (column.getType() instanceof LongType) {
-            return new LongUnixTimestampPartitioner(column, Unit.of(task.getUnit()), UnixTimestampUnit.of(task.getUnixTimestamp()));
-        } else {
+            return new TimestampPartitioner(
+                    column,
+                    Unit.of(task.getUnit()),
+                    task.getMapSidePartitionSplit());
+        }
+        else if (column.getType() instanceof LongType) {
+            return new LongUnixTimestampPartitioner(
+                    column,
+                    Unit.of(task.getUnit()),
+                    task.getMapSidePartitionSplit(),
+                    UnixTimestampUnit.of(task.getUnixTimestamp()));
+        }
+        else {
             throw new AssertionError();
         }
     }
@@ -234,13 +251,17 @@ public class TimestampPartitioning
     {
         protected final Column column;
         protected final Unit unit;
+        protected final int mapSidePartitionSplit;
         private final LongPartitionKey key;
+        private long roundRobin;
 
-        public AbstractTimestampPartitioner(Column column, Unit unit)
+        public AbstractTimestampPartitioner(Column column, Unit unit, int mapSidePartitionSplit)
         {
             this.column = column;
             this.unit = unit;
+            this.mapSidePartitionSplit = mapSidePartitionSplit;
             this.key = new LongPartitionKey();
+            this.roundRobin = 0;
         }
 
         @Override
@@ -251,7 +272,19 @@ public class TimestampPartitioning
 
         protected LongPartitionKey updateKey(long v)
         {
-            key.set(v);
+            // ((v << 16) | (roundRobin % mapSidePartitionSplit)) is used to distribute a large partition to
+            // multiple reducers. But this algorithm is not ideal under following scenario:
+            //
+            // * input data is in 2 hour (hour-0 and hour-1), and partitioning unit is hour.
+            // * there're 4 reducers.
+            // * with mapSidePartitionSplit = 1, hadoop uses 3 reducers because
+            //   hour-0 is partitoned to reducer 0 (v + 0) and 1 (v + 1)
+            //   hour-1 is partitoned to reducer 1 (v + 0) and 2 (v + 1)
+            //
+            // So, here needs further optimization to distribute load of the reducers.
+            //
+            key.set((v << 16) | (roundRobin % mapSidePartitionSplit));
+            roundRobin++;
             return key;
         }
     }
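Note: a worked illustration of the new key encoding, not part of the diff (the class name PartitionKeyEncoding is made up). The bucket value v goes into the high bits and the round-robin counter into the low 16 bits, which is why map_side_partition_split is capped at 65535; the uneven-reducer scenario in the comment above, where one hour reaches two reducers, only arises once the split is 2 or more. How those keys then map onto reducers is up to the job's partitioner, which is the load-balancing caveat the comment points out.

    class PartitionKeyEncoding {
        public static void main(String[] args) {
            int split = 2;            // map_side_partition_split
            long hourBucket = 402900; // some value of v for one hour-sized bucket
            long roundRobin = 0;
            for (int record = 0; record < 4; record++) {
                long key = (hourBucket << 16) | (roundRobin % split);
                // keys alternate between 26404454400 and 26404454401, so the records
                // of a single large hour can be spread over two reducers
                System.out.println("record " + record + " -> key " + key);
                roundRobin++;
            }
        }
    }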
@@ -260,9 +293,9 @@ public class TimestampPartitioning
     static class TimestampPartitioner
             extends AbstractTimestampPartitioner
     {
-        public TimestampPartitioner(Column column, Unit unit)
+        public TimestampPartitioner(Column column, Unit unit, int mapSidePartitionSplit)
         {
-            super(column, unit);
+            super(column, unit, mapSidePartitionSplit);
         }
 
         @Override
@@ -280,9 +313,10 @@ public class TimestampPartitioning
         private final UnixTimestampUnit unixTimestampUnit;
 
         public LongUnixTimestampPartitioner(Column column, Unit unit,
+                int mapSidePartitionSplit,
                 UnixTimestampUnit unixTimestampUnit)
         {
-            super(column, unit);
+            super(column, unit, mapSidePartitionSplit);
             this.unixTimestampUnit = unixTimestampUnit;
         }
 
TestMapReduceExecutor.java CHANGED
@@ -1,8 +1,11 @@
 package org.embulk.executor.mapreduce;
 
 import com.google.common.base.Function;
+import com.google.common.base.Strings;
+import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
 import com.google.inject.Binder;
 import com.google.inject.Module;
 import com.google.inject.Provider;
@@ -20,12 +23,20 @@ import org.junit.Test;
 import org.slf4j.ILoggerFactory;
 import org.slf4j.impl.Log4jLoggerFactory;
 
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
 import java.util.List;
 import java.util.Random;
 
 import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -59,7 +70,13 @@ public class TestMapReduceExecutor
     {
         ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_config.yml");
         embulk.run(config);
-        // TODO compare input and output
+        assertFileContent(
+                Lists.newArrayList(
+                        "fixtures/csv/sample1.csv",
+                        "fixtures/csv/sample1.csv"),
+                Lists.newArrayList(
+                        "fixtures/csv/embulk_mapred_output.000.00.csv",
+                        "fixtures/csv/embulk_mapred_output.001.00.csv"));
     }
 
     @Test
@@ -68,7 +85,13 @@ public class TestMapReduceExecutor
     {
         ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_partitioning_config.yml");
         embulk.run(config);
-        // TODO compare input and output
+        assertFileContent(
+                Lists.newArrayList(
+                        "fixtures/csv/sample1.csv",
+                        "fixtures/csv/sample1.csv"),
+                Lists.newArrayList(
+                        "fixtures/csv/embulk_mapred_partitioning_output.000.00.csv",
+                        "fixtures/csv/embulk_mapred_partitioning_output.001.00.csv"));
     }
 
     @Test
@@ -248,4 +271,64 @@ public class TestMapReduceExecutor
             return bootstrap;
         }
     }
+
+    private static void assertFileContent(List<String> inputFiles, List<String> outputFiles)
+    {
+        List<List<String>> inputRecords = getRecords(inputFiles);
+        Collections.sort(inputRecords, new RecordComparator());
+
+        List<List<String>> outputRecords = getRecords(outputFiles);
+        Collections.sort(outputRecords, new RecordComparator());
+
+        assertEquals(inputRecords, outputRecords);
+    }
+
+    private static class RecordComparator
+            implements Comparator<List<String>>
+    {
+        @Override
+        public int compare(List<String> r1, List<String> r2)
+        {
+            return r1.get(0).compareTo(r2.get(0));
+        }
+    }
+
+    private static List<List<String>> getRecords(List<String> files)
+    {
+        List<List<String>> records = new ArrayList<>();
+
+        try {
+            for (String file : files) {
+                try (BufferedReader r = newReader(file)) {
+                    r.readLine(); // header
+                    records.addAll(getRecords(r)); // contents
+                }
+            }
+        }
+        catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
+
+        return records;
+    }
+
+    private static List<List<String>> getRecords(BufferedReader reader)
+            throws IOException
+    {
+        List<List<String>> records = new ArrayList<>();
+
+        String line;
+        while (!Strings.isNullOrEmpty(line = reader.readLine())) {
+            String[] record = line.split(",");
+            records.add(Lists.newArrayList(record));
+        }
+
+        return records;
+    }
+
+    private static BufferedReader newReader(String filePath)
+    {
+        InputStream in = new BufferedInputStream(TestMapReduceExecutor.class.getClassLoader().getResourceAsStream(filePath));
+        return new BufferedReader(new InputStreamReader(in));
+    }
 }
TestTimestampPartitioning.java CHANGED
@@ -117,8 +117,8 @@ public class TestTimestampPartitioning
         Column c1 = new Column(1, "c1", Types.TIMESTAMP);
         Schema schema = new Schema(Arrays.asList(c0, c1));
 
-        LongUnixTimestampPartitioner lp = new LongUnixTimestampPartitioner(c0, Unit.HOUR, UnixTimestampUnit.SEC);
-        TimestampPartitioner tp = new TimestampPartitioner(c1, Unit.HOUR);
+        LongUnixTimestampPartitioner lp = new LongUnixTimestampPartitioner(c0, Unit.HOUR, 1, UnixTimestampUnit.SEC);
+        TimestampPartitioner tp = new TimestampPartitioner(c1, Unit.HOUR, 1);
 
         long timeWindow = System.currentTimeMillis()/1000/3600*3600;
         PageReader r = new PageReader(schema);
config/embulk_mapred_config.yml CHANGED
@@ -35,4 +35,12 @@ in:
     - {name: d, type: double}
     - {name: flag, type: boolean}
 out:
-  type: stdout
+  type: file
+  path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_output.'
+  file_ext: 'csv'
+  formatter:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    column_options:
+      timestamp: {format: '%Y-%m-%d %H:%M:%S'}
config/embulk_mapred_partitioning_config.yml CHANGED
@@ -16,6 +16,7 @@ exec:
   job_name: embulk_mapred_partitioning_0001
   exclude_jars:
   - '*log4j-over-slf4j*'
+  map_side_partition_split: 2
 in:
   type: file
   path_prefix: src/test/resources/fixtures/csv/sample
@@ -40,4 +41,12 @@ in:
     - {name: d, type: double}
     - {name: flag, type: boolean}
 out:
-  type: stdout
+  type: file
+  path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_partitioning_output.'
+  file_ext: 'csv'
+  formatter:
+    charset: UTF-8
+    newline: CRLF
+    type: csv
+    column_options:
+      timestamp: {format: '%Y-%m-%d %H:%M:%S'}
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-executor-mapreduce
 version: !ruby/object:Gem::Version
-  version: 0.2.3
+  version: 0.2.4
 platform: ruby
 authors:
 - Sadayuki Furuhashi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-11-11 00:00:00.000000000 Z
+date: 2015-12-21 00:00:00.000000000 Z
 dependencies: []
 description: Executes tasks on Hadoop.
 email:
@@ -84,7 +84,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-executor-mapreduce-0.2.3.jar
+- classpath/embulk-executor-mapreduce-0.2.4.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar