RubyGems - embulk-input-hdfs - Versions diffs - 0.1.0 → 0.1.1 - Mend

embulk-input-hdfs 0.1.0 → 0.1.1

Files changed (6) hide show

checksums.yaml +4 -4
data/README.md +34 -0
data/build.gradle +1 -1
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +16 -17
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +6 -6
metadata +3 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
-  data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
+  metadata.gz: b559162cba6af0dd036310522baf9559ac4ebcf3
+  data.tar.gz: 07f1fc7beb1205ba2baf4984c3495a3942514f68
 SHA512:
-  metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
-  data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
+  metadata.gz: 85a7d5b8ba72ed14787881251084edb5e0f59e6424c17377c2545fa9f0e4c95444f3b7b40a61c1b30636d228c6973fec517f4a10be0b6338b3f73b0c8524abd5
+  data.tar.gz: 898da79bf93d26349a4916d5118980921e90829f5b6745e53678d661fb87d632a0ab4e2b9974ca5bf713fae62a58b305d2ea153675de88c5c4a05c2780f3ea8a

data/README.md CHANGED

@@ -15,6 +15,7 @@ Read files on Hdfs.
 - **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use a Date format in the input_path property, the format is evaluated using the current time minus this number of seconds.
 - **partition** when this is true, partition input files and increase task count. (default: `true`)
+- **num_partitions** approximate number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
 ## Example
@@ -32,6 +33,7 @@ in:
   input_path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
   partition: true
+  num_partitions: 30
   decoders:
     - {type: gzip}
   parser:
@@ -53,6 +55,8 @@ in:
 ```
 ## Note
+- The **num_partitions** parameter is an approximate value. The actual number of partitions can be larger than this value.
+  - see: [The Partitioning Logic](#partition_logic)
 - The partitioning feature supports only 3 line terminators.
   - `\n`
   - `\r`
@@ -61,6 +65,36 @@ in:
 ## The Reference Implementation
 - [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
+## <a id="partition_logic">The Partitioning Logic</a>
+```
+int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
+/*
+...
+*/
+    int numPartitions;
+    if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+        // if the file is compressed, skip partitioning.
+        numPartitions = 1;
+    }
+    else if (!task.getPartition()) {
+        // if no partition mode, skip partitioning.
+        numPartitions = 1;
+    }
+    else {
+        // equalize the file size per task as much as possible.
+        numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+    }
+/*
+...
+*/
+```
 ## Build
 ```

data/build.gradle CHANGED

@@ -12,7 +12,7 @@ configurations {
     provided
 }
-version = "0.1.0"
+version = "0.1.1"
 sourceCompatibility = 1.7
 targetCompatibility = 1.7

data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java CHANGED

@@ -55,10 +55,9 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         @ConfigDefault("true")
         public boolean getPartition();
-        // this parameter is experimental.
-        @Config("partition_level")
-        @ConfigDefault("3")
-        public int getPartitonLevel();
+        @Config("num_partitions") // this parameter is the approximate value.
+        @ConfigDefault("-1")      // Default: Runtime.getRuntime().availableProcessors()
+        public int getApproximateNumPartitions();
         public List<HdfsPartialFile> getFiles();
         public void setFiles(List<HdfsPartialFile> hdfsFiles);
@@ -235,30 +234,30 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         }
         // TODO: optimum allocation of resources
-        int partitionCountParameter = task.getPartitonLevel();
-        int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+        int approximateNumPartitions =
+                (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
+        int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         for (Path path : pathList) {
-            int partitionCount;
+            int fileLength = (int) fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
+            if (fileLength <= 0) {
+                logger.info("Skip the 0 byte target file: {}", path);
+                continue;
+            }
+            int numPartitions;
             if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else if (!task.getPartition()) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else {
-                int fileLength = (int) fs.getFileStatus(path).getLen();
-                partitionCount = fileLength / partitionSizeByOneTask;
-                int remainder = fileLength % partitionSizeByOneTask;
-                if (remainder > 0) {
-                    partitionCount++;
-                }
+                numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
             }
-            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
             hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
         }

data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java CHANGED

@@ -14,22 +14,22 @@ public class HdfsFilePartitioner
 {
     private FileSystem fs;
     private Path path;
-    private int partitionCount;
+    private int numPartitions;
-    public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+    public HdfsFilePartitioner(FileSystem fs, Path path, int numPartitions)
     {
         this.fs = fs;
         this.path = path;
-        this.partitionCount = partitionCount;
+        this.numPartitions = numPartitions;
     }
     public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
     {
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         long size = fs.getFileStatus(path).getLen();
-        for (int i = 0; i < partitionCount; i++) {
-            long start = size * i / partitionCount;
-            long end = size * (i + 1) / partitionCount;
+        for (int i = 0; i < numPartitions; i++) {
+            long start = size * i / numPartitions;
+            long end = size * (i + 1) / numPartitions;
             if (start < end) {
                 hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
             }

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-08 00:00:00.000000000 Z
+date: 2015-09-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -82,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.0.jar
+- classpath/embulk-input-hdfs-0.1.1.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar