RubyGems - embulk-input-hdfs - Versions diffs - 0.1.0 → 0.1.1 - Mend

embulk-input-hdfs 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml +4 -4
data/README.md +34 -0
data/build.gradle +1 -1
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +16 -17
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +6 -6
metadata +3 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
-  data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
+  metadata.gz: b559162cba6af0dd036310522baf9559ac4ebcf3
+  data.tar.gz: 07f1fc7beb1205ba2baf4984c3495a3942514f68
 SHA512:
-  metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
-  data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
+  metadata.gz: 85a7d5b8ba72ed14787881251084edb5e0f59e6424c17377c2545fa9f0e4c95444f3b7b40a61c1b30636d228c6973fec517f4a10be0b6338b3f73b0c8524abd5
+  data.tar.gz: 898da79bf93d26349a4916d5118980921e90829f5b6745e53678d661fb87d632a0ab4e2b9974ca5bf713fae62a58b305d2ea153675de88c5c4a05c2780f3ea8a

data/README.md CHANGED

@@ -15,6 +15,7 @@ Read files on Hdfs.
 - **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
 - **partition** when this is true, partition input files and increase task count. (default: `true`)
+- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
 ## Example
@@ -32,6 +33,7 @@ in:
   input_path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
   partition: true
+  num_partitions: 30
   decoders:
     - {type: gzip}
   parser:
@@ -53,6 +55,8 @@ in:
 ```
 ## Note
+- The parameter **num_partitions** is the approximate value. The actual num_partitions is larger than this parameter.
+  - see: [The Partitioning Logic](#partition_logic)
 - the feature of the partition supports only 3 line terminators.
   - `\n`
   - `\r`
@@ -61,6 +65,36 @@ in:
 ## The Reference Implementation
 - [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
+##<a id="partition_logic">The Partitioning Logic</a>
+```
+int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
+/*
+...
+*/
+    int numPartitions;
+    if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+        // if the file is compressed, skip partitioning.
+        numPartitions = 1;
+    }
+    else if (!task.getPartition()) {
+        // if no partition mode, skip partitioning.
+        numPartitions = 1;
+    }
+    else {
+        // equalize the file size per task as much as possible.
+        numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+    }
+/*
+...
+*/
+```
 ## Build
 ```

data/build.gradle CHANGED

@@ -12,7 +12,7 @@ configurations {
     provided
 }
-version = "0.1.0"
+version = "0.1.1"
 sourceCompatibility = 1.7
 targetCompatibility = 1.7

data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java CHANGED

@@ -55,10 +55,9 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         @ConfigDefault("true")
         public boolean getPartition();
-        // this parameter is experimental.
-        @Config("partition_level")
-        @ConfigDefault("3")
-        public int getPartitonLevel();
+        @Config("num_partitions") // this parameter is the approximate value.
+        @ConfigDefault("-1")      // Default: Runtime.getRuntime().availableProcessors()
+        public int getApproximateNumPartitions();
         public List<HdfsPartialFile> getFiles();
         public void setFiles(List<HdfsPartialFile> hdfsFiles);
@@ -235,30 +234,30 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         }
         // TODO: optimum allocation of resources
-        int partitionCountParameter = task.getPartitonLevel();
-        int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+        int approximateNumPartitions =
+                (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
+        int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         for (Path path : pathList) {
-            int partitionCount;
+            int fileLength = (int) fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
+            if (fileLength <= 0) {
+                logger.info("Skip the 0 byte target file: {}", path);
+                continue;
+            }
+            int numPartitions;
             if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else if (!task.getPartition()) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else {
-                int fileLength = (int) fs.getFileStatus(path).getLen();
-                partitionCount = fileLength / partitionSizeByOneTask;
-                int remainder = fileLength % partitionSizeByOneTask;
-                if (remainder > 0) {
-                    partitionCount++;
-                }
+                numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
             }
-            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
             hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
         }

data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java CHANGED

@@ -14,22 +14,22 @@ public class HdfsFilePartitioner
 {
     private FileSystem fs;
     private Path path;
-    private int partitionCount;
+    private int numPartitions;
-    public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+    public HdfsFilePartitioner(FileSystem fs, Path path, int numPartitions)
     {
         this.fs = fs;
         this.path = path;
-        this.partitionCount = partitionCount;
+        this.numPartitions = numPartitions;
     }
     public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
     {
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         long size = fs.getFileStatus(path).getLen();
-        for (int i = 0; i < partitionCount; i++) {
-            long start = size * i / partitionCount;
-            long end = size * (i + 1) / partitionCount;
+        for (int i = 0; i < numPartitions; i++) {
+            long start = size * i / numPartitions;
+            long end = size * (i + 1) / numPartitions;
             if (start < end) {
                 hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
             }

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-08 00:00:00.000000000 Z
+date: 2015-09-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -82,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.0.jar
+- classpath/embulk-input-hdfs-0.1.1.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar