embulk-input-hdfs 0.1.9 → 0.2.1

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ce120e7049f33e30dd23af9f8b7bcedc1a246457
- data.tar.gz: a2dc70fee60be2ab535df3549e99304e751a7b7a
+ metadata.gz: e666bbbcb18941dce84889c2ee7fb85d65edbaf4
+ data.tar.gz: 7422b508396787d70e6cea3fc534739c2c20c825
  SHA512:
- metadata.gz: a37baf6f948dff41f694457dc9ea9ea9270e41473642114d4dc7a569c61550471b9dbc440478c638fe56ba79956f043097e2129302d3ae12511bdc9d33cef994
- data.tar.gz: 16922c84dcdb9715cb1b0377886b36192acdda31a037352e18df83895f33b09a9f275cd02b9662f02ee411725a6dae65950cfc256c707f639312810839018037
+ metadata.gz: c305947dbd3f6bded0a23fbc06efd4d44e6d48cdb4b97c8b0e3861cd4b2a9800f6d8c93cf5280ccb235ca88346e727bb5fb549ae3c7bb2e12a13205e20765085
+ data.tar.gz: 8f33bb06731a3c5a25dd723bef83616992ce5fc8b8d5e1a60d8a1da56421a42b49ae3397feb24134a093bf291af87ddbd208fa866c86fdd997d824a6077434a4
data/CHENGELOG.md ADDED
@@ -0,0 +1,7 @@
+ 0.2.1 (2016-02-25)
+ ==================
+ - [Fix] does not work
+
+ 0.2.0 (2016-02-15)
+ ==================
+ - [Add] `decompression` option
data/README.md CHANGED
@@ -14,11 +14,12 @@ Read files on Hdfs.

  - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
  - **config** overwrites configuration parameters (hash, default: `{}`)
- - **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
- - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
- - **partition** when this is true, partition input files and increase task count. (default: `true`)
- - **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
- - **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (default: `0`)
+ - **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s` (string, required).
+ - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property. (long, default: `0`)
+ - **partition** when this is true, partition input files and increase task count. (boolean, default: `true`)
+ - **num_partitions** number of partitions. (long, default: `Runtime.getRuntime().availableProcessors()`)
+ - **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (long, default: `0`)
+ - **decompression** Decompress compressed files by hadoop compression codec api. (boolean. default: `false`)

  ## Example

@@ -77,18 +78,20 @@ int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
  ...
  */

- int numPartitions;
- if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
- // if the file is compressed, skip partitioning.
- numPartitions = 1;
+ long numPartitions;
+ if (task.getPartition()) {
+ if (file.canDecompress()) {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
+ else if (file.getCodec() != null) { // if not null, the file is compressed.
+ numPartitions = 1;
+ }
+ else {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
  }
- else if (!task.getPartition()) {
- // if no partition mode, skip partitioning.
- numPartitions = 1;
- }
  else {
- // equalize the file size per task as much as possible.
- numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ numPartitions = 1;
  }

  /*
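
As a standalone illustration of the partition rule the updated README snippet describes (only split when `partition` is true, and treat a compressed file that will not be decompressed as a single partition), here is a minimal, hypothetical sketch; the method and its parameters are illustrative and not part of the plugin:

```java
// Hypothetical sketch of the 0.2.x partition-count rule described above; not the plugin's code.
public class PartitionCountSketch
{
    static long numPartitions(long fileLength, long partitionSizeByOneTask,
                              boolean partition, boolean compressed, boolean willDecompress)
    {
        if (!partition) {
            return 1;                           // partitioning disabled: one partition per file
        }
        if (compressed && !willDecompress) {
            return 1;                           // compressed and kept compressed: cannot be split
        }
        // plain files, or compressed files that will be decompressed, are split evenly
        return ((fileLength - 1) / partitionSizeByOneTask) + 1;
    }

    public static void main(String[] args)
    {
        // e.g. a 100 MB plain file with ~25 MB per task -> 4 partitions
        System.out.println(numPartitions(100_000_000L, 25_000_000L, true, false, false));
        // a gzip file that is not decompressed -> 1 partition
        System.out.println(numPartitions(100_000_000L, 25_000_000L, true, true, false));
    }
}
```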
data/build.gradle CHANGED
@@ -15,7 +15,7 @@ configurations {
  provided
  }

- version = "0.1.9"
+ version = "0.2.1"

  sourceCompatibility = 1.7
  targetCompatibility = 1.7
data/example/config.yml CHANGED
@@ -12,11 +12,14 @@ local_fs_example: &local_fs_example
  fs.defaultFS: 'file:///'
  fs.hdfs.impl: 'org.apache.hadoop.fs.LocalFileSystem'
  fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+ io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'

  in:
  type: hdfs
  <<: *local_fs_example
- path: example/data.csv
+ path: example/data*
+ skip_header_lines: 1
+ decompression: true
  parser:
  charset: UTF-8
  newline: CRLF
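
The `io.compression.codecs` entry and `decompression: true` above rely on Hadoop resolving a codec from the file name; a minimal sketch of that lookup (the file name is a placeholder), using the same `CompressionCodecFactory` call the plugin uses internally:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

// Sketch: resolve a compression codec from a path's extension.
public class CodecLookupSketch
{
    public static void main(String[] args)
    {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);

        // Returns GzipCodec for *.gz, BZip2Codec for *.bz2, or null for a plain file.
        CompressionCodec codec = factory.getCodec(new Path("example/data.csv.gz"));
        System.out.println(codec == null ? "no codec (plain file)" : codec.getClass().getName());
    }
}
```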
Binary file
data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java ADDED
@@ -0,0 +1,82 @@
+ package org.embulk.input.hdfs;
+
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.ImmutableMap;
+ import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
+ import org.apache.hadoop.conf.Configuration;
+ import org.embulk.config.ConfigException;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import java.io.File;
+ import java.net.MalformedURLException;
+ import java.util.List;
+ import java.util.Map;
+
+ /**
+ * Created by takahiro.nakayama on 2/22/16.
+ */
+ public class ConfigurationBuilder
+ {
+ private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
+ private final ImmutableList.Builder<String> configFilesBuilder;
+ private final ImmutableMap.Builder<String, String> configMapBuilder;
+
+ public ConfigurationBuilder()
+ {
+ this.configFilesBuilder = ImmutableList.builder();
+ this.configMapBuilder = ImmutableMap.builder();
+ }
+
+ public ConfigurationBuilder addConfigFiles(List<String> configFiles)
+ {
+ for (String configFile : configFiles) {
+ addConfigFile(configFile);
+ }
+ return this;
+ }
+
+ public ConfigurationBuilder addConfigFile(String configFile)
+ {
+ configFilesBuilder.add(configFile);
+ return this;
+ }
+
+ public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
+ {
+ for (Map.Entry<String, String> entry : configMap.entrySet()) {
+ addConfig(entry.getKey(), entry.getValue());
+ }
+ return this;
+ }
+
+ public ConfigurationBuilder addConfig(String key, String value)
+ {
+ configMapBuilder.put(key, value);
+ return this;
+ }
+
+ public Configuration build()
+ {
+ Configuration configuration = new Configuration();
+ for (String configFile : configFilesBuilder.build()) {
+ File file = new File(configFile);
+ try {
+ configuration.addResource(file.toURI().toURL());
+ }
+ catch (MalformedURLException e) {
+ throw new ConfigException(e);
+ }
+ }
+ for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
+ configuration.set(entry.getKey(), entry.getValue());
+ }
+ // For debug
+ for (Map.Entry<String, String> entry : configuration) {
+ logger.trace("{}: {}", entry.getKey(), entry.getValue());
+ }
+ logger.trace("Resource Files: {}", configuration);
+ return configuration;
+ }
+ }
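
A hypothetical usage of the `ConfigurationBuilder` added above, mirroring how the plugin assembles a Hadoop `Configuration` from `config_files` and `config`; the file paths and the `fs.defaultFS` value are placeholders:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.embulk.input.hdfs.ConfigurationBuilder;

import java.io.IOException;
import java.util.Arrays;

public class ConfigurationBuilderUsageSketch
{
    public static void main(String[] args) throws IOException
    {
        // Resource files are added first; explicit "config" entries are set afterwards and win.
        Configuration conf = new ConfigurationBuilder()
                .addConfigFiles(Arrays.asList(
                        "/etc/hadoop/conf/core-site.xml",     // placeholder path
                        "/etc/hadoop/conf/hdfs-site.xml"))    // placeholder path
                .addConfig("fs.defaultFS", "hdfs://namenode.example.com:8020")  // placeholder
                .build();

        FileSystem fs = FileSystem.get(conf);  // same call the plugin's getFS() makes
        System.out.println(fs.getUri());
    }
}
```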
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java CHANGED
@@ -1,12 +1,18 @@
  package org.embulk.input.hdfs;

- import com.google.common.base.Function;
+ import com.google.common.annotations.VisibleForTesting;
+ import com.google.common.base.Optional;
+ import com.google.common.base.Throwables;
  import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileStatus;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.fs.PathIOException;
  import org.apache.hadoop.fs.PathNotFoundException;
+ import org.apache.hadoop.io.compress.CompressionCodec;
+ import org.apache.hadoop.io.compress.CompressionCodecFactory;
  import org.embulk.config.Config;
  import org.embulk.config.ConfigDefault;
  import org.embulk.config.ConfigDiff;
@@ -19,261 +25,142 @@ import org.embulk.spi.BufferAllocator;
  import org.embulk.spi.Exec;
  import org.embulk.spi.FileInputPlugin;
  import org.embulk.spi.TransactionalFileInput;
- import org.embulk.spi.util.InputStreamTransactionalFileInput;
+ import org.embulk.spi.util.InputStreamFileInput;
  import org.jruby.embed.ScriptingContainer;
  import org.slf4j.Logger;

- import javax.annotation.Nullable;
-
- import java.io.BufferedInputStream;
- import java.io.ByteArrayInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.File;
  import java.io.IOException;
  import java.io.InputStream;
- import java.io.SequenceInputStream;
- import java.util.ArrayList;
+ import java.util.Iterator;
  import java.util.List;
  import java.util.Map;

  public class HdfsFileInputPlugin
  implements FileInputPlugin
  {
- private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
- private static FileSystem fs;
-
  public interface PluginTask
- extends Task
+ extends Task, PartialFileList.Task
  {
  @Config("config_files")
  @ConfigDefault("[]")
- public List<String> getConfigFiles();
+ List<String> getConfigFiles();

  @Config("config")
  @ConfigDefault("{}")
- public Map<String, String> getConfig();
+ Map<String, String> getConfig();

  @Config("path")
- public String getPath();
+ String getPath();

  @Config("rewind_seconds")
  @ConfigDefault("0")
- public int getRewindSeconds();
+ int getRewindSeconds();

  @Config("partition")
  @ConfigDefault("true")
- public boolean getPartition();
+ boolean getPartition();

  @Config("num_partitions") // this parameter is the approximate value.
  @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
- public long getApproximateNumPartitions();
+ long getApproximateNumPartitions();

  @Config("skip_header_lines") // Skip this number of lines first. Set 1 if the file has header line.
  @ConfigDefault("0") // The reason why the parameter is configured is that this plugin splits files.
- public int getSkipHeaderLines();
+ int getSkipHeaderLines();

- public List<HdfsPartialFile> getFiles();
+ @Config("decompression") // if true, decompress files by using compression codec
+ @ConfigDefault("false") // when getting FileInputStream.
+ boolean getDecompression();

- public void setFiles(List<HdfsPartialFile> hdfsFiles);
+ PartialFileList getPartialFileList();
+ void setPartialFileList(PartialFileList partialFileList);

  @ConfigInject
- public BufferAllocator getBufferAllocator();
+ ScriptingContainer getJRuby();
+
+ @ConfigInject
+ BufferAllocator getBufferAllocator();
  }

+ private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+ private Optional<Configuration> configurationContainer = Optional.absent();
+
  @Override
  public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
  {
  PluginTask task = config.loadConfig(PluginTask.class);
+ Configuration configuration = getConfiguration(task);

  // listing Files
- String pathString = strftime(task.getPath(), task.getRewindSeconds());
  try {
- List<String> originalFileList = buildFileList(getFs(task), pathString);
+ FileSystem fs = getFS(configuration);
+
+ String pathString = strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+ Path rootPath = new Path(pathString);
+
+ List<Path> originalFileList = buildOriginalFileList(fs, rootPath);

  if (originalFileList.isEmpty()) {
  throw new PathNotFoundException(pathString);
  }

  logger.debug("embulk-input-hdfs: Loading target files: {}", originalFileList);
- task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
+ PartialFileList list = buildPartialFileList(task, originalFileList);
+ task.setPartialFileList(list);
  }
  catch (IOException e) {
  logger.error(e.getMessage());
  throw new RuntimeException(e);
  }

- // log the detail of partial files.
- for (HdfsPartialFile partialFile : task.getFiles()) {
- logger.debug("embulk-input-hdfs: target file: {}, start: {}, end: {}",
- partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
- }
-
  // number of processors is same with number of targets
- int taskCount = task.getFiles().size();
+ int taskCount = task.getPartialFileList().getTaskCount();
  logger.info("embulk-input-hdfs: task size: {}", taskCount);

  return resume(task.dump(), taskCount, control);
  }

- @Override
- public ConfigDiff resume(TaskSource taskSource,
- int taskCount,
- FileInputPlugin.Control control)
+ private Configuration getConfiguration(PluginTask task)
  {
- control.run(taskSource, taskCount);
-
- ConfigDiff configDiff = Exec.newConfigDiff();
-
- // usually, yo use last_path
- //if (task.getFiles().isEmpty()) {
- // if (task.getLastPath().isPresent()) {
- // configDiff.set("last_path", task.getLastPath().get());
- // }
- //} else {
- // List<String> files = new ArrayList<String>(task.getFiles());
- // Collections.sort(files);
- // configDiff.set("last_path", files.get(files.size() - 1));
- //}
-
- return configDiff;
- }
+ if (configurationContainer.isPresent()) {
+ return configurationContainer.get();
+ }

- @Override
- public void cleanup(TaskSource taskSource,
- int taskCount,
- List<TaskReport> successTaskReports)
- {
+ ConfigurationBuilder builder = new ConfigurationBuilder();
+ builder.addConfigFiles(task.getConfigFiles());
+ builder.addConfigMap(task.getConfig());
+ configurationContainer = Optional.of(builder.build());
+ return configurationContainer.get();
  }

- @Override
- public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+ private FileSystem getFS(Configuration configuration)
  {
- final PluginTask task = taskSource.loadTask(PluginTask.class);
-
- InputStream input;
- final HdfsPartialFile file = task.getFiles().get(taskIndex);
  try {
- if (file.getStart() > 0 && task.getSkipHeaderLines() > 0) {
- input = new SequenceInputStream(getHeadersInputStream(task, file), openInputStream(task, file));
- }
- else {
- input = openInputStream(task, file);
- }
+ return FileSystem.get(configuration);
  }
  catch (IOException e) {
- logger.error(e.getMessage());
- throw new RuntimeException(e);
+ throw Throwables.propagate(e);
  }
-
- return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input)
- {
- @Override
- public void abort()
- { }
-
- @Override
- public TaskReport commit()
- {
- return Exec.newTaskReport();
- }
- };
  }

- private InputStream getHeadersInputStream(PluginTask task, HdfsPartialFile partialFile)
- throws IOException
+ @VisibleForTesting
+ String strftime(final ScriptingContainer jruby, final String format, final int rewindSeconds)
  {
- FileSystem fs = getFs(task);
- ByteArrayOutputStream header = new ByteArrayOutputStream();
- int skippedHeaders = 0;
-
- try (BufferedInputStream in = new BufferedInputStream(fs.open(new Path(partialFile.getPath())))) {
- while (true) {
- int c = in.read();
- if (c < 0) {
- break;
- }
-
- header.write(c);
-
- if (c == '\n') {
- skippedHeaders++;
- }
- else if (c == '\r') {
- int c2 = in.read();
- if (c2 == '\n') {
- header.write(c2);
- }
- skippedHeaders++;
- }
-
- if (skippedHeaders >= task.getSkipHeaderLines()) {
- break;
- }
- }
- }
- header.close();
- return new ByteArrayInputStream(header.toByteArray());
+ String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
+ return jruby.runScriptlet(script).toString();
  }

- private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
- throws IOException
+ private List<Path> buildOriginalFileList(FileSystem fs, Path rootPath)
  {
- FileSystem fs = getFs(task);
- InputStream original = fs.open(new Path(partialFile.getPath()));
- return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
- }
+ List<Path> fileList = Lists.newArrayList();

- private static FileSystem getFs(final PluginTask task)
- throws IOException
- {
- if (fs == null) {
- setFs(task);
- return fs;
- }
- else {
- return fs;
- }
- }
-
- private static FileSystem setFs(final PluginTask task)
- throws IOException
- {
- Configuration configuration = new Configuration();
-
- for (String configFile : task.getConfigFiles()) {
- File file = new File(configFile);
- configuration.addResource(file.toURI().toURL());
- }
-
- for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
- configuration.set(entry.getKey(), entry.getValue());
+ final FileStatus[] entries;
+ try {
+ entries = fs.globStatus(rootPath);
  }
-
- // For debug
- for (Map.Entry<String, String> entry : configuration) {
- logger.trace("{}: {}", entry.getKey(), entry.getValue());
+ catch (IOException e) {
+ throw Throwables.propagate(e);
  }
- logger.debug("Resource Files: {}", configuration);
-
- fs = FileSystem.get(configuration);
- return fs;
- }
-
- private String strftime(final String raw, final int rewindSeconds)
- {
- ScriptingContainer jruby = new ScriptingContainer();
- Object resolved = jruby.runScriptlet(
- String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewindSeconds), raw));
- return resolved.toString();
- }
-
- private List<String> buildFileList(final FileSystem fs, final String pathString)
- throws IOException
- {
- List<String> fileList = new ArrayList<>();
- Path rootPath = new Path(pathString);
-
- final FileStatus[] entries = fs.globStatus(rootPath);
  // `globStatus` does not throw PathNotFoundException.
  // return null instead.
  // see: https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java#L286
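
The reworked `strftime()` above hands path expansion to the injected JRuby `ScriptingContainer`; a rough standalone sketch of the same scriptlet, with the format string and rewind value as example inputs:

```java
import org.jruby.embed.ScriptingContainer;

// Sketch: resolve a strftime-style path the way strftime(jruby, format, rewindSeconds) does.
public class StrftimeSketch
{
    public static void main(String[] args)
    {
        ScriptingContainer jruby = new ScriptingContainer();
        int rewindSeconds = 86400;                 // look one day back
        String format = "/logs/access/%Y%m%d/*";   // example path pattern

        String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
        String resolved = jruby.runScriptlet(script).toString();
        System.out.println(resolved);              // e.g. /logs/access/20160224/*
    }
}
```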
@@ -283,80 +170,229 @@ public class HdfsFileInputPlugin

  for (FileStatus entry : entries) {
  if (entry.isDirectory()) {
- fileList.addAll(lsr(fs, entry));
+ List<Path> subEntries = listRecursive(fs, entry);
+ fileList.addAll(subEntries);
  }
  else {
- fileList.add(entry.getPath().toString());
+ fileList.add(entry.getPath());
  }
  }

  return fileList;
  }

- private List<String> lsr(final FileSystem fs, FileStatus status)
- throws IOException
+ private List<Path> listRecursive(final FileSystem fs, FileStatus status)
  {
- List<String> fileList = new ArrayList<>();
+ List<Path> fileList = Lists.newArrayList();
  if (status.isDirectory()) {
- for (FileStatus entry : fs.listStatus(status.getPath())) {
- fileList.addAll(lsr(fs, entry));
+ FileStatus[] entries;
+ try {
+ entries = fs.listStatus(status.getPath());
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+
+ for (FileStatus entry : entries) {
+ fileList.addAll(listRecursive(fs, entry));
  }
  }
  else {
- fileList.add(status.getPath().toString());
+ fileList.add(status.getPath());
  }
  return fileList;
  }

- private List<HdfsPartialFile> allocateHdfsFilesToTasks(final PluginTask task, final FileSystem fs, final List<String> fileList)
- throws IOException
+ private PartialFileList buildPartialFileList(PluginTask task, List<Path> pathList)
  {
- List<Path> pathList = Lists.transform(fileList, new Function<String, Path>()
- {
- @Nullable
- @Override
- public Path apply(@Nullable String input)
- {
- return new Path(input);
- }
- });
+ Configuration configuration = getConfiguration(task);
+ FileSystem fs = getFS(configuration);
+ boolean shouldPartition = task.getPartition();
+ boolean shouldDecompress = task.getDecompression();

+ Map<Path, Long> pathLengthMap = Maps.newHashMap();
  long totalFileLength = 0;
  for (Path path : pathList) {
- totalFileLength += fs.getFileStatus(path).getLen();
+ long fileLength = getHdfsFileLength(fs, path, shouldDecompress);
+
+ if (fileLength <= 0) {
+ logger.info("Skip the 0 byte target file: {}", path);
+ continue;
+ }
+
+ pathLengthMap.put(path, fileLength);
+ totalFileLength += fileLength;
  }
+ if (totalFileLength <= 0) {
+ throw Throwables.propagate(new PathIOException(task.getPath(), "All files are empty"));
+ }
+
+ PartialFileList.Builder builder = new PartialFileList.Builder(task);

  // TODO: optimum allocation of resources
- long approximateNumPartitions =
- (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
+ final long approximateNumPartitions;
+ if (task.getApproximateNumPartitions() <= 0) {
+ approximateNumPartitions = Runtime.getRuntime().availableProcessors();
+ }
+ else {
+ approximateNumPartitions = task.getApproximateNumPartitions();
+ }
+
  long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
  if (partitionSizeByOneTask <= 0) {
  partitionSizeByOneTask = 1;
  }

- List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
- for (Path path : pathList) {
- long fileLength = fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
- if (fileLength <= 0) {
- logger.info("embulk-input-hdfs: Skip the 0 byte target file: {}", path);
- continue;
- }
+ for (Map.Entry<Path, Long> entry : pathLengthMap.entrySet()) {
+ Path path = entry.getKey();
+ long fileLength = entry.getValue();

  long numPartitions;
- if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
- numPartitions = 1;
+ if (shouldPartition) {
+ if (shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null) {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
+ else if (getHdfsFileCompressionCodec(fs, path) != null) { // if not null, the file is compressed.
+ numPartitions = 1;
+ }
+ else {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
  }
- else if (!task.getPartition()) {
+ else {
  numPartitions = 1;
  }
- else {
- numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+
+ for (long i = 0; i < numPartitions; i++) {
+ long start = fileLength * i / numPartitions;
+ long end = fileLength * (i + 1) / numPartitions;
+ if (start < end) {
+ logger.debug("PartialFile: path {}, start: {}, end: {}", path, start, end);
+ builder.add(path.toString(), start, end, shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null);
+ }
  }
+ }
+
+ return builder.build();
+ }

- HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
- hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
+ private Long getHdfsFileLength(FileSystem fs, Path path, boolean shouldDecompression)
+ {
+ CompressionCodec codec = getHdfsFileCompressionCodec(fs, path);
+ if (codec == null) {
+ try {
+ return fs.getFileStatus(path).getLen();
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+ }
+ else if (!shouldDecompression) {
+ try {
+ return fs.getFileStatus(path).getLen();
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
  }
+ else {
+ long fileLength = 0;
+ try (InputStream is = codec.createInputStream(fs.open(path))) {
+ while (is.read() > 0) {
+ fileLength++;
+ }
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+ return fileLength;
+ }
+ }

- return hdfsPartialFiles;
+ private CompressionCodec getHdfsFileCompressionCodec(FileSystem fs, Path path)
+ {
+ return getHdfsFileCompressionCodec(fs.getConf(), path);
+ }
+
+ private CompressionCodec getHdfsFileCompressionCodec(Configuration configuration, Path path)
+ {
+ return new CompressionCodecFactory(configuration).getCodec(path);
+ }
+
+ @Override
+ public ConfigDiff resume(TaskSource taskSource,
+ int taskCount,
+ FileInputPlugin.Control control)
+ {
+ control.run(taskSource, taskCount);
+ ConfigDiff configDiff = Exec.newConfigDiff();
+ return configDiff;
+ }
+
+ @Override
+ public void cleanup(TaskSource taskSource,
+ int taskCount,
+ List<TaskReport> successTaskReports)
+ {
+ }
+
+ @Override
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+ {
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
+ return new HdfsFileInput(task, taskIndex);
+ }
+
+ public class HdfsFileInput
+ extends InputStreamFileInput
+ implements TransactionalFileInput
+ {
+
+ public HdfsFileInput(PluginTask task, int taskIndex)
+ {
+ super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
+ }
+
+ @Override
+ public void abort()
+ {
+ }
+
+ @Override
+ public TaskReport commit()
+ {
+ return Exec.newTaskReport();
+ }
+ }
+
+ // TODO create single-file InputStreamFileInput utility
+ private class SingleFileProvider
+ implements InputStreamFileInput.Provider
+ {
+ private final FileSystem fs;
+ private final int numHeaderLines;
+ private final Iterator<PartialFile> iterator;
+
+ public SingleFileProvider(PluginTask task, int taskIndex)
+ {
+ this.fs = getFS(getConfiguration(task));
+ this.numHeaderLines = task.getSkipHeaderLines();
+ this.iterator = task.getPartialFileList().get(taskIndex).iterator();
+ }
+
+ @Override
+ public InputStream openNext() throws IOException
+ {
+ if (!iterator.hasNext()) {
+ return null;
+ }
+ PartialFileInputStreamBuilder builder = new PartialFileInputStreamBuilder(fs, iterator.next()).withHeaders(numHeaderLines);
+ return builder.build();
+ }
+
+ @Override
+ public void close()
+ {
+ }
  }
  }
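
The loop at the end of `buildPartialFileList()` above splits each file into contiguous byte ranges; a minimal sketch of that start/end arithmetic with an example length and partition count:

```java
// Sketch of the start/end computation used when registering partial files above.
public class PartialRangeSketch
{
    public static void main(String[] args)
    {
        long fileLength = 1000;     // example file size in bytes
        long numPartitions = 3;     // example partition count for this file

        for (long i = 0; i < numPartitions; i++) {
            long start = fileLength * i / numPartitions;
            long end = fileLength * (i + 1) / numPartitions;
            if (start < end) {
                // prints 0-333, 333-666, 666-1000: contiguous, non-overlapping ranges
                System.out.println("start: " + start + ", end: " + end);
            }
        }
    }
}
```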