embulk-input-hdfs 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c428003a976a3148f1b59e7dd54c7ec870ed3bce
|
4
|
+
data.tar.gz: a2941dce02f97452b54938bf73b5d1dceba5f4c5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02ca4fc8c3c82571296eb8da1d3a61533d0017d49c2712886e0872fbebf58ae8656412c61e3b6f1d6aaf71078ad1e886029a91907df92e236b2c56d3cbbb6083
|
7
|
+
data.tar.gz: f218db27f48822f33427ac0ebcccf5c58c41cdec92486cf9ceb1ddb240f5332744b8d65daf6a672e18488aa675c597ef56d34c994b7c04fe218a5e9c3428c91d
|
data/build.gradle
CHANGED
@@ -58,7 +58,7 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
58
58
|
|
59
59
|
@Config("num_partitions") // this parameter is the approximate value.
|
60
60
|
@ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
|
61
|
-
public int getApproximateNumPartitions();
|
61
|
+
public long getApproximateNumPartitions();
|
62
62
|
|
63
63
|
public List<HdfsPartialFile> getFiles();
|
64
64
|
public void setFiles(List<HdfsPartialFile> hdfsFiles);
|
@@ -246,25 +246,25 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
246
246
|
}
|
247
247
|
});
|
248
248
|
|
249
|
-
|
249
|
+
long totalFileLength = 0;
|
250
250
|
for (Path path : pathList) {
|
251
251
|
totalFileLength += fs.getFileStatus(path).getLen();
|
252
252
|
}
|
253
253
|
|
254
254
|
// TODO: optimum allocation of resources
|
255
|
-
|
255
|
+
long approximateNumPartitions =
|
256
256
|
(task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
|
257
|
-
|
257
|
+
long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
|
258
258
|
|
259
259
|
List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
|
260
260
|
for (Path path : pathList) {
|
261
|
-
|
261
|
+
long fileLength = fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
|
262
262
|
if (fileLength <= 0) {
|
263
263
|
logger.info("embulk-input-hdfs: Skip the 0 byte target file: {}", path);
|
264
264
|
continue;
|
265
265
|
}
|
266
266
|
|
267
|
-
|
267
|
+
long numPartitions;
|
268
268
|
if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
|
269
269
|
numPartitions = 1;
|
270
270
|
}
|
@@ -14,9 +14,9 @@ public class HdfsFilePartitioner
|
|
14
14
|
{
|
15
15
|
private FileSystem fs;
|
16
16
|
private Path path;
|
17
|
-
private int numPartitions;
|
17
|
+
private long numPartitions;
|
18
18
|
|
19
|
-
public HdfsFilePartitioner(FileSystem fs, Path path, int numPartitions)
|
19
|
+
public HdfsFilePartitioner(FileSystem fs, Path path, long numPartitions)
|
20
20
|
{
|
21
21
|
this.fs = fs;
|
22
22
|
this.path = path;
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-hdfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.7
|
4
|
+
version: 0.1.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- takahiro.nakayama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-12-
|
11
|
+
date: 2015-12-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -82,7 +82,7 @@ files:
|
|
82
82
|
- classpath/curator-client-2.6.0.jar
|
83
83
|
- classpath/curator-framework-2.6.0.jar
|
84
84
|
- classpath/curator-recipes-2.6.0.jar
|
85
|
-
- classpath/embulk-input-hdfs-0.1.7.jar
|
85
|
+
- classpath/embulk-input-hdfs-0.1.8.jar
|
86
86
|
- classpath/gson-2.2.4.jar
|
87
87
|
- classpath/hadoop-annotations-2.6.0.jar
|
88
88
|
- classpath/hadoop-auth-2.6.0.jar
|