embulk-input-hdfs 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b559162cba6af0dd036310522baf9559ac4ebcf3
|
4
|
+
data.tar.gz: 07f1fc7beb1205ba2baf4984c3495a3942514f68
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 85a7d5b8ba72ed14787881251084edb5e0f59e6424c17377c2545fa9f0e4c95444f3b7b40a61c1b30636d228c6973fec517f4a10be0b6338b3f73b0c8524abd5
|
7
|
+
data.tar.gz: 898da79bf93d26349a4916d5118980921e90829f5b6745e53678d661fb87d632a0ab4e2b9974ca5bf713fae62a58b305d2ea153675de88c5c4a05c2780f3ea8a
|
data/README.md
CHANGED
@@ -15,6 +15,7 @@ Read files on Hdfs.
|
|
15
15
|
- **input_path** file path on HDFS. You can use glob patterns and date formats like `%Y%m%d/%s`.
|
16
16
|
- **rewind_seconds** When you use a date format in the input_path property, the format is evaluated using the current time minus this number of seconds.
|
17
17
|
- **partition** when this is true, partition input files and increase task count. (default: `true`)
|
18
|
+
- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
|
18
19
|
|
19
20
|
## Example
|
20
21
|
|
@@ -32,6 +33,7 @@ in:
|
|
32
33
|
input_path: /user/embulk/test/%Y-%m-%d/*
|
33
34
|
rewind_seconds: 86400
|
34
35
|
partition: true
|
36
|
+
num_partitions: 30
|
35
37
|
decoders:
|
36
38
|
- {type: gzip}
|
37
39
|
parser:
|
@@ -53,6 +55,8 @@ in:
|
|
53
55
|
```
|
54
56
|
|
55
57
|
## Note
|
58
|
+
- The parameter **num_partitions** is an approximate value. The actual number of partitions can be larger than this parameter.
|
59
|
+
- see: [The Partitioning Logic](#partition_logic)
|
56
60
|
- The partitioning feature supports only 3 line terminators.
|
57
61
|
- `\n`
|
58
62
|
- `\r`
|
@@ -61,6 +65,36 @@ in:
|
|
61
65
|
## The Reference Implementation
|
62
66
|
- [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
|
63
67
|
|
68
|
+
## <a id="partition_logic">The Partitioning Logic</a>
|
69
|
+
|
70
|
+
```
|
71
|
+
int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
|
72
|
+
|
73
|
+
/*
|
74
|
+
...
|
75
|
+
*/
|
76
|
+
|
77
|
+
int numPartitions;
|
78
|
+
if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
|
79
|
+
// if the file is compressed, skip partitioning.
|
80
|
+
numPartitions = 1;
|
81
|
+
}
|
82
|
+
else if (!task.getPartition()) {
|
83
|
+
// if no partition mode, skip partitioning.
|
84
|
+
numPartitions = 1;
|
85
|
+
}
|
86
|
+
else {
|
87
|
+
// equalize the file size per task as much as possible.
|
88
|
+
numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
|
89
|
+
}
|
90
|
+
|
91
|
+
/*
|
92
|
+
...
|
93
|
+
*/
|
94
|
+
|
95
|
+
```
|
96
|
+
|
97
|
+
|
64
98
|
## Build
|
65
99
|
|
66
100
|
```
|
data/build.gradle
CHANGED
@@ -55,10 +55,9 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
55
55
|
@ConfigDefault("true")
|
56
56
|
public boolean getPartition();
|
57
57
|
|
58
|
-
// this parameter is
|
59
|
-
@
|
60
|
-
|
61
|
-
public int getPartitonLevel();
|
58
|
+
@Config("num_partitions") // this parameter is the approximate value.
|
59
|
+
@ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
|
60
|
+
public int getApproximateNumPartitions();
|
62
61
|
|
63
62
|
public List<HdfsPartialFile> getFiles();
|
64
63
|
public void setFiles(List<HdfsPartialFile> hdfsFiles);
|
@@ -235,30 +234,30 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
235
234
|
}
|
236
235
|
|
237
236
|
// TODO: optimum allocation of resources
|
238
|
-
int
|
239
|
-
|
237
|
+
int approximateNumPartitions =
|
238
|
+
(task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
|
239
|
+
int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
|
240
240
|
|
241
241
|
List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
|
242
242
|
for (Path path : pathList) {
|
243
|
-
int
|
243
|
+
int fileLength = (int) fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
|
244
|
+
if (fileLength <= 0) {
|
245
|
+
logger.info("Skip the 0 byte target file: {}", path);
|
246
|
+
continue;
|
247
|
+
}
|
244
248
|
|
249
|
+
int numPartitions;
|
245
250
|
if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
|
246
|
-
|
251
|
+
numPartitions = 1;
|
247
252
|
}
|
248
253
|
else if (!task.getPartition()) {
|
249
|
-
|
254
|
+
numPartitions = 1;
|
250
255
|
}
|
251
256
|
else {
|
252
|
-
|
253
|
-
partitionCount = fileLength / partitionSizeByOneTask;
|
254
|
-
int remainder = fileLength % partitionSizeByOneTask;
|
255
|
-
|
256
|
-
if (remainder > 0) {
|
257
|
-
partitionCount++;
|
258
|
-
}
|
257
|
+
numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
|
259
258
|
}
|
260
259
|
|
261
|
-
HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path,
|
260
|
+
HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
|
262
261
|
hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
|
263
262
|
}
|
264
263
|
|
@@ -14,22 +14,22 @@ public class HdfsFilePartitioner
|
|
14
14
|
{
|
15
15
|
private FileSystem fs;
|
16
16
|
private Path path;
|
17
|
-
private int
|
17
|
+
private int numPartitions;
|
18
18
|
|
19
|
-
public HdfsFilePartitioner(FileSystem fs, Path path, int
|
19
|
+
public HdfsFilePartitioner(FileSystem fs, Path path, int numPartitions)
|
20
20
|
{
|
21
21
|
this.fs = fs;
|
22
22
|
this.path = path;
|
23
|
-
this.
|
23
|
+
this.numPartitions = numPartitions;
|
24
24
|
}
|
25
25
|
|
26
26
|
public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
|
27
27
|
{
|
28
28
|
List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
|
29
29
|
long size = fs.getFileStatus(path).getLen();
|
30
|
-
for (int i = 0; i <
|
31
|
-
long start = size * i /
|
32
|
-
long end = size * (i + 1) /
|
30
|
+
for (int i = 0; i < numPartitions; i++) {
|
31
|
+
long start = size * i / numPartitions;
|
32
|
+
long end = size * (i + 1) / numPartitions;
|
33
33
|
if (start < end) {
|
34
34
|
hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
|
35
35
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-hdfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- takahiro.nakayama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -82,7 +82,7 @@ files:
|
|
82
82
|
- classpath/curator-client-2.6.0.jar
|
83
83
|
- classpath/curator-framework-2.6.0.jar
|
84
84
|
- classpath/curator-recipes-2.6.0.jar
|
85
|
-
- classpath/embulk-input-hdfs-0.1.
|
85
|
+
- classpath/embulk-input-hdfs-0.1.1.jar
|
86
86
|
- classpath/gson-2.2.4.jar
|
87
87
|
- classpath/hadoop-annotations-2.6.0.jar
|
88
88
|
- classpath/hadoop-auth-2.6.0.jar
|