embulk-input-hdfs 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b559162cba6af0dd036310522baf9559ac4ebcf3
|
4
|
+
data.tar.gz: 07f1fc7beb1205ba2baf4984c3495a3942514f68
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 85a7d5b8ba72ed14787881251084edb5e0f59e6424c17377c2545fa9f0e4c95444f3b7b40a61c1b30636d228c6973fec517f4a10be0b6338b3f73b0c8524abd5
|
7
|
+
data.tar.gz: 898da79bf93d26349a4916d5118980921e90829f5b6745e53678d661fb87d632a0ab4e2b9974ca5bf713fae62a58b305d2ea153675de88c5c4a05c2780f3ea8a
|
data/README.md
CHANGED
@@ -15,6 +15,7 @@ Read files on Hdfs.
|
|
15
15
|
- **input_path** file path on HDFS. You can use glob patterns and date formats like `%Y%m%d/%s`.
|
16
16
|
- **rewind_seconds** When a date format is used in the input_path property, the format is evaluated against the current time minus this number of seconds.
|
17
17
|
- **partition** when this is true, partition input files and increase task count. (default: `true`)
|
18
|
+
- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
|
18
19
|
|
19
20
|
## Example
|
20
21
|
|
@@ -32,6 +33,7 @@ in:
|
|
32
33
|
input_path: /user/embulk/test/%Y-%m-%d/*
|
33
34
|
rewind_seconds: 86400
|
34
35
|
partition: true
|
36
|
+
num_partitions: 30
|
35
37
|
decoders:
|
36
38
|
- {type: gzip}
|
37
39
|
parser:
|
@@ -53,6 +55,8 @@ in:
|
|
53
55
|
```
|
54
56
|
|
55
57
|
## Note
|
58
|
+
- The **num_partitions** parameter is an approximate value. The actual number of partitions may be larger than this parameter.
|
59
|
+
- see: [The Partitioning Logic](#partition_logic)
|
56
60
|
- The partitioning feature supports only 3 line terminators.
|
57
61
|
- `\n`
|
58
62
|
- `\r`
|
@@ -61,6 +65,36 @@ in:
|
|
61
65
|
## The Reference Implementation
|
62
66
|
- [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
|
63
67
|
|
68
|
+
## <a id="partition_logic">The Partitioning Logic</a>
|
69
|
+
|
70
|
+
```
|
71
|
+
int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
|
72
|
+
|
73
|
+
/*
|
74
|
+
...
|
75
|
+
*/
|
76
|
+
|
77
|
+
int numPartitions;
|
78
|
+
if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
|
79
|
+
// if the file is compressed, skip partitioning.
|
80
|
+
numPartitions = 1;
|
81
|
+
}
|
82
|
+
else if (!task.getPartition()) {
|
83
|
+
// if no partition mode, skip partitioning.
|
84
|
+
numPartitions = 1;
|
85
|
+
}
|
86
|
+
else {
|
87
|
+
// equalize the file size per task as much as possible.
|
88
|
+
numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
|
89
|
+
}
|
90
|
+
|
91
|
+
/*
|
92
|
+
...
|
93
|
+
*/
|
94
|
+
|
95
|
+
```
|
96
|
+
|
97
|
+
|
64
98
|
## Build
|
65
99
|
|
66
100
|
```
|
data/build.gradle
CHANGED
@@ -55,10 +55,9 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
55
55
|
@ConfigDefault("true")
|
56
56
|
public boolean getPartition();
|
57
57
|
|
58
|
-
// this parameter is
|
59
|
-
@
|
60
|
-
|
61
|
-
public int getPartitonLevel();
|
58
|
+
@Config("num_partitions") // this parameter is the approximate value.
|
59
|
+
@ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
|
60
|
+
public int getApproximateNumPartitions();
|
62
61
|
|
63
62
|
public List<HdfsPartialFile> getFiles();
|
64
63
|
public void setFiles(List<HdfsPartialFile> hdfsFiles);
|
@@ -235,30 +234,30 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
235
234
|
}
|
236
235
|
|
237
236
|
// TODO: optimum allocation of resources
|
238
|
-
int
|
239
|
-
|
237
|
+
int approximateNumPartitions =
|
238
|
+
(task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
|
239
|
+
int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
|
240
240
|
|
241
241
|
List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
|
242
242
|
for (Path path : pathList) {
|
243
|
-
int
|
243
|
+
int fileLength = (int) fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
|
244
|
+
if (fileLength <= 0) {
|
245
|
+
logger.info("Skip the 0 byte target file: {}", path);
|
246
|
+
continue;
|
247
|
+
}
|
244
248
|
|
249
|
+
int numPartitions;
|
245
250
|
if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
|
246
|
-
|
251
|
+
numPartitions = 1;
|
247
252
|
}
|
248
253
|
else if (!task.getPartition()) {
|
249
|
-
|
254
|
+
numPartitions = 1;
|
250
255
|
}
|
251
256
|
else {
|
252
|
-
|
253
|
-
partitionCount = fileLength / partitionSizeByOneTask;
|
254
|
-
int remainder = fileLength % partitionSizeByOneTask;
|
255
|
-
|
256
|
-
if (remainder > 0) {
|
257
|
-
partitionCount++;
|
258
|
-
}
|
257
|
+
numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
|
259
258
|
}
|
260
259
|
|
261
|
-
HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path,
|
260
|
+
HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
|
262
261
|
hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
|
263
262
|
}
|
264
263
|
|
@@ -14,22 +14,22 @@ public class HdfsFilePartitioner
|
|
14
14
|
{
|
15
15
|
private FileSystem fs;
|
16
16
|
private Path path;
|
17
|
-
private int
|
17
|
+
private int numPartitions;
|
18
18
|
|
19
|
-
public HdfsFilePartitioner(FileSystem fs, Path path, int
|
19
|
+
public HdfsFilePartitioner(FileSystem fs, Path path, int numPartitions)
|
20
20
|
{
|
21
21
|
this.fs = fs;
|
22
22
|
this.path = path;
|
23
|
-
this.
|
23
|
+
this.numPartitions = numPartitions;
|
24
24
|
}
|
25
25
|
|
26
26
|
public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
|
27
27
|
{
|
28
28
|
List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
|
29
29
|
long size = fs.getFileStatus(path).getLen();
|
30
|
-
for (int i = 0; i <
|
31
|
-
long start = size * i /
|
32
|
-
long end = size * (i + 1) /
|
30
|
+
for (int i = 0; i < numPartitions; i++) {
|
31
|
+
long start = size * i / numPartitions;
|
32
|
+
long end = size * (i + 1) / numPartitions;
|
33
33
|
if (start < end) {
|
34
34
|
hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
|
35
35
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-hdfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- takahiro.nakayama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-09-
|
11
|
+
date: 2015-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -82,7 +82,7 @@ files:
|
|
82
82
|
- classpath/curator-client-2.6.0.jar
|
83
83
|
- classpath/curator-framework-2.6.0.jar
|
84
84
|
- classpath/curator-recipes-2.6.0.jar
|
85
|
-
- classpath/embulk-input-hdfs-0.1.
|
85
|
+
- classpath/embulk-input-hdfs-0.1.1.jar
|
86
86
|
- classpath/gson-2.2.4.jar
|
87
87
|
- classpath/hadoop-annotations-2.6.0.jar
|
88
88
|
- classpath/hadoop-auth-2.6.0.jar
|