embulk-input-hdfs 0.1.0 → 0.1.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
-  data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
+  metadata.gz: b559162cba6af0dd036310522baf9559ac4ebcf3
+  data.tar.gz: 07f1fc7beb1205ba2baf4984c3495a3942514f68
 SHA512:
-  metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
-  data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
+  metadata.gz: 85a7d5b8ba72ed14787881251084edb5e0f59e6424c17377c2545fa9f0e4c95444f3b7b40a61c1b30636d228c6973fec517f4a10be0b6338b3f73b0c8524abd5
+  data.tar.gz: 898da79bf93d26349a4916d5118980921e90829f5b6745e53678d661fb87d632a0ab4e2b9974ca5bf713fae62a58b305d2ea153675de88c5c4a05c2780f3ea8a
data/README.md CHANGED
@@ -15,6 +15,7 @@ Read files on Hdfs.
 - **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
 - **partition** when this is true, partition input files and increase task count. (default: `true`)
+- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
 
 ## Example
 
@@ -32,6 +33,7 @@ in:
   input_path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
   partition: true
+  num_partitions: 30
   decoders:
   - {type: gzip}
   parser:
@@ -53,6 +55,8 @@ in:
 ```
 
 ## Note
+- The parameter **num_partitions** is an approximate value; the actual number of partitions can be larger than this parameter.
+  - see: [The Partitioning Logic](#partition_logic)
 - the feature of the partition supports only 3 line terminators.
   - `\n`
   - `\r`
@@ -61,6 +65,36 @@ in:
 ## The Reference Implementation
 - [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
 
+## <a id="partition_logic">The Partitioning Logic</a>
+
+```
+int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
+
+/*
+...
+*/
+
+int numPartitions;
+if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+    // if the file is compressed, skip partitioning.
+    numPartitions = 1;
+}
+else if (!task.getPartition()) {
+    // if no partition mode, skip partitioning.
+    numPartitions = 1;
+}
+else {
+    // equalize the file size per task as much as possible.
+    numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+}
+
+/*
+...
+*/
+
+```
+
+
 ## Build
 
 ```
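The expression `((fileLength - 1) / partitionSizeByOneTask) + 1` in the new README section is integer ceiling division, which is why the actual number of partitions can exceed the configured **num_partitions**: each file's count is rounded up independently. A minimal sketch of that arithmetic (the class name and sample sizes are illustrative, not part of the plugin):

```
public class CeilDivExample
{
    // Integer ceiling division without floating point: ceil(a / b) for a > 0, b > 0.
    static int ceilDiv(int a, int b)
    {
        return ((a - 1) / b) + 1;
    }

    public static void main(String[] args)
    {
        int partitionSizeByOneTask = 64 * 1024 * 1024; // assume a 64 MiB target per task
        System.out.println(ceilDiv(100 * 1024 * 1024, partitionSizeByOneTask)); // 2 (rounded up)
        System.out.println(ceilDiv(128 * 1024 * 1024, partitionSizeByOneTask)); // 2 (exact fit)
        System.out.println(ceilDiv(1, partitionSizeByOneTask));                 // 1 (never 0 for a non-empty file)
    }
}
```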
build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
     provided
 }
 
-version = "0.1.0"
+version = "0.1.1"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
HdfsFileInputPlugin.java CHANGED
@@ -55,10 +55,9 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         @ConfigDefault("true")
         public boolean getPartition();
 
-        // this parameter is experimental.
-        @Config("partition_level")
-        @ConfigDefault("3")
-        public int getPartitonLevel();
+        @Config("num_partitions") // this parameter is the approximate value.
+        @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
+        public int getApproximateNumPartitions();
 
         public List<HdfsPartialFile> getFiles();
         public void setFiles(List<HdfsPartialFile> hdfsFiles);
@@ -235,30 +234,30 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         }
 
         // TODO: optimum allocation of resources
-        int partitionCountParameter = task.getPartitonLevel();
-        int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+        int approximateNumPartitions =
+                (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
+        int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
 
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         for (Path path : pathList) {
-            int partitionCount;
+            int fileLength = (int) fs.getFileStatus(path).getLen(); // declare `fileLength` here because it is used below.
+            if (fileLength <= 0) {
+                logger.info("Skip the 0 byte target file: {}", path);
+                continue;
+            }
 
+            int numPartitions;
             if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else if (!task.getPartition()) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else {
-                int fileLength = (int) fs.getFileStatus(path).getLen();
-                partitionCount = fileLength / partitionSizeByOneTask;
-                int remainder = fileLength % partitionSizeByOneTask;
-
-                if (remainder > 0) {
-                    partitionCount++;
-                }
+                numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
             }
 
-            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
             hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
         }
 
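For reference, a simplified, runnable sketch of the 0.1.1 allocation logic above, with the HDFS `FileSystem` calls replaced by hard-coded lengths; `configuredNumPartitions` and the sample sizes are assumptions for illustration, not plugin code:

```
import java.util.Arrays;
import java.util.List;

public class PartitionPlanSketch
{
    public static void main(String[] args)
    {
        int configuredNumPartitions = -1; // the new default sentinel for num_partitions
        int approximateNumPartitions =
                (configuredNumPartitions <= 0) ? Runtime.getRuntime().availableProcessors() : configuredNumPartitions;

        // stand-ins for fs.getFileStatus(path).getLen() over the matched paths
        List<Integer> fileLengths = Arrays.asList(0, 10_000_000, 250_000_000);
        int totalFileLength = fileLengths.stream().mapToInt(Integer::intValue).sum();
        int partitionSizeByOneTask = totalFileLength / approximateNumPartitions; // assumed > 0 here

        for (int fileLength : fileLengths) {
            if (fileLength <= 0) {
                System.out.println("skip 0 byte file"); // 0.1.1 skips empty files instead of partitioning them
                continue;
            }
            // compressed files (.gz/.bz2/.lzo) and partition: false would force 1 here
            int numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
            System.out.println(fileLength + " bytes -> " + numPartitions + " partition(s)");
        }
    }
}
```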
HdfsFilePartitioner.java CHANGED
@@ -14,22 +14,22 @@ public class HdfsFilePartitioner
 {
     private FileSystem fs;
     private Path path;
-    private int partitionCount;
+    private int numPartitions;
 
-    public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+    public HdfsFilePartitioner(FileSystem fs, Path path, int numPartitions)
     {
         this.fs = fs;
         this.path = path;
-        this.partitionCount = partitionCount;
+        this.numPartitions = numPartitions;
     }
 
     public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
     {
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         long size = fs.getFileStatus(path).getLen();
-        for (int i = 0; i < partitionCount; i++) {
-            long start = size * i / partitionCount;
-            long end = size * (i + 1) / partitionCount;
+        for (int i = 0; i < numPartitions; i++) {
+            long start = size * i / numPartitions;
+            long end = size * (i + 1) / numPartitions;
             if (start < end) {
                 hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
             }
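The renamed partitioner slices each file by pure offset arithmetic: the ranges are contiguous, non-overlapping, and cover `[0, size)` exactly, and the `start < end` guard drops empty slices when `numPartitions` exceeds the file size. A minimal standalone check of that math (values are illustrative):

```
public class OffsetMathSketch
{
    public static void main(String[] args)
    {
        long size = 10;        // pretend file size in bytes
        int numPartitions = 4; // 10/4 is fractional, so slice sizes alternate between 2 and 3
        for (int i = 0; i < numPartitions; i++) {
            long start = size * i / numPartitions;
            long end = size * (i + 1) / numPartitions;
            if (start < end) { // guard drops empty slices when numPartitions > size
                System.out.println("[" + start + ", " + end + ")"); // [0, 2) [2, 5) [5, 7) [7, 10)
            }
        }
    }
}
```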
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-08 00:00:00.000000000 Z
+date: 2015-09-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -82,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.0.jar
+- classpath/embulk-input-hdfs-0.1.1.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar