embulk-input-hdfs 0.1.0 → 0.1.1

This diff compares the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
-  data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
+  metadata.gz: b559162cba6af0dd036310522baf9559ac4ebcf3
+  data.tar.gz: 07f1fc7beb1205ba2baf4984c3495a3942514f68
 SHA512:
-  metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
-  data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
+  metadata.gz: 85a7d5b8ba72ed14787881251084edb5e0f59e6424c17377c2545fa9f0e4c95444f3b7b40a61c1b30636d228c6973fec517f4a10be0b6338b3f73b0c8524abd5
+  data.tar.gz: 898da79bf93d26349a4916d5118980921e90829f5b6745e53678d661fb87d632a0ab4e2b9974ca5bf713fae62a58b305d2ea153675de88c5c4a05c2780f3ea8a
data/README.md CHANGED
@@ -15,6 +15,7 @@ Read files on Hdfs.
 - **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
 - **partition** when this is true, partition input files and increase task count. (default: `true`)
+- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
 
 ## Example
 
@@ -32,6 +33,7 @@ in:
   input_path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
   partition: true
+  num_partitions: 30
   decoders:
   - {type: gzip}
   parser:
@@ -53,6 +55,8 @@ in:
 ```
 
 ## Note
+- The parameter **num_partitions** is an approximate value; the actual number of partitions can be larger than this parameter.
+- see: [The Partitioning Logic](#partition_logic)
 - the feature of the partition supports only 3 line terminators.
   - `\n`
   - `\r`
@@ -61,6 +65,36 @@ in:
 ## The Reference Implementation
 - [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
 
+## <a id="partition_logic">The Partitioning Logic</a>
+
+```
+int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
+
+/*
+...
+*/
+
+int numPartitions;
+if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+    // if the file is compressed, skip partitioning.
+    numPartitions = 1;
+}
+else if (!task.getPartition()) {
+    // if no partition mode, skip partitioning.
+    numPartitions = 1;
+}
+else {
+    // equalize the file size per task as much as possible.
+    numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+}
+
+/*
+...
+*/
+
+```
+
+
 ## Build
 
 ```
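
The `num_partitions` setting added above is only a target: the plugin derives `partitionSizeByOneTask` from the total input size and then rounds each file's partition count up, so the per-file counts summed across all files typically exceed the requested value. A minimal standalone sketch of that arithmetic (the file sizes are hypothetical, and `PartitionCountSketch` is not a class in the gem):

```java
// Sketch only: demonstrates why the actual partition count can exceed
// num_partitions. Per-file counts use ceiling division, so each file
// contributes at least one partition and rounds upward.
public class PartitionCountSketch
{
    public static void main(String[] args)
    {
        int approximateNumPartitions = 30;      // num_partitions from the config
        int[] fileLengths = {1000, 350, 2650};  // hypothetical file sizes in bytes
        int totalFileLength = 4000;             // sum of the sizes above

        int partitionSizeByOneTask = totalFileLength / approximateNumPartitions; // 133

        int actualNumPartitions = 0;
        for (int fileLength : fileLengths) {
            // same ceiling division as the plugin: ((n - 1) / d) + 1
            actualNumPartitions += ((fileLength - 1) / partitionSizeByOneTask) + 1;
        }
        // 1000 -> 8, 350 -> 3, 2650 -> 20: prints 31, one more than requested
        System.out.println(actualNumPartitions);
    }
}
```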
build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
     provided
 }
 
-version = "0.1.0"
+version = "0.1.1"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
HdfsFileInputPlugin.java CHANGED
@@ -55,10 +55,9 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         @ConfigDefault("true")
         public boolean getPartition();
 
-        // this parameter is experimental.
-        @Config("partition_level")
-        @ConfigDefault("3")
-        public int getPartitonLevel();
+        @Config("num_partitions") // this parameter is the approximate value.
+        @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
+        public int getApproximateNumPartitions();
 
         public List<HdfsPartialFile> getFiles();
         public void setFiles(List<HdfsPartialFile> hdfsFiles);
@@ -235,30 +234,30 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         }
 
         // TODO: optimum allocation of resources
-        int partitionCountParameter = task.getPartitonLevel();
-        int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+        int approximateNumPartitions =
+                (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
+        int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
 
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         for (Path path : pathList) {
-            int partitionCount;
+            int fileLength = (int) fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
+            if (fileLength <= 0) {
+                logger.info("Skip the 0 byte target file: {}", path);
+                continue;
+            }
 
+            int numPartitions;
             if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else if (!task.getPartition()) {
-                partitionCount = 1;
+                numPartitions = 1;
             }
             else {
-                int fileLength = (int) fs.getFileStatus(path).getLen();
-                partitionCount = fileLength / partitionSizeByOneTask;
-                int remainder = fileLength % partitionSizeByOneTask;
-
-                if (remainder > 0) {
-                    partitionCount++;
-                }
+                numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
             }
 
-            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
             hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
         }
 
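The remainder-based computation removed above and the `((fileLength - 1) / partitionSizeByOneTask) + 1` expression that replaces it are the same ceiling division for every positive `fileLength`; they disagree only at zero, which the new guard now skips with a log message. A standalone sketch (not code from the gem) that checks the equivalence over a range of lengths:

```java
// Sketch only: verifies that the old remainder-based partition count and
// the new one-line ceiling division agree for all positive file lengths.
public class CeilingDivisionCheck
{
    static int oldCount(int fileLength, int partitionSizeByOneTask)
    {
        int count = fileLength / partitionSizeByOneTask;
        if (fileLength % partitionSizeByOneTask > 0) {
            count++; // round up when the file does not divide evenly
        }
        return count;
    }

    static int newCount(int fileLength, int partitionSizeByOneTask)
    {
        return ((fileLength - 1) / partitionSizeByOneTask) + 1;
    }

    public static void main(String[] args)
    {
        int partitionSizeByOneTask = 133; // hypothetical value
        for (int fileLength = 1; fileLength <= 1_000_000; fileLength++) {
            if (oldCount(fileLength, partitionSizeByOneTask) != newCount(fileLength, partitionSizeByOneTask)) {
                throw new AssertionError("mismatch at fileLength=" + fileLength);
            }
        }
        // At fileLength == 0 the formulas differ (0 vs 1), which is why the
        // plugin now skips zero-byte files before reaching this computation.
        System.out.println("formulas agree for all positive lengths");
    }
}
```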
HdfsFilePartitioner.java CHANGED
@@ -14,22 +14,22 @@ public class HdfsFilePartitioner
 {
     private FileSystem fs;
     private Path path;
-    private int partitionCount;
+    private int numPartitions;
 
-    public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+    public HdfsFilePartitioner(FileSystem fs, Path path, int numPartitions)
     {
         this.fs = fs;
         this.path = path;
-        this.partitionCount = partitionCount;
+        this.numPartitions = numPartitions;
     }
 
     public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
     {
         List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
         long size = fs.getFileStatus(path).getLen();
-        for (int i = 0; i < partitionCount; i++) {
-            long start = size * i / partitionCount;
-            long end = size * (i + 1) / partitionCount;
+        for (int i = 0; i < numPartitions; i++) {
+            long start = size * i / numPartitions;
+            long end = size * (i + 1) / numPartitions;
             if (start < end) {
                 hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
             }
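
Renaming aside, the splitting arithmetic is unchanged: `size * i / numPartitions` yields ranges that tile `[0, size)` with no gaps or overlaps and whose lengths differ by at most one byte, and the `start < end` check drops the empty ranges that occur when `numPartitions` exceeds the file size. A standalone sketch (not code from the gem) of the range computation:

```java
// Sketch only: prints the byte ranges the partitioner arithmetic produces.
public class RangeSketch
{
    public static void main(String[] args)
    {
        long size = 10;        // hypothetical file length in bytes
        int numPartitions = 3;
        for (int i = 0; i < numPartitions; i++) {
            long start = size * i / numPartitions;
            long end = size * (i + 1) / numPartitions;
            if (start < end) { // skip empty ranges when numPartitions > size
                System.out.println("[" + start + ", " + end + ")");
            }
        }
        // prints: [0, 3)  [3, 6)  [6, 10)
    }
}
```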
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-08 00:00:00.000000000 Z
+date: 2015-09-09 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -82,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.0.jar
+- classpath/embulk-input-hdfs-0.1.1.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar