embulk-input-hdfs 0.1.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b559162cba6af0dd036310522baf9559ac4ebcf3
-  data.tar.gz: 07f1fc7beb1205ba2baf4984c3495a3942514f68
+  metadata.gz: aa9425d56cb955c999bdfc8f307004f260065797
+  data.tar.gz: d11249a30d11ad70595d5961383f39d626290f4e
 SHA512:
-  metadata.gz: 85a7d5b8ba72ed14787881251084edb5e0f59e6424c17377c2545fa9f0e4c95444f3b7b40a61c1b30636d228c6973fec517f4a10be0b6338b3f73b0c8524abd5
-  data.tar.gz: 898da79bf93d26349a4916d5118980921e90829f5b6745e53678d661fb87d632a0ab4e2b9974ca5bf713fae62a58b305d2ea153675de88c5c4a05c2780f3ea8a
+  metadata.gz: 46e6e225a7cc1acf6a1396ecdf72b8fcce1b0679196bfe1bfd18baec6015d602dbedbc22e3bda74f2ce916ddbce8043152ed5d93302850ca1a79cc54f07a1fb8
+  data.tar.gz: e49523b895c4d11e10e25295f4298d5f8c2113c1d7eefc48e3a8862fadcea7eaa51c1e677b26e5d24459198da66b58668184d6c4b14f5b86878cba6dbc1384fa
data/README.md CHANGED
@@ -12,7 +12,7 @@ Read files on Hdfs.
 
 - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **config** overwrites configuration parameters (hash, default: `{}`)
-- **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
+- **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
 - **partition** when this is true, partition input files and increase task count. (default: `true`)
 - **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
@@ -30,7 +30,7 @@ in:
     dfs.replication: 1
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
-  input_path: /user/embulk/test/%Y-%m-%d/*
+  path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
   partition: true
   num_partitions: 30
data/build.gradle CHANGED
@@ -12,7 +12,7 @@ configurations {
     provided
 }
 
-version = "0.1.1"
+version = "0.1.4"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java CHANGED
@@ -1,34 +1,34 @@
 package org.embulk.input.hdfs;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Map;
-
 import com.google.common.base.Function;
-import com.google.common.base.Optional;
-import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.embulk.config.TaskReport;
+import org.apache.hadoop.fs.PathNotFoundException;
 import org.embulk.config.Config;
 import org.embulk.config.ConfigDefault;
-import org.embulk.config.ConfigInject;
 import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigInject;
 import org.embulk.config.ConfigSource;
 import org.embulk.config.Task;
+import org.embulk.config.TaskReport;
 import org.embulk.config.TaskSource;
-import org.embulk.spi.*;
-import org.embulk.spi.util.InputStreamFileInput;
+import org.embulk.spi.BufferAllocator;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileInputPlugin;
+import org.embulk.spi.TransactionalFileInput;
 import org.embulk.spi.util.InputStreamTransactionalFileInput;
 import org.jruby.embed.ScriptingContainer;
 import org.slf4j.Logger;
 
 import javax.annotation.Nullable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
 
 public class HdfsFileInputPlugin implements FileInputPlugin
 {
@@ -44,8 +44,8 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         @ConfigDefault("{}")
         public Map<String, String> getConfig();
 
-        @Config("input_path")
-        public String getInputPath();
+        @Config("path")
+        public String getPath();
 
         @Config("rewind_seconds")
         @ConfigDefault("0")
@@ -72,9 +72,14 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         PluginTask task = config.loadConfig(PluginTask.class);
 
         // listing Files
-        String pathString = strftime(task.getInputPath(), task.getRewindSeconds());
+        String pathString = strftime(task.getPath(), task.getRewindSeconds());
         try {
             List<String> originalFileList = buildFileList(getFs(task), pathString);
+
+            if (originalFileList.isEmpty()) {
+                throw new PathNotFoundException(pathString);
+            }
+
             task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
             logger.info("Loading target files: {}", originalFileList);
         }
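The new `isEmpty()` guard above is the behavioral change in 0.1.4: a glob that matches nothing now fails the transaction with Hadoop's `PathNotFoundException` instead of proceeding with zero input tasks. A minimal standalone sketch of the same check, as an illustration only (the plugin performs it inside `transaction()` on the list returned by `buildFileList`):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathNotFoundException;

public class EmptyGlobGuard
{
    public static void main(String[] args) throws Exception
    {
        FileSystem fs = FileSystem.get(new Configuration());
        String pathString = "/user/embulk/test/2015-10-21/*"; // hypothetical path
        // globStatus returns null when the pattern's base path does not exist,
        // and an empty array when the glob itself matches no files.
        FileStatus[] entries = fs.globStatus(new Path(pathString));
        if (entries == null || entries.length == 0) {
            throw new PathNotFoundException(pathString); // fail fast, as in 0.1.4
        }
    }
}
```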
@@ -190,13 +195,17 @@ public class HdfsFileInputPlugin implements FileInputPlugin
             throws IOException
     {
         List<String> fileList = new ArrayList<>();
-        for (FileStatus entry : fs.globStatus(new Path(pathString))) {
+        Path rootPath = new Path(pathString);
+
+        for (FileStatus entry : fs.globStatus(rootPath)) {
             if (entry.isDirectory()) {
                 fileList.addAll(lsr(fs, entry));
-            } else {
+            }
+            else {
                 fileList.add(entry.getPath().toString());
             }
         }
+
         return fileList;
     }
 
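`buildFileList` delegates directories to `lsr(fs, entry)`, which is not part of this diff. A sketch of the recursive listing it presumably performs; the method name comes from the call site above, but the body here is an assumed shape, not the plugin's actual implementation:

```java
// Recursively collect the paths of all regular files under `status`
// (assumed shape of the plugin's lsr helper).
private List<String> lsr(FileSystem fs, FileStatus status) throws IOException
{
    List<String> fileList = new ArrayList<>();
    if (status.isDirectory()) {
        for (FileStatus child : fs.listStatus(status.getPath())) {
            fileList.addAll(lsr(fs, child));
        }
    }
    else {
        fileList.add(status.getPath().toString());
    }
    return fileList;
}
```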
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java CHANGED
@@ -117,11 +117,12 @@ public class HdfsPartialFileInputStream extends InputStream
     {
         if (current >= start) {
             return;
-
         }
+
         if (start == 0) {
             current = 0;
-        } else {
+        }
+        else {
             current = original.skip(--start);
             if (current != start) {
                 throw new IOException("Cannot skip.");
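This hunk only reflows the brace style of the seek logic used by the `partition` feature: each partial-file stream skips the underlying stream forward to its partition's start offset before reading. A minimal sketch of that skip-to-offset pattern, independent of the plugin's fields (`original`, `current`, and `start` are the plugin's; the helper below is hypothetical):

```java
import java.io.IOException;
import java.io.InputStream;

public class SkipToOffset
{
    // Skip `in` forward by `offset` bytes from its current position,
    // looping because InputStream.skip may skip fewer bytes than asked.
    static void seek(InputStream in, long offset) throws IOException
    {
        long remaining = offset;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped <= 0) {
                throw new IOException("Cannot skip.");
            }
            remaining -= skipped;
        }
    }
}
```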
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.4
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-09 00:00:00.000000000 Z
+date: 2015-10-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -82,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.1.jar
+- classpath/embulk-input-hdfs-0.1.4.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar