embulk-input-hdfs 0.1.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa9425d56cb955c999bdfc8f307004f260065797
|
4
|
+
data.tar.gz: d11249a30d11ad70595d5961383f39d626290f4e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 46e6e225a7cc1acf6a1396ecdf72b8fcce1b0679196bfe1bfd18baec6015d602dbedbc22e3bda74f2ce916ddbce8043152ed5d93302850ca1a79cc54f07a1fb8
|
7
|
+
data.tar.gz: e49523b895c4d11e10e25295f4298d5f8c2113c1d7eefc48e3a8862fadcea7eaa51c1e677b26e5d24459198da66b58668184d6c4b14f5b86878cba6dbc1384fa
|
data/README.md
CHANGED
@@ -12,7 +12,7 @@ Read files on Hdfs.
|
|
12
12
|
|
13
13
|
- **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
|
14
14
|
- **config** overwrites configuration parameters (hash, default: `{}`)
|
15
|
-
- **
|
15
|
+
- **path** file path on HDFS. You can use globs and date-format specifiers like `%Y%m%d/%s`.
|
16
16
|
- **rewind_seconds** when you use a date format in the `path` property, the format is evaluated against the current time minus this number of seconds.
|
17
17
|
- **partition** when this is true, input files are partitioned so that the task count increases. (default: `true`)
|
18
18
|
- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
|
@@ -30,7 +30,7 @@ in:
|
|
30
30
|
dfs.replication: 1
|
31
31
|
fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
|
32
32
|
fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
|
33
|
-
|
33
|
+
path: /user/embulk/test/%Y-%m-%d/*
|
34
34
|
rewind_seconds: 86400
|
35
35
|
partition: true
|
36
36
|
num_partitions: 30
|
data/build.gradle
CHANGED
@@ -1,34 +1,34 @@
|
|
1
1
|
package org.embulk.input.hdfs;
|
2
2
|
|
3
|
-
import java.io.IOException;
|
4
|
-
import java.io.InputStream;
|
5
|
-
import java.util.List;
|
6
|
-
import java.util.ArrayList;
|
7
|
-
import java.util.Map;
|
8
|
-
|
9
3
|
import com.google.common.base.Function;
|
10
|
-
import com.google.common.base.Optional;
|
11
|
-
import com.google.common.collect.ImmutableList;
|
12
4
|
import com.google.common.collect.Lists;
|
13
5
|
import org.apache.hadoop.conf.Configuration;
|
14
6
|
import org.apache.hadoop.fs.FileStatus;
|
15
7
|
import org.apache.hadoop.fs.FileSystem;
|
16
8
|
import org.apache.hadoop.fs.Path;
|
17
|
-
import org.
|
9
|
+
import org.apache.hadoop.fs.PathNotFoundException;
|
18
10
|
import org.embulk.config.Config;
|
19
11
|
import org.embulk.config.ConfigDefault;
|
20
|
-
import org.embulk.config.ConfigInject;
|
21
12
|
import org.embulk.config.ConfigDiff;
|
13
|
+
import org.embulk.config.ConfigInject;
|
22
14
|
import org.embulk.config.ConfigSource;
|
23
15
|
import org.embulk.config.Task;
|
16
|
+
import org.embulk.config.TaskReport;
|
24
17
|
import org.embulk.config.TaskSource;
|
25
|
-
import org.embulk.spi
|
26
|
-
import org.embulk.spi.
|
18
|
+
import org.embulk.spi.BufferAllocator;
|
19
|
+
import org.embulk.spi.Exec;
|
20
|
+
import org.embulk.spi.FileInputPlugin;
|
21
|
+
import org.embulk.spi.TransactionalFileInput;
|
27
22
|
import org.embulk.spi.util.InputStreamTransactionalFileInput;
|
28
23
|
import org.jruby.embed.ScriptingContainer;
|
29
24
|
import org.slf4j.Logger;
|
30
25
|
|
31
26
|
import javax.annotation.Nullable;
|
27
|
+
import java.io.IOException;
|
28
|
+
import java.io.InputStream;
|
29
|
+
import java.util.ArrayList;
|
30
|
+
import java.util.List;
|
31
|
+
import java.util.Map;
|
32
32
|
|
33
33
|
public class HdfsFileInputPlugin implements FileInputPlugin
|
34
34
|
{
|
@@ -44,8 +44,8 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
44
44
|
@ConfigDefault("{}")
|
45
45
|
public Map<String, String> getConfig();
|
46
46
|
|
47
|
-
@Config("
|
48
|
-
public String
|
47
|
+
@Config("path")
|
48
|
+
public String getPath();
|
49
49
|
|
50
50
|
@Config("rewind_seconds")
|
51
51
|
@ConfigDefault("0")
|
@@ -72,9 +72,14 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
72
72
|
PluginTask task = config.loadConfig(PluginTask.class);
|
73
73
|
|
74
74
|
// listing Files
|
75
|
-
String pathString = strftime(task.
|
75
|
+
String pathString = strftime(task.getPath(), task.getRewindSeconds());
|
76
76
|
try {
|
77
77
|
List<String> originalFileList = buildFileList(getFs(task), pathString);
|
78
|
+
|
79
|
+
if (originalFileList.isEmpty()) {
|
80
|
+
throw new PathNotFoundException(pathString);
|
81
|
+
}
|
82
|
+
|
78
83
|
task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
|
79
84
|
logger.info("Loading target files: {}", originalFileList);
|
80
85
|
}
|
@@ -190,13 +195,17 @@ public class HdfsFileInputPlugin implements FileInputPlugin
|
|
190
195
|
throws IOException
|
191
196
|
{
|
192
197
|
List<String> fileList = new ArrayList<>();
|
193
|
-
|
198
|
+
Path rootPath = new Path(pathString);
|
199
|
+
|
200
|
+
for (FileStatus entry : fs.globStatus(rootPath)) {
|
194
201
|
if (entry.isDirectory()) {
|
195
202
|
fileList.addAll(lsr(fs, entry));
|
196
|
-
}
|
203
|
+
}
|
204
|
+
else {
|
197
205
|
fileList.add(entry.getPath().toString());
|
198
206
|
}
|
199
207
|
}
|
208
|
+
|
200
209
|
return fileList;
|
201
210
|
}
|
202
211
|
|
@@ -117,11 +117,12 @@ public class HdfsPartialFileInputStream extends InputStream
|
|
117
117
|
{
|
118
118
|
if (current >= start) {
|
119
119
|
return;
|
120
|
-
|
121
120
|
}
|
121
|
+
|
122
122
|
if (start == 0) {
|
123
123
|
current = 0;
|
124
|
-
}
|
124
|
+
}
|
125
|
+
else {
|
125
126
|
current = original.skip(--start);
|
126
127
|
if (current != start) {
|
127
128
|
throw new IOException("Cannot skip.");
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-hdfs
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- takahiro.nakayama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-10-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -82,7 +82,7 @@ files:
|
|
82
82
|
- classpath/curator-client-2.6.0.jar
|
83
83
|
- classpath/curator-framework-2.6.0.jar
|
84
84
|
- classpath/curator-recipes-2.6.0.jar
|
85
|
-
- classpath/embulk-input-hdfs-0.1.
|
85
|
+
- classpath/embulk-input-hdfs-0.1.4.jar
|
86
86
|
- classpath/gson-2.2.4.jar
|
87
87
|
- classpath/hadoop-annotations-2.6.0.jar
|
88
88
|
- classpath/hadoop-auth-2.6.0.jar
|