embulk-input-hdfs 0.1.1 → 0.1.4
checksums.yaml CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: aa9425d56cb955c999bdfc8f307004f260065797
+  data.tar.gz: d11249a30d11ad70595d5961383f39d626290f4e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 46e6e225a7cc1acf6a1396ecdf72b8fcce1b0679196bfe1bfd18baec6015d602dbedbc22e3bda74f2ce916ddbce8043152ed5d93302850ca1a79cc54f07a1fb8
+  data.tar.gz: e49523b895c4d11e10e25295f4298d5f8c2113c1d7eefc48e3a8862fadcea7eaa51c1e677b26e5d24459198da66b58668184d6c4b14f5b86878cba6dbc1384fa
```
data/README.md CHANGED

```diff
@@ -12,7 +12,7 @@ Read files on Hdfs.
 
 - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **config** overwrites configuration parameters (hash, default: `{}`)
-- **
+- **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
 - **partition** when this is true, partition input files and increase task count. (default: `true`)
 - **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
```

```diff
@@ -30,7 +30,7 @@ in:
     dfs.replication: 1
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
-
+  path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
   partition: true
   num_partitions: 30
```
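The `rewind_seconds` option shifts the reference time used when expanding the Date format in `path`: with `rewind_seconds: 86400`, `%Y-%m-%d` resolves to yesterday's date rather than today's. Below is a minimal sketch of that expansion using embedded JRuby, which the plugin already depends on; the class and helper names are illustrative, not the plugin's actual code:

```java
import org.jruby.embed.ScriptingContainer;

public class StrftimeSketch
{
    // Illustrative sketch: expand strftime placeholders in a path against
    // (Time.now - rewindSeconds) using embedded JRuby's Time#strftime.
    static String expandPath(String format, long rewindSeconds)
    {
        ScriptingContainer jruby = new ScriptingContainer();
        String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
        return jruby.runScriptlet(script).toString();
    }

    public static void main(String[] args)
    {
        // With rewind_seconds: 86400, %Y-%m-%d expands to yesterday's date.
        System.out.println(expandPath("/user/embulk/test/%Y-%m-%d/*", 86400));
    }
}
```

Glob characters such as `*` are passed through strftime unchanged and are resolved later against HDFS.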
data/build.gradle CHANGED

```diff
@@ -1,34 +1,34 @@
 package org.embulk.input.hdfs;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Map;
-
 import com.google.common.base.Function;
-import com.google.common.base.Optional;
-import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.
+import org.apache.hadoop.fs.PathNotFoundException;
 import org.embulk.config.Config;
 import org.embulk.config.ConfigDefault;
-import org.embulk.config.ConfigInject;
 import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigInject;
 import org.embulk.config.ConfigSource;
 import org.embulk.config.Task;
+import org.embulk.config.TaskReport;
 import org.embulk.config.TaskSource;
-import org.embulk.spi
-import org.embulk.spi.
+import org.embulk.spi.BufferAllocator;
+import org.embulk.spi.Exec;
+import org.embulk.spi.FileInputPlugin;
+import org.embulk.spi.TransactionalFileInput;
 import org.embulk.spi.util.InputStreamTransactionalFileInput;
 import org.jruby.embed.ScriptingContainer;
 import org.slf4j.Logger;
 
 import javax.annotation.Nullable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
 
 public class HdfsFileInputPlugin implements FileInputPlugin
 {
```
```diff
@@ -44,8 +44,8 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         @ConfigDefault("{}")
         public Map<String, String> getConfig();
 
-        @Config("
-        public String
+        @Config("path")
+        public String getPath();
 
         @Config("rewind_seconds")
         @ConfigDefault("0")
```
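For readers unfamiliar with Embulk's config binding: options such as `path` and `rewind_seconds` are declared as annotated getters on a `Task` interface, and Embulk materializes an implementation of that interface from the YAML config. A minimal sketch of such an interface follows; the interface name is illustrative, and the option names simply mirror the README above:

```java
import org.embulk.config.Config;
import org.embulk.config.ConfigDefault;
import org.embulk.config.Task;

import java.util.List;
import java.util.Map;

// Illustrative sketch of an Embulk task interface: each getter is bound to one
// YAML config key; getters without @ConfigDefault are required options.
public interface ExampleHdfsTask extends Task
{
    @Config("config_files")
    @ConfigDefault("[]")
    List<String> getConfigFiles();

    @Config("config")
    @ConfigDefault("{}")
    Map<String, String> getConfig();

    @Config("path")
    String getPath();

    @Config("rewind_seconds")
    @ConfigDefault("0")
    int getRewindSeconds();

    @Config("partition")
    @ConfigDefault("true")
    boolean getPartition();
}
```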
```diff
@@ -72,9 +72,14 @@ public class HdfsFileInputPlugin implements FileInputPlugin
         PluginTask task = config.loadConfig(PluginTask.class);
 
         // listing Files
-        String pathString = strftime(task.
+        String pathString = strftime(task.getPath(), task.getRewindSeconds());
         try {
             List<String> originalFileList = buildFileList(getFs(task), pathString);
+
+            if (originalFileList.isEmpty()) {
+                throw new PathNotFoundException(pathString);
+            }
+
             task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
             logger.info("Loading target files: {}", originalFileList);
         }
```
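The added `isEmpty()` check makes a non-matching path fail the transaction up front instead of proceeding with zero input tasks. A standalone sketch of the same fail-fast pattern against the Hadoop FileSystem API is shown below; the setup and method names are illustrative, and the plugin's own listing additionally recurses into directories:

```java
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathNotFoundException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class EmptyGlobGuardSketch
{
    // Illustrative only: glob the expanded path and fail fast when nothing
    // matches, in the spirit of the 0.1.4 transaction code above.
    static List<String> listOrFail(FileSystem fs, String pathString) throws IOException
    {
        FileStatus[] entries = fs.globStatus(new Path(pathString));
        List<String> files = new ArrayList<>();
        if (entries != null) {
            for (FileStatus entry : entries) {
                files.add(entry.getPath().toString());
            }
        }
        if (files.isEmpty()) {
            throw new PathNotFoundException(pathString);
        }
        return files;
    }
}
```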
```diff
@@ -190,13 +195,17 @@ public class HdfsFileInputPlugin implements FileInputPlugin
             throws IOException
     {
         List<String> fileList = new ArrayList<>();
-
+        Path rootPath = new Path(pathString);
+
+        for (FileStatus entry : fs.globStatus(rootPath)) {
             if (entry.isDirectory()) {
                 fileList.addAll(lsr(fs, entry));
-            }
+            }
+            else {
                 fileList.add(entry.getPath().toString());
             }
         }
+
         return fileList;
     }
 
```
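The `lsr(fs, entry)` call above recursively collects the files beneath a directory entry. A self-contained sketch of that depth-first walk with `FileSystem.listStatus` (class and method names are illustrative, not the plugin's exact code):

```java
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class RecursiveListingSketch
{
    // Illustrative only: depth-first listing of every file under a directory,
    // mirroring the role of the plugin's lsr(fs, entry) helper.
    static List<String> listRecursively(FileSystem fs, FileStatus dir) throws IOException
    {
        List<String> files = new ArrayList<>();
        for (FileStatus entry : fs.listStatus(dir.getPath())) {
            if (entry.isDirectory()) {
                files.addAll(listRecursively(fs, entry));
            }
            else {
                files.add(entry.getPath().toString());
            }
        }
        return files;
    }
}
```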
```diff
@@ -117,11 +117,12 @@ public class HdfsPartialFileInputStream extends InputStream
     {
         if (current >= start) {
             return;
-
         }
+
         if (start == 0) {
             current = 0;
-        }
+        }
+        else {
             current = original.skip(--start);
             if (current != start) {
                 throw new IOException("Cannot skip.");
```
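The hunk above is a brace-style change, but the surrounding logic is worth spelling out: a partial-file stream must first position itself at its start offset, and `InputStream.skip` is allowed to skip fewer bytes than requested, which is why the plugin checks the returned count. A defensive sketch of skipping to an offset (standalone, not the plugin's code):

```java
import java.io.IOException;
import java.io.InputStream;

public class SkipToOffsetSketch
{
    // Illustrative only: advance a stream by `start` bytes, looping because
    // InputStream.skip may skip fewer bytes than asked for.
    static void skipFully(InputStream in, long start) throws IOException
    {
        long remaining = start;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped <= 0) {
                // skip() made no progress; fall back to reading one byte.
                if (in.read() == -1) {
                    throw new IOException("Unexpected end of stream while skipping.");
                }
                skipped = 1;
            }
            remaining -= skipped;
        }
    }
}
```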
metadata CHANGED

```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.4
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-
+date: 2015-10-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
```

```diff
@@ -82,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.1.1.jar
+- classpath/embulk-input-hdfs-0.1.4.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar
```