embulk-input-hdfs 0.0.3 → 0.1.0
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +12 -1
- data/build.gradle +2 -2
- data/classpath/embulk-input-hdfs-0.1.0.jar +0 -0
- data/lib/embulk/input/hdfs.rb +1 -1
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +267 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +39 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java +40 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java +154 -0
- data/src/test/java/org/embulk/input/{TestHdfsFileInputPlugin.java → hdfs/TestHdfsFileInputPlugin.java} +1 -1
- metadata +8 -5
- data/classpath/embulk-input-hdfs-0.0.3.jar +0 -0
- data/src/main/java/org/embulk/input/HdfsFileInputPlugin.java +0 -231
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
+  data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
+  data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
data/README.md
CHANGED
@@ -14,6 +14,7 @@ Read files on Hdfs.
 - **config** overwrites configuration parameters (hash, default: `{}`)
 - **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
+- **partition** when this is true, partition input files and increase task count. (default: `true`)
 
 ## Example
 
@@ -24,12 +25,13 @@ in:
     - /opt/analytics/etc/hadoop/conf/core-site.xml
     - /opt/analytics/etc/hadoop/conf/hdfs-site.xml
   config:
-    fs.defaultFS: 'hdfs://
+    fs.defaultFS: 'hdfs://hadoop-nn1:8020'
     dfs.replication: 1
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
   input_path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
+  partition: true
   decoders:
   - {type: gzip}
   parser:
@@ -50,6 +52,15 @@ in:
     - {name: c3, type: long}
 ```
 
+## Note
+- the feature of the partition supports only 3 line terminators.
+  - `\n`
+  - `\r`
+  - `\r\n`
+
+## The Reference Implementation
+- [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
+
 ## Build
 
 ```
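For context on the README options above: the plugin resolves `input_path` by evaluating `(Time.now - rewind_seconds).strftime(input_path)` through JRuby (see the `strftime` method in HdfsFileInputPlugin.java further down). The sketch below is an editorial illustration of that resolution in plain Java, not plugin code; the class name `PathResolutionSketch` and the `%Y`/`%m`/`%d` token mapping are assumptions made for the example.

```java
// Illustration only (not part of the plugin): shows what the JRuby-based
// strftime resolution of input_path amounts to for the common %Y/%m/%d tokens.
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;

public class PathResolutionSketch
{
    public static String resolve(String inputPath, int rewindSeconds)
    {
        // "Now minus rewind_seconds", then substitute the date tokens.
        ZonedDateTime t = ZonedDateTime.now().minusSeconds(rewindSeconds);
        return inputPath
                .replace("%Y", t.format(DateTimeFormatter.ofPattern("yyyy")))
                .replace("%m", t.format(DateTimeFormatter.ofPattern("MM")))
                .replace("%d", t.format(DateTimeFormatter.ofPattern("dd")));
    }

    public static void main(String[] args)
    {
        // With rewind_seconds: 86400 this resolves to yesterday's directory,
        // e.g. /user/embulk/test/2015-09-07/*
        System.out.println(resolve("/user/embulk/test/%Y-%m-%d/*", 86400));
    }
}
```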
data/build.gradle
CHANGED
@@ -12,7 +12,7 @@ configurations {
     provided
 }
 
-version = "0.0
+version = "0.1.0"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
@@ -22,7 +22,7 @@ dependencies {
     provided "org.embulk:embulk-core:0.7.0"
     // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
     compile 'org.apache.hadoop:hadoop-client:2.6.0'
-    compile 'com.google.guava:guava:
+    compile 'com.google.guava:guava:15.0'
     testCompile "junit:junit:4.+"
 }
 
data/classpath/embulk-input-hdfs-0.1.0.jar
ADDED
Binary file
data/lib/embulk/input/hdfs.rb
CHANGED

data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
ADDED
@@ -0,0 +1,267 @@
+package org.embulk.input.hdfs;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+
+import com.google.common.base.Function;
+import com.google.common.base.Optional;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.embulk.config.TaskReport;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigInject;
+import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.*;
+import org.embulk.spi.util.InputStreamFileInput;
+import org.embulk.spi.util.InputStreamTransactionalFileInput;
+import org.jruby.embed.ScriptingContainer;
+import org.slf4j.Logger;
+
+import javax.annotation.Nullable;
+
+public class HdfsFileInputPlugin implements FileInputPlugin
+{
+    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+
+    public interface PluginTask extends Task
+    {
+        @Config("config_files")
+        @ConfigDefault("[]")
+        public List<String> getConfigFiles();
+
+        @Config("config")
+        @ConfigDefault("{}")
+        public Map<String, String> getConfig();
+
+        @Config("input_path")
+        public String getInputPath();
+
+        @Config("rewind_seconds")
+        @ConfigDefault("0")
+        public int getRewindSeconds();
+
+        @Config("partition")
+        @ConfigDefault("true")
+        public boolean getPartition();
+
+        // this parameter is experimental.
+        @Config("partition_level")
+        @ConfigDefault("3")
+        public int getPartitonLevel();
+
+        public List<HdfsPartialFile> getFiles();
+        public void setFiles(List<HdfsPartialFile> hdfsFiles);
+
+        @ConfigInject
+        public BufferAllocator getBufferAllocator();
+    }
+
+    @Override
+    public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
+    {
+        PluginTask task = config.loadConfig(PluginTask.class);
+
+        // listing Files
+        String pathString = strftime(task.getInputPath(), task.getRewindSeconds());
+        try {
+            List<String> originalFileList = buildFileList(getFs(task), pathString);
+            task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
+            logger.info("Loading target files: {}", originalFileList);
+        }
+        catch (IOException e) {
+            logger.error(e.getMessage());
+            throw new RuntimeException(e);
+        }
+
+        // log the detail of partial files.
+        for (HdfsPartialFile partialFile : task.getFiles()) {
+            logger.info("target file: {}, start: {}, end: {}",
+                    partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
+        }
+
+        // number of processors is same with number of targets
+        int taskCount = task.getFiles().size();
+        logger.info("task size: {}", taskCount);
+
+        return resume(task.dump(), taskCount, control);
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+                             int taskCount,
+                             FileInputPlugin.Control control)
+    {
+        control.run(taskSource, taskCount);
+
+        ConfigDiff configDiff = Exec.newConfigDiff();
+
+        // usually, yo use last_path
+        //if (task.getFiles().isEmpty()) {
+        //    if (task.getLastPath().isPresent()) {
+        //        configDiff.set("last_path", task.getLastPath().get());
+        //    }
+        //} else {
+        //    List<String> files = new ArrayList<String>(task.getFiles());
+        //    Collections.sort(files);
+        //    configDiff.set("last_path", files.get(files.size() - 1));
+        //}
+
+        return configDiff;
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+                        int taskCount,
+                        List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+    {
+        final PluginTask task = taskSource.loadTask(PluginTask.class);
+
+        InputStream input;
+        try {
+            input = openInputStream(task, task.getFiles().get(taskIndex));
+        }
+        catch (IOException e) {
+            logger.error(e.getMessage());
+            throw new RuntimeException(e);
+        }
+
+        return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input) {
+            @Override
+            public void abort()
+            { }
+
+            @Override
+            public TaskReport commit()
+            {
+                return Exec.newTaskReport();
+            }
+        };
+    }
+
+    private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
+            throws IOException
+    {
+        FileSystem fs = getFs(task);
+        InputStream original = fs.open(new Path(partialFile.getPath()));
+        return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
+    }
+
+    private static FileSystem getFs(final PluginTask task)
+            throws IOException
+    {
+        Configuration configuration = new Configuration();
+
+        for (Object configFile : task.getConfigFiles()) {
+            configuration.addResource(configFile.toString());
+        }
+        configuration.reloadConfiguration();
+
+        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
+            configuration.set(entry.getKey(), entry.getValue());
+        }
+
+        return FileSystem.get(configuration);
+    }
+
+    private String strftime(final String raw, final int rewind_seconds)
+    {
+        ScriptingContainer jruby = new ScriptingContainer();
+        Object resolved = jruby.runScriptlet(
+                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
+        return resolved.toString();
+    }
+
+    private List<String> buildFileList(final FileSystem fs, final String pathString)
+            throws IOException
+    {
+        List<String> fileList = new ArrayList<>();
+        for (FileStatus entry : fs.globStatus(new Path(pathString))) {
+            if (entry.isDirectory()) {
+                fileList.addAll(lsr(fs, entry));
+            } else {
+                fileList.add(entry.getPath().toString());
+            }
+        }
+        return fileList;
+    }
+
+    private List<String> lsr(final FileSystem fs, FileStatus status)
+            throws IOException
+    {
+        List<String> fileList = new ArrayList<>();
+        if (status.isDirectory()) {
+            for (FileStatus entry : fs.listStatus(status.getPath())) {
+                fileList.addAll(lsr(fs, entry));
+            }
+        }
+        else {
+            fileList.add(status.getPath().toString());
+        }
+        return fileList;
+    }
+
+    private List<HdfsPartialFile> allocateHdfsFilesToTasks(final PluginTask task, final FileSystem fs, final List<String> fileList)
+            throws IOException
+    {
+        List<Path> pathList = Lists.transform(fileList, new Function<String, Path>()
+        {
+            @Nullable
+            @Override
+            public Path apply(@Nullable String input)
+            {
+                return new Path(input);
+            }
+        });
+
+        int totalFileLength = 0;
+        for (Path path : pathList) {
+            totalFileLength += fs.getFileStatus(path).getLen();
+        }
+
+        // TODO: optimum allocation of resources
+        int partitionCountParameter = task.getPartitonLevel();
+        int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+
+        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+        for (Path path : pathList) {
+            int partitionCount;
+
+            if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+                partitionCount = 1;
+            }
+            else if (!task.getPartition()) {
+                partitionCount = 1;
+            }
+            else {
+                int fileLength = (int) fs.getFileStatus(path).getLen();
+                partitionCount = fileLength / partitionSizeByOneTask;
+                int remainder = fileLength % partitionSizeByOneTask;
+
+                if (remainder > 0) {
+                    partitionCount++;
+                }
+            }
+
+            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+            hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
+        }
+
+        return hdfsPartialFiles;
+    }
+}
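The `allocateHdfsFilesToTasks` method added above chooses how many partitions each file gets: the target partition size is the total input size divided by `availableProcessors() * partition_level`, compressed files (`.gz`, `.bz2`, `.lzo`) are never split, and `partition: false` forces one partition per file. The standalone sketch below reproduces just that arithmetic; the class and method names are illustrative, not part of the plugin.

```java
// Editorial sketch of the partition-count arithmetic used by
// allocateHdfsFilesToTasks above; names here are illustrative only.
public class PartitionCountSketch
{
    static int partitionCount(long fileLength, String path, boolean partitionEnabled,
                              long partitionSizeByOneTask)
    {
        // Compressed files cannot be split at arbitrary byte offsets, and
        // partition: false disables splitting entirely; both cases get one task.
        if (path.endsWith(".gz") || path.endsWith(".bz2") || path.endsWith(".lzo")
                || !partitionEnabled) {
            return 1;
        }
        long count = fileLength / partitionSizeByOneTask;
        if (fileLength % partitionSizeByOneTask > 0) {
            count++;    // round up so the tail of the file gets its own partition
        }
        return (int) count;
    }

    public static void main(String[] args)
    {
        long totalFileLength = 1_000_000_000L;   // sum of all target file sizes (example value)
        int cores = Runtime.getRuntime().availableProcessors();
        int partitionLevel = 3;                  // the experimental partition_level default
        long partitionSizeByOneTask = totalFileLength / (cores * partitionLevel);

        System.out.println(partitionCount(300_000_000L, "/logs/a.tsv", true, partitionSizeByOneTask));
        System.out.println(partitionCount(300_000_000L, "/logs/a.tsv.gz", true, partitionSizeByOneTask));
    }
}
```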
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
ADDED
@@ -0,0 +1,39 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+public class HdfsFilePartitioner
+{
+    private FileSystem fs;
+    private Path path;
+    private int partitionCount;
+
+    public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+    {
+        this.fs = fs;
+        this.path = path;
+        this.partitionCount = partitionCount;
+    }
+
+    public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
+    {
+        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+        long size = fs.getFileStatus(path).getLen();
+        for (int i = 0; i < partitionCount; i++) {
+            long start = size * i / partitionCount;
+            long end = size * (i + 1) / partitionCount;
+            if (start < end) {
+                hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
+            }
+        }
+        return hdfsPartialFiles;
+    }
+}
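HdfsFilePartitioner above turns a file length and a partition count into contiguous, non-overlapping byte ranges via integer arithmetic. The following minimal sketch shows only that boundary math, with no HDFS involved; the class name is hypothetical and used for illustration.

```java
// Boundary math only (no HDFS): the same start/end computation as
// HdfsFilePartitioner.getHdfsPartialFiles() above, printed for a small example.
public class PartitionBoundarySketch
{
    public static void main(String[] args)
    {
        long size = 1000;          // pretend file length in bytes
        int partitionCount = 3;
        for (int i = 0; i < partitionCount; i++) {
            long start = size * i / partitionCount;
            long end = size * (i + 1) / partitionCount;
            // Ranges are contiguous and non-overlapping: [0,333), [333,666), [666,1000)
            System.out.println("partition " + i + ": [" + start + ", " + end + ")");
        }
    }
}
```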
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
ADDED
@@ -0,0 +1,40 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFile.java
+public class HdfsPartialFile
+{
+    private String path;
+    private long start;
+    private long end;
+
+    public HdfsPartialFile(String path, long start, long end)
+    {
+        this.path = path;
+        this.start = start;
+        this.end = end;
+    }
+
+    // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
+    public HdfsPartialFile() { }
+
+    public String getPath()
+    {
+        return path;
+    }
+
+    public long getStart()
+    {
+        return start;
+    }
+
+    public long getEnd()
+    {
+        return end;
+    }
+
+}
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
ADDED
@@ -0,0 +1,154 @@
+package org.embulk.input.hdfs;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+public class HdfsPartialFileInputStream extends InputStream
+{
+    private final PushbackInputStream original;
+    private long start;
+    private long end;
+    private long current;
+    private boolean eof;
+
+    public HdfsPartialFileInputStream(InputStream original, long start, long end)
+    {
+        this.original = new PushbackInputStream(new BufferedInputStream(original));
+        this.start = start;
+        this.end = end;
+        current = -1;
+    }
+
+    @Override
+    public int read(byte[] b) throws IOException
+    {
+        return read(b, 0, b.length);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException
+    {
+        initializeIfNeeded();
+
+        if (eof) {
+            return -1;
+        }
+
+        int read = original.read(b, off, len);
+        if (read < 0) {
+            eof = true;
+            return -1;
+        }
+
+        current += read;
+        if (current >= end) {
+            for (int i = Math.max((int)(end - 1 - current + read), 0); i < read; i++) {
+                if (b[off + i] == '\n') {
+                    eof = true;
+                    return i + 1;
+                }
+
+                if (b[off + i] == '\r') {
+                    int next = (i < read ? b[off + i + 1] : prefetch());
+                    if (next != '\n') {
+                        eof = true;
+                        return i + 1;
+                    }
+                }
+            }
+        }
+
+        return read;
+    }
+
+    @Override
+    public int read() throws IOException
+    {
+        initializeIfNeeded();
+
+        if (eof) {
+            return -1;
+        }
+
+        int read = original.read();
+        current++;
+
+        if (read < 0) {
+            eof = true;
+            return -1;
+        }
+
+        if (current >= end) {
+            if (read == '\n' || read == '\r' && prefetch() != '\n') {
+                eof = true;
+            }
+        }
+
+        return read;
+    }
+
+    @Override
+    public long skip(long n) throws IOException
+    {
+        throw new IOException("Skip not supported.");
+        /*
+        long skip = original.skip(n);
+        current += skip;
+        return skip;
+        */
+    }
+
+    @Override
+    public int available() throws IOException
+    {
+        return 0;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        original.close();
+    }
+
+    private void initializeIfNeeded() throws IOException
+    {
+        if (current >= start) {
+            return;
+
+        }
+        if (start == 0) {
+            current = 0;
+        } else {
+            current = original.skip(--start);
+            if (current != start) {
+                throw new IOException("Cannot skip.");
+            }
+
+            int c;
+            while ((c = original.read()) >= 0) {
+                start++;
+                current++;
+
+                if (c == '\n' || c == '\r' && prefetch() != '\n') {
+                    break;
+                }
+            }
+        }
+
+        if (start >= end) {
+            eof = true;
+        }
+    }
+
+    private int prefetch() throws IOException
+    {
+        int c = original.read();
+        if (c >= 0) {
+            original.unread(c);
+        }
+        return c;
+    }
+}
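HdfsPartialFileInputStream above turns a raw byte range into a line-aligned one: a partition that does not start at offset 0 discards input up to and including the first line terminator at or after its start offset, and every partition keeps reading past its end offset until the current line is finished, so concatenating all partitions yields each line exactly once. The demo below is an editorial sketch of that behavior (not part of the gem), assuming the class from the diff above is compiled and on the classpath; it uses an in-memory stream instead of `fs.open()` and single-byte reads for simplicity.

```java
// Local demo of the range semantics; assumes HdfsPartialFileInputStream from the
// diff above is available on the classpath.
import org.embulk.input.hdfs.HdfsPartialFileInputStream;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class PartialReadSketch
{
    static byte[] readRange(byte[] data, long start, long end) throws IOException
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (HdfsPartialFileInputStream in =
                     new HdfsPartialFileInputStream(new ByteArrayInputStream(data), start, end)) {
            int c;
            while ((c = in.read()) >= 0) {   // single-byte reads for simplicity
                out.write(c);
            }
        }
        return out.toByteArray();
    }

    public static void main(String[] args) throws IOException
    {
        byte[] data = "aaa\nbbbb\ncc\nddddd\n".getBytes(StandardCharsets.UTF_8);
        long mid = data.length / 2;          // an arbitrary split point, mid-line is fine

        byte[] first = readRange(data, 0, mid);             // reads past mid to finish its line
        byte[] second = readRange(data, mid, data.length);  // skips ahead to the next line start

        // Concatenating the two partitions yields every line exactly once.
        System.out.println(new String(first, StandardCharsets.UTF_8).replace("\n", "\\n"));
        System.out.println(new String(second, StandardCharsets.UTF_8).replace("\n", "\\n"));
    }
}
```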
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.0
+  version: 0.1.0
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08
+date: 2015-09-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -54,8 +54,11 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/input/hdfs.rb
-- src/main/java/org/embulk/input/HdfsFileInputPlugin.java
-- src/
+- src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
+- src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
+- src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
+- src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
+- src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -79,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.0.
+- classpath/embulk-input-hdfs-0.1.0.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar
data/classpath/embulk-input-hdfs-0.0.3.jar
DELETED
Binary file
data/src/main/java/org/embulk/input/HdfsFileInputPlugin.java
DELETED
@@ -1,231 +0,0 @@
-package org.embulk.input;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.embulk.config.*;
-import org.embulk.spi.BufferAllocator;
-import org.embulk.spi.Exec;
-import org.embulk.spi.FileInputPlugin;
-import org.embulk.spi.TransactionalFileInput;
-import org.embulk.spi.util.InputStreamFileInput;
-import org.jruby.embed.ScriptingContainer;
-import org.slf4j.Logger;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-public class HdfsFileInputPlugin implements FileInputPlugin
-{
-    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
-
-    public interface PluginTask extends Task
-    {
-        @Config("config_files")
-        @ConfigDefault("[]")
-        public List<String> getConfigFiles();
-
-        @Config("config")
-        @ConfigDefault("{}")
-        public Map<String, String> getConfig();
-
-        @Config("input_path")
-        public String getInputPath();
-
-        @Config("rewind_seconds")
-        @ConfigDefault("0")
-        public int getRewindSeconds();
-
-        public List<String> getTargetFiles();
-        public void setTargetFiles(List<String> targetFiles);
-
-        @ConfigInject
-        public BufferAllocator getBufferAllocator();
-    }
-
-    @Override
-    public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-
-        // prepare
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-        Path inputPath = new Path(strftime(task.getInputPath(), task.getRewindSeconds()));
-
-        // listing
-        List<String> targetFiles;
-        try {
-            targetFiles = globRecursive(fs, inputPath);
-        } catch (IOException e) {
-            logger.error(e.getMessage());
-            throw new RuntimeException(e);
-        }
-        logger.info("Loading target files: {}", targetFiles);
-        task.setTargetFiles(targetFiles);
-
-        // number of processors is same with number of targets
-        int taskCount = targetFiles.size();
-
-        return resume(task.dump(), taskCount, control);
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-                             int taskCount,
-                             FileInputPlugin.Control control)
-    {
-        control.run(taskSource, taskCount);
-        return Exec.newConfigDiff();
-    }
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-                        int taskCount,
-                        List<TaskReport> successTaskReports)
-    {
-    }
-
-    @Override
-    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        // prepare
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-
-        return new HdfsFileInput(task, fs, taskIndex);
-    }
-
-    private Configuration getHdfsConfiguration(final PluginTask task)
-    {
-        Configuration configuration = new Configuration();
-
-        for (Object configFile : task.getConfigFiles()) {
-            configuration.addResource(configFile.toString());
-        }
-        configuration.reloadConfiguration();
-
-        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
-        }
-
-        return configuration;
-    }
-
-    private FileSystem getFs(final Configuration configuration)
-    {
-        try {
-            FileSystem fs = FileSystem.get(configuration);
-            return fs;
-        }
-        catch (IOException e) {
-            logger.error(e.getMessage());
-            throw new RuntimeException(e);
-        }
-    }
-
-    private String strftime(final String raw, final int rewind_seconds)
-    {
-        ScriptingContainer jruby = new ScriptingContainer();
-        Object resolved = jruby.runScriptlet(
-                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
-        return resolved.toString();
-    }
-
-    private List<String> globRecursive(final FileSystem fs, final Path hdfsPath) throws IOException
-    {
-        List<String> container = new ArrayList<String>();
-        for (FileStatus entry : fs.globStatus(hdfsPath)) {
-            if (entry.isDirectory()) {
-                container.addAll(listRecursive(fs, entry));
-            }
-            else {
-                container.add(entry.getPath().toString());
-            }
-        }
-        return container;
-    }
-
-    private List<String> listRecursive(final FileSystem fs, FileStatus status) throws IOException {
-        List<String> container = new ArrayList<String>();
-        if (status.isDirectory()) {
-            for (FileStatus entry : fs.listStatus(status.getPath())) {
-                container.addAll(listRecursive(fs, entry));
-            }
-        }
-        else {
-            container.add(status.getPath().toString());
-        }
-        return container;
-    }
-
-
-
-//    private List<String> listUniquify(List<String> stringList)
-//    {
-//        Set<String> set = new HashSet<String>();
-//        set.addAll(stringList);
-//        List<String> uniqueStringList = new ArrayList<String>();
-//        uniqueStringList.addAll(set);
-//        return uniqueStringList;
-//    }
-
-    public static class HdfsFileInput extends InputStreamFileInput implements TransactionalFileInput
-    {
-        private static class HdfsFileProvider implements InputStreamFileInput.Provider
-        {
-            private final FileSystem fs;
-            private final Path hdfsPath;
-            private boolean opened = false;
-
-            public HdfsFileProvider(PluginTask task, FileSystem fs, int taskIndex)
-            {
-                this.fs = fs;
-                this.hdfsPath = new Path(task.getTargetFiles().get(taskIndex));
-            }
-
-            @Override
-            public InputStream openNext() throws IOException
-            {
-                if (opened) {
-                    return null;
-                }
-
-                opened = true;
-                return fs.open(hdfsPath);
-            }
-
-            @Override
-            public void close()
-            {
-            }
-        }
-
-        public HdfsFileInput(PluginTask task, FileSystem fs, int taskIndex)
-        {
-            super(task.getBufferAllocator(), new HdfsFileProvider(task, fs, taskIndex));
-        }
-
-        @Override
-        public void close()
-        {
-        }
-
-        @Override
-        public void abort()
-        {
-        }
-
-        @Override
-        public TaskReport commit()
-        {
-            return Exec.newTaskReport();
-        }
-    }
-}