embulk-input-hdfs 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +12 -1
- data/build.gradle +2 -2
- data/classpath/embulk-input-hdfs-0.1.0.jar +0 -0
- data/lib/embulk/input/hdfs.rb +1 -1
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +267 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +39 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java +40 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java +154 -0
- data/src/test/java/org/embulk/input/{TestHdfsFileInputPlugin.java → hdfs/TestHdfsFileInputPlugin.java} +1 -1
- metadata +8 -5
- data/classpath/embulk-input-hdfs-0.0.3.jar +0 -0
- data/src/main/java/org/embulk/input/HdfsFileInputPlugin.java +0 -231
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 02936bd2f2b0abf89c7fdd6eb48144d0b0853082
+  data.tar.gz: 627f9a5edaf9d804945a92b0ef9fe2c2e0ee271d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e5cc0f62847d833dae4b63a8a2eaaaaf93e6fcce1b62940586c1704cb7a3395c2848bae3b467cf249b3b5c7918d3c86ccd8d3e98c5a67c02a6de69ed56d08c34
+  data.tar.gz: 2aa0bec94527d2898556e45d88718679b3a9032682836fefc76503d5a4a5b90917e51d0d1d2f8c5b439a3a800df08f6494a2e4ef3264a16a2ee0f61c51ee306a
data/README.md
CHANGED

@@ -14,6 +14,7 @@ Read files on Hdfs.
 - **config** overwrites configuration parameters (hash, default: `{}`)
 - **input_path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
 - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
+- **partition** when this is true, partition input files and increase task count. (default: `true`)
 
 ## Example
 
@@ -24,12 +25,13 @@ in:
     - /opt/analytics/etc/hadoop/conf/core-site.xml
     - /opt/analytics/etc/hadoop/conf/hdfs-site.xml
   config:
-    fs.defaultFS: 'hdfs://
+    fs.defaultFS: 'hdfs://hadoop-nn1:8020'
     dfs.replication: 1
     fs.hdfs.impl: 'org.apache.hadoop.hdfs.DistributedFileSystem'
    fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
   input_path: /user/embulk/test/%Y-%m-%d/*
   rewind_seconds: 86400
+  partition: true
   decoders:
   - {type: gzip}
   parser:
@@ -50,6 +52,15 @@ in:
     - {name: c3, type: long}
 ```
 
+## Note
+- the feature of the partition supports only 3 line terminators.
+  - `\n`
+  - `\r`
+  - `\r\n`
+
+## The Reference Implementation
+- [hito4t/embulk-input-filesplit](https://github.com/hito4t/embulk-input-filesplit)
+
 ## Build
 
 ```
data/build.gradle
CHANGED

@@ -12,7 +12,7 @@ configurations {
   provided
 }
 
-version = "0.0
+version = "0.1.0"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
@@ -22,7 +22,7 @@ dependencies {
   provided "org.embulk:embulk-core:0.7.0"
   // compile "YOUR_JAR_DEPENDENCY_GROUP:YOUR_JAR_DEPENDENCY_MODULE:YOUR_JAR_DEPENDENCY_VERSION"
   compile 'org.apache.hadoop:hadoop-client:2.6.0'
-  compile 'com.google.guava:guava:
+  compile 'com.google.guava:guava:15.0'
   testCompile "junit:junit:4.+"
 }
 
data/classpath/embulk-input-hdfs-0.1.0.jar
ADDED

Binary file
data/lib/embulk/input/hdfs.rb
CHANGED

data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
ADDED

@@ -0,0 +1,267 @@
+package org.embulk.input.hdfs;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Map;
+
+import com.google.common.base.Function;
+import com.google.common.base.Optional;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.embulk.config.TaskReport;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigInject;
+import org.embulk.config.ConfigDiff;
+import org.embulk.config.ConfigSource;
+import org.embulk.config.Task;
+import org.embulk.config.TaskSource;
+import org.embulk.spi.*;
+import org.embulk.spi.util.InputStreamFileInput;
+import org.embulk.spi.util.InputStreamTransactionalFileInput;
+import org.jruby.embed.ScriptingContainer;
+import org.slf4j.Logger;
+
+import javax.annotation.Nullable;
+
+public class HdfsFileInputPlugin implements FileInputPlugin
+{
+    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+
+    public interface PluginTask extends Task
+    {
+        @Config("config_files")
+        @ConfigDefault("[]")
+        public List<String> getConfigFiles();
+
+        @Config("config")
+        @ConfigDefault("{}")
+        public Map<String, String> getConfig();
+
+        @Config("input_path")
+        public String getInputPath();
+
+        @Config("rewind_seconds")
+        @ConfigDefault("0")
+        public int getRewindSeconds();
+
+        @Config("partition")
+        @ConfigDefault("true")
+        public boolean getPartition();
+
+        // this parameter is experimental.
+        @Config("partition_level")
+        @ConfigDefault("3")
+        public int getPartitonLevel();
+
+        public List<HdfsPartialFile> getFiles();
+        public void setFiles(List<HdfsPartialFile> hdfsFiles);
+
+        @ConfigInject
+        public BufferAllocator getBufferAllocator();
+    }
+
+    @Override
+    public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
+    {
+        PluginTask task = config.loadConfig(PluginTask.class);
+
+        // listing Files
+        String pathString = strftime(task.getInputPath(), task.getRewindSeconds());
+        try {
+            List<String> originalFileList = buildFileList(getFs(task), pathString);
+            task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
+            logger.info("Loading target files: {}", originalFileList);
+        }
+        catch (IOException e) {
+            logger.error(e.getMessage());
+            throw new RuntimeException(e);
+        }
+
+        // log the detail of partial files.
+        for (HdfsPartialFile partialFile : task.getFiles()) {
+            logger.info("target file: {}, start: {}, end: {}",
+                    partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
+        }
+
+        // number of processors is same with number of targets
+        int taskCount = task.getFiles().size();
+        logger.info("task size: {}", taskCount);
+
+        return resume(task.dump(), taskCount, control);
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+                             int taskCount,
+                             FileInputPlugin.Control control)
+    {
+        control.run(taskSource, taskCount);
+
+        ConfigDiff configDiff = Exec.newConfigDiff();
+
+        // usually, yo use last_path
+        //if (task.getFiles().isEmpty()) {
+        //    if (task.getLastPath().isPresent()) {
+        //        configDiff.set("last_path", task.getLastPath().get());
+        //    }
+        //} else {
+        //    List<String> files = new ArrayList<String>(task.getFiles());
+        //    Collections.sort(files);
+        //    configDiff.set("last_path", files.get(files.size() - 1));
+        //}
+
+        return configDiff;
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+                        int taskCount,
+                        List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+    {
+        final PluginTask task = taskSource.loadTask(PluginTask.class);
+
+        InputStream input;
+        try {
+            input = openInputStream(task, task.getFiles().get(taskIndex));
+        }
+        catch (IOException e) {
+            logger.error(e.getMessage());
+            throw new RuntimeException(e);
+        }
+
+        return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input) {
+            @Override
+            public void abort()
+            { }
+
+            @Override
+            public TaskReport commit()
+            {
+                return Exec.newTaskReport();
+            }
+        };
+    }
+
+    private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
+            throws IOException
+    {
+        FileSystem fs = getFs(task);
+        InputStream original = fs.open(new Path(partialFile.getPath()));
+        return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
+    }
+
+    private static FileSystem getFs(final PluginTask task)
+            throws IOException
+    {
+        Configuration configuration = new Configuration();
+
+        for (Object configFile : task.getConfigFiles()) {
+            configuration.addResource(configFile.toString());
+        }
+        configuration.reloadConfiguration();
+
+        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
+            configuration.set(entry.getKey(), entry.getValue());
+        }
+
+        return FileSystem.get(configuration);
+    }
+
+    private String strftime(final String raw, final int rewind_seconds)
+    {
+        ScriptingContainer jruby = new ScriptingContainer();
+        Object resolved = jruby.runScriptlet(
+                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
+        return resolved.toString();
+    }
+
+    private List<String> buildFileList(final FileSystem fs, final String pathString)
+            throws IOException
+    {
+        List<String> fileList = new ArrayList<>();
+        for (FileStatus entry : fs.globStatus(new Path(pathString))) {
+            if (entry.isDirectory()) {
+                fileList.addAll(lsr(fs, entry));
+            } else {
+                fileList.add(entry.getPath().toString());
+            }
+        }
+        return fileList;
+    }
+
+    private List<String> lsr(final FileSystem fs, FileStatus status)
+            throws IOException
+    {
+        List<String> fileList = new ArrayList<>();
+        if (status.isDirectory()) {
+            for (FileStatus entry : fs.listStatus(status.getPath())) {
+                fileList.addAll(lsr(fs, entry));
+            }
+        }
+        else {
+            fileList.add(status.getPath().toString());
+        }
+        return fileList;
+    }
+
+    private List<HdfsPartialFile> allocateHdfsFilesToTasks(final PluginTask task, final FileSystem fs, final List<String> fileList)
+            throws IOException
+    {
+        List<Path> pathList = Lists.transform(fileList, new Function<String, Path>()
+        {
+            @Nullable
+            @Override
+            public Path apply(@Nullable String input)
+            {
+                return new Path(input);
+            }
+        });
+
+        int totalFileLength = 0;
+        for (Path path : pathList) {
+            totalFileLength += fs.getFileStatus(path).getLen();
+        }
+
+        // TODO: optimum allocation of resources
+        int partitionCountParameter = task.getPartitonLevel();
+        int partitionSizeByOneTask = totalFileLength / (Runtime.getRuntime().availableProcessors() * partitionCountParameter);
+
+        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+        for (Path path : pathList) {
+            int partitionCount;
+
+            if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
+                partitionCount = 1;
+            }
+            else if (!task.getPartition()) {
+                partitionCount = 1;
+            }
+            else {
+                int fileLength = (int) fs.getFileStatus(path).getLen();
+                partitionCount = fileLength / partitionSizeByOneTask;
+                int remainder = fileLength % partitionSizeByOneTask;
+
+                if (remainder > 0) {
+                    partitionCount++;
+                }
+            }
+
+            HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, partitionCount);
+            hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
+        }
+
+        return hdfsPartialFiles;
+    }
+}
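The core of the new plugin is `allocateHdfsFilesToTasks`: it derives a target partition size from the total input size, the number of available processors, and the experimental `partition_level` parameter, then rounds each file's partition count up so the trailing bytes still get a partition (compressed `.gz`/`.bz2`/`.lzo` files, or `partition: false`, always yield a single partition). Below is a minimal sketch of that arithmetic; the core count and file sizes are assumed values chosen only for illustration.

```java
// Sketch of the partition arithmetic in allocateHdfsFilesToTasks.
// The concrete numbers (4 cores, 1 GB of input, a 300 MB file) are assumptions.
public class PartitionMath
{
    public static void main(String[] args)
    {
        int availableProcessors = 4;            // assumed stand-in for Runtime.getRuntime().availableProcessors()
        int partitionLevel = 3;                 // the plugin's default partition_level
        long totalFileLength = 1_000_000_000L;  // assumed total size of all matched files

        // Same formula as the plugin: total size / (cores * partition_level).
        long partitionSizeByOneTask = totalFileLength / (availableProcessors * partitionLevel);

        long fileLength = 300_000_000L;         // assumed size of a single input file
        long partitionCount = fileLength / partitionSizeByOneTask;
        if (fileLength % partitionSizeByOneTask > 0) {
            partitionCount++;                   // round up so the tail of the file is not dropped
        }

        System.out.println(partitionSizeByOneTask); // 83333333
        System.out.println(partitionCount);         // 4 -> this file becomes 4 tasks
    }
}
```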
data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
ADDED

@@ -0,0 +1,39 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+public class HdfsFilePartitioner
+{
+    private FileSystem fs;
+    private Path path;
+    private int partitionCount;
+
+    public HdfsFilePartitioner(FileSystem fs, Path path, int partitionCount)
+    {
+        this.fs = fs;
+        this.path = path;
+        this.partitionCount = partitionCount;
+    }
+
+    public List<HdfsPartialFile> getHdfsPartialFiles() throws IOException
+    {
+        List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
+        long size = fs.getFileStatus(path).getLen();
+        for (int i = 0; i < partitionCount; i++) {
+            long start = size * i / partitionCount;
+            long end = size * (i + 1) / partitionCount;
+            if (start < end) {
+                hdfsPartialFiles.add(new HdfsPartialFile(path.toString(), start, end));
+            }
+        }
+        return hdfsPartialFiles;
+    }
+}
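HdfsFilePartitioner splits a file of length `size` into `partitionCount` contiguous byte ranges with plain integer arithmetic, so the ranges cover the file exactly once with no overlap. A standalone sketch of the same loop follows; the 10-byte size and 3 partitions are assumed example values.

```java
// Standalone sketch of HdfsFilePartitioner's range computation.
public class PartitionRanges
{
    public static void main(String[] args)
    {
        long size = 10;          // assumed file length in bytes
        int partitionCount = 3;  // assumed number of partitions

        for (int i = 0; i < partitionCount; i++) {
            long start = size * i / partitionCount;
            long end = size * (i + 1) / partitionCount;
            if (start < end) {
                // Prints [0, 3), [3, 6), [6, 10) -- contiguous and non-overlapping.
                System.out.println("[" + start + ", " + end + ")");
            }
        }
    }
}
```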
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
ADDED

@@ -0,0 +1,40 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Created by takahiro.nakayama on 8/20/15.
+ */
+// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFile.java
+public class HdfsPartialFile
+{
+    private String path;
+    private long start;
+    private long end;
+
+    public HdfsPartialFile(String path, long start, long end)
+    {
+        this.path = path;
+        this.start = start;
+        this.end = end;
+    }
+
+    // see: http://stackoverflow.com/questions/7625783/jsonmappingexception-no-suitable-constructor-found-for-type-simple-type-class
+    public HdfsPartialFile() { }
+
+    public String getPath()
+    {
+        return path;
+    }
+
+    public long getStart()
+    {
+        return start;
+    }
+
+    public long getEnd()
+    {
+        return end;
+    }
+
+}
data/src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
ADDED

@@ -0,0 +1,154 @@
+package org.embulk.input.hdfs;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+public class HdfsPartialFileInputStream extends InputStream
+{
+    private final PushbackInputStream original;
+    private long start;
+    private long end;
+    private long current;
+    private boolean eof;
+
+    public HdfsPartialFileInputStream(InputStream original, long start, long end)
+    {
+        this.original = new PushbackInputStream(new BufferedInputStream(original));
+        this.start = start;
+        this.end = end;
+        current = -1;
+    }
+
+    @Override
+    public int read(byte[] b) throws IOException
+    {
+        return read(b, 0, b.length);
+    }
+
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException
+    {
+        initializeIfNeeded();
+
+        if (eof) {
+            return -1;
+        }
+
+        int read = original.read(b, off, len);
+        if (read < 0) {
+            eof = true;
+            return -1;
+        }
+
+        current += read;
+        if (current >= end) {
+            for (int i = Math.max((int)(end - 1 - current + read), 0); i < read; i++) {
+                if (b[off + i] == '\n') {
+                    eof = true;
+                    return i + 1;
+                }
+
+                if (b[off + i] == '\r') {
+                    int next = (i < read ? b[off + i + 1] : prefetch());
+                    if (next != '\n') {
+                        eof = true;
+                        return i + 1;
+                    }
+                }
+            }
+        }
+
+        return read;
+    }
+
+    @Override
+    public int read() throws IOException
+    {
+        initializeIfNeeded();
+
+        if (eof) {
+            return -1;
+        }
+
+        int read = original.read();
+        current++;
+
+        if (read < 0) {
+            eof = true;
+            return -1;
+        }
+
+        if (current >= end) {
+            if (read == '\n' || read == '\r' && prefetch() != '\n') {
+                eof = true;
+            }
+        }
+
+        return read;
+    }
+
+    @Override
+    public long skip(long n) throws IOException
+    {
+        throw new IOException("Skip not supported.");
+        /*
+        long skip = original.skip(n);
+        current += skip;
+        return skip;
+        */
+    }
+
+    @Override
+    public int available() throws IOException
+    {
+        return 0;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        original.close();
+    }
+
+    private void initializeIfNeeded() throws IOException
+    {
+        if (current >= start) {
+            return;
+
+        }
+        if (start == 0) {
+            current = 0;
+        } else {
+            current = original.skip(--start);
+            if (current != start) {
+                throw new IOException("Cannot skip.");
+            }
+
+            int c;
+            while ((c = original.read()) >= 0) {
+                start++;
+                current++;
+
+                if (c == '\n' || c == '\r' && prefetch() != '\n') {
+                    break;
+                }
+            }
+        }
+
+        if (start >= end) {
+            eof = true;
+        }
+    }
+
+    private int prefetch() throws IOException
+    {
+        int c = original.read();
+        if (c >= 0) {
+            original.unread(c);
+        }
+        return c;
+    }
+}
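HdfsPartialFileInputStream is what makes the byte ranges safe for line-oriented data: a stream that does not start at offset 0 first skips forward to the next line boundary, and every stream keeps reading past its `end` offset until it finishes the line it is in, so adjacent partitions emit each line exactly once. A small usage sketch against an in-memory stream follows; the sample text and the [0, 6) / [6, 12) split are assumptions, and the demo class name is hypothetical, relying only on the class added in this diff.

```java
package org.embulk.input.hdfs;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

// Usage sketch: two adjacent partitions of the same line-oriented data
// together yield every line exactly once, even though the byte ranges
// cut through the middle of a line.
public class PartialReadDemo
{
    public static void main(String[] args) throws IOException
    {
        byte[] data = "aaa\nbbb\nccc\n".getBytes(StandardCharsets.UTF_8); // 12 bytes

        // First range [0, 6) runs past byte 6 to finish the line it is in: prints "aaa\nbbb\n"
        System.out.print(readAll(new HdfsPartialFileInputStream(new ByteArrayInputStream(data), 0, 6)));

        // Second range [6, 12) skips ahead to the next line boundary first: prints "ccc\n"
        System.out.print(readAll(new HdfsPartialFileInputStream(new ByteArrayInputStream(data), 6, 12)));
    }

    private static String readAll(InputStream in) throws IOException
    {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int c;
        while ((c = in.read()) >= 0) {
            out.write(c);
        }
        return new String(out.toByteArray(), StandardCharsets.UTF_8);
    }
}
```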
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-input-hdfs
 version: !ruby/object:Gem::Version
-  version: 0.0
+  version: 0.1.0
 platform: ruby
 authors:
 - takahiro.nakayama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-08
+date: 2015-09-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -54,8 +54,11 @@ files:
 - gradlew
 - gradlew.bat
 - lib/embulk/input/hdfs.rb
-- src/main/java/org/embulk/input/HdfsFileInputPlugin.java
-- src/
+- src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
+- src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java
+- src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java
+- src/main/java/org/embulk/input/hdfs/HdfsPartialFileInputStream.java
+- src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java
 - classpath/activation-1.1.jar
 - classpath/apacheds-i18n-2.0.0-M15.jar
 - classpath/apacheds-kerberos-codec-2.0.0-M15.jar
@@ -79,7 +82,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-input-hdfs-0.0.
+- classpath/embulk-input-hdfs-0.1.0.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar
data/classpath/embulk-input-hdfs-0.0.3.jar
DELETED

Binary file
data/src/main/java/org/embulk/input/HdfsFileInputPlugin.java
DELETED

@@ -1,231 +0,0 @@
-package org.embulk.input;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.embulk.config.*;
-import org.embulk.spi.BufferAllocator;
-import org.embulk.spi.Exec;
-import org.embulk.spi.FileInputPlugin;
-import org.embulk.spi.TransactionalFileInput;
-import org.embulk.spi.util.InputStreamFileInput;
-import org.jruby.embed.ScriptingContainer;
-import org.slf4j.Logger;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-
-public class HdfsFileInputPlugin implements FileInputPlugin
-{
-    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
-
-    public interface PluginTask extends Task
-    {
-        @Config("config_files")
-        @ConfigDefault("[]")
-        public List<String> getConfigFiles();
-
-        @Config("config")
-        @ConfigDefault("{}")
-        public Map<String, String> getConfig();
-
-        @Config("input_path")
-        public String getInputPath();
-
-        @Config("rewind_seconds")
-        @ConfigDefault("0")
-        public int getRewindSeconds();
-
-        public List<String> getTargetFiles();
-        public void setTargetFiles(List<String> targetFiles);
-
-        @ConfigInject
-        public BufferAllocator getBufferAllocator();
-    }
-
-    @Override
-    public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
-    {
-        PluginTask task = config.loadConfig(PluginTask.class);
-
-        // prepare
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-        Path inputPath = new Path(strftime(task.getInputPath(), task.getRewindSeconds()));
-
-        // listing
-        List<String> targetFiles;
-        try {
-            targetFiles = globRecursive(fs, inputPath);
-        } catch (IOException e) {
-            logger.error(e.getMessage());
-            throw new RuntimeException(e);
-        }
-        logger.info("Loading target files: {}", targetFiles);
-        task.setTargetFiles(targetFiles);
-
-        // number of processors is same with number of targets
-        int taskCount = targetFiles.size();
-
-        return resume(task.dump(), taskCount, control);
-    }
-
-    @Override
-    public ConfigDiff resume(TaskSource taskSource,
-                             int taskCount,
-                             FileInputPlugin.Control control)
-    {
-        control.run(taskSource, taskCount);
-        return Exec.newConfigDiff();
-    }
-
-    @Override
-    public void cleanup(TaskSource taskSource,
-                        int taskCount,
-                        List<TaskReport> successTaskReports)
-    {
-    }
-
-    @Override
-    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
-    {
-        PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        // prepare
-        Configuration configuration = getHdfsConfiguration(task);
-        FileSystem fs = getFs(configuration);
-
-        return new HdfsFileInput(task, fs, taskIndex);
-    }
-
-    private Configuration getHdfsConfiguration(final PluginTask task)
-    {
-        Configuration configuration = new Configuration();
-
-        for (Object configFile : task.getConfigFiles()) {
-            configuration.addResource(configFile.toString());
-        }
-        configuration.reloadConfiguration();
-
-        for (Map.Entry<String, String> entry: task.getConfig().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
-        }
-
-        return configuration;
-    }
-
-    private FileSystem getFs(final Configuration configuration)
-    {
-        try {
-            FileSystem fs = FileSystem.get(configuration);
-            return fs;
-        }
-        catch (IOException e) {
-            logger.error(e.getMessage());
-            throw new RuntimeException(e);
-        }
-    }
-
-    private String strftime(final String raw, final int rewind_seconds)
-    {
-        ScriptingContainer jruby = new ScriptingContainer();
-        Object resolved = jruby.runScriptlet(
-                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewind_seconds), raw));
-        return resolved.toString();
-    }
-
-    private List<String> globRecursive(final FileSystem fs, final Path hdfsPath) throws IOException
-    {
-        List<String> container = new ArrayList<String>();
-        for (FileStatus entry : fs.globStatus(hdfsPath)) {
-            if (entry.isDirectory()) {
-                container.addAll(listRecursive(fs, entry));
-            }
-            else {
-                container.add(entry.getPath().toString());
-            }
-        }
-        return container;
-    }
-
-    private List<String> listRecursive(final FileSystem fs, FileStatus status) throws IOException {
-        List<String> container = new ArrayList<String>();
-        if (status.isDirectory()) {
-            for (FileStatus entry : fs.listStatus(status.getPath())) {
-                container.addAll(listRecursive(fs, entry));
-            }
-        }
-        else {
-            container.add(status.getPath().toString());
-        }
-        return container;
-    }
-
-
-
-//    private List<String> listUniquify(List<String> stringList)
-//    {
-//        Set<String> set = new HashSet<String>();
-//        set.addAll(stringList);
-//        List<String> uniqueStringList = new ArrayList<String>();
-//        uniqueStringList.addAll(set);
-//        return uniqueStringList;
-//    }
-
-    public static class HdfsFileInput extends InputStreamFileInput implements TransactionalFileInput
-    {
-        private static class HdfsFileProvider implements InputStreamFileInput.Provider
-        {
-            private final FileSystem fs;
-            private final Path hdfsPath;
-            private boolean opened = false;
-
-            public HdfsFileProvider(PluginTask task, FileSystem fs, int taskIndex)
-            {
-                this.fs = fs;
-                this.hdfsPath = new Path(task.getTargetFiles().get(taskIndex));
-            }
-
-            @Override
-            public InputStream openNext() throws IOException
-            {
-                if (opened) {
-                    return null;
-                }
-
-                opened = true;
-                return fs.open(hdfsPath);
-            }
-
-            @Override
-            public void close()
-            {
-            }
-        }
-
-        public HdfsFileInput(PluginTask task, FileSystem fs, int taskIndex)
-        {
-            super(task.getBufferAllocator(), new HdfsFileProvider(task, fs, taskIndex));
-        }
-
-        @Override
-        public void close()
-        {
-        }
-
-        @Override
-        public void abort()
-        {
-        }
-
-        @Override
-        public TaskReport commit()
-        {
-            return Exec.newTaskReport();
-        }
-    }
-}