embulk-input-hdfs 0.1.9 → 0.2.1
- checksums.yaml +4 -4
- data/CHENGELOG.md +7 -0
- data/README.md +18 -15
- data/build.gradle +1 -1
- data/example/config.yml +4 -1
- data/example/data2.csv.gz +0 -0
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +82 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +248 -212
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +48 -0
- data/src/main/java/org/embulk/input/hdfs/{HdfsPartialFileInputStream.java → PartialFileInputStream.java} +9 -4
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +125 -0
- data/src/main/java/org/embulk/input/hdfs/PartialFileList.java +360 -0
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +38 -14
- data/src/test/resources/sample_03.csv.gz +0 -0
- metadata +26 -21
- data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +0 -40
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e666bbbcb18941dce84889c2ee7fb85d65edbaf4
+  data.tar.gz: 7422b508396787d70e6cea3fc534739c2c20c825
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c305947dbd3f6bded0a23fbc06efd4d44e6d48cdb4b97c8b0e3861cd4b2a9800f6d8c93cf5280ccb235ca88346e727bb5fb549ae3c7bb2e12a13205e20765085
+  data.tar.gz: 8f33bb06731a3c5a25dd723bef83616992ce5fc8b8d5e1a60d8a1da56421a42b49ae3397feb24134a093bf291af87ddbd208fa866c86fdd997d824a6077434a4
data/CHENGELOG.md
ADDED
data/README.md
CHANGED
@@ -14,11 +14,12 @@ Read files on Hdfs.
 
 - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **config** overwrites configuration parameters (hash, default: `{}`)
-- **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`
-- **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
-- **partition** when this is true, partition input files and increase task count. (default: `true`)
-- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
-- **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (default: `0`)
+- **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s` (string, required).
+- **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property. (long, default: `0`)
+- **partition** when this is true, partition input files and increase task count. (boolean, default: `true`)
+- **num_partitions** number of partitions. (long, default: `Runtime.getRuntime().availableProcessors()`)
+- **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (long, default: `0`)
+- **decompression** Decompress compressed files by hadoop compression codec api. (boolean. default: `false`)
 
 ## Example
 
@@ -77,18 +78,20 @@ int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
 ...
 */
 
-
-    if (
-
-
+    long numPartitions;
+    if (task.getPartition()) {
+        if (file.canDecompress()) {
+            numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+        }
+        else if (file.getCodec() != null) { // if not null, the file is compressed.
+            numPartitions = 1;
+        }
+        else {
+            numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+        }
     }
-    else if (!task.getPartition()) {
-        // if no partition mode, skip partitioning.
-        numPartitions = 1;
-    }
     else {
-
-        numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+        numPartitions = 1;
     }
 
 /*
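To make the partitioning arithmetic quoted above concrete, here is a small standalone sketch (Java, same language as the plugin) with made-up numbers; `totalFileLength`, `approximateNumPartitions`, and `fileLength` are illustrative values, not taken from the gem:

```java
public class NumPartitionsExample
{
    public static void main(String[] args)
    {
        long totalFileLength = 1_000_000_000L;   // 1 GB of target files (made up)
        long approximateNumPartitions = 4;       // e.g. 4 available processors
        long partitionSizeByOneTask = totalFileLength / approximateNumPartitions; // 250 MB

        long fileLength = 600_000_000L;          // one 600 MB uncompressed file
        long numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
        System.out.println(numPartitions);       // prints 3: this file is split across 3 tasks
    }
}
```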
data/build.gradle
CHANGED
data/example/config.yml
CHANGED
@@ -12,11 +12,14 @@ local_fs_example: &local_fs_example
     fs.defaultFS: 'file:///'
     fs.hdfs.impl: 'org.apache.hadoop.fs.LocalFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+    io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
 in:
   type: hdfs
   <<: *local_fs_example
-  path: example/data
+  path: example/data*
+  skip_header_lines: 1
+  decompression: true
   parser:
     charset: UTF-8
     newline: CRLF
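The `io.compression.codecs` line added above lists the codecs Hadoop's `CompressionCodecFactory` can resolve by file suffix, which is what the new `decompression` option relies on (the plugin's own lookup appears in the HdfsFileInputPlugin.java diff below). A minimal sketch of that resolution, assuming only hadoop-common on the classpath and using an example file name:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookupExample
{
    public static void main(String[] args)
    {
        Configuration conf = new Configuration();
        conf.set("io.compression.codecs",
                "org.apache.hadoop.io.compress.GzipCodec,"
                + "org.apache.hadoop.io.compress.DefaultCodec,"
                + "org.apache.hadoop.io.compress.BZip2Codec");
        // Resolution is by suffix: ".gz" maps to GzipCodec; an unknown suffix yields null.
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(new Path("example/data2.csv.gz"));
        System.out.println(codec == null ? "no codec" : codec.getClass().getName());
    }
}
```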
data/example/data2.csv.gz
Binary file
data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
ADDED
@@ -0,0 +1,82 @@
+package org.embulk.input.hdfs;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.apache.hadoop.conf.Configuration;
+import org.embulk.config.ConfigException;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
+import java.io.File;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by takahiro.nakayama on 2/22/16.
+ */
+public class ConfigurationBuilder
+{
+    private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
+    private final ImmutableList.Builder<String> configFilesBuilder;
+    private final ImmutableMap.Builder<String, String> configMapBuilder;
+
+    public ConfigurationBuilder()
+    {
+        this.configFilesBuilder = ImmutableList.builder();
+        this.configMapBuilder = ImmutableMap.builder();
+    }
+
+    public ConfigurationBuilder addConfigFiles(List<String> configFiles)
+    {
+        for (String configFile : configFiles) {
+            addConfigFile(configFile);
+        }
+        return this;
+    }
+
+    public ConfigurationBuilder addConfigFile(String configFile)
+    {
+        configFilesBuilder.add(configFile);
+        return this;
+    }
+
+    public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
+    {
+        for (Map.Entry<String, String> entry : configMap.entrySet()) {
+            addConfig(entry.getKey(), entry.getValue());
+        }
+        return this;
+    }
+
+    public ConfigurationBuilder addConfig(String key, String value)
+    {
+        configMapBuilder.put(key, value);
+        return this;
+    }
+
+    public Configuration build()
+    {
+        Configuration configuration = new Configuration();
+        for (String configFile : configFilesBuilder.build()) {
+            File file = new File(configFile);
+            try {
+                configuration.addResource(file.toURI().toURL());
+            }
+            catch (MalformedURLException e) {
+                throw new ConfigException(e);
+            }
+        }
+        for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
+            configuration.set(entry.getKey(), entry.getValue());
+        }
+        // For debug
+        for (Map.Entry<String, String> entry : configuration) {
+            logger.trace("{}: {}", entry.getKey(), entry.getValue());
+        }
+        logger.trace("Resource Files: {}", configuration);
+        return configuration;
+    }
+}
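The new ConfigurationBuilder is consumed by `getConfiguration()` in the HdfsFileInputPlugin.java diff below; as a quick illustration, its methods also chain fluently. A sketch only: the class obtains its logger through Embulk's `Exec`, so it has to run inside an Embulk execution context, and the file paths and `fs.defaultFS` value are placeholders:

```java
package org.embulk.input.hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import java.io.IOException;

// Hypothetical helper, not part of the diff.
class ConfigurationBuilderUsage
{
    static FileSystem openFs() throws IOException
    {
        Configuration conf = new ConfigurationBuilder()
                .addConfigFile("/etc/hadoop/conf/core-site.xml")    // placeholder path
                .addConfigFile("/etc/hadoop/conf/hdfs-site.xml")    // placeholder path
                .addConfig("fs.defaultFS", "hdfs://namenode:8020")  // placeholder address
                .build();
        return FileSystem.get(conf);
    }
}
```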
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
CHANGED
@@ -1,12 +1,18 @@
 package org.embulk.input.hdfs;
 
-import com.google.common.
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Optional;
+import com.google.common.base.Throwables;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
 import org.apache.hadoop.fs.PathNotFoundException;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.embulk.config.Config;
 import org.embulk.config.ConfigDefault;
 import org.embulk.config.ConfigDiff;
@@ -19,261 +25,142 @@ import org.embulk.spi.BufferAllocator;
 import org.embulk.spi.Exec;
 import org.embulk.spi.FileInputPlugin;
 import org.embulk.spi.TransactionalFileInput;
-import org.embulk.spi.util.
+import org.embulk.spi.util.InputStreamFileInput;
 import org.jruby.embed.ScriptingContainer;
 import org.slf4j.Logger;
 
-import javax.annotation.Nullable;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.
-import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
 public class HdfsFileInputPlugin
         implements FileInputPlugin
 {
-    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
-    private static FileSystem fs;
-
     public interface PluginTask
-            extends Task
+            extends Task, PartialFileList.Task
     {
         @Config("config_files")
         @ConfigDefault("[]")
-
+        List<String> getConfigFiles();
 
         @Config("config")
         @ConfigDefault("{}")
-
+        Map<String, String> getConfig();
 
         @Config("path")
-
+        String getPath();
 
         @Config("rewind_seconds")
         @ConfigDefault("0")
-
+        int getRewindSeconds();
 
         @Config("partition")
         @ConfigDefault("true")
-
+        boolean getPartition();
 
         @Config("num_partitions") // this parameter is the approximate value.
         @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
-
+        long getApproximateNumPartitions();
 
         @Config("skip_header_lines") // Skip this number of lines first. Set 1 if the file has header line.
         @ConfigDefault("0") // The reason why the parameter is configured is that this plugin splits files.
-
+        int getSkipHeaderLines();
 
-
+        @Config("decompression") // if true, decompress files by using compression codec
+        @ConfigDefault("false") // when getting FileInputStream.
+        boolean getDecompression();
 
-
+        PartialFileList getPartialFileList();
+        void setPartialFileList(PartialFileList partialFileList);
 
         @ConfigInject
-
+        ScriptingContainer getJRuby();
+
+        @ConfigInject
+        BufferAllocator getBufferAllocator();
     }
 
+    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+    private Optional<Configuration> configurationContainer = Optional.absent();
+
     @Override
     public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
     {
         PluginTask task = config.loadConfig(PluginTask.class);
+        Configuration configuration = getConfiguration(task);
 
         // listing Files
-        String pathString = strftime(task.getPath(), task.getRewindSeconds());
         try {
-
+            FileSystem fs = getFS(configuration);
+
+            String pathString = strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+            Path rootPath = new Path(pathString);
+
+            List<Path> originalFileList = buildOriginalFileList(fs, rootPath);
 
             if (originalFileList.isEmpty()) {
                 throw new PathNotFoundException(pathString);
             }
 
             logger.debug("embulk-input-hdfs: Loading target files: {}", originalFileList);
-
+            PartialFileList list = buildPartialFileList(task, originalFileList);
+            task.setPartialFileList(list);
         }
         catch (IOException e) {
             logger.error(e.getMessage());
             throw new RuntimeException(e);
         }
 
-        // log the detail of partial files.
-        for (HdfsPartialFile partialFile : task.getFiles()) {
-            logger.debug("embulk-input-hdfs: target file: {}, start: {}, end: {}",
-                    partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
-        }
-
         // number of processors is same with number of targets
-        int taskCount = task.
+        int taskCount = task.getPartialFileList().getTaskCount();
         logger.info("embulk-input-hdfs: task size: {}", taskCount);
 
         return resume(task.dump(), taskCount, control);
     }
 
-
-    public ConfigDiff resume(TaskSource taskSource,
-                             int taskCount,
-                             FileInputPlugin.Control control)
+    private Configuration getConfiguration(PluginTask task)
     {
-
-
-
-
-        // usually, yo use last_path
-        //if (task.getFiles().isEmpty()) {
-        //    if (task.getLastPath().isPresent()) {
-        //        configDiff.set("last_path", task.getLastPath().get());
-        //    }
-        //} else {
-        //    List<String> files = new ArrayList<String>(task.getFiles());
-        //    Collections.sort(files);
-        //    configDiff.set("last_path", files.get(files.size() - 1));
-        //}
-
-        return configDiff;
-    }
+        if (configurationContainer.isPresent()) {
+            return configurationContainer.get();
+        }
 
-
-
-
-
-
+        ConfigurationBuilder builder = new ConfigurationBuilder();
+        builder.addConfigFiles(task.getConfigFiles());
+        builder.addConfigMap(task.getConfig());
+        configurationContainer = Optional.of(builder.build());
+        return configurationContainer.get();
     }
 
-
-    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+    private FileSystem getFS(Configuration configuration)
     {
-        final PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        InputStream input;
-        final HdfsPartialFile file = task.getFiles().get(taskIndex);
         try {
-
-            input = new SequenceInputStream(getHeadersInputStream(task, file), openInputStream(task, file));
-        }
-        else {
-            input = openInputStream(task, file);
-        }
+            return FileSystem.get(configuration);
         }
         catch (IOException e) {
-
-            throw new RuntimeException(e);
+            throw Throwables.propagate(e);
         }
-
-        return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input)
-        {
-            @Override
-            public void abort()
-            { }
-
-            @Override
-            public TaskReport commit()
-            {
-                return Exec.newTaskReport();
-            }
-        };
     }
 
-
-
+    @VisibleForTesting
+    String strftime(final ScriptingContainer jruby, final String format, final int rewindSeconds)
     {
-
-
-        int skippedHeaders = 0;
-
-        try (BufferedInputStream in = new BufferedInputStream(fs.open(new Path(partialFile.getPath())))) {
-            while (true) {
-                int c = in.read();
-                if (c < 0) {
-                    break;
-                }
-
-                header.write(c);
-
-                if (c == '\n') {
-                    skippedHeaders++;
-                }
-                else if (c == '\r') {
-                    int c2 = in.read();
-                    if (c2 == '\n') {
-                        header.write(c2);
-                    }
-                    skippedHeaders++;
-                }
-
-                if (skippedHeaders >= task.getSkipHeaderLines()) {
-                    break;
-                }
-            }
-        }
-        header.close();
-        return new ByteArrayInputStream(header.toByteArray());
+        String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
+        return jruby.runScriptlet(script).toString();
     }
 
-    private
-            throws IOException
+    private List<Path> buildOriginalFileList(FileSystem fs, Path rootPath)
     {
-
-        InputStream original = fs.open(new Path(partialFile.getPath()));
-        return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
-    }
+        List<Path> fileList = Lists.newArrayList();
 
-
-
-
-        if (fs == null) {
-            setFs(task);
-            return fs;
-        }
-        else {
-            return fs;
-        }
-    }
-
-    private static FileSystem setFs(final PluginTask task)
-            throws IOException
-    {
-        Configuration configuration = new Configuration();
-
-        for (String configFile : task.getConfigFiles()) {
-            File file = new File(configFile);
-            configuration.addResource(file.toURI().toURL());
-        }
-
-        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
+        final FileStatus[] entries;
+        try {
+            entries = fs.globStatus(rootPath);
         }
-
-
-        for (Map.Entry<String, String> entry : configuration) {
-            logger.trace("{}: {}", entry.getKey(), entry.getValue());
+        catch (IOException e) {
+            throw Throwables.propagate(e);
         }
-        logger.debug("Resource Files: {}", configuration);
-
-        fs = FileSystem.get(configuration);
-        return fs;
-    }
-
-    private String strftime(final String raw, final int rewindSeconds)
-    {
-        ScriptingContainer jruby = new ScriptingContainer();
-        Object resolved = jruby.runScriptlet(
-                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewindSeconds), raw));
-        return resolved.toString();
-    }
-
-    private List<String> buildFileList(final FileSystem fs, final String pathString)
-            throws IOException
-    {
-        List<String> fileList = new ArrayList<>();
-        Path rootPath = new Path(pathString);
-
-        final FileStatus[] entries = fs.globStatus(rootPath);
         // `globStatus` does not throw PathNotFoundException.
         // return null instead.
         // see: https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java#L286
@@ -283,80 +170,229 @@ public class HdfsFileInputPlugin
 
         for (FileStatus entry : entries) {
             if (entry.isDirectory()) {
-
+                List<Path> subEntries = listRecursive(fs, entry);
+                fileList.addAll(subEntries);
             }
             else {
-                fileList.add(entry.getPath()
+                fileList.add(entry.getPath());
             }
         }
 
         return fileList;
     }
 
-    private List<
-            throws IOException
+    private List<Path> listRecursive(final FileSystem fs, FileStatus status)
     {
-        List<
+        List<Path> fileList = Lists.newArrayList();
         if (status.isDirectory()) {
-
-
+            FileStatus[] entries;
+            try {
+                entries = fs.listStatus(status.getPath());
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+
+            for (FileStatus entry : entries) {
+                fileList.addAll(listRecursive(fs, entry));
            }
        }
        else {
-            fileList.add(status.getPath()
+            fileList.add(status.getPath());
        }
        return fileList;
    }
 
-    private
-            throws IOException
+    private PartialFileList buildPartialFileList(PluginTask task, List<Path> pathList)
     {
-
-
-
-
-            public Path apply(@Nullable String input)
-            {
-                return new Path(input);
-            }
-        });
+        Configuration configuration = getConfiguration(task);
+        FileSystem fs = getFS(configuration);
+        boolean shouldPartition = task.getPartition();
+        boolean shouldDecompress = task.getDecompression();
 
+        Map<Path, Long> pathLengthMap = Maps.newHashMap();
         long totalFileLength = 0;
         for (Path path : pathList) {
-
+            long fileLength = getHdfsFileLength(fs, path, shouldDecompress);
+
+            if (fileLength <= 0) {
+                logger.info("Skip the 0 byte target file: {}", path);
+                continue;
+            }
+
+            pathLengthMap.put(path, fileLength);
+            totalFileLength += fileLength;
         }
+        if (totalFileLength <= 0) {
+            throw Throwables.propagate(new PathIOException(task.getPath(), "All files are empty"));
+        }
+
+        PartialFileList.Builder builder = new PartialFileList.Builder(task);
 
         // TODO: optimum allocation of resources
-        long approximateNumPartitions
-
+        final long approximateNumPartitions;
+        if (task.getApproximateNumPartitions() <= 0) {
+            approximateNumPartitions = Runtime.getRuntime().availableProcessors();
+        }
+        else {
+            approximateNumPartitions = task.getApproximateNumPartitions();
+        }
+
         long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
         if (partitionSizeByOneTask <= 0) {
             partitionSizeByOneTask = 1;
         }
 
-
-
-            long fileLength =
-            if (fileLength <= 0) {
-                logger.info("embulk-input-hdfs: Skip the 0 byte target file: {}", path);
-                continue;
-            }
+        for (Map.Entry<Path, Long> entry : pathLengthMap.entrySet()) {
+            Path path = entry.getKey();
+            long fileLength = entry.getValue();
 
             long numPartitions;
-            if (
-
+            if (shouldPartition) {
+                if (shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null) {
+                    numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+                }
+                else if (getHdfsFileCompressionCodec(fs, path) != null) { // if not null, the file is compressed.
+                    numPartitions = 1;
+                }
+                else {
+                    numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+                }
             }
-            else
+            else {
                 numPartitions = 1;
             }
-
-
+
+            for (long i = 0; i < numPartitions; i++) {
+                long start = fileLength * i / numPartitions;
+                long end = fileLength * (i + 1) / numPartitions;
+                if (start < end) {
+                    logger.debug("PartialFile: path {}, start: {}, end: {}", path, start, end);
+                    builder.add(path.toString(), start, end, shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null);
+                }
             }
+        }
+
+        return builder.build();
+    }
 
-
-
+    private Long getHdfsFileLength(FileSystem fs, Path path, boolean shouldDecompression)
+    {
+        CompressionCodec codec = getHdfsFileCompressionCodec(fs, path);
+        if (codec == null) {
+            try {
+                return fs.getFileStatus(path).getLen();
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+        }
+        else if (!shouldDecompression) {
+            try {
+                return fs.getFileStatus(path).getLen();
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
         }
+        else {
+            long fileLength = 0;
+            try (InputStream is = codec.createInputStream(fs.open(path))) {
+                while (is.read() > 0) {
+                    fileLength++;
+                }
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+            return fileLength;
+        }
+    }
 
-
+    private CompressionCodec getHdfsFileCompressionCodec(FileSystem fs, Path path)
+    {
+        return getHdfsFileCompressionCodec(fs.getConf(), path);
+    }
+
+    private CompressionCodec getHdfsFileCompressionCodec(Configuration configuration, Path path)
+    {
+        return new CompressionCodecFactory(configuration).getCodec(path);
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+                             int taskCount,
+                             FileInputPlugin.Control control)
+    {
+        control.run(taskSource, taskCount);
+        ConfigDiff configDiff = Exec.newConfigDiff();
+        return configDiff;
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+                        int taskCount,
+                        List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+    {
+        final PluginTask task = taskSource.loadTask(PluginTask.class);
+        return new HdfsFileInput(task, taskIndex);
+    }
+
+    public class HdfsFileInput
+            extends InputStreamFileInput
+            implements TransactionalFileInput
+    {
+
+        public HdfsFileInput(PluginTask task, int taskIndex)
+        {
+            super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
+        }
+
+        @Override
+        public void abort()
+        {
+        }
+
+        @Override
+        public TaskReport commit()
+        {
+            return Exec.newTaskReport();
+        }
+    }
+
+    // TODO create single-file InputStreamFileInput utility
+    private class SingleFileProvider
+            implements InputStreamFileInput.Provider
+    {
+        private final FileSystem fs;
+        private final int numHeaderLines;
+        private final Iterator<PartialFile> iterator;
+
+        public SingleFileProvider(PluginTask task, int taskIndex)
+        {
+            this.fs = getFS(getConfiguration(task));
+            this.numHeaderLines = task.getSkipHeaderLines();
+            this.iterator = task.getPartialFileList().get(taskIndex).iterator();
+        }
+
+        @Override
+        public InputStream openNext() throws IOException
+        {
+            if (!iterator.hasNext()) {
+                return null;
+            }
+            PartialFileInputStreamBuilder builder = new PartialFileInputStreamBuilder(fs, iterator.next()).withHeaders(numHeaderLines);
+            return builder.build();
+        }
+
+        @Override
+        public void close()
+        {
+        }
     }
 }
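The split boundaries computed in `buildPartialFileList` above are plain integer arithmetic over byte offsets; a standalone sketch with a made-up file length shows the contiguous, non-overlapping ranges it produces:

```java
public class PartialRangeExample
{
    public static void main(String[] args)
    {
        long fileLength = 1000;   // made-up length in bytes
        long numPartitions = 3;
        for (long i = 0; i < numPartitions; i++) {
            long start = fileLength * i / numPartitions;
            long end = fileLength * (i + 1) / numPartitions;
            // Prints [0, 333), [333, 666), [666, 1000).
            System.out.printf("partition %d: [%d, %d)%n", i, start, end);
        }
    }
}
```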