embulk-input-hdfs 0.1.9 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHENGELOG.md +7 -0
- data/README.md +18 -15
- data/build.gradle +1 -1
- data/example/config.yml +4 -1
- data/example/data2.csv.gz +0 -0
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +82 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +248 -212
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +48 -0
- data/src/main/java/org/embulk/input/hdfs/{HdfsPartialFileInputStream.java → PartialFileInputStream.java} +9 -4
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +125 -0
- data/src/main/java/org/embulk/input/hdfs/PartialFileList.java +360 -0
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +38 -14
- data/src/test/resources/sample_03.csv.gz +0 -0
- metadata +26 -21
- data/src/main/java/org/embulk/input/hdfs/HdfsFilePartitioner.java +0 -40
- data/src/main/java/org/embulk/input/hdfs/HdfsPartialFile.java +0 -39
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e666bbbcb18941dce84889c2ee7fb85d65edbaf4
+  data.tar.gz: 7422b508396787d70e6cea3fc534739c2c20c825
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c305947dbd3f6bded0a23fbc06efd4d44e6d48cdb4b97c8b0e3861cd4b2a9800f6d8c93cf5280ccb235ca88346e727bb5fb549ae3c7bb2e12a13205e20765085
+  data.tar.gz: 8f33bb06731a3c5a25dd723bef83616992ce5fc8b8d5e1a60d8a1da56421a42b49ae3397feb24134a093bf291af87ddbd208fa866c86fdd997d824a6077434a4
data/CHENGELOG.md
ADDED
data/README.md
CHANGED
@@ -14,11 +14,12 @@ Read files on Hdfs.
 
 - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **config** overwrites configuration parameters (hash, default: `{}`)
-- **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s
-- **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
-- **partition** when this is true, partition input files and increase task count. (default: `true`)
-- **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
-- **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (default: `0`)
+- **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s` (string, required).
+- **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property. (long, default: `0`)
+- **partition** when this is true, partition input files and increase task count. (boolean, default: `true`)
+- **num_partitions** number of partitions. (long, default: `Runtime.getRuntime().availableProcessors()`)
+- **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (long, default: `0`)
+- **decompression** Decompress compressed files by hadoop compression codec api. (boolean. default: `false`)
 
 ## Example
 
@@ -77,18 +78,20 @@ int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
 ...
 */
 
-
-if (
-
-
+long numPartitions;
+if (task.getPartition()) {
+    if (file.canDecompress()) {
+        numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+    }
+    else if (file.getCodec() != null) { // if not null, the file is compressed.
+        numPartitions = 1;
+    }
+    else {
+        numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+    }
 }
-else if (!task.getPartition()) {
-    // if no partition mode, skip partitioning.
-    numPartitions = 1;
-}
 else {
-
-    numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+    numPartitions = 1;
 }
 
 /*
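The README snippet above documents the partitioning heuristic as pseudocode. For reference, here is a minimal, self-contained sketch of the same arithmetic; the byte counts, processor count, and class name are invented for the illustration and are not values shipped with the package.

```java
// Sketch of the partition-count arithmetic described in the README comment.
// All concrete numbers here are hypothetical.
public class PartitionMathExample
{
    public static void main(String[] args)
    {
        long totalFileLength = 1_000_000_000L;      // 1 GB of target files (hypothetical)
        long approximateNumPartitions = 8;          // e.g. Runtime.getRuntime().availableProcessors()

        long partitionSizeByOneTask = totalFileLength / approximateNumPartitions; // 125,000,000

        long fileLength = 300_000_000L;             // one hypothetical uncompressed file
        long numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;     // -> 3 tasks

        System.out.println(partitionSizeByOneTask); // 125000000
        System.out.println(numPartitions);          // 3
    }
}
```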
data/build.gradle
CHANGED
data/example/config.yml
CHANGED
@@ -12,11 +12,14 @@ local_fs_example: &local_fs_example
     fs.defaultFS: 'file:///'
     fs.hdfs.impl: 'org.apache.hadoop.fs.LocalFileSystem'
     fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+    io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
 in:
   type: hdfs
   <<: *local_fs_example
-  path: example/data
+  path: example/data*
+  skip_header_lines: 1
+  decompression: true
   parser:
     charset: UTF-8
     newline: CRLF
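The new `io.compression.codecs` entry and the `decompression: true` option exercise the codec handling added in this release: Hadoop's CompressionCodecFactory resolves a codec from the file suffix, and `io.compression.codecs` controls which codecs are registered. A rough, self-contained sketch of that lookup follows (it is not part of the package; the paths are illustrative):

```java
// Sketch: resolving a compression codec by file suffix with CompressionCodecFactory.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookupExample
{
    public static void main(String[] args)
    {
        Configuration conf = new Configuration();
        conf.set("io.compression.codecs",
                "org.apache.hadoop.io.compress.GzipCodec,"
                + "org.apache.hadoop.io.compress.DefaultCodec,"
                + "org.apache.hadoop.io.compress.BZip2Codec");

        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec gzip = factory.getCodec(new Path("example/data2.csv.gz")); // GzipCodec
        CompressionCodec none = factory.getCodec(new Path("example/data.csv"));     // null

        System.out.println(gzip == null ? "plain" : gzip.getClass().getSimpleName());
        System.out.println(none == null ? "plain" : none.getClass().getSimpleName());
    }
}
```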
data/example/data2.csv.gz
ADDED
Binary file
data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java
ADDED
@@ -0,0 +1,82 @@
+package org.embulk.input.hdfs;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.apache.hadoop.conf.Configuration;
+import org.embulk.config.ConfigException;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
+import java.io.File;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by takahiro.nakayama on 2/22/16.
+ */
+public class ConfigurationBuilder
+{
+    private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
+    private final ImmutableList.Builder<String> configFilesBuilder;
+    private final ImmutableMap.Builder<String, String> configMapBuilder;
+
+    public ConfigurationBuilder()
+    {
+        this.configFilesBuilder = ImmutableList.builder();
+        this.configMapBuilder = ImmutableMap.builder();
+    }
+
+    public ConfigurationBuilder addConfigFiles(List<String> configFiles)
+    {
+        for (String configFile : configFiles) {
+            addConfigFile(configFile);
+        }
+        return this;
+    }
+
+    public ConfigurationBuilder addConfigFile(String configFile)
+    {
+        configFilesBuilder.add(configFile);
+        return this;
+    }
+
+    public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
+    {
+        for (Map.Entry<String, String> entry : configMap.entrySet()) {
+            addConfig(entry.getKey(), entry.getValue());
+        }
+        return this;
+    }
+
+    public ConfigurationBuilder addConfig(String key, String value)
+    {
+        configMapBuilder.put(key, value);
+        return this;
+    }
+
+    public Configuration build()
+    {
+        Configuration configuration = new Configuration();
+        for (String configFile : configFilesBuilder.build()) {
+            File file = new File(configFile);
+            try {
+                configuration.addResource(file.toURI().toURL());
+            }
+            catch (MalformedURLException e) {
+                throw new ConfigException(e);
+            }
+        }
+        for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
+            configuration.set(entry.getKey(), entry.getValue());
+        }
+        // For debug
+        for (Map.Entry<String, String> entry : configuration) {
+            logger.trace("{}: {}", entry.getKey(), entry.getValue());
+        }
+        logger.trace("Resource Files: {}", configuration);
+        return configuration;
+    }
+}
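A hedged usage sketch for the new ConfigurationBuilder follows. It mirrors how HdfsFileInputPlugin.getConfiguration(task) (below) drives the builder, but the file paths and the fs.defaultFS value are placeholders, not values from the package, and it assumes an Embulk Exec context because ConfigurationBuilder logs via Exec.getLogger.

```java
// Sketch only: build a Hadoop Configuration the same way the plugin does.
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.embulk.input.hdfs.ConfigurationBuilder;

class ConfigurationBuilderUsage
{
    static Configuration buildExampleConfiguration()
    {
        return new ConfigurationBuilder()
                .addConfigFile("/etc/hadoop/conf/core-site.xml")   // hypothetical file
                .addConfigFile("/etc/hadoop/conf/hdfs-site.xml")   // hypothetical file
                .addConfigMap(ImmutableMap.of("fs.defaultFS", "hdfs://namenode:8020")) // hypothetical override
                .build();
    }
}
```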
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java
CHANGED
@@ -1,12 +1,18 @@
 package org.embulk.input.hdfs;
 
-import com.google.common.
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Optional;
+import com.google.common.base.Throwables;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathIOException;
 import org.apache.hadoop.fs.PathNotFoundException;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.embulk.config.Config;
 import org.embulk.config.ConfigDefault;
 import org.embulk.config.ConfigDiff;
@@ -19,261 +25,142 @@ import org.embulk.spi.BufferAllocator;
 import org.embulk.spi.Exec;
 import org.embulk.spi.FileInputPlugin;
 import org.embulk.spi.TransactionalFileInput;
-import org.embulk.spi.util.
+import org.embulk.spi.util.InputStreamFileInput;
 import org.jruby.embed.ScriptingContainer;
 import org.slf4j.Logger;
 
-import javax.annotation.Nullable;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.
-import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
 public class HdfsFileInputPlugin
         implements FileInputPlugin
 {
-    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
-    private static FileSystem fs;
-
     public interface PluginTask
-            extends Task
+            extends Task, PartialFileList.Task
     {
         @Config("config_files")
         @ConfigDefault("[]")
-
+        List<String> getConfigFiles();
 
         @Config("config")
         @ConfigDefault("{}")
-
+        Map<String, String> getConfig();
 
         @Config("path")
-
+        String getPath();
 
         @Config("rewind_seconds")
         @ConfigDefault("0")
-
+        int getRewindSeconds();
 
         @Config("partition")
         @ConfigDefault("true")
-
+        boolean getPartition();
 
         @Config("num_partitions") // this parameter is the approximate value.
         @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
-
+        long getApproximateNumPartitions();
 
         @Config("skip_header_lines") // Skip this number of lines first. Set 1 if the file has header line.
         @ConfigDefault("0") // The reason why the parameter is configured is that this plugin splits files.
-
+        int getSkipHeaderLines();
 
-
+        @Config("decompression") // if true, decompress files by using compression codec
+        @ConfigDefault("false") // when getting FileInputStream.
+        boolean getDecompression();
 
-
+        PartialFileList getPartialFileList();
+        void setPartialFileList(PartialFileList partialFileList);
 
         @ConfigInject
-
+        ScriptingContainer getJRuby();
+
+        @ConfigInject
+        BufferAllocator getBufferAllocator();
     }
 
+    private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+    private Optional<Configuration> configurationContainer = Optional.absent();
+
     @Override
     public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
     {
         PluginTask task = config.loadConfig(PluginTask.class);
+        Configuration configuration = getConfiguration(task);
 
         // listing Files
-        String pathString = strftime(task.getPath(), task.getRewindSeconds());
         try {
-
+            FileSystem fs = getFS(configuration);
+
+            String pathString = strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+            Path rootPath = new Path(pathString);
+
+            List<Path> originalFileList = buildOriginalFileList(fs, rootPath);
 
             if (originalFileList.isEmpty()) {
                 throw new PathNotFoundException(pathString);
             }
 
             logger.debug("embulk-input-hdfs: Loading target files: {}", originalFileList);
-
+            PartialFileList list = buildPartialFileList(task, originalFileList);
+            task.setPartialFileList(list);
         }
         catch (IOException e) {
             logger.error(e.getMessage());
             throw new RuntimeException(e);
         }
 
-        // log the detail of partial files.
-        for (HdfsPartialFile partialFile : task.getFiles()) {
-            logger.debug("embulk-input-hdfs: target file: {}, start: {}, end: {}",
-                    partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
-        }
-
         // number of processors is same with number of targets
-        int taskCount = task.
+        int taskCount = task.getPartialFileList().getTaskCount();
         logger.info("embulk-input-hdfs: task size: {}", taskCount);
 
         return resume(task.dump(), taskCount, control);
     }
 
-
-    public ConfigDiff resume(TaskSource taskSource,
-            int taskCount,
-            FileInputPlugin.Control control)
+    private Configuration getConfiguration(PluginTask task)
     {
-
-
-
-
-        // usually, yo use last_path
-        //if (task.getFiles().isEmpty()) {
-        //    if (task.getLastPath().isPresent()) {
-        //        configDiff.set("last_path", task.getLastPath().get());
-        //    }
-        //} else {
-        //    List<String> files = new ArrayList<String>(task.getFiles());
-        //    Collections.sort(files);
-        //    configDiff.set("last_path", files.get(files.size() - 1));
-        //}
-
-        return configDiff;
-    }
+        if (configurationContainer.isPresent()) {
+            return configurationContainer.get();
+        }
 
-
-
-
-
-
+        ConfigurationBuilder builder = new ConfigurationBuilder();
+        builder.addConfigFiles(task.getConfigFiles());
+        builder.addConfigMap(task.getConfig());
+        configurationContainer = Optional.of(builder.build());
+        return configurationContainer.get();
     }
 
-
-    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+    private FileSystem getFS(Configuration configuration)
     {
-        final PluginTask task = taskSource.loadTask(PluginTask.class);
-
-        InputStream input;
-        final HdfsPartialFile file = task.getFiles().get(taskIndex);
         try {
-
-            input = new SequenceInputStream(getHeadersInputStream(task, file), openInputStream(task, file));
-        }
-        else {
-            input = openInputStream(task, file);
-        }
+            return FileSystem.get(configuration);
         }
         catch (IOException e) {
-
-            throw new RuntimeException(e);
+            throw Throwables.propagate(e);
         }
-
-        return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input)
-        {
-            @Override
-            public void abort()
-            { }
-
-            @Override
-            public TaskReport commit()
-            {
-                return Exec.newTaskReport();
-            }
-        };
     }
 
-
-
+    @VisibleForTesting
+    String strftime(final ScriptingContainer jruby, final String format, final int rewindSeconds)
     {
-
-
-        int skippedHeaders = 0;
-
-        try (BufferedInputStream in = new BufferedInputStream(fs.open(new Path(partialFile.getPath())))) {
-            while (true) {
-                int c = in.read();
-                if (c < 0) {
-                    break;
-                }
-
-                header.write(c);
-
-                if (c == '\n') {
-                    skippedHeaders++;
-                }
-                else if (c == '\r') {
-                    int c2 = in.read();
-                    if (c2 == '\n') {
-                        header.write(c2);
-                    }
-                    skippedHeaders++;
-                }
-
-                if (skippedHeaders >= task.getSkipHeaderLines()) {
-                    break;
-                }
-            }
-        }
-        header.close();
-        return new ByteArrayInputStream(header.toByteArray());
+        String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
+        return jruby.runScriptlet(script).toString();
     }
 
-    private
-            throws IOException
+    private List<Path> buildOriginalFileList(FileSystem fs, Path rootPath)
     {
-
-        InputStream original = fs.open(new Path(partialFile.getPath()));
-        return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
-    }
+        List<Path> fileList = Lists.newArrayList();
 
-
-
-
-        if (fs == null) {
-            setFs(task);
-            return fs;
-        }
-        else {
-            return fs;
-        }
-    }
-
-    private static FileSystem setFs(final PluginTask task)
-            throws IOException
-    {
-        Configuration configuration = new Configuration();
-
-        for (String configFile : task.getConfigFiles()) {
-            File file = new File(configFile);
-            configuration.addResource(file.toURI().toURL());
-        }
-
-        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
-            configuration.set(entry.getKey(), entry.getValue());
+        final FileStatus[] entries;
+        try {
+            entries = fs.globStatus(rootPath);
         }
-
-
-        for (Map.Entry<String, String> entry : configuration) {
-            logger.trace("{}: {}", entry.getKey(), entry.getValue());
+        catch (IOException e) {
+            throw Throwables.propagate(e);
         }
-        logger.debug("Resource Files: {}", configuration);
-
-        fs = FileSystem.get(configuration);
-        return fs;
-    }
-
-    private String strftime(final String raw, final int rewindSeconds)
-    {
-        ScriptingContainer jruby = new ScriptingContainer();
-        Object resolved = jruby.runScriptlet(
-                String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewindSeconds), raw));
-        return resolved.toString();
-    }
-
-    private List<String> buildFileList(final FileSystem fs, final String pathString)
-            throws IOException
-    {
-        List<String> fileList = new ArrayList<>();
-        Path rootPath = new Path(pathString);
-
-        final FileStatus[] entries = fs.globStatus(rootPath);
         // `globStatus` does not throw PathNotFoundException.
         // return null instead.
         // see: https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java#L286
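The rewritten strftime() in the hunk above now receives the ScriptingContainer through @ConfigInject instead of instantiating one per call. A small, self-contained sketch of what the scriptlet evaluates follows; the path pattern and rewind value are examples only, not package defaults.

```java
// Sketch of the JRuby-backed strftime() resolution used by the plugin.
import org.jruby.embed.ScriptingContainer;

public class StrftimeExample
{
    public static void main(String[] args)
    {
        ScriptingContainer jruby = new ScriptingContainer();
        int rewindSeconds = 86400;            // look one day back (hypothetical)
        String format = "/logs/%Y%m%d/*.gz";  // hypothetical path pattern

        // Same scriptlet shape as the plugin's strftime(): evaluate the Date
        // format against (Time.now - rewindSeconds) inside JRuby.
        String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
        String resolved = jruby.runScriptlet(script).toString();

        System.out.println(resolved);         // e.g. /logs/20240101/*.gz
    }
}
```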
@@ -283,80 +170,229 @@ public class HdfsFileInputPlugin
 
         for (FileStatus entry : entries) {
             if (entry.isDirectory()) {
-
+                List<Path> subEntries = listRecursive(fs, entry);
+                fileList.addAll(subEntries);
             }
             else {
-                fileList.add(entry.getPath()
+                fileList.add(entry.getPath());
             }
         }
 
         return fileList;
     }
 
-    private List<
-            throws IOException
+    private List<Path> listRecursive(final FileSystem fs, FileStatus status)
     {
-        List<
+        List<Path> fileList = Lists.newArrayList();
         if (status.isDirectory()) {
-
-
+            FileStatus[] entries;
+            try {
+                entries = fs.listStatus(status.getPath());
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+
+            for (FileStatus entry : entries) {
+                fileList.addAll(listRecursive(fs, entry));
             }
         }
         else {
-            fileList.add(status.getPath()
+            fileList.add(status.getPath());
         }
         return fileList;
     }
 
-    private
-            throws IOException
+    private PartialFileList buildPartialFileList(PluginTask task, List<Path> pathList)
     {
-
-
-
-
-            public Path apply(@Nullable String input)
-            {
-                return new Path(input);
-            }
-        });
+        Configuration configuration = getConfiguration(task);
+        FileSystem fs = getFS(configuration);
+        boolean shouldPartition = task.getPartition();
+        boolean shouldDecompress = task.getDecompression();
 
+        Map<Path, Long> pathLengthMap = Maps.newHashMap();
         long totalFileLength = 0;
         for (Path path : pathList) {
-
+            long fileLength = getHdfsFileLength(fs, path, shouldDecompress);
+
+            if (fileLength <= 0) {
+                logger.info("Skip the 0 byte target file: {}", path);
+                continue;
+            }
+
+            pathLengthMap.put(path, fileLength);
+            totalFileLength += fileLength;
         }
+        if (totalFileLength <= 0) {
+            throw Throwables.propagate(new PathIOException(task.getPath(), "All files are empty"));
+        }
+
+        PartialFileList.Builder builder = new PartialFileList.Builder(task);
 
         // TODO: optimum allocation of resources
-        long approximateNumPartitions
-
+        final long approximateNumPartitions;
+        if (task.getApproximateNumPartitions() <= 0) {
+            approximateNumPartitions = Runtime.getRuntime().availableProcessors();
+        }
+        else {
+            approximateNumPartitions = task.getApproximateNumPartitions();
+        }
+
         long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
         if (partitionSizeByOneTask <= 0) {
             partitionSizeByOneTask = 1;
         }
 
-
-
-            long fileLength =
-            if (fileLength <= 0) {
-                logger.info("embulk-input-hdfs: Skip the 0 byte target file: {}", path);
-                continue;
-            }
+        for (Map.Entry<Path, Long> entry : pathLengthMap.entrySet()) {
+            Path path = entry.getKey();
+            long fileLength = entry.getValue();
 
             long numPartitions;
-            if (
-
+            if (shouldPartition) {
+                if (shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null) {
+                    numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+                }
+                else if (getHdfsFileCompressionCodec(fs, path) != null) { // if not null, the file is compressed.
+                    numPartitions = 1;
+                }
+                else {
+                    numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+                }
             }
-            else
+            else {
                 numPartitions = 1;
             }
-
-
+
+            for (long i = 0; i < numPartitions; i++) {
+                long start = fileLength * i / numPartitions;
+                long end = fileLength * (i + 1) / numPartitions;
+                if (start < end) {
+                    logger.debug("PartialFile: path {}, start: {}, end: {}", path, start, end);
+                    builder.add(path.toString(), start, end, shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null);
+                }
             }
+        }
+
+        return builder.build();
+    }
 
-
-
+    private Long getHdfsFileLength(FileSystem fs, Path path, boolean shouldDecompression)
+    {
+        CompressionCodec codec = getHdfsFileCompressionCodec(fs, path);
+        if (codec == null) {
+            try {
+                return fs.getFileStatus(path).getLen();
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+        }
+        else if (!shouldDecompression) {
+            try {
+                return fs.getFileStatus(path).getLen();
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
         }
+        else {
+            long fileLength = 0;
+            try (InputStream is = codec.createInputStream(fs.open(path))) {
+                while (is.read() > 0) {
+                    fileLength++;
+                }
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+            return fileLength;
+        }
+    }
 
-
+    private CompressionCodec getHdfsFileCompressionCodec(FileSystem fs, Path path)
+    {
+        return getHdfsFileCompressionCodec(fs.getConf(), path);
+    }
+
+    private CompressionCodec getHdfsFileCompressionCodec(Configuration configuration, Path path)
+    {
+        return new CompressionCodecFactory(configuration).getCodec(path);
+    }
+
+    @Override
+    public ConfigDiff resume(TaskSource taskSource,
+            int taskCount,
+            FileInputPlugin.Control control)
+    {
+        control.run(taskSource, taskCount);
+        ConfigDiff configDiff = Exec.newConfigDiff();
+        return configDiff;
+    }
+
+    @Override
+    public void cleanup(TaskSource taskSource,
+            int taskCount,
+            List<TaskReport> successTaskReports)
+    {
+    }
+
+    @Override
+    public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+    {
+        final PluginTask task = taskSource.loadTask(PluginTask.class);
+        return new HdfsFileInput(task, taskIndex);
+    }
+
+    public class HdfsFileInput
+            extends InputStreamFileInput
+            implements TransactionalFileInput
+    {
+
+        public HdfsFileInput(PluginTask task, int taskIndex)
+        {
+            super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
+        }
+
+        @Override
+        public void abort()
+        {
+        }
+
+        @Override
+        public TaskReport commit()
+        {
+            return Exec.newTaskReport();
+        }
+    }
+
+    // TODO create single-file InputStreamFileInput utility
+    private class SingleFileProvider
+            implements InputStreamFileInput.Provider
+    {
+        private final FileSystem fs;
+        private final int numHeaderLines;
+        private final Iterator<PartialFile> iterator;
+
+        public SingleFileProvider(PluginTask task, int taskIndex)
+        {
+            this.fs = getFS(getConfiguration(task));
+            this.numHeaderLines = task.getSkipHeaderLines();
+            this.iterator = task.getPartialFileList().get(taskIndex).iterator();
+        }
+
+        @Override
+        public InputStream openNext() throws IOException
+        {
+            if (!iterator.hasNext()) {
+                return null;
+            }
+            PartialFileInputStreamBuilder builder = new PartialFileInputStreamBuilder(fs, iterator.next()).withHeaders(numHeaderLines);
+            return builder.build();
+        }
+
+        @Override
+        public void close()
+        {
+        }
     }
 }
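For orientation, the loop in buildPartialFileList() above divides each file into contiguous byte ranges, one per Embulk task; each range is then opened by PartialFileInputStreamBuilder (with the withHeaders(numHeaderLines) call seen in SingleFileProvider). Below is a minimal sketch of just the range arithmetic, with a made-up file length and partition count.

```java
// Minimal sketch of the byte-range split performed in buildPartialFileList().
// fileLength and numPartitions are hypothetical values for illustration.
public class PartialRangeExample
{
    public static void main(String[] args)
    {
        long fileLength = 10;      // hypothetical file size in bytes
        long numPartitions = 3;    // hypothetical number of tasks for this file

        for (long i = 0; i < numPartitions; i++) {
            long start = fileLength * i / numPartitions;
            long end = fileLength * (i + 1) / numPartitions;
            if (start < end) {
                System.out.println("start: " + start + ", end: " + end);
            }
        }
        // prints ranges 0..3, 3..6, 6..10
    }
}
```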