embulk-input-hdfs 0.2.1 → 0.3.0
- checksums.yaml +4 -4
- data/CHENGELOG.md +5 -0
- data/README.md +20 -17
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java +60 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +107 -200
- data/src/main/java/org/embulk/input/hdfs/Strftime.java +34 -0
- data/src/main/java/org/embulk/input/hdfs/TargetFileInfo.java +174 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileList.java → TargetFileInfoList.java} +73 -90
- data/src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java +128 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileInputStream.java → TargetFilePartialInputStream.java} +4 -6
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +6 -8
- metadata +19 -18
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +0 -82
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +0 -48
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +0 -125
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0ecc7937c596a6725e8c9f95804132af16d94dc
+  data.tar.gz: 0312623c3baf28b75a42839f565a6f0671afc2b8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6a4315e6f9c1dda4f752bcbe930373032b63c019c65bd9356ac385bcce7543fbb8e9279bb5204221f7dd2bef515259c2c19c0e69b1c8d148097567fa599b2c26
+  data.tar.gz: 589a462546b0b1cd376757f61179b857aba5d056b96c6864cc991617929ea6dc5b9255088c79766d54b8784fdc922b4f5192a90a913df0d424bb6fc5e64ea9a8
data/CHENGELOG.md
CHANGED
data/README.md
CHANGED
@@ -77,23 +77,26 @@ int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
 /*
 ...
 */
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+long numPartitions = 1; // default is no partition.
+if (isPartitionable(task, conf, status)) { // partition: true and (decompression: false or CompressionCodec is null)
+    numPartitions = ((status.getLen() - 1) / partitionSizeByOneTask) + 1;
+}
+
+for (long i = 0; i < numPartitions; i++) {
+    long start = status.getLen() * i / numPartitions;
+    long end = status.getLen() * (i + 1) / numPartitions;
+    if (start < end) {
+        TargetFileInfo targetFileInfo = new TargetFileInfo.Builder()
+                .pathString(status.getPath().toString())
+                .start(start)
+                .end(end)
+                .isDecompressible(isDecompressible(task, conf, status))
+                .isPartitionable(isPartitionable(task, conf, status))
+                .numHeaderLines(task.getSkipHeaderLines())
+                .build();
+        builder.add(targetFileInfo);
+    }
+}
 /*
 ...
 */
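To make the arithmetic in the new README example concrete: a file of length L split into numPartitions pieces gets contiguous, non-overlapping byte ranges [L*i/numPartitions, L*(i+1)/numPartitions). The standalone sketch below is illustrative only (it is not part of the gem, and the class name and lengths are made up); it runs the same computation for a hypothetical 1,000-byte file with a 400-byte per-task target.

// Illustrative sketch only: mirrors the start/end arithmetic shown in the README example above.
public class PartitionArithmeticDemo
{
    public static void main(String[] args)
    {
        long fileLength = 1000L;            // hypothetical file length in bytes
        long partitionSizeByOneTask = 400L; // hypothetical target bytes per task
        long numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1; // -> 3

        for (long i = 0; i < numPartitions; i++) {
            long start = fileLength * i / numPartitions;
            long end = fileLength * (i + 1) / numPartitions;
            if (start < end) {
                // Prints [0, 333), [333, 666), [666, 1000): the whole file, with no overlap.
                System.out.println("partition " + i + ": [" + start + ", " + end + ")");
            }
        }
    }
}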
data/build.gradle
CHANGED
data/src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java ADDED
@@ -0,0 +1,60 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.conf.Configuration;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigException;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
+import java.io.File;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Map;
+
+public class ConfigurationFactory
+{
+    public static final Logger logger = Exec.getLogger(ConfigurationFactory.class);
+
+    interface Task
+    {
+        @Config("config_files")
+        @ConfigDefault("[]")
+        List<String> getConfigFiles();
+
+        @Config("config")
+        @ConfigDefault("{}")
+        Map<String, String> getConfig();
+    }
+
+    private ConfigurationFactory()
+    {
+    }
+
+    public static Configuration create(Task task)
+    {
+        Configuration c = new Configuration();
+        for (String f : task.getConfigFiles()) {
+            try {
+                logger.debug("embulk-input-hdfs: load a config file: {}", f);
+                c.addResource(new File(f).toURI().toURL());
+            }
+            catch (MalformedURLException e) {
+                throw new ConfigException(e);
+            }
+        }
+
+        for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
+            logger.debug("embulk-input-hdfs: load a config: {}:{}", entry.getKey(), entry.getValue());
+            c.set(entry.getKey(), entry.getValue());
+        }
+
+        // For logging
+        for (Map.Entry<String, String> entry : c) {
+            logger.trace("embulk-input-hdfs: {}: {}", entry.getKey(), entry.getValue());
+        }
+        logger.trace("embulk-input-hdfs: Resource Files: {}", c);
+
+        return c;
+    }
+}
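ConfigurationFactory.create() above boils down to two plain Hadoop Configuration operations: each config_files entry is added as an XML resource, and each config entry is set as an individual key/value override. A minimal standalone sketch of those same Hadoop calls follows; the file path and property values are placeholders for illustration, not values taken from the gem.

// Sketch only: the raw Hadoop calls that ConfigurationFactory wraps, with placeholder values.
import org.apache.hadoop.conf.Configuration;

import java.io.File;
import java.net.MalformedURLException;

public class HadoopConfSketch
{
    public static void main(String[] args) throws MalformedURLException
    {
        Configuration c = new Configuration();
        // "config_files": each listed file becomes an XML resource of the Configuration.
        c.addResource(new File("/etc/hadoop/conf/core-site.xml").toURI().toURL());
        // "config": individual keys override whatever the resource files define.
        c.set("fs.defaultFS", "hdfs://example-namenode:8020");
        System.out.println(c.get("fs.defaultFS"));
    }
}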
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java CHANGED
@@ -1,17 +1,12 @@
 package org.embulk.input.hdfs;
 
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Optional;
 import com.google.common.base.Throwables;
 import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathIOException;
 import org.apache.hadoop.fs.PathNotFoundException;
-import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.embulk.config.Config;
 import org.embulk.config.ConfigDefault;
@@ -22,43 +17,30 @@ import org.embulk.config.Task;
 import org.embulk.config.TaskReport;
 import org.embulk.config.TaskSource;
 import org.embulk.spi.BufferAllocator;
+import org.embulk.spi.DataException;
 import org.embulk.spi.Exec;
 import org.embulk.spi.FileInputPlugin;
 import org.embulk.spi.TransactionalFileInput;
 import org.embulk.spi.util.InputStreamFileInput;
-import org.jruby.embed.ScriptingContainer;
 import org.slf4j.Logger;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Iterator;
 import java.util.List;
-import java.util.Map;
 
 public class HdfsFileInputPlugin
         implements FileInputPlugin
 {
     public interface PluginTask
-            extends Task,
+            extends Task, TargetFileInfoList.Task, ConfigurationFactory.Task, Strftime.Task
     {
-        @Config("config_files")
-        @ConfigDefault("[]")
-        List<String> getConfigFiles();
-
-        @Config("config")
-        @ConfigDefault("{}")
-        Map<String, String> getConfig();
-
         @Config("path")
         String getPath();
 
-        @Config("rewind_seconds")
-        @ConfigDefault("0")
-        int getRewindSeconds();
-
         @Config("partition")
         @ConfigDefault("true")
-        boolean
+        boolean getWillPartition();
 
         @Config("num_partitions") // this parameter is the approximate value.
         @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
@@ -70,253 +52,175 @@ public class HdfsFileInputPlugin
 
         @Config("decompression") // if true, decompress files by using compression codec
         @ConfigDefault("false") // when getting FileInputStream.
-        boolean
+        boolean getWillDecompress();
 
-
-        void
-
-        @ConfigInject
-        ScriptingContainer getJRuby();
-
-        @ConfigInject
-        BufferAllocator getBufferAllocator();
+        TargetFileInfoList getTargetFileInfoList();
+        void setTargetFileInfoList(TargetFileInfoList targetFileInfoList);
     }
 
     private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
-    private Optional<Configuration> configurationContainer = Optional.absent();
 
     @Override
     public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
     {
         PluginTask task = config.loadConfig(PluginTask.class);
-
+
+        if (task.getWillPartition() && task.getWillDecompress()) {
+            logger.info("embulk-input-hdfs: Please be sure that the target files cannot be partitioned if they are compressed.");
+        }
+
+        Configuration conf = ConfigurationFactory.create(task);
 
         // listing Files
         try {
-            FileSystem fs =
+            FileSystem fs = FileSystem.get(conf);
 
-            String pathString =
+            String pathString = new Strftime(task).format(task.getPath());
             Path rootPath = new Path(pathString);
 
-            List<
+            List<FileStatus> statusList = listFileStatuses(fs, rootPath);
 
-            if (
+            if (statusList.isEmpty()) {
                 throw new PathNotFoundException(pathString);
             }
 
-
-
-
+            for (FileStatus status : statusList) {
+                logger.debug("embulk-input-hdfs: Loading paths: {}, length: {}", status.getPath(), status.getLen());
+            }
+
+            TargetFileInfoList list = buildTargetFileInfoList(task, statusList);
+            task.setTargetFileInfoList(list);
         }
         catch (IOException e) {
             logger.error(e.getMessage());
-            throw
+            throw Throwables.propagate(e);
         }
 
         // number of processors is same with number of targets
-        int taskCount = task.
+        int taskCount = task.getTargetFileInfoList().getTaskCount();
         logger.info("embulk-input-hdfs: task size: {}", taskCount);
 
         return resume(task.dump(), taskCount, control);
     }
 
-    private
+    private List<FileStatus> listFileStatuses(FileSystem fs, Path rootPath)
+            throws IOException
     {
-
-        return configurationContainer.get();
-    }
-
-        ConfigurationBuilder builder = new ConfigurationBuilder();
-        builder.addConfigFiles(task.getConfigFiles());
-        builder.addConfigMap(task.getConfig());
-        configurationContainer = Optional.of(builder.build());
-        return configurationContainer.get();
-    }
+        List<FileStatus> statusList = Lists.newArrayList();
 
-
-    {
-        try {
-            return FileSystem.get(configuration);
-        }
-        catch (IOException e) {
-            throw Throwables.propagate(e);
-        }
-    }
-
-    @VisibleForTesting
-    String strftime(final ScriptingContainer jruby, final String format, final int rewindSeconds)
-    {
-        String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
-        return jruby.runScriptlet(script).toString();
-    }
-
-    private List<Path> buildOriginalFileList(FileSystem fs, Path rootPath)
-    {
-        List<Path> fileList = Lists.newArrayList();
-
-        final FileStatus[] entries;
-        try {
-            entries = fs.globStatus(rootPath);
-        }
-        catch (IOException e) {
-            throw Throwables.propagate(e);
-        }
+        FileStatus[] entries = fs.globStatus(rootPath);
         // `globStatus` does not throw PathNotFoundException.
         // return null instead.
         // see: https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java#L286
         if (entries == null) {
-            return
+            return statusList;
         }
 
         for (FileStatus entry : entries) {
             if (entry.isDirectory()) {
-
-
+                // TODO: use fs.listFiles(entry.getPath(), true); ?
+                List<FileStatus> subEntries = listRecursive(fs, entry);
+                statusList.addAll(subEntries);
             }
             else {
-
+                statusList.add(entry);
             }
         }
 
-        return
+        return statusList;
     }
 
-    private List<
+    private List<FileStatus> listRecursive(FileSystem fs, FileStatus status)
+            throws IOException
     {
-        List<
+        List<FileStatus> statusList = Lists.newArrayList();
         if (status.isDirectory()) {
-            FileStatus[] entries;
-            try {
-                entries = fs.listStatus(status.getPath());
-            }
-            catch (IOException e) {
-                throw Throwables.propagate(e);
-            }
-
+            FileStatus[] entries = fs.listStatus(status.getPath());
             for (FileStatus entry : entries) {
-
+                statusList.addAll(listRecursive(fs, entry));
             }
         }
         else {
-
+            statusList.add(status);
         }
-        return
+        return statusList;
     }
 
-    private
+    private TargetFileInfoList buildTargetFileInfoList(PluginTask task, List<FileStatus> statusList)
+            throws IOException, DataException
     {
-
-        FileSystem fs = getFS(configuration);
-        boolean shouldPartition = task.getPartition();
-        boolean shouldDecompress = task.getDecompression();
-
-        Map<Path, Long> pathLengthMap = Maps.newHashMap();
-        long totalFileLength = 0;
-        for (Path path : pathList) {
-            long fileLength = getHdfsFileLength(fs, path, shouldDecompress);
-
-            if (fileLength <= 0) {
-                logger.info("Skip the 0 byte target file: {}", path);
-                continue;
-            }
-
-            pathLengthMap.put(path, fileLength);
-            totalFileLength += fileLength;
-        }
+        long totalFileLength = calcTotalFilesLength(statusList);
         if (totalFileLength <= 0) {
-
-
-
-            PartialFileList.Builder builder = new PartialFileList.Builder(task);
-
-            // TODO: optimum allocation of resources
-            final long approximateNumPartitions;
-            if (task.getApproximateNumPartitions() <= 0) {
-                approximateNumPartitions = Runtime.getRuntime().availableProcessors();
-            }
-            else {
-                approximateNumPartitions = task.getApproximateNumPartitions();
+            // TODO: skip this error because other file input plugins have no errors if files are empty.
+            throw new DataException("embulk-input-hdfs: All files are empty: " + task.getPath());
         }
 
-        long partitionSizeByOneTask = totalFileLength
-        if (partitionSizeByOneTask <= 0) {
-            partitionSizeByOneTask = 1;
-        }
-
-        for (Map.Entry<Path, Long> entry : pathLengthMap.entrySet()) {
-            Path path = entry.getKey();
-            long fileLength = entry.getValue();
+        long partitionSizeByOneTask = calcApproximatePartitionSizeByOneTask(task, totalFileLength);
 
-
-
-
-
-            }
-
-                numPartitions = 1;
-            }
-            else {
-                numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
-            }
+        Configuration conf = ConfigurationFactory.create(task);
+        TargetFileInfoList.Builder builder = TargetFileInfoList.builder(task);
+        for (FileStatus status : statusList) {
+            if (status.getLen() <= 0) {
+                logger.info("embulk-input-hdfs: Skip the 0 byte target file: {}", status.getPath());
+                continue;
             }
-
-
+
+            long numPartitions = 1; // default is no partition.
+            if (isPartitionable(task, conf, status)) {
+                numPartitions = ((status.getLen() - 1) / partitionSizeByOneTask) + 1;
             }
 
             for (long i = 0; i < numPartitions; i++) {
-                long start =
-                long end =
+                long start = status.getLen() * i / numPartitions;
+                long end = status.getLen() * (i + 1) / numPartitions;
                 if (start < end) {
-
-
+                    TargetFileInfo targetFileInfo = new TargetFileInfo.Builder()
+                            .pathString(status.getPath().toString())
+                            .start(start)
+                            .end(end)
+                            .isDecompressible(isDecompressible(task, conf, status))
+                            .isPartitionable(isPartitionable(task, conf, status))
+                            .numHeaderLines(task.getSkipHeaderLines())
+                            .build();
+                    builder.add(targetFileInfo);
                 }
             }
         }
-
         return builder.build();
     }
 
-    private
+    private boolean isDecompressible(PluginTask task, Configuration conf, FileStatus status)
     {
-
-        if (codec == null) {
-            try {
-                return fs.getFileStatus(path).getLen();
-            }
-            catch (IOException e) {
-                throw Throwables.propagate(e);
-            }
-        }
-        else if (!shouldDecompression) {
-            try {
-                return fs.getFileStatus(path).getLen();
-            }
-            catch (IOException e) {
-                throw Throwables.propagate(e);
-            }
-        }
-        else {
-            long fileLength = 0;
-            try (InputStream is = codec.createInputStream(fs.open(path))) {
-                while (is.read() > 0) {
-                    fileLength++;
-                }
-            }
-            catch (IOException e) {
-                throw Throwables.propagate(e);
-            }
-            return fileLength;
-        }
+        return task.getWillDecompress() && new CompressionCodecFactory(conf).getCodec(status.getPath()) != null;
     }
 
-    private
+    private boolean isPartitionable(PluginTask task, Configuration conf, FileStatus status)
     {
-        return
+        return task.getWillPartition() && !isDecompressible(task, conf, status);
     }
 
-    private
+    private long calcTotalFilesLength(List<FileStatus> statusList)
+            throws IOException
     {
-
+        long total = 0L;
+        for (FileStatus status : statusList) {
+            total += status.getLen();
+        }
+        return total;
+    }
+
+    private long calcApproximatePartitionSizeByOneTask(PluginTask task, long totalFilesLength)
+    {
+        long numPartitions = task.getApproximateNumPartitions();
+        if (numPartitions <= 0) {
+            numPartitions = Runtime.getRuntime().availableProcessors();
+        }
+        // TODO: optimum allocation of resources
+        long partitionSizeByOneTask = totalFilesLength / numPartitions;
+        if (partitionSizeByOneTask <= 0) {
+            partitionSizeByOneTask = 1;
+        }
+        return partitionSizeByOneTask;
     }
 
     @Override
@@ -340,17 +244,22 @@ public class HdfsFileInputPlugin
     public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
     {
         final PluginTask task = taskSource.loadTask(PluginTask.class);
-
+        try {
+            return new HdfsFileInput(task, taskIndex);
+        }
+        catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
     }
 
     public class HdfsFileInput
             extends InputStreamFileInput
             implements TransactionalFileInput
     {
-
         public HdfsFileInput(PluginTask task, int taskIndex)
+                throws IOException
         {
-            super(
+            super(Exec.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
         }
 
         @Override
@@ -369,15 +278,14 @@ public class HdfsFileInputPlugin
     private class SingleFileProvider
             implements InputStreamFileInput.Provider
     {
-        private final
-        private final
-        private final Iterator<PartialFile> iterator;
+        private final TargetFileInputStreamFactory factory;
+        private final Iterator<TargetFileInfo> iterator;
 
         public SingleFileProvider(PluginTask task, int taskIndex)
+                throws IOException
         {
-            this.
-            this.
-            this.iterator = task.getPartialFileList().get(taskIndex).iterator();
+            this.factory = new TargetFileInputStreamFactory(FileSystem.get(ConfigurationFactory.create(task)));
+            this.iterator = task.getTargetFileInfoList().get(taskIndex).iterator();
         }
 
         @Override
@@ -386,8 +294,7 @@ public class HdfsFileInputPlugin
             if (!iterator.hasNext()) {
                 return null;
             }
-
-            return builder.build();
+            return factory.create(iterator.next());
         }
 
         @Override