embulk-input-hdfs 0.1.9 → 0.2.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
- metadata.gz: ce120e7049f33e30dd23af9f8b7bcedc1a246457
- data.tar.gz: a2dc70fee60be2ab535df3549e99304e751a7b7a
+ metadata.gz: e666bbbcb18941dce84889c2ee7fb85d65edbaf4
+ data.tar.gz: 7422b508396787d70e6cea3fc534739c2c20c825
 SHA512:
- metadata.gz: a37baf6f948dff41f694457dc9ea9ea9270e41473642114d4dc7a569c61550471b9dbc440478c638fe56ba79956f043097e2129302d3ae12511bdc9d33cef994
- data.tar.gz: 16922c84dcdb9715cb1b0377886b36192acdda31a037352e18df83895f33b09a9f275cd02b9662f02ee411725a6dae65950cfc256c707f639312810839018037
+ metadata.gz: c305947dbd3f6bded0a23fbc06efd4d44e6d48cdb4b97c8b0e3861cd4b2a9800f6d8c93cf5280ccb235ca88346e727bb5fb549ae3c7bb2e12a13205e20765085
+ data.tar.gz: 8f33bb06731a3c5a25dd723bef83616992ce5fc8b8d5e1a60d8a1da56421a42b49ae3397feb24134a093bf291af87ddbd208fa866c86fdd997d824a6077434a4
data/CHENGELOG.md ADDED
@@ -0,0 +1,7 @@
+ 0.2.1 (2016-02-25)
+ ==================
+ - [Fix] does not work
+
+ 0.2.0 (2016-02-15)
+ ==================
+ - [Add] `decompression` option
data/README.md CHANGED
@@ -14,11 +14,12 @@ Read files on Hdfs.
 
 - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
 - **config** overwrites configuration parameters (hash, default: `{}`)
- - **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
- - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
- - **partition** when this is true, partition input files and increase task count. (default: `true`)
- - **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
- - **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (default: `0`)
+ - **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s` (string, required).
+ - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property. (long, default: `0`)
+ - **partition** when this is true, partition input files and increase task count. (boolean, default: `true`)
+ - **num_partitions** number of partitions. (long, default: `Runtime.getRuntime().availableProcessors()`)
+ - **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (long, default: `0`)
+ - **decompression** Decompress compressed files by hadoop compression codec api. (boolean. default: `false`)
 
 ## Example
 
@@ -77,18 +78,20 @@ int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
 ...
 */
 
- int numPartitions;
- if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
- // if the file is compressed, skip partitioning.
- numPartitions = 1;
+ long numPartitions;
+ if (task.getPartition()) {
+ if (file.canDecompress()) {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
+ else if (file.getCodec() != null) { // if not null, the file is compressed.
+ numPartitions = 1;
+ }
+ else {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
 }
- else if (!task.getPartition()) {
- // if no partition mode, skip partitioning.
- numPartitions = 1;
- }
 else {
- // equalize the file size per task as much as possible.
- numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ numPartitions = 1;
 }
 
 /*
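
Note: the 0.2.x snippet above sizes partitions by ceiling division. A minimal sketch (not part of the gem) of that arithmetic, using hypothetical numbers (one 1,000,000-byte file on a 4-core machine):

```java
public class PartitionCountSketch
{
    public static void main(String[] args)
    {
        // Hypothetical numbers: a single 1,000,000-byte file and 4 available processors.
        long totalFileLength = 1_000_000;
        long approximateNumPartitions = 4; // Runtime.getRuntime().availableProcessors()
        long partitionSizeByOneTask = totalFileLength / approximateNumPartitions; // 250,000 bytes

        long fileLength = 1_000_000;
        // Ceiling division from the README snippet above.
        long numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
        System.out.println(numPartitions); // 4 tasks

        // A compressed file is split like this only when decompression is enabled;
        // otherwise it is read as a single partition (numPartitions = 1).
    }
}
```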
data/build.gradle CHANGED
@@ -15,7 +15,7 @@ configurations {
 provided
 }
 
- version = "0.1.9"
+ version = "0.2.1"
 
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
data/example/config.yml CHANGED
@@ -12,11 +12,14 @@ local_fs_example: &local_fs_example
 fs.defaultFS: 'file:///'
 fs.hdfs.impl: 'org.apache.hadoop.fs.LocalFileSystem'
 fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+ io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'
 
 in:
 type: hdfs
 <<: *local_fs_example
- path: example/data.csv
+ path: example/data*
+ skip_header_lines: 1
+ decompression: true
 parser:
 charset: UTF-8
 newline: CRLF
Binary file
@@ -0,0 +1,82 @@
+ package org.embulk.input.hdfs;
+
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.ImmutableMap;
+ import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
+ import org.apache.hadoop.conf.Configuration;
+ import org.embulk.config.ConfigException;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import java.io.File;
+ import java.net.MalformedURLException;
+ import java.util.List;
+ import java.util.Map;
+
+ /**
+ * Created by takahiro.nakayama on 2/22/16.
+ */
+ public class ConfigurationBuilder
+ {
+ private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
+ private final ImmutableList.Builder<String> configFilesBuilder;
+ private final ImmutableMap.Builder<String, String> configMapBuilder;
+
+ public ConfigurationBuilder()
+ {
+ this.configFilesBuilder = ImmutableList.builder();
+ this.configMapBuilder = ImmutableMap.builder();
+ }
+
+ public ConfigurationBuilder addConfigFiles(List<String> configFiles)
+ {
+ for (String configFile : configFiles) {
+ addConfigFile(configFile);
+ }
+ return this;
+ }
+
+ public ConfigurationBuilder addConfigFile(String configFile)
+ {
+ configFilesBuilder.add(configFile);
+ return this;
+ }
+
+ public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
+ {
+ for (Map.Entry<String, String> entry : configMap.entrySet()) {
+ addConfig(entry.getKey(), entry.getValue());
+ }
+ return this;
+ }
+
+ public ConfigurationBuilder addConfig(String key, String value)
+ {
+ configMapBuilder.put(key, value);
+ return this;
+ }
+
+ public Configuration build()
+ {
+ Configuration configuration = new Configuration();
+ for (String configFile : configFilesBuilder.build()) {
+ File file = new File(configFile);
+ try {
+ configuration.addResource(file.toURI().toURL());
+ }
+ catch (MalformedURLException e) {
+ throw new ConfigException(e);
+ }
+ }
+ for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
+ configuration.set(entry.getKey(), entry.getValue());
+ }
+ // For debug
+ for (Map.Entry<String, String> entry : configuration) {
+ logger.trace("{}: {}", entry.getKey(), entry.getValue());
+ }
+ logger.trace("Resource Files: {}", configuration);
+ return configuration;
+ }
+ }
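
The ConfigurationBuilder added above is driven by the plugin's getConfiguration(task) (see the HdfsFileInputPlugin diff below): config_files are loaded first, then individual "config" entries override them. A rough usage sketch, assuming an Embulk Exec session is active (the class obtains its logger from Exec); the file path and property value below are hypothetical:

```java
package org.embulk.input.hdfs;

import org.apache.hadoop.conf.Configuration;

class ConfigurationBuilderUsageSketch
{
    Configuration buildExample()
    {
        // Mirrors getConfiguration(task): resource files first, then "config" overrides.
        return new ConfigurationBuilder()
                .addConfigFile("/etc/hadoop/conf/core-site.xml")   // hypothetical path
                .addConfig("fs.defaultFS", "hdfs://namenode:8020") // hypothetical value
                .build();
    }
}
```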
@@ -1,12 +1,18 @@
 package org.embulk.input.hdfs;
 
- import com.google.common.base.Function;
+ import com.google.common.annotations.VisibleForTesting;
+ import com.google.common.base.Optional;
+ import com.google.common.base.Throwables;
 import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.fs.PathIOException;
 import org.apache.hadoop.fs.PathNotFoundException;
+ import org.apache.hadoop.io.compress.CompressionCodec;
+ import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.embulk.config.Config;
 import org.embulk.config.ConfigDefault;
 import org.embulk.config.ConfigDiff;
@@ -19,261 +25,142 @@ import org.embulk.spi.BufferAllocator;
 import org.embulk.spi.Exec;
 import org.embulk.spi.FileInputPlugin;
 import org.embulk.spi.TransactionalFileInput;
- import org.embulk.spi.util.InputStreamTransactionalFileInput;
+ import org.embulk.spi.util.InputStreamFileInput;
 import org.jruby.embed.ScriptingContainer;
 import org.slf4j.Logger;
 
- import javax.annotation.Nullable;
-
- import java.io.BufferedInputStream;
- import java.io.ByteArrayInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
- import java.io.SequenceInputStream;
- import java.util.ArrayList;
+ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
 public class HdfsFileInputPlugin
 implements FileInputPlugin
 {
- private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
- private static FileSystem fs;
-
 public interface PluginTask
- extends Task
+ extends Task, PartialFileList.Task
 {
 @Config("config_files")
 @ConfigDefault("[]")
- public List<String> getConfigFiles();
+ List<String> getConfigFiles();
 
 @Config("config")
 @ConfigDefault("{}")
- public Map<String, String> getConfig();
+ Map<String, String> getConfig();
 
 @Config("path")
- public String getPath();
+ String getPath();
 
 @Config("rewind_seconds")
 @ConfigDefault("0")
- public int getRewindSeconds();
+ int getRewindSeconds();
 
 @Config("partition")
 @ConfigDefault("true")
- public boolean getPartition();
+ boolean getPartition();
 
 @Config("num_partitions") // this parameter is the approximate value.
 @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
- public long getApproximateNumPartitions();
+ long getApproximateNumPartitions();
 
 @Config("skip_header_lines") // Skip this number of lines first. Set 1 if the file has header line.
 @ConfigDefault("0") // The reason why the parameter is configured is that this plugin splits files.
- public int getSkipHeaderLines();
+ int getSkipHeaderLines();
 
- public List<HdfsPartialFile> getFiles();
+ @Config("decompression") // if true, decompress files by using compression codec
+ @ConfigDefault("false") // when getting FileInputStream.
+ boolean getDecompression();
 
- public void setFiles(List<HdfsPartialFile> hdfsFiles);
+ PartialFileList getPartialFileList();
+ void setPartialFileList(PartialFileList partialFileList);
 
 @ConfigInject
- public BufferAllocator getBufferAllocator();
+ ScriptingContainer getJRuby();
+
+ @ConfigInject
+ BufferAllocator getBufferAllocator();
 }
 
+ private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+ private Optional<Configuration> configurationContainer = Optional.absent();
+
 @Override
 public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
 {
 PluginTask task = config.loadConfig(PluginTask.class);
+ Configuration configuration = getConfiguration(task);
 
 // listing Files
- String pathString = strftime(task.getPath(), task.getRewindSeconds());
 try {
- List<String> originalFileList = buildFileList(getFs(task), pathString);
+ FileSystem fs = getFS(configuration);
+
+ String pathString = strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+ Path rootPath = new Path(pathString);
+
+ List<Path> originalFileList = buildOriginalFileList(fs, rootPath);
 
 if (originalFileList.isEmpty()) {
 throw new PathNotFoundException(pathString);
 }
 
 logger.debug("embulk-input-hdfs: Loading target files: {}", originalFileList);
- task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
+ PartialFileList list = buildPartialFileList(task, originalFileList);
+ task.setPartialFileList(list);
 }
 catch (IOException e) {
 logger.error(e.getMessage());
 throw new RuntimeException(e);
 }
 
- // log the detail of partial files.
- for (HdfsPartialFile partialFile : task.getFiles()) {
- logger.debug("embulk-input-hdfs: target file: {}, start: {}, end: {}",
- partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
- }
-
 // number of processors is same with number of targets
- int taskCount = task.getFiles().size();
+ int taskCount = task.getPartialFileList().getTaskCount();
 logger.info("embulk-input-hdfs: task size: {}", taskCount);
 
 return resume(task.dump(), taskCount, control);
 }
 
- @Override
- public ConfigDiff resume(TaskSource taskSource,
- int taskCount,
- FileInputPlugin.Control control)
+ private Configuration getConfiguration(PluginTask task)
 {
- control.run(taskSource, taskCount);
-
- ConfigDiff configDiff = Exec.newConfigDiff();
-
- // usually, yo use last_path
- //if (task.getFiles().isEmpty()) {
- // if (task.getLastPath().isPresent()) {
- // configDiff.set("last_path", task.getLastPath().get());
- // }
- //} else {
- // List<String> files = new ArrayList<String>(task.getFiles());
- // Collections.sort(files);
- // configDiff.set("last_path", files.get(files.size() - 1));
- //}
-
- return configDiff;
- }
+ if (configurationContainer.isPresent()) {
+ return configurationContainer.get();
+ }
 
- @Override
- public void cleanup(TaskSource taskSource,
- int taskCount,
- List<TaskReport> successTaskReports)
- {
+ ConfigurationBuilder builder = new ConfigurationBuilder();
+ builder.addConfigFiles(task.getConfigFiles());
+ builder.addConfigMap(task.getConfig());
+ configurationContainer = Optional.of(builder.build());
+ return configurationContainer.get();
 }
 
- @Override
- public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+ private FileSystem getFS(Configuration configuration)
 {
- final PluginTask task = taskSource.loadTask(PluginTask.class);
-
- InputStream input;
- final HdfsPartialFile file = task.getFiles().get(taskIndex);
 try {
- if (file.getStart() > 0 && task.getSkipHeaderLines() > 0) {
- input = new SequenceInputStream(getHeadersInputStream(task, file), openInputStream(task, file));
- }
- else {
- input = openInputStream(task, file);
- }
+ return FileSystem.get(configuration);
 }
 catch (IOException e) {
- logger.error(e.getMessage());
- throw new RuntimeException(e);
+ throw Throwables.propagate(e);
 }
-
- return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input)
- {
- @Override
- public void abort()
- { }
-
- @Override
- public TaskReport commit()
- {
- return Exec.newTaskReport();
- }
- };
 }
 
- private InputStream getHeadersInputStream(PluginTask task, HdfsPartialFile partialFile)
- throws IOException
+ @VisibleForTesting
+ String strftime(final ScriptingContainer jruby, final String format, final int rewindSeconds)
 {
- FileSystem fs = getFs(task);
- ByteArrayOutputStream header = new ByteArrayOutputStream();
- int skippedHeaders = 0;
-
- try (BufferedInputStream in = new BufferedInputStream(fs.open(new Path(partialFile.getPath())))) {
- while (true) {
- int c = in.read();
- if (c < 0) {
- break;
- }
-
- header.write(c);
-
- if (c == '\n') {
- skippedHeaders++;
- }
- else if (c == '\r') {
- int c2 = in.read();
- if (c2 == '\n') {
- header.write(c2);
- }
- skippedHeaders++;
- }
-
- if (skippedHeaders >= task.getSkipHeaderLines()) {
- break;
- }
- }
- }
- header.close();
- return new ByteArrayInputStream(header.toByteArray());
+ String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
+ return jruby.runScriptlet(script).toString();
 }
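
A small sketch (not from the diff) of what the new strftime helper evaluates; the path and rewind_seconds values below are hypothetical:

```java
import org.jruby.embed.ScriptingContainer;

public class StrftimeSketch
{
    public static void main(String[] args)
    {
        // With rewind_seconds: 86400 and path: /logs/%Y%m%d/*.gz (both hypothetical),
        // the helper builds the scriptlet "(Time.now - 86400).strftime('/logs/%Y%m%d/*.gz')",
        // so the glob resolves against yesterday's date.
        ScriptingContainer jruby = new ScriptingContainer();
        String script = String.format("(Time.now - %d).strftime('%s')", 86400, "/logs/%Y%m%d/*.gz");
        System.out.println(jruby.runScriptlet(script).toString()); // e.g. /logs/20160224/*.gz on 2016-02-25
    }
}
```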
 
- private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
- throws IOException
+ private List<Path> buildOriginalFileList(FileSystem fs, Path rootPath)
 {
- FileSystem fs = getFs(task);
- InputStream original = fs.open(new Path(partialFile.getPath()));
- return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
- }
+ List<Path> fileList = Lists.newArrayList();
 
- private static FileSystem getFs(final PluginTask task)
- throws IOException
- {
- if (fs == null) {
- setFs(task);
- return fs;
- }
- else {
- return fs;
- }
- }
-
- private static FileSystem setFs(final PluginTask task)
- throws IOException
- {
- Configuration configuration = new Configuration();
-
- for (String configFile : task.getConfigFiles()) {
- File file = new File(configFile);
- configuration.addResource(file.toURI().toURL());
- }
-
- for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
- configuration.set(entry.getKey(), entry.getValue());
+ final FileStatus[] entries;
+ try {
+ entries = fs.globStatus(rootPath);
 }
-
- // For debug
- for (Map.Entry<String, String> entry : configuration) {
- logger.trace("{}: {}", entry.getKey(), entry.getValue());
+ catch (IOException e) {
+ throw Throwables.propagate(e);
 }
- logger.debug("Resource Files: {}", configuration);
-
- fs = FileSystem.get(configuration);
- return fs;
- }
-
- private String strftime(final String raw, final int rewindSeconds)
- {
- ScriptingContainer jruby = new ScriptingContainer();
- Object resolved = jruby.runScriptlet(
- String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewindSeconds), raw));
- return resolved.toString();
- }
-
- private List<String> buildFileList(final FileSystem fs, final String pathString)
- throws IOException
- {
- List<String> fileList = new ArrayList<>();
- Path rootPath = new Path(pathString);
-
- final FileStatus[] entries = fs.globStatus(rootPath);
 // `globStatus` does not throw PathNotFoundException.
 // return null instead.
 // see: https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java#L286
@@ -283,80 +170,229 @@ public class HdfsFileInputPlugin
 
 for (FileStatus entry : entries) {
 if (entry.isDirectory()) {
- fileList.addAll(lsr(fs, entry));
+ List<Path> subEntries = listRecursive(fs, entry);
+ fileList.addAll(subEntries);
 }
 else {
- fileList.add(entry.getPath().toString());
+ fileList.add(entry.getPath());
 }
 }
 
 return fileList;
 }
 
- private List<String> lsr(final FileSystem fs, FileStatus status)
- throws IOException
+ private List<Path> listRecursive(final FileSystem fs, FileStatus status)
 {
- List<String> fileList = new ArrayList<>();
+ List<Path> fileList = Lists.newArrayList();
 if (status.isDirectory()) {
- for (FileStatus entry : fs.listStatus(status.getPath())) {
- fileList.addAll(lsr(fs, entry));
+ FileStatus[] entries;
+ try {
+ entries = fs.listStatus(status.getPath());
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+
+ for (FileStatus entry : entries) {
+ fileList.addAll(listRecursive(fs, entry));
 }
 }
 else {
- fileList.add(status.getPath().toString());
+ fileList.add(status.getPath());
 }
 return fileList;
 }
 
- private List<HdfsPartialFile> allocateHdfsFilesToTasks(final PluginTask task, final FileSystem fs, final List<String> fileList)
- throws IOException
+ private PartialFileList buildPartialFileList(PluginTask task, List<Path> pathList)
 {
- List<Path> pathList = Lists.transform(fileList, new Function<String, Path>()
- {
- @Nullable
- @Override
- public Path apply(@Nullable String input)
- {
- return new Path(input);
- }
- });
+ Configuration configuration = getConfiguration(task);
+ FileSystem fs = getFS(configuration);
+ boolean shouldPartition = task.getPartition();
+ boolean shouldDecompress = task.getDecompression();
 
+ Map<Path, Long> pathLengthMap = Maps.newHashMap();
 long totalFileLength = 0;
 for (Path path : pathList) {
- totalFileLength += fs.getFileStatus(path).getLen();
+ long fileLength = getHdfsFileLength(fs, path, shouldDecompress);
+
+ if (fileLength <= 0) {
+ logger.info("Skip the 0 byte target file: {}", path);
+ continue;
+ }
+
+ pathLengthMap.put(path, fileLength);
+ totalFileLength += fileLength;
 }
+ if (totalFileLength <= 0) {
+ throw Throwables.propagate(new PathIOException(task.getPath(), "All files are empty"));
+ }
+
+ PartialFileList.Builder builder = new PartialFileList.Builder(task);
 
 // TODO: optimum allocation of resources
- long approximateNumPartitions =
- (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
+ final long approximateNumPartitions;
+ if (task.getApproximateNumPartitions() <= 0) {
+ approximateNumPartitions = Runtime.getRuntime().availableProcessors();
+ }
+ else {
+ approximateNumPartitions = task.getApproximateNumPartitions();
+ }
+
 long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
 if (partitionSizeByOneTask <= 0) {
 partitionSizeByOneTask = 1;
 }
 
- List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
- for (Path path : pathList) {
- long fileLength = fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
- if (fileLength <= 0) {
- logger.info("embulk-input-hdfs: Skip the 0 byte target file: {}", path);
- continue;
- }
+ for (Map.Entry<Path, Long> entry : pathLengthMap.entrySet()) {
+ Path path = entry.getKey();
+ long fileLength = entry.getValue();
 
 long numPartitions;
- if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
- numPartitions = 1;
+ if (shouldPartition) {
+ if (shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null) {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
+ else if (getHdfsFileCompressionCodec(fs, path) != null) { // if not null, the file is compressed.
+ numPartitions = 1;
+ }
+ else {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
 }
- else if (!task.getPartition()) {
+ else {
 numPartitions = 1;
 }
- else {
- numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+
+ for (long i = 0; i < numPartitions; i++) {
+ long start = fileLength * i / numPartitions;
+ long end = fileLength * (i + 1) / numPartitions;
+ if (start < end) {
+ logger.debug("PartialFile: path {}, start: {}, end: {}", path, start, end);
+ builder.add(path.toString(), start, end, shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null);
+ }
 }
+ }
+
+ return builder.build();
+ }
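
The start/end loop in buildPartialFileList above slices each file into contiguous byte ranges that together cover the file exactly once; header skipping and the actual reads are handled later by PartialFileInputStreamBuilder. A minimal sketch (not part of the gem) with hypothetical numbers:

```java
public class PartialFileSplitSketch
{
    public static void main(String[] args)
    {
        // Hypothetical: a 10-byte file split into 3 partitions, as in buildPartialFileList.
        long fileLength = 10;
        long numPartitions = 3;
        for (long i = 0; i < numPartitions; i++) {
            long start = fileLength * i / numPartitions;     // 0, 3, 6
            long end = fileLength * (i + 1) / numPartitions; // 3, 6, 10
            if (start < end) {
                System.out.println("start: " + start + ", end: " + end);
            }
        }
        // Adjacent ranges share boundaries, so every byte belongs to exactly one range.
    }
}
```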
 
- HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
- hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
+ private Long getHdfsFileLength(FileSystem fs, Path path, boolean shouldDecompression)
+ {
+ CompressionCodec codec = getHdfsFileCompressionCodec(fs, path);
+ if (codec == null) {
+ try {
+ return fs.getFileStatus(path).getLen();
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+ }
+ else if (!shouldDecompression) {
+ try {
+ return fs.getFileStatus(path).getLen();
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
 }
+ else {
+ long fileLength = 0;
+ try (InputStream is = codec.createInputStream(fs.open(path))) {
+ while (is.read() > 0) {
+ fileLength++;
+ }
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+ return fileLength;
+ }
+ }
 
- return hdfsPartialFiles;
+ private CompressionCodec getHdfsFileCompressionCodec(FileSystem fs, Path path)
+ {
+ return getHdfsFileCompressionCodec(fs.getConf(), path);
+ }
+
+ private CompressionCodec getHdfsFileCompressionCodec(Configuration configuration, Path path)
+ {
+ return new CompressionCodecFactory(configuration).getCodec(path);
+ }
+
+ @Override
+ public ConfigDiff resume(TaskSource taskSource,
+ int taskCount,
+ FileInputPlugin.Control control)
+ {
+ control.run(taskSource, taskCount);
+ ConfigDiff configDiff = Exec.newConfigDiff();
+ return configDiff;
+ }
+
+ @Override
+ public void cleanup(TaskSource taskSource,
+ int taskCount,
+ List<TaskReport> successTaskReports)
+ {
+ }
+
+ @Override
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+ {
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
+ return new HdfsFileInput(task, taskIndex);
+ }
+
+ public class HdfsFileInput
+ extends InputStreamFileInput
+ implements TransactionalFileInput
+ {
+
+ public HdfsFileInput(PluginTask task, int taskIndex)
+ {
+ super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
+ }
+
+ @Override
+ public void abort()
+ {
+ }
+
+ @Override
+ public TaskReport commit()
+ {
+ return Exec.newTaskReport();
+ }
+ }
+
+ // TODO create single-file InputStreamFileInput utility
+ private class SingleFileProvider
+ implements InputStreamFileInput.Provider
+ {
+ private final FileSystem fs;
+ private final int numHeaderLines;
+ private final Iterator<PartialFile> iterator;
+
+ public SingleFileProvider(PluginTask task, int taskIndex)
+ {
+ this.fs = getFS(getConfiguration(task));
+ this.numHeaderLines = task.getSkipHeaderLines();
+ this.iterator = task.getPartialFileList().get(taskIndex).iterator();
+ }
+
+ @Override
+ public InputStream openNext() throws IOException
+ {
+ if (!iterator.hasNext()) {
+ return null;
+ }
+ PartialFileInputStreamBuilder builder = new PartialFileInputStreamBuilder(fs, iterator.next()).withHeaders(numHeaderLines);
+ return builder.build();
+ }
+
+ @Override
+ public void close()
+ {
+ }
 }
 }