embulk-input-hdfs 0.1.9 → 0.2.1

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: ce120e7049f33e30dd23af9f8b7bcedc1a246457
- data.tar.gz: a2dc70fee60be2ab535df3549e99304e751a7b7a
+ metadata.gz: e666bbbcb18941dce84889c2ee7fb85d65edbaf4
+ data.tar.gz: 7422b508396787d70e6cea3fc534739c2c20c825
  SHA512:
- metadata.gz: a37baf6f948dff41f694457dc9ea9ea9270e41473642114d4dc7a569c61550471b9dbc440478c638fe56ba79956f043097e2129302d3ae12511bdc9d33cef994
- data.tar.gz: 16922c84dcdb9715cb1b0377886b36192acdda31a037352e18df83895f33b09a9f275cd02b9662f02ee411725a6dae65950cfc256c707f639312810839018037
+ metadata.gz: c305947dbd3f6bded0a23fbc06efd4d44e6d48cdb4b97c8b0e3861cd4b2a9800f6d8c93cf5280ccb235ca88346e727bb5fb549ae3c7bb2e12a13205e20765085
+ data.tar.gz: 8f33bb06731a3c5a25dd723bef83616992ce5fc8b8d5e1a60d8a1da56421a42b49ae3397feb24134a093bf291af87ddbd208fa866c86fdd997d824a6077434a4
data/CHENGELOG.md ADDED
@@ -0,0 +1,7 @@
+ 0.2.1 (2016-02-25)
+ ==================
+ - [Fix] does not work
+
+ 0.2.0 (2016-02-15)
+ ==================
+ - [Add] `decompression` option
data/README.md CHANGED
@@ -14,11 +14,12 @@ Read files on Hdfs.

  - **config_files** list of paths to Hadoop's configuration files (array of strings, default: `[]`)
  - **config** overwrites configuration parameters (hash, default: `{}`)
- - **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s`.
- - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property.
- - **partition** when this is true, partition input files and increase task count. (default: `true`)
- - **num_partitions** number of partitions. (default: `Runtime.getRuntime().availableProcessors()`)
- - **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (default: `0`)
+ - **path** file path on Hdfs. you can use glob and Date format like `%Y%m%d/%s` (string, required).
+ - **rewind_seconds** When you use Date format in input_path property, the format is executed by using the time which is Now minus this property. (long, default: `0`)
+ - **partition** when this is true, partition input files and increase task count. (boolean, default: `true`)
+ - **num_partitions** number of partitions. (long, default: `Runtime.getRuntime().availableProcessors()`)
+ - **skip_header_lines** Skip this number of lines first. Set 1 if the file has header line. (long, default: `0`)
+ - **decompression** Decompress compressed files by hadoop compression codec api. (boolean. default: `false`)

  ## Example

@@ -77,18 +78,20 @@ int partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
  ...
  */

- int numPartitions;
- if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
- // if the file is compressed, skip partitioning.
- numPartitions = 1;
+ long numPartitions;
+ if (task.getPartition()) {
+ if (file.canDecompress()) {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
+ else if (file.getCodec() != null) { // if not null, the file is compressed.
+ numPartitions = 1;
+ }
+ else {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
  }
- else if (!task.getPartition()) {
- // if no partition mode, skip partitioning.
- numPartitions = 1;
- }
  else {
- // equalize the file size per task as much as possible.
- numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ numPartitions = 1;
  }

  /*
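
As a standalone illustration of the partition rule the updated README snippet describes (only split when `partition` is true, and treat a compressed file that will not be decompressed as a single partition), here is a minimal, hypothetical sketch; the method and its parameters are illustrative and not part of the plugin:

```java
// Hypothetical sketch of the 0.2.x partition-count rule described above; not the plugin's code.
public class PartitionCountSketch
{
    static long numPartitions(long fileLength, long partitionSizeByOneTask,
                              boolean partition, boolean compressed, boolean willDecompress)
    {
        if (!partition) {
            return 1;                           // partitioning disabled: one partition per file
        }
        if (compressed && !willDecompress) {
            return 1;                           // compressed and kept compressed: cannot be split
        }
        // plain files, or compressed files that will be decompressed, are split evenly
        return ((fileLength - 1) / partitionSizeByOneTask) + 1;
    }

    public static void main(String[] args)
    {
        // e.g. a 100 MB plain file with ~25 MB per task -> 4 partitions
        System.out.println(numPartitions(100_000_000L, 25_000_000L, true, false, false));
        // a gzip file that is not decompressed -> 1 partition
        System.out.println(numPartitions(100_000_000L, 25_000_000L, true, true, false));
    }
}
```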
data/build.gradle CHANGED
@@ -15,7 +15,7 @@ configurations {
  provided
  }

- version = "0.1.9"
+ version = "0.2.1"

  sourceCompatibility = 1.7
  targetCompatibility = 1.7
data/example/config.yml CHANGED
@@ -12,11 +12,14 @@ local_fs_example: &local_fs_example
  fs.defaultFS: 'file:///'
  fs.hdfs.impl: 'org.apache.hadoop.fs.LocalFileSystem'
  fs.file.impl: 'org.apache.hadoop.fs.LocalFileSystem'
+ io.compression.codecs: 'org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec'

  in:
  type: hdfs
  <<: *local_fs_example
- path: example/data.csv
+ path: example/data*
+ skip_header_lines: 1
+ decompression: true
  parser:
  charset: UTF-8
  newline: CRLF
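
The `io.compression.codecs` entry and `decompression: true` above rely on Hadoop resolving a codec from the file name; a minimal sketch of that lookup (the file name is a placeholder), using the same `CompressionCodecFactory` call the plugin uses internally:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

// Sketch: resolve a compression codec from a path's extension.
public class CodecLookupSketch
{
    public static void main(String[] args)
    {
        Configuration conf = new Configuration();
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);

        // Returns GzipCodec for *.gz, BZip2Codec for *.bz2, or null for a plain file.
        CompressionCodec codec = factory.getCodec(new Path("example/data.csv.gz"));
        System.out.println(codec == null ? "no codec (plain file)" : codec.getClass().getName());
    }
}
```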
Binary file
data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java ADDED
@@ -0,0 +1,82 @@
+ package org.embulk.input.hdfs;
+
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.ImmutableMap;
+ import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
+ import org.apache.hadoop.conf.Configuration;
+ import org.embulk.config.ConfigException;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import java.io.File;
+ import java.net.MalformedURLException;
+ import java.util.List;
+ import java.util.Map;
+
+ /**
+ * Created by takahiro.nakayama on 2/22/16.
+ */
+ public class ConfigurationBuilder
+ {
+ private static final Logger logger = Exec.getLogger(ConfigurationBuilder.class);
+ private final ImmutableList.Builder<String> configFilesBuilder;
+ private final ImmutableMap.Builder<String, String> configMapBuilder;
+
+ public ConfigurationBuilder()
+ {
+ this.configFilesBuilder = ImmutableList.builder();
+ this.configMapBuilder = ImmutableMap.builder();
+ }
+
+ public ConfigurationBuilder addConfigFiles(List<String> configFiles)
+ {
+ for (String configFile : configFiles) {
+ addConfigFile(configFile);
+ }
+ return this;
+ }
+
+ public ConfigurationBuilder addConfigFile(String configFile)
+ {
+ configFilesBuilder.add(configFile);
+ return this;
+ }
+
+ public ConfigurationBuilder addConfigMap(Map<String, String> configMap)
+ {
+ for (Map.Entry<String, String> entry : configMap.entrySet()) {
+ addConfig(entry.getKey(), entry.getValue());
+ }
+ return this;
+ }
+
+ public ConfigurationBuilder addConfig(String key, String value)
+ {
+ configMapBuilder.put(key, value);
+ return this;
+ }
+
+ public Configuration build()
+ {
+ Configuration configuration = new Configuration();
+ for (String configFile : configFilesBuilder.build()) {
+ File file = new File(configFile);
+ try {
+ configuration.addResource(file.toURI().toURL());
+ }
+ catch (MalformedURLException e) {
+ throw new ConfigException(e);
+ }
+ }
+ for (Map.Entry<String, String> entry : configMapBuilder.build().entrySet()) {
+ configuration.set(entry.getKey(), entry.getValue());
+ }
+ // For debug
+ for (Map.Entry<String, String> entry : configuration) {
+ logger.trace("{}: {}", entry.getKey(), entry.getValue());
+ }
+ logger.trace("Resource Files: {}", configuration);
+ return configuration;
+ }
+ }
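
A hypothetical usage of the `ConfigurationBuilder` added above, mirroring how the plugin assembles a Hadoop `Configuration` from `config_files` and `config`; the file paths and the `fs.defaultFS` value are placeholders:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.embulk.input.hdfs.ConfigurationBuilder;

import java.io.IOException;
import java.util.Arrays;

public class ConfigurationBuilderUsageSketch
{
    public static void main(String[] args) throws IOException
    {
        // Resource files are added first; explicit "config" entries are set afterwards and win.
        Configuration conf = new ConfigurationBuilder()
                .addConfigFiles(Arrays.asList(
                        "/etc/hadoop/conf/core-site.xml",     // placeholder path
                        "/etc/hadoop/conf/hdfs-site.xml"))    // placeholder path
                .addConfig("fs.defaultFS", "hdfs://namenode.example.com:8020")  // placeholder
                .build();

        FileSystem fs = FileSystem.get(conf);  // same call the plugin's getFS() makes
        System.out.println(fs.getUri());
    }
}
```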
data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java CHANGED
@@ -1,12 +1,18 @@
  package org.embulk.input.hdfs;

- import com.google.common.base.Function;
+ import com.google.common.annotations.VisibleForTesting;
+ import com.google.common.base.Optional;
+ import com.google.common.base.Throwables;
  import com.google.common.collect.Lists;
+ import com.google.common.collect.Maps;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileStatus;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;
+ import org.apache.hadoop.fs.PathIOException;
  import org.apache.hadoop.fs.PathNotFoundException;
+ import org.apache.hadoop.io.compress.CompressionCodec;
+ import org.apache.hadoop.io.compress.CompressionCodecFactory;
  import org.embulk.config.Config;
  import org.embulk.config.ConfigDefault;
  import org.embulk.config.ConfigDiff;
@@ -19,261 +25,142 @@ import org.embulk.spi.BufferAllocator;
  import org.embulk.spi.Exec;
  import org.embulk.spi.FileInputPlugin;
  import org.embulk.spi.TransactionalFileInput;
- import org.embulk.spi.util.InputStreamTransactionalFileInput;
+ import org.embulk.spi.util.InputStreamFileInput;
  import org.jruby.embed.ScriptingContainer;
  import org.slf4j.Logger;

- import javax.annotation.Nullable;
-
- import java.io.BufferedInputStream;
- import java.io.ByteArrayInputStream;
- import java.io.ByteArrayOutputStream;
- import java.io.File;
  import java.io.IOException;
  import java.io.InputStream;
- import java.io.SequenceInputStream;
- import java.util.ArrayList;
+ import java.util.Iterator;
  import java.util.List;
  import java.util.Map;

  public class HdfsFileInputPlugin
  implements FileInputPlugin
  {
- private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
- private static FileSystem fs;
-
  public interface PluginTask
- extends Task
+ extends Task, PartialFileList.Task
  {
  @Config("config_files")
  @ConfigDefault("[]")
- public List<String> getConfigFiles();
+ List<String> getConfigFiles();

  @Config("config")
  @ConfigDefault("{}")
- public Map<String, String> getConfig();
+ Map<String, String> getConfig();

  @Config("path")
- public String getPath();
+ String getPath();

  @Config("rewind_seconds")
  @ConfigDefault("0")
- public int getRewindSeconds();
+ int getRewindSeconds();

  @Config("partition")
  @ConfigDefault("true")
- public boolean getPartition();
+ boolean getPartition();

  @Config("num_partitions") // this parameter is the approximate value.
  @ConfigDefault("-1") // Default: Runtime.getRuntime().availableProcessors()
- public long getApproximateNumPartitions();
+ long getApproximateNumPartitions();

  @Config("skip_header_lines") // Skip this number of lines first. Set 1 if the file has header line.
  @ConfigDefault("0") // The reason why the parameter is configured is that this plugin splits files.
- public int getSkipHeaderLines();
+ int getSkipHeaderLines();

- public List<HdfsPartialFile> getFiles();
+ @Config("decompression") // if true, decompress files by using compression codec
+ @ConfigDefault("false") // when getting FileInputStream.
+ boolean getDecompression();

- public void setFiles(List<HdfsPartialFile> hdfsFiles);
+ PartialFileList getPartialFileList();
+ void setPartialFileList(PartialFileList partialFileList);

  @ConfigInject
- public BufferAllocator getBufferAllocator();
+ ScriptingContainer getJRuby();
+
+ @ConfigInject
+ BufferAllocator getBufferAllocator();
  }

+ private static final Logger logger = Exec.getLogger(HdfsFileInputPlugin.class);
+ private Optional<Configuration> configurationContainer = Optional.absent();
+
  @Override
  public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
  {
  PluginTask task = config.loadConfig(PluginTask.class);
+ Configuration configuration = getConfiguration(task);

  // listing Files
- String pathString = strftime(task.getPath(), task.getRewindSeconds());
  try {
- List<String> originalFileList = buildFileList(getFs(task), pathString);
+ FileSystem fs = getFS(configuration);
+
+ String pathString = strftime(task.getJRuby(), task.getPath(), task.getRewindSeconds());
+ Path rootPath = new Path(pathString);
+
+ List<Path> originalFileList = buildOriginalFileList(fs, rootPath);

  if (originalFileList.isEmpty()) {
  throw new PathNotFoundException(pathString);
  }

  logger.debug("embulk-input-hdfs: Loading target files: {}", originalFileList);
- task.setFiles(allocateHdfsFilesToTasks(task, getFs(task), originalFileList));
+ PartialFileList list = buildPartialFileList(task, originalFileList);
+ task.setPartialFileList(list);
  }
  catch (IOException e) {
  logger.error(e.getMessage());
  throw new RuntimeException(e);
  }

- // log the detail of partial files.
- for (HdfsPartialFile partialFile : task.getFiles()) {
- logger.debug("embulk-input-hdfs: target file: {}, start: {}, end: {}",
- partialFile.getPath(), partialFile.getStart(), partialFile.getEnd());
- }
-
  // number of processors is same with number of targets
- int taskCount = task.getFiles().size();
+ int taskCount = task.getPartialFileList().getTaskCount();
  logger.info("embulk-input-hdfs: task size: {}", taskCount);

  return resume(task.dump(), taskCount, control);
  }

- @Override
- public ConfigDiff resume(TaskSource taskSource,
- int taskCount,
- FileInputPlugin.Control control)
+ private Configuration getConfiguration(PluginTask task)
  {
- control.run(taskSource, taskCount);
-
- ConfigDiff configDiff = Exec.newConfigDiff();
-
- // usually, yo use last_path
- //if (task.getFiles().isEmpty()) {
- // if (task.getLastPath().isPresent()) {
- // configDiff.set("last_path", task.getLastPath().get());
- // }
- //} else {
- // List<String> files = new ArrayList<String>(task.getFiles());
- // Collections.sort(files);
- // configDiff.set("last_path", files.get(files.size() - 1));
- //}
-
- return configDiff;
- }
+ if (configurationContainer.isPresent()) {
+ return configurationContainer.get();
+ }

- @Override
- public void cleanup(TaskSource taskSource,
- int taskCount,
- List<TaskReport> successTaskReports)
- {
+ ConfigurationBuilder builder = new ConfigurationBuilder();
+ builder.addConfigFiles(task.getConfigFiles());
+ builder.addConfigMap(task.getConfig());
+ configurationContainer = Optional.of(builder.build());
+ return configurationContainer.get();
  }

- @Override
- public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+ private FileSystem getFS(Configuration configuration)
  {
- final PluginTask task = taskSource.loadTask(PluginTask.class);
-
- InputStream input;
- final HdfsPartialFile file = task.getFiles().get(taskIndex);
  try {
- if (file.getStart() > 0 && task.getSkipHeaderLines() > 0) {
- input = new SequenceInputStream(getHeadersInputStream(task, file), openInputStream(task, file));
- }
- else {
- input = openInputStream(task, file);
- }
+ return FileSystem.get(configuration);
  }
  catch (IOException e) {
- logger.error(e.getMessage());
- throw new RuntimeException(e);
+ throw Throwables.propagate(e);
  }
-
- return new InputStreamTransactionalFileInput(task.getBufferAllocator(), input)
- {
- @Override
- public void abort()
- { }
-
- @Override
- public TaskReport commit()
- {
- return Exec.newTaskReport();
- }
- };
  }

- private InputStream getHeadersInputStream(PluginTask task, HdfsPartialFile partialFile)
- throws IOException
+ @VisibleForTesting
+ String strftime(final ScriptingContainer jruby, final String format, final int rewindSeconds)
  {
- FileSystem fs = getFs(task);
- ByteArrayOutputStream header = new ByteArrayOutputStream();
- int skippedHeaders = 0;
-
- try (BufferedInputStream in = new BufferedInputStream(fs.open(new Path(partialFile.getPath())))) {
- while (true) {
- int c = in.read();
- if (c < 0) {
- break;
- }
-
- header.write(c);
-
- if (c == '\n') {
- skippedHeaders++;
- }
- else if (c == '\r') {
- int c2 = in.read();
- if (c2 == '\n') {
- header.write(c2);
- }
- skippedHeaders++;
- }
-
- if (skippedHeaders >= task.getSkipHeaderLines()) {
- break;
- }
- }
- }
- header.close();
- return new ByteArrayInputStream(header.toByteArray());
+ String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
+ return jruby.runScriptlet(script).toString();
  }

- private static HdfsPartialFileInputStream openInputStream(PluginTask task, HdfsPartialFile partialFile)
- throws IOException
+ private List<Path> buildOriginalFileList(FileSystem fs, Path rootPath)
  {
- FileSystem fs = getFs(task);
- InputStream original = fs.open(new Path(partialFile.getPath()));
- return new HdfsPartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
- }
+ List<Path> fileList = Lists.newArrayList();

- private static FileSystem getFs(final PluginTask task)
- throws IOException
- {
- if (fs == null) {
- setFs(task);
- return fs;
- }
- else {
- return fs;
- }
- }
-
- private static FileSystem setFs(final PluginTask task)
- throws IOException
- {
- Configuration configuration = new Configuration();
-
- for (String configFile : task.getConfigFiles()) {
- File file = new File(configFile);
- configuration.addResource(file.toURI().toURL());
- }
-
- for (Map.Entry<String, String> entry : task.getConfig().entrySet()) {
- configuration.set(entry.getKey(), entry.getValue());
+ final FileStatus[] entries;
+ try {
+ entries = fs.globStatus(rootPath);
  }
-
- // For debug
- for (Map.Entry<String, String> entry : configuration) {
- logger.trace("{}: {}", entry.getKey(), entry.getValue());
+ catch (IOException e) {
+ throw Throwables.propagate(e);
  }
- logger.debug("Resource Files: {}", configuration);
-
- fs = FileSystem.get(configuration);
- return fs;
- }
-
- private String strftime(final String raw, final int rewindSeconds)
- {
- ScriptingContainer jruby = new ScriptingContainer();
- Object resolved = jruby.runScriptlet(
- String.format("(Time.now - %s).strftime('%s')", String.valueOf(rewindSeconds), raw));
- return resolved.toString();
- }
-
- private List<String> buildFileList(final FileSystem fs, final String pathString)
- throws IOException
- {
- List<String> fileList = new ArrayList<>();
- Path rootPath = new Path(pathString);
-
- final FileStatus[] entries = fs.globStatus(rootPath);
  // `globStatus` does not throw PathNotFoundException.
  // return null instead.
  // see: https://github.com/apache/hadoop/blob/branch-2.7.0/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/Globber.java#L286
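
The reworked `strftime()` above hands path expansion to the injected JRuby `ScriptingContainer`; a rough standalone sketch of the same scriptlet, with the format string and rewind value as example inputs:

```java
import org.jruby.embed.ScriptingContainer;

// Sketch: resolve a strftime-style path the way strftime(jruby, format, rewindSeconds) does.
public class StrftimeSketch
{
    public static void main(String[] args)
    {
        ScriptingContainer jruby = new ScriptingContainer();
        int rewindSeconds = 86400;                 // look one day back
        String format = "/logs/access/%Y%m%d/*";   // example path pattern

        String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
        String resolved = jruby.runScriptlet(script).toString();
        System.out.println(resolved);              // e.g. /logs/access/20160224/*
    }
}
```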
@@ -283,80 +170,229 @@ public class HdfsFileInputPlugin

  for (FileStatus entry : entries) {
  if (entry.isDirectory()) {
- fileList.addAll(lsr(fs, entry));
+ List<Path> subEntries = listRecursive(fs, entry);
+ fileList.addAll(subEntries);
  }
  else {
- fileList.add(entry.getPath().toString());
+ fileList.add(entry.getPath());
  }
  }

  return fileList;
  }

- private List<String> lsr(final FileSystem fs, FileStatus status)
- throws IOException
+ private List<Path> listRecursive(final FileSystem fs, FileStatus status)
  {
- List<String> fileList = new ArrayList<>();
+ List<Path> fileList = Lists.newArrayList();
  if (status.isDirectory()) {
- for (FileStatus entry : fs.listStatus(status.getPath())) {
- fileList.addAll(lsr(fs, entry));
+ FileStatus[] entries;
+ try {
+ entries = fs.listStatus(status.getPath());
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+
+ for (FileStatus entry : entries) {
+ fileList.addAll(listRecursive(fs, entry));
  }
  }
  else {
- fileList.add(status.getPath().toString());
+ fileList.add(status.getPath());
  }
  return fileList;
  }

- private List<HdfsPartialFile> allocateHdfsFilesToTasks(final PluginTask task, final FileSystem fs, final List<String> fileList)
- throws IOException
+ private PartialFileList buildPartialFileList(PluginTask task, List<Path> pathList)
  {
- List<Path> pathList = Lists.transform(fileList, new Function<String, Path>()
- {
- @Nullable
- @Override
- public Path apply(@Nullable String input)
- {
- return new Path(input);
- }
- });
+ Configuration configuration = getConfiguration(task);
+ FileSystem fs = getFS(configuration);
+ boolean shouldPartition = task.getPartition();
+ boolean shouldDecompress = task.getDecompression();

+ Map<Path, Long> pathLengthMap = Maps.newHashMap();
  long totalFileLength = 0;
  for (Path path : pathList) {
- totalFileLength += fs.getFileStatus(path).getLen();
+ long fileLength = getHdfsFileLength(fs, path, shouldDecompress);
+
+ if (fileLength <= 0) {
+ logger.info("Skip the 0 byte target file: {}", path);
+ continue;
+ }
+
+ pathLengthMap.put(path, fileLength);
+ totalFileLength += fileLength;
  }
+ if (totalFileLength <= 0) {
+ throw Throwables.propagate(new PathIOException(task.getPath(), "All files are empty"));
+ }
+
+ PartialFileList.Builder builder = new PartialFileList.Builder(task);

  // TODO: optimum allocation of resources
- long approximateNumPartitions =
- (task.getApproximateNumPartitions() <= 0) ? Runtime.getRuntime().availableProcessors() : task.getApproximateNumPartitions();
+ final long approximateNumPartitions;
+ if (task.getApproximateNumPartitions() <= 0) {
+ approximateNumPartitions = Runtime.getRuntime().availableProcessors();
+ }
+ else {
+ approximateNumPartitions = task.getApproximateNumPartitions();
+ }
+
  long partitionSizeByOneTask = totalFileLength / approximateNumPartitions;
  if (partitionSizeByOneTask <= 0) {
  partitionSizeByOneTask = 1;
  }

- List<HdfsPartialFile> hdfsPartialFiles = new ArrayList<>();
- for (Path path : pathList) {
- long fileLength = fs.getFileStatus(path).getLen(); // declare `fileLength` here because this is used below.
- if (fileLength <= 0) {
- logger.info("embulk-input-hdfs: Skip the 0 byte target file: {}", path);
- continue;
- }
+ for (Map.Entry<Path, Long> entry : pathLengthMap.entrySet()) {
+ Path path = entry.getKey();
+ long fileLength = entry.getValue();

  long numPartitions;
- if (path.toString().endsWith(".gz") || path.toString().endsWith(".bz2") || path.toString().endsWith(".lzo")) {
- numPartitions = 1;
+ if (shouldPartition) {
+ if (shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null) {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
+ else if (getHdfsFileCompressionCodec(fs, path) != null) { // if not null, the file is compressed.
+ numPartitions = 1;
+ }
+ else {
+ numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+ }
  }
- else if (!task.getPartition()) {
+ else {
  numPartitions = 1;
  }
- else {
- numPartitions = ((fileLength - 1) / partitionSizeByOneTask) + 1;
+
+ for (long i = 0; i < numPartitions; i++) {
+ long start = fileLength * i / numPartitions;
+ long end = fileLength * (i + 1) / numPartitions;
+ if (start < end) {
+ logger.debug("PartialFile: path {}, start: {}, end: {}", path, start, end);
+ builder.add(path.toString(), start, end, shouldDecompress && getHdfsFileCompressionCodec(fs, path) != null);
+ }
  }
+ }
+
+ return builder.build();
+ }

- HdfsFilePartitioner partitioner = new HdfsFilePartitioner(fs, path, numPartitions);
- hdfsPartialFiles.addAll(partitioner.getHdfsPartialFiles());
+ private Long getHdfsFileLength(FileSystem fs, Path path, boolean shouldDecompression)
+ {
+ CompressionCodec codec = getHdfsFileCompressionCodec(fs, path);
+ if (codec == null) {
+ try {
+ return fs.getFileStatus(path).getLen();
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+ }
+ else if (!shouldDecompression) {
+ try {
+ return fs.getFileStatus(path).getLen();
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
  }
+ else {
+ long fileLength = 0;
+ try (InputStream is = codec.createInputStream(fs.open(path))) {
+ while (is.read() > 0) {
+ fileLength++;
+ }
+ }
+ catch (IOException e) {
+ throw Throwables.propagate(e);
+ }
+ return fileLength;
+ }
+ }

- return hdfsPartialFiles;
+ private CompressionCodec getHdfsFileCompressionCodec(FileSystem fs, Path path)
+ {
+ return getHdfsFileCompressionCodec(fs.getConf(), path);
+ }
+
+ private CompressionCodec getHdfsFileCompressionCodec(Configuration configuration, Path path)
+ {
+ return new CompressionCodecFactory(configuration).getCodec(path);
+ }
+
+ @Override
+ public ConfigDiff resume(TaskSource taskSource,
+ int taskCount,
+ FileInputPlugin.Control control)
+ {
+ control.run(taskSource, taskCount);
+ ConfigDiff configDiff = Exec.newConfigDiff();
+ return configDiff;
+ }
+
+ @Override
+ public void cleanup(TaskSource taskSource,
+ int taskCount,
+ List<TaskReport> successTaskReports)
+ {
+ }
+
+ @Override
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
+ {
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
+ return new HdfsFileInput(task, taskIndex);
+ }
+
+ public class HdfsFileInput
+ extends InputStreamFileInput
+ implements TransactionalFileInput
+ {
+
+ public HdfsFileInput(PluginTask task, int taskIndex)
+ {
+ super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
+ }
+
+ @Override
+ public void abort()
+ {
+ }
+
+ @Override
+ public TaskReport commit()
+ {
+ return Exec.newTaskReport();
+ }
+ }
+
+ // TODO create single-file InputStreamFileInput utility
+ private class SingleFileProvider
+ implements InputStreamFileInput.Provider
+ {
+ private final FileSystem fs;
+ private final int numHeaderLines;
+ private final Iterator<PartialFile> iterator;
+
+ public SingleFileProvider(PluginTask task, int taskIndex)
+ {
+ this.fs = getFS(getConfiguration(task));
+ this.numHeaderLines = task.getSkipHeaderLines();
+ this.iterator = task.getPartialFileList().get(taskIndex).iterator();
+ }
+
+ @Override
+ public InputStream openNext() throws IOException
+ {
+ if (!iterator.hasNext()) {
+ return null;
+ }
+ PartialFileInputStreamBuilder builder = new PartialFileInputStreamBuilder(fs, iterator.next()).withHeaders(numHeaderLines);
+ return builder.build();
+ }
+
+ @Override
+ public void close()
+ {
+ }
  }
  }
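
The loop at the end of `buildPartialFileList()` above splits each file into contiguous byte ranges; a minimal sketch of that start/end arithmetic with an example length and partition count:

```java
// Sketch of the start/end computation used when registering partial files above.
public class PartialRangeSketch
{
    public static void main(String[] args)
    {
        long fileLength = 1000;     // example file size in bytes
        long numPartitions = 3;     // example partition count for this file

        for (long i = 0; i < numPartitions; i++) {
            long start = fileLength * i / numPartitions;
            long end = fileLength * (i + 1) / numPartitions;
            if (start < end) {
                // prints 0-333, 333-666, 666-1000: contiguous, non-overlapping ranges
                System.out.println("start: " + start + ", end: " + end);
            }
        }
    }
}
```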