embulk-input-filesplit 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +41 -40
  3. data/build.gradle +64 -64
  4. data/classpath/embulk-input-filesplit-0.1.4.jar +0 -0
  5. data/lib/embulk/input/filesplit.rb +3 -3
  6. data/src/main/java/org/embulk/input/filesplit/LocalFileSplitInputPlugin.java +300 -187
  7. data/src/test/java/org/embulk/input/filesplit/EmbulkPluginTester.java +70 -70
  8. data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputPluginTest.java +129 -94
  9. data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputTest.java +78 -78
  10. data/src/test/java/org/embulk/input/filesplit/PartialFileInputStreamTest.java +570 -570
  11. data/src/test/resources/data/sub1/test1.csv +1 -0
  12. data/src/test/resources/data/sub1/test2.csv +3 -0
  13. data/src/test/resources/data/sub2/test1.csv +1 -0
  14. data/src/test/resources/data/sub2/test2.csv +3 -0
  15. data/src/test/resources/data/sub2/x.csv +1 -0
  16. data/src/test/resources/data/test-header.csv +5 -5
  17. data/src/test/resources/data/test-semicolon.csv +4 -4
  18. data/src/test/resources/data/test.csv +4 -4
  19. data/src/test/resources/yml/test-error1.yml +22 -0
  20. data/src/test/resources/yml/test-error2.yml +24 -0
  21. data/src/test/resources/yml/test-header.yml +24 -24
  22. data/src/test/resources/yml/test-only-header.yml +24 -24
  23. data/src/test/resources/yml/test-path_prefix-directory.yml +23 -0
  24. data/src/test/resources/yml/test-path_prefix-files.yml +23 -0
  25. data/src/test/resources/yml/test-tasks.yml +23 -23
  26. data/src/test/resources/yml/test.yml +22 -22
  27. metadata +15 -6
  28. data/classpath/embulk-input-filesplit-0.1.3.jar +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a6c43f390ee5dd1de19e4481dd2443dc3d3d63c0
4
- data.tar.gz: 7fb23bbe679ffa5fc730e3f553a1c14c2633f381
3
+ metadata.gz: 6c850998d274e9c10441daaac5ec8f25fb06ed7f
4
+ data.tar.gz: 6ede3a6e8f48a3ac8c4b3568605733cb7bc4b856
5
5
  SHA512:
6
- metadata.gz: a55da85aa17a112889765c8ec8281e1816c40e00a849032e72ca4618d85c1dda1c759cf178a5bffc5b1f3bd73f918c9eb2e3461f825eedb30b5010810273e833
7
- data.tar.gz: eb261354f866e3e4833815fd19c9634a380283dbadf9a47dcb8f12eb6135f502ee0c7d8e13f10c973b91f06fddb77e479f243bf8520e9b2f2338c14b388f35ce
6
+ metadata.gz: 7bbef333a3b9cacffc614fcb2b3b926d26f0ffde4e1f956568c072c1ac0b1a66db5c04e472b5ad8ed85dee81c20d68e080723feacad0823a2b3bd47536071d40
7
+ data.tar.gz: ffe27227b3469ad54c91a2e51bc3acb7a1523a8ac09921833d59d5b983ab2759a4b3151a5b1a7414a666a27da224d3cf0527d029e5787031b8dceb07b86d1ebb
data/README.md CHANGED
@@ -1,40 +1,41 @@
1
- # Splitting input file plugin for Embulk
2
-
3
- This Embulk plugin splits and inputs a text file.
4
- By splitting a file, input tasks will be executed in multithreads and the performance will be improved.
5
-
6
- Lines of the text file should be separated by CR or LF or CRLF.
7
- The plugin searches line separators and splits a file properly.
8
-
9
- ## Overview
10
-
11
- * **Plugin type**: input
12
-
13
- ## Configuration
14
-
15
- - **path**: the path of a text file (string, required)
16
- - **header_line**: whether the first line is a header or not (boolean, default: false)
17
- - **tasks**: number of tasks (integer, default: number of available processors * 2)
18
-
19
- ### Example
20
-
21
- ```yaml
22
- in:
23
- type: filesplit
24
- path: '/data/address.csv'
25
- header_line: true
26
- tasks: 4
27
- parser:
28
- charset: UTF-8
29
- newline: CRLF
30
- type: csv
31
- header_line: true
32
- delimiter: ','
33
- ...
34
- ```
35
-
36
- ### Build
37
-
38
- ```
39
- $ ./gradle gem
40
- ```
1
+ # Splitting input file plugin for Embulk
2
+
3
+ This Embulk plugin splits and inputs a text file.
4
+ By splitting a file, input tasks are executed in multiple threads, which improves performance.
5
+
6
+ Lines of the text file should be separated by CR or LF or CRLF.
7
+ The plugin searches for line separators and splits the file properly.
8
+
9
+ ## Overview
10
+
11
+ * **Plugin type**: input
12
+
13
+ ## Configuration
14
+
15
+ - **path**: the path of a text file (string, either this or path_prefix is required)
16
+ - **path_prefix**: the path prefix of text files (string, either this or path is required)
17
+ - **header_line**: whether the first line is a header or not (boolean, default: false)
18
+ - **tasks**: number of tasks (integer, default: number of available processors * 2)
19
+
20
+ ### Example
21
+
22
+ ```yaml
23
+ in:
24
+ type: filesplit
25
+ path: '/data/address.csv'
26
+ header_line: true
27
+ tasks: 4
28
+ parser:
29
+ charset: UTF-8
30
+ newline: CRLF
31
+ type: csv
32
+ header_line: true
33
+ delimiter: ','
34
+ ...
35
+ ```
36
+
37
+ ### Build
38
+
39
+ ```
40
+ $ ./gradle gem
41
+ ```
@@ -1,64 +1,64 @@
1
- plugins {
2
- id 'com.jfrog.bintray' version '1.1'
3
- id 'com.github.jruby-gradle.base' version '0.1.5'
4
- id 'java'
5
- }
6
- import com.github.jrubygradle.JRubyExec
7
-
8
- apply plugin: 'java'
9
- apply plugin: 'com.github.jruby-gradle.base'
10
-
11
- [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
12
-
13
- project.version = '0.1.3'
14
-
15
- repositories {
16
- mavenCentral()
17
- jcenter()
18
- }
19
-
20
- configurations {
21
- provided
22
- }
23
-
24
- dependencies {
25
- compile 'org.embulk:embulk-core:0.7.4'
26
- provided 'org.embulk:embulk-core:0.7.4'
27
- testCompile 'org.embulk:embulk-standards:0.7.4'
28
- testCompile 'junit:junit:4.+'
29
- }
30
-
31
- task classpath(type: Copy, dependsOn: ["jar"]) {
32
- doFirst { file('classpath').deleteDir() }
33
- from (configurations.runtime - configurations.provided + files(jar.archivePath))
34
- into 'classpath'
35
- }
36
- clean { delete 'classpath' }
37
-
38
- task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
39
- jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
40
- script "build/gemspec"
41
- doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
42
- }
43
-
44
- task gemspec << {
45
- file('build').mkdirs();
46
- file('build/gemspec').write($/
47
- Gem::Specification.new do |spec|
48
- spec.name = "${project.name}"
49
- spec.version = "${project.version}"
50
- spec.authors = ["Hitoshi Tanaka"]
51
- spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
52
- spec.summary = "Embulk plugin for splitting input file"
53
- spec.licenses = ["Apache 2.0"]
54
- spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
55
- spec.test_files = spec.files.grep(%r"^(test|spec)/")
56
- spec.require_paths = ["lib"]
57
- end
58
- /$)
59
- }
60
-
61
-
62
- task gempush << {
63
- "gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
64
- }
1
+ plugins {
2
+ id 'com.jfrog.bintray' version '1.6'
3
+ id 'com.github.jruby-gradle.base' version '1.2.1'
4
+ id 'java'
5
+ }
6
+ import com.github.jrubygradle.JRubyExec
7
+
8
+ apply plugin: 'java'
9
+ apply plugin: 'com.github.jruby-gradle.base'
10
+
11
+ [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
12
+
13
+ project.version = '0.1.4'
14
+
15
+ repositories {
16
+ mavenCentral()
17
+ jcenter()
18
+ }
19
+
20
+ configurations {
21
+ provided
22
+ }
23
+
24
+ dependencies {
25
+ compile 'org.embulk:embulk-core:0.7.4'
26
+ provided 'org.embulk:embulk-core:0.7.4'
27
+ testCompile 'org.embulk:embulk-standards:0.7.4'
28
+ testCompile 'junit:junit:4.+'
29
+ }
30
+
31
+ task classpath(type: Copy, dependsOn: ["jar"]) {
32
+ doFirst { file('classpath').deleteDir() }
33
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
34
+ into 'classpath'
35
+ }
36
+ clean { delete 'classpath' }
37
+
38
+ task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
39
+ jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
40
+ scriptArgs "build/gemspec"
41
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
42
+ }
43
+
44
+ task gemspec << {
45
+ file('build').mkdirs();
46
+ file('build/gemspec').write($/
47
+ Gem::Specification.new do |spec|
48
+ spec.name = "${project.name}"
49
+ spec.version = "${project.version}"
50
+ spec.authors = ["Hitoshi Tanaka"]
51
+ spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
52
+ spec.summary = "Embulk plugin for splitting input file"
53
+ spec.licenses = ["Apache 2.0"]
54
+ spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
55
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
56
+ spec.require_paths = ["lib"]
57
+ end
58
+ /$)
59
+ }
60
+
61
+
62
+ task gempush << {
63
+ "gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
64
+ }
@@ -1,3 +1,3 @@
1
- Embulk::JavaPlugin.register_input(
2
- :filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
3
- File.expand_path('../../../../classpath', __FILE__))
1
+ Embulk::JavaPlugin.register_input(
2
+ :filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -1,187 +1,300 @@
1
- package org.embulk.input.filesplit;
2
-
3
- import java.io.BufferedInputStream;
4
- import java.io.ByteArrayInputStream;
5
- import java.io.ByteArrayOutputStream;
6
- import java.io.File;
7
- import java.io.FileInputStream;
8
- import java.io.IOException;
9
- import java.io.InputStream;
10
- import java.io.SequenceInputStream;
11
- import java.util.ArrayList;
12
- import java.util.List;
13
-
14
- import org.embulk.config.Config;
15
- import org.embulk.config.ConfigDefault;
16
- import org.embulk.config.ConfigDiff;
17
- import org.embulk.config.ConfigInject;
18
- import org.embulk.config.ConfigSource;
19
- import org.embulk.config.Task;
20
- import org.embulk.config.TaskReport;
21
- import org.embulk.config.TaskSource;
22
- import org.embulk.spi.BufferAllocator;
23
- import org.embulk.spi.Exec;
24
- import org.embulk.spi.FileInputPlugin;
25
- import org.embulk.spi.TransactionalFileInput;
26
- import org.embulk.spi.util.InputStreamFileInput;
27
-
28
- import com.google.common.base.Optional;
29
-
30
-
31
- public class LocalFileSplitInputPlugin
32
- implements FileInputPlugin
33
- {
34
- public interface PluginTask
35
- extends Task
36
- {
37
- @Config("path")
38
- public String getPath();
39
-
40
- @Config("tasks")
41
- @ConfigDefault("null")
42
- public Optional<Integer> getTasks();
43
-
44
- @Config("header_line")
45
- @ConfigDefault("false")
46
- public boolean getHeaderLine();
47
-
48
- public List<PartialFile> getFiles();
49
- public void setFiles(List<PartialFile> files);
50
-
51
- @ConfigInject
52
- public BufferAllocator getBufferAllocator();
53
- }
54
-
55
- @Override
56
- public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
57
- {
58
- PluginTask task = config.loadConfig(PluginTask.class);
59
-
60
- int tasks;
61
- if (task.getTasks().isPresent()) {
62
- tasks = task.getTasks().get();
63
- if (tasks <= 0) {
64
- throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
65
- }
66
- } else {
67
- tasks = Runtime.getRuntime().availableProcessors() * 2;
68
- }
69
-
70
- long size = new File(task.getPath()).length();
71
- List<PartialFile> files = new ArrayList<PartialFile>();
72
- for (int i = 0; i < tasks; i++) {
73
- long start = size * i / tasks;
74
- long end = size * (i + 1) / tasks;
75
- if (start < end) {
76
- files.add(new PartialFile(task.getPath(), start, end));
77
- }
78
- }
79
-
80
- task.setFiles(files);
81
-
82
- return resume(task.dump(), task.getFiles().size(), control);
83
- }
84
-
85
- @Override
86
- public ConfigDiff resume(TaskSource taskSource,
87
- int taskCount,
88
- FileInputPlugin.Control control)
89
- {
90
- control.run(taskSource, taskCount);
91
-
92
- return Exec.newConfigDiff();
93
- }
94
-
95
- @Override
96
- public void cleanup(TaskSource taskSource,
97
- int taskCount,
98
- List<TaskReport> successTaskReports)
99
- { }
100
-
101
- @Override
102
- public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
103
- {
104
- PluginTask task = taskSource.loadTask(PluginTask.class);
105
- return new LocalFileSplitInput(task, taskIndex);
106
- }
107
-
108
- public static class LocalFileSplitInput
109
- extends InputStreamFileInput
110
- implements TransactionalFileInput
111
- {
112
- public static class FileSplitProvider
113
- implements InputStreamFileInput.Provider
114
- {
115
- private final PartialFile file;
116
- private final boolean hasHeader;
117
- private boolean opened = false;
118
-
119
- public FileSplitProvider(PartialFile file, boolean hasHeader)
120
- {
121
- this.file = file;
122
- this.hasHeader = hasHeader;
123
- }
124
-
125
- @Override
126
- public InputStream openNext() throws IOException
127
- {
128
- if (opened) {
129
- return null;
130
- }
131
- opened = true;
132
-
133
- InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
134
- if (file.getStart() > 0 && hasHeader) {
135
- in = new SequenceInputStream(openHeader(file.getPath()), in);
136
- }
137
- return in;
138
- }
139
-
140
- @Override
141
- public void close() { }
142
-
143
- private InputStream openHeader(String path) throws IOException
144
- {
145
- ByteArrayOutputStream header = new ByteArrayOutputStream();
146
- try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
147
- while (true) {
148
- int c = in.read();
149
- if (c < 0) {
150
- break;
151
- }
152
-
153
- header.write(c);
154
-
155
- if (c == '\n') {
156
- break;
157
- }
158
-
159
- if (c == '\r') {
160
- int c2 = in.read();
161
- if (c2 == '\n') {
162
- header.write(c2);
163
- }
164
- break;
165
- }
166
- }
167
- }
168
- header.close();
169
- return new ByteArrayInputStream(header.toByteArray());
170
- }
171
- }
172
-
173
- public LocalFileSplitInput(PluginTask task, int taskIndex)
174
- {
175
- super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
176
- }
177
-
178
- @Override
179
- public void abort() { }
180
-
181
- @Override
182
- public TaskReport commit()
183
- {
184
- return Exec.newTaskReport();
185
- }
186
- }
187
- }
1
+ package org.embulk.input.filesplit;
2
+
3
+ import java.io.BufferedInputStream;
4
+ import java.io.ByteArrayInputStream;
5
+ import java.io.ByteArrayOutputStream;
6
+ import java.io.File;
7
+ import java.io.FileInputStream;
8
+ import java.io.IOException;
9
+ import java.io.InputStream;
10
+ import java.io.SequenceInputStream;
11
+ import java.nio.file.Files;
12
+ import java.nio.file.FileVisitOption;
13
+ import java.nio.file.Path;
14
+ import java.nio.file.Paths;
15
+ import java.nio.file.SimpleFileVisitor;
16
+ import java.nio.file.FileVisitResult;
17
+ import java.nio.file.attribute.BasicFileAttributes;
18
+ import java.util.ArrayList;
19
+ import java.util.EnumSet;
20
+ import java.util.List;
21
+ import java.util.Set;
22
+ import com.google.common.collect.ImmutableList;
23
+
24
+ import org.embulk.config.Config;
25
+ import org.embulk.config.ConfigDefault;
26
+ import org.embulk.config.ConfigDiff;
27
+ import org.embulk.config.ConfigInject;
28
+ import org.embulk.config.ConfigSource;
29
+ import org.embulk.config.Task;
30
+ import org.embulk.config.TaskReport;
31
+ import org.embulk.config.TaskSource;
32
+ import org.embulk.spi.BufferAllocator;
33
+ import org.embulk.spi.Exec;
34
+ import org.embulk.spi.FileInputPlugin;
35
+ import org.embulk.spi.TransactionalFileInput;
36
+ import org.embulk.spi.util.InputStreamFileInput;
37
+
38
+ import com.google.common.base.Optional;
39
+
40
+ public class LocalFileSplitInputPlugin
41
+ implements FileInputPlugin
42
+ {
43
+
44
+ private final static Path CURRENT_DIR = Paths.get(".").normalize();
45
+
46
+ public interface PluginTask
47
+ extends Task
48
+ {
49
+ @Config("path")
50
+ @ConfigDefault("null")
51
+ public Optional<String> getPath();
52
+
53
+ @Config("path_prefix")
54
+ @ConfigDefault("null")
55
+ public Optional<String> getPathPrefix();
56
+
57
+ @Config("tasks")
58
+ @ConfigDefault("null")
59
+ public Optional<Integer> getTasks();
60
+
61
+ @Config("header_line")
62
+ @ConfigDefault("false")
63
+ public boolean getHeaderLine();
64
+
65
+ public List<PartialFile> getFiles();
66
+ public void setFiles(List<PartialFile> files);
67
+
68
+ @ConfigInject
69
+ public BufferAllocator getBufferAllocator();
70
+ }
71
+
72
+ @Override
73
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
74
+ {
75
+ PluginTask task = config.loadConfig(PluginTask.class);
76
+
77
+ int tasks;
78
+ if (task.getTasks().isPresent()) {
79
+ tasks = task.getTasks().get();
80
+ if (tasks <= 0) {
81
+ throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
82
+ }
83
+ } else {
84
+ tasks = Runtime.getRuntime().availableProcessors() * 2;
85
+ }
86
+
87
+ List<String> paths = new ArrayList<String>();
88
+ if (task.getPath().isPresent()) {
89
+ if (task.getPathPrefix().isPresent()) {
90
+ throw new IllegalArgumentException("Cannot specify both 'path' and 'path_prefix'");
91
+ }
92
+ paths.add(task.getPath().get());
93
+ } else if (task.getPathPrefix().isPresent()) {
94
+ paths.addAll(listFiles(task.getPathPrefix().get()));
95
+ } else {
96
+ throw new IllegalArgumentException("Specify either 'path' or 'path_prefix'");
97
+ }
98
+
99
+ List<PartialFile> files = new ArrayList<PartialFile>();
100
+ for (String path : paths) {
101
+ long size = new File(path).length();
102
+ for (int i = 0; i < tasks; i++) {
103
+ long start = size * i / tasks;
104
+ long end = size * (i + 1) / tasks;
105
+ if (start < end) {
106
+ files.add(new PartialFile(path, start, end));
107
+ }
108
+ }
109
+ }
110
+
111
+ task.setFiles(files);
112
+
113
+ return resume(task.dump(), task.getFiles().size(), control);
114
+ }
115
+
116
+ @Override
117
+ public ConfigDiff resume(TaskSource taskSource,
118
+ int taskCount,
119
+ FileInputPlugin.Control control)
120
+ {
121
+ control.run(taskSource, taskCount);
122
+
123
+ return Exec.newConfigDiff();
124
+ }
125
+
126
+ @Override
127
+ public void cleanup(TaskSource taskSource,
128
+ int taskCount,
129
+ List<TaskReport> successTaskReports)
130
+ { }
131
+
132
+ @Override
133
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
134
+ {
135
+ PluginTask task = taskSource.loadTask(PluginTask.class);
136
+ return new LocalFileSplitInput(task, taskIndex);
137
+ }
138
+
139
+ /**
140
+ * Most of this implementation is based on LocalFileInputPlugin hosted at:
141
+ * https://github.com/embulk/embulk
142
+ *
143
+ */
144
+ private List<String> listFiles(String prefix)
145
+ {
146
+ final Path pathPrefix = Paths.get(prefix).normalize();
147
+ final Path directory;
148
+ final String fileNamePrefix;
149
+ if (Files.isDirectory(pathPrefix)) {
150
+ directory = pathPrefix;
151
+ fileNamePrefix = "";
152
+ } else {
153
+ fileNamePrefix = pathPrefix.getFileName().toString();
154
+ Path d = pathPrefix.getParent();
155
+ directory = (d == null ? CURRENT_DIR : d);
156
+ }
157
+
158
+ final ImmutableList.Builder<String> builder = ImmutableList.builder();
159
+ try {
160
+ int maxDepth = Integer.MAX_VALUE;
161
+ Set<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);
162
+
163
+ Files.walkFileTree(directory, opts, maxDepth, new SimpleFileVisitor<Path>() {
164
+ @Override
165
+ public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs)
166
+ {
167
+ if (path.equals(directory)) {
168
+ return FileVisitResult.CONTINUE;
169
+ } else {
170
+ Path parent = path.getParent();
171
+ if (parent == null) {
172
+ parent = CURRENT_DIR;
173
+ }
174
+ if (parent.equals(directory)) {
175
+ if (path.getFileName().toString().startsWith(fileNamePrefix)) {
176
+ return FileVisitResult.CONTINUE;
177
+ } else {
178
+ return FileVisitResult.SKIP_SUBTREE;
179
+ }
180
+ } else {
181
+ return FileVisitResult.CONTINUE;
182
+ }
183
+ }
184
+ }
185
+
186
+ @Override
187
+ public FileVisitResult visitFile(Path path, BasicFileAttributes attrs)
188
+ {
189
+ try {
190
+ // Avoid directories from listing.
191
+ // Directories are normally unvisited with |FileVisitor#visitFile|, but symbolic links to
192
+ // directories are visited like files unless |FOLLOW_LINKS| is set in |Files#walkFileTree|.
193
+ // Symbolic links to directories are explicitly skipped here by checking with |Path#toReadlPath|.
194
+ if (Files.isDirectory(path.toRealPath())) {
195
+ return FileVisitResult.CONTINUE;
196
+ }
197
+ } catch (IOException ex){
198
+ throw new RuntimeException("Can't resolve symbolic link", ex);
199
+ }
200
+ Path parent = path.getParent();
201
+ if (parent == null) {
202
+ parent = CURRENT_DIR;
203
+ }
204
+ if (parent.equals(directory)) {
205
+ if (path.getFileName().toString().startsWith(fileNamePrefix)) {
206
+ builder.add(path.toString());
207
+ return FileVisitResult.CONTINUE;
208
+ }
209
+ } else {
210
+ builder.add(path.toString());
211
+ }
212
+ return FileVisitResult.CONTINUE;
213
+ }
214
+ });
215
+ } catch (IOException ex) {
216
+ throw new RuntimeException(String.format("Failed get a list of local files at '%s'", directory), ex);
217
+ }
218
+ return builder.build();
219
+ }
220
+
221
+ public static class LocalFileSplitInput
222
+ extends InputStreamFileInput
223
+ implements TransactionalFileInput
224
+ {
225
+ public static class FileSplitProvider
226
+ implements InputStreamFileInput.Provider
227
+ {
228
+ private final PartialFile file;
229
+ private final boolean hasHeader;
230
+ private boolean opened = false;
231
+
232
+ public FileSplitProvider(PartialFile file, boolean hasHeader)
233
+ {
234
+ this.file = file;
235
+ this.hasHeader = hasHeader;
236
+ }
237
+
238
+ @Override
239
+ public InputStream openNext() throws IOException
240
+ {
241
+ if (opened) {
242
+ return null;
243
+ }
244
+ opened = true;
245
+
246
+ InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
247
+ if (file.getStart() > 0 && hasHeader) {
248
+ in = new SequenceInputStream(openHeader(file.getPath()), in);
249
+ }
250
+ return in;
251
+ }
252
+
253
+ @Override
254
+ public void close() { }
255
+
256
+ private InputStream openHeader(String path) throws IOException
257
+ {
258
+ ByteArrayOutputStream header = new ByteArrayOutputStream();
259
+ try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
260
+ while (true) {
261
+ int c = in.read();
262
+ if (c < 0) {
263
+ break;
264
+ }
265
+
266
+ header.write(c);
267
+
268
+ if (c == '\n') {
269
+ break;
270
+ }
271
+
272
+ if (c == '\r') {
273
+ int c2 = in.read();
274
+ if (c2 == '\n') {
275
+ header.write(c2);
276
+ }
277
+ break;
278
+ }
279
+ }
280
+ }
281
+ header.close();
282
+ return new ByteArrayInputStream(header.toByteArray());
283
+ }
284
+ }
285
+
286
+ public LocalFileSplitInput(PluginTask task, int taskIndex)
287
+ {
288
+ super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
289
+ }
290
+
291
+ @Override
292
+ public void abort() { }
293
+
294
+ @Override
295
+ public TaskReport commit()
296
+ {
297
+ return Exec.newTaskReport();
298
+ }
299
+ }
300
+ }