embulk-input-filesplit 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +41 -40
  3. data/build.gradle +64 -64
  4. data/classpath/embulk-input-filesplit-0.1.4.jar +0 -0
  5. data/lib/embulk/input/filesplit.rb +3 -3
  6. data/src/main/java/org/embulk/input/filesplit/LocalFileSplitInputPlugin.java +300 -187
  7. data/src/test/java/org/embulk/input/filesplit/EmbulkPluginTester.java +70 -70
  8. data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputPluginTest.java +129 -94
  9. data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputTest.java +78 -78
  10. data/src/test/java/org/embulk/input/filesplit/PartialFileInputStreamTest.java +570 -570
  11. data/src/test/resources/data/sub1/test1.csv +1 -0
  12. data/src/test/resources/data/sub1/test2.csv +3 -0
  13. data/src/test/resources/data/sub2/test1.csv +1 -0
  14. data/src/test/resources/data/sub2/test2.csv +3 -0
  15. data/src/test/resources/data/sub2/x.csv +1 -0
  16. data/src/test/resources/data/test-header.csv +5 -5
  17. data/src/test/resources/data/test-semicolon.csv +4 -4
  18. data/src/test/resources/data/test.csv +4 -4
  19. data/src/test/resources/yml/test-error1.yml +22 -0
  20. data/src/test/resources/yml/test-error2.yml +24 -0
  21. data/src/test/resources/yml/test-header.yml +24 -24
  22. data/src/test/resources/yml/test-only-header.yml +24 -24
  23. data/src/test/resources/yml/test-path_prefix-directory.yml +23 -0
  24. data/src/test/resources/yml/test-path_prefix-files.yml +23 -0
  25. data/src/test/resources/yml/test-tasks.yml +23 -23
  26. data/src/test/resources/yml/test.yml +22 -22
  27. metadata +15 -6
  28. data/classpath/embulk-input-filesplit-0.1.3.jar +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a6c43f390ee5dd1de19e4481dd2443dc3d3d63c0
4
- data.tar.gz: 7fb23bbe679ffa5fc730e3f553a1c14c2633f381
3
+ metadata.gz: 6c850998d274e9c10441daaac5ec8f25fb06ed7f
4
+ data.tar.gz: 6ede3a6e8f48a3ac8c4b3568605733cb7bc4b856
5
5
  SHA512:
6
- metadata.gz: a55da85aa17a112889765c8ec8281e1816c40e00a849032e72ca4618d85c1dda1c759cf178a5bffc5b1f3bd73f918c9eb2e3461f825eedb30b5010810273e833
7
- data.tar.gz: eb261354f866e3e4833815fd19c9634a380283dbadf9a47dcb8f12eb6135f502ee0c7d8e13f10c973b91f06fddb77e479f243bf8520e9b2f2338c14b388f35ce
6
+ metadata.gz: 7bbef333a3b9cacffc614fcb2b3b926d26f0ffde4e1f956568c072c1ac0b1a66db5c04e472b5ad8ed85dee81c20d68e080723feacad0823a2b3bd47536071d40
7
+ data.tar.gz: ffe27227b3469ad54c91a2e51bc3acb7a1523a8ac09921833d59d5b983ab2759a4b3151a5b1a7414a666a27da224d3cf0527d029e5787031b8dceb07b86d1ebb
data/README.md CHANGED
@@ -1,40 +1,41 @@
1
- # Splitting input file plugin for Embulk
2
-
3
- This Embulk plugin splits and inputs a text file.
4
- By splitting a file, input tasks will be executed in multithreads and the performance will be improved.
5
-
6
- Lines of the text file should be separated by CR or LF or CRLF.
7
- The plugin searches line separators and splits a file properly.
8
-
9
- ## Overview
10
-
11
- * **Plugin type**: input
12
-
13
- ## Configuration
14
-
15
- - **path**: the path of a text file (string, required)
16
- - **header_line**: whether the first line is a header or not (boolean, default: false)
17
- - **tasks**: number of tasks (integer, default: number of available processors * 2)
18
-
19
- ### Example
20
-
21
- ```yaml
22
- in:
23
- type: filesplit
24
- path: '/data/address.csv'
25
- header_line: true
26
- tasks: 4
27
- parser:
28
- charset: UTF-8
29
- newline: CRLF
30
- type: csv
31
- header_line: true
32
- delimiter: ','
33
- ...
34
- ```
35
-
36
- ### Build
37
-
38
- ```
39
- $ ./gradle gem
40
- ```
1
+ # Splitting input file plugin for Embulk
2
+
3
+ This Embulk plugin splits and inputs a text file.
4
+ By splitting a file, input tasks will be executed in multithreads and the performance will be improved.
5
+
6
+ Lines of the text file should be separated by CR or LF or CRLF.
7
+ The plugin searches line separators and splits a file properly.
8
+
9
+ ## Overview
10
+
11
+ * **Plugin type**: input
12
+
13
+ ## Configuration
14
+
15
+ - **path**: the path of a text file (string, either this or path_prefix is required)
16
+ - **path_prefix**: the path prefix of text files (string, either this or path is required)
17
+ - **header_line**: whether the first line is a header or not (boolean, default: false)
18
+ - **tasks**: number of tasks (integer, default: number of available processors * 2)
19
+
20
+ ### Example
21
+
22
+ ```yaml
23
+ in:
24
+ type: filesplit
25
+ path: '/data/address.csv'
26
+ header_line: true
27
+ tasks: 4
28
+ parser:
29
+ charset: UTF-8
30
+ newline: CRLF
31
+ type: csv
32
+ header_line: true
33
+ delimiter: ','
34
+ ...
35
+ ```
36
+
37
+ ### Build
38
+
39
+ ```
40
+ $ ./gradle gem
41
+ ```
@@ -1,64 +1,64 @@
1
- plugins {
2
- id 'com.jfrog.bintray' version '1.1'
3
- id 'com.github.jruby-gradle.base' version '0.1.5'
4
- id 'java'
5
- }
6
- import com.github.jrubygradle.JRubyExec
7
-
8
- apply plugin: 'java'
9
- apply plugin: 'com.github.jruby-gradle.base'
10
-
11
- [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
12
-
13
- project.version = '0.1.3'
14
-
15
- repositories {
16
- mavenCentral()
17
- jcenter()
18
- }
19
-
20
- configurations {
21
- provided
22
- }
23
-
24
- dependencies {
25
- compile 'org.embulk:embulk-core:0.7.4'
26
- provided 'org.embulk:embulk-core:0.7.4'
27
- testCompile 'org.embulk:embulk-standards:0.7.4'
28
- testCompile 'junit:junit:4.+'
29
- }
30
-
31
- task classpath(type: Copy, dependsOn: ["jar"]) {
32
- doFirst { file('classpath').deleteDir() }
33
- from (configurations.runtime - configurations.provided + files(jar.archivePath))
34
- into 'classpath'
35
- }
36
- clean { delete 'classpath' }
37
-
38
- task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
39
- jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
40
- script "build/gemspec"
41
- doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
42
- }
43
-
44
- task gemspec << {
45
- file('build').mkdirs();
46
- file('build/gemspec').write($/
47
- Gem::Specification.new do |spec|
48
- spec.name = "${project.name}"
49
- spec.version = "${project.version}"
50
- spec.authors = ["Hitoshi Tanaka"]
51
- spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
52
- spec.summary = "Embulk plugin for splitting input file"
53
- spec.licenses = ["Apache 2.0"]
54
- spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
55
- spec.test_files = spec.files.grep(%r"^(test|spec)/")
56
- spec.require_paths = ["lib"]
57
- end
58
- /$)
59
- }
60
-
61
-
62
- task gempush << {
63
- "gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
64
- }
1
+ plugins {
2
+ id 'com.jfrog.bintray' version '1.6'
3
+ id 'com.github.jruby-gradle.base' version '1.2.1'
4
+ id 'java'
5
+ }
6
+ import com.github.jrubygradle.JRubyExec
7
+
8
+ apply plugin: 'java'
9
+ apply plugin: 'com.github.jruby-gradle.base'
10
+
11
+ [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
12
+
13
+ project.version = '0.1.4'
14
+
15
+ repositories {
16
+ mavenCentral()
17
+ jcenter()
18
+ }
19
+
20
+ configurations {
21
+ provided
22
+ }
23
+
24
+ dependencies {
25
+ compile 'org.embulk:embulk-core:0.7.4'
26
+ provided 'org.embulk:embulk-core:0.7.4'
27
+ testCompile 'org.embulk:embulk-standards:0.7.4'
28
+ testCompile 'junit:junit:4.+'
29
+ }
30
+
31
+ task classpath(type: Copy, dependsOn: ["jar"]) {
32
+ doFirst { file('classpath').deleteDir() }
33
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
34
+ into 'classpath'
35
+ }
36
+ clean { delete 'classpath' }
37
+
38
+ task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
39
+ jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
40
+ scriptArgs "build/gemspec"
41
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
42
+ }
43
+
44
+ task gemspec << {
45
+ file('build').mkdirs();
46
+ file('build/gemspec').write($/
47
+ Gem::Specification.new do |spec|
48
+ spec.name = "${project.name}"
49
+ spec.version = "${project.version}"
50
+ spec.authors = ["Hitoshi Tanaka"]
51
+ spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
52
+ spec.summary = "Embulk plugin for splitting input file"
53
+ spec.licenses = ["Apache 2.0"]
54
+ spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
55
+ spec.test_files = spec.files.grep(%r"^(test|spec)/")
56
+ spec.require_paths = ["lib"]
57
+ end
58
+ /$)
59
+ }
60
+
61
+
62
+ task gempush << {
63
+ "gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
64
+ }
@@ -1,3 +1,3 @@
1
- Embulk::JavaPlugin.register_input(
2
- :filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
3
- File.expand_path('../../../../classpath', __FILE__))
1
+ Embulk::JavaPlugin.register_input(
2
+ :filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -1,187 +1,300 @@
1
- package org.embulk.input.filesplit;
2
-
3
- import java.io.BufferedInputStream;
4
- import java.io.ByteArrayInputStream;
5
- import java.io.ByteArrayOutputStream;
6
- import java.io.File;
7
- import java.io.FileInputStream;
8
- import java.io.IOException;
9
- import java.io.InputStream;
10
- import java.io.SequenceInputStream;
11
- import java.util.ArrayList;
12
- import java.util.List;
13
-
14
- import org.embulk.config.Config;
15
- import org.embulk.config.ConfigDefault;
16
- import org.embulk.config.ConfigDiff;
17
- import org.embulk.config.ConfigInject;
18
- import org.embulk.config.ConfigSource;
19
- import org.embulk.config.Task;
20
- import org.embulk.config.TaskReport;
21
- import org.embulk.config.TaskSource;
22
- import org.embulk.spi.BufferAllocator;
23
- import org.embulk.spi.Exec;
24
- import org.embulk.spi.FileInputPlugin;
25
- import org.embulk.spi.TransactionalFileInput;
26
- import org.embulk.spi.util.InputStreamFileInput;
27
-
28
- import com.google.common.base.Optional;
29
-
30
-
31
- public class LocalFileSplitInputPlugin
32
- implements FileInputPlugin
33
- {
34
- public interface PluginTask
35
- extends Task
36
- {
37
- @Config("path")
38
- public String getPath();
39
-
40
- @Config("tasks")
41
- @ConfigDefault("null")
42
- public Optional<Integer> getTasks();
43
-
44
- @Config("header_line")
45
- @ConfigDefault("false")
46
- public boolean getHeaderLine();
47
-
48
- public List<PartialFile> getFiles();
49
- public void setFiles(List<PartialFile> files);
50
-
51
- @ConfigInject
52
- public BufferAllocator getBufferAllocator();
53
- }
54
-
55
- @Override
56
- public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
57
- {
58
- PluginTask task = config.loadConfig(PluginTask.class);
59
-
60
- int tasks;
61
- if (task.getTasks().isPresent()) {
62
- tasks = task.getTasks().get();
63
- if (tasks <= 0) {
64
- throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
65
- }
66
- } else {
67
- tasks = Runtime.getRuntime().availableProcessors() * 2;
68
- }
69
-
70
- long size = new File(task.getPath()).length();
71
- List<PartialFile> files = new ArrayList<PartialFile>();
72
- for (int i = 0; i < tasks; i++) {
73
- long start = size * i / tasks;
74
- long end = size * (i + 1) / tasks;
75
- if (start < end) {
76
- files.add(new PartialFile(task.getPath(), start, end));
77
- }
78
- }
79
-
80
- task.setFiles(files);
81
-
82
- return resume(task.dump(), task.getFiles().size(), control);
83
- }
84
-
85
- @Override
86
- public ConfigDiff resume(TaskSource taskSource,
87
- int taskCount,
88
- FileInputPlugin.Control control)
89
- {
90
- control.run(taskSource, taskCount);
91
-
92
- return Exec.newConfigDiff();
93
- }
94
-
95
- @Override
96
- public void cleanup(TaskSource taskSource,
97
- int taskCount,
98
- List<TaskReport> successTaskReports)
99
- { }
100
-
101
- @Override
102
- public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
103
- {
104
- PluginTask task = taskSource.loadTask(PluginTask.class);
105
- return new LocalFileSplitInput(task, taskIndex);
106
- }
107
-
108
- public static class LocalFileSplitInput
109
- extends InputStreamFileInput
110
- implements TransactionalFileInput
111
- {
112
- public static class FileSplitProvider
113
- implements InputStreamFileInput.Provider
114
- {
115
- private final PartialFile file;
116
- private final boolean hasHeader;
117
- private boolean opened = false;
118
-
119
- public FileSplitProvider(PartialFile file, boolean hasHeader)
120
- {
121
- this.file = file;
122
- this.hasHeader = hasHeader;
123
- }
124
-
125
- @Override
126
- public InputStream openNext() throws IOException
127
- {
128
- if (opened) {
129
- return null;
130
- }
131
- opened = true;
132
-
133
- InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
134
- if (file.getStart() > 0 && hasHeader) {
135
- in = new SequenceInputStream(openHeader(file.getPath()), in);
136
- }
137
- return in;
138
- }
139
-
140
- @Override
141
- public void close() { }
142
-
143
- private InputStream openHeader(String path) throws IOException
144
- {
145
- ByteArrayOutputStream header = new ByteArrayOutputStream();
146
- try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
147
- while (true) {
148
- int c = in.read();
149
- if (c < 0) {
150
- break;
151
- }
152
-
153
- header.write(c);
154
-
155
- if (c == '\n') {
156
- break;
157
- }
158
-
159
- if (c == '\r') {
160
- int c2 = in.read();
161
- if (c2 == '\n') {
162
- header.write(c2);
163
- }
164
- break;
165
- }
166
- }
167
- }
168
- header.close();
169
- return new ByteArrayInputStream(header.toByteArray());
170
- }
171
- }
172
-
173
- public LocalFileSplitInput(PluginTask task, int taskIndex)
174
- {
175
- super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
176
- }
177
-
178
- @Override
179
- public void abort() { }
180
-
181
- @Override
182
- public TaskReport commit()
183
- {
184
- return Exec.newTaskReport();
185
- }
186
- }
187
- }
1
+ package org.embulk.input.filesplit;
2
+
3
+ import java.io.BufferedInputStream;
4
+ import java.io.ByteArrayInputStream;
5
+ import java.io.ByteArrayOutputStream;
6
+ import java.io.File;
7
+ import java.io.FileInputStream;
8
+ import java.io.IOException;
9
+ import java.io.InputStream;
10
+ import java.io.SequenceInputStream;
11
+ import java.nio.file.Files;
12
+ import java.nio.file.FileVisitOption;
13
+ import java.nio.file.Path;
14
+ import java.nio.file.Paths;
15
+ import java.nio.file.SimpleFileVisitor;
16
+ import java.nio.file.FileVisitResult;
17
+ import java.nio.file.attribute.BasicFileAttributes;
18
+ import java.util.ArrayList;
19
+ import java.util.EnumSet;
20
+ import java.util.List;
21
+ import java.util.Set;
22
+ import com.google.common.collect.ImmutableList;
23
+
24
+ import org.embulk.config.Config;
25
+ import org.embulk.config.ConfigDefault;
26
+ import org.embulk.config.ConfigDiff;
27
+ import org.embulk.config.ConfigInject;
28
+ import org.embulk.config.ConfigSource;
29
+ import org.embulk.config.Task;
30
+ import org.embulk.config.TaskReport;
31
+ import org.embulk.config.TaskSource;
32
+ import org.embulk.spi.BufferAllocator;
33
+ import org.embulk.spi.Exec;
34
+ import org.embulk.spi.FileInputPlugin;
35
+ import org.embulk.spi.TransactionalFileInput;
36
+ import org.embulk.spi.util.InputStreamFileInput;
37
+
38
+ import com.google.common.base.Optional;
39
+
40
+ public class LocalFileSplitInputPlugin
41
+ implements FileInputPlugin
42
+ {
43
+
44
+ private final static Path CURRENT_DIR = Paths.get(".").normalize();
45
+
46
+ public interface PluginTask
47
+ extends Task
48
+ {
49
+ @Config("path")
50
+ @ConfigDefault("null")
51
+ public Optional<String> getPath();
52
+
53
+ @Config("path_prefix")
54
+ @ConfigDefault("null")
55
+ public Optional<String> getPathPrefix();
56
+
57
+ @Config("tasks")
58
+ @ConfigDefault("null")
59
+ public Optional<Integer> getTasks();
60
+
61
+ @Config("header_line")
62
+ @ConfigDefault("false")
63
+ public boolean getHeaderLine();
64
+
65
+ public List<PartialFile> getFiles();
66
+ public void setFiles(List<PartialFile> files);
67
+
68
+ @ConfigInject
69
+ public BufferAllocator getBufferAllocator();
70
+ }
71
+
72
+ @Override
73
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
74
+ {
75
+ PluginTask task = config.loadConfig(PluginTask.class);
76
+
77
+ int tasks;
78
+ if (task.getTasks().isPresent()) {
79
+ tasks = task.getTasks().get();
80
+ if (tasks <= 0) {
81
+ throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
82
+ }
83
+ } else {
84
+ tasks = Runtime.getRuntime().availableProcessors() * 2;
85
+ }
86
+
87
+ List<String> paths = new ArrayList<String>();
88
+ if (task.getPath().isPresent()) {
89
+ if (task.getPathPrefix().isPresent()) {
90
+ throw new IllegalArgumentException("Cannot specify both 'path' and 'path_prefix'");
91
+ }
92
+ paths.add(task.getPath().get());
93
+ } else if (task.getPathPrefix().isPresent()) {
94
+ paths.addAll(listFiles(task.getPathPrefix().get()));
95
+ } else {
96
+ throw new IllegalArgumentException("Specify either 'path' or 'path_prefix'");
97
+ }
98
+
99
+ List<PartialFile> files = new ArrayList<PartialFile>();
100
+ for (String path : paths) {
101
+ long size = new File(path).length();
102
+ for (int i = 0; i < tasks; i++) {
103
+ long start = size * i / tasks;
104
+ long end = size * (i + 1) / tasks;
105
+ if (start < end) {
106
+ files.add(new PartialFile(path, start, end));
107
+ }
108
+ }
109
+ }
110
+
111
+ task.setFiles(files);
112
+
113
+ return resume(task.dump(), task.getFiles().size(), control);
114
+ }
115
+
116
+ @Override
117
+ public ConfigDiff resume(TaskSource taskSource,
118
+ int taskCount,
119
+ FileInputPlugin.Control control)
120
+ {
121
+ control.run(taskSource, taskCount);
122
+
123
+ return Exec.newConfigDiff();
124
+ }
125
+
126
+ @Override
127
+ public void cleanup(TaskSource taskSource,
128
+ int taskCount,
129
+ List<TaskReport> successTaskReports)
130
+ { }
131
+
132
+ @Override
133
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
134
+ {
135
+ PluginTask task = taskSource.loadTask(PluginTask.class);
136
+ return new LocalFileSplitInput(task, taskIndex);
137
+ }
138
+
139
+ /**
140
+ * Most of this implementation is based on LocalFileInputPlugin hosted at:
141
+ * https://github.com/embulk/embulk
142
+ *
143
+ */
144
+ private List<String> listFiles(String prefix)
145
+ {
146
+ final Path pathPrefix = Paths.get(prefix).normalize();
147
+ final Path directory;
148
+ final String fileNamePrefix;
149
+ if (Files.isDirectory(pathPrefix)) {
150
+ directory = pathPrefix;
151
+ fileNamePrefix = "";
152
+ } else {
153
+ fileNamePrefix = pathPrefix.getFileName().toString();
154
+ Path d = pathPrefix.getParent();
155
+ directory = (d == null ? CURRENT_DIR : d);
156
+ }
157
+
158
+ final ImmutableList.Builder<String> builder = ImmutableList.builder();
159
+ try {
160
+ int maxDepth = Integer.MAX_VALUE;
161
+ Set<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);
162
+
163
+ Files.walkFileTree(directory, opts, maxDepth, new SimpleFileVisitor<Path>() {
164
+ @Override
165
+ public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs)
166
+ {
167
+ if (path.equals(directory)) {
168
+ return FileVisitResult.CONTINUE;
169
+ } else {
170
+ Path parent = path.getParent();
171
+ if (parent == null) {
172
+ parent = CURRENT_DIR;
173
+ }
174
+ if (parent.equals(directory)) {
175
+ if (path.getFileName().toString().startsWith(fileNamePrefix)) {
176
+ return FileVisitResult.CONTINUE;
177
+ } else {
178
+ return FileVisitResult.SKIP_SUBTREE;
179
+ }
180
+ } else {
181
+ return FileVisitResult.CONTINUE;
182
+ }
183
+ }
184
+ }
185
+
186
+ @Override
187
+ public FileVisitResult visitFile(Path path, BasicFileAttributes attrs)
188
+ {
189
+ try {
190
+ // Avoid directories from listing.
191
+ // Directories are normally unvisited with |FileVisitor#visitFile|, but symbolic links to
192
+ // directories are visited like files unless |FOLLOW_LINKS| is set in |Files#walkFileTree|.
193
+ // Symbolic links to directories are explicitly skipped here by checking with |Path#toRealPath|.
194
+ if (Files.isDirectory(path.toRealPath())) {
195
+ return FileVisitResult.CONTINUE;
196
+ }
197
+ } catch (IOException ex){
198
+ throw new RuntimeException("Can't resolve symbolic link", ex);
199
+ }
200
+ Path parent = path.getParent();
201
+ if (parent == null) {
202
+ parent = CURRENT_DIR;
203
+ }
204
+ if (parent.equals(directory)) {
205
+ if (path.getFileName().toString().startsWith(fileNamePrefix)) {
206
+ builder.add(path.toString());
207
+ return FileVisitResult.CONTINUE;
208
+ }
209
+ } else {
210
+ builder.add(path.toString());
211
+ }
212
+ return FileVisitResult.CONTINUE;
213
+ }
214
+ });
215
+ } catch (IOException ex) {
216
+ throw new RuntimeException(String.format("Failed get a list of local files at '%s'", directory), ex);
217
+ }
218
+ return builder.build();
219
+ }
220
+
221
+ public static class LocalFileSplitInput
222
+ extends InputStreamFileInput
223
+ implements TransactionalFileInput
224
+ {
225
+ public static class FileSplitProvider
226
+ implements InputStreamFileInput.Provider
227
+ {
228
+ private final PartialFile file;
229
+ private final boolean hasHeader;
230
+ private boolean opened = false;
231
+
232
+ public FileSplitProvider(PartialFile file, boolean hasHeader)
233
+ {
234
+ this.file = file;
235
+ this.hasHeader = hasHeader;
236
+ }
237
+
238
+ @Override
239
+ public InputStream openNext() throws IOException
240
+ {
241
+ if (opened) {
242
+ return null;
243
+ }
244
+ opened = true;
245
+
246
+ InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
247
+ if (file.getStart() > 0 && hasHeader) {
248
+ in = new SequenceInputStream(openHeader(file.getPath()), in);
249
+ }
250
+ return in;
251
+ }
252
+
253
+ @Override
254
+ public void close() { }
255
+
256
+ private InputStream openHeader(String path) throws IOException
257
+ {
258
+ ByteArrayOutputStream header = new ByteArrayOutputStream();
259
+ try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
260
+ while (true) {
261
+ int c = in.read();
262
+ if (c < 0) {
263
+ break;
264
+ }
265
+
266
+ header.write(c);
267
+
268
+ if (c == '\n') {
269
+ break;
270
+ }
271
+
272
+ if (c == '\r') {
273
+ int c2 = in.read();
274
+ if (c2 == '\n') {
275
+ header.write(c2);
276
+ }
277
+ break;
278
+ }
279
+ }
280
+ }
281
+ header.close();
282
+ return new ByteArrayInputStream(header.toByteArray());
283
+ }
284
+ }
285
+
286
+ public LocalFileSplitInput(PluginTask task, int taskIndex)
287
+ {
288
+ super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
289
+ }
290
+
291
+ @Override
292
+ public void abort() { }
293
+
294
+ @Override
295
+ public TaskReport commit()
296
+ {
297
+ return Exec.newTaskReport();
298
+ }
299
+ }
300
+ }