embulk-input-filesplit 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +41 -40
- data/build.gradle +64 -64
- data/classpath/embulk-input-filesplit-0.1.4.jar +0 -0
- data/lib/embulk/input/filesplit.rb +3 -3
- data/src/main/java/org/embulk/input/filesplit/LocalFileSplitInputPlugin.java +300 -187
- data/src/test/java/org/embulk/input/filesplit/EmbulkPluginTester.java +70 -70
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputPluginTest.java +129 -94
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputTest.java +78 -78
- data/src/test/java/org/embulk/input/filesplit/PartialFileInputStreamTest.java +570 -570
- data/src/test/resources/data/sub1/test1.csv +1 -0
- data/src/test/resources/data/sub1/test2.csv +3 -0
- data/src/test/resources/data/sub2/test1.csv +1 -0
- data/src/test/resources/data/sub2/test2.csv +3 -0
- data/src/test/resources/data/sub2/x.csv +1 -0
- data/src/test/resources/data/test-header.csv +5 -5
- data/src/test/resources/data/test-semicolon.csv +4 -4
- data/src/test/resources/data/test.csv +4 -4
- data/src/test/resources/yml/test-error1.yml +22 -0
- data/src/test/resources/yml/test-error2.yml +24 -0
- data/src/test/resources/yml/test-header.yml +24 -24
- data/src/test/resources/yml/test-only-header.yml +24 -24
- data/src/test/resources/yml/test-path_prefix-directory.yml +23 -0
- data/src/test/resources/yml/test-path_prefix-files.yml +23 -0
- data/src/test/resources/yml/test-tasks.yml +23 -23
- data/src/test/resources/yml/test.yml +22 -22
- metadata +15 -6
- data/classpath/embulk-input-filesplit-0.1.3.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c850998d274e9c10441daaac5ec8f25fb06ed7f
|
4
|
+
data.tar.gz: 6ede3a6e8f48a3ac8c4b3568605733cb7bc4b856
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7bbef333a3b9cacffc614fcb2b3b926d26f0ffde4e1f956568c072c1ac0b1a66db5c04e472b5ad8ed85dee81c20d68e080723feacad0823a2b3bd47536071d40
|
7
|
+
data.tar.gz: ffe27227b3469ad54c91a2e51bc3acb7a1523a8ac09921833d59d5b983ab2759a4b3151a5b1a7414a666a27da224d3cf0527d029e5787031b8dceb07b86d1ebb
|
data/README.md
CHANGED
@@ -1,40 +1,41 @@
|
|
1
|
-
# Splitting input file plugin for Embulk
|
2
|
-
|
3
|
-
This Embulk plugin splits and inputs a text file.
|
4
|
-
By splitting a file, input tasks will be executed in multithreads and the performance will be improved.
|
5
|
-
|
6
|
-
Lines of the text file should be separated by CR or LF or CRLF.
|
7
|
-
The plugin searches line separators and splits a file properly.
|
8
|
-
|
9
|
-
## Overview
|
10
|
-
|
11
|
-
* **Plugin type**: input
|
12
|
-
|
13
|
-
## Configuration
|
14
|
-
|
15
|
-
- **path**: the path of a text file (string, required)
|
16
|
-
- **
|
17
|
-
- **
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
1
|
+
# Splitting input file plugin for Embulk
|
2
|
+
|
3
|
+
This Embulk plugin splits and inputs a text file.
|
4
|
+
By splitting a file, input tasks are executed in multiple threads, which improves performance.
|
5
|
+
|
6
|
+
Lines of the text file should be separated by CR or LF or CRLF.
|
7
|
+
The plugin searches for line separators and splits the file at line boundaries.
|
8
|
+
|
9
|
+
## Overview
|
10
|
+
|
11
|
+
* **Plugin type**: input
|
12
|
+
|
13
|
+
## Configuration
|
14
|
+
|
15
|
+
- **path**: the path of a text file (string, either this or path_prefix is required)
|
16
|
+
- **path_prefix**: the path prefix of text files (string, either this or path is required)
|
17
|
+
- **header_line**: whether the first line is a header or not (boolean, default: false)
|
18
|
+
- **tasks**: number of tasks (integer, default: number of available processors * 2)
|
19
|
+
|
20
|
+
### Example
|
21
|
+
|
22
|
+
```yaml
|
23
|
+
in:
|
24
|
+
type: filesplit
|
25
|
+
path: '/data/address.csv'
|
26
|
+
header_line: true
|
27
|
+
tasks: 4
|
28
|
+
parser:
|
29
|
+
charset: UTF-8
|
30
|
+
newline: CRLF
|
31
|
+
type: csv
|
32
|
+
header_line: true
|
33
|
+
delimiter: ','
|
34
|
+
...
|
35
|
+
```
|
36
|
+
|
37
|
+
### Build
|
38
|
+
|
39
|
+
```
|
40
|
+
$ ./gradlew gem
|
41
|
+
```
|
data/build.gradle
CHANGED
@@ -1,64 +1,64 @@
|
|
1
|
-
plugins {
|
2
|
-
id 'com.jfrog.bintray' version '1.
|
3
|
-
id 'com.github.jruby-gradle.base' version '
|
4
|
-
id 'java'
|
5
|
-
}
|
6
|
-
import com.github.jrubygradle.JRubyExec
|
7
|
-
|
8
|
-
apply plugin: 'java'
|
9
|
-
apply plugin: 'com.github.jruby-gradle.base'
|
10
|
-
|
11
|
-
[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
|
12
|
-
|
13
|
-
project.version = '0.1.
|
14
|
-
|
15
|
-
repositories {
|
16
|
-
mavenCentral()
|
17
|
-
jcenter()
|
18
|
-
}
|
19
|
-
|
20
|
-
configurations {
|
21
|
-
provided
|
22
|
-
}
|
23
|
-
|
24
|
-
dependencies {
|
25
|
-
compile 'org.embulk:embulk-core:0.7.4'
|
26
|
-
provided 'org.embulk:embulk-core:0.7.4'
|
27
|
-
testCompile 'org.embulk:embulk-standards:0.7.4'
|
28
|
-
testCompile 'junit:junit:4.+'
|
29
|
-
}
|
30
|
-
|
31
|
-
task classpath(type: Copy, dependsOn: ["jar"]) {
|
32
|
-
doFirst { file('classpath').deleteDir() }
|
33
|
-
from (configurations.runtime - configurations.provided + files(jar.archivePath))
|
34
|
-
into 'classpath'
|
35
|
-
}
|
36
|
-
clean { delete 'classpath' }
|
37
|
-
|
38
|
-
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
|
39
|
-
jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
|
40
|
-
|
41
|
-
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
42
|
-
}
|
43
|
-
|
44
|
-
task gemspec << {
|
45
|
-
file('build').mkdirs();
|
46
|
-
file('build/gemspec').write($/
|
47
|
-
Gem::Specification.new do |spec|
|
48
|
-
spec.name = "${project.name}"
|
49
|
-
spec.version = "${project.version}"
|
50
|
-
spec.authors = ["Hitoshi Tanaka"]
|
51
|
-
spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
|
52
|
-
spec.summary = "Embulk plugin for splitting input file"
|
53
|
-
spec.licenses = ["Apache 2.0"]
|
54
|
-
spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
|
55
|
-
spec.test_files = spec.files.grep(%r"^(test|spec)/")
|
56
|
-
spec.require_paths = ["lib"]
|
57
|
-
end
|
58
|
-
/$)
|
59
|
-
}
|
60
|
-
|
61
|
-
|
62
|
-
task gempush << {
|
63
|
-
"gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
|
64
|
-
}
|
1
|
+
plugins {
|
2
|
+
id 'com.jfrog.bintray' version '1.6'
|
3
|
+
id 'com.github.jruby-gradle.base' version '1.2.1'
|
4
|
+
id 'java'
|
5
|
+
}
|
6
|
+
import com.github.jrubygradle.JRubyExec
|
7
|
+
|
8
|
+
apply plugin: 'java'
|
9
|
+
apply plugin: 'com.github.jruby-gradle.base'
|
10
|
+
|
11
|
+
[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
|
12
|
+
|
13
|
+
project.version = '0.1.4'
|
14
|
+
|
15
|
+
repositories {
|
16
|
+
mavenCentral()
|
17
|
+
jcenter()
|
18
|
+
}
|
19
|
+
|
20
|
+
configurations {
|
21
|
+
provided
|
22
|
+
}
|
23
|
+
|
24
|
+
dependencies {
|
25
|
+
compile 'org.embulk:embulk-core:0.7.4'
|
26
|
+
provided 'org.embulk:embulk-core:0.7.4'
|
27
|
+
testCompile 'org.embulk:embulk-standards:0.7.4'
|
28
|
+
testCompile 'junit:junit:4.+'
|
29
|
+
}
|
30
|
+
|
31
|
+
task classpath(type: Copy, dependsOn: ["jar"]) {
|
32
|
+
doFirst { file('classpath').deleteDir() }
|
33
|
+
from (configurations.runtime - configurations.provided + files(jar.archivePath))
|
34
|
+
into 'classpath'
|
35
|
+
}
|
36
|
+
clean { delete 'classpath' }
|
37
|
+
|
38
|
+
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
|
39
|
+
jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
|
40
|
+
scriptArgs "build/gemspec"
|
41
|
+
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
42
|
+
}
|
43
|
+
|
44
|
+
task gemspec << {
|
45
|
+
file('build').mkdirs();
|
46
|
+
file('build/gemspec').write($/
|
47
|
+
Gem::Specification.new do |spec|
|
48
|
+
spec.name = "${project.name}"
|
49
|
+
spec.version = "${project.version}"
|
50
|
+
spec.authors = ["Hitoshi Tanaka"]
|
51
|
+
spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
|
52
|
+
spec.summary = "Embulk plugin for splitting input file"
|
53
|
+
spec.licenses = ["Apache 2.0"]
|
54
|
+
spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
|
55
|
+
spec.test_files = spec.files.grep(%r"^(test|spec)/")
|
56
|
+
spec.require_paths = ["lib"]
|
57
|
+
end
|
58
|
+
/$)
|
59
|
+
}
|
60
|
+
|
61
|
+
|
62
|
+
task gempush << {
|
63
|
+
"gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
|
64
|
+
}
|
Binary file
|
@@ -1,3 +1,3 @@
|
|
1
|
-
Embulk::JavaPlugin.register_input(
|
2
|
-
:filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
|
3
|
-
File.expand_path('../../../../classpath', __FILE__))
|
1
|
+
Embulk::JavaPlugin.register_input(
|
2
|
+
:filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
|
3
|
+
File.expand_path('../../../../classpath', __FILE__))
|
@@ -1,187 +1,300 @@
|
|
1
|
-
package org.embulk.input.filesplit;
|
2
|
-
|
3
|
-
import java.io.BufferedInputStream;
|
4
|
-
import java.io.ByteArrayInputStream;
|
5
|
-
import java.io.ByteArrayOutputStream;
|
6
|
-
import java.io.File;
|
7
|
-
import java.io.FileInputStream;
|
8
|
-
import java.io.IOException;
|
9
|
-
import java.io.InputStream;
|
10
|
-
import java.io.SequenceInputStream;
|
11
|
-
import java.
|
12
|
-
import java.
|
13
|
-
|
14
|
-
import
|
15
|
-
import
|
16
|
-
import
|
17
|
-
import
|
18
|
-
import
|
19
|
-
import
|
20
|
-
import
|
21
|
-
import
|
22
|
-
import
|
23
|
-
|
24
|
-
import org.embulk.
|
25
|
-
import org.embulk.
|
26
|
-
import org.embulk.
|
27
|
-
|
28
|
-
import
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.ByteArrayInputStream;
|
5
|
+
import java.io.ByteArrayOutputStream;
|
6
|
+
import java.io.File;
|
7
|
+
import java.io.FileInputStream;
|
8
|
+
import java.io.IOException;
|
9
|
+
import java.io.InputStream;
|
10
|
+
import java.io.SequenceInputStream;
|
11
|
+
import java.nio.file.Files;
|
12
|
+
import java.nio.file.FileVisitOption;
|
13
|
+
import java.nio.file.Path;
|
14
|
+
import java.nio.file.Paths;
|
15
|
+
import java.nio.file.SimpleFileVisitor;
|
16
|
+
import java.nio.file.FileVisitResult;
|
17
|
+
import java.nio.file.attribute.BasicFileAttributes;
|
18
|
+
import java.util.ArrayList;
|
19
|
+
import java.util.EnumSet;
|
20
|
+
import java.util.List;
|
21
|
+
import java.util.Set;
|
22
|
+
import com.google.common.collect.ImmutableList;
|
23
|
+
|
24
|
+
import org.embulk.config.Config;
|
25
|
+
import org.embulk.config.ConfigDefault;
|
26
|
+
import org.embulk.config.ConfigDiff;
|
27
|
+
import org.embulk.config.ConfigInject;
|
28
|
+
import org.embulk.config.ConfigSource;
|
29
|
+
import org.embulk.config.Task;
|
30
|
+
import org.embulk.config.TaskReport;
|
31
|
+
import org.embulk.config.TaskSource;
|
32
|
+
import org.embulk.spi.BufferAllocator;
|
33
|
+
import org.embulk.spi.Exec;
|
34
|
+
import org.embulk.spi.FileInputPlugin;
|
35
|
+
import org.embulk.spi.TransactionalFileInput;
|
36
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
37
|
+
|
38
|
+
import com.google.common.base.Optional;
|
39
|
+
|
40
|
+
public class LocalFileSplitInputPlugin
|
41
|
+
implements FileInputPlugin
|
42
|
+
{
|
43
|
+
|
44
|
+
private final static Path CURRENT_DIR = Paths.get(".").normalize();
|
45
|
+
|
46
|
+
public interface PluginTask
|
47
|
+
extends Task
|
48
|
+
{
|
49
|
+
@Config("path")
|
50
|
+
@ConfigDefault("null")
|
51
|
+
public Optional<String> getPath();
|
52
|
+
|
53
|
+
@Config("path_prefix")
|
54
|
+
@ConfigDefault("null")
|
55
|
+
public Optional<String> getPathPrefix();
|
56
|
+
|
57
|
+
@Config("tasks")
|
58
|
+
@ConfigDefault("null")
|
59
|
+
public Optional<Integer> getTasks();
|
60
|
+
|
61
|
+
@Config("header_line")
|
62
|
+
@ConfigDefault("false")
|
63
|
+
public boolean getHeaderLine();
|
64
|
+
|
65
|
+
public List<PartialFile> getFiles();
|
66
|
+
public void setFiles(List<PartialFile> files);
|
67
|
+
|
68
|
+
@ConfigInject
|
69
|
+
public BufferAllocator getBufferAllocator();
|
70
|
+
}
|
71
|
+
|
72
|
+
@Override
|
73
|
+
public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
|
74
|
+
{
|
75
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
76
|
+
|
77
|
+
int tasks;
|
78
|
+
if (task.getTasks().isPresent()) {
|
79
|
+
tasks = task.getTasks().get();
|
80
|
+
if (tasks <= 0) {
|
81
|
+
throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
|
82
|
+
}
|
83
|
+
} else {
|
84
|
+
tasks = Runtime.getRuntime().availableProcessors() * 2;
|
85
|
+
}
|
86
|
+
|
87
|
+
List<String> paths = new ArrayList<String>();
|
88
|
+
if (task.getPath().isPresent()) {
|
89
|
+
if (task.getPathPrefix().isPresent()) {
|
90
|
+
throw new IllegalArgumentException("Cannot specify both 'path' and 'path_prefix'");
|
91
|
+
}
|
92
|
+
paths.add(task.getPath().get());
|
93
|
+
} else if (task.getPathPrefix().isPresent()) {
|
94
|
+
paths.addAll(listFiles(task.getPathPrefix().get()));
|
95
|
+
} else {
|
96
|
+
throw new IllegalArgumentException("Specify either 'path' or 'path_prefix'");
|
97
|
+
}
|
98
|
+
|
99
|
+
List<PartialFile> files = new ArrayList<PartialFile>();
|
100
|
+
for (String path : paths) {
|
101
|
+
long size = new File(path).length();
|
102
|
+
for (int i = 0; i < tasks; i++) {
|
103
|
+
long start = size * i / tasks;
|
104
|
+
long end = size * (i + 1) / tasks;
|
105
|
+
if (start < end) {
|
106
|
+
files.add(new PartialFile(path, start, end));
|
107
|
+
}
|
108
|
+
}
|
109
|
+
}
|
110
|
+
|
111
|
+
task.setFiles(files);
|
112
|
+
|
113
|
+
return resume(task.dump(), task.getFiles().size(), control);
|
114
|
+
}
|
115
|
+
|
116
|
+
@Override
|
117
|
+
public ConfigDiff resume(TaskSource taskSource,
|
118
|
+
int taskCount,
|
119
|
+
FileInputPlugin.Control control)
|
120
|
+
{
|
121
|
+
control.run(taskSource, taskCount);
|
122
|
+
|
123
|
+
return Exec.newConfigDiff();
|
124
|
+
}
|
125
|
+
|
126
|
+
@Override
|
127
|
+
public void cleanup(TaskSource taskSource,
|
128
|
+
int taskCount,
|
129
|
+
List<TaskReport> successTaskReports)
|
130
|
+
{ }
|
131
|
+
|
132
|
+
@Override
|
133
|
+
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
134
|
+
{
|
135
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
136
|
+
return new LocalFileSplitInput(task, taskIndex);
|
137
|
+
}
|
138
|
+
|
139
|
+
/**
|
140
|
+
* Most of this implementation is based on LocalFileInputPlugin hosted at:
|
141
|
+
* https://github.com/embulk/embulk
|
142
|
+
*
|
143
|
+
*/
|
144
|
+
private List<String> listFiles(String prefix)
|
145
|
+
{
|
146
|
+
final Path pathPrefix = Paths.get(prefix).normalize();
|
147
|
+
final Path directory;
|
148
|
+
final String fileNamePrefix;
|
149
|
+
if (Files.isDirectory(pathPrefix)) {
|
150
|
+
directory = pathPrefix;
|
151
|
+
fileNamePrefix = "";
|
152
|
+
} else {
|
153
|
+
fileNamePrefix = pathPrefix.getFileName().toString();
|
154
|
+
Path d = pathPrefix.getParent();
|
155
|
+
directory = (d == null ? CURRENT_DIR : d);
|
156
|
+
}
|
157
|
+
|
158
|
+
final ImmutableList.Builder<String> builder = ImmutableList.builder();
|
159
|
+
try {
|
160
|
+
int maxDepth = Integer.MAX_VALUE;
|
161
|
+
Set<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);
|
162
|
+
|
163
|
+
Files.walkFileTree(directory, opts, maxDepth, new SimpleFileVisitor<Path>() {
|
164
|
+
@Override
|
165
|
+
public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs)
|
166
|
+
{
|
167
|
+
if (path.equals(directory)) {
|
168
|
+
return FileVisitResult.CONTINUE;
|
169
|
+
} else {
|
170
|
+
Path parent = path.getParent();
|
171
|
+
if (parent == null) {
|
172
|
+
parent = CURRENT_DIR;
|
173
|
+
}
|
174
|
+
if (parent.equals(directory)) {
|
175
|
+
if (path.getFileName().toString().startsWith(fileNamePrefix)) {
|
176
|
+
return FileVisitResult.CONTINUE;
|
177
|
+
} else {
|
178
|
+
return FileVisitResult.SKIP_SUBTREE;
|
179
|
+
}
|
180
|
+
} else {
|
181
|
+
return FileVisitResult.CONTINUE;
|
182
|
+
}
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|
186
|
+
@Override
|
187
|
+
public FileVisitResult visitFile(Path path, BasicFileAttributes attrs)
|
188
|
+
{
|
189
|
+
try {
|
190
|
+
// Avoid directories from listing.
|
191
|
+
// Directories are normally unvisited with |FileVisitor#visitFile|, but symbolic links to
|
192
|
+
// directories are visited like files unless |FOLLOW_LINKS| is set in |Files#walkFileTree|.
|
193
|
+
// Symbolic links to directories are explicitly skipped here by checking with |Path#toRealPath|.
|
194
|
+
if (Files.isDirectory(path.toRealPath())) {
|
195
|
+
return FileVisitResult.CONTINUE;
|
196
|
+
}
|
197
|
+
} catch (IOException ex){
|
198
|
+
throw new RuntimeException("Can't resolve symbolic link", ex);
|
199
|
+
}
|
200
|
+
Path parent = path.getParent();
|
201
|
+
if (parent == null) {
|
202
|
+
parent = CURRENT_DIR;
|
203
|
+
}
|
204
|
+
if (parent.equals(directory)) {
|
205
|
+
if (path.getFileName().toString().startsWith(fileNamePrefix)) {
|
206
|
+
builder.add(path.toString());
|
207
|
+
return FileVisitResult.CONTINUE;
|
208
|
+
}
|
209
|
+
} else {
|
210
|
+
builder.add(path.toString());
|
211
|
+
}
|
212
|
+
return FileVisitResult.CONTINUE;
|
213
|
+
}
|
214
|
+
});
|
215
|
+
} catch (IOException ex) {
|
216
|
+
throw new RuntimeException(String.format("Failed get a list of local files at '%s'", directory), ex);
|
217
|
+
}
|
218
|
+
return builder.build();
|
219
|
+
}
|
220
|
+
|
221
|
+
public static class LocalFileSplitInput
|
222
|
+
extends InputStreamFileInput
|
223
|
+
implements TransactionalFileInput
|
224
|
+
{
|
225
|
+
public static class FileSplitProvider
|
226
|
+
implements InputStreamFileInput.Provider
|
227
|
+
{
|
228
|
+
private final PartialFile file;
|
229
|
+
private final boolean hasHeader;
|
230
|
+
private boolean opened = false;
|
231
|
+
|
232
|
+
public FileSplitProvider(PartialFile file, boolean hasHeader)
|
233
|
+
{
|
234
|
+
this.file = file;
|
235
|
+
this.hasHeader = hasHeader;
|
236
|
+
}
|
237
|
+
|
238
|
+
@Override
|
239
|
+
public InputStream openNext() throws IOException
|
240
|
+
{
|
241
|
+
if (opened) {
|
242
|
+
return null;
|
243
|
+
}
|
244
|
+
opened = true;
|
245
|
+
|
246
|
+
InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
|
247
|
+
if (file.getStart() > 0 && hasHeader) {
|
248
|
+
in = new SequenceInputStream(openHeader(file.getPath()), in);
|
249
|
+
}
|
250
|
+
return in;
|
251
|
+
}
|
252
|
+
|
253
|
+
@Override
|
254
|
+
public void close() { }
|
255
|
+
|
256
|
+
private InputStream openHeader(String path) throws IOException
|
257
|
+
{
|
258
|
+
ByteArrayOutputStream header = new ByteArrayOutputStream();
|
259
|
+
try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
|
260
|
+
while (true) {
|
261
|
+
int c = in.read();
|
262
|
+
if (c < 0) {
|
263
|
+
break;
|
264
|
+
}
|
265
|
+
|
266
|
+
header.write(c);
|
267
|
+
|
268
|
+
if (c == '\n') {
|
269
|
+
break;
|
270
|
+
}
|
271
|
+
|
272
|
+
if (c == '\r') {
|
273
|
+
int c2 = in.read();
|
274
|
+
if (c2 == '\n') {
|
275
|
+
header.write(c2);
|
276
|
+
}
|
277
|
+
break;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
}
|
281
|
+
header.close();
|
282
|
+
return new ByteArrayInputStream(header.toByteArray());
|
283
|
+
}
|
284
|
+
}
|
285
|
+
|
286
|
+
public LocalFileSplitInput(PluginTask task, int taskIndex)
|
287
|
+
{
|
288
|
+
super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
|
289
|
+
}
|
290
|
+
|
291
|
+
@Override
|
292
|
+
public void abort() { }
|
293
|
+
|
294
|
+
@Override
|
295
|
+
public TaskReport commit()
|
296
|
+
{
|
297
|
+
return Exec.newTaskReport();
|
298
|
+
}
|
299
|
+
}
|
300
|
+
}
|