embulk-input-filesplit 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +41 -40
- data/build.gradle +64 -64
- data/classpath/embulk-input-filesplit-0.1.4.jar +0 -0
- data/lib/embulk/input/filesplit.rb +3 -3
- data/src/main/java/org/embulk/input/filesplit/LocalFileSplitInputPlugin.java +300 -187
- data/src/test/java/org/embulk/input/filesplit/EmbulkPluginTester.java +70 -70
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputPluginTest.java +129 -94
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputTest.java +78 -78
- data/src/test/java/org/embulk/input/filesplit/PartialFileInputStreamTest.java +570 -570
- data/src/test/resources/data/sub1/test1.csv +1 -0
- data/src/test/resources/data/sub1/test2.csv +3 -0
- data/src/test/resources/data/sub2/test1.csv +1 -0
- data/src/test/resources/data/sub2/test2.csv +3 -0
- data/src/test/resources/data/sub2/x.csv +1 -0
- data/src/test/resources/data/test-header.csv +5 -5
- data/src/test/resources/data/test-semicolon.csv +4 -4
- data/src/test/resources/data/test.csv +4 -4
- data/src/test/resources/yml/test-error1.yml +22 -0
- data/src/test/resources/yml/test-error2.yml +24 -0
- data/src/test/resources/yml/test-header.yml +24 -24
- data/src/test/resources/yml/test-only-header.yml +24 -24
- data/src/test/resources/yml/test-path_prefix-directory.yml +23 -0
- data/src/test/resources/yml/test-path_prefix-files.yml +23 -0
- data/src/test/resources/yml/test-tasks.yml +23 -23
- data/src/test/resources/yml/test.yml +22 -22
- metadata +15 -6
- data/classpath/embulk-input-filesplit-0.1.3.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6c850998d274e9c10441daaac5ec8f25fb06ed7f
|
4
|
+
data.tar.gz: 6ede3a6e8f48a3ac8c4b3568605733cb7bc4b856
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7bbef333a3b9cacffc614fcb2b3b926d26f0ffde4e1f956568c072c1ac0b1a66db5c04e472b5ad8ed85dee81c20d68e080723feacad0823a2b3bd47536071d40
|
7
|
+
data.tar.gz: ffe27227b3469ad54c91a2e51bc3acb7a1523a8ac09921833d59d5b983ab2759a4b3151a5b1a7414a666a27da224d3cf0527d029e5787031b8dceb07b86d1ebb
|
data/README.md
CHANGED
@@ -1,40 +1,41 @@
|
|
1
|
-
# Splitting input file plugin for Embulk
|
2
|
-
|
3
|
-
This Embulk plugin splits and inputs a text file.
|
4
|
-
By splitting a file, input tasks will be executed in multithreads and the performance will be improved.
|
5
|
-
|
6
|
-
Lines of the text file should be separated by CR or LF or CRLF.
|
7
|
-
The plugin searches line separators and splits a file properly.
|
8
|
-
|
9
|
-
## Overview
|
10
|
-
|
11
|
-
* **Plugin type**: input
|
12
|
-
|
13
|
-
## Configuration
|
14
|
-
|
15
|
-
- **path**: the path of a text file (string, required)
|
16
|
-
- **
|
17
|
-
- **
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
1
|
+
# Splitting input file plugin for Embulk
|
2
|
+
|
3
|
+
This Embulk plugin splits and inputs a text file.
|
4
|
+
By splitting a file, input tasks will be executed in multithreads and the performance will be improved.
|
5
|
+
|
6
|
+
Lines of the text file should be separated by CR or LF or CRLF.
|
7
|
+
The plugin searches line separators and splits a file properly.
|
8
|
+
|
9
|
+
## Overview
|
10
|
+
|
11
|
+
* **Plugin type**: input
|
12
|
+
|
13
|
+
## Configuration
|
14
|
+
|
15
|
+
- **path**: the path of a text file (string, either this or path_prefix is required)
|
16
|
+
- **path_prefix**: the path prefix of text files (string, either this or path is required)
|
17
|
+
- **header_line**: whether the first line is a header or not (boolean, default: false)
|
18
|
+
- **tasks**: number of tasks (integer, default: number of available processors * 2)
|
19
|
+
|
20
|
+
### Example
|
21
|
+
|
22
|
+
```yaml
|
23
|
+
in:
|
24
|
+
type: filesplit
|
25
|
+
path: '/data/address.csv'
|
26
|
+
header_line: true
|
27
|
+
tasks: 4
|
28
|
+
parser:
|
29
|
+
charset: UTF-8
|
30
|
+
newline: CRLF
|
31
|
+
type: csv
|
32
|
+
header_line: true
|
33
|
+
delimiter: ','
|
34
|
+
...
|
35
|
+
```
|
36
|
+
|
37
|
+
### Build
|
38
|
+
|
39
|
+
```
|
40
|
+
$ ./gradlew gem
|
41
|
+
```
|
data/build.gradle
CHANGED
@@ -1,64 +1,64 @@
|
|
1
|
-
plugins {
|
2
|
-
id 'com.jfrog.bintray' version '1.
|
3
|
-
id 'com.github.jruby-gradle.base' version '
|
4
|
-
id 'java'
|
5
|
-
}
|
6
|
-
import com.github.jrubygradle.JRubyExec
|
7
|
-
|
8
|
-
apply plugin: 'java'
|
9
|
-
apply plugin: 'com.github.jruby-gradle.base'
|
10
|
-
|
11
|
-
[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
|
12
|
-
|
13
|
-
project.version = '0.1.
|
14
|
-
|
15
|
-
repositories {
|
16
|
-
mavenCentral()
|
17
|
-
jcenter()
|
18
|
-
}
|
19
|
-
|
20
|
-
configurations {
|
21
|
-
provided
|
22
|
-
}
|
23
|
-
|
24
|
-
dependencies {
|
25
|
-
compile 'org.embulk:embulk-core:0.7.4'
|
26
|
-
provided 'org.embulk:embulk-core:0.7.4'
|
27
|
-
testCompile 'org.embulk:embulk-standards:0.7.4'
|
28
|
-
testCompile 'junit:junit:4.+'
|
29
|
-
}
|
30
|
-
|
31
|
-
task classpath(type: Copy, dependsOn: ["jar"]) {
|
32
|
-
doFirst { file('classpath').deleteDir() }
|
33
|
-
from (configurations.runtime - configurations.provided + files(jar.archivePath))
|
34
|
-
into 'classpath'
|
35
|
-
}
|
36
|
-
clean { delete 'classpath' }
|
37
|
-
|
38
|
-
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
|
39
|
-
jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
|
40
|
-
|
41
|
-
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
42
|
-
}
|
43
|
-
|
44
|
-
task gemspec << {
|
45
|
-
file('build').mkdirs();
|
46
|
-
file('build/gemspec').write($/
|
47
|
-
Gem::Specification.new do |spec|
|
48
|
-
spec.name = "${project.name}"
|
49
|
-
spec.version = "${project.version}"
|
50
|
-
spec.authors = ["Hitoshi Tanaka"]
|
51
|
-
spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
|
52
|
-
spec.summary = "Embulk plugin for splitting input file"
|
53
|
-
spec.licenses = ["Apache 2.0"]
|
54
|
-
spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
|
55
|
-
spec.test_files = spec.files.grep(%r"^(test|spec)/")
|
56
|
-
spec.require_paths = ["lib"]
|
57
|
-
end
|
58
|
-
/$)
|
59
|
-
}
|
60
|
-
|
61
|
-
|
62
|
-
task gempush << {
|
63
|
-
"gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
|
64
|
-
}
|
1
|
+
plugins {
|
2
|
+
id 'com.jfrog.bintray' version '1.6'
|
3
|
+
id 'com.github.jruby-gradle.base' version '1.2.1'
|
4
|
+
id 'java'
|
5
|
+
}
|
6
|
+
import com.github.jrubygradle.JRubyExec
|
7
|
+
|
8
|
+
apply plugin: 'java'
|
9
|
+
apply plugin: 'com.github.jruby-gradle.base'
|
10
|
+
|
11
|
+
[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
|
12
|
+
|
13
|
+
project.version = '0.1.4'
|
14
|
+
|
15
|
+
repositories {
|
16
|
+
mavenCentral()
|
17
|
+
jcenter()
|
18
|
+
}
|
19
|
+
|
20
|
+
configurations {
|
21
|
+
provided
|
22
|
+
}
|
23
|
+
|
24
|
+
dependencies {
|
25
|
+
compile 'org.embulk:embulk-core:0.7.4'
|
26
|
+
provided 'org.embulk:embulk-core:0.7.4'
|
27
|
+
testCompile 'org.embulk:embulk-standards:0.7.4'
|
28
|
+
testCompile 'junit:junit:4.+'
|
29
|
+
}
|
30
|
+
|
31
|
+
task classpath(type: Copy, dependsOn: ["jar"]) {
|
32
|
+
doFirst { file('classpath').deleteDir() }
|
33
|
+
from (configurations.runtime - configurations.provided + files(jar.archivePath))
|
34
|
+
into 'classpath'
|
35
|
+
}
|
36
|
+
clean { delete 'classpath' }
|
37
|
+
|
38
|
+
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
|
39
|
+
jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
|
40
|
+
scriptArgs "build/gemspec"
|
41
|
+
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
42
|
+
}
|
43
|
+
|
44
|
+
task gemspec << {
|
45
|
+
file('build').mkdirs();
|
46
|
+
file('build/gemspec').write($/
|
47
|
+
Gem::Specification.new do |spec|
|
48
|
+
spec.name = "${project.name}"
|
49
|
+
spec.version = "${project.version}"
|
50
|
+
spec.authors = ["Hitoshi Tanaka"]
|
51
|
+
spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
|
52
|
+
spec.summary = "Embulk plugin for splitting input file"
|
53
|
+
spec.licenses = ["Apache 2.0"]
|
54
|
+
spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
|
55
|
+
spec.test_files = spec.files.grep(%r"^(test|spec)/")
|
56
|
+
spec.require_paths = ["lib"]
|
57
|
+
end
|
58
|
+
/$)
|
59
|
+
}
|
60
|
+
|
61
|
+
|
62
|
+
task gempush << {
|
63
|
+
"gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
|
64
|
+
}
|
Binary file
|
@@ -1,3 +1,3 @@
|
|
1
|
-
Embulk::JavaPlugin.register_input(
|
2
|
-
:filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
|
3
|
-
File.expand_path('../../../../classpath', __FILE__))
|
1
|
+
Embulk::JavaPlugin.register_input(
|
2
|
+
:filesplit, "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
|
3
|
+
File.expand_path('../../../../classpath', __FILE__))
|
@@ -1,187 +1,300 @@
|
|
1
|
-
package org.embulk.input.filesplit;
|
2
|
-
|
3
|
-
import java.io.BufferedInputStream;
|
4
|
-
import java.io.ByteArrayInputStream;
|
5
|
-
import java.io.ByteArrayOutputStream;
|
6
|
-
import java.io.File;
|
7
|
-
import java.io.FileInputStream;
|
8
|
-
import java.io.IOException;
|
9
|
-
import java.io.InputStream;
|
10
|
-
import java.io.SequenceInputStream;
|
11
|
-
import java.
|
12
|
-
import java.
|
13
|
-
|
14
|
-
import
|
15
|
-
import
|
16
|
-
import
|
17
|
-
import
|
18
|
-
import
|
19
|
-
import
|
20
|
-
import
|
21
|
-
import
|
22
|
-
import
|
23
|
-
|
24
|
-
import org.embulk.
|
25
|
-
import org.embulk.
|
26
|
-
import org.embulk.
|
27
|
-
|
28
|
-
import
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.ByteArrayInputStream;
|
5
|
+
import java.io.ByteArrayOutputStream;
|
6
|
+
import java.io.File;
|
7
|
+
import java.io.FileInputStream;
|
8
|
+
import java.io.IOException;
|
9
|
+
import java.io.InputStream;
|
10
|
+
import java.io.SequenceInputStream;
|
11
|
+
import java.nio.file.Files;
|
12
|
+
import java.nio.file.FileVisitOption;
|
13
|
+
import java.nio.file.Path;
|
14
|
+
import java.nio.file.Paths;
|
15
|
+
import java.nio.file.SimpleFileVisitor;
|
16
|
+
import java.nio.file.FileVisitResult;
|
17
|
+
import java.nio.file.attribute.BasicFileAttributes;
|
18
|
+
import java.util.ArrayList;
|
19
|
+
import java.util.EnumSet;
|
20
|
+
import java.util.List;
|
21
|
+
import java.util.Set;
|
22
|
+
import com.google.common.collect.ImmutableList;
|
23
|
+
|
24
|
+
import org.embulk.config.Config;
|
25
|
+
import org.embulk.config.ConfigDefault;
|
26
|
+
import org.embulk.config.ConfigDiff;
|
27
|
+
import org.embulk.config.ConfigInject;
|
28
|
+
import org.embulk.config.ConfigSource;
|
29
|
+
import org.embulk.config.Task;
|
30
|
+
import org.embulk.config.TaskReport;
|
31
|
+
import org.embulk.config.TaskSource;
|
32
|
+
import org.embulk.spi.BufferAllocator;
|
33
|
+
import org.embulk.spi.Exec;
|
34
|
+
import org.embulk.spi.FileInputPlugin;
|
35
|
+
import org.embulk.spi.TransactionalFileInput;
|
36
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
37
|
+
|
38
|
+
import com.google.common.base.Optional;
|
39
|
+
|
40
|
+
public class LocalFileSplitInputPlugin
|
41
|
+
implements FileInputPlugin
|
42
|
+
{
|
43
|
+
|
44
|
+
private final static Path CURRENT_DIR = Paths.get(".").normalize();
|
45
|
+
|
46
|
+
public interface PluginTask
|
47
|
+
extends Task
|
48
|
+
{
|
49
|
+
@Config("path")
|
50
|
+
@ConfigDefault("null")
|
51
|
+
public Optional<String> getPath();
|
52
|
+
|
53
|
+
@Config("path_prefix")
|
54
|
+
@ConfigDefault("null")
|
55
|
+
public Optional<String> getPathPrefix();
|
56
|
+
|
57
|
+
@Config("tasks")
|
58
|
+
@ConfigDefault("null")
|
59
|
+
public Optional<Integer> getTasks();
|
60
|
+
|
61
|
+
@Config("header_line")
|
62
|
+
@ConfigDefault("false")
|
63
|
+
public boolean getHeaderLine();
|
64
|
+
|
65
|
+
public List<PartialFile> getFiles();
|
66
|
+
public void setFiles(List<PartialFile> files);
|
67
|
+
|
68
|
+
@ConfigInject
|
69
|
+
public BufferAllocator getBufferAllocator();
|
70
|
+
}
|
71
|
+
|
72
|
+
@Override
|
73
|
+
public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
|
74
|
+
{
|
75
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
76
|
+
|
77
|
+
int tasks;
|
78
|
+
if (task.getTasks().isPresent()) {
|
79
|
+
tasks = task.getTasks().get();
|
80
|
+
if (tasks <= 0) {
|
81
|
+
throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
|
82
|
+
}
|
83
|
+
} else {
|
84
|
+
tasks = Runtime.getRuntime().availableProcessors() * 2;
|
85
|
+
}
|
86
|
+
|
87
|
+
List<String> paths = new ArrayList<String>();
|
88
|
+
if (task.getPath().isPresent()) {
|
89
|
+
if (task.getPathPrefix().isPresent()) {
|
90
|
+
throw new IllegalArgumentException("Cannot specify both 'path' and 'path_prefix'");
|
91
|
+
}
|
92
|
+
paths.add(task.getPath().get());
|
93
|
+
} else if (task.getPathPrefix().isPresent()) {
|
94
|
+
paths.addAll(listFiles(task.getPathPrefix().get()));
|
95
|
+
} else {
|
96
|
+
throw new IllegalArgumentException("Specify either 'path' or 'path_prefix'");
|
97
|
+
}
|
98
|
+
|
99
|
+
List<PartialFile> files = new ArrayList<PartialFile>();
|
100
|
+
for (String path : paths) {
|
101
|
+
long size = new File(path).length();
|
102
|
+
for (int i = 0; i < tasks; i++) {
|
103
|
+
long start = size * i / tasks;
|
104
|
+
long end = size * (i + 1) / tasks;
|
105
|
+
if (start < end) {
|
106
|
+
files.add(new PartialFile(path, start, end));
|
107
|
+
}
|
108
|
+
}
|
109
|
+
}
|
110
|
+
|
111
|
+
task.setFiles(files);
|
112
|
+
|
113
|
+
return resume(task.dump(), task.getFiles().size(), control);
|
114
|
+
}
|
115
|
+
|
116
|
+
@Override
|
117
|
+
public ConfigDiff resume(TaskSource taskSource,
|
118
|
+
int taskCount,
|
119
|
+
FileInputPlugin.Control control)
|
120
|
+
{
|
121
|
+
control.run(taskSource, taskCount);
|
122
|
+
|
123
|
+
return Exec.newConfigDiff();
|
124
|
+
}
|
125
|
+
|
126
|
+
@Override
|
127
|
+
public void cleanup(TaskSource taskSource,
|
128
|
+
int taskCount,
|
129
|
+
List<TaskReport> successTaskReports)
|
130
|
+
{ }
|
131
|
+
|
132
|
+
@Override
|
133
|
+
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
134
|
+
{
|
135
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
136
|
+
return new LocalFileSplitInput(task, taskIndex);
|
137
|
+
}
|
138
|
+
|
139
|
+
/**
|
140
|
+
* Most of this implementation is based on LocalFileInputPlugin hosted at:
|
141
|
+
* https://github.com/embulk/embulk
|
142
|
+
*
|
143
|
+
*/
|
144
|
+
private List<String> listFiles(String prefix)
|
145
|
+
{
|
146
|
+
final Path pathPrefix = Paths.get(prefix).normalize();
|
147
|
+
final Path directory;
|
148
|
+
final String fileNamePrefix;
|
149
|
+
if (Files.isDirectory(pathPrefix)) {
|
150
|
+
directory = pathPrefix;
|
151
|
+
fileNamePrefix = "";
|
152
|
+
} else {
|
153
|
+
fileNamePrefix = pathPrefix.getFileName().toString();
|
154
|
+
Path d = pathPrefix.getParent();
|
155
|
+
directory = (d == null ? CURRENT_DIR : d);
|
156
|
+
}
|
157
|
+
|
158
|
+
final ImmutableList.Builder<String> builder = ImmutableList.builder();
|
159
|
+
try {
|
160
|
+
int maxDepth = Integer.MAX_VALUE;
|
161
|
+
Set<FileVisitOption> opts = EnumSet.of(FileVisitOption.FOLLOW_LINKS);
|
162
|
+
|
163
|
+
Files.walkFileTree(directory, opts, maxDepth, new SimpleFileVisitor<Path>() {
|
164
|
+
@Override
|
165
|
+
public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs)
|
166
|
+
{
|
167
|
+
if (path.equals(directory)) {
|
168
|
+
return FileVisitResult.CONTINUE;
|
169
|
+
} else {
|
170
|
+
Path parent = path.getParent();
|
171
|
+
if (parent == null) {
|
172
|
+
parent = CURRENT_DIR;
|
173
|
+
}
|
174
|
+
if (parent.equals(directory)) {
|
175
|
+
if (path.getFileName().toString().startsWith(fileNamePrefix)) {
|
176
|
+
return FileVisitResult.CONTINUE;
|
177
|
+
} else {
|
178
|
+
return FileVisitResult.SKIP_SUBTREE;
|
179
|
+
}
|
180
|
+
} else {
|
181
|
+
return FileVisitResult.CONTINUE;
|
182
|
+
}
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|
186
|
+
@Override
|
187
|
+
public FileVisitResult visitFile(Path path, BasicFileAttributes attrs)
|
188
|
+
{
|
189
|
+
try {
|
190
|
+
// Exclude directories from the listing.
|
191
|
+
// Directories are normally unvisited with |FileVisitor#visitFile|, but symbolic links to
|
192
|
+
// directories are visited like files unless |FOLLOW_LINKS| is set in |Files#walkFileTree|.
|
193
|
+
// Symbolic links to directories are explicitly skipped here by checking with |Path#toRealPath|.
|
194
|
+
if (Files.isDirectory(path.toRealPath())) {
|
195
|
+
return FileVisitResult.CONTINUE;
|
196
|
+
}
|
197
|
+
} catch (IOException ex){
|
198
|
+
throw new RuntimeException("Can't resolve symbolic link", ex);
|
199
|
+
}
|
200
|
+
Path parent = path.getParent();
|
201
|
+
if (parent == null) {
|
202
|
+
parent = CURRENT_DIR;
|
203
|
+
}
|
204
|
+
if (parent.equals(directory)) {
|
205
|
+
if (path.getFileName().toString().startsWith(fileNamePrefix)) {
|
206
|
+
builder.add(path.toString());
|
207
|
+
return FileVisitResult.CONTINUE;
|
208
|
+
}
|
209
|
+
} else {
|
210
|
+
builder.add(path.toString());
|
211
|
+
}
|
212
|
+
return FileVisitResult.CONTINUE;
|
213
|
+
}
|
214
|
+
});
|
215
|
+
} catch (IOException ex) {
|
216
|
+
throw new RuntimeException(String.format("Failed get a list of local files at '%s'", directory), ex);
|
217
|
+
}
|
218
|
+
return builder.build();
|
219
|
+
}
|
220
|
+
|
221
|
+
public static class LocalFileSplitInput
|
222
|
+
extends InputStreamFileInput
|
223
|
+
implements TransactionalFileInput
|
224
|
+
{
|
225
|
+
public static class FileSplitProvider
|
226
|
+
implements InputStreamFileInput.Provider
|
227
|
+
{
|
228
|
+
private final PartialFile file;
|
229
|
+
private final boolean hasHeader;
|
230
|
+
private boolean opened = false;
|
231
|
+
|
232
|
+
public FileSplitProvider(PartialFile file, boolean hasHeader)
|
233
|
+
{
|
234
|
+
this.file = file;
|
235
|
+
this.hasHeader = hasHeader;
|
236
|
+
}
|
237
|
+
|
238
|
+
@Override
|
239
|
+
public InputStream openNext() throws IOException
|
240
|
+
{
|
241
|
+
if (opened) {
|
242
|
+
return null;
|
243
|
+
}
|
244
|
+
opened = true;
|
245
|
+
|
246
|
+
InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
|
247
|
+
if (file.getStart() > 0 && hasHeader) {
|
248
|
+
in = new SequenceInputStream(openHeader(file.getPath()), in);
|
249
|
+
}
|
250
|
+
return in;
|
251
|
+
}
|
252
|
+
|
253
|
+
@Override
|
254
|
+
public void close() { }
|
255
|
+
|
256
|
+
private InputStream openHeader(String path) throws IOException
|
257
|
+
{
|
258
|
+
ByteArrayOutputStream header = new ByteArrayOutputStream();
|
259
|
+
try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
|
260
|
+
while (true) {
|
261
|
+
int c = in.read();
|
262
|
+
if (c < 0) {
|
263
|
+
break;
|
264
|
+
}
|
265
|
+
|
266
|
+
header.write(c);
|
267
|
+
|
268
|
+
if (c == '\n') {
|
269
|
+
break;
|
270
|
+
}
|
271
|
+
|
272
|
+
if (c == '\r') {
|
273
|
+
int c2 = in.read();
|
274
|
+
if (c2 == '\n') {
|
275
|
+
header.write(c2);
|
276
|
+
}
|
277
|
+
break;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
}
|
281
|
+
header.close();
|
282
|
+
return new ByteArrayInputStream(header.toByteArray());
|
283
|
+
}
|
284
|
+
}
|
285
|
+
|
286
|
+
public LocalFileSplitInput(PluginTask task, int taskIndex)
|
287
|
+
{
|
288
|
+
super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
|
289
|
+
}
|
290
|
+
|
291
|
+
@Override
|
292
|
+
public void abort() { }
|
293
|
+
|
294
|
+
@Override
|
295
|
+
public TaskReport commit()
|
296
|
+
{
|
297
|
+
return Exec.newTaskReport();
|
298
|
+
}
|
299
|
+
}
|
300
|
+
}
|