embulk-input-filesplit 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +40 -0
- data/build.gradle +64 -0
- data/classpath/embulk-input-filesplit-0.1.1.jar +0 -0
- data/lib/embulk/input/filesplit.rb +3 -0
- data/src/main/java/org/embulk/input/filesplit/LocalFileSplitInputPlugin.java +187 -0
- data/src/main/java/org/embulk/input/filesplit/PartialFile.java +53 -0
- data/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java +154 -0
- data/src/test/java/org/embulk/input/filesplit/EmbulkPluginTester.java +95 -0
- data/src/test/java/org/embulk/input/filesplit/EmptyConfigSource.java +107 -0
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputPluginTest.java +94 -0
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputTest.java +81 -0
- data/src/test/java/org/embulk/input/filesplit/PartialFileInputStreamTest.java +570 -0
- data/src/test/resources/data/empty.csv +0 -0
- data/src/test/resources/data/test-header.csv +5 -0
- data/src/test/resources/data/test-only-header.csv +1 -0
- data/src/test/resources/data/test.csv +4 -0
- data/src/test/resources/resource.txt +0 -0
- data/src/test/resources/yml/test-header.yml +25 -0
- data/src/test/resources/yml/test-only-header.yml +25 -0
- data/src/test/resources/yml/test-tasks.yml +24 -0
- data/src/test/resources/yml/test.yml +23 -0
- metadata +64 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 16ce42eb0c98f4e0f5092b4ddf29b5d77954e106
|
4
|
+
data.tar.gz: 961132ebfbadeb9650f480e27946cf4de4b3763a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fdc723070b206a2d2bc1c9de1445f610acc10227c74e6fc13966521391fe430eca9d9d267b6e9dfabbe03f513fc810a5cb869ed88108cf9223d9402e123f406b
|
7
|
+
data.tar.gz: 220a17b5af68e4f6fe8c80f1d3f1a98cb3e3fc7ee6466ff85911fbb8dd4c145d2ce60cc362b6db15467d15458eef9411595eada0272ef799b41e8e2c2cf6ab04
|
data/README.md
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# Splitting input file plugin for Embulk
|
2
|
+
|
3
|
+
This Embulk plugin splits and inputs a text file.
|
4
|
+
Because the file is split into several parts, the input tasks run in multiple threads, which improves performance.
|
5
|
+
|
6
|
+
Lines of the text file should be separated by CR or LF or CRLF.
|
7
|
+
The plugin searches for line separators so that the file is split only at line boundaries.
|
8
|
+
|
9
|
+
## Overview
|
10
|
+
|
11
|
+
* **Plugin type**: input
|
12
|
+
|
13
|
+
## Configuration
|
14
|
+
|
15
|
+
- **path**: the path of a text file (string, required)
|
16
|
+
- **header_line**: whether the first line is a header or not (boolean, default: false)
|
17
|
+
- **tasks**: number of tasks (integer, default: number of available processors * 2)
|
18
|
+
|
19
|
+
### Example
|
20
|
+
|
21
|
+
```yaml
|
22
|
+
in:
|
23
|
+
type: filesplit
|
24
|
+
path: '/data/address.csv'
|
25
|
+
header_line: true
|
26
|
+
tasks: 4
|
27
|
+
parser:
|
28
|
+
charset: UTF-8
|
29
|
+
newline: CRLF
|
30
|
+
type: csv
|
31
|
+
header_line: true
|
32
|
+
delimiter: ','
|
33
|
+
...
|
34
|
+
```
|
35
|
+
|
36
|
+
### Build
|
37
|
+
|
38
|
+
```
|
39
|
+
$ ./gradle gem
|
40
|
+
```
|
data/build.gradle
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
// Gradle plugins used by this build. The JRuby plugin supplies the JRubyExec
// task type imported below.
plugins {
|
2
|
+
id 'com.jfrog.bintray' version '1.1'
|
3
|
+
id 'com.github.jruby-gradle.base' version '0.1.5'
|
4
|
+
id 'java'
|
5
|
+
}
|
6
|
+
import com.github.jrubygradle.JRubyExec
|
7
|
+
|
8
|
+
apply plugin: 'java'
|
9
|
+
apply plugin: 'com.github.jruby-gradle.base'
|
10
|
+
|
11
|
+
// Compile both main and test sources as UTF-8.
[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
|
12
|
+
|
13
|
+
// Version used for the gemspec and the .gem file name in the tasks of this script.
project.version = '0.1.1'
|
14
|
+
|
15
|
+
repositories {
|
16
|
+
mavenCentral()
|
17
|
+
jcenter()
|
18
|
+
}
|
19
|
+
|
20
|
+
// 'provided' collects dependencies that the 'classpath' task of this script
// subtracts from configurations.runtime before copying jars.
configurations {
|
21
|
+
provided
|
22
|
+
}
|
23
|
+
|
24
|
+
// embulk-core is listed under both 'compile' and 'provided': it is needed to
// compile, and being in 'provided' removes it from the jars copied to classpath/.
dependencies {
|
25
|
+
compile 'org.embulk:embulk-core:0.5.0'
|
26
|
+
provided 'org.embulk:embulk-core:0.5.0'
|
27
|
+
testCompile 'org.embulk:embulk-standards:0.5.0'
|
28
|
+
testCompile 'junit:junit:4.+'
|
29
|
+
}
|
30
|
+
|
31
|
+
// Rebuilds the classpath/ directory: deletes it first, then copies the runtime
// dependencies (minus everything in 'provided') plus this project's jar into it.
task classpath(type: Copy, dependsOn: ["jar"]) {
|
32
|
+
doFirst { file('classpath').deleteDir() }
|
33
|
+
from (configurations.runtime - configurations.provided + files(jar.archivePath))
|
34
|
+
into 'classpath'
|
35
|
+
}
|
36
|
+
// 'clean' additionally removes the generated classpath/ directory.
clean { delete 'classpath' }
|
37
|
+
|
38
|
+
// Builds the gem under JRuby after 'build', 'gemspec' and 'classpath' have run.
// NOTE(review): the arguments presumably amount to `gem build build/gemspec`
// via Gem::GemRunner; confirm against the JRubyExec plugin documentation.
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
|
39
|
+
jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
|
40
|
+
script "build/gemspec"
|
41
|
+
// Move the produced <name>-<version>.gem file into pkg/.
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
42
|
+
}
|
43
|
+
|
44
|
+
// Writes build/gemspec, a Ruby Gem::Specification for this plugin.
// The text between $/ and /$ is a Groovy dollar-slashy string, so ${project.name}
// and ${project.version} are interpolated when the file is written.
task gemspec << {
|
45
|
+
file('build').mkdirs();
|
46
|
+
file('build/gemspec').write($/
|
47
|
+
Gem::Specification.new do |spec|
|
48
|
+
spec.name = "${project.name}"
|
49
|
+
spec.version = "${project.version}"
|
50
|
+
spec.authors = ["Hitoshi Tanaka"]
|
51
|
+
spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
|
52
|
+
spec.summary = "Embulk plugin for splitting input file"
|
53
|
+
spec.licenses = ["Apache 2.0"]
|
54
|
+
spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
|
55
|
+
spec.test_files = spec.files.grep(%r"^(test|spec)/")
|
56
|
+
spec.require_paths = ["lib"]
|
57
|
+
end
|
58
|
+
/$)
|
59
|
+
}
|
60
|
+
|
61
|
+
|
62
|
+
// Runs `gem push` on pkg/embulk-input-filesplit-<version>.gem as an external
// process and waits for it to finish. The exit status returned by waitFor()
// is discarded, so a failed push does not fail this task.
task gempush << {
|
63
|
+
"gem push pkg/embulk-input-filesplit-${project.version}.gem".execute().waitFor()
|
64
|
+
}
|
Binary file
|
@@ -0,0 +1,187 @@
|
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.ByteArrayInputStream;
|
5
|
+
import java.io.ByteArrayOutputStream;
|
6
|
+
import java.io.File;
|
7
|
+
import java.io.FileInputStream;
|
8
|
+
import java.io.IOException;
|
9
|
+
import java.io.InputStream;
|
10
|
+
import java.io.SequenceInputStream;
|
11
|
+
import java.util.ArrayList;
|
12
|
+
import java.util.List;
|
13
|
+
|
14
|
+
import org.embulk.config.CommitReport;
|
15
|
+
import org.embulk.config.Config;
|
16
|
+
import org.embulk.config.ConfigDefault;
|
17
|
+
import org.embulk.config.ConfigDiff;
|
18
|
+
import org.embulk.config.ConfigInject;
|
19
|
+
import org.embulk.config.ConfigSource;
|
20
|
+
import org.embulk.config.Task;
|
21
|
+
import org.embulk.config.TaskSource;
|
22
|
+
import org.embulk.spi.BufferAllocator;
|
23
|
+
import org.embulk.spi.Exec;
|
24
|
+
import org.embulk.spi.FileInputPlugin;
|
25
|
+
import org.embulk.spi.TransactionalFileInput;
|
26
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
27
|
+
|
28
|
+
import com.google.common.base.Optional;
|
29
|
+
|
30
|
+
|
31
|
+
public class LocalFileSplitInputPlugin
|
32
|
+
implements FileInputPlugin
|
33
|
+
{
|
34
|
+
public interface PluginTask
|
35
|
+
extends Task
|
36
|
+
{
|
37
|
+
@Config("path")
|
38
|
+
public String getPath();
|
39
|
+
|
40
|
+
@Config("tasks")
|
41
|
+
@ConfigDefault("null")
|
42
|
+
public Optional<Integer> getTasks();
|
43
|
+
|
44
|
+
@Config("header_line")
|
45
|
+
@ConfigDefault("false")
|
46
|
+
public boolean getHeaderLine();
|
47
|
+
|
48
|
+
public List<PartialFile> getFiles();
|
49
|
+
public void setFiles(List<PartialFile> files);
|
50
|
+
|
51
|
+
@ConfigInject
|
52
|
+
public BufferAllocator getBufferAllocator();
|
53
|
+
}
|
54
|
+
|
55
|
+
@Override
|
56
|
+
public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
|
57
|
+
{
|
58
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
59
|
+
|
60
|
+
int tasks;
|
61
|
+
if (task.getTasks().isPresent()) {
|
62
|
+
tasks = task.getTasks().get();
|
63
|
+
if (tasks <= 0) {
|
64
|
+
throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
|
65
|
+
}
|
66
|
+
} else {
|
67
|
+
tasks = Runtime.getRuntime().availableProcessors() * 2;
|
68
|
+
}
|
69
|
+
|
70
|
+
long size = new File(task.getPath()).length();
|
71
|
+
List<PartialFile> files = new ArrayList<PartialFile>();
|
72
|
+
for (int i = 0; i < tasks; i++) {
|
73
|
+
long start = size * i / tasks;
|
74
|
+
long end = size * (i + 1) / tasks;
|
75
|
+
if (start < end) {
|
76
|
+
files.add(new PartialFile(task.getPath(), start, end));
|
77
|
+
}
|
78
|
+
}
|
79
|
+
|
80
|
+
task.setFiles(files);
|
81
|
+
|
82
|
+
return resume(task.dump(), task.getFiles().size(), control);
|
83
|
+
}
|
84
|
+
|
85
|
+
@Override
|
86
|
+
public ConfigDiff resume(TaskSource taskSource,
|
87
|
+
int taskCount,
|
88
|
+
FileInputPlugin.Control control)
|
89
|
+
{
|
90
|
+
control.run(taskSource, taskCount);
|
91
|
+
|
92
|
+
return Exec.newConfigDiff();
|
93
|
+
}
|
94
|
+
|
95
|
+
@Override
|
96
|
+
public void cleanup(TaskSource taskSource,
|
97
|
+
int taskCount,
|
98
|
+
List<CommitReport> successCommitReports)
|
99
|
+
{ }
|
100
|
+
|
101
|
+
@Override
|
102
|
+
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
103
|
+
{
|
104
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
105
|
+
return new LocalFileSplitInput(task, taskIndex);
|
106
|
+
}
|
107
|
+
|
108
|
+
public static class LocalFileSplitInput
|
109
|
+
extends InputStreamFileInput
|
110
|
+
implements TransactionalFileInput
|
111
|
+
{
|
112
|
+
public static class FileSplitProvider
|
113
|
+
implements InputStreamFileInput.Provider
|
114
|
+
{
|
115
|
+
private final PartialFile file;
|
116
|
+
private final boolean hasHeader;
|
117
|
+
private boolean opened = false;
|
118
|
+
|
119
|
+
public FileSplitProvider(PartialFile file, boolean hasHeader)
|
120
|
+
{
|
121
|
+
this.file = file;
|
122
|
+
this.hasHeader = hasHeader;
|
123
|
+
}
|
124
|
+
|
125
|
+
@Override
|
126
|
+
public InputStream openNext() throws IOException
|
127
|
+
{
|
128
|
+
if (opened) {
|
129
|
+
return null;
|
130
|
+
}
|
131
|
+
opened = true;
|
132
|
+
|
133
|
+
InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
|
134
|
+
if (file.getStart() > 0 && hasHeader) {
|
135
|
+
in = new SequenceInputStream(openHeader(file.getPath()), in);
|
136
|
+
}
|
137
|
+
return in;
|
138
|
+
}
|
139
|
+
|
140
|
+
@Override
|
141
|
+
public void close() { }
|
142
|
+
|
143
|
+
private InputStream openHeader(String path) throws IOException
|
144
|
+
{
|
145
|
+
ByteArrayOutputStream header = new ByteArrayOutputStream();
|
146
|
+
try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
|
147
|
+
while (true) {
|
148
|
+
int c = in.read();
|
149
|
+
if (c < 0) {
|
150
|
+
break;
|
151
|
+
}
|
152
|
+
|
153
|
+
header.write(c);
|
154
|
+
|
155
|
+
if (c == '\n') {
|
156
|
+
break;
|
157
|
+
}
|
158
|
+
|
159
|
+
if (c == '\r') {
|
160
|
+
int c2 = in.read();
|
161
|
+
if (c2 == '\n') {
|
162
|
+
header.write(c2);
|
163
|
+
}
|
164
|
+
break;
|
165
|
+
}
|
166
|
+
}
|
167
|
+
}
|
168
|
+
header.close();
|
169
|
+
return new ByteArrayInputStream(header.toByteArray());
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
public LocalFileSplitInput(PluginTask task, int taskIndex)
|
174
|
+
{
|
175
|
+
super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
|
176
|
+
}
|
177
|
+
|
178
|
+
@Override
|
179
|
+
public void abort() { }
|
180
|
+
|
181
|
+
@Override
|
182
|
+
public CommitReport commit()
|
183
|
+
{
|
184
|
+
return Exec.newCommitReport();
|
185
|
+
}
|
186
|
+
}
|
187
|
+
}
|
@@ -0,0 +1,53 @@
|
|
1
|
+
/*
|
2
|
+
* $Id: typical.epf 2627 2010-03-18 01:40:13Z tiba $
|
3
|
+
*/
|
4
|
+
package org.embulk.input.filesplit;
|
5
|
+
|
6
|
+
/**
 * One byte range of a file: the file's path together with a start and an end offset.
 *
 * <p>This is a plain mutable bean. The no-argument constructor and the setters
 * allow an instance to be filled in property by property.
 */
public class PartialFile
{
    private String path;
    private long start;
    private long end;

    /** Creates an instance with a {@code null} path and both offsets at 0. */
    public PartialFile() {
    }

    /**
     * Creates a fully initialised range.
     *
     * @param path  path of the file
     * @param start start offset of the range
     * @param end   end offset of the range
     */
    public PartialFile(String path, long start, long end)
    {
        this.path = path;
        this.start = start;
        this.end = end;
    }

    /** @return the path of the file */
    public String getPath()
    {
        return this.path;
    }

    /** @param path the path of the file */
    public void setPath(String path)
    {
        this.path = path;
    }

    /** @return the start offset of the range */
    public long getStart()
    {
        return this.start;
    }

    /** @param start the start offset of the range */
    public void setStart(long start)
    {
        this.start = start;
    }

    /** @return the end offset of the range */
    public long getEnd()
    {
        return this.end;
    }

    /** @param end the end offset of the range */
    public void setEnd(long end)
    {
        this.end = end;
    }
}
|
@@ -0,0 +1,154 @@
|
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.io.InputStream;
|
6
|
+
import java.io.PushbackInputStream;
|
7
|
+
|
8
|
+
|
9
|
+
/**
 * Stream over the lines of an underlying stream that begin inside the byte
 * range {@code [start, end)}.
 *
 * <p>The range is adjusted to line boundaries. When {@code start > 0}, bytes
 * are discarded up to and including the first line terminator found at offset
 * {@code start - 1} or later. Reading stops after the first line terminator
 * found at offset {@code end - 1} or later. LF, CR and CRLF are all accepted
 * as terminators.
 */
public class PartialFileInputStream extends InputStream
{
    private final PushbackInputStream original;
    private long start;
    private long end;
    /** Number of bytes consumed from {@code original}; -1 until the first read. */
    private long current;
    private boolean eof;

    /**
     * @param original stream positioned at offset 0 of the data
     * @param start    nominal start offset of the range
     * @param end      nominal end offset of the range
     */
    public PartialFileInputStream(InputStream original, long start, long end)
    {
        // One byte of pushback is enough for the CR/LF look-ahead in prefetch().
        this.original = new PushbackInputStream(new BufferedInputStream(original));
        this.start = start;
        this.end = end;
        current = -1;
    }

    @Override
    public int read(byte[] b) throws IOException
    {
        return read(b, 0, b.length);
    }

    /**
     * Reads up to {@code len} bytes. Once the nominal end has been reached,
     * the returned count is cut right after the line terminator that closes the range.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read(b, off, len);
        if (read < 0) {
            eof = true;
            return -1;
        }

        current += read;
        if (current >= end) {
            // b[off + i] sits at stream offset (current - read + i). Start scanning at
            // offset end - 1. Clamp while still a long so the int cast cannot wrap.
            int from = (int) Math.max(end - 1 - current + read, 0L);
            for (int i = from; i < read; i++) {
                if (b[off + i] == '\n') {
                    eof = true;
                    return i + 1;
                }

                if (b[off + i] == '\r') {
                    // Look at the byte after CR. It is in the buffer only when CR is not
                    // the last byte just read; otherwise peek at the stream instead.
                    int next = (i + 1 < read ? b[off + i + 1] : prefetch());
                    if (next != '\n') {
                        eof = true;
                        return i + 1;
                    }
                }
            }
        }

        return read;
    }

    @Override
    public int read() throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read();
        current++;

        if (read < 0) {
            eof = true;
            return -1;
        }

        if (current >= end) {
            // A lone CR ends the range; a CR followed by LF does not (the LF will).
            if (read == '\n' || read == '\r' && prefetch() != '\n') {
                eof = true;
            }
        }

        return read;
    }

    /**
     * Not supported.
     *
     * @throws IOException always
     */
    @Override
    public long skip(long n) throws IOException
    {
        throw new IOException("Skip not supported.");
    }

    @Override
    public int available() throws IOException
    {
        return 0;
    }

    @Override
    public void close() throws IOException
    {
        original.close();
    }

    /**
     * On first use, positions the stream at the first line that begins at or
     * after {@code start}, and marks the stream as exhausted when that position
     * is already at or beyond {@code end}.
     *
     * @throws IOException if the underlying stream ends before {@code start - 1}
     */
    private void initializeIfNeeded() throws IOException
    {
        if (current >= start) {
            return;
        }

        if (start == 0) {
            current = 0;
        } else {
            // Begin one byte early: when the byte at start - 1 is a line terminator,
            // the range already begins on a line boundary and nothing more is dropped.
            current = skipFully(--start);
            if (current != start) {
                throw new IOException("Cannot skip.");
            }

            int c;
            while ((c = original.read()) >= 0) {
                start++;
                current++;

                if (c == '\n' || c == '\r' && prefetch() != '\n') {
                    break;
                }
            }
        }

        if (start >= end) {
            eof = true;
        }
    }

    /**
     * Skips {@code n} bytes, retrying because a single {@code skip()} call may
     * skip fewer bytes than requested. Returns the number actually skipped,
     * which is less than {@code n} only when the stream ended.
     */
    private long skipFully(long n) throws IOException
    {
        long skipped = 0;
        while (skipped < n) {
            long step = original.skip(n - skipped);
            if (step > 0) {
                skipped += step;
                continue;
            }
            // skip() returned 0, which can mean "no progress" as well as end of stream.
            // Consume one byte to tell the two apart.
            if (original.read() < 0) {
                break;
            }
            skipped++;
        }
        return skipped;
    }

    /** Returns the next byte without consuming it, or -1 at end of stream. */
    private int prefetch() throws IOException
    {
        int c = original.read();
        if (c >= 0) {
            original.unread(c);
        }
        return c;
    }
}
|