embulk-input-filesplit 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +40 -0
- data/build.gradle +64 -0
- data/classpath/embulk-input-filesplit-0.1.1.jar +0 -0
- data/lib/embulk/input/filesplit.rb +3 -0
- data/src/main/java/org/embulk/input/filesplit/LocalFileSplitInputPlugin.java +187 -0
- data/src/main/java/org/embulk/input/filesplit/PartialFile.java +53 -0
- data/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java +154 -0
- data/src/test/java/org/embulk/input/filesplit/EmbulkPluginTester.java +95 -0
- data/src/test/java/org/embulk/input/filesplit/EmptyConfigSource.java +107 -0
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputPluginTest.java +94 -0
- data/src/test/java/org/embulk/input/filesplit/LocalFileSplitInputTest.java +81 -0
- data/src/test/java/org/embulk/input/filesplit/PartialFileInputStreamTest.java +570 -0
- data/src/test/resources/data/empty.csv +0 -0
- data/src/test/resources/data/test-header.csv +5 -0
- data/src/test/resources/data/test-only-header.csv +1 -0
- data/src/test/resources/data/test.csv +4 -0
- data/src/test/resources/resource.txt +0 -0
- data/src/test/resources/yml/test-header.yml +25 -0
- data/src/test/resources/yml/test-only-header.yml +25 -0
- data/src/test/resources/yml/test-tasks.yml +24 -0
- data/src/test/resources/yml/test.yml +23 -0
- metadata +64 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 16ce42eb0c98f4e0f5092b4ddf29b5d77954e106
|
4
|
+
data.tar.gz: 961132ebfbadeb9650f480e27946cf4de4b3763a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fdc723070b206a2d2bc1c9de1445f610acc10227c74e6fc13966521391fe430eca9d9d267b6e9dfabbe03f513fc810a5cb869ed88108cf9223d9402e123f406b
|
7
|
+
data.tar.gz: 220a17b5af68e4f6fe8c80f1d3f1a98cb3e3fc7ee6466ff85911fbb8dd4c145d2ce60cc362b6db15467d15458eef9411595eada0272ef799b41e8e2c2cf6ab04
|
data/README.md
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
# Splitting input file plugin for Embulk
|
2
|
+
|
3
|
+
This Embulk plugin splits and inputs a text file.
|
4
|
+
By splitting a file, input tasks are executed in multiple threads, which improves performance.
|
5
|
+
|
6
|
+
Lines of the text file should be separated by CR or LF or CRLF.
|
7
|
+
The plugin searches for line separators so that every split starts and ends on a line boundary.
|
8
|
+
|
9
|
+
## Overview
|
10
|
+
|
11
|
+
* **Plugin type**: input
|
12
|
+
|
13
|
+
## Configuration
|
14
|
+
|
15
|
+
- **path**: the path of a text file (string, required)
|
16
|
+
- **header_line**: whether the first line is a header or not (boolean, default: false)
|
17
|
+
- **tasks**: number of tasks (integer, default: number of available processors * 2)
|
18
|
+
|
19
|
+
### Example
|
20
|
+
|
21
|
+
```yaml
|
22
|
+
in:
|
23
|
+
type: filesplit
|
24
|
+
path: '/data/address.csv'
|
25
|
+
header_line: true
|
26
|
+
tasks: 4
|
27
|
+
parser:
|
28
|
+
charset: UTF-8
|
29
|
+
newline: CRLF
|
30
|
+
type: csv
|
31
|
+
header_line: true
|
32
|
+
delimiter: ','
|
33
|
+
...
|
34
|
+
```
|
35
|
+
|
36
|
+
### Build
|
37
|
+
|
38
|
+
```
|
39
|
+
$ ./gradle gem
|
40
|
+
```
|
data/build.gradle
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
plugins {
    id 'com.jfrog.bintray' version '1.1'
    id 'com.github.jruby-gradle.base' version '0.1.5'
    id 'java'
}
import com.github.jrubygradle.JRubyExec

// Both plugins are already requested in the plugins block above; these lines
// are kept as they were in the original script.
apply plugin: 'java'
apply plugin: 'com.github.jruby-gradle.base'

[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'

project.version = '0.1.1'

repositories {
    mavenCentral()
    jcenter()
}

configurations {
    provided
}

dependencies {
    // embulk-core is declared twice on purpose: 'compile' puts it on the
    // compile classpath, 'provided' lets the classpath task below subtract it
    // from the jars that are copied into the gem.
    compile 'org.embulk:embulk-core:0.5.0'
    provided 'org.embulk:embulk-core:0.5.0'
    testCompile 'org.embulk:embulk-standards:0.5.0'
    testCompile 'junit:junit:4.+'
}

// Copies the plugin jar and its runtime dependencies (minus 'provided') into ./classpath.
task classpath(type: Copy, dependsOn: ["jar"]) {
    doFirst { file('classpath').deleteDir() }
    from (configurations.runtime - configurations.provided + files(jar.archivePath))
    into 'classpath'
}
clean { delete 'classpath' }

// Runs "gem build build/gemspec" through JRuby and moves the resulting gem into ./pkg.
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
    jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
    script "build/gemspec"
    doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
}

// Generates build/gemspec from the project name and version.
task gemspec {
    doLast {
        file('build').mkdirs();
        file('build/gemspec').write($/
Gem::Specification.new do |spec|
    spec.name = "${project.name}"
    spec.version = "${project.version}"
    spec.authors = ["Hitoshi Tanaka"]
    spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
    spec.summary = "Embulk plugin for splitting input file"
    spec.licenses = ["Apache 2.0"]
    spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
    spec.test_files = spec.files.grep(%r"^(test|spec)/")
    spec.require_paths = ["lib"]
end
/$)
    }
}


// Pushes the gem found in ./pkg. The output of "gem push" is forwarded to the
// console and a non-zero exit status fails the build instead of being ignored.
task gempush {
    doLast {
        def push = "gem push pkg/embulk-input-filesplit-${project.version}.gem".execute()
        // Drain stdout/stderr while waiting so the child cannot block on a full pipe.
        push.waitForProcessOutput(System.out, System.err)
        if (push.exitValue() != 0) {
            throw new GradleException("gem push failed with exit code ${push.exitValue()}")
        }
    }
}
|
Binary file
|
@@ -0,0 +1,187 @@
|
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.ByteArrayInputStream;
|
5
|
+
import java.io.ByteArrayOutputStream;
|
6
|
+
import java.io.File;
|
7
|
+
import java.io.FileInputStream;
|
8
|
+
import java.io.IOException;
|
9
|
+
import java.io.InputStream;
|
10
|
+
import java.io.SequenceInputStream;
|
11
|
+
import java.util.ArrayList;
|
12
|
+
import java.util.List;
|
13
|
+
|
14
|
+
import org.embulk.config.CommitReport;
|
15
|
+
import org.embulk.config.Config;
|
16
|
+
import org.embulk.config.ConfigDefault;
|
17
|
+
import org.embulk.config.ConfigDiff;
|
18
|
+
import org.embulk.config.ConfigInject;
|
19
|
+
import org.embulk.config.ConfigSource;
|
20
|
+
import org.embulk.config.Task;
|
21
|
+
import org.embulk.config.TaskSource;
|
22
|
+
import org.embulk.spi.BufferAllocator;
|
23
|
+
import org.embulk.spi.Exec;
|
24
|
+
import org.embulk.spi.FileInputPlugin;
|
25
|
+
import org.embulk.spi.TransactionalFileInput;
|
26
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
27
|
+
|
28
|
+
import com.google.common.base.Optional;
|
29
|
+
|
30
|
+
|
31
|
+
public class LocalFileSplitInputPlugin
|
32
|
+
implements FileInputPlugin
|
33
|
+
{
|
34
|
+
public interface PluginTask
|
35
|
+
extends Task
|
36
|
+
{
|
37
|
+
@Config("path")
|
38
|
+
public String getPath();
|
39
|
+
|
40
|
+
@Config("tasks")
|
41
|
+
@ConfigDefault("null")
|
42
|
+
public Optional<Integer> getTasks();
|
43
|
+
|
44
|
+
@Config("header_line")
|
45
|
+
@ConfigDefault("false")
|
46
|
+
public boolean getHeaderLine();
|
47
|
+
|
48
|
+
public List<PartialFile> getFiles();
|
49
|
+
public void setFiles(List<PartialFile> files);
|
50
|
+
|
51
|
+
@ConfigInject
|
52
|
+
public BufferAllocator getBufferAllocator();
|
53
|
+
}
|
54
|
+
|
55
|
+
@Override
|
56
|
+
public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
|
57
|
+
{
|
58
|
+
PluginTask task = config.loadConfig(PluginTask.class);
|
59
|
+
|
60
|
+
int tasks;
|
61
|
+
if (task.getTasks().isPresent()) {
|
62
|
+
tasks = task.getTasks().get();
|
63
|
+
if (tasks <= 0) {
|
64
|
+
throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
|
65
|
+
}
|
66
|
+
} else {
|
67
|
+
tasks = Runtime.getRuntime().availableProcessors() * 2;
|
68
|
+
}
|
69
|
+
|
70
|
+
long size = new File(task.getPath()).length();
|
71
|
+
List<PartialFile> files = new ArrayList<PartialFile>();
|
72
|
+
for (int i = 0; i < tasks; i++) {
|
73
|
+
long start = size * i / tasks;
|
74
|
+
long end = size * (i + 1) / tasks;
|
75
|
+
if (start < end) {
|
76
|
+
files.add(new PartialFile(task.getPath(), start, end));
|
77
|
+
}
|
78
|
+
}
|
79
|
+
|
80
|
+
task.setFiles(files);
|
81
|
+
|
82
|
+
return resume(task.dump(), task.getFiles().size(), control);
|
83
|
+
}
|
84
|
+
|
85
|
+
@Override
|
86
|
+
public ConfigDiff resume(TaskSource taskSource,
|
87
|
+
int taskCount,
|
88
|
+
FileInputPlugin.Control control)
|
89
|
+
{
|
90
|
+
control.run(taskSource, taskCount);
|
91
|
+
|
92
|
+
return Exec.newConfigDiff();
|
93
|
+
}
|
94
|
+
|
95
|
+
@Override
|
96
|
+
public void cleanup(TaskSource taskSource,
|
97
|
+
int taskCount,
|
98
|
+
List<CommitReport> successCommitReports)
|
99
|
+
{ }
|
100
|
+
|
101
|
+
@Override
|
102
|
+
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
103
|
+
{
|
104
|
+
PluginTask task = taskSource.loadTask(PluginTask.class);
|
105
|
+
return new LocalFileSplitInput(task, taskIndex);
|
106
|
+
}
|
107
|
+
|
108
|
+
public static class LocalFileSplitInput
|
109
|
+
extends InputStreamFileInput
|
110
|
+
implements TransactionalFileInput
|
111
|
+
{
|
112
|
+
public static class FileSplitProvider
|
113
|
+
implements InputStreamFileInput.Provider
|
114
|
+
{
|
115
|
+
private final PartialFile file;
|
116
|
+
private final boolean hasHeader;
|
117
|
+
private boolean opened = false;
|
118
|
+
|
119
|
+
public FileSplitProvider(PartialFile file, boolean hasHeader)
|
120
|
+
{
|
121
|
+
this.file = file;
|
122
|
+
this.hasHeader = hasHeader;
|
123
|
+
}
|
124
|
+
|
125
|
+
@Override
|
126
|
+
public InputStream openNext() throws IOException
|
127
|
+
{
|
128
|
+
if (opened) {
|
129
|
+
return null;
|
130
|
+
}
|
131
|
+
opened = true;
|
132
|
+
|
133
|
+
InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
|
134
|
+
if (file.getStart() > 0 && hasHeader) {
|
135
|
+
in = new SequenceInputStream(openHeader(file.getPath()), in);
|
136
|
+
}
|
137
|
+
return in;
|
138
|
+
}
|
139
|
+
|
140
|
+
@Override
|
141
|
+
public void close() { }
|
142
|
+
|
143
|
+
private InputStream openHeader(String path) throws IOException
|
144
|
+
{
|
145
|
+
ByteArrayOutputStream header = new ByteArrayOutputStream();
|
146
|
+
try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
|
147
|
+
while (true) {
|
148
|
+
int c = in.read();
|
149
|
+
if (c < 0) {
|
150
|
+
break;
|
151
|
+
}
|
152
|
+
|
153
|
+
header.write(c);
|
154
|
+
|
155
|
+
if (c == '\n') {
|
156
|
+
break;
|
157
|
+
}
|
158
|
+
|
159
|
+
if (c == '\r') {
|
160
|
+
int c2 = in.read();
|
161
|
+
if (c2 == '\n') {
|
162
|
+
header.write(c2);
|
163
|
+
}
|
164
|
+
break;
|
165
|
+
}
|
166
|
+
}
|
167
|
+
}
|
168
|
+
header.close();
|
169
|
+
return new ByteArrayInputStream(header.toByteArray());
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
public LocalFileSplitInput(PluginTask task, int taskIndex)
|
174
|
+
{
|
175
|
+
super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
|
176
|
+
}
|
177
|
+
|
178
|
+
@Override
|
179
|
+
public void abort() { }
|
180
|
+
|
181
|
+
@Override
|
182
|
+
public CommitReport commit()
|
183
|
+
{
|
184
|
+
return Exec.newCommitReport();
|
185
|
+
}
|
186
|
+
}
|
187
|
+
}
|
@@ -0,0 +1,53 @@
|
|
1
|
+
/*
|
2
|
+
* $Id: typical.epf 2627 2010-03-18 01:40:13Z tiba $
|
3
|
+
*/
|
4
|
+
package org.embulk.input.filesplit;
|
5
|
+
|
6
|
+
/**
 * Mutable value holder describing one byte range of a file: the file path
 * plus a start and an end offset.
 */
public class PartialFile
{
    private String path;
    private long start;
    private long end;

    /** Creates an instance with no path and both offsets set to 0. */
    public PartialFile()
    {
    }

    /**
     * Creates an instance for the given path and offsets.
     *
     * @param path  path of the file
     * @param start start offset of the range
     * @param end   end offset of the range
     */
    public PartialFile(String path, long start, long end)
    {
        setPath(path);
        setStart(start);
        setEnd(end);
    }

    /** @return the file path */
    public final String getPath()
    {
        return this.path;
    }

    /** @param path the file path */
    public final void setPath(String path)
    {
        this.path = path;
    }

    /** @return the start offset */
    public final long getStart()
    {
        return this.start;
    }

    /** @param start the start offset */
    public final void setStart(long start)
    {
        this.start = start;
    }

    /** @return the end offset */
    public final long getEnd()
    {
        return this.end;
    }

    /** @param end the end offset */
    public final void setEnd(long end)
    {
        this.end = end;
    }
}
|
@@ -0,0 +1,154 @@
|
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.io.InputStream;
|
6
|
+
import java.io.PushbackInputStream;
|
7
|
+
|
8
|
+
|
9
|
+
/**
 * Stream over the lines of a byte range {@code [start, end)} of an underlying
 * stream, where lines end with LF, CR or CRLF.
 *
 * <p>On first use the stream positions itself: for {@code start == 0} it
 * begins at offset 0, otherwise it skips to {@code start - 1} and then
 * discards bytes up to and including the next line terminator, so reading
 * begins at the start of a line. Reading continues past {@code end} until the
 * terminator of the line that contains offset {@code end - 1} has been
 * returned; after that the stream reports end of file.
 */
public class PartialFileInputStream extends InputStream
{
    private final PushbackInputStream original;
    // Adjusted during initialization to the offset of the first byte actually served.
    private long start;
    private final long end;
    // Number of bytes consumed from the underlying stream; -1 until initialized.
    private long current;
    private boolean eof;

    /**
     * @param original underlying stream, positioned at offset 0
     * @param start    offset at which the range begins
     * @param end      offset at which the range ends (exclusive)
     */
    public PartialFileInputStream(InputStream original, long start, long end)
    {
        this.original = new PushbackInputStream(new BufferedInputStream(original));
        this.start = start;
        this.end = end;
        current = -1;
    }

    @Override
    public int read(byte[] b) throws IOException
    {
        return read(b, 0, b.length);
    }

    /**
     * Reads up to {@code len} bytes. When the data read reaches {@code end},
     * the result is cut off right after the first line terminator found at or
     * after offset {@code end - 1}, and the stream is marked as finished.
     *
     * @return number of bytes stored in {@code b}, or -1 at end of range
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read(b, off, len);
        if (read < 0) {
            eof = true;
            return -1;
        }

        current += read;
        if (current >= end) {
            // Index within this chunk of the byte at offset end - 1; clamp in
            // long arithmetic so that a large (current - end) cannot wrap the int.
            int first = (int) Math.max(end - 1 - current + read, 0L);
            for (int i = first; i < read; i++) {
                byte value = b[off + i];
                if (value == '\n') {
                    eof = true;
                    return i + 1;
                }

                if (value == '\r') {
                    // Look at the byte after the CR. It is in the buffer only when
                    // the CR is not the last byte read; otherwise peek at the stream.
                    int next = (i + 1 < read ? b[off + i + 1] : prefetch());
                    if (next != '\n') {
                        eof = true;
                        return i + 1;
                    }
                }
            }
        }

        return read;
    }

    /**
     * Reads one byte, applying the same end-of-range rule as
     * {@link #read(byte[], int, int)}.
     */
    @Override
    public int read() throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read();
        current++;

        if (read < 0) {
            eof = true;
            return -1;
        }

        if (current >= end) {
            if (read == '\n' || read == '\r' && prefetch() != '\n') {
                eof = true;
            }
        }

        return read;
    }

    /** Always fails: skipping is not supported by this stream. */
    @Override
    public long skip(long n) throws IOException
    {
        throw new IOException("Skip not supported.");
    }

    /** Always returns 0. */
    @Override
    public int available() throws IOException
    {
        return 0;
    }

    @Override
    public void close() throws IOException
    {
        original.close();
    }

    /**
     * Positions the underlying stream at the first line of the range on the
     * first call; later calls return immediately because {@code current} is
     * then at least {@code start}.
     */
    private void initializeIfNeeded() throws IOException
    {
        if (current >= start) {
            return;
        }

        if (start == 0) {
            current = 0;
        } else {
            // Begin one byte early so that a terminator located exactly at
            // start - 1 is recognised as the end of the previous line.
            start--;
            current = skipFully(start);
            if (current != start) {
                throw new IOException("Cannot skip.");
            }

            int c;
            while ((c = original.read()) >= 0) {
                start++;
                current++;

                if (c == '\n' || c == '\r' && prefetch() != '\n') {
                    break;
                }
            }
        }

        if (start >= end) {
            eof = true;
        }
    }

    /**
     * Skips {@code count} bytes of the underlying stream, retrying when a
     * single {@code skip} call skips fewer bytes than requested.
     *
     * @return number of bytes skipped; less than {@code count} only if the stream ended
     */
    private long skipFully(long count) throws IOException
    {
        long skipped = 0;
        while (skipped < count) {
            long n = original.skip(count - skipped);
            if (n > 0) {
                skipped += n;
                continue;
            }
            // skip() made no progress: consume one byte to tell "nothing
            // skipped" apart from end of stream.
            if (original.read() < 0) {
                break;
            }
            skipped++;
        }
        return skipped;
    }

    /** Returns the next byte without consuming it, or -1 at end of stream. */
    private int prefetch() throws IOException
    {
        int c = original.read();
        if (c >= 0) {
            original.unread(c);
        }
        return c;
    }
}
|