embulk-input-filesplit 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 16ce42eb0c98f4e0f5092b4ddf29b5d77954e106
4
+ data.tar.gz: 961132ebfbadeb9650f480e27946cf4de4b3763a
5
+ SHA512:
6
+ metadata.gz: fdc723070b206a2d2bc1c9de1445f610acc10227c74e6fc13966521391fe430eca9d9d267b6e9dfabbe03f513fc810a5cb869ed88108cf9223d9402e123f406b
7
+ data.tar.gz: 220a17b5af68e4f6fe8c80f1d3f1a98cb3e3fc7ee6466ff85911fbb8dd4c145d2ce60cc362b6db15467d15458eef9411595eada0272ef799b41e8e2c2cf6ab04
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # Splitting input file plugin for Embulk
2
+
3
+ This Embulk plugin splits and inputs a text file.
4
+ Splitting the file lets the input tasks run in multiple threads, which improves performance.
5
+
6
+ Lines of the text file must be separated by CR, LF, or CRLF.
7
+ The plugin searches for line separators so that the file is split only at line boundaries.
8
+
9
+ ## Overview
10
+
11
+ * **Plugin type**: input
12
+
13
+ ## Configuration
14
+
15
+ - **path**: the path of a text file (string, required)
16
+ - **header_line**: whether the first line is a header or not (boolean, default: false)
17
+ - **tasks**: number of tasks (integer, default: number of available processors * 2)
18
+
19
+ ### Example
20
+
21
+ ```yaml
22
+ in:
23
+ type: filesplit
24
+ path: '/data/address.csv'
25
+ header_line: true
26
+ tasks: 4
27
+ parser:
28
+ charset: UTF-8
29
+ newline: CRLF
30
+ type: csv
31
+ header_line: true
32
+ delimiter: ','
33
+ ...
34
+ ```
35
+
36
+ ### Build
37
+
38
+ ```
39
+ $ ./gradlew gem
40
+ ```
data/build.gradle ADDED
@@ -0,0 +1,64 @@
1
// Build script for the embulk-input-filesplit plugin: compiles the Java sources,
// gathers the jars the plugin needs at run time into ./classpath, and packages
// the result as a gem.
plugins {
    id 'com.jfrog.bintray' version '1.1'
    id 'com.github.jruby-gradle.base' version '0.1.5'
    id 'java'
}
import com.github.jrubygradle.JRubyExec

apply plugin: 'java'
apply plugin: 'com.github.jruby-gradle.base'

[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'

project.version = '0.1.1'

repositories {
    mavenCentral()
    jcenter()
}

configurations {
    // Dependencies listed here are subtracted from the jars copied into ./classpath.
    provided
}

dependencies {
    compile 'org.embulk:embulk-core:0.5.0'
    provided 'org.embulk:embulk-core:0.5.0'
    testCompile 'org.embulk:embulk-standards:0.5.0'
    testCompile 'junit:junit:4.+'
}

// Recreates ./classpath with the runtime jars (minus 'provided') plus the plugin jar.
task classpath(type: Copy, dependsOn: ["jar"]) {
    doFirst { file('classpath').deleteDir() }
    from (configurations.runtime - configurations.provided + files(jar.archivePath))
    into 'classpath'
}
clean { delete 'classpath' }

// Builds the gem from the generated gemspec and moves it into ./pkg.
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
    jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
    script "build/gemspec"
    doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
}

// Writes build/gemspec; project name and version are interpolated by Groovy.
task gemspec << {
    file('build').mkdirs();
    file('build/gemspec').write($/
Gem::Specification.new do |spec|
spec.name = "${project.name}"
spec.version = "${project.version}"
spec.authors = ["Hitoshi Tanaka"]
spec.homepage = "https://github.com/hito4t/embulk-input-filesplit"
spec.summary = "Embulk plugin for splitting input file"
spec.licenses = ["Apache 2.0"]
spec.files = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
spec.test_files = spec.files.grep(%r"^(test|spec)/")
spec.require_paths = ["lib"]
end
/$)
}


// Pushes the built gem. The child's stdout/stderr are forwarded while waiting,
// so a full pipe buffer cannot stall the process, and a non-zero exit status
// fails the task instead of being silently ignored.
task gempush << {
    def process = "gem push pkg/embulk-input-filesplit-${project.version}.gem".execute()
    process.waitForProcessOutput(System.out, System.err)
    if (process.exitValue() != 0) {
        throw new GradleException("gem push failed with exit code ${process.exitValue()}")
    }
}
@@ -0,0 +1,3 @@
1
# Registers the Java class below as the :filesplit input plugin. The jars are
# loaded from the "classpath" directory resolved four levels above this file.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_input(
  :filesplit,
  "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
  classpath_dir
)
@@ -0,0 +1,187 @@
1
+ package org.embulk.input.filesplit;
2
+
3
+ import java.io.BufferedInputStream;
4
+ import java.io.ByteArrayInputStream;
5
+ import java.io.ByteArrayOutputStream;
6
+ import java.io.File;
7
+ import java.io.FileInputStream;
8
+ import java.io.IOException;
9
+ import java.io.InputStream;
10
+ import java.io.SequenceInputStream;
11
+ import java.util.ArrayList;
12
+ import java.util.List;
13
+
14
+ import org.embulk.config.CommitReport;
15
+ import org.embulk.config.Config;
16
+ import org.embulk.config.ConfigDefault;
17
+ import org.embulk.config.ConfigDiff;
18
+ import org.embulk.config.ConfigInject;
19
+ import org.embulk.config.ConfigSource;
20
+ import org.embulk.config.Task;
21
+ import org.embulk.config.TaskSource;
22
+ import org.embulk.spi.BufferAllocator;
23
+ import org.embulk.spi.Exec;
24
+ import org.embulk.spi.FileInputPlugin;
25
+ import org.embulk.spi.TransactionalFileInput;
26
+ import org.embulk.spi.util.InputStreamFileInput;
27
+
28
+ import com.google.common.base.Optional;
29
+
30
+
31
+ public class LocalFileSplitInputPlugin
32
+ implements FileInputPlugin
33
+ {
34
+ public interface PluginTask
35
+ extends Task
36
+ {
37
+ @Config("path")
38
+ public String getPath();
39
+
40
+ @Config("tasks")
41
+ @ConfigDefault("null")
42
+ public Optional<Integer> getTasks();
43
+
44
+ @Config("header_line")
45
+ @ConfigDefault("false")
46
+ public boolean getHeaderLine();
47
+
48
+ public List<PartialFile> getFiles();
49
+ public void setFiles(List<PartialFile> files);
50
+
51
+ @ConfigInject
52
+ public BufferAllocator getBufferAllocator();
53
+ }
54
+
55
+ @Override
56
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
57
+ {
58
+ PluginTask task = config.loadConfig(PluginTask.class);
59
+
60
+ int tasks;
61
+ if (task.getTasks().isPresent()) {
62
+ tasks = task.getTasks().get();
63
+ if (tasks <= 0) {
64
+ throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
65
+ }
66
+ } else {
67
+ tasks = Runtime.getRuntime().availableProcessors() * 2;
68
+ }
69
+
70
+ long size = new File(task.getPath()).length();
71
+ List<PartialFile> files = new ArrayList<PartialFile>();
72
+ for (int i = 0; i < tasks; i++) {
73
+ long start = size * i / tasks;
74
+ long end = size * (i + 1) / tasks;
75
+ if (start < end) {
76
+ files.add(new PartialFile(task.getPath(), start, end));
77
+ }
78
+ }
79
+
80
+ task.setFiles(files);
81
+
82
+ return resume(task.dump(), task.getFiles().size(), control);
83
+ }
84
+
85
+ @Override
86
+ public ConfigDiff resume(TaskSource taskSource,
87
+ int taskCount,
88
+ FileInputPlugin.Control control)
89
+ {
90
+ control.run(taskSource, taskCount);
91
+
92
+ return Exec.newConfigDiff();
93
+ }
94
+
95
+ @Override
96
+ public void cleanup(TaskSource taskSource,
97
+ int taskCount,
98
+ List<CommitReport> successCommitReports)
99
+ { }
100
+
101
+ @Override
102
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
103
+ {
104
+ PluginTask task = taskSource.loadTask(PluginTask.class);
105
+ return new LocalFileSplitInput(task, taskIndex);
106
+ }
107
+
108
+ public static class LocalFileSplitInput
109
+ extends InputStreamFileInput
110
+ implements TransactionalFileInput
111
+ {
112
+ public static class FileSplitProvider
113
+ implements InputStreamFileInput.Provider
114
+ {
115
+ private final PartialFile file;
116
+ private final boolean hasHeader;
117
+ private boolean opened = false;
118
+
119
+ public FileSplitProvider(PartialFile file, boolean hasHeader)
120
+ {
121
+ this.file = file;
122
+ this.hasHeader = hasHeader;
123
+ }
124
+
125
+ @Override
126
+ public InputStream openNext() throws IOException
127
+ {
128
+ if (opened) {
129
+ return null;
130
+ }
131
+ opened = true;
132
+
133
+ InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
134
+ if (file.getStart() > 0 && hasHeader) {
135
+ in = new SequenceInputStream(openHeader(file.getPath()), in);
136
+ }
137
+ return in;
138
+ }
139
+
140
+ @Override
141
+ public void close() { }
142
+
143
+ private InputStream openHeader(String path) throws IOException
144
+ {
145
+ ByteArrayOutputStream header = new ByteArrayOutputStream();
146
+ try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
147
+ while (true) {
148
+ int c = in.read();
149
+ if (c < 0) {
150
+ break;
151
+ }
152
+
153
+ header.write(c);
154
+
155
+ if (c == '\n') {
156
+ break;
157
+ }
158
+
159
+ if (c == '\r') {
160
+ int c2 = in.read();
161
+ if (c2 == '\n') {
162
+ header.write(c2);
163
+ }
164
+ break;
165
+ }
166
+ }
167
+ }
168
+ header.close();
169
+ return new ByteArrayInputStream(header.toByteArray());
170
+ }
171
+ }
172
+
173
+ public LocalFileSplitInput(PluginTask task, int taskIndex)
174
+ {
175
+ super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
176
+ }
177
+
178
+ @Override
179
+ public void abort() { }
180
+
181
+ @Override
182
+ public CommitReport commit()
183
+ {
184
+ return Exec.newCommitReport();
185
+ }
186
+ }
187
+ }
@@ -0,0 +1,53 @@
1
+ /*
2
+ * $Id: typical.epf 2627 2010-03-18 01:40:13Z tiba $
3
+ */
4
+ package org.embulk.input.filesplit;
5
+
6
/**
 * Mutable bean naming a byte range of a file: the file's path plus a start
 * and an end offset.
 */
public class PartialFile
{
    private String path;
    private long start;
    private long end;

    /**
     * Creates an empty instance; the path is {@code null} and both offsets are 0
     * until the setters are called.
     */
    public PartialFile() {
    }

    /**
     * Creates a range over {@code path} running from {@code start} to {@code end}.
     */
    public PartialFile(String path, long start, long end)
    {
        this.path = path;
        this.start = start;
        this.end = end;
    }

    /** Returns the path of the file. */
    public String getPath()
    {
        return this.path;
    }

    /** Replaces the path of the file. */
    public void setPath(String path)
    {
        this.path = path;
    }

    /** Returns the start offset of the range. */
    public long getStart()
    {
        return this.start;
    }

    /** Replaces the start offset of the range. */
    public void setStart(long start)
    {
        this.start = start;
    }

    /** Returns the end offset of the range. */
    public long getEnd()
    {
        return this.end;
    }

    /** Replaces the end offset of the range. */
    public void setEnd(long end)
    {
        this.end = end;
    }
}
@@ -0,0 +1,154 @@
1
+ package org.embulk.input.filesplit;
2
+
3
+ import java.io.BufferedInputStream;
4
+ import java.io.IOException;
5
+ import java.io.InputStream;
6
+ import java.io.PushbackInputStream;
7
+
8
+
9
/**
 * Input stream that exposes the part of an underlying stream belonging to the
 * byte range {@code [start, end)}, with both boundaries moved to line separators
 * (LF, CR or CRLF).
 *
 * <p>When {@code start} is greater than 0, reading begins after the first line
 * separator found at or after offset {@code start - 1}. Reading stops after the
 * first line separator that ends at or after offset {@code end - 1}, or at the end
 * of the underlying stream. Consecutive ranges over the same data therefore yield
 * every line exactly once.
 */
public class PartialFileInputStream extends InputStream
{
    private final PushbackInputStream original;
    private long start;
    private final long end;
    // Number of bytes consumed from the underlying stream so far.
    private long current;
    private boolean eof;
    private boolean initialized;

    /**
     * @param original stream positioned at offset 0 of the data
     * @param start nominal offset of the first byte of the range
     * @param end nominal offset just past the last byte of the range
     */
    public PartialFileInputStream(InputStream original, long start, long end)
    {
        this.original = new PushbackInputStream(new BufferedInputStream(original));
        this.start = start;
        this.end = end;
        current = -1;
    }

    @Override
    public int read(byte[] b) throws IOException
    {
        return read(b, 0, b.length);
    }

    /**
     * Reads up to {@code len} bytes of the range into {@code b}.
     *
     * @return the number of bytes that belong to the range, or -1 once the range is exhausted
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read(b, off, len);
        if (read < 0) {
            eof = true;
            return -1;
        }

        current += read;
        if (current >= end) {
            // Index within this chunk of the byte at offset end - 1; only a line
            // separator at or after that byte terminates the range.
            int from = (int) Math.max(end - 1 - (current - read), 0L);
            for (int i = from; i < read; i++) {
                byte c = b[off + i];
                if (c == '\n') {
                    eof = true;
                    return i + 1;
                }

                if (c == '\r') {
                    // The byte after CR is in the chunk only if CR is not its last
                    // byte; otherwise peek at the underlying stream.
                    int next = (i + 1 < read) ? b[off + i + 1] : prefetch();
                    if (next != '\n') {
                        eof = true;
                        return i + 1;
                    }
                    // CRLF: the LF is handled by the next iteration or the next read.
                }
            }
        }

        return read;
    }

    /**
     * Reads the next byte of the range.
     *
     * @return the byte as 0-255, or -1 once the range is exhausted
     */
    @Override
    public int read() throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read();
        if (read < 0) {
            eof = true;
            return -1;
        }
        current++;

        if (current >= end && isLineEnd(read)) {
            eof = true;
        }

        return read;
    }

    /** Skipping is not supported because it would bypass the line-boundary tracking. */
    @Override
    public long skip(long n) throws IOException
    {
        throw new IOException("Skip not supported.");
    }

    @Override
    public int available() throws IOException
    {
        return 0;
    }

    @Override
    public void close() throws IOException
    {
        original.close();
    }

    /**
     * On first use, positions the underlying stream at the first line start at or
     * after {@code start} and marks the range as exhausted if that position is
     * already at or beyond {@code end}.
     */
    private void initializeIfNeeded() throws IOException
    {
        if (initialized) {
            return;
        }

        long position = 0;
        if (start > 0) {
            // Begin one byte early: if the byte at start - 1 ends a line, the range
            // begins exactly at start; otherwise the partial line is skipped.
            position = start - 1;
            skipFully(position);

            int c;
            while ((c = original.read()) >= 0) {
                position++;
                if (isLineEnd(c)) {
                    break;
                }
            }
        }

        start = position;
        current = position;
        if (start >= end) {
            eof = true;
        }
        initialized = true;
    }

    /**
     * Discards exactly {@code count} bytes. {@link InputStream#skip} may skip fewer
     * bytes than requested, so it is retried, falling back to single-byte reads.
     *
     * @throws IOException if the stream ends before {@code count} bytes were discarded
     */
    private void skipFully(long count) throws IOException
    {
        long remaining = count;
        while (remaining > 0) {
            long skipped = original.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (original.read() >= 0) {
                remaining--;
            } else {
                throw new IOException("Cannot skip.");
            }
        }
    }

    /** Returns whether {@code c} completes a line: LF, or a CR not followed by LF. */
    private boolean isLineEnd(int c) throws IOException
    {
        return c == '\n' || c == '\r' && prefetch() != '\n';
    }

    /** Returns the next byte of the underlying stream without consuming it, or -1 at its end. */
    private int prefetch() throws IOException
    {
        int c = original.read();
        if (c >= 0) {
            original.unread(c);
        }
        return c;
    }
}