embulk-input-filesplit 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 16ce42eb0c98f4e0f5092b4ddf29b5d77954e106
4
+ data.tar.gz: 961132ebfbadeb9650f480e27946cf4de4b3763a
5
+ SHA512:
6
+ metadata.gz: fdc723070b206a2d2bc1c9de1445f610acc10227c74e6fc13966521391fe430eca9d9d267b6e9dfabbe03f513fc810a5cb869ed88108cf9223d9402e123f406b
7
+ data.tar.gz: 220a17b5af68e4f6fe8c80f1d3f1a98cb3e3fc7ee6466ff85911fbb8dd4c145d2ce60cc362b6db15467d15458eef9411595eada0272ef799b41e8e2c2cf6ab04
data/README.md ADDED
@@ -0,0 +1,40 @@
1
+ # Splitting input file plugin for Embulk
2
+
3
+ This Embulk plugin splits and inputs a text file.
4
+ Because the file is split, input tasks run in multiple threads, which improves performance.
5
+
6
+ Lines of the text file should be separated by CR or LF or CRLF.
7
+ The plugin searches for line separators so that the file is split only at line boundaries.
8
+
9
+ ## Overview
10
+
11
+ * **Plugin type**: input
12
+
13
+ ## Configuration
14
+
15
+ - **path**: the path of a text file (string, required)
16
+ - **header_line**: whether the first line is a header or not (boolean, default: false)
17
+ - **tasks**: number of tasks (integer, default: number of available processors * 2)
18
+
19
+ ### Example
20
+
21
+ ```yaml
22
+ in:
23
+ type: filesplit
24
+ path: '/data/address.csv'
25
+ header_line: true
26
+ tasks: 4
27
+ parser:
28
+ charset: UTF-8
29
+ newline: CRLF
30
+ type: csv
31
+ header_line: true
32
+ delimiter: ','
33
+ ...
34
+ ```
35
+
36
+ ### Build
37
+
38
+ ```
39
+ $ ./gradlew gem
40
+ ```
data/build.gradle ADDED
@@ -0,0 +1,64 @@
1
// Build script for the embulk-input-filesplit plugin: compiles the Java sources, collects the
// runtime jars into ./classpath and packages everything as a gem through JRuby.
plugins {
    id 'com.jfrog.bintray' version '1.1'
    id 'com.github.jruby-gradle.base' version '0.1.5'
    id 'java'
}
import com.github.jrubygradle.JRubyExec

apply plugin: 'java'
apply plugin: 'com.github.jruby-gradle.base'

[compileJava, compileTestJava]*.options*.encoding = 'UTF-8'

project.version = '0.1.1'

repositories {
    mavenCentral()
    jcenter()
}

configurations {
    // Dependencies listed here are subtracted from the jars copied by the 'classpath' task.
    provided
}

dependencies {
    compile  'org.embulk:embulk-core:0.5.0'
    provided 'org.embulk:embulk-core:0.5.0'
    testCompile 'org.embulk:embulk-standards:0.5.0'
    testCompile 'junit:junit:4.+'
}

// Copies the plugin jar plus its runtime dependencies (minus 'provided') into ./classpath.
task classpath(type: Copy, dependsOn: ["jar"]) {
    doFirst { file('classpath').deleteDir() }
    from (configurations.runtime - configurations.provided + files(jar.archivePath))
    into 'classpath'
}
clean { delete 'classpath' }

// Runs "gem build build/gemspec" under JRuby, then moves the resulting gem into ./pkg.
task gem(type: JRubyExec, dependsOn: ['build', 'gemspec', 'classpath']) {
    jrubyArgs '-rrubygems/gem_runner', "-eGem::GemRunner.new.run(ARGV)", 'build'
    script "build/gemspec"
    doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
}

// Generates build/gemspec from the project name and version.
task gemspec << {
    file('build').mkdirs();
    file('build/gemspec').write($/
Gem::Specification.new do |spec|
  spec.name          = "${project.name}"
  spec.version       = "${project.version}"
  spec.authors       = ["Hitoshi Tanaka"]
  spec.homepage      = "https://github.com/hito4t/embulk-input-filesplit"
  spec.summary       = "Embulk plugin for splitting input file"
  spec.licenses      = ["Apache 2.0"]
  spec.files         = `git ls-files`.split("\n").grep(%r"^(?!\.)").grep(%r"^(?!depends/)") + Dir["classpath/*.jar"]
  spec.test_files    = spec.files.grep(%r"^(test|spec)/")
  spec.require_paths = ["lib"]
end
/$)
}


// Pushes the packaged gem. project.exec is used instead of String.execute().waitFor() so that
// the command's output is forwarded and a non-zero exit status fails the build instead of
// being silently discarded.
task gempush << {
    project.exec {
        commandLine 'gem', 'push', "pkg/embulk-input-filesplit-${project.version}.gem"
    }
}
@@ -0,0 +1,3 @@
1
# Registers the Java class under the :filesplit input type. The last argument is the
# "classpath" directory, resolved relative to this file's location.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_input(
  :filesplit,
  "org.embulk.input.filesplit.LocalFileSplitInputPlugin",
  classpath_dir)
@@ -0,0 +1,187 @@
1
+ package org.embulk.input.filesplit;
2
+
3
+ import java.io.BufferedInputStream;
4
+ import java.io.ByteArrayInputStream;
5
+ import java.io.ByteArrayOutputStream;
6
+ import java.io.File;
7
+ import java.io.FileInputStream;
8
+ import java.io.IOException;
9
+ import java.io.InputStream;
10
+ import java.io.SequenceInputStream;
11
+ import java.util.ArrayList;
12
+ import java.util.List;
13
+
14
+ import org.embulk.config.CommitReport;
15
+ import org.embulk.config.Config;
16
+ import org.embulk.config.ConfigDefault;
17
+ import org.embulk.config.ConfigDiff;
18
+ import org.embulk.config.ConfigInject;
19
+ import org.embulk.config.ConfigSource;
20
+ import org.embulk.config.Task;
21
+ import org.embulk.config.TaskSource;
22
+ import org.embulk.spi.BufferAllocator;
23
+ import org.embulk.spi.Exec;
24
+ import org.embulk.spi.FileInputPlugin;
25
+ import org.embulk.spi.TransactionalFileInput;
26
+ import org.embulk.spi.util.InputStreamFileInput;
27
+
28
+ import com.google.common.base.Optional;
29
+
30
+
31
+ public class LocalFileSplitInputPlugin
32
+ implements FileInputPlugin
33
+ {
34
+ public interface PluginTask
35
+ extends Task
36
+ {
37
+ @Config("path")
38
+ public String getPath();
39
+
40
+ @Config("tasks")
41
+ @ConfigDefault("null")
42
+ public Optional<Integer> getTasks();
43
+
44
+ @Config("header_line")
45
+ @ConfigDefault("false")
46
+ public boolean getHeaderLine();
47
+
48
+ public List<PartialFile> getFiles();
49
+ public void setFiles(List<PartialFile> files);
50
+
51
+ @ConfigInject
52
+ public BufferAllocator getBufferAllocator();
53
+ }
54
+
55
+ @Override
56
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
57
+ {
58
+ PluginTask task = config.loadConfig(PluginTask.class);
59
+
60
+ int tasks;
61
+ if (task.getTasks().isPresent()) {
62
+ tasks = task.getTasks().get();
63
+ if (tasks <= 0) {
64
+ throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
65
+ }
66
+ } else {
67
+ tasks = Runtime.getRuntime().availableProcessors() * 2;
68
+ }
69
+
70
+ long size = new File(task.getPath()).length();
71
+ List<PartialFile> files = new ArrayList<PartialFile>();
72
+ for (int i = 0; i < tasks; i++) {
73
+ long start = size * i / tasks;
74
+ long end = size * (i + 1) / tasks;
75
+ if (start < end) {
76
+ files.add(new PartialFile(task.getPath(), start, end));
77
+ }
78
+ }
79
+
80
+ task.setFiles(files);
81
+
82
+ return resume(task.dump(), task.getFiles().size(), control);
83
+ }
84
+
85
+ @Override
86
+ public ConfigDiff resume(TaskSource taskSource,
87
+ int taskCount,
88
+ FileInputPlugin.Control control)
89
+ {
90
+ control.run(taskSource, taskCount);
91
+
92
+ return Exec.newConfigDiff();
93
+ }
94
+
95
+ @Override
96
+ public void cleanup(TaskSource taskSource,
97
+ int taskCount,
98
+ List<CommitReport> successCommitReports)
99
+ { }
100
+
101
+ @Override
102
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
103
+ {
104
+ PluginTask task = taskSource.loadTask(PluginTask.class);
105
+ return new LocalFileSplitInput(task, taskIndex);
106
+ }
107
+
108
+ public static class LocalFileSplitInput
109
+ extends InputStreamFileInput
110
+ implements TransactionalFileInput
111
+ {
112
+ public static class FileSplitProvider
113
+ implements InputStreamFileInput.Provider
114
+ {
115
+ private final PartialFile file;
116
+ private final boolean hasHeader;
117
+ private boolean opened = false;
118
+
119
+ public FileSplitProvider(PartialFile file, boolean hasHeader)
120
+ {
121
+ this.file = file;
122
+ this.hasHeader = hasHeader;
123
+ }
124
+
125
+ @Override
126
+ public InputStream openNext() throws IOException
127
+ {
128
+ if (opened) {
129
+ return null;
130
+ }
131
+ opened = true;
132
+
133
+ InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
134
+ if (file.getStart() > 0 && hasHeader) {
135
+ in = new SequenceInputStream(openHeader(file.getPath()), in);
136
+ }
137
+ return in;
138
+ }
139
+
140
+ @Override
141
+ public void close() { }
142
+
143
+ private InputStream openHeader(String path) throws IOException
144
+ {
145
+ ByteArrayOutputStream header = new ByteArrayOutputStream();
146
+ try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
147
+ while (true) {
148
+ int c = in.read();
149
+ if (c < 0) {
150
+ break;
151
+ }
152
+
153
+ header.write(c);
154
+
155
+ if (c == '\n') {
156
+ break;
157
+ }
158
+
159
+ if (c == '\r') {
160
+ int c2 = in.read();
161
+ if (c2 == '\n') {
162
+ header.write(c2);
163
+ }
164
+ break;
165
+ }
166
+ }
167
+ }
168
+ header.close();
169
+ return new ByteArrayInputStream(header.toByteArray());
170
+ }
171
+ }
172
+
173
+ public LocalFileSplitInput(PluginTask task, int taskIndex)
174
+ {
175
+ super(task.getBufferAllocator(), new FileSplitProvider(task.getFiles().get(taskIndex), task.getHeaderLine()));
176
+ }
177
+
178
+ @Override
179
+ public void abort() { }
180
+
181
+ @Override
182
+ public CommitReport commit()
183
+ {
184
+ return Exec.newCommitReport();
185
+ }
186
+ }
187
+ }
@@ -0,0 +1,53 @@
1
+ /*
2
+ * $Id: typical.epf 2627 2010-03-18 01:40:13Z tiba $
3
+ */
4
+ package org.embulk.input.filesplit;
5
+
6
/**
 * A portion of a file, described by the file's path and a start and end offset.
 *
 * <p>Plain mutable bean: a no-argument constructor plus a getter and setter per property.</p>
 */
public class PartialFile
{
    private String path;
    private long start;
    private long end;

    /** Creates an instance with no path and both offsets at 0. */
    public PartialFile()
    {
    }

    /**
     * @param path  path of the file
     * @param start start offset of the portion
     * @param end   end offset of the portion
     */
    public PartialFile(String path, long start, long end)
    {
        this.path = path;
        this.start = start;
        this.end = end;
    }

    /** @return path of the file */
    public String getPath()
    {
        return this.path;
    }

    /** @param path path of the file */
    public void setPath(String path)
    {
        this.path = path;
    }

    /** @return start offset of the portion */
    public long getStart()
    {
        return this.start;
    }

    /** @param start start offset of the portion */
    public void setStart(long start)
    {
        this.start = start;
    }

    /** @return end offset of the portion */
    public long getEnd()
    {
        return this.end;
    }

    /** @param end end offset of the portion */
    public void setEnd(long end)
    {
        this.end = end;
    }
}
@@ -0,0 +1,154 @@
1
+ package org.embulk.input.filesplit;
2
+
3
+ import java.io.BufferedInputStream;
4
+ import java.io.IOException;
5
+ import java.io.InputStream;
6
+ import java.io.PushbackInputStream;
7
+
8
+
9
/**
 * Input stream over the byte range {@code start}..{@code end} of an underlying stream,
 * adjusted so that only whole lines are returned.
 *
 * <p>Lines end with LF, CR or CRLF. A range with {@code start > 0} begins right after the first
 * line terminator found at or after offset {@code start - 1}. Reading continues past
 * {@code end} up to and including the terminator of the line that contains offset
 * {@code end - 1}. Adjacent ranges therefore return every line exactly once.</p>
 */
public class PartialFileInputStream extends InputStream
{
    private final PushbackInputStream original;
    /** Start offset; moved forward to the beginning of the first whole line on first use. */
    private long start;
    private final long end;
    /** Offset of the next byte to read from {@code original}; -1 until initialized. */
    private long current;
    private boolean eof;

    /**
     * @param original stream positioned at offset 0 of the file
     * @param start    offset at which the range starts
     * @param end      offset at which the range ends
     */
    public PartialFileInputStream(InputStream original, long start, long end)
    {
        this.original = new PushbackInputStream(new BufferedInputStream(original));
        this.start = start;
        this.end = end;
        current = -1;
    }

    @Override
    public int read(byte[] b) throws IOException
    {
        return read(b, 0, b.length);
    }

    /**
     * Reads up to {@code len} bytes, stopping after the line terminator that ends the range.
     *
     * @return number of bytes stored in {@code b}, or -1 at the end of the range
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read(b, off, len);
        if (read < 0) {
            eof = true;
            return -1;
        }

        current += read;
        if (current >= end) {
            // Index within this chunk of file offset end - 1, clamped to the chunk start.
            // Computed in long so a chunk far beyond 'end' cannot overflow the int cast.
            int first = (int) Math.max(end - 1 - (current - read), 0L);
            for (int i = first; i < read; i++) {
                byte value = b[off + i];
                if (value == '\n') {
                    eof = true;
                    return i + 1;
                }

                if (value == '\r') {
                    // Only look inside the buffer when the following byte was actually read;
                    // otherwise peek at the underlying stream.
                    int next = (i + 1 < read) ? b[off + i + 1] : prefetch();
                    if (next != '\n') {
                        eof = true;
                        return i + 1;
                    }
                    // CR followed by LF: the LF ends the line on the next iteration or read.
                }
            }
        }

        return read;
    }

    /**
     * Reads one byte.
     *
     * @return the byte, or -1 at the end of the range
     */
    @Override
    public int read() throws IOException
    {
        initializeIfNeeded();

        if (eof) {
            return -1;
        }

        int read = original.read();
        current++;

        if (read < 0) {
            eof = true;
            return -1;
        }

        if (current >= end) {
            if (read == '\n' || read == '\r' && prefetch() != '\n') {
                eof = true;
            }
        }

        return read;
    }

    /**
     * Skipping is not supported.
     *
     * @throws IOException always
     */
    @Override
    public long skip(long n) throws IOException
    {
        throw new IOException("Skip not supported.");
    }

    @Override
    public int available() throws IOException
    {
        return 0;
    }

    @Override
    public void close() throws IOException
    {
        original.close();
    }

    /**
     * On first use, positions the underlying stream at the beginning of the first whole line
     * of the range and marks the stream as exhausted if that position is not before
     * {@code end}.
     *
     * @throws IOException if the underlying stream ends before offset {@code start - 1}
     */
    private void initializeIfNeeded() throws IOException
    {
        if (current >= start) {
            return;
        }

        if (start == 0) {
            current = 0;
        } else {
            // Begin one byte early: if that byte ends a line, the range starts exactly at
            // the requested offset.
            start--;
            skipFully(start);
            current = start;

            int c;
            while ((c = original.read()) >= 0) {
                start++;
                current++;

                if (c == '\n' || c == '\r' && prefetch() != '\n') {
                    break;
                }
            }
        }

        if (start >= end) {
            eof = true;
        }
    }

    /** Skips exactly {@code count} bytes; InputStream.skip may skip fewer per call. */
    private void skipFully(long count) throws IOException
    {
        long remaining = count;
        while (remaining > 0) {
            long skipped = original.skip(remaining);
            if (skipped <= 0) {
                // skip made no progress: consume one byte to tell a short skip from EOF.
                if (original.read() < 0) {
                    throw new IOException("Cannot skip.");
                }
                skipped = 1;
            }
            remaining -= skipped;
        }
    }

    /** Returns the next byte without consuming it, or -1 at the end of the stream. */
    private int prefetch() throws IOException
    {
        int c = original.read();
        if (c >= 0) {
            original.unread(c);
        }
        return c;
    }
}