embulk-input-filesplit 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/build.gradle +1 -1
- data/classpath/{embulk-input-filesplit-0.1.1.jar → embulk-input-filesplit-0.1.2.jar} +0 -0
- data/src/main/java/org/embulk/input/filesplit/LocalFileSplitInputPlugin.java +37 -37
- data/src/main/java/org/embulk/input/filesplit/PartialFile.java +49 -52
- data/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java +154 -154
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: db346109428797d7798e50be35a6d8b24b016e84
|
4
|
+
data.tar.gz: e3c3206c03686b1ba05d3ddcd955768451f871c4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b4df6195347a2eae091b9e783e1c554244885d408f7e89706a27129f7afc18ad76a031bb1155b17818d81b8399c2286cc02d6772ff137c59df09bc30d3331d81
|
7
|
+
data.tar.gz: 49a1b4306e12a38532ef48dbe3c51094a970b9869e2d9510ab79c8fb8fdbdc35fd1b88df963b0a929a49dea546ce78f0a9313e78da2d894041b77fb8491264d4
|
data/build.gradle
CHANGED
Binary file
|
@@ -31,7 +31,7 @@ import com.google.common.base.Optional;
|
|
31
31
|
public class LocalFileSplitInputPlugin
|
32
32
|
implements FileInputPlugin
|
33
33
|
{
|
34
|
-
|
34
|
+
public interface PluginTask
|
35
35
|
extends Task
|
36
36
|
{
|
37
37
|
@Config("path")
|
@@ -59,22 +59,22 @@ public class LocalFileSplitInputPlugin
|
|
59
59
|
|
60
60
|
int tasks;
|
61
61
|
if (task.getTasks().isPresent()) {
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
62
|
+
tasks = task.getTasks().get();
|
63
|
+
if (tasks <= 0) {
|
64
|
+
throw new IllegalArgumentException(String.format("'tasks' is %d but must be greater than 0", tasks));
|
65
|
+
}
|
66
66
|
} else {
|
67
|
-
|
67
|
+
tasks = Runtime.getRuntime().availableProcessors() * 2;
|
68
68
|
}
|
69
69
|
|
70
70
|
long size = new File(task.getPath()).length();
|
71
71
|
List<PartialFile> files = new ArrayList<PartialFile>();
|
72
72
|
for (int i = 0; i < tasks; i++) {
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
73
|
+
long start = size * i / tasks;
|
74
|
+
long end = size * (i + 1) / tasks;
|
75
|
+
if (start < end) {
|
76
|
+
files.add(new PartialFile(task.getPath(), start, end));
|
77
|
+
}
|
78
78
|
}
|
79
79
|
|
80
80
|
task.setFiles(files);
|
@@ -132,7 +132,7 @@ public class LocalFileSplitInputPlugin
|
|
132
132
|
|
133
133
|
InputStream in = new PartialFileInputStream(new FileInputStream(file.getPath()), file.getStart(), file.getEnd());
|
134
134
|
if (file.getStart() > 0 && hasHeader) {
|
135
|
-
|
135
|
+
in = new SequenceInputStream(openHeader(file.getPath()), in);
|
136
136
|
}
|
137
137
|
return in;
|
138
138
|
}
|
@@ -142,31 +142,31 @@ public class LocalFileSplitInputPlugin
|
|
142
142
|
|
143
143
|
private InputStream openHeader(String path) throws IOException
|
144
144
|
{
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
145
|
+
ByteArrayOutputStream header = new ByteArrayOutputStream();
|
146
|
+
try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(path))) {
|
147
|
+
while (true) {
|
148
|
+
int c = in.read();
|
149
|
+
if (c < 0) {
|
150
|
+
break;
|
151
|
+
}
|
152
|
+
|
153
|
+
header.write(c);
|
154
|
+
|
155
|
+
if (c == '\n') {
|
156
|
+
break;
|
157
|
+
}
|
158
|
+
|
159
|
+
if (c == '\r') {
|
160
|
+
int c2 = in.read();
|
161
|
+
if (c2 == '\n') {
|
162
|
+
header.write(c2);
|
163
|
+
}
|
164
|
+
break;
|
165
|
+
}
|
166
|
+
}
|
167
|
+
}
|
168
|
+
header.close();
|
169
|
+
return new ByteArrayInputStream(header.toByteArray());
|
170
170
|
}
|
171
171
|
}
|
172
172
|
|
@@ -1,53 +1,50 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
{
|
51
|
-
this.end = end;
|
52
|
-
}
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
public class PartialFile
|
4
|
+
{
|
5
|
+
private String path;
|
6
|
+
private long start;
|
7
|
+
private long end;
|
8
|
+
|
9
|
+
|
10
|
+
public PartialFile(String path, long start, long end)
|
11
|
+
{
|
12
|
+
this.path = path;
|
13
|
+
this.start = start;
|
14
|
+
this.end = end;
|
15
|
+
}
|
16
|
+
|
17
|
+
public PartialFile() {
|
18
|
+
}
|
19
|
+
|
20
|
+
public String getPath()
|
21
|
+
{
|
22
|
+
return path;
|
23
|
+
}
|
24
|
+
|
25
|
+
public void setPath(String path)
|
26
|
+
{
|
27
|
+
this.path = path;
|
28
|
+
}
|
29
|
+
|
30
|
+
|
31
|
+
public long getStart()
|
32
|
+
{
|
33
|
+
return start;
|
34
|
+
}
|
35
|
+
|
36
|
+
public void setStart(long start)
|
37
|
+
{
|
38
|
+
this.start = start;
|
39
|
+
}
|
40
|
+
|
41
|
+
public long getEnd()
|
42
|
+
{
|
43
|
+
return end;
|
44
|
+
}
|
45
|
+
|
46
|
+
public void setEnd(long end)
|
47
|
+
{
|
48
|
+
this.end = end;
|
49
|
+
}
|
53
50
|
}
|
@@ -1,154 +1,154 @@
|
|
1
|
-
package org.embulk.input.filesplit;
|
2
|
-
|
3
|
-
import java.io.BufferedInputStream;
|
4
|
-
import java.io.IOException;
|
5
|
-
import java.io.InputStream;
|
6
|
-
import java.io.PushbackInputStream;
|
7
|
-
|
8
|
-
|
9
|
-
public class PartialFileInputStream extends InputStream
|
10
|
-
{
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
}
|
1
|
+
package org.embulk.input.filesplit;
|
2
|
+
|
3
|
+
import java.io.BufferedInputStream;
|
4
|
+
import java.io.IOException;
|
5
|
+
import java.io.InputStream;
|
6
|
+
import java.io.PushbackInputStream;
|
7
|
+
|
8
|
+
|
9
|
+
public class PartialFileInputStream extends InputStream
|
10
|
+
{
|
11
|
+
private final PushbackInputStream original;
|
12
|
+
private long start;
|
13
|
+
private long end;
|
14
|
+
private long current;
|
15
|
+
private boolean eof;
|
16
|
+
|
17
|
+
public PartialFileInputStream(InputStream original, long start, long end)
|
18
|
+
{
|
19
|
+
this.original = new PushbackInputStream(new BufferedInputStream(original));
|
20
|
+
this.start = start;
|
21
|
+
this.end = end;
|
22
|
+
current = -1;
|
23
|
+
}
|
24
|
+
|
25
|
+
@Override
|
26
|
+
public int read(byte[] b) throws IOException
|
27
|
+
{
|
28
|
+
return read(b, 0, b.length);
|
29
|
+
}
|
30
|
+
|
31
|
+
@Override
|
32
|
+
public int read(byte[] b, int off, int len) throws IOException
|
33
|
+
{
|
34
|
+
initializeIfNeeded();
|
35
|
+
|
36
|
+
if (eof) {
|
37
|
+
return -1;
|
38
|
+
}
|
39
|
+
|
40
|
+
int read = original.read(b, off, len);
|
41
|
+
if (read < 0) {
|
42
|
+
eof = true;
|
43
|
+
return -1;
|
44
|
+
}
|
45
|
+
|
46
|
+
current += read;
|
47
|
+
if (current >= end) {
|
48
|
+
for (int i = Math.max((int)(end - 1 - current + read), 0); i < read; i++) {
|
49
|
+
if (b[off + i] == '\n') {
|
50
|
+
eof = true;
|
51
|
+
return i + 1;
|
52
|
+
}
|
53
|
+
|
54
|
+
if (b[off + i] == '\r') {
|
55
|
+
int next = (i < read ? b[off + i + 1] : prefetch());
|
56
|
+
if (next != '\n') {
|
57
|
+
eof = true;
|
58
|
+
return i + 1;
|
59
|
+
}
|
60
|
+
}
|
61
|
+
}
|
62
|
+
}
|
63
|
+
|
64
|
+
return read;
|
65
|
+
}
|
66
|
+
|
67
|
+
@Override
|
68
|
+
public int read() throws IOException
|
69
|
+
{
|
70
|
+
initializeIfNeeded();
|
71
|
+
|
72
|
+
if (eof) {
|
73
|
+
return -1;
|
74
|
+
}
|
75
|
+
|
76
|
+
int read = original.read();
|
77
|
+
current++;
|
78
|
+
|
79
|
+
if (read < 0) {
|
80
|
+
eof = true;
|
81
|
+
return -1;
|
82
|
+
}
|
83
|
+
|
84
|
+
if (current >= end) {
|
85
|
+
if (read == '\n' || read == '\r' && prefetch() != '\n') {
|
86
|
+
eof = true;
|
87
|
+
}
|
88
|
+
}
|
89
|
+
|
90
|
+
return read;
|
91
|
+
}
|
92
|
+
|
93
|
+
@Override
|
94
|
+
public long skip(long n) throws IOException
|
95
|
+
{
|
96
|
+
throw new IOException("Skip not supported.");
|
97
|
+
/*
|
98
|
+
long skip = original.skip(n);
|
99
|
+
current += skip;
|
100
|
+
return skip;
|
101
|
+
*/
|
102
|
+
}
|
103
|
+
|
104
|
+
@Override
|
105
|
+
public int available() throws IOException
|
106
|
+
{
|
107
|
+
return 0;
|
108
|
+
}
|
109
|
+
|
110
|
+
@Override
|
111
|
+
public void close() throws IOException
|
112
|
+
{
|
113
|
+
original.close();
|
114
|
+
}
|
115
|
+
|
116
|
+
private void initializeIfNeeded() throws IOException
|
117
|
+
{
|
118
|
+
if (current >= start) {
|
119
|
+
return;
|
120
|
+
|
121
|
+
}
|
122
|
+
if (start == 0) {
|
123
|
+
current = 0;
|
124
|
+
} else {
|
125
|
+
current = original.skip(--start);
|
126
|
+
if (current != start) {
|
127
|
+
throw new IOException("Cannot skip.");
|
128
|
+
}
|
129
|
+
|
130
|
+
int c;
|
131
|
+
while ((c = original.read()) >= 0) {
|
132
|
+
start++;
|
133
|
+
current++;
|
134
|
+
|
135
|
+
if (c == '\n' || c == '\r' && prefetch() != '\n') {
|
136
|
+
break;
|
137
|
+
}
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
if (start >= end) {
|
142
|
+
eof = true;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
|
146
|
+
private int prefetch() throws IOException
|
147
|
+
{
|
148
|
+
int c = original.read();
|
149
|
+
if (c >= 0) {
|
150
|
+
original.unread(c);
|
151
|
+
}
|
152
|
+
return c;
|
153
|
+
}
|
154
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-filesplit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hitoshi Tanaka
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email:
|
@@ -36,7 +36,7 @@ files:
|
|
36
36
|
- src/test/resources/yml/test-only-header.yml
|
37
37
|
- src/test/resources/yml/test-tasks.yml
|
38
38
|
- src/test/resources/yml/test.yml
|
39
|
-
- classpath/embulk-input-filesplit-0.1.1.jar
|
39
|
+
- classpath/embulk-input-filesplit-0.1.2.jar
|
40
40
|
homepage: https://github.com/hito4t/embulk-input-filesplit
|
41
41
|
licenses:
|
42
42
|
- Apache 2.0
|