embulk-input-hdfs 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHENGELOG.md +5 -0
- data/README.md +20 -17
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java +60 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +107 -200
- data/src/main/java/org/embulk/input/hdfs/Strftime.java +34 -0
- data/src/main/java/org/embulk/input/hdfs/TargetFileInfo.java +174 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileList.java → TargetFileInfoList.java} +73 -90
- data/src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java +128 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileInputStream.java → TargetFilePartialInputStream.java} +4 -6
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +6 -8
- metadata +19 -18
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +0 -82
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +0 -48
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +0 -125
@@ -0,0 +1,34 @@
|
|
1
|
+
package org.embulk.input.hdfs;
|
2
|
+
|
3
|
+
import org.embulk.config.Config;
|
4
|
+
import org.embulk.config.ConfigDefault;
|
5
|
+
import org.embulk.config.ConfigInject;
|
6
|
+
import org.jruby.embed.ScriptingContainer;
|
7
|
+
|
8
|
+
public class Strftime
|
9
|
+
{
|
10
|
+
interface Task
|
11
|
+
{
|
12
|
+
@Config("rewind_seconds")
|
13
|
+
@ConfigDefault("0")
|
14
|
+
int getRewindSeconds();
|
15
|
+
|
16
|
+
@ConfigInject
|
17
|
+
ScriptingContainer getJRuby();
|
18
|
+
}
|
19
|
+
|
20
|
+
private final int rewindSeconds;
|
21
|
+
private final ScriptingContainer jruby;
|
22
|
+
|
23
|
+
public Strftime(Task task)
|
24
|
+
{
|
25
|
+
this.rewindSeconds = task.getRewindSeconds();
|
26
|
+
this.jruby = task.getJRuby();
|
27
|
+
}
|
28
|
+
|
29
|
+
public String format(String format)
|
30
|
+
{
|
31
|
+
String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
|
32
|
+
return jruby.runScriptlet(script).toString();
|
33
|
+
}
|
34
|
+
}
|
@@ -0,0 +1,174 @@
|
|
1
|
+
package org.embulk.input.hdfs;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
4
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
5
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
6
|
+
import com.google.common.base.Optional;
|
7
|
+
import org.embulk.config.ConfigException;
|
8
|
+
|
9
|
+
import java.io.Serializable;
|
10
|
+
import java.lang.reflect.Field;
|
11
|
+
|
12
|
+
public class TargetFileInfo
|
13
|
+
implements Serializable
|
14
|
+
{
|
15
|
+
public static class Builder
|
16
|
+
{
|
17
|
+
private Optional<Long> start = Optional.absent();
|
18
|
+
private Optional<Long> end = Optional.absent();
|
19
|
+
private Optional<String> pathString = Optional.absent();
|
20
|
+
private Optional<Boolean> isDecompressible = Optional.absent();
|
21
|
+
private Optional<Boolean> isPartitionable = Optional.absent();
|
22
|
+
private Optional<Integer> numHeaderLines = Optional.absent();
|
23
|
+
|
24
|
+
public Builder()
|
25
|
+
{
|
26
|
+
}
|
27
|
+
|
28
|
+
public Builder start(long start)
|
29
|
+
{
|
30
|
+
this.start = Optional.of(start);
|
31
|
+
return this;
|
32
|
+
}
|
33
|
+
|
34
|
+
public Builder end(long end)
|
35
|
+
{
|
36
|
+
this.end = Optional.of(end);
|
37
|
+
return this;
|
38
|
+
}
|
39
|
+
|
40
|
+
public Builder pathString(String pathString)
|
41
|
+
{
|
42
|
+
this.pathString = Optional.of(pathString);
|
43
|
+
return this;
|
44
|
+
}
|
45
|
+
|
46
|
+
public Builder isDecompressible(boolean isDecompressible)
|
47
|
+
{
|
48
|
+
this.isDecompressible = Optional.of(isDecompressible);
|
49
|
+
return this;
|
50
|
+
}
|
51
|
+
|
52
|
+
public Builder isPartitionable(boolean isPartitionable)
|
53
|
+
{
|
54
|
+
this.isPartitionable = Optional.of(isPartitionable);
|
55
|
+
return this;
|
56
|
+
}
|
57
|
+
|
58
|
+
public Builder numHeaderLines(int numHeaderLines)
|
59
|
+
{
|
60
|
+
this.numHeaderLines = Optional.of(numHeaderLines);
|
61
|
+
return this;
|
62
|
+
}
|
63
|
+
|
64
|
+
public TargetFileInfo build()
|
65
|
+
{
|
66
|
+
try {
|
67
|
+
validate();
|
68
|
+
}
|
69
|
+
catch (IllegalAccessException | IllegalStateException e) {
|
70
|
+
throw new ConfigException(e);
|
71
|
+
}
|
72
|
+
|
73
|
+
return new TargetFileInfo(
|
74
|
+
pathString.get(), start.get(), end.get(),
|
75
|
+
isDecompressible.get(), isPartitionable.get(),
|
76
|
+
numHeaderLines.get());
|
77
|
+
}
|
78
|
+
|
79
|
+
private void validate()
|
80
|
+
throws IllegalAccessException, IllegalStateException
|
81
|
+
{
|
82
|
+
for (Field field : getClass().getDeclaredFields()) {
|
83
|
+
if (field.getType() != Optional.class) {
|
84
|
+
// for avoiding Z class by JUnit insertion.
|
85
|
+
continue;
|
86
|
+
}
|
87
|
+
Optional value = (Optional) field.get(this);
|
88
|
+
if (!value.isPresent()) {
|
89
|
+
String msg = String.format("field:%s is absent", field.getName());
|
90
|
+
throw new IllegalStateException(msg);
|
91
|
+
}
|
92
|
+
}
|
93
|
+
|
94
|
+
if (isDecompressible.get() && isPartitionable.get()) {
|
95
|
+
String msg = String.format("IllegalState: isDecompressible is true and isPartitionable is true: %s", pathString.get());
|
96
|
+
throw new IllegalStateException(msg);
|
97
|
+
}
|
98
|
+
|
99
|
+
if (isDecompressible.get() && start.get() != 0) {
|
100
|
+
String msg = String.format("IllegalState: isDecompressible is true, but start is not 0: %s", pathString.get());
|
101
|
+
throw new IllegalStateException(msg);
|
102
|
+
}
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
// private static final long serialVersionUID = 1L; // to suppress warnings?
|
107
|
+
private final long start;
|
108
|
+
private final long end;
|
109
|
+
private final String pathString;
|
110
|
+
private final boolean isDecompressible;
|
111
|
+
private final boolean isPartitionable;
|
112
|
+
private final int numHeaderLines;
|
113
|
+
|
114
|
+
@JsonCreator
|
115
|
+
private TargetFileInfo(
|
116
|
+
@JsonProperty("path_string") String pathString,
|
117
|
+
@JsonProperty("start") long start,
|
118
|
+
@JsonProperty("end") long end,
|
119
|
+
@JsonProperty("is_decompressible") boolean isDecompressible,
|
120
|
+
@JsonProperty("is_partitionable") boolean isPartitionable,
|
121
|
+
@JsonProperty("num_header_lines") int numHeaderLines)
|
122
|
+
{
|
123
|
+
this.pathString = pathString;
|
124
|
+
this.start = start;
|
125
|
+
this.end = end;
|
126
|
+
this.isDecompressible = isDecompressible;
|
127
|
+
this.isPartitionable = isPartitionable;
|
128
|
+
this.numHeaderLines = numHeaderLines;
|
129
|
+
}
|
130
|
+
|
131
|
+
@JsonProperty("start")
|
132
|
+
public long getStart()
|
133
|
+
{
|
134
|
+
return start;
|
135
|
+
}
|
136
|
+
|
137
|
+
@JsonProperty("end")
|
138
|
+
public long getEnd()
|
139
|
+
{
|
140
|
+
return end;
|
141
|
+
}
|
142
|
+
|
143
|
+
@JsonProperty("is_decompressible")
|
144
|
+
public boolean getIsDecompressible()
|
145
|
+
{
|
146
|
+
return isDecompressible;
|
147
|
+
}
|
148
|
+
|
149
|
+
@JsonProperty("is_partitionable")
|
150
|
+
public boolean getIsPartitionable()
|
151
|
+
{
|
152
|
+
return isPartitionable;
|
153
|
+
}
|
154
|
+
|
155
|
+
@JsonProperty("path_string")
|
156
|
+
public String getPathString()
|
157
|
+
{
|
158
|
+
return pathString;
|
159
|
+
}
|
160
|
+
|
161
|
+
@JsonProperty("num_header_lines")
|
162
|
+
public int getNumHeaderLines()
|
163
|
+
{
|
164
|
+
return numHeaderLines;
|
165
|
+
}
|
166
|
+
|
167
|
+
@JsonIgnore
|
168
|
+
public long getSize()
|
169
|
+
{
|
170
|
+
// NOTE: this size is reference value which
|
171
|
+
// becomes smaller than raw if the file is compressed.
|
172
|
+
return getEnd() - getStart();
|
173
|
+
}
|
174
|
+
}
|
@@ -1,35 +1,34 @@
|
|
1
1
|
package org.embulk.input.hdfs;
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
import
|
7
|
-
import
|
8
|
-
import
|
9
|
-
import
|
10
|
-
import
|
11
|
-
import
|
3
|
+
// Ported from https://github.com/embulk/embulk-input-s3/blob/b546158123a734acf0785d61400c69fcdd910ed6/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
|
4
|
+
// and Modified for this package.
|
5
|
+
|
6
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
7
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
8
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
9
|
+
import com.google.common.base.Optional;
|
10
|
+
import com.google.common.base.Throwables;
|
11
|
+
import org.apache.commons.lang.SerializationUtils;
|
12
|
+
import org.embulk.config.Config;
|
13
|
+
import org.embulk.config.ConfigDefault;
|
14
|
+
import org.embulk.config.ConfigSource;
|
15
|
+
|
12
16
|
import java.io.BufferedInputStream;
|
17
|
+
import java.io.BufferedOutputStream;
|
13
18
|
import java.io.ByteArrayInputStream;
|
14
19
|
import java.io.ByteArrayOutputStream;
|
15
20
|
import java.io.IOException;
|
21
|
+
import java.io.InputStream;
|
22
|
+
import java.io.OutputStream;
|
16
23
|
import java.nio.ByteBuffer;
|
17
|
-
import java.
|
18
|
-
import
|
19
|
-
import
|
20
|
-
import
|
21
|
-
import
|
22
|
-
import
|
23
|
-
import com.fasterxml.jackson.annotation.JsonProperty;
|
24
|
-
import com.fasterxml.jackson.annotation.JsonIgnore;
|
25
|
-
import com.fasterxml.jackson.annotation.JsonCreator;
|
24
|
+
import java.util.AbstractList;
|
25
|
+
import java.util.ArrayList;
|
26
|
+
import java.util.List;
|
27
|
+
import java.util.regex.Pattern;
|
28
|
+
import java.util.zip.GZIPInputStream;
|
29
|
+
import java.util.zip.GZIPOutputStream;
|
26
30
|
|
27
|
-
|
28
|
-
* Created by takahiro.nakayama on 2/20/16.
|
29
|
-
* Ported from https://github.com/embulk/embulk-input-s3/blob/master/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
|
30
|
-
* and Modified for this package.
|
31
|
-
*/
|
32
|
-
public class PartialFileList
|
31
|
+
public class TargetFileInfoList
|
33
32
|
{
|
34
33
|
public interface Task
|
35
34
|
{
|
@@ -50,21 +49,15 @@ public class PartialFileList
|
|
50
49
|
public static class Entry
|
51
50
|
{
|
52
51
|
private int index;
|
53
|
-
private long
|
54
|
-
private long end;
|
55
|
-
private boolean canDecompress;
|
52
|
+
private long size;
|
56
53
|
|
57
54
|
@JsonCreator
|
58
55
|
public Entry(
|
59
56
|
@JsonProperty("index") int index,
|
60
|
-
@JsonProperty("
|
61
|
-
@JsonProperty("end") long end,
|
62
|
-
@JsonProperty("can_decompress") boolean canDecompress)
|
57
|
+
@JsonProperty("size") long size)
|
63
58
|
{
|
64
59
|
this.index = index;
|
65
|
-
this.
|
66
|
-
this.end = end;
|
67
|
-
this.canDecompress = canDecompress;
|
60
|
+
this.size = size;
|
68
61
|
}
|
69
62
|
|
70
63
|
@JsonProperty("index")
|
@@ -73,28 +66,10 @@ public class PartialFileList
|
|
73
66
|
return index;
|
74
67
|
}
|
75
68
|
|
76
|
-
@JsonProperty("
|
77
|
-
public long getStart()
|
78
|
-
{
|
79
|
-
return start;
|
80
|
-
}
|
81
|
-
|
82
|
-
@JsonProperty("end")
|
83
|
-
public long getEnd()
|
84
|
-
{
|
85
|
-
return end;
|
86
|
-
}
|
87
|
-
|
88
|
-
@JsonProperty("can_decompress")
|
89
|
-
public boolean getCanDecompress()
|
90
|
-
{
|
91
|
-
return canDecompress;
|
92
|
-
}
|
93
|
-
|
94
|
-
@JsonIgnore
|
69
|
+
@JsonProperty("size")
|
95
70
|
public long getSize()
|
96
71
|
{
|
97
|
-
return
|
72
|
+
return size;
|
98
73
|
}
|
99
74
|
}
|
100
75
|
|
@@ -103,11 +78,11 @@ public class PartialFileList
|
|
103
78
|
private final ByteArrayOutputStream binary;
|
104
79
|
private final OutputStream stream;
|
105
80
|
private final List<Entry> entries = new ArrayList<>();
|
106
|
-
private
|
81
|
+
private TargetFileInfo last = null;
|
107
82
|
|
108
83
|
private int limitCount = Integer.MAX_VALUE;
|
109
|
-
private long minTaskSize =
|
110
|
-
private Pattern pathMatchPattern;
|
84
|
+
private long minTaskSize = 0L;
|
85
|
+
private Pattern pathMatchPattern = Pattern.compile(".*");
|
111
86
|
|
112
87
|
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
113
88
|
|
@@ -167,7 +142,7 @@ public class PartialFileList
|
|
167
142
|
}
|
168
143
|
|
169
144
|
// returns true if this file is used
|
170
|
-
public synchronized boolean add(
|
145
|
+
public synchronized boolean add(TargetFileInfo targetFileInfo)
|
171
146
|
{
|
172
147
|
// TODO throw IllegalStateException if stream is already closed
|
173
148
|
|
@@ -175,36 +150,36 @@ public class PartialFileList
|
|
175
150
|
return false;
|
176
151
|
}
|
177
152
|
|
178
|
-
if (!pathMatchPattern.matcher(
|
153
|
+
if (!pathMatchPattern.matcher(targetFileInfo.getPathString()).find()) {
|
179
154
|
return false;
|
180
155
|
}
|
181
156
|
|
182
157
|
int index = entries.size();
|
183
|
-
entries.add(new Entry(index,
|
158
|
+
entries.add(new Entry(index, targetFileInfo.getSize()));
|
184
159
|
|
185
|
-
byte[] data =
|
160
|
+
byte[] data = SerializationUtils.serialize(targetFileInfo);
|
186
161
|
castBuffer.putInt(0, data.length);
|
187
162
|
try {
|
188
163
|
stream.write(castBuffer.array());
|
189
164
|
stream.write(data);
|
190
165
|
}
|
191
|
-
catch (IOException
|
192
|
-
throw Throwables.propagate(
|
166
|
+
catch (IOException ex) {
|
167
|
+
throw Throwables.propagate(ex);
|
193
168
|
}
|
194
169
|
|
195
|
-
last =
|
170
|
+
last = targetFileInfo;
|
196
171
|
return true;
|
197
172
|
}
|
198
173
|
|
199
|
-
public
|
174
|
+
public TargetFileInfoList build()
|
200
175
|
{
|
201
176
|
try {
|
202
177
|
stream.close();
|
203
178
|
}
|
204
|
-
catch (IOException
|
205
|
-
throw Throwables.propagate(
|
179
|
+
catch (IOException ex) {
|
180
|
+
throw Throwables.propagate(ex);
|
206
181
|
}
|
207
|
-
return new
|
182
|
+
return new TargetFileInfoList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
|
208
183
|
}
|
209
184
|
|
210
185
|
private List<List<Entry>> getSplits(List<Entry> all)
|
@@ -230,13 +205,13 @@ public class PartialFileList
|
|
230
205
|
|
231
206
|
private final byte[] data;
|
232
207
|
private final List<List<Entry>> tasks;
|
233
|
-
private final Optional<
|
208
|
+
private final Optional<TargetFileInfo> last;
|
234
209
|
|
235
210
|
@JsonCreator
|
236
|
-
public
|
211
|
+
public TargetFileInfoList(
|
237
212
|
@JsonProperty("data") byte[] data,
|
238
213
|
@JsonProperty("tasks") List<List<Entry>> tasks,
|
239
|
-
@JsonProperty("last") Optional<
|
214
|
+
@JsonProperty("last") Optional<TargetFileInfo> last)
|
240
215
|
{
|
241
216
|
this.data = data;
|
242
217
|
this.tasks = tasks;
|
@@ -244,12 +219,18 @@ public class PartialFileList
|
|
244
219
|
}
|
245
220
|
|
246
221
|
@JsonIgnore
|
247
|
-
public
|
222
|
+
public static TargetFileInfoList.Builder builder(Task task)
|
223
|
+
{
|
224
|
+
return new TargetFileInfoList.Builder(task);
|
225
|
+
}
|
226
|
+
|
227
|
+
@JsonIgnore
|
228
|
+
public Optional<TargetFileInfo> getLastTargetFileInfo(Optional<TargetFileInfo> targetFileInfo)
|
248
229
|
{
|
249
230
|
if (last.isPresent()) {
|
250
231
|
return last;
|
251
232
|
}
|
252
|
-
return
|
233
|
+
return targetFileInfo;
|
253
234
|
}
|
254
235
|
|
255
236
|
@JsonIgnore
|
@@ -259,31 +240,34 @@ public class PartialFileList
|
|
259
240
|
}
|
260
241
|
|
261
242
|
@JsonIgnore
|
262
|
-
public List<
|
243
|
+
public List<TargetFileInfo> get(int i)
|
263
244
|
{
|
264
245
|
return new EntryList(data, tasks.get(i));
|
265
246
|
}
|
266
247
|
|
267
248
|
@JsonProperty("data")
|
249
|
+
@Deprecated
|
268
250
|
public byte[] getData()
|
269
251
|
{
|
270
252
|
return data;
|
271
253
|
}
|
272
254
|
|
273
255
|
@JsonProperty("tasks")
|
256
|
+
@Deprecated
|
274
257
|
public List<List<Entry>> getTasks()
|
275
258
|
{
|
276
259
|
return tasks;
|
277
260
|
}
|
278
261
|
|
279
262
|
@JsonProperty("last")
|
280
|
-
|
263
|
+
@Deprecated
|
264
|
+
public Optional<TargetFileInfo> getLast()
|
281
265
|
{
|
282
266
|
return last;
|
283
267
|
}
|
284
268
|
|
285
269
|
private class EntryList
|
286
|
-
extends AbstractList<
|
270
|
+
extends AbstractList<TargetFileInfo>
|
287
271
|
{
|
288
272
|
private final byte[] data;
|
289
273
|
private final List<Entry> entries;
|
@@ -299,34 +283,33 @@ public class PartialFileList
|
|
299
283
|
try {
|
300
284
|
this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
301
285
|
}
|
302
|
-
catch (IOException
|
303
|
-
throw Throwables.propagate(
|
286
|
+
catch (IOException ex) {
|
287
|
+
throw Throwables.propagate(ex);
|
304
288
|
}
|
305
289
|
this.current = 0;
|
306
290
|
}
|
307
291
|
|
308
292
|
@Override
|
309
|
-
public synchronized
|
293
|
+
public synchronized TargetFileInfo get(int i)
|
310
294
|
{
|
311
|
-
Entry
|
312
|
-
if (
|
295
|
+
Entry e = entries.get(i);
|
296
|
+
if (e.getIndex() < current) {
|
313
297
|
// rewind to the head
|
314
298
|
try {
|
315
299
|
stream.close();
|
316
300
|
stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
317
301
|
}
|
318
|
-
catch (IOException
|
319
|
-
throw Throwables.propagate(
|
302
|
+
catch (IOException ex) {
|
303
|
+
throw Throwables.propagate(ex);
|
320
304
|
}
|
321
305
|
current = 0;
|
322
306
|
}
|
323
307
|
|
324
|
-
while (current <
|
308
|
+
while (current < e.getIndex()) {
|
325
309
|
readNext();
|
326
310
|
}
|
327
311
|
// now current == e.getIndex()
|
328
|
-
return
|
329
|
-
entry.getStart(), entry.getEnd(), entry.getCanDecompress());
|
312
|
+
return readNextString();
|
330
313
|
}
|
331
314
|
|
332
315
|
@Override
|
@@ -347,14 +330,14 @@ public class PartialFileList
|
|
347
330
|
|
348
331
|
return b;
|
349
332
|
}
|
350
|
-
catch (IOException
|
351
|
-
throw Throwables.propagate(
|
333
|
+
catch (IOException ex) {
|
334
|
+
throw Throwables.propagate(ex);
|
352
335
|
}
|
353
336
|
}
|
354
337
|
|
355
|
-
private
|
338
|
+
private TargetFileInfo readNextString()
|
356
339
|
{
|
357
|
-
return
|
340
|
+
return (TargetFileInfo) SerializationUtils.deserialize(readNext());
|
358
341
|
}
|
359
342
|
}
|
360
|
-
}
|
343
|
+
}
|