embulk-input-hdfs 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHENGELOG.md +5 -0
- data/README.md +20 -17
- data/build.gradle +1 -1
- data/src/main/java/org/embulk/input/hdfs/ConfigurationFactory.java +60 -0
- data/src/main/java/org/embulk/input/hdfs/HdfsFileInputPlugin.java +107 -200
- data/src/main/java/org/embulk/input/hdfs/Strftime.java +34 -0
- data/src/main/java/org/embulk/input/hdfs/TargetFileInfo.java +174 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileList.java → TargetFileInfoList.java} +73 -90
- data/src/main/java/org/embulk/input/hdfs/TargetFileInputStreamFactory.java +128 -0
- data/src/main/java/org/embulk/input/hdfs/{PartialFileInputStream.java → TargetFilePartialInputStream.java} +4 -6
- data/src/test/java/org/embulk/input/hdfs/TestHdfsFileInputPlugin.java +6 -8
- metadata +19 -18
- data/src/main/java/org/embulk/input/hdfs/ConfigurationBuilder.java +0 -82
- data/src/main/java/org/embulk/input/hdfs/PartialFile.java +0 -48
- data/src/main/java/org/embulk/input/hdfs/PartialFileInputStreamBuilder.java +0 -125
@@ -0,0 +1,34 @@
|
|
1
|
+
package org.embulk.input.hdfs;
|
2
|
+
|
3
|
+
import org.embulk.config.Config;
|
4
|
+
import org.embulk.config.ConfigDefault;
|
5
|
+
import org.embulk.config.ConfigInject;
|
6
|
+
import org.jruby.embed.ScriptingContainer;
|
7
|
+
|
8
|
+
public class Strftime
|
9
|
+
{
|
10
|
+
interface Task
|
11
|
+
{
|
12
|
+
@Config("rewind_seconds")
|
13
|
+
@ConfigDefault("0")
|
14
|
+
int getRewindSeconds();
|
15
|
+
|
16
|
+
@ConfigInject
|
17
|
+
ScriptingContainer getJRuby();
|
18
|
+
}
|
19
|
+
|
20
|
+
private final int rewindSeconds;
|
21
|
+
private final ScriptingContainer jruby;
|
22
|
+
|
23
|
+
public Strftime(Task task)
|
24
|
+
{
|
25
|
+
this.rewindSeconds = task.getRewindSeconds();
|
26
|
+
this.jruby = task.getJRuby();
|
27
|
+
}
|
28
|
+
|
29
|
+
public String format(String format)
|
30
|
+
{
|
31
|
+
String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
|
32
|
+
return jruby.runScriptlet(script).toString();
|
33
|
+
}
|
34
|
+
}
|
@@ -0,0 +1,174 @@
|
|
1
|
+
package org.embulk.input.hdfs;
|
2
|
+
|
3
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
4
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
5
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
6
|
+
import com.google.common.base.Optional;
|
7
|
+
import org.embulk.config.ConfigException;
|
8
|
+
|
9
|
+
import java.io.Serializable;
|
10
|
+
import java.lang.reflect.Field;
|
11
|
+
|
12
|
+
public class TargetFileInfo
|
13
|
+
implements Serializable
|
14
|
+
{
|
15
|
+
public static class Builder
|
16
|
+
{
|
17
|
+
private Optional<Long> start = Optional.absent();
|
18
|
+
private Optional<Long> end = Optional.absent();
|
19
|
+
private Optional<String> pathString = Optional.absent();
|
20
|
+
private Optional<Boolean> isDecompressible = Optional.absent();
|
21
|
+
private Optional<Boolean> isPartitionable = Optional.absent();
|
22
|
+
private Optional<Integer> numHeaderLines = Optional.absent();
|
23
|
+
|
24
|
+
public Builder()
|
25
|
+
{
|
26
|
+
}
|
27
|
+
|
28
|
+
public Builder start(long start)
|
29
|
+
{
|
30
|
+
this.start = Optional.of(start);
|
31
|
+
return this;
|
32
|
+
}
|
33
|
+
|
34
|
+
public Builder end(long end)
|
35
|
+
{
|
36
|
+
this.end = Optional.of(end);
|
37
|
+
return this;
|
38
|
+
}
|
39
|
+
|
40
|
+
public Builder pathString(String pathString)
|
41
|
+
{
|
42
|
+
this.pathString = Optional.of(pathString);
|
43
|
+
return this;
|
44
|
+
}
|
45
|
+
|
46
|
+
public Builder isDecompressible(boolean isDecompressible)
|
47
|
+
{
|
48
|
+
this.isDecompressible = Optional.of(isDecompressible);
|
49
|
+
return this;
|
50
|
+
}
|
51
|
+
|
52
|
+
public Builder isPartitionable(boolean isPartitionable)
|
53
|
+
{
|
54
|
+
this.isPartitionable = Optional.of(isPartitionable);
|
55
|
+
return this;
|
56
|
+
}
|
57
|
+
|
58
|
+
public Builder numHeaderLines(int numHeaderLines)
|
59
|
+
{
|
60
|
+
this.numHeaderLines = Optional.of(numHeaderLines);
|
61
|
+
return this;
|
62
|
+
}
|
63
|
+
|
64
|
+
public TargetFileInfo build()
|
65
|
+
{
|
66
|
+
try {
|
67
|
+
validate();
|
68
|
+
}
|
69
|
+
catch (IllegalAccessException | IllegalStateException e) {
|
70
|
+
throw new ConfigException(e);
|
71
|
+
}
|
72
|
+
|
73
|
+
return new TargetFileInfo(
|
74
|
+
pathString.get(), start.get(), end.get(),
|
75
|
+
isDecompressible.get(), isPartitionable.get(),
|
76
|
+
numHeaderLines.get());
|
77
|
+
}
|
78
|
+
|
79
|
+
private void validate()
|
80
|
+
throws IllegalAccessException, IllegalStateException
|
81
|
+
{
|
82
|
+
for (Field field : getClass().getDeclaredFields()) {
|
83
|
+
if (field.getType() != Optional.class) {
|
84
|
+
// for avoiding Z class by JUnit insertion.
|
85
|
+
continue;
|
86
|
+
}
|
87
|
+
Optional value = (Optional) field.get(this);
|
88
|
+
if (!value.isPresent()) {
|
89
|
+
String msg = String.format("field:%s is absent", field.getName());
|
90
|
+
throw new IllegalStateException(msg);
|
91
|
+
}
|
92
|
+
}
|
93
|
+
|
94
|
+
if (isDecompressible.get() && isPartitionable.get()) {
|
95
|
+
String msg = String.format("IllegalState: isDecompressible is true and isPartitionable is true: %s", pathString.get());
|
96
|
+
throw new IllegalStateException(msg);
|
97
|
+
}
|
98
|
+
|
99
|
+
if (isDecompressible.get() && start.get() != 0) {
|
100
|
+
String msg = String.format("IllegalState: isDecompressible is true, but start is not 0: %s", pathString.get());
|
101
|
+
throw new IllegalStateException(msg);
|
102
|
+
}
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
// private static final long serialVersionUID = 1L; // to suppress warnings?
|
107
|
+
private final long start;
|
108
|
+
private final long end;
|
109
|
+
private final String pathString;
|
110
|
+
private final boolean isDecompressible;
|
111
|
+
private final boolean isPartitionable;
|
112
|
+
private final int numHeaderLines;
|
113
|
+
|
114
|
+
@JsonCreator
|
115
|
+
private TargetFileInfo(
|
116
|
+
@JsonProperty("path_string") String pathString,
|
117
|
+
@JsonProperty("start") long start,
|
118
|
+
@JsonProperty("end") long end,
|
119
|
+
@JsonProperty("is_decompressible") boolean isDecompressible,
|
120
|
+
@JsonProperty("is_partitionable") boolean isPartitionable,
|
121
|
+
@JsonProperty("num_header_lines") int numHeaderLines)
|
122
|
+
{
|
123
|
+
this.pathString = pathString;
|
124
|
+
this.start = start;
|
125
|
+
this.end = end;
|
126
|
+
this.isDecompressible = isDecompressible;
|
127
|
+
this.isPartitionable = isPartitionable;
|
128
|
+
this.numHeaderLines = numHeaderLines;
|
129
|
+
}
|
130
|
+
|
131
|
+
@JsonProperty("start")
|
132
|
+
public long getStart()
|
133
|
+
{
|
134
|
+
return start;
|
135
|
+
}
|
136
|
+
|
137
|
+
@JsonProperty("end")
|
138
|
+
public long getEnd()
|
139
|
+
{
|
140
|
+
return end;
|
141
|
+
}
|
142
|
+
|
143
|
+
@JsonProperty("is_decompressible")
|
144
|
+
public boolean getIsDecompressible()
|
145
|
+
{
|
146
|
+
return isDecompressible;
|
147
|
+
}
|
148
|
+
|
149
|
+
@JsonProperty("is_partitionable")
|
150
|
+
public boolean getIsPartitionable()
|
151
|
+
{
|
152
|
+
return isPartitionable;
|
153
|
+
}
|
154
|
+
|
155
|
+
@JsonProperty("path_string")
|
156
|
+
public String getPathString()
|
157
|
+
{
|
158
|
+
return pathString;
|
159
|
+
}
|
160
|
+
|
161
|
+
@JsonProperty("num_header_lines")
|
162
|
+
public int getNumHeaderLines()
|
163
|
+
{
|
164
|
+
return numHeaderLines;
|
165
|
+
}
|
166
|
+
|
167
|
+
@JsonIgnore
|
168
|
+
public long getSize()
|
169
|
+
{
|
170
|
+
// NOTE: this size is reference value which
|
171
|
+
// becomes smaller than raw if the file is compressed.
|
172
|
+
return getEnd() - getStart();
|
173
|
+
}
|
174
|
+
}
|
@@ -1,35 +1,34 @@
|
|
1
1
|
package org.embulk.input.hdfs;
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
import
|
7
|
-
import
|
8
|
-
import
|
9
|
-
import
|
10
|
-
import
|
11
|
-
import
|
3
|
+
// Ported from https://github.com/embulk/embulk-input-s3/blob/b546158123a734acf0785d61400c69fcdd910ed6/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
|
4
|
+
// and Modified for this package.
|
5
|
+
|
6
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
7
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
8
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
9
|
+
import com.google.common.base.Optional;
|
10
|
+
import com.google.common.base.Throwables;
|
11
|
+
import org.apache.commons.lang.SerializationUtils;
|
12
|
+
import org.embulk.config.Config;
|
13
|
+
import org.embulk.config.ConfigDefault;
|
14
|
+
import org.embulk.config.ConfigSource;
|
15
|
+
|
12
16
|
import java.io.BufferedInputStream;
|
17
|
+
import java.io.BufferedOutputStream;
|
13
18
|
import java.io.ByteArrayInputStream;
|
14
19
|
import java.io.ByteArrayOutputStream;
|
15
20
|
import java.io.IOException;
|
21
|
+
import java.io.InputStream;
|
22
|
+
import java.io.OutputStream;
|
16
23
|
import java.nio.ByteBuffer;
|
17
|
-
import java.
|
18
|
-
import
|
19
|
-
import
|
20
|
-
import
|
21
|
-
import
|
22
|
-
import
|
23
|
-
import com.fasterxml.jackson.annotation.JsonProperty;
|
24
|
-
import com.fasterxml.jackson.annotation.JsonIgnore;
|
25
|
-
import com.fasterxml.jackson.annotation.JsonCreator;
|
24
|
+
import java.util.AbstractList;
|
25
|
+
import java.util.ArrayList;
|
26
|
+
import java.util.List;
|
27
|
+
import java.util.regex.Pattern;
|
28
|
+
import java.util.zip.GZIPInputStream;
|
29
|
+
import java.util.zip.GZIPOutputStream;
|
26
30
|
|
27
|
-
|
28
|
-
* Created by takahiro.nakayama on 2/20/16.
|
29
|
-
* Ported from https://github.com/embulk/embulk-input-s3/blob/master/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
|
30
|
-
* and Modified for this package.
|
31
|
-
*/
|
32
|
-
public class PartialFileList
|
31
|
+
public class TargetFileInfoList
|
33
32
|
{
|
34
33
|
public interface Task
|
35
34
|
{
|
@@ -50,21 +49,15 @@ public class PartialFileList
|
|
50
49
|
public static class Entry
|
51
50
|
{
|
52
51
|
private int index;
|
53
|
-
private long
|
54
|
-
private long end;
|
55
|
-
private boolean canDecompress;
|
52
|
+
private long size;
|
56
53
|
|
57
54
|
@JsonCreator
|
58
55
|
public Entry(
|
59
56
|
@JsonProperty("index") int index,
|
60
|
-
@JsonProperty("
|
61
|
-
@JsonProperty("end") long end,
|
62
|
-
@JsonProperty("can_decompress") boolean canDecompress)
|
57
|
+
@JsonProperty("size") long size)
|
63
58
|
{
|
64
59
|
this.index = index;
|
65
|
-
this.
|
66
|
-
this.end = end;
|
67
|
-
this.canDecompress = canDecompress;
|
60
|
+
this.size = size;
|
68
61
|
}
|
69
62
|
|
70
63
|
@JsonProperty("index")
|
@@ -73,28 +66,10 @@ public class PartialFileList
|
|
73
66
|
return index;
|
74
67
|
}
|
75
68
|
|
76
|
-
@JsonProperty("
|
77
|
-
public long getStart()
|
78
|
-
{
|
79
|
-
return start;
|
80
|
-
}
|
81
|
-
|
82
|
-
@JsonProperty("end")
|
83
|
-
public long getEnd()
|
84
|
-
{
|
85
|
-
return end;
|
86
|
-
}
|
87
|
-
|
88
|
-
@JsonProperty("can_decompress")
|
89
|
-
public boolean getCanDecompress()
|
90
|
-
{
|
91
|
-
return canDecompress;
|
92
|
-
}
|
93
|
-
|
94
|
-
@JsonIgnore
|
69
|
+
@JsonProperty("size")
|
95
70
|
public long getSize()
|
96
71
|
{
|
97
|
-
return
|
72
|
+
return size;
|
98
73
|
}
|
99
74
|
}
|
100
75
|
|
@@ -103,11 +78,11 @@ public class PartialFileList
|
|
103
78
|
private final ByteArrayOutputStream binary;
|
104
79
|
private final OutputStream stream;
|
105
80
|
private final List<Entry> entries = new ArrayList<>();
|
106
|
-
private
|
81
|
+
private TargetFileInfo last = null;
|
107
82
|
|
108
83
|
private int limitCount = Integer.MAX_VALUE;
|
109
|
-
private long minTaskSize =
|
110
|
-
private Pattern pathMatchPattern;
|
84
|
+
private long minTaskSize = 0L;
|
85
|
+
private Pattern pathMatchPattern = Pattern.compile(".*");
|
111
86
|
|
112
87
|
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
113
88
|
|
@@ -167,7 +142,7 @@ public class PartialFileList
|
|
167
142
|
}
|
168
143
|
|
169
144
|
// returns true if this file is used
|
170
|
-
public synchronized boolean add(
|
145
|
+
public synchronized boolean add(TargetFileInfo targetFileInfo)
|
171
146
|
{
|
172
147
|
// TODO throw IllegalStateException if stream is already closed
|
173
148
|
|
@@ -175,36 +150,36 @@ public class PartialFileList
|
|
175
150
|
return false;
|
176
151
|
}
|
177
152
|
|
178
|
-
if (!pathMatchPattern.matcher(
|
153
|
+
if (!pathMatchPattern.matcher(targetFileInfo.getPathString()).find()) {
|
179
154
|
return false;
|
180
155
|
}
|
181
156
|
|
182
157
|
int index = entries.size();
|
183
|
-
entries.add(new Entry(index,
|
158
|
+
entries.add(new Entry(index, targetFileInfo.getSize()));
|
184
159
|
|
185
|
-
byte[] data =
|
160
|
+
byte[] data = SerializationUtils.serialize(targetFileInfo);
|
186
161
|
castBuffer.putInt(0, data.length);
|
187
162
|
try {
|
188
163
|
stream.write(castBuffer.array());
|
189
164
|
stream.write(data);
|
190
165
|
}
|
191
|
-
catch (IOException
|
192
|
-
throw Throwables.propagate(
|
166
|
+
catch (IOException ex) {
|
167
|
+
throw Throwables.propagate(ex);
|
193
168
|
}
|
194
169
|
|
195
|
-
last =
|
170
|
+
last = targetFileInfo;
|
196
171
|
return true;
|
197
172
|
}
|
198
173
|
|
199
|
-
public
|
174
|
+
public TargetFileInfoList build()
|
200
175
|
{
|
201
176
|
try {
|
202
177
|
stream.close();
|
203
178
|
}
|
204
|
-
catch (IOException
|
205
|
-
throw Throwables.propagate(
|
179
|
+
catch (IOException ex) {
|
180
|
+
throw Throwables.propagate(ex);
|
206
181
|
}
|
207
|
-
return new
|
182
|
+
return new TargetFileInfoList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
|
208
183
|
}
|
209
184
|
|
210
185
|
private List<List<Entry>> getSplits(List<Entry> all)
|
@@ -230,13 +205,13 @@ public class PartialFileList
|
|
230
205
|
|
231
206
|
private final byte[] data;
|
232
207
|
private final List<List<Entry>> tasks;
|
233
|
-
private final Optional<
|
208
|
+
private final Optional<TargetFileInfo> last;
|
234
209
|
|
235
210
|
@JsonCreator
|
236
|
-
public
|
211
|
+
public TargetFileInfoList(
|
237
212
|
@JsonProperty("data") byte[] data,
|
238
213
|
@JsonProperty("tasks") List<List<Entry>> tasks,
|
239
|
-
@JsonProperty("last") Optional<
|
214
|
+
@JsonProperty("last") Optional<TargetFileInfo> last)
|
240
215
|
{
|
241
216
|
this.data = data;
|
242
217
|
this.tasks = tasks;
|
@@ -244,12 +219,18 @@ public class PartialFileList
|
|
244
219
|
}
|
245
220
|
|
246
221
|
@JsonIgnore
|
247
|
-
public
|
222
|
+
public static TargetFileInfoList.Builder builder(Task task)
|
223
|
+
{
|
224
|
+
return new TargetFileInfoList.Builder(task);
|
225
|
+
}
|
226
|
+
|
227
|
+
@JsonIgnore
|
228
|
+
public Optional<TargetFileInfo> getLastTargetFileInfo(Optional<TargetFileInfo> targetFileInfo)
|
248
229
|
{
|
249
230
|
if (last.isPresent()) {
|
250
231
|
return last;
|
251
232
|
}
|
252
|
-
return
|
233
|
+
return targetFileInfo;
|
253
234
|
}
|
254
235
|
|
255
236
|
@JsonIgnore
|
@@ -259,31 +240,34 @@ public class PartialFileList
|
|
259
240
|
}
|
260
241
|
|
261
242
|
@JsonIgnore
|
262
|
-
public List<
|
243
|
+
public List<TargetFileInfo> get(int i)
|
263
244
|
{
|
264
245
|
return new EntryList(data, tasks.get(i));
|
265
246
|
}
|
266
247
|
|
267
248
|
@JsonProperty("data")
|
249
|
+
@Deprecated
|
268
250
|
public byte[] getData()
|
269
251
|
{
|
270
252
|
return data;
|
271
253
|
}
|
272
254
|
|
273
255
|
@JsonProperty("tasks")
|
256
|
+
@Deprecated
|
274
257
|
public List<List<Entry>> getTasks()
|
275
258
|
{
|
276
259
|
return tasks;
|
277
260
|
}
|
278
261
|
|
279
262
|
@JsonProperty("last")
|
280
|
-
|
263
|
+
@Deprecated
|
264
|
+
public Optional<TargetFileInfo> getLast()
|
281
265
|
{
|
282
266
|
return last;
|
283
267
|
}
|
284
268
|
|
285
269
|
private class EntryList
|
286
|
-
extends AbstractList<
|
270
|
+
extends AbstractList<TargetFileInfo>
|
287
271
|
{
|
288
272
|
private final byte[] data;
|
289
273
|
private final List<Entry> entries;
|
@@ -299,34 +283,33 @@ public class PartialFileList
|
|
299
283
|
try {
|
300
284
|
this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
301
285
|
}
|
302
|
-
catch (IOException
|
303
|
-
throw Throwables.propagate(
|
286
|
+
catch (IOException ex) {
|
287
|
+
throw Throwables.propagate(ex);
|
304
288
|
}
|
305
289
|
this.current = 0;
|
306
290
|
}
|
307
291
|
|
308
292
|
@Override
|
309
|
-
public synchronized
|
293
|
+
public synchronized TargetFileInfo get(int i)
|
310
294
|
{
|
311
|
-
Entry
|
312
|
-
if (
|
295
|
+
Entry e = entries.get(i);
|
296
|
+
if (e.getIndex() < current) {
|
313
297
|
// rewind to the head
|
314
298
|
try {
|
315
299
|
stream.close();
|
316
300
|
stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
317
301
|
}
|
318
|
-
catch (IOException
|
319
|
-
throw Throwables.propagate(
|
302
|
+
catch (IOException ex) {
|
303
|
+
throw Throwables.propagate(ex);
|
320
304
|
}
|
321
305
|
current = 0;
|
322
306
|
}
|
323
307
|
|
324
|
-
while (current <
|
308
|
+
while (current < e.getIndex()) {
|
325
309
|
readNext();
|
326
310
|
}
|
327
311
|
// now current == e.getIndex()
|
328
|
-
return
|
329
|
-
entry.getStart(), entry.getEnd(), entry.getCanDecompress());
|
312
|
+
return readNextString();
|
330
313
|
}
|
331
314
|
|
332
315
|
@Override
|
@@ -347,14 +330,14 @@ public class PartialFileList
|
|
347
330
|
|
348
331
|
return b;
|
349
332
|
}
|
350
|
-
catch (IOException
|
351
|
-
throw Throwables.propagate(
|
333
|
+
catch (IOException ex) {
|
334
|
+
throw Throwables.propagate(ex);
|
352
335
|
}
|
353
336
|
}
|
354
337
|
|
355
|
-
private
|
338
|
+
private TargetFileInfo readNextString()
|
356
339
|
{
|
357
|
-
return
|
340
|
+
return (TargetFileInfo) SerializationUtils.deserialize(readNext());
|
358
341
|
}
|
359
342
|
}
|
360
|
-
}
|
343
|
+
}
|