embulk-input-hdfs 0.1.9 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
+package org.embulk.input.hdfs;
+
+import org.apache.hadoop.fs.Path;
+
+/**
+ * Created by takahiro.nakayama on 2/20/16.
+ * This is the same as PartialFileList.Entry, so maybe this class is not needed?
+ */
+public class PartialFile
+{
+    private final Path path;
+    private final long start;
+    private final long end;
+    private final boolean canDecompress;
+
+    public PartialFile(String path, long start, long end, boolean canDecompress)
+    {
+        this(new Path(path), start, end, canDecompress);
+    }
+
+    public PartialFile(Path path, long start, long end, boolean canDecompress)
+    {
+        this.path = path;
+        this.start = start;
+        this.end = end;
+        this.canDecompress = canDecompress;
+    }
+
+    public Path getPath()
+    {
+        return path;
+    }
+
+    public long getStart()
+    {
+        return start;
+    }
+
+    public long getEnd()
+    {
+        return end;
+    }
+
+    public boolean getCanDecompress()
+    {
+        return canDecompress;
+    }
+}
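
The new PartialFile class added above is a plain value object describing one processing unit: an HDFS path plus a byte range (start/end offsets) and a flag saying whether the file may be run through a decompression codec. A minimal construction sketch; the path and offsets here are hypothetical:

    // Hypothetical values: the first 128 MiB of one HDFS file, to be
    // decompressed if a matching codec is found for the path.
    PartialFile split = new PartialFile(
            "hdfs://namenode:8020/logs/access.log.gz",
            0L,              // start offset in bytes
            134217728L,      // end offset in bytes
            true);           // canDecompress
    org.apache.hadoop.fs.Path path = split.getPath();
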
@@ -5,9 +5,12 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
 
-// ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
-public class HdfsPartialFileInputStream
-        extends InputStream
+/**
+ * Created by takahiro.nakayama on 2/13/16.
+ * ref. https://github.com/hito4t/embulk-input-filesplit/blob/master/src/main/java/org/embulk/input/filesplit/PartialFileInputStream.java
+ */
+public class PartialFileInputStream
+        extends InputStream
 {
     private final PushbackInputStream original;
     private long start;
@@ -15,12 +18,14 @@ public class HdfsPartialFileInputStream
     private long current;
     private boolean eof;
 
-    public HdfsPartialFileInputStream(InputStream original, long start, long end)
+    public PartialFileInputStream(InputStream original, long start, long end)
     {
         this.original = new PushbackInputStream(new BufferedInputStream(original));
         this.start = start;
         this.end = end;
         current = -1;
+        // TODO: support LineTerminator
+        // this.lineTerminator
    }
 
     @Override
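
The renamed PartialFileInputStream (formerly HdfsPartialFileInputStream) has no HDFS dependency: it wraps any InputStream and exposes only the requested byte range. Going by the embulk-input-filesplit class referenced in its Javadoc, the cut points are aligned to line boundaries so no record is split across tasks. A hedged usage sketch; the local file and offsets are hypothetical:

    // Hypothetical: stream roughly the second half of a local log file.
    try (InputStream in = new PartialFileInputStream(
            new java.io.FileInputStream("/tmp/access.log"), 5000L, 10000L)) {
        byte[] buf = new byte[8192];
        for (int n; (n = in.read(buf)) > 0; ) {
            System.out.write(buf, 0, n);
        }
    }
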
@@ -0,0 +1,125 @@
+package org.embulk.input.hdfs;
+
+import com.google.common.base.Optional;
+import com.google.common.base.Throwables;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.io.compress.CodecPool;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.Decompressor;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+
+/**
+ * Created by takahiro.nakayama on 2/21/16.
+ */
+public class PartialFileInputStreamBuilder
+{
+    private static final Logger logger = Exec.getLogger(PartialFileInputStreamBuilder.class);
+    private final FileSystem fs;
+    private final PartialFile partialFile;
+    private int numHeaderLines = 0;
+
+    public PartialFileInputStreamBuilder(FileSystem fs, PartialFile partialFile)
+    {
+        this.fs = fs;
+        this.partialFile = partialFile;
+    }
+
+    public InputStream build()
+            throws IOException
+    {
+        logger.trace("path: {}, start: {}, end: {}, num_header_lines: {}",
+                partialFile.getPath(), partialFile.getStart(), partialFile.getEnd(), numHeaderLines);
+        if (partialFile.getStart() > 0 && numHeaderLines > 0) {
+            return new SequenceInputStream(createHeadersInputStream(), createPartialFileInputStream());
+        }
+        else {
+            return createPartialFileInputStream();
+        }
+    }
+
+    public PartialFileInputStreamBuilder withHeaders(int numHeaderLines)
+    {
+        this.numHeaderLines = numHeaderLines;
+        return this;
+    }
+
+    private InputStream createOriginalFileWrappedInputStream()
+    {
+        InputStream original = createOriginalFileInputStream();
+        CompressionCodec codec = new CompressionCodecFactory(fs.getConf()).getCodec(partialFile.getPath());
+        if (partialFile.getCanDecompress() && codec != null) {
+            try {
+                return codec.createInputStream(original);
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+        }
+        else {
+            return original;
+        }
+    }
+
+    private InputStream createOriginalFileInputStream()
+    {
+        try {
+            return fs.open(partialFile.getPath());
+        }
+        catch (IOException e) {
+            throw Throwables.propagate(e);
+        }
+    }
+
+    // memo: it might be good to also have a variant that creates the InputStream via CompressionCodec...
+    // otherwise the headers would get corrupted too... or maybe not.
+
+    private InputStream createPartialFileInputStream()
+    {
+        InputStream original = createOriginalFileWrappedInputStream();
+        return new PartialFileInputStream(original, partialFile.getStart(), partialFile.getEnd());
+    }
+
+    private InputStream createHeadersInputStream()
+            throws IOException
+    {
+        ByteArrayOutputStream header = new ByteArrayOutputStream();
+        int skippedHeaders = 0;
+        InputStream original = createOriginalFileWrappedInputStream();
+        try (BufferedInputStream in = new BufferedInputStream(original)) {
+            while (true) {
+                int c = in.read();
+                if (c < 0) {
+                    break;
+                }
+
+                header.write(c);
+
+                if (c == '\n') {
+                    skippedHeaders++;
+                }
+                else if (c == '\r') {
+                    int c2 = in.read();
+                    if (c2 == '\n') {
+                        header.write(c2);
+                    }
+                    skippedHeaders++;
+                }
+
+                if (skippedHeaders >= numHeaderLines) {
+                    break;
+                }
+            }
+        }
+        header.close();
+        return new ByteArrayInputStream(header.toByteArray());
+    }
+}
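
PartialFileInputStreamBuilder ties the pieces together: it opens the path from the FileSystem, optionally wraps it with a Hadoop CompressionCodec (when canDecompress is set and a codec matches the path), cuts out the partial range, and, for non-first splits, replays the header lines in front of the body via SequenceInputStream. A sketch of how a task might use it, with imports omitted and hypothetical path, offsets, and header count:

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // Hypothetical split: bytes 1024..2048 of an uncompressed file.
    PartialFile split = new PartialFile("/data/in/part-00000", 1024L, 2048L, false);

    // start > 0 and numHeaderLines > 0, so build() prepends the header line
    // before the partial body.
    try (InputStream in = new PartialFileInputStreamBuilder(fs, split)
            .withHeaders(1)
            .build()) {
        // hand `in` to the decoder/parser chain
    }
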
@@ -0,0 +1,360 @@
+package org.embulk.input.hdfs;
+
+import java.util.List;
+import java.util.AbstractList;
+import java.util.ArrayList;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+import java.util.regex.Pattern;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.BufferedOutputStream;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import org.embulk.config.Config;
+import org.embulk.config.ConfigDefault;
+import org.embulk.config.ConfigSource;
+import com.google.common.base.Throwables;
+import com.google.common.base.Optional;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonCreator;
+
+/**
+ * Created by takahiro.nakayama on 2/20/16.
+ * Ported from https://github.com/embulk/embulk-input-s3/blob/master/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
+ * and modified for this package.
+ */
+public class PartialFileList
+{
+    public interface Task
+    {
+        @Config("path_match_pattern")
+        @ConfigDefault("\".*\"")
+        String getPathMatchPattern();
+
+        @Config("total_file_count_limit")
+        @ConfigDefault("2147483647")
+        int getTotalFileCountLimit();
+
+        // TODO support more algorithms to combine tasks
+        @Config("min_task_size")
+        @ConfigDefault("0")
+        long getMinTaskSize();
+    }
+
+    public static class Entry
+    {
+        private int index;
+        private long start;
+        private long end;
+        private boolean canDecompress;
+
+        @JsonCreator
+        public Entry(
+                @JsonProperty("index") int index,
+                @JsonProperty("start") long start,
+                @JsonProperty("end") long end,
+                @JsonProperty("can_decompress") boolean canDecompress)
+        {
+            this.index = index;
+            this.start = start;
+            this.end = end;
+            this.canDecompress = canDecompress;
+        }
+
+        @JsonProperty("index")
+        public int getIndex()
+        {
+            return index;
+        }
+
+        @JsonProperty("start")
+        public long getStart()
+        {
+            return start;
+        }
+
+        @JsonProperty("end")
+        public long getEnd()
+        {
+            return end;
+        }
+
+        @JsonProperty("can_decompress")
+        public boolean getCanDecompress()
+        {
+            return canDecompress;
+        }
+
+        @JsonIgnore
+        public long getSize()
+        {
+            return getEnd() - getStart();
+        }
+    }
+
+    public static class Builder
+    {
+        private final ByteArrayOutputStream binary;
+        private final OutputStream stream;
+        private final List<Entry> entries = new ArrayList<>();
+        private String last = null;
+
+        private int limitCount = Integer.MAX_VALUE;
+        private long minTaskSize = 1;
+        private Pattern pathMatchPattern;
+
+        private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
+
+        public Builder(Task task)
+        {
+            this();
+            this.pathMatchPattern = Pattern.compile(task.getPathMatchPattern());
+            this.limitCount = task.getTotalFileCountLimit();
+            this.minTaskSize = task.getMinTaskSize();
+        }
+
+        public Builder(ConfigSource config)
+        {
+            this();
+            this.pathMatchPattern = Pattern.compile(config.get(String.class, "path_match_pattern", ".*"));
+            this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
+            this.minTaskSize = config.get(long.class, "min_task_size", 0L);
+        }
+
+        public Builder()
+        {
+            binary = new ByteArrayOutputStream();
+            try {
+                stream = new BufferedOutputStream(new GZIPOutputStream(binary));
+            }
+            catch (IOException ex) {
+                throw Throwables.propagate(ex);
+            }
+        }
+
+        public Builder limitTotalFileCount(int limitCount)
+        {
+            this.limitCount = limitCount;
+            return this;
+        }
+
+        public Builder minTaskSize(long bytes)
+        {
+            this.minTaskSize = bytes;
+            return this;
+        }
+
+        public Builder pathMatchPattern(String pattern)
+        {
+            this.pathMatchPattern = Pattern.compile(pattern);
+            return this;
+        }
+
+        public int size()
+        {
+            return entries.size();
+        }
+
+        public boolean needsMore()
+        {
+            return size() < limitCount;
+        }
+
+        // returns true if this file is used
+        public synchronized boolean add(String path, long start, long end, boolean canDecompress)
+        {
+            // TODO throw IllegalStateException if stream is already closed
+
+            if (!needsMore()) {
+                return false;
+            }
+
+            if (!pathMatchPattern.matcher(path).find()) {
+                return false;
+            }
+
+            int index = entries.size();
+            entries.add(new Entry(index, start, end, canDecompress));
+
+            byte[] data = path.getBytes(StandardCharsets.UTF_8);
+            castBuffer.putInt(0, data.length);
+            try {
+                stream.write(castBuffer.array());
+                stream.write(data);
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+
+            last = path;
+            return true;
+        }
+
+        public PartialFileList build()
+        {
+            try {
+                stream.close();
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+            return new PartialFileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
+        }
+
+        private List<List<Entry>> getSplits(List<Entry> all)
+        {
+            List<List<Entry>> tasks = new ArrayList<>();
+            long currentTaskSize = 0;
+            List<Entry> currentTask = new ArrayList<>();
+            for (Entry entry : all) {
+                currentTask.add(entry);
+                currentTaskSize += entry.getSize(); // TODO consider multiplying the size by cost_per_byte, and adding cost_per_file
+                if (currentTaskSize >= minTaskSize) {
+                    tasks.add(currentTask);
+                    currentTask = new ArrayList<>();
+                    currentTaskSize = 0;
+                }
+            }
+            if (!currentTask.isEmpty()) {
+                tasks.add(currentTask);
+            }
+            return tasks;
+        }
+    }
+
+    private final byte[] data;
+    private final List<List<Entry>> tasks;
+    private final Optional<String> last;
+
+    @JsonCreator
+    public PartialFileList(
+            @JsonProperty("data") byte[] data,
+            @JsonProperty("tasks") List<List<Entry>> tasks,
+            @JsonProperty("last") Optional<String> last)
+    {
+        this.data = data;
+        this.tasks = tasks;
+        this.last = last;
+    }
+
+    @JsonIgnore
+    public Optional<String> getLastPath(Optional<String> lastLastPath)
+    {
+        if (last.isPresent()) {
+            return last;
+        }
+        return lastLastPath;
+    }
+
+    @JsonIgnore
+    public int getTaskCount()
+    {
+        return tasks.size();
+    }
+
+    @JsonIgnore
+    public List<PartialFile> get(int i)
+    {
+        return new EntryList(data, tasks.get(i));
+    }
+
+    @JsonProperty("data")
+    public byte[] getData()
+    {
+        return data;
+    }
+
+    @JsonProperty("tasks")
+    public List<List<Entry>> getTasks()
+    {
+        return tasks;
+    }
+
+    @JsonProperty("last")
+    public Optional<String> getLast()
+    {
+        return last;
+    }
+
+    private class EntryList
+            extends AbstractList<PartialFile>
+    {
+        private final byte[] data;
+        private final List<Entry> entries;
+        private InputStream stream;
+        private int current;
+
+        private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
+
+        public EntryList(byte[] data, List<Entry> entries)
+        {
+            this.data = data;
+            this.entries = entries;
+            try {
+                this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+            this.current = 0;
+        }
+
+        @Override
+        public synchronized PartialFile get(int i)
+        {
+            Entry entry = entries.get(i);
+            if (entry.getIndex() < current) {
+                // rewind to the head
+                try {
+                    stream.close();
+                    stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
+                }
+                catch (IOException e) {
+                    throw Throwables.propagate(e);
+                }
+                current = 0;
+            }
+
+            while (current < entry.getIndex()) {
+                readNext();
+            }
+            // now current == entry.getIndex()
+            return new PartialFile(readNextString(),
+                    entry.getStart(), entry.getEnd(), entry.getCanDecompress());
+        }
+
+        @Override
+        public int size()
+        {
+            return entries.size();
+        }
+
+        private byte[] readNext()
+        {
+            try {
+                stream.read(castBuffer.array());
+                int n = castBuffer.getInt(0);
+                byte[] b = new byte[n]; // a pooled buffer could be used here, because the read data is discarded when this method is called only to skip entries (not via readNextString)
+                stream.read(b);
+
+                current++;
+
+                return b;
+            }
+            catch (IOException e) {
+                throw Throwables.propagate(e);
+            }
+        }
+
+        private String readNextString()
+        {
+            return new String(readNext(), StandardCharsets.UTF_8);
+        }
+    }
+}
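
PartialFileList is the form in which the split list travels from the control phase to the tasks: path strings are gzip-compressed into `data` as length-prefixed UTF-8, and entries are grouped into tasks until each group reaches min_task_size. A minimal end-to-end sketch; the file names and sizes are hypothetical:

    // Control side: collect splits, grouping them into ~128 MiB tasks.
    PartialFileList.Builder builder = new PartialFileList.Builder()
            .pathMatchPattern(".*\\.csv$")
            .minTaskSize(128L * 1024 * 1024);
    builder.add("/data/in/a.csv", 0L, 67108864L, false);
    builder.add("/data/in/b.csv", 0L, 100663296L, false);
    PartialFileList list = builder.build();

    // Task side: resolve each task's entries back into PartialFile objects.
    for (int i = 0; i < list.getTaskCount(); i++) {
        for (PartialFile file : list.get(i)) {
            // open each split with new PartialFileInputStreamBuilder(fs, file)
        }
    }
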