embulk-input-hdfs 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
1
+ package org.embulk.input.hdfs;
2
+
3
+ import org.embulk.config.Config;
4
+ import org.embulk.config.ConfigDefault;
5
+ import org.embulk.config.ConfigInject;
6
+ import org.jruby.embed.ScriptingContainer;
7
+
8
+ /**
9
+ * Formats the current time, shifted back by a configured number of seconds,
10
+ * by evaluating Ruby's {@code Time#strftime} in a JRuby container.
11
+ */
12
+ public class Strftime
13
+ {
14
+ interface Task
15
+ {
16
+ // Seconds subtracted from the current time before formatting.
17
+ @Config("rewind_seconds")
18
+ @ConfigDefault("0")
19
+ int getRewindSeconds();
20
+
21
+ // JRuby container in which the strftime scriptlet is evaluated.
22
+ @ConfigInject
23
+ ScriptingContainer getJRuby();
24
+ }
25
+
26
+ private final int rewindSeconds;
27
+ private final ScriptingContainer jruby;
28
+
29
+ /**
30
+ * @param task supplies the rewind offset and the JRuby container
31
+ */
32
+ public Strftime(Task task)
33
+ {
34
+ this.rewindSeconds = task.getRewindSeconds();
35
+ this.jruby = task.getJRuby();
36
+ }
37
+
38
+ /**
39
+ * Formats {@code Time.now - rewindSeconds} with the given strftime pattern.
40
+ *
41
+ * @param format strftime pattern; embedded in the script as a Ruby single-quoted literal
42
+ * @return the string form of the scriptlet result
43
+ */
44
+ public String format(String format)
45
+ {
46
+ // Concatenate instead of String.format so the offset is rendered with
47
+ // ASCII digits regardless of the JVM default locale.
48
+ String script = "(Time.now - " + rewindSeconds + ").strftime('" + escapeSingleQuoted(format) + "')";
49
+ return jruby.runScriptlet(script).toString();
50
+ }
51
+
52
+ /**
53
+ * Escapes backslashes and single quotes so the value cannot terminate the
54
+ * Ruby single-quoted literal it is embedded in.
55
+ */
56
+ private static String escapeSingleQuoted(String value)
57
+ {
58
+ // String.valueOf keeps the earlier rendering of a null pattern as "null".
59
+ return String.valueOf(value).replace("\\", "\\\\").replace("'", "\\'");
60
+ }
61
+ }
@@ -0,0 +1,174 @@
1
+ package org.embulk.input.hdfs;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonIgnore;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+ import com.google.common.base.Optional;
7
+ import org.embulk.config.ConfigException;
8
+
9
+ import java.io.Serializable;
10
+ import java.lang.reflect.Field;
11
+
12
+ public class TargetFileInfo
13
+ implements Serializable
14
+ {
15
+ public static class Builder
16
+ {
17
+ private Optional<Long> start = Optional.absent();
18
+ private Optional<Long> end = Optional.absent();
19
+ private Optional<String> pathString = Optional.absent();
20
+ private Optional<Boolean> isDecompressible = Optional.absent();
21
+ private Optional<Boolean> isPartitionable = Optional.absent();
22
+ private Optional<Integer> numHeaderLines = Optional.absent();
23
+
24
+ public Builder()
25
+ {
26
+ }
27
+
28
+ public Builder start(long start)
29
+ {
30
+ this.start = Optional.of(start);
31
+ return this;
32
+ }
33
+
34
+ public Builder end(long end)
35
+ {
36
+ this.end = Optional.of(end);
37
+ return this;
38
+ }
39
+
40
+ public Builder pathString(String pathString)
41
+ {
42
+ this.pathString = Optional.of(pathString);
43
+ return this;
44
+ }
45
+
46
+ public Builder isDecompressible(boolean isDecompressible)
47
+ {
48
+ this.isDecompressible = Optional.of(isDecompressible);
49
+ return this;
50
+ }
51
+
52
+ public Builder isPartitionable(boolean isPartitionable)
53
+ {
54
+ this.isPartitionable = Optional.of(isPartitionable);
55
+ return this;
56
+ }
57
+
58
+ public Builder numHeaderLines(int numHeaderLines)
59
+ {
60
+ this.numHeaderLines = Optional.of(numHeaderLines);
61
+ return this;
62
+ }
63
+
64
+ public TargetFileInfo build()
65
+ {
66
+ try {
67
+ validate();
68
+ }
69
+ catch (IllegalAccessException | IllegalStateException e) {
70
+ throw new ConfigException(e);
71
+ }
72
+
73
+ return new TargetFileInfo(
74
+ pathString.get(), start.get(), end.get(),
75
+ isDecompressible.get(), isPartitionable.get(),
76
+ numHeaderLines.get());
77
+ }
78
+
79
+ private void validate()
80
+ throws IllegalAccessException, IllegalStateException
81
+ {
82
+ for (Field field : getClass().getDeclaredFields()) {
83
+ if (field.getType() != Optional.class) {
84
+ // for avoiding Z class by JUnit insertion.
85
+ continue;
86
+ }
87
+ Optional value = (Optional) field.get(this);
88
+ if (!value.isPresent()) {
89
+ String msg = String.format("field:%s is absent", field.getName());
90
+ throw new IllegalStateException(msg);
91
+ }
92
+ }
93
+
94
+ if (isDecompressible.get() && isPartitionable.get()) {
95
+ String msg = String.format("IllegalState: isDecompressible is true and isPartitionable is true: %s", pathString.get());
96
+ throw new IllegalStateException(msg);
97
+ }
98
+
99
+ if (isDecompressible.get() && start.get() != 0) {
100
+ String msg = String.format("IllegalState: isDecompressible is true, but start is not 0: %s", pathString.get());
101
+ throw new IllegalStateException(msg);
102
+ }
103
+ }
104
+ }
105
+
106
+ // private static final long serialVersionUID = 1L; // to suppress warnings?
107
+ private final long start;
108
+ private final long end;
109
+ private final String pathString;
110
+ private final boolean isDecompressible;
111
+ private final boolean isPartitionable;
112
+ private final int numHeaderLines;
113
+
114
+ @JsonCreator
115
+ private TargetFileInfo(
116
+ @JsonProperty("path_string") String pathString,
117
+ @JsonProperty("start") long start,
118
+ @JsonProperty("end") long end,
119
+ @JsonProperty("is_decompressible") boolean isDecompressible,
120
+ @JsonProperty("is_partitionable") boolean isPartitionable,
121
+ @JsonProperty("num_header_lines") int numHeaderLines)
122
+ {
123
+ this.pathString = pathString;
124
+ this.start = start;
125
+ this.end = end;
126
+ this.isDecompressible = isDecompressible;
127
+ this.isPartitionable = isPartitionable;
128
+ this.numHeaderLines = numHeaderLines;
129
+ }
130
+
131
+ @JsonProperty("start")
132
+ public long getStart()
133
+ {
134
+ return start;
135
+ }
136
+
137
+ @JsonProperty("end")
138
+ public long getEnd()
139
+ {
140
+ return end;
141
+ }
142
+
143
+ @JsonProperty("is_decompressible")
144
+ public boolean getIsDecompressible()
145
+ {
146
+ return isDecompressible;
147
+ }
148
+
149
+ @JsonProperty("is_partitionable")
150
+ public boolean getIsPartitionable()
151
+ {
152
+ return isPartitionable;
153
+ }
154
+
155
+ @JsonProperty("path_string")
156
+ public String getPathString()
157
+ {
158
+ return pathString;
159
+ }
160
+
161
+ @JsonProperty("num_header_lines")
162
+ public int getNumHeaderLines()
163
+ {
164
+ return numHeaderLines;
165
+ }
166
+
167
+ @JsonIgnore
168
+ public long getSize()
169
+ {
170
+ // NOTE: this size is reference value which
171
+ // becomes smaller than raw if the file is compressed.
172
+ return getEnd() - getStart();
173
+ }
174
+ }
@@ -1,35 +1,34 @@
1
1
  package org.embulk.input.hdfs;
2
2
 
3
- import java.util.List;
4
- import java.util.AbstractList;
5
- import java.util.ArrayList;
6
- import java.util.zip.GZIPInputStream;
7
- import java.util.zip.GZIPOutputStream;
8
- import java.util.regex.Pattern;
9
- import java.io.InputStream;
10
- import java.io.OutputStream;
11
- import java.io.BufferedOutputStream;
3
+ // Ported from https://github.com/embulk/embulk-input-s3/blob/b546158123a734acf0785d61400c69fcdd910ed6/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
4
+ // and Modified for this package.
5
+
6
+ import com.fasterxml.jackson.annotation.JsonCreator;
7
+ import com.fasterxml.jackson.annotation.JsonIgnore;
8
+ import com.fasterxml.jackson.annotation.JsonProperty;
9
+ import com.google.common.base.Optional;
10
+ import com.google.common.base.Throwables;
11
+ import org.apache.commons.lang.SerializationUtils;
12
+ import org.embulk.config.Config;
13
+ import org.embulk.config.ConfigDefault;
14
+ import org.embulk.config.ConfigSource;
15
+
12
16
  import java.io.BufferedInputStream;
17
+ import java.io.BufferedOutputStream;
13
18
  import java.io.ByteArrayInputStream;
14
19
  import java.io.ByteArrayOutputStream;
15
20
  import java.io.IOException;
21
+ import java.io.InputStream;
22
+ import java.io.OutputStream;
16
23
  import java.nio.ByteBuffer;
17
- import java.nio.charset.StandardCharsets;
18
- import org.embulk.config.Config;
19
- import org.embulk.config.ConfigDefault;
20
- import org.embulk.config.ConfigSource;
21
- import com.google.common.base.Throwables;
22
- import com.google.common.base.Optional;
23
- import com.fasterxml.jackson.annotation.JsonProperty;
24
- import com.fasterxml.jackson.annotation.JsonIgnore;
25
- import com.fasterxml.jackson.annotation.JsonCreator;
24
+ import java.util.AbstractList;
25
+ import java.util.ArrayList;
26
+ import java.util.List;
27
+ import java.util.regex.Pattern;
28
+ import java.util.zip.GZIPInputStream;
29
+ import java.util.zip.GZIPOutputStream;
26
30
 
27
- /**
28
- * Created by takahiro.nakayama on 2/20/16.
29
- * Ported from https://github.com/embulk/embulk-input-s3/blob/master/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
30
- * and Modified for this package.
31
- */
32
- public class PartialFileList
31
+ public class TargetFileInfoList
33
32
  {
34
33
  public interface Task
35
34
  {
@@ -50,21 +49,15 @@ public class PartialFileList
50
49
  public static class Entry
51
50
  {
52
51
  private int index;
53
- private long start;
54
- private long end;
55
- private boolean canDecompress;
52
+ private long size;
56
53
 
57
54
  @JsonCreator
58
55
  public Entry(
59
56
  @JsonProperty("index") int index,
60
- @JsonProperty("start") long start,
61
- @JsonProperty("end") long end,
62
- @JsonProperty("can_decompress") boolean canDecompress)
57
+ @JsonProperty("size") long size)
63
58
  {
64
59
  this.index = index;
65
- this.start = start;
66
- this.end = end;
67
- this.canDecompress = canDecompress;
60
+ this.size = size;
68
61
  }
69
62
 
70
63
  @JsonProperty("index")
@@ -73,28 +66,10 @@ public class PartialFileList
73
66
  return index;
74
67
  }
75
68
 
76
- @JsonProperty("start")
77
- public long getStart()
78
- {
79
- return start;
80
- }
81
-
82
- @JsonProperty("end")
83
- public long getEnd()
84
- {
85
- return end;
86
- }
87
-
88
- @JsonProperty("can_decompress")
89
- public boolean getCanDecompress()
90
- {
91
- return canDecompress;
92
- }
93
-
94
- @JsonIgnore
69
+ @JsonProperty("size")
95
70
  public long getSize()
96
71
  {
97
- return getEnd() - getStart();
72
+ return size;
98
73
  }
99
74
  }
100
75
 
@@ -103,11 +78,11 @@ public class PartialFileList
103
78
  private final ByteArrayOutputStream binary;
104
79
  private final OutputStream stream;
105
80
  private final List<Entry> entries = new ArrayList<>();
106
- private String last = null;
81
+ private TargetFileInfo last = null;
107
82
 
108
83
  private int limitCount = Integer.MAX_VALUE;
109
- private long minTaskSize = 1;
110
- private Pattern pathMatchPattern;
84
+ private long minTaskSize = 0L;
85
+ private Pattern pathMatchPattern = Pattern.compile(".*");
111
86
 
112
87
  private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
113
88
 
@@ -167,7 +142,7 @@ public class PartialFileList
167
142
  }
168
143
 
169
144
  // returns true if this file is used
170
- public synchronized boolean add(String path, long start, long end, boolean canDecompress)
145
+ public synchronized boolean add(TargetFileInfo targetFileInfo)
171
146
  {
172
147
  // TODO throw IllegalStateException if stream is already closed
173
148
 
@@ -175,36 +150,36 @@ public class PartialFileList
175
150
  return false;
176
151
  }
177
152
 
178
- if (!pathMatchPattern.matcher(path).find()) {
153
+ if (!pathMatchPattern.matcher(targetFileInfo.getPathString()).find()) {
179
154
  return false;
180
155
  }
181
156
 
182
157
  int index = entries.size();
183
- entries.add(new Entry(index, start, end, canDecompress));
158
+ entries.add(new Entry(index, targetFileInfo.getSize()));
184
159
 
185
- byte[] data = path.getBytes(StandardCharsets.UTF_8);
160
+ byte[] data = SerializationUtils.serialize(targetFileInfo);
186
161
  castBuffer.putInt(0, data.length);
187
162
  try {
188
163
  stream.write(castBuffer.array());
189
164
  stream.write(data);
190
165
  }
191
- catch (IOException e) {
192
- throw Throwables.propagate(e);
166
+ catch (IOException ex) {
167
+ throw Throwables.propagate(ex);
193
168
  }
194
169
 
195
- last = path;
170
+ last = targetFileInfo;
196
171
  return true;
197
172
  }
198
173
 
199
- public PartialFileList build()
174
+ public TargetFileInfoList build()
200
175
  {
201
176
  try {
202
177
  stream.close();
203
178
  }
204
- catch (IOException e) {
205
- throw Throwables.propagate(e);
179
+ catch (IOException ex) {
180
+ throw Throwables.propagate(ex);
206
181
  }
207
- return new PartialFileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
182
+ return new TargetFileInfoList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
208
183
  }
209
184
 
210
185
  private List<List<Entry>> getSplits(List<Entry> all)
@@ -230,13 +205,13 @@ public class PartialFileList
230
205
 
231
206
  private final byte[] data;
232
207
  private final List<List<Entry>> tasks;
233
- private final Optional<String> last;
208
+ private final Optional<TargetFileInfo> last;
234
209
 
235
210
  @JsonCreator
236
- public PartialFileList(
211
+ public TargetFileInfoList(
237
212
  @JsonProperty("data") byte[] data,
238
213
  @JsonProperty("tasks") List<List<Entry>> tasks,
239
- @JsonProperty("last") Optional<String> last)
214
+ @JsonProperty("last") Optional<TargetFileInfo> last)
240
215
  {
241
216
  this.data = data;
242
217
  this.tasks = tasks;
@@ -244,12 +219,18 @@ public class PartialFileList
244
219
  }
245
220
 
246
221
  @JsonIgnore
247
- public Optional<String> getLastPath(Optional<String> lastLastPath)
222
+ public static TargetFileInfoList.Builder builder(Task task)
223
+ {
224
+ return new TargetFileInfoList.Builder(task);
225
+ }
226
+
227
+ @JsonIgnore
228
+ public Optional<TargetFileInfo> getLastTargetFileInfo(Optional<TargetFileInfo> targetFileInfo)
248
229
  {
249
230
  if (last.isPresent()) {
250
231
  return last;
251
232
  }
252
- return lastLastPath;
233
+ return targetFileInfo;
253
234
  }
254
235
 
255
236
  @JsonIgnore
@@ -259,31 +240,34 @@ public class PartialFileList
259
240
  }
260
241
 
261
242
  @JsonIgnore
262
- public List<PartialFile> get(int i)
243
+ public List<TargetFileInfo> get(int i)
263
244
  {
264
245
  return new EntryList(data, tasks.get(i));
265
246
  }
266
247
 
267
248
  @JsonProperty("data")
249
+ @Deprecated
268
250
  public byte[] getData()
269
251
  {
270
252
  return data;
271
253
  }
272
254
 
273
255
  @JsonProperty("tasks")
256
+ @Deprecated
274
257
  public List<List<Entry>> getTasks()
275
258
  {
276
259
  return tasks;
277
260
  }
278
261
 
279
262
  @JsonProperty("last")
280
- public Optional<String> getLast()
263
+ @Deprecated
264
+ public Optional<TargetFileInfo> getLast()
281
265
  {
282
266
  return last;
283
267
  }
284
268
 
285
269
  private class EntryList
286
- extends AbstractList<PartialFile>
270
+ extends AbstractList<TargetFileInfo>
287
271
  {
288
272
  private final byte[] data;
289
273
  private final List<Entry> entries;
@@ -299,34 +283,33 @@ public class PartialFileList
299
283
  try {
300
284
  this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
301
285
  }
302
- catch (IOException e) {
303
- throw Throwables.propagate(e);
286
+ catch (IOException ex) {
287
+ throw Throwables.propagate(ex);
304
288
  }
305
289
  this.current = 0;
306
290
  }
307
291
 
308
292
  @Override
309
- public synchronized PartialFile get(int i)
293
+ public synchronized TargetFileInfo get(int i)
310
294
  {
311
- Entry entry = entries.get(i);
312
- if (entry.getIndex() < current) {
295
+ Entry e = entries.get(i);
296
+ if (e.getIndex() < current) {
313
297
  // rewind to the head
314
298
  try {
315
299
  stream.close();
316
300
  stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
317
301
  }
318
- catch (IOException e) {
319
- throw Throwables.propagate(e);
302
+ catch (IOException ex) {
303
+ throw Throwables.propagate(ex);
320
304
  }
321
305
  current = 0;
322
306
  }
323
307
 
324
- while (current < entry.getIndex()) {
308
+ while (current < e.getIndex()) {
325
309
  readNext();
326
310
  }
327
311
  // now current == e.getIndex()
328
- return new PartialFile(readNextString(),
329
- entry.getStart(), entry.getEnd(), entry.getCanDecompress());
312
+ return readNextString();
330
313
  }
331
314
 
332
315
  @Override
@@ -347,14 +330,14 @@ public class PartialFileList
347
330
 
348
331
  return b;
349
332
  }
350
- catch (IOException e) {
351
- throw Throwables.propagate(e);
333
+ catch (IOException ex) {
334
+ throw Throwables.propagate(ex);
352
335
  }
353
336
  }
354
337
 
355
- private String readNextString()
338
+ private TargetFileInfo readNextString()
356
339
  {
357
- return new String(readNext(), StandardCharsets.UTF_8);
340
+ return (TargetFileInfo) SerializationUtils.deserialize(readNext());
358
341
  }
359
342
  }
360
- }
343
+ }