embulk-input-hdfs 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,34 @@
1
+ package org.embulk.input.hdfs;
2
+
3
+ import org.embulk.config.Config;
4
+ import org.embulk.config.ConfigDefault;
5
+ import org.embulk.config.ConfigInject;
6
+ import org.jruby.embed.ScriptingContainer;
7
+
8
+ public class Strftime
9
+ {
10
+ interface Task
11
+ {
12
+ @Config("rewind_seconds")
13
+ @ConfigDefault("0")
14
+ int getRewindSeconds();
15
+
16
+ @ConfigInject
17
+ ScriptingContainer getJRuby();
18
+ }
19
+
20
+ private final int rewindSeconds;
21
+ private final ScriptingContainer jruby;
22
+
23
+ public Strftime(Task task)
24
+ {
25
+ this.rewindSeconds = task.getRewindSeconds();
26
+ this.jruby = task.getJRuby();
27
+ }
28
+
29
+ public String format(String format)
30
+ {
31
+ String script = String.format("(Time.now - %d).strftime('%s')", rewindSeconds, format);
32
+ return jruby.runScriptlet(script).toString();
33
+ }
34
+ }
@@ -0,0 +1,174 @@
1
+ package org.embulk.input.hdfs;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonIgnore;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+ import com.google.common.base.Optional;
7
+ import org.embulk.config.ConfigException;
8
+
9
+ import java.io.Serializable;
10
+ import java.lang.reflect.Field;
11
+
12
+ public class TargetFileInfo
13
+ implements Serializable
14
+ {
15
+ public static class Builder
16
+ {
17
+ private Optional<Long> start = Optional.absent();
18
+ private Optional<Long> end = Optional.absent();
19
+ private Optional<String> pathString = Optional.absent();
20
+ private Optional<Boolean> isDecompressible = Optional.absent();
21
+ private Optional<Boolean> isPartitionable = Optional.absent();
22
+ private Optional<Integer> numHeaderLines = Optional.absent();
23
+
24
+ public Builder()
25
+ {
26
+ }
27
+
28
+ public Builder start(long start)
29
+ {
30
+ this.start = Optional.of(start);
31
+ return this;
32
+ }
33
+
34
+ public Builder end(long end)
35
+ {
36
+ this.end = Optional.of(end);
37
+ return this;
38
+ }
39
+
40
+ public Builder pathString(String pathString)
41
+ {
42
+ this.pathString = Optional.of(pathString);
43
+ return this;
44
+ }
45
+
46
+ public Builder isDecompressible(boolean isDecompressible)
47
+ {
48
+ this.isDecompressible = Optional.of(isDecompressible);
49
+ return this;
50
+ }
51
+
52
+ public Builder isPartitionable(boolean isPartitionable)
53
+ {
54
+ this.isPartitionable = Optional.of(isPartitionable);
55
+ return this;
56
+ }
57
+
58
+ public Builder numHeaderLines(int numHeaderLines)
59
+ {
60
+ this.numHeaderLines = Optional.of(numHeaderLines);
61
+ return this;
62
+ }
63
+
64
+ public TargetFileInfo build()
65
+ {
66
+ try {
67
+ validate();
68
+ }
69
+ catch (IllegalAccessException | IllegalStateException e) {
70
+ throw new ConfigException(e);
71
+ }
72
+
73
+ return new TargetFileInfo(
74
+ pathString.get(), start.get(), end.get(),
75
+ isDecompressible.get(), isPartitionable.get(),
76
+ numHeaderLines.get());
77
+ }
78
+
79
+ private void validate()
80
+ throws IllegalAccessException, IllegalStateException
81
+ {
82
+ for (Field field : getClass().getDeclaredFields()) {
83
+ if (field.getType() != Optional.class) {
84
+ // for avoiding Z class by JUnit insertion.
85
+ continue;
86
+ }
87
+ Optional value = (Optional) field.get(this);
88
+ if (!value.isPresent()) {
89
+ String msg = String.format("field:%s is absent", field.getName());
90
+ throw new IllegalStateException(msg);
91
+ }
92
+ }
93
+
94
+ if (isDecompressible.get() && isPartitionable.get()) {
95
+ String msg = String.format("IllegalState: isDecompressible is true and isPartitionable is true: %s", pathString.get());
96
+ throw new IllegalStateException(msg);
97
+ }
98
+
99
+ if (isDecompressible.get() && start.get() != 0) {
100
+ String msg = String.format("IllegalState: isDecompressible is true, but start is not 0: %s", pathString.get());
101
+ throw new IllegalStateException(msg);
102
+ }
103
+ }
104
+ }
105
+
106
+ // private static final long serialVersionUID = 1L; // to suppress warnings?
107
+ private final long start;
108
+ private final long end;
109
+ private final String pathString;
110
+ private final boolean isDecompressible;
111
+ private final boolean isPartitionable;
112
+ private final int numHeaderLines;
113
+
114
+ @JsonCreator
115
+ private TargetFileInfo(
116
+ @JsonProperty("path_string") String pathString,
117
+ @JsonProperty("start") long start,
118
+ @JsonProperty("end") long end,
119
+ @JsonProperty("is_decompressible") boolean isDecompressible,
120
+ @JsonProperty("is_partitionable") boolean isPartitionable,
121
+ @JsonProperty("num_header_lines") int numHeaderLines)
122
+ {
123
+ this.pathString = pathString;
124
+ this.start = start;
125
+ this.end = end;
126
+ this.isDecompressible = isDecompressible;
127
+ this.isPartitionable = isPartitionable;
128
+ this.numHeaderLines = numHeaderLines;
129
+ }
130
+
131
+ @JsonProperty("start")
132
+ public long getStart()
133
+ {
134
+ return start;
135
+ }
136
+
137
+ @JsonProperty("end")
138
+ public long getEnd()
139
+ {
140
+ return end;
141
+ }
142
+
143
+ @JsonProperty("is_decompressible")
144
+ public boolean getIsDecompressible()
145
+ {
146
+ return isDecompressible;
147
+ }
148
+
149
+ @JsonProperty("is_partitionable")
150
+ public boolean getIsPartitionable()
151
+ {
152
+ return isPartitionable;
153
+ }
154
+
155
+ @JsonProperty("path_string")
156
+ public String getPathString()
157
+ {
158
+ return pathString;
159
+ }
160
+
161
+ @JsonProperty("num_header_lines")
162
+ public int getNumHeaderLines()
163
+ {
164
+ return numHeaderLines;
165
+ }
166
+
167
+ @JsonIgnore
168
+ public long getSize()
169
+ {
170
+ // NOTE: this size is reference value which
171
+ // becomes smaller than raw if the file is compressed.
172
+ return getEnd() - getStart();
173
+ }
174
+ }
@@ -1,35 +1,34 @@
1
1
  package org.embulk.input.hdfs;
2
2
 
3
- import java.util.List;
4
- import java.util.AbstractList;
5
- import java.util.ArrayList;
6
- import java.util.zip.GZIPInputStream;
7
- import java.util.zip.GZIPOutputStream;
8
- import java.util.regex.Pattern;
9
- import java.io.InputStream;
10
- import java.io.OutputStream;
11
- import java.io.BufferedOutputStream;
3
+ // Ported from https://github.com/embulk/embulk-input-s3/blob/b546158123a734acf0785d61400c69fcdd910ed6/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
4
+ // and Modified for this package.
5
+
6
+ import com.fasterxml.jackson.annotation.JsonCreator;
7
+ import com.fasterxml.jackson.annotation.JsonIgnore;
8
+ import com.fasterxml.jackson.annotation.JsonProperty;
9
+ import com.google.common.base.Optional;
10
+ import com.google.common.base.Throwables;
11
+ import org.apache.commons.lang.SerializationUtils;
12
+ import org.embulk.config.Config;
13
+ import org.embulk.config.ConfigDefault;
14
+ import org.embulk.config.ConfigSource;
15
+
12
16
  import java.io.BufferedInputStream;
17
+ import java.io.BufferedOutputStream;
13
18
  import java.io.ByteArrayInputStream;
14
19
  import java.io.ByteArrayOutputStream;
15
20
  import java.io.IOException;
21
+ import java.io.InputStream;
22
+ import java.io.OutputStream;
16
23
  import java.nio.ByteBuffer;
17
- import java.nio.charset.StandardCharsets;
18
- import org.embulk.config.Config;
19
- import org.embulk.config.ConfigDefault;
20
- import org.embulk.config.ConfigSource;
21
- import com.google.common.base.Throwables;
22
- import com.google.common.base.Optional;
23
- import com.fasterxml.jackson.annotation.JsonProperty;
24
- import com.fasterxml.jackson.annotation.JsonIgnore;
25
- import com.fasterxml.jackson.annotation.JsonCreator;
24
+ import java.util.AbstractList;
25
+ import java.util.ArrayList;
26
+ import java.util.List;
27
+ import java.util.regex.Pattern;
28
+ import java.util.zip.GZIPInputStream;
29
+ import java.util.zip.GZIPOutputStream;
26
30
 
27
- /**
28
- * Created by takahiro.nakayama on 2/20/16.
29
- * Ported from https://github.com/embulk/embulk-input-s3/blob/master/embulk-input-s3/src/main/java/org/embulk/input/s3/FileList.java
30
- * and Modified for this package.
31
- */
32
- public class PartialFileList
31
+ public class TargetFileInfoList
33
32
  {
34
33
  public interface Task
35
34
  {
@@ -50,21 +49,15 @@ public class PartialFileList
50
49
  public static class Entry
51
50
  {
52
51
  private int index;
53
- private long start;
54
- private long end;
55
- private boolean canDecompress;
52
+ private long size;
56
53
 
57
54
  @JsonCreator
58
55
  public Entry(
59
56
  @JsonProperty("index") int index,
60
- @JsonProperty("start") long start,
61
- @JsonProperty("end") long end,
62
- @JsonProperty("can_decompress") boolean canDecompress)
57
+ @JsonProperty("size") long size)
63
58
  {
64
59
  this.index = index;
65
- this.start = start;
66
- this.end = end;
67
- this.canDecompress = canDecompress;
60
+ this.size = size;
68
61
  }
69
62
 
70
63
  @JsonProperty("index")
@@ -73,28 +66,10 @@ public class PartialFileList
73
66
  return index;
74
67
  }
75
68
 
76
- @JsonProperty("start")
77
- public long getStart()
78
- {
79
- return start;
80
- }
81
-
82
- @JsonProperty("end")
83
- public long getEnd()
84
- {
85
- return end;
86
- }
87
-
88
- @JsonProperty("can_decompress")
89
- public boolean getCanDecompress()
90
- {
91
- return canDecompress;
92
- }
93
-
94
- @JsonIgnore
69
+ @JsonProperty("size")
95
70
  public long getSize()
96
71
  {
97
- return getEnd() - getStart();
72
+ return size;
98
73
  }
99
74
  }
100
75
 
@@ -103,11 +78,11 @@ public class PartialFileList
103
78
  private final ByteArrayOutputStream binary;
104
79
  private final OutputStream stream;
105
80
  private final List<Entry> entries = new ArrayList<>();
106
- private String last = null;
81
+ private TargetFileInfo last = null;
107
82
 
108
83
  private int limitCount = Integer.MAX_VALUE;
109
- private long minTaskSize = 1;
110
- private Pattern pathMatchPattern;
84
+ private long minTaskSize = 0L;
85
+ private Pattern pathMatchPattern = Pattern.compile(".*");
111
86
 
112
87
  private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
113
88
 
@@ -167,7 +142,7 @@ public class PartialFileList
167
142
  }
168
143
 
169
144
  // returns true if this file is used
170
- public synchronized boolean add(String path, long start, long end, boolean canDecompress)
145
+ public synchronized boolean add(TargetFileInfo targetFileInfo)
171
146
  {
172
147
  // TODO throw IllegalStateException if stream is already closed
173
148
 
@@ -175,36 +150,36 @@ public class PartialFileList
175
150
  return false;
176
151
  }
177
152
 
178
- if (!pathMatchPattern.matcher(path).find()) {
153
+ if (!pathMatchPattern.matcher(targetFileInfo.getPathString()).find()) {
179
154
  return false;
180
155
  }
181
156
 
182
157
  int index = entries.size();
183
- entries.add(new Entry(index, start, end, canDecompress));
158
+ entries.add(new Entry(index, targetFileInfo.getSize()));
184
159
 
185
- byte[] data = path.getBytes(StandardCharsets.UTF_8);
160
+ byte[] data = SerializationUtils.serialize(targetFileInfo);
186
161
  castBuffer.putInt(0, data.length);
187
162
  try {
188
163
  stream.write(castBuffer.array());
189
164
  stream.write(data);
190
165
  }
191
- catch (IOException e) {
192
- throw Throwables.propagate(e);
166
+ catch (IOException ex) {
167
+ throw Throwables.propagate(ex);
193
168
  }
194
169
 
195
- last = path;
170
+ last = targetFileInfo;
196
171
  return true;
197
172
  }
198
173
 
199
- public PartialFileList build()
174
+ public TargetFileInfoList build()
200
175
  {
201
176
  try {
202
177
  stream.close();
203
178
  }
204
- catch (IOException e) {
205
- throw Throwables.propagate(e);
179
+ catch (IOException ex) {
180
+ throw Throwables.propagate(ex);
206
181
  }
207
- return new PartialFileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
182
+ return new TargetFileInfoList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
208
183
  }
209
184
 
210
185
  private List<List<Entry>> getSplits(List<Entry> all)
@@ -230,13 +205,13 @@ public class PartialFileList
230
205
 
231
206
  private final byte[] data;
232
207
  private final List<List<Entry>> tasks;
233
- private final Optional<String> last;
208
+ private final Optional<TargetFileInfo> last;
234
209
 
235
210
  @JsonCreator
236
- public PartialFileList(
211
+ public TargetFileInfoList(
237
212
  @JsonProperty("data") byte[] data,
238
213
  @JsonProperty("tasks") List<List<Entry>> tasks,
239
- @JsonProperty("last") Optional<String> last)
214
+ @JsonProperty("last") Optional<TargetFileInfo> last)
240
215
  {
241
216
  this.data = data;
242
217
  this.tasks = tasks;
@@ -244,12 +219,18 @@ public class PartialFileList
244
219
  }
245
220
 
246
221
  @JsonIgnore
247
- public Optional<String> getLastPath(Optional<String> lastLastPath)
222
+ public static TargetFileInfoList.Builder builder(Task task)
223
+ {
224
+ return new TargetFileInfoList.Builder(task);
225
+ }
226
+
227
+ @JsonIgnore
228
+ public Optional<TargetFileInfo> getLastTargetFileInfo(Optional<TargetFileInfo> targetFileInfo)
248
229
  {
249
230
  if (last.isPresent()) {
250
231
  return last;
251
232
  }
252
- return lastLastPath;
233
+ return targetFileInfo;
253
234
  }
254
235
 
255
236
  @JsonIgnore
@@ -259,31 +240,34 @@ public class PartialFileList
259
240
  }
260
241
 
261
242
  @JsonIgnore
262
- public List<PartialFile> get(int i)
243
+ public List<TargetFileInfo> get(int i)
263
244
  {
264
245
  return new EntryList(data, tasks.get(i));
265
246
  }
266
247
 
267
248
  @JsonProperty("data")
249
+ @Deprecated
268
250
  public byte[] getData()
269
251
  {
270
252
  return data;
271
253
  }
272
254
 
273
255
  @JsonProperty("tasks")
256
+ @Deprecated
274
257
  public List<List<Entry>> getTasks()
275
258
  {
276
259
  return tasks;
277
260
  }
278
261
 
279
262
  @JsonProperty("last")
280
- public Optional<String> getLast()
263
+ @Deprecated
264
+ public Optional<TargetFileInfo> getLast()
281
265
  {
282
266
  return last;
283
267
  }
284
268
 
285
269
  private class EntryList
286
- extends AbstractList<PartialFile>
270
+ extends AbstractList<TargetFileInfo>
287
271
  {
288
272
  private final byte[] data;
289
273
  private final List<Entry> entries;
@@ -299,34 +283,33 @@ public class PartialFileList
299
283
  try {
300
284
  this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
301
285
  }
302
- catch (IOException e) {
303
- throw Throwables.propagate(e);
286
+ catch (IOException ex) {
287
+ throw Throwables.propagate(ex);
304
288
  }
305
289
  this.current = 0;
306
290
  }
307
291
 
308
292
  @Override
309
- public synchronized PartialFile get(int i)
293
+ public synchronized TargetFileInfo get(int i)
310
294
  {
311
- Entry entry = entries.get(i);
312
- if (entry.getIndex() < current) {
295
+ Entry e = entries.get(i);
296
+ if (e.getIndex() < current) {
313
297
  // rewind to the head
314
298
  try {
315
299
  stream.close();
316
300
  stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
317
301
  }
318
- catch (IOException e) {
319
- throw Throwables.propagate(e);
302
+ catch (IOException ex) {
303
+ throw Throwables.propagate(ex);
320
304
  }
321
305
  current = 0;
322
306
  }
323
307
 
324
- while (current < entry.getIndex()) {
308
+ while (current < e.getIndex()) {
325
309
  readNext();
326
310
  }
327
311
  // now current == e.getIndex()
328
- return new PartialFile(readNextString(),
329
- entry.getStart(), entry.getEnd(), entry.getCanDecompress());
312
+ return readNextString();
330
313
  }
331
314
 
332
315
  @Override
@@ -347,14 +330,14 @@ public class PartialFileList
347
330
 
348
331
  return b;
349
332
  }
350
- catch (IOException e) {
351
- throw Throwables.propagate(e);
333
+ catch (IOException ex) {
334
+ throw Throwables.propagate(ex);
352
335
  }
353
336
  }
354
337
 
355
- private String readNextString()
338
+ private TargetFileInfo readNextString()
356
339
  {
357
- return new String(readNext(), StandardCharsets.UTF_8);
340
+ return (TargetFileInfo) SerializationUtils.deserialize(readNext());
358
341
  }
359
342
  }
360
- }
343
+ }