embulk-input-s3 0.3.0 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (22) hide show
  1. checksums.yaml +4 -4
  2. data/classpath/aws-java-sdk-sts-1.11.466.jar +0 -0
  3. data/classpath/embulk-input-s3-0.3.5.jar +0 -0
  4. data/classpath/embulk-util-aws-credentials-0.3.5.jar +0 -0
  5. data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +78 -117
  6. data/src/main/java/org/embulk/input/s3/DefaultRetryable.java +1 -1
  7. data/src/main/java/org/embulk/input/s3/RetrySupportPluginTask.java +1 -1
  8. data/src/main/java/org/embulk/input/s3/explorer/S3FileExplorer.java +21 -0
  9. data/src/main/java/org/embulk/input/s3/explorer/S3NameOrderPrefixFileExplorer.java +45 -0
  10. data/src/main/java/org/embulk/input/s3/explorer/S3PrefixFileExplorer.java +57 -0
  11. data/src/main/java/org/embulk/input/s3/explorer/S3SingleFileExplorer.java +35 -0
  12. data/src/main/java/org/embulk/input/s3/explorer/S3TimeOrderPrefixFileExplorer.java +70 -0
  13. data/src/main/java/org/embulk/input/s3/utils/DateUtils.java +28 -0
  14. data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +0 -53
  15. data/src/test/java/org/embulk/input/s3/explorer/TestS3NameOrderPrefixFileExplorer.java +67 -0
  16. data/src/test/java/org/embulk/input/s3/explorer/TestS3PrefixFileExplorer.java +128 -0
  17. data/src/test/java/org/embulk/input/s3/explorer/TestS3SingleFileExplorer.java +56 -0
  18. data/src/test/java/org/embulk/input/s3/explorer/TestS3TimeOrderPrefixFileExplorer.java +112 -0
  19. metadata +15 -5
  20. data/classpath/embulk-input-s3-0.3.0.jar +0 -0
  21. data/classpath/embulk-util-aws-credentials-0.3.0.jar +0 -0
  22. data/src/test/java/org/embulk/input/s3/TestAbstractS3FileInputPlugin.java +0 -164
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7aef07b030009a6352551d4b7ebd78be174dbf53
4
- data.tar.gz: ec9ecba65e22bc04e73819d145ccb2a6a4d8115e
3
+ metadata.gz: d77ba197fb47f89fc3a890e240ddd963a5acc9fd
4
+ data.tar.gz: 97647120fdd11ddc13e03916a11429477c707ba4
5
5
  SHA512:
6
- metadata.gz: 867c41e167c9addeeabc893781c912880756b9ef5a16554a16440972acd30c425500b927a17736d2892cff167d64c352e11c87386808f756453290beb0b738a7
7
- data.tar.gz: 07f0fea1f0716c3b74d384eb8174f6193d84e0714dabfd8fa4bf8cdaaba6b25f27f1a26aa5a1746115a89a095ae3799d3000350ffdf890be4255bed29a45d369
6
+ metadata.gz: d81311e7e1e921bc336de2a78c01230616104a3c93ca6ba7699fd934c8536588bb0fd813033fd596774a5995641b753eaf6007d489e3f44daf30edbee95df215
7
+ data.tar.gz: f4d71fe3d8be542bd7563f7b52d2503d8840314f175ab41fecfb2bf27498188a8ec3e0a58828fcc022bb4b650396e1559b87684134d0d232b65dce633c4450dd
Binary file
@@ -7,15 +7,9 @@ import com.amazonaws.auth.AWSCredentialsProvider;
7
7
  import com.amazonaws.retry.PredefinedRetryPolicies;
8
8
  import com.amazonaws.services.s3.AmazonS3;
9
9
  import com.amazonaws.services.s3.AmazonS3ClientBuilder;
10
- import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
11
10
  import com.amazonaws.services.s3.model.GetObjectRequest;
12
- import com.amazonaws.services.s3.model.ListObjectsRequest;
13
- import com.amazonaws.services.s3.model.ObjectListing;
14
- import com.amazonaws.services.s3.model.ObjectMetadata;
15
11
  import com.amazonaws.services.s3.model.S3Object;
16
12
  import com.amazonaws.services.s3.model.S3ObjectInputStream;
17
- import com.amazonaws.services.s3.model.S3ObjectSummary;
18
- import com.amazonaws.services.s3.model.StorageClass;
19
13
  import com.google.common.annotations.VisibleForTesting;
20
14
  import org.embulk.config.Config;
21
15
  import org.embulk.config.ConfigDefault;
@@ -26,6 +20,10 @@ import org.embulk.config.ConfigSource;
26
20
  import org.embulk.config.Task;
27
21
  import org.embulk.config.TaskReport;
28
22
  import org.embulk.config.TaskSource;
23
+ import org.embulk.input.s3.explorer.S3NameOrderPrefixFileExplorer;
24
+ import org.embulk.input.s3.explorer.S3SingleFileExplorer;
25
+ import org.embulk.input.s3.explorer.S3TimeOrderPrefixFileExplorer;
26
+ import org.embulk.input.s3.utils.DateUtils;
29
27
  import org.embulk.spi.BufferAllocator;
30
28
  import org.embulk.spi.Exec;
31
29
  import org.embulk.spi.FileInputPlugin;
@@ -40,6 +38,9 @@ import org.slf4j.Logger;
40
38
 
41
39
  import java.io.IOException;
42
40
  import java.io.InputStream;
41
+ import java.text.SimpleDateFormat;
42
+ import java.util.Collections;
43
+ import java.util.Date;
43
44
  import java.util.Iterator;
44
45
  import java.util.List;
45
46
  import java.util.Optional;
@@ -51,6 +52,7 @@ public abstract class AbstractS3FileInputPlugin
51
52
  implements FileInputPlugin
52
53
  {
53
54
  private static final Logger LOGGER = Exec.getLogger(S3FileInputPlugin.class);
55
+ private static final String FULL_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
54
56
 
55
57
  public interface PluginTask
56
58
  extends AwsCredentialsTask, FileList.Task, RetrySupportPluginTask, Task
@@ -88,12 +90,35 @@ public abstract class AbstractS3FileInputPlugin
88
90
  @ConfigDefault("false")
89
91
  boolean getSkipGlacierObjects();
90
92
 
93
+ @Config("use_modified_time")
94
+ @ConfigDefault("false")
95
+ boolean getUseModifiedTime();
96
+
97
+ @Config("last_modified_time")
98
+ @ConfigDefault("null")
99
+ Optional<String> getLastModifiedTime();
100
+
91
101
  // TODO timeout, ssl, etc
92
102
 
103
+ ////////////////////////////////////////
104
+ // Internal configurations
105
+ ////////////////////////////////////////
106
+
93
107
  FileList getFiles();
94
108
 
95
109
  void setFiles(FileList files);
96
110
 
111
+ /**
112
+ * end_modified_time is conditionally set if modified_time mode is enabled.
113
+ *
114
+ * It is internal state and must not be set in config.yml
115
+ */
116
+ @Config("__end_modified_time")
117
+ @ConfigDefault("null")
118
+ Optional<Date> getEndModifiedTime();
119
+
120
+ void setEndModifiedTime(Optional<Date> endModifiedTime);
121
+
97
122
  @ConfigInject
98
123
  BufferAllocator getBufferAllocator();
99
124
  }
@@ -105,6 +130,7 @@ public abstract class AbstractS3FileInputPlugin
105
130
  {
106
131
  PluginTask task = config.loadConfig(getTaskClass());
107
132
 
133
+ errorIfInternalParamsAreSet(task);
108
134
  validateInputTask(task);
109
135
  // list files recursively
110
136
  task.setFiles(listFiles(task));
@@ -130,9 +156,15 @@ public abstract class AbstractS3FileInputPlugin
130
156
 
131
157
  // last_path
132
158
  if (task.getIncremental()) {
133
- Optional<String> lastPath = task.getFiles().getLastPath(task.getLastPath());
134
- LOGGER.info("Incremental job, setting last_path to [{}]", lastPath.orElse(""));
135
- configDiff.set("last_path", lastPath);
159
+ if (task.getUseModifiedTime()) {
160
+ Date endModifiedTime = task.getEndModifiedTime().orElse(new Date());
161
+ configDiff.set("last_modified_time", new SimpleDateFormat(FULL_DATE_FORMAT).format(endModifiedTime));
162
+ }
163
+ else {
164
+ Optional<String> lastPath = task.getFiles().getLastPath(task.getLastPath());
165
+ LOGGER.info("Incremental job, setting last_path to [{}]", lastPath.orElse(""));
166
+ configDiff.set("last_path", lastPath);
167
+ }
136
168
  }
137
169
  return configDiff;
138
170
  }
@@ -180,11 +212,10 @@ public abstract class AbstractS3FileInputPlugin
180
212
  {
181
213
  ClientConfiguration clientConfig = new ClientConfiguration();
182
214
 
183
- /** PLT-9886: disable built-in retry*/
184
215
  //clientConfig.setProtocol(Protocol.HTTP);
185
- // clientConfig.setMaxConnections(50); // SDK default: 50
216
+ clientConfig.setMaxConnections(50); // SDK default: 50
186
217
  // clientConfig.setMaxErrorRetry(3); // SDK default: 3
187
- // clientConfig.setSocketTimeout(8 * 60 * 1000); // SDK default: 50*1000
218
+ clientConfig.setSocketTimeout(8 * 60 * 1000); // SDK default: 50*1000
188
219
  clientConfig.setRetryPolicy(PredefinedRetryPolicies.NO_RETRY_POLICY);
189
220
  // set http proxy
190
221
  if (task.getHttpProxy().isPresent()) {
@@ -238,22 +269,35 @@ public abstract class AbstractS3FileInputPlugin
238
269
  String bucketName = task.getBucket();
239
270
  FileList.Builder builder = new FileList.Builder(task);
240
271
  RetryExecutor retryExec = retryExecutorFrom(task);
272
+
241
273
  if (task.getPath().isPresent()) {
242
274
  LOGGER.info("Start getting object with path: [{}]", task.getPath().get());
243
- addS3DirectObject(builder, client, task.getBucket(), task.getPath().get(), retryExec);
275
+ new S3SingleFileExplorer(bucketName, client, retryExec, task.getPath().get()).addToBuilder(builder);
276
+ return builder.build();
244
277
  }
245
- else {
246
- // does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
247
- LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
248
- if (task.getPathPrefix().get().equals("/")) {
249
- LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
250
- }
251
278
 
252
- listS3FilesByPrefix(builder, client, bucketName,
253
- task.getPathPrefix().get(), task.getLastPath(), task.getSkipGlacierObjects(), retryExec);
254
- LOGGER.info("Found total [{}] files", builder.size());
279
+ // does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
280
+ LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
281
+ if (task.getPathPrefix().get().equals("/")) {
282
+ LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
255
283
  }
256
284
 
285
+ if (task.getUseModifiedTime()) {
286
+ Date now = new Date();
287
+ Optional<Date> from = task.getLastModifiedTime().isPresent()
288
+ ? Optional.of(DateUtils.parse(task.getLastModifiedTime().get(), Collections.singletonList(FULL_DATE_FORMAT)))
289
+ : Optional.empty();
290
+ task.setEndModifiedTime(Optional.of(now));
291
+
292
+ new S3TimeOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
293
+ task.getSkipGlacierObjects(), from, now).addToBuilder(builder);
294
+ }
295
+ else {
296
+ new S3NameOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
297
+ task.getSkipGlacierObjects(), task.getLastPath().orElse(null)).addToBuilder(builder);
298
+ }
299
+
300
+ LOGGER.info("Found total [{}] files", builder.size());
257
301
  return builder.build();
258
302
  }
259
303
  catch (AmazonServiceException ex) {
@@ -269,107 +313,13 @@ public abstract class AbstractS3FileInputPlugin
269
313
  }
270
314
  }
271
315
 
272
- @VisibleForTesting
273
- public void addS3DirectObject(FileList.Builder builder,
274
- final AmazonS3 client,
275
- String bucket,
276
- String objectKey)
277
- {
278
- addS3DirectObject(builder, client, bucket, objectKey, null);
279
- }
280
-
281
- @VisibleForTesting
282
- public void addS3DirectObject(FileList.Builder builder,
283
- final AmazonS3 client,
284
- String bucket,
285
- String objectKey,
286
- RetryExecutor retryExec)
287
- {
288
- final GetObjectMetadataRequest objectMetadataRequest = new GetObjectMetadataRequest(bucket, objectKey);
289
-
290
- ObjectMetadata objectMetadata = new DefaultRetryable<ObjectMetadata>("Looking up for a single object") {
291
- @Override
292
- public ObjectMetadata call()
293
- {
294
- return client.getObjectMetadata(objectMetadataRequest);
295
- }
296
- }.executeWith(retryExec);
297
-
298
- builder.add(objectKey, objectMetadata.getContentLength());
299
- }
300
-
301
- private void validateInputTask(PluginTask task)
316
+ private void validateInputTask(final PluginTask task)
302
317
  {
303
318
  if (!task.getPathPrefix().isPresent() && !task.getPath().isPresent()) {
304
319
  throw new ConfigException("Either path or path_prefix is required");
305
320
  }
306
321
  }
307
322
 
308
- @VisibleForTesting
309
- public static void listS3FilesByPrefix(FileList.Builder builder,
310
- final AmazonS3 client,
311
- String bucketName,
312
- String prefix,
313
- Optional<String> lastPath,
314
- boolean skipGlacierObjects)
315
- {
316
- listS3FilesByPrefix(builder, client, bucketName, prefix, lastPath, skipGlacierObjects, null);
317
- }
318
-
319
- /**
320
- * Lists S3 filenames filtered by prefix.
321
- * <p>
322
- * The resulting list does not include the file that's size == 0.
323
- * @param builder custom Filelist builder
324
- * @param client Amazon S3
325
- * @param bucketName Amazon S3 bucket name
326
- * @param prefix Amazon S3 bucket name prefix
327
- * @param lastPath last path
328
- * @param skipGlacierObjects skip gracier objects
329
- * @param retryExec a retry executor object to do the retrying
330
- */
331
- @VisibleForTesting
332
- public static void listS3FilesByPrefix(FileList.Builder builder,
333
- final AmazonS3 client,
334
- String bucketName,
335
- String prefix,
336
- Optional<String> lastPath,
337
- boolean skipGlacierObjects,
338
- RetryExecutor retryExec)
339
- {
340
- String lastKey = lastPath.orElse(null);
341
- do {
342
- final String finalLastKey = lastKey;
343
- final ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, finalLastKey, null, 1024);
344
- ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects") {
345
- @Override
346
- public ObjectListing call()
347
- {
348
- return client.listObjects(req);
349
- }
350
- }.executeWith(retryExec);
351
- for (S3ObjectSummary s : ol.getObjectSummaries()) {
352
- if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
353
- if (skipGlacierObjects) {
354
- Exec.getLogger("AbstractS3FileInputPlugin.class").warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
355
- continue;
356
- }
357
- else {
358
- throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
359
- }
360
- }
361
- if (s.getSize() > 0) {
362
- builder.add(s.getKey(), s.getSize());
363
- if (!builder.needsMore()) {
364
- LOGGER.warn("Too many files matched, stop listing file");
365
- return;
366
- }
367
- }
368
- }
369
- lastKey = ol.getNextMarker();
370
- } while (lastKey != null);
371
- }
372
-
373
323
  @Override
374
324
  public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
375
325
  {
@@ -441,6 +391,14 @@ public abstract class AbstractS3FileInputPlugin
441
391
  }
442
392
  }
443
393
 
394
+ @VisibleForTesting
395
+ static void errorIfInternalParamsAreSet(PluginTask task)
396
+ {
397
+ if (task.getEndModifiedTime().isPresent()) {
398
+ throw new ConfigException("'__end_modified_time' must not be set.");
399
+ }
400
+ }
401
+
444
402
  // TODO create single-file InputStreamFileInput utility
445
403
  private class SingleFileProvider
446
404
  implements InputStreamFileInput.Provider
@@ -476,6 +434,9 @@ public abstract class AbstractS3FileInputPlugin
476
434
  }.executeWithCheckedException(retryExec, IOException.class);
477
435
 
478
436
  long objectSize = object.getObjectMetadata().getContentLength();
437
+ // Some plugin users are parsing this output to get file list.
438
+ // Keep it for now but might be removed in the future.
439
+ LOGGER.info("Open S3Object with bucket [{}], key [{}], with size [{}]", bucket, key, objectSize);
479
440
  InputStream inputStream = new ResumableInputStream(object.getObjectContent(), new S3InputStreamReopener(client, request, objectSize, retryExec));
480
441
  return new InputStreamWithHints(inputStream, String.format("s3://%s/%s", bucket, key));
481
442
  }
@@ -19,7 +19,7 @@ import static org.embulk.spi.util.RetryExecutor.Retryable;
19
19
  * Retryable utility, regardless the occurred exceptions,
20
20
  * Also provide a default approach for exception propagation.
21
21
  */
22
- class DefaultRetryable<T> implements Retryable<T>
22
+ public class DefaultRetryable<T> implements Retryable<T>
23
23
  {
24
24
  private static final Logger log = Exec.getLogger(DefaultRetryable.class);
25
25
  private static final Set<Integer> NONRETRYABLE_STATUS_CODES = new HashSet<Integer>(2);
@@ -11,7 +11,7 @@ public interface RetrySupportPluginTask extends Task
11
11
  int getMaximumRetries();
12
12
 
13
13
  @Config("initial_retry_interval_millis")
14
- @ConfigDefault("30000")
14
+ @ConfigDefault("2000")
15
15
  int getInitialRetryIntervalMillis();
16
16
 
17
17
  @Config("maximum_retry_interval_millis")
@@ -0,0 +1,21 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import org.embulk.input.s3.FileList;
5
+ import org.embulk.spi.util.RetryExecutor;
6
+
7
+ public abstract class S3FileExplorer
8
+ {
9
+ protected String bucketName;
10
+ protected AmazonS3 s3Client;
11
+ protected RetryExecutor retryExecutor;
12
+
13
+ public S3FileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor)
14
+ {
15
+ this.bucketName = bucketName;
16
+ this.s3Client = s3Client;
17
+ this.retryExecutor = retryExecutor;
18
+ }
19
+
20
+ public abstract void addToBuilder(FileList.Builder builder);
21
+ }
@@ -0,0 +1,45 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
5
+ import com.amazonaws.services.s3.model.ObjectListing;
6
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
7
+ import org.embulk.input.s3.DefaultRetryable;
8
+ import org.embulk.spi.util.RetryExecutor;
9
+
10
+ import java.util.List;
11
+
12
+ public class S3NameOrderPrefixFileExplorer extends S3PrefixFileExplorer
13
+ {
14
+ private String lastPath;
15
+
16
+ public S3NameOrderPrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor,
17
+ final String pathPrefix, final boolean skipGlacierObjects, final String lastPath)
18
+ {
19
+ super(bucketName, s3Client, retryExecutor, pathPrefix, skipGlacierObjects);
20
+ this.lastPath = lastPath;
21
+ }
22
+
23
+ @Override
24
+ protected List<S3ObjectSummary> fetch()
25
+ {
26
+ final ListObjectsRequest req = new ListObjectsRequest(bucketName, pathPrefix, lastPath, null, 1024);
27
+ final ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects")
28
+ {
29
+ @Override
30
+ public ObjectListing call()
31
+ {
32
+ return s3Client.listObjects(req);
33
+ }
34
+ }.executeWith(retryExecutor);
35
+ lastPath = ol.getNextMarker();
36
+
37
+ return ol.getObjectSummaries();
38
+ }
39
+
40
+ @Override
41
+ protected boolean hasNext()
42
+ {
43
+ return lastPath != null;
44
+ }
45
+ }
@@ -0,0 +1,57 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
5
+ import com.amazonaws.services.s3.model.StorageClass;
6
+ import org.embulk.config.ConfigException;
7
+ import org.embulk.input.s3.FileList;
8
+ import org.embulk.spi.Exec;
9
+ import org.embulk.spi.util.RetryExecutor;
10
+ import org.slf4j.Logger;
11
+
12
+ import java.util.List;
13
+
14
+ public abstract class S3PrefixFileExplorer extends S3FileExplorer
15
+ {
16
+ private static final Logger LOGGER = Exec.getLogger(S3PrefixFileExplorer.class);
17
+
18
+ protected String pathPrefix;
19
+
20
+ private final boolean skipGlacierObjects;
21
+
22
+ public S3PrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor, final String pathPrefix, final boolean skipGlacierObjects)
23
+ {
24
+ super(bucketName, s3Client, retryExecutor);
25
+ this.pathPrefix = pathPrefix;
26
+ this.skipGlacierObjects = skipGlacierObjects;
27
+ }
28
+
29
+ @Override
30
+ public void addToBuilder(final FileList.Builder builder)
31
+ {
32
+ do {
33
+ final List<S3ObjectSummary> s3ObjectSummaries = fetch();
34
+
35
+ for (final S3ObjectSummary s : s3ObjectSummaries) {
36
+ if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
37
+ if (skipGlacierObjects) {
38
+ LOGGER.warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
39
+ continue;
40
+ }
41
+ throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
42
+ }
43
+ if (s.getSize() > 0) {
44
+ builder.add(s.getKey(), s.getSize());
45
+ if (!builder.needsMore()) {
46
+ LOGGER.warn("Too many files matched, stop listing file");
47
+ return;
48
+ }
49
+ }
50
+ }
51
+ } while (hasNext());
52
+ }
53
+
54
+ protected abstract List<S3ObjectSummary> fetch();
55
+
56
+ protected abstract boolean hasNext();
57
+ }