embulk-input-s3 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. checksums.yaml +4 -4
  2. data/classpath/aws-java-sdk-sts-1.11.466.jar +0 -0
  3. data/classpath/embulk-input-s3-0.3.5.jar +0 -0
  4. data/classpath/embulk-util-aws-credentials-0.3.5.jar +0 -0
  5. data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +78 -117
  6. data/src/main/java/org/embulk/input/s3/DefaultRetryable.java +1 -1
  7. data/src/main/java/org/embulk/input/s3/RetrySupportPluginTask.java +1 -1
  8. data/src/main/java/org/embulk/input/s3/explorer/S3FileExplorer.java +21 -0
  9. data/src/main/java/org/embulk/input/s3/explorer/S3NameOrderPrefixFileExplorer.java +45 -0
  10. data/src/main/java/org/embulk/input/s3/explorer/S3PrefixFileExplorer.java +57 -0
  11. data/src/main/java/org/embulk/input/s3/explorer/S3SingleFileExplorer.java +35 -0
  12. data/src/main/java/org/embulk/input/s3/explorer/S3TimeOrderPrefixFileExplorer.java +70 -0
  13. data/src/main/java/org/embulk/input/s3/utils/DateUtils.java +28 -0
  14. data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +0 -53
  15. data/src/test/java/org/embulk/input/s3/explorer/TestS3NameOrderPrefixFileExplorer.java +67 -0
  16. data/src/test/java/org/embulk/input/s3/explorer/TestS3PrefixFileExplorer.java +128 -0
  17. data/src/test/java/org/embulk/input/s3/explorer/TestS3SingleFileExplorer.java +56 -0
  18. data/src/test/java/org/embulk/input/s3/explorer/TestS3TimeOrderPrefixFileExplorer.java +112 -0
  19. metadata +15 -5
  20. data/classpath/embulk-input-s3-0.3.0.jar +0 -0
  21. data/classpath/embulk-util-aws-credentials-0.3.0.jar +0 -0
  22. data/src/test/java/org/embulk/input/s3/TestAbstractS3FileInputPlugin.java +0 -164
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 7aef07b030009a6352551d4b7ebd78be174dbf53
4
- data.tar.gz: ec9ecba65e22bc04e73819d145ccb2a6a4d8115e
3
+ metadata.gz: d77ba197fb47f89fc3a890e240ddd963a5acc9fd
4
+ data.tar.gz: 97647120fdd11ddc13e03916a11429477c707ba4
5
5
  SHA512:
6
- metadata.gz: 867c41e167c9addeeabc893781c912880756b9ef5a16554a16440972acd30c425500b927a17736d2892cff167d64c352e11c87386808f756453290beb0b738a7
7
- data.tar.gz: 07f0fea1f0716c3b74d384eb8174f6193d84e0714dabfd8fa4bf8cdaaba6b25f27f1a26aa5a1746115a89a095ae3799d3000350ffdf890be4255bed29a45d369
6
+ metadata.gz: d81311e7e1e921bc336de2a78c01230616104a3c93ca6ba7699fd934c8536588bb0fd813033fd596774a5995641b753eaf6007d489e3f44daf30edbee95df215
7
+ data.tar.gz: f4d71fe3d8be542bd7563f7b52d2503d8840314f175ab41fecfb2bf27498188a8ec3e0a58828fcc022bb4b650396e1559b87684134d0d232b65dce633c4450dd
Binary file
data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java CHANGED
@@ -7,15 +7,9 @@ import com.amazonaws.auth.AWSCredentialsProvider;
7
7
  import com.amazonaws.retry.PredefinedRetryPolicies;
8
8
  import com.amazonaws.services.s3.AmazonS3;
9
9
  import com.amazonaws.services.s3.AmazonS3ClientBuilder;
10
- import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
11
10
  import com.amazonaws.services.s3.model.GetObjectRequest;
12
- import com.amazonaws.services.s3.model.ListObjectsRequest;
13
- import com.amazonaws.services.s3.model.ObjectListing;
14
- import com.amazonaws.services.s3.model.ObjectMetadata;
15
11
  import com.amazonaws.services.s3.model.S3Object;
16
12
  import com.amazonaws.services.s3.model.S3ObjectInputStream;
17
- import com.amazonaws.services.s3.model.S3ObjectSummary;
18
- import com.amazonaws.services.s3.model.StorageClass;
19
13
  import com.google.common.annotations.VisibleForTesting;
20
14
  import org.embulk.config.Config;
21
15
  import org.embulk.config.ConfigDefault;
@@ -26,6 +20,10 @@ import org.embulk.config.ConfigSource;
26
20
  import org.embulk.config.Task;
27
21
  import org.embulk.config.TaskReport;
28
22
  import org.embulk.config.TaskSource;
23
+ import org.embulk.input.s3.explorer.S3NameOrderPrefixFileExplorer;
24
+ import org.embulk.input.s3.explorer.S3SingleFileExplorer;
25
+ import org.embulk.input.s3.explorer.S3TimeOrderPrefixFileExplorer;
26
+ import org.embulk.input.s3.utils.DateUtils;
29
27
  import org.embulk.spi.BufferAllocator;
30
28
  import org.embulk.spi.Exec;
31
29
  import org.embulk.spi.FileInputPlugin;
@@ -40,6 +38,9 @@ import org.slf4j.Logger;
40
38
 
41
39
  import java.io.IOException;
42
40
  import java.io.InputStream;
41
+ import java.text.SimpleDateFormat;
42
+ import java.util.Collections;
43
+ import java.util.Date;
43
44
  import java.util.Iterator;
44
45
  import java.util.List;
45
46
  import java.util.Optional;
@@ -51,6 +52,7 @@ public abstract class AbstractS3FileInputPlugin
51
52
  implements FileInputPlugin
52
53
  {
53
54
  private static final Logger LOGGER = Exec.getLogger(S3FileInputPlugin.class);
55
+ private static final String FULL_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
54
56
 
55
57
  public interface PluginTask
56
58
  extends AwsCredentialsTask, FileList.Task, RetrySupportPluginTask, Task
@@ -88,12 +90,35 @@ public abstract class AbstractS3FileInputPlugin
88
90
  @ConfigDefault("false")
89
91
  boolean getSkipGlacierObjects();
90
92
 
93
+ @Config("use_modified_time")
94
+ @ConfigDefault("false")
95
+ boolean getUseModifiedTime();
96
+
97
+ @Config("last_modified_time")
98
+ @ConfigDefault("null")
99
+ Optional<String> getLastModifiedTime();
100
+
91
101
  // TODO timeout, ssl, etc
92
102
 
103
+ ////////////////////////////////////////
104
+ // Internal configurations
105
+ ////////////////////////////////////////
106
+
93
107
  FileList getFiles();
94
108
 
95
109
  void setFiles(FileList files);
96
110
 
111
+ /**
112
+ * end_modified_time is conditionally set if modified_time mode is enabled.
113
+ *
114
+ * It is internal state and must not be set in config.yml
115
+ */
116
+ @Config("__end_modified_time")
117
+ @ConfigDefault("null")
118
+ Optional<Date> getEndModifiedTime();
119
+
120
+ void setEndModifiedTime(Optional<Date> endModifiedTime);
121
+
97
122
  @ConfigInject
98
123
  BufferAllocator getBufferAllocator();
99
124
  }
@@ -105,6 +130,7 @@ public abstract class AbstractS3FileInputPlugin
105
130
  {
106
131
  PluginTask task = config.loadConfig(getTaskClass());
107
132
 
133
+ errorIfInternalParamsAreSet(task);
108
134
  validateInputTask(task);
109
135
  // list files recursively
110
136
  task.setFiles(listFiles(task));
@@ -130,9 +156,15 @@ public abstract class AbstractS3FileInputPlugin
130
156
 
131
157
  // last_path
132
158
  if (task.getIncremental()) {
133
- Optional<String> lastPath = task.getFiles().getLastPath(task.getLastPath());
134
- LOGGER.info("Incremental job, setting last_path to [{}]", lastPath.orElse(""));
135
- configDiff.set("last_path", lastPath);
159
+ if (task.getUseModifiedTime()) {
160
+ Date endModifiedTime = task.getEndModifiedTime().orElse(new Date());
161
+ configDiff.set("last_modified_time", new SimpleDateFormat(FULL_DATE_FORMAT).format(endModifiedTime));
162
+ }
163
+ else {
164
+ Optional<String> lastPath = task.getFiles().getLastPath(task.getLastPath());
165
+ LOGGER.info("Incremental job, setting last_path to [{}]", lastPath.orElse(""));
166
+ configDiff.set("last_path", lastPath);
167
+ }
136
168
  }
137
169
  return configDiff;
138
170
  }
@@ -180,11 +212,10 @@ public abstract class AbstractS3FileInputPlugin
180
212
  {
181
213
  ClientConfiguration clientConfig = new ClientConfiguration();
182
214
 
183
- /** PLT-9886: disable built-in retry*/
184
215
  //clientConfig.setProtocol(Protocol.HTTP);
185
- // clientConfig.setMaxConnections(50); // SDK default: 50
216
+ clientConfig.setMaxConnections(50); // SDK default: 50
186
217
  // clientConfig.setMaxErrorRetry(3); // SDK default: 3
187
- // clientConfig.setSocketTimeout(8 * 60 * 1000); // SDK default: 50*1000
218
+ clientConfig.setSocketTimeout(8 * 60 * 1000); // SDK default: 50*1000
188
219
  clientConfig.setRetryPolicy(PredefinedRetryPolicies.NO_RETRY_POLICY);
189
220
  // set http proxy
190
221
  if (task.getHttpProxy().isPresent()) {
@@ -238,22 +269,35 @@ public abstract class AbstractS3FileInputPlugin
238
269
  String bucketName = task.getBucket();
239
270
  FileList.Builder builder = new FileList.Builder(task);
240
271
  RetryExecutor retryExec = retryExecutorFrom(task);
272
+
241
273
  if (task.getPath().isPresent()) {
242
274
  LOGGER.info("Start getting object with path: [{}]", task.getPath().get());
243
- addS3DirectObject(builder, client, task.getBucket(), task.getPath().get(), retryExec);
275
+ new S3SingleFileExplorer(bucketName, client, retryExec, task.getPath().get()).addToBuilder(builder);
276
+ return builder.build();
244
277
  }
245
- else {
246
- // does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
247
- LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
248
- if (task.getPathPrefix().get().equals("/")) {
249
- LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
250
- }
251
278
 
252
- listS3FilesByPrefix(builder, client, bucketName,
253
- task.getPathPrefix().get(), task.getLastPath(), task.getSkipGlacierObjects(), retryExec);
254
- LOGGER.info("Found total [{}] files", builder.size());
279
+ // does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
280
+ LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
281
+ if (task.getPathPrefix().get().equals("/")) {
282
+ LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
255
283
  }
256
284
 
285
+ if (task.getUseModifiedTime()) {
286
+ Date now = new Date();
287
+ Optional<Date> from = task.getLastModifiedTime().isPresent()
288
+ ? Optional.of(DateUtils.parse(task.getLastModifiedTime().get(), Collections.singletonList(FULL_DATE_FORMAT)))
289
+ : Optional.empty();
290
+ task.setEndModifiedTime(Optional.of(now));
291
+
292
+ new S3TimeOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
293
+ task.getSkipGlacierObjects(), from, now).addToBuilder(builder);
294
+ }
295
+ else {
296
+ new S3NameOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
297
+ task.getSkipGlacierObjects(), task.getLastPath().orElse(null)).addToBuilder(builder);
298
+ }
299
+
300
+ LOGGER.info("Found total [{}] files", builder.size());
257
301
  return builder.build();
258
302
  }
259
303
  catch (AmazonServiceException ex) {
@@ -269,107 +313,13 @@ public abstract class AbstractS3FileInputPlugin
269
313
  }
270
314
  }
271
315
 
272
- @VisibleForTesting
273
- public void addS3DirectObject(FileList.Builder builder,
274
- final AmazonS3 client,
275
- String bucket,
276
- String objectKey)
277
- {
278
- addS3DirectObject(builder, client, bucket, objectKey, null);
279
- }
280
-
281
- @VisibleForTesting
282
- public void addS3DirectObject(FileList.Builder builder,
283
- final AmazonS3 client,
284
- String bucket,
285
- String objectKey,
286
- RetryExecutor retryExec)
287
- {
288
- final GetObjectMetadataRequest objectMetadataRequest = new GetObjectMetadataRequest(bucket, objectKey);
289
-
290
- ObjectMetadata objectMetadata = new DefaultRetryable<ObjectMetadata>("Looking up for a single object") {
291
- @Override
292
- public ObjectMetadata call()
293
- {
294
- return client.getObjectMetadata(objectMetadataRequest);
295
- }
296
- }.executeWith(retryExec);
297
-
298
- builder.add(objectKey, objectMetadata.getContentLength());
299
- }
300
-
301
- private void validateInputTask(PluginTask task)
316
+ private void validateInputTask(final PluginTask task)
302
317
  {
303
318
  if (!task.getPathPrefix().isPresent() && !task.getPath().isPresent()) {
304
319
  throw new ConfigException("Either path or path_prefix is required");
305
320
  }
306
321
  }
307
322
 
308
- @VisibleForTesting
309
- public static void listS3FilesByPrefix(FileList.Builder builder,
310
- final AmazonS3 client,
311
- String bucketName,
312
- String prefix,
313
- Optional<String> lastPath,
314
- boolean skipGlacierObjects)
315
- {
316
- listS3FilesByPrefix(builder, client, bucketName, prefix, lastPath, skipGlacierObjects, null);
317
- }
318
-
319
- /**
320
- * Lists S3 filenames filtered by prefix.
321
- * <p>
322
- * The resulting list does not include the file that's size == 0.
323
- * @param builder custom Filelist builder
324
- * @param client Amazon S3
325
- * @param bucketName Amazon S3 bucket name
326
- * @param prefix Amazon S3 bucket name prefix
327
- * @param lastPath last path
328
- * @param skipGlacierObjects skip gracier objects
329
- * @param retryExec a retry executor object to do the retrying
330
- */
331
- @VisibleForTesting
332
- public static void listS3FilesByPrefix(FileList.Builder builder,
333
- final AmazonS3 client,
334
- String bucketName,
335
- String prefix,
336
- Optional<String> lastPath,
337
- boolean skipGlacierObjects,
338
- RetryExecutor retryExec)
339
- {
340
- String lastKey = lastPath.orElse(null);
341
- do {
342
- final String finalLastKey = lastKey;
343
- final ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, finalLastKey, null, 1024);
344
- ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects") {
345
- @Override
346
- public ObjectListing call()
347
- {
348
- return client.listObjects(req);
349
- }
350
- }.executeWith(retryExec);
351
- for (S3ObjectSummary s : ol.getObjectSummaries()) {
352
- if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
353
- if (skipGlacierObjects) {
354
- Exec.getLogger("AbstractS3FileInputPlugin.class").warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
355
- continue;
356
- }
357
- else {
358
- throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
359
- }
360
- }
361
- if (s.getSize() > 0) {
362
- builder.add(s.getKey(), s.getSize());
363
- if (!builder.needsMore()) {
364
- LOGGER.warn("Too many files matched, stop listing file");
365
- return;
366
- }
367
- }
368
- }
369
- lastKey = ol.getNextMarker();
370
- } while (lastKey != null);
371
- }
372
-
373
323
  @Override
374
324
  public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
375
325
  {
@@ -441,6 +391,14 @@ public abstract class AbstractS3FileInputPlugin
441
391
  }
442
392
  }
443
393
 
394
+ @VisibleForTesting
395
+ static void errorIfInternalParamsAreSet(PluginTask task)
396
+ {
397
+ if (task.getEndModifiedTime().isPresent()) {
398
+ throw new ConfigException("'__end_modified_time' must not be set.");
399
+ }
400
+ }
401
+
444
402
  // TODO create single-file InputStreamFileInput utility
445
403
  private class SingleFileProvider
446
404
  implements InputStreamFileInput.Provider
@@ -476,6 +434,9 @@ public abstract class AbstractS3FileInputPlugin
476
434
  }.executeWithCheckedException(retryExec, IOException.class);
477
435
 
478
436
  long objectSize = object.getObjectMetadata().getContentLength();
437
+ // Some plugin users are parsing this output to get file list.
438
+ // Keep it for now but might be removed in the future.
439
+ LOGGER.info("Open S3Object with bucket [{}], key [{}], with size [{}]", bucket, key, objectSize);
479
440
  InputStream inputStream = new ResumableInputStream(object.getObjectContent(), new S3InputStreamReopener(client, request, objectSize, retryExec));
480
441
  return new InputStreamWithHints(inputStream, String.format("s3://%s/%s", bucket, key));
481
442
  }
data/src/main/java/org/embulk/input/s3/DefaultRetryable.java CHANGED
@@ -19,7 +19,7 @@ import static org.embulk.spi.util.RetryExecutor.Retryable;
19
19
  * Retryable utility, regardless the occurred exceptions,
20
20
  * Also provide a default approach for exception propagation.
21
21
  */
22
- class DefaultRetryable<T> implements Retryable<T>
22
+ public class DefaultRetryable<T> implements Retryable<T>
23
23
  {
24
24
  private static final Logger log = Exec.getLogger(DefaultRetryable.class);
25
25
  private static final Set<Integer> NONRETRYABLE_STATUS_CODES = new HashSet<Integer>(2);
data/src/main/java/org/embulk/input/s3/RetrySupportPluginTask.java CHANGED
@@ -11,7 +11,7 @@ public interface RetrySupportPluginTask extends Task
11
11
  int getMaximumRetries();
12
12
 
13
13
  @Config("initial_retry_interval_millis")
14
- @ConfigDefault("30000")
14
+ @ConfigDefault("2000")
15
15
  int getInitialRetryIntervalMillis();
16
16
 
17
17
  @Config("maximum_retry_interval_millis")
data/src/main/java/org/embulk/input/s3/explorer/S3FileExplorer.java ADDED
@@ -0,0 +1,21 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import org.embulk.input.s3.FileList;
5
+ import org.embulk.spi.util.RetryExecutor;
6
+
7
+ public abstract class S3FileExplorer
8
+ {
9
+ protected String bucketName;
10
+ protected AmazonS3 s3Client;
11
+ protected RetryExecutor retryExecutor;
12
+
13
+ public S3FileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor)
14
+ {
15
+ this.bucketName = bucketName;
16
+ this.s3Client = s3Client;
17
+ this.retryExecutor = retryExecutor;
18
+ }
19
+
20
+ public abstract void addToBuilder(FileList.Builder builder);
21
+ }
data/src/main/java/org/embulk/input/s3/explorer/S3NameOrderPrefixFileExplorer.java ADDED
@@ -0,0 +1,45 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
5
+ import com.amazonaws.services.s3.model.ObjectListing;
6
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
7
+ import org.embulk.input.s3.DefaultRetryable;
8
+ import org.embulk.spi.util.RetryExecutor;
9
+
10
+ import java.util.List;
11
+
12
+ public class S3NameOrderPrefixFileExplorer extends S3PrefixFileExplorer
13
+ {
14
+ private String lastPath;
15
+
16
+ public S3NameOrderPrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor,
17
+ final String pathPrefix, final boolean skipGlacierObjects, final String lastPath)
18
+ {
19
+ super(bucketName, s3Client, retryExecutor, pathPrefix, skipGlacierObjects);
20
+ this.lastPath = lastPath;
21
+ }
22
+
23
+ @Override
24
+ protected List<S3ObjectSummary> fetch()
25
+ {
26
+ final ListObjectsRequest req = new ListObjectsRequest(bucketName, pathPrefix, lastPath, null, 1024);
27
+ final ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects")
28
+ {
29
+ @Override
30
+ public ObjectListing call()
31
+ {
32
+ return s3Client.listObjects(req);
33
+ }
34
+ }.executeWith(retryExecutor);
35
+ lastPath = ol.getNextMarker();
36
+
37
+ return ol.getObjectSummaries();
38
+ }
39
+
40
+ @Override
41
+ protected boolean hasNext()
42
+ {
43
+ return lastPath != null;
44
+ }
45
+ }
data/src/main/java/org/embulk/input/s3/explorer/S3PrefixFileExplorer.java ADDED
@@ -0,0 +1,57 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
5
+ import com.amazonaws.services.s3.model.StorageClass;
6
+ import org.embulk.config.ConfigException;
7
+ import org.embulk.input.s3.FileList;
8
+ import org.embulk.spi.Exec;
9
+ import org.embulk.spi.util.RetryExecutor;
10
+ import org.slf4j.Logger;
11
+
12
+ import java.util.List;
13
+
14
+ public abstract class S3PrefixFileExplorer extends S3FileExplorer
15
+ {
16
+ private static final Logger LOGGER = Exec.getLogger(S3PrefixFileExplorer.class);
17
+
18
+ protected String pathPrefix;
19
+
20
+ private final boolean skipGlacierObjects;
21
+
22
+ public S3PrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor, final String pathPrefix, final boolean skipGlacierObjects)
23
+ {
24
+ super(bucketName, s3Client, retryExecutor);
25
+ this.pathPrefix = pathPrefix;
26
+ this.skipGlacierObjects = skipGlacierObjects;
27
+ }
28
+
29
+ @Override
30
+ public void addToBuilder(final FileList.Builder builder)
31
+ {
32
+ do {
33
+ final List<S3ObjectSummary> s3ObjectSummaries = fetch();
34
+
35
+ for (final S3ObjectSummary s : s3ObjectSummaries) {
36
+ if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
37
+ if (skipGlacierObjects) {
38
+ LOGGER.warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
39
+ continue;
40
+ }
41
+ throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
42
+ }
43
+ if (s.getSize() > 0) {
44
+ builder.add(s.getKey(), s.getSize());
45
+ if (!builder.needsMore()) {
46
+ LOGGER.warn("Too many files matched, stop listing file");
47
+ return;
48
+ }
49
+ }
50
+ }
51
+ } while (hasNext());
52
+ }
53
+
54
+ protected abstract List<S3ObjectSummary> fetch();
55
+
56
+ protected abstract boolean hasNext();
57
+ }