embulk-input-s3 0.3.0 → 0.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/classpath/aws-java-sdk-sts-1.11.466.jar +0 -0
- data/classpath/embulk-input-s3-0.3.5.jar +0 -0
- data/classpath/embulk-util-aws-credentials-0.3.5.jar +0 -0
- data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +78 -117
- data/src/main/java/org/embulk/input/s3/DefaultRetryable.java +1 -1
- data/src/main/java/org/embulk/input/s3/RetrySupportPluginTask.java +1 -1
- data/src/main/java/org/embulk/input/s3/explorer/S3FileExplorer.java +21 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3NameOrderPrefixFileExplorer.java +45 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3PrefixFileExplorer.java +57 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3SingleFileExplorer.java +35 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3TimeOrderPrefixFileExplorer.java +70 -0
- data/src/main/java/org/embulk/input/s3/utils/DateUtils.java +28 -0
- data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +0 -53
- data/src/test/java/org/embulk/input/s3/explorer/TestS3NameOrderPrefixFileExplorer.java +67 -0
- data/src/test/java/org/embulk/input/s3/explorer/TestS3PrefixFileExplorer.java +128 -0
- data/src/test/java/org/embulk/input/s3/explorer/TestS3SingleFileExplorer.java +56 -0
- data/src/test/java/org/embulk/input/s3/explorer/TestS3TimeOrderPrefixFileExplorer.java +112 -0
- metadata +15 -5
- data/classpath/embulk-input-s3-0.3.0.jar +0 -0
- data/classpath/embulk-util-aws-credentials-0.3.0.jar +0 -0
- data/src/test/java/org/embulk/input/s3/TestAbstractS3FileInputPlugin.java +0 -164
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d77ba197fb47f89fc3a890e240ddd963a5acc9fd
|
4
|
+
data.tar.gz: 97647120fdd11ddc13e03916a11429477c707ba4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d81311e7e1e921bc336de2a78c01230616104a3c93ca6ba7699fd934c8536588bb0fd813033fd596774a5995641b753eaf6007d489e3f44daf30edbee95df215
|
7
|
+
data.tar.gz: f4d71fe3d8be542bd7563f7b52d2503d8840314f175ab41fecfb2bf27498188a8ec3e0a58828fcc022bb4b650396e1559b87684134d0d232b65dce633c4450dd
|
Binary file
|
Binary file
|
Binary file
|
@@ -7,15 +7,9 @@ import com.amazonaws.auth.AWSCredentialsProvider;
|
|
7
7
|
import com.amazonaws.retry.PredefinedRetryPolicies;
|
8
8
|
import com.amazonaws.services.s3.AmazonS3;
|
9
9
|
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
|
10
|
-
import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
|
11
10
|
import com.amazonaws.services.s3.model.GetObjectRequest;
|
12
|
-
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
13
|
-
import com.amazonaws.services.s3.model.ObjectListing;
|
14
|
-
import com.amazonaws.services.s3.model.ObjectMetadata;
|
15
11
|
import com.amazonaws.services.s3.model.S3Object;
|
16
12
|
import com.amazonaws.services.s3.model.S3ObjectInputStream;
|
17
|
-
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
18
|
-
import com.amazonaws.services.s3.model.StorageClass;
|
19
13
|
import com.google.common.annotations.VisibleForTesting;
|
20
14
|
import org.embulk.config.Config;
|
21
15
|
import org.embulk.config.ConfigDefault;
|
@@ -26,6 +20,10 @@ import org.embulk.config.ConfigSource;
|
|
26
20
|
import org.embulk.config.Task;
|
27
21
|
import org.embulk.config.TaskReport;
|
28
22
|
import org.embulk.config.TaskSource;
|
23
|
+
import org.embulk.input.s3.explorer.S3NameOrderPrefixFileExplorer;
|
24
|
+
import org.embulk.input.s3.explorer.S3SingleFileExplorer;
|
25
|
+
import org.embulk.input.s3.explorer.S3TimeOrderPrefixFileExplorer;
|
26
|
+
import org.embulk.input.s3.utils.DateUtils;
|
29
27
|
import org.embulk.spi.BufferAllocator;
|
30
28
|
import org.embulk.spi.Exec;
|
31
29
|
import org.embulk.spi.FileInputPlugin;
|
@@ -40,6 +38,9 @@ import org.slf4j.Logger;
|
|
40
38
|
|
41
39
|
import java.io.IOException;
|
42
40
|
import java.io.InputStream;
|
41
|
+
import java.text.SimpleDateFormat;
|
42
|
+
import java.util.Collections;
|
43
|
+
import java.util.Date;
|
43
44
|
import java.util.Iterator;
|
44
45
|
import java.util.List;
|
45
46
|
import java.util.Optional;
|
@@ -51,6 +52,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
51
52
|
implements FileInputPlugin
|
52
53
|
{
|
53
54
|
private static final Logger LOGGER = Exec.getLogger(S3FileInputPlugin.class);
|
55
|
+
private static final String FULL_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
|
54
56
|
|
55
57
|
public interface PluginTask
|
56
58
|
extends AwsCredentialsTask, FileList.Task, RetrySupportPluginTask, Task
|
@@ -88,12 +90,35 @@ public abstract class AbstractS3FileInputPlugin
|
|
88
90
|
@ConfigDefault("false")
|
89
91
|
boolean getSkipGlacierObjects();
|
90
92
|
|
93
|
+
@Config("use_modified_time")
|
94
|
+
@ConfigDefault("false")
|
95
|
+
boolean getUseModifiedTime();
|
96
|
+
|
97
|
+
@Config("last_modified_time")
|
98
|
+
@ConfigDefault("null")
|
99
|
+
Optional<String> getLastModifiedTime();
|
100
|
+
|
91
101
|
// TODO timeout, ssl, etc
|
92
102
|
|
103
|
+
////////////////////////////////////////
|
104
|
+
// Internal configurations
|
105
|
+
////////////////////////////////////////
|
106
|
+
|
93
107
|
FileList getFiles();
|
94
108
|
|
95
109
|
void setFiles(FileList files);
|
96
110
|
|
111
|
+
/**
|
112
|
+
* end_modified_time is conditionally set if modified_time mode is enabled.
|
113
|
+
*
|
114
|
+
* It is internal state and must not be set in config.yml
|
115
|
+
*/
|
116
|
+
@Config("__end_modified_time")
|
117
|
+
@ConfigDefault("null")
|
118
|
+
Optional<Date> getEndModifiedTime();
|
119
|
+
|
120
|
+
void setEndModifiedTime(Optional<Date> endModifiedTime);
|
121
|
+
|
97
122
|
@ConfigInject
|
98
123
|
BufferAllocator getBufferAllocator();
|
99
124
|
}
|
@@ -105,6 +130,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
105
130
|
{
|
106
131
|
PluginTask task = config.loadConfig(getTaskClass());
|
107
132
|
|
133
|
+
errorIfInternalParamsAreSet(task);
|
108
134
|
validateInputTask(task);
|
109
135
|
// list files recursively
|
110
136
|
task.setFiles(listFiles(task));
|
@@ -130,9 +156,15 @@ public abstract class AbstractS3FileInputPlugin
|
|
130
156
|
|
131
157
|
// last_path
|
132
158
|
if (task.getIncremental()) {
|
133
|
-
|
134
|
-
|
135
|
-
|
159
|
+
if (task.getUseModifiedTime()) {
|
160
|
+
Date endModifiedTime = task.getEndModifiedTime().orElse(new Date());
|
161
|
+
configDiff.set("last_modified_time", new SimpleDateFormat(FULL_DATE_FORMAT).format(endModifiedTime));
|
162
|
+
}
|
163
|
+
else {
|
164
|
+
Optional<String> lastPath = task.getFiles().getLastPath(task.getLastPath());
|
165
|
+
LOGGER.info("Incremental job, setting last_path to [{}]", lastPath.orElse(""));
|
166
|
+
configDiff.set("last_path", lastPath);
|
167
|
+
}
|
136
168
|
}
|
137
169
|
return configDiff;
|
138
170
|
}
|
@@ -180,11 +212,10 @@ public abstract class AbstractS3FileInputPlugin
|
|
180
212
|
{
|
181
213
|
ClientConfiguration clientConfig = new ClientConfiguration();
|
182
214
|
|
183
|
-
/** PLT-9886: disable built-in retry*/
|
184
215
|
//clientConfig.setProtocol(Protocol.HTTP);
|
185
|
-
|
216
|
+
clientConfig.setMaxConnections(50); // SDK default: 50
|
186
217
|
// clientConfig.setMaxErrorRetry(3); // SDK default: 3
|
187
|
-
|
218
|
+
clientConfig.setSocketTimeout(8 * 60 * 1000); // SDK default: 50*1000
|
188
219
|
clientConfig.setRetryPolicy(PredefinedRetryPolicies.NO_RETRY_POLICY);
|
189
220
|
// set http proxy
|
190
221
|
if (task.getHttpProxy().isPresent()) {
|
@@ -238,22 +269,35 @@ public abstract class AbstractS3FileInputPlugin
|
|
238
269
|
String bucketName = task.getBucket();
|
239
270
|
FileList.Builder builder = new FileList.Builder(task);
|
240
271
|
RetryExecutor retryExec = retryExecutorFrom(task);
|
272
|
+
|
241
273
|
if (task.getPath().isPresent()) {
|
242
274
|
LOGGER.info("Start getting object with path: [{}]", task.getPath().get());
|
243
|
-
|
275
|
+
new S3SingleFileExplorer(bucketName, client, retryExec, task.getPath().get()).addToBuilder(builder);
|
276
|
+
return builder.build();
|
244
277
|
}
|
245
|
-
else {
|
246
|
-
// does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
|
247
|
-
LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
|
248
|
-
if (task.getPathPrefix().get().equals("/")) {
|
249
|
-
LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
250
|
-
}
|
251
278
|
|
252
|
-
|
253
|
-
|
254
|
-
|
279
|
+
// does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
|
280
|
+
LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
|
281
|
+
if (task.getPathPrefix().get().equals("/")) {
|
282
|
+
LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
255
283
|
}
|
256
284
|
|
285
|
+
if (task.getUseModifiedTime()) {
|
286
|
+
Date now = new Date();
|
287
|
+
Optional<Date> from = task.getLastModifiedTime().isPresent()
|
288
|
+
? Optional.of(DateUtils.parse(task.getLastModifiedTime().get(), Collections.singletonList(FULL_DATE_FORMAT)))
|
289
|
+
: Optional.empty();
|
290
|
+
task.setEndModifiedTime(Optional.of(now));
|
291
|
+
|
292
|
+
new S3TimeOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
|
293
|
+
task.getSkipGlacierObjects(), from, now).addToBuilder(builder);
|
294
|
+
}
|
295
|
+
else {
|
296
|
+
new S3NameOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
|
297
|
+
task.getSkipGlacierObjects(), task.getLastPath().orElse(null)).addToBuilder(builder);
|
298
|
+
}
|
299
|
+
|
300
|
+
LOGGER.info("Found total [{}] files", builder.size());
|
257
301
|
return builder.build();
|
258
302
|
}
|
259
303
|
catch (AmazonServiceException ex) {
|
@@ -269,107 +313,13 @@ public abstract class AbstractS3FileInputPlugin
|
|
269
313
|
}
|
270
314
|
}
|
271
315
|
|
272
|
-
|
273
|
-
public void addS3DirectObject(FileList.Builder builder,
|
274
|
-
final AmazonS3 client,
|
275
|
-
String bucket,
|
276
|
-
String objectKey)
|
277
|
-
{
|
278
|
-
addS3DirectObject(builder, client, bucket, objectKey, null);
|
279
|
-
}
|
280
|
-
|
281
|
-
@VisibleForTesting
|
282
|
-
public void addS3DirectObject(FileList.Builder builder,
|
283
|
-
final AmazonS3 client,
|
284
|
-
String bucket,
|
285
|
-
String objectKey,
|
286
|
-
RetryExecutor retryExec)
|
287
|
-
{
|
288
|
-
final GetObjectMetadataRequest objectMetadataRequest = new GetObjectMetadataRequest(bucket, objectKey);
|
289
|
-
|
290
|
-
ObjectMetadata objectMetadata = new DefaultRetryable<ObjectMetadata>("Looking up for a single object") {
|
291
|
-
@Override
|
292
|
-
public ObjectMetadata call()
|
293
|
-
{
|
294
|
-
return client.getObjectMetadata(objectMetadataRequest);
|
295
|
-
}
|
296
|
-
}.executeWith(retryExec);
|
297
|
-
|
298
|
-
builder.add(objectKey, objectMetadata.getContentLength());
|
299
|
-
}
|
300
|
-
|
301
|
-
private void validateInputTask(PluginTask task)
|
316
|
+
private void validateInputTask(final PluginTask task)
|
302
317
|
{
|
303
318
|
if (!task.getPathPrefix().isPresent() && !task.getPath().isPresent()) {
|
304
319
|
throw new ConfigException("Either path or path_prefix is required");
|
305
320
|
}
|
306
321
|
}
|
307
322
|
|
308
|
-
@VisibleForTesting
|
309
|
-
public static void listS3FilesByPrefix(FileList.Builder builder,
|
310
|
-
final AmazonS3 client,
|
311
|
-
String bucketName,
|
312
|
-
String prefix,
|
313
|
-
Optional<String> lastPath,
|
314
|
-
boolean skipGlacierObjects)
|
315
|
-
{
|
316
|
-
listS3FilesByPrefix(builder, client, bucketName, prefix, lastPath, skipGlacierObjects, null);
|
317
|
-
}
|
318
|
-
|
319
|
-
/**
|
320
|
-
* Lists S3 filenames filtered by prefix.
|
321
|
-
* <p>
|
322
|
-
* The resulting list does not include the file that's size == 0.
|
323
|
-
* @param builder custom Filelist builder
|
324
|
-
* @param client Amazon S3
|
325
|
-
* @param bucketName Amazon S3 bucket name
|
326
|
-
* @param prefix Amazon S3 bucket name prefix
|
327
|
-
* @param lastPath last path
|
328
|
-
* @param skipGlacierObjects skip gracier objects
|
329
|
-
* @param retryExec a retry executor object to do the retrying
|
330
|
-
*/
|
331
|
-
@VisibleForTesting
|
332
|
-
public static void listS3FilesByPrefix(FileList.Builder builder,
|
333
|
-
final AmazonS3 client,
|
334
|
-
String bucketName,
|
335
|
-
String prefix,
|
336
|
-
Optional<String> lastPath,
|
337
|
-
boolean skipGlacierObjects,
|
338
|
-
RetryExecutor retryExec)
|
339
|
-
{
|
340
|
-
String lastKey = lastPath.orElse(null);
|
341
|
-
do {
|
342
|
-
final String finalLastKey = lastKey;
|
343
|
-
final ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, finalLastKey, null, 1024);
|
344
|
-
ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects") {
|
345
|
-
@Override
|
346
|
-
public ObjectListing call()
|
347
|
-
{
|
348
|
-
return client.listObjects(req);
|
349
|
-
}
|
350
|
-
}.executeWith(retryExec);
|
351
|
-
for (S3ObjectSummary s : ol.getObjectSummaries()) {
|
352
|
-
if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
|
353
|
-
if (skipGlacierObjects) {
|
354
|
-
Exec.getLogger("AbstractS3FileInputPlugin.class").warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
|
355
|
-
continue;
|
356
|
-
}
|
357
|
-
else {
|
358
|
-
throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
|
359
|
-
}
|
360
|
-
}
|
361
|
-
if (s.getSize() > 0) {
|
362
|
-
builder.add(s.getKey(), s.getSize());
|
363
|
-
if (!builder.needsMore()) {
|
364
|
-
LOGGER.warn("Too many files matched, stop listing file");
|
365
|
-
return;
|
366
|
-
}
|
367
|
-
}
|
368
|
-
}
|
369
|
-
lastKey = ol.getNextMarker();
|
370
|
-
} while (lastKey != null);
|
371
|
-
}
|
372
|
-
|
373
323
|
@Override
|
374
324
|
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
375
325
|
{
|
@@ -441,6 +391,14 @@ public abstract class AbstractS3FileInputPlugin
|
|
441
391
|
}
|
442
392
|
}
|
443
393
|
|
394
|
+
@VisibleForTesting
|
395
|
+
static void errorIfInternalParamsAreSet(PluginTask task)
|
396
|
+
{
|
397
|
+
if (task.getEndModifiedTime().isPresent()) {
|
398
|
+
throw new ConfigException("'__end_modified_time' must not be set.");
|
399
|
+
}
|
400
|
+
}
|
401
|
+
|
444
402
|
// TODO create single-file InputStreamFileInput utility
|
445
403
|
private class SingleFileProvider
|
446
404
|
implements InputStreamFileInput.Provider
|
@@ -476,6 +434,9 @@ public abstract class AbstractS3FileInputPlugin
|
|
476
434
|
}.executeWithCheckedException(retryExec, IOException.class);
|
477
435
|
|
478
436
|
long objectSize = object.getObjectMetadata().getContentLength();
|
437
|
+
// Some plugin users are parsing this output to get file list.
|
438
|
+
// Keep it for now but might be removed in the future.
|
439
|
+
LOGGER.info("Open S3Object with bucket [{}], key [{}], with size [{}]", bucket, key, objectSize);
|
479
440
|
InputStream inputStream = new ResumableInputStream(object.getObjectContent(), new S3InputStreamReopener(client, request, objectSize, retryExec));
|
480
441
|
return new InputStreamWithHints(inputStream, String.format("s3://%s/%s", bucket, key));
|
481
442
|
}
|
@@ -19,7 +19,7 @@ import static org.embulk.spi.util.RetryExecutor.Retryable;
|
|
19
19
|
* Retryable utility, regardless the occurred exceptions,
|
20
20
|
* Also provide a default approach for exception propagation.
|
21
21
|
*/
|
22
|
-
class DefaultRetryable<T> implements Retryable<T>
|
22
|
+
public class DefaultRetryable<T> implements Retryable<T>
|
23
23
|
{
|
24
24
|
private static final Logger log = Exec.getLogger(DefaultRetryable.class);
|
25
25
|
private static final Set<Integer> NONRETRYABLE_STATUS_CODES = new HashSet<Integer>(2);
|
@@ -11,7 +11,7 @@ public interface RetrySupportPluginTask extends Task
|
|
11
11
|
int getMaximumRetries();
|
12
12
|
|
13
13
|
@Config("initial_retry_interval_millis")
|
14
|
-
@ConfigDefault("
|
14
|
+
@ConfigDefault("2000")
|
15
15
|
int getInitialRetryIntervalMillis();
|
16
16
|
|
17
17
|
@Config("maximum_retry_interval_millis")
|
@@ -0,0 +1,21 @@
|
|
1
|
+
package org.embulk.input.s3.explorer;
|
2
|
+
|
3
|
+
import com.amazonaws.services.s3.AmazonS3;
|
4
|
+
import org.embulk.input.s3.FileList;
|
5
|
+
import org.embulk.spi.util.RetryExecutor;
|
6
|
+
|
7
|
+
public abstract class S3FileExplorer
|
8
|
+
{
|
9
|
+
protected String bucketName;
|
10
|
+
protected AmazonS3 s3Client;
|
11
|
+
protected RetryExecutor retryExecutor;
|
12
|
+
|
13
|
+
public S3FileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor)
|
14
|
+
{
|
15
|
+
this.bucketName = bucketName;
|
16
|
+
this.s3Client = s3Client;
|
17
|
+
this.retryExecutor = retryExecutor;
|
18
|
+
}
|
19
|
+
|
20
|
+
public abstract void addToBuilder(FileList.Builder builder);
|
21
|
+
}
|
@@ -0,0 +1,45 @@
|
|
1
|
+
package org.embulk.input.s3.explorer;
|
2
|
+
|
3
|
+
import com.amazonaws.services.s3.AmazonS3;
|
4
|
+
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
5
|
+
import com.amazonaws.services.s3.model.ObjectListing;
|
6
|
+
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
7
|
+
import org.embulk.input.s3.DefaultRetryable;
|
8
|
+
import org.embulk.spi.util.RetryExecutor;
|
9
|
+
|
10
|
+
import java.util.List;
|
11
|
+
|
12
|
+
public class S3NameOrderPrefixFileExplorer extends S3PrefixFileExplorer
|
13
|
+
{
|
14
|
+
private String lastPath;
|
15
|
+
|
16
|
+
public S3NameOrderPrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor,
|
17
|
+
final String pathPrefix, final boolean skipGlacierObjects, final String lastPath)
|
18
|
+
{
|
19
|
+
super(bucketName, s3Client, retryExecutor, pathPrefix, skipGlacierObjects);
|
20
|
+
this.lastPath = lastPath;
|
21
|
+
}
|
22
|
+
|
23
|
+
@Override
|
24
|
+
protected List<S3ObjectSummary> fetch()
|
25
|
+
{
|
26
|
+
final ListObjectsRequest req = new ListObjectsRequest(bucketName, pathPrefix, lastPath, null, 1024);
|
27
|
+
final ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects")
|
28
|
+
{
|
29
|
+
@Override
|
30
|
+
public ObjectListing call()
|
31
|
+
{
|
32
|
+
return s3Client.listObjects(req);
|
33
|
+
}
|
34
|
+
}.executeWith(retryExecutor);
|
35
|
+
lastPath = ol.getNextMarker();
|
36
|
+
|
37
|
+
return ol.getObjectSummaries();
|
38
|
+
}
|
39
|
+
|
40
|
+
@Override
|
41
|
+
protected boolean hasNext()
|
42
|
+
{
|
43
|
+
return lastPath != null;
|
44
|
+
}
|
45
|
+
}
|
@@ -0,0 +1,57 @@
|
|
1
|
+
package org.embulk.input.s3.explorer;
|
2
|
+
|
3
|
+
import com.amazonaws.services.s3.AmazonS3;
|
4
|
+
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
5
|
+
import com.amazonaws.services.s3.model.StorageClass;
|
6
|
+
import org.embulk.config.ConfigException;
|
7
|
+
import org.embulk.input.s3.FileList;
|
8
|
+
import org.embulk.spi.Exec;
|
9
|
+
import org.embulk.spi.util.RetryExecutor;
|
10
|
+
import org.slf4j.Logger;
|
11
|
+
|
12
|
+
import java.util.List;
|
13
|
+
|
14
|
+
public abstract class S3PrefixFileExplorer extends S3FileExplorer
|
15
|
+
{
|
16
|
+
private static final Logger LOGGER = Exec.getLogger(S3PrefixFileExplorer.class);
|
17
|
+
|
18
|
+
protected String pathPrefix;
|
19
|
+
|
20
|
+
private final boolean skipGlacierObjects;
|
21
|
+
|
22
|
+
public S3PrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor, final String pathPrefix, final boolean skipGlacierObjects)
|
23
|
+
{
|
24
|
+
super(bucketName, s3Client, retryExecutor);
|
25
|
+
this.pathPrefix = pathPrefix;
|
26
|
+
this.skipGlacierObjects = skipGlacierObjects;
|
27
|
+
}
|
28
|
+
|
29
|
+
@Override
|
30
|
+
public void addToBuilder(final FileList.Builder builder)
|
31
|
+
{
|
32
|
+
do {
|
33
|
+
final List<S3ObjectSummary> s3ObjectSummaries = fetch();
|
34
|
+
|
35
|
+
for (final S3ObjectSummary s : s3ObjectSummaries) {
|
36
|
+
if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
|
37
|
+
if (skipGlacierObjects) {
|
38
|
+
LOGGER.warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
|
39
|
+
continue;
|
40
|
+
}
|
41
|
+
throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
|
42
|
+
}
|
43
|
+
if (s.getSize() > 0) {
|
44
|
+
builder.add(s.getKey(), s.getSize());
|
45
|
+
if (!builder.needsMore()) {
|
46
|
+
LOGGER.warn("Too many files matched, stop listing file");
|
47
|
+
return;
|
48
|
+
}
|
49
|
+
}
|
50
|
+
}
|
51
|
+
} while (hasNext());
|
52
|
+
}
|
53
|
+
|
54
|
+
protected abstract List<S3ObjectSummary> fetch();
|
55
|
+
|
56
|
+
protected abstract boolean hasNext();
|
57
|
+
}
|