embulk-input-s3 0.3.0 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/classpath/aws-java-sdk-sts-1.11.466.jar +0 -0
- data/classpath/embulk-input-s3-0.3.5.jar +0 -0
- data/classpath/embulk-util-aws-credentials-0.3.5.jar +0 -0
- data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +78 -117
- data/src/main/java/org/embulk/input/s3/DefaultRetryable.java +1 -1
- data/src/main/java/org/embulk/input/s3/RetrySupportPluginTask.java +1 -1
- data/src/main/java/org/embulk/input/s3/explorer/S3FileExplorer.java +21 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3NameOrderPrefixFileExplorer.java +45 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3PrefixFileExplorer.java +57 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3SingleFileExplorer.java +35 -0
- data/src/main/java/org/embulk/input/s3/explorer/S3TimeOrderPrefixFileExplorer.java +70 -0
- data/src/main/java/org/embulk/input/s3/utils/DateUtils.java +28 -0
- data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +0 -53
- data/src/test/java/org/embulk/input/s3/explorer/TestS3NameOrderPrefixFileExplorer.java +67 -0
- data/src/test/java/org/embulk/input/s3/explorer/TestS3PrefixFileExplorer.java +128 -0
- data/src/test/java/org/embulk/input/s3/explorer/TestS3SingleFileExplorer.java +56 -0
- data/src/test/java/org/embulk/input/s3/explorer/TestS3TimeOrderPrefixFileExplorer.java +112 -0
- metadata +15 -5
- data/classpath/embulk-input-s3-0.3.0.jar +0 -0
- data/classpath/embulk-util-aws-credentials-0.3.0.jar +0 -0
- data/src/test/java/org/embulk/input/s3/TestAbstractS3FileInputPlugin.java +0 -164
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d77ba197fb47f89fc3a890e240ddd963a5acc9fd
|
4
|
+
data.tar.gz: 97647120fdd11ddc13e03916a11429477c707ba4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d81311e7e1e921bc336de2a78c01230616104a3c93ca6ba7699fd934c8536588bb0fd813033fd596774a5995641b753eaf6007d489e3f44daf30edbee95df215
|
7
|
+
data.tar.gz: f4d71fe3d8be542bd7563f7b52d2503d8840314f175ab41fecfb2bf27498188a8ec3e0a58828fcc022bb4b650396e1559b87684134d0d232b65dce633c4450dd
|
Binary file
|
Binary file
|
Binary file
|
@@ -7,15 +7,9 @@ import com.amazonaws.auth.AWSCredentialsProvider;
|
|
7
7
|
import com.amazonaws.retry.PredefinedRetryPolicies;
|
8
8
|
import com.amazonaws.services.s3.AmazonS3;
|
9
9
|
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
|
10
|
-
import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
|
11
10
|
import com.amazonaws.services.s3.model.GetObjectRequest;
|
12
|
-
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
13
|
-
import com.amazonaws.services.s3.model.ObjectListing;
|
14
|
-
import com.amazonaws.services.s3.model.ObjectMetadata;
|
15
11
|
import com.amazonaws.services.s3.model.S3Object;
|
16
12
|
import com.amazonaws.services.s3.model.S3ObjectInputStream;
|
17
|
-
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
18
|
-
import com.amazonaws.services.s3.model.StorageClass;
|
19
13
|
import com.google.common.annotations.VisibleForTesting;
|
20
14
|
import org.embulk.config.Config;
|
21
15
|
import org.embulk.config.ConfigDefault;
|
@@ -26,6 +20,10 @@ import org.embulk.config.ConfigSource;
|
|
26
20
|
import org.embulk.config.Task;
|
27
21
|
import org.embulk.config.TaskReport;
|
28
22
|
import org.embulk.config.TaskSource;
|
23
|
+
import org.embulk.input.s3.explorer.S3NameOrderPrefixFileExplorer;
|
24
|
+
import org.embulk.input.s3.explorer.S3SingleFileExplorer;
|
25
|
+
import org.embulk.input.s3.explorer.S3TimeOrderPrefixFileExplorer;
|
26
|
+
import org.embulk.input.s3.utils.DateUtils;
|
29
27
|
import org.embulk.spi.BufferAllocator;
|
30
28
|
import org.embulk.spi.Exec;
|
31
29
|
import org.embulk.spi.FileInputPlugin;
|
@@ -40,6 +38,9 @@ import org.slf4j.Logger;
|
|
40
38
|
|
41
39
|
import java.io.IOException;
|
42
40
|
import java.io.InputStream;
|
41
|
+
import java.text.SimpleDateFormat;
|
42
|
+
import java.util.Collections;
|
43
|
+
import java.util.Date;
|
43
44
|
import java.util.Iterator;
|
44
45
|
import java.util.List;
|
45
46
|
import java.util.Optional;
|
@@ -51,6 +52,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
51
52
|
implements FileInputPlugin
|
52
53
|
{
|
53
54
|
private static final Logger LOGGER = Exec.getLogger(S3FileInputPlugin.class);
|
55
|
+
private static final String FULL_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
|
54
56
|
|
55
57
|
public interface PluginTask
|
56
58
|
extends AwsCredentialsTask, FileList.Task, RetrySupportPluginTask, Task
|
@@ -88,12 +90,35 @@ public abstract class AbstractS3FileInputPlugin
|
|
88
90
|
@ConfigDefault("false")
|
89
91
|
boolean getSkipGlacierObjects();
|
90
92
|
|
93
|
+
@Config("use_modified_time")
|
94
|
+
@ConfigDefault("false")
|
95
|
+
boolean getUseModifiedTime();
|
96
|
+
|
97
|
+
@Config("last_modified_time")
|
98
|
+
@ConfigDefault("null")
|
99
|
+
Optional<String> getLastModifiedTime();
|
100
|
+
|
91
101
|
// TODO timeout, ssl, etc
|
92
102
|
|
103
|
+
////////////////////////////////////////
|
104
|
+
// Internal configurations
|
105
|
+
////////////////////////////////////////
|
106
|
+
|
93
107
|
FileList getFiles();
|
94
108
|
|
95
109
|
void setFiles(FileList files);
|
96
110
|
|
111
|
+
/**
|
112
|
+
* end_modified_time is conditionally set if modified_time mode is enabled.
|
113
|
+
*
|
114
|
+
* It is internal state and must not be set in config.yml
|
115
|
+
*/
|
116
|
+
@Config("__end_modified_time")
|
117
|
+
@ConfigDefault("null")
|
118
|
+
Optional<Date> getEndModifiedTime();
|
119
|
+
|
120
|
+
void setEndModifiedTime(Optional<Date> endModifiedTime);
|
121
|
+
|
97
122
|
@ConfigInject
|
98
123
|
BufferAllocator getBufferAllocator();
|
99
124
|
}
|
@@ -105,6 +130,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
105
130
|
{
|
106
131
|
PluginTask task = config.loadConfig(getTaskClass());
|
107
132
|
|
133
|
+
errorIfInternalParamsAreSet(task);
|
108
134
|
validateInputTask(task);
|
109
135
|
// list files recursively
|
110
136
|
task.setFiles(listFiles(task));
|
@@ -130,9 +156,15 @@ public abstract class AbstractS3FileInputPlugin
|
|
130
156
|
|
131
157
|
// last_path
|
132
158
|
if (task.getIncremental()) {
|
133
|
-
|
134
|
-
|
135
|
-
|
159
|
+
if (task.getUseModifiedTime()) {
|
160
|
+
Date endModifiedTime = task.getEndModifiedTime().orElse(new Date());
|
161
|
+
configDiff.set("last_modified_time", new SimpleDateFormat(FULL_DATE_FORMAT).format(endModifiedTime));
|
162
|
+
}
|
163
|
+
else {
|
164
|
+
Optional<String> lastPath = task.getFiles().getLastPath(task.getLastPath());
|
165
|
+
LOGGER.info("Incremental job, setting last_path to [{}]", lastPath.orElse(""));
|
166
|
+
configDiff.set("last_path", lastPath);
|
167
|
+
}
|
136
168
|
}
|
137
169
|
return configDiff;
|
138
170
|
}
|
@@ -180,11 +212,10 @@ public abstract class AbstractS3FileInputPlugin
|
|
180
212
|
{
|
181
213
|
ClientConfiguration clientConfig = new ClientConfiguration();
|
182
214
|
|
183
|
-
/** PLT-9886: disable built-in retry*/
|
184
215
|
//clientConfig.setProtocol(Protocol.HTTP);
|
185
|
-
|
216
|
+
clientConfig.setMaxConnections(50); // SDK default: 50
|
186
217
|
// clientConfig.setMaxErrorRetry(3); // SDK default: 3
|
187
|
-
|
218
|
+
clientConfig.setSocketTimeout(8 * 60 * 1000); // SDK default: 50*1000
|
188
219
|
clientConfig.setRetryPolicy(PredefinedRetryPolicies.NO_RETRY_POLICY);
|
189
220
|
// set http proxy
|
190
221
|
if (task.getHttpProxy().isPresent()) {
|
@@ -238,22 +269,35 @@ public abstract class AbstractS3FileInputPlugin
|
|
238
269
|
String bucketName = task.getBucket();
|
239
270
|
FileList.Builder builder = new FileList.Builder(task);
|
240
271
|
RetryExecutor retryExec = retryExecutorFrom(task);
|
272
|
+
|
241
273
|
if (task.getPath().isPresent()) {
|
242
274
|
LOGGER.info("Start getting object with path: [{}]", task.getPath().get());
|
243
|
-
|
275
|
+
new S3SingleFileExplorer(bucketName, client, retryExec, task.getPath().get()).addToBuilder(builder);
|
276
|
+
return builder.build();
|
244
277
|
}
|
245
|
-
else {
|
246
|
-
// does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
|
247
|
-
LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
|
248
|
-
if (task.getPathPrefix().get().equals("/")) {
|
249
|
-
LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
250
|
-
}
|
251
278
|
|
252
|
-
|
253
|
-
|
254
|
-
|
279
|
+
// does not need to verify existent path prefix here since there is the validation requires either path or path_prefix
|
280
|
+
LOGGER.info("Start listing file with prefix [{}]", task.getPathPrefix().get());
|
281
|
+
if (task.getPathPrefix().get().equals("/")) {
|
282
|
+
LOGGER.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
255
283
|
}
|
256
284
|
|
285
|
+
if (task.getUseModifiedTime()) {
|
286
|
+
Date now = new Date();
|
287
|
+
Optional<Date> from = task.getLastModifiedTime().isPresent()
|
288
|
+
? Optional.of(DateUtils.parse(task.getLastModifiedTime().get(), Collections.singletonList(FULL_DATE_FORMAT)))
|
289
|
+
: Optional.empty();
|
290
|
+
task.setEndModifiedTime(Optional.of(now));
|
291
|
+
|
292
|
+
new S3TimeOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
|
293
|
+
task.getSkipGlacierObjects(), from, now).addToBuilder(builder);
|
294
|
+
}
|
295
|
+
else {
|
296
|
+
new S3NameOrderPrefixFileExplorer(bucketName, client, retryExec, task.getPathPrefix().get(),
|
297
|
+
task.getSkipGlacierObjects(), task.getLastPath().orElse(null)).addToBuilder(builder);
|
298
|
+
}
|
299
|
+
|
300
|
+
LOGGER.info("Found total [{}] files", builder.size());
|
257
301
|
return builder.build();
|
258
302
|
}
|
259
303
|
catch (AmazonServiceException ex) {
|
@@ -269,107 +313,13 @@ public abstract class AbstractS3FileInputPlugin
|
|
269
313
|
}
|
270
314
|
}
|
271
315
|
|
272
|
-
|
273
|
-
public void addS3DirectObject(FileList.Builder builder,
|
274
|
-
final AmazonS3 client,
|
275
|
-
String bucket,
|
276
|
-
String objectKey)
|
277
|
-
{
|
278
|
-
addS3DirectObject(builder, client, bucket, objectKey, null);
|
279
|
-
}
|
280
|
-
|
281
|
-
@VisibleForTesting
|
282
|
-
public void addS3DirectObject(FileList.Builder builder,
|
283
|
-
final AmazonS3 client,
|
284
|
-
String bucket,
|
285
|
-
String objectKey,
|
286
|
-
RetryExecutor retryExec)
|
287
|
-
{
|
288
|
-
final GetObjectMetadataRequest objectMetadataRequest = new GetObjectMetadataRequest(bucket, objectKey);
|
289
|
-
|
290
|
-
ObjectMetadata objectMetadata = new DefaultRetryable<ObjectMetadata>("Looking up for a single object") {
|
291
|
-
@Override
|
292
|
-
public ObjectMetadata call()
|
293
|
-
{
|
294
|
-
return client.getObjectMetadata(objectMetadataRequest);
|
295
|
-
}
|
296
|
-
}.executeWith(retryExec);
|
297
|
-
|
298
|
-
builder.add(objectKey, objectMetadata.getContentLength());
|
299
|
-
}
|
300
|
-
|
301
|
-
private void validateInputTask(PluginTask task)
|
316
|
+
private void validateInputTask(final PluginTask task)
|
302
317
|
{
|
303
318
|
if (!task.getPathPrefix().isPresent() && !task.getPath().isPresent()) {
|
304
319
|
throw new ConfigException("Either path or path_prefix is required");
|
305
320
|
}
|
306
321
|
}
|
307
322
|
|
308
|
-
@VisibleForTesting
|
309
|
-
public static void listS3FilesByPrefix(FileList.Builder builder,
|
310
|
-
final AmazonS3 client,
|
311
|
-
String bucketName,
|
312
|
-
String prefix,
|
313
|
-
Optional<String> lastPath,
|
314
|
-
boolean skipGlacierObjects)
|
315
|
-
{
|
316
|
-
listS3FilesByPrefix(builder, client, bucketName, prefix, lastPath, skipGlacierObjects, null);
|
317
|
-
}
|
318
|
-
|
319
|
-
/**
|
320
|
-
* Lists S3 filenames filtered by prefix.
|
321
|
-
* <p>
|
322
|
-
* The resulting list does not include the file that's size == 0.
|
323
|
-
* @param builder custom Filelist builder
|
324
|
-
* @param client Amazon S3
|
325
|
-
* @param bucketName Amazon S3 bucket name
|
326
|
-
* @param prefix Amazon S3 bucket name prefix
|
327
|
-
* @param lastPath last path
|
328
|
-
* @param skipGlacierObjects skip gracier objects
|
329
|
-
* @param retryExec a retry executor object to do the retrying
|
330
|
-
*/
|
331
|
-
@VisibleForTesting
|
332
|
-
public static void listS3FilesByPrefix(FileList.Builder builder,
|
333
|
-
final AmazonS3 client,
|
334
|
-
String bucketName,
|
335
|
-
String prefix,
|
336
|
-
Optional<String> lastPath,
|
337
|
-
boolean skipGlacierObjects,
|
338
|
-
RetryExecutor retryExec)
|
339
|
-
{
|
340
|
-
String lastKey = lastPath.orElse(null);
|
341
|
-
do {
|
342
|
-
final String finalLastKey = lastKey;
|
343
|
-
final ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, finalLastKey, null, 1024);
|
344
|
-
ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects") {
|
345
|
-
@Override
|
346
|
-
public ObjectListing call()
|
347
|
-
{
|
348
|
-
return client.listObjects(req);
|
349
|
-
}
|
350
|
-
}.executeWith(retryExec);
|
351
|
-
for (S3ObjectSummary s : ol.getObjectSummaries()) {
|
352
|
-
if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
|
353
|
-
if (skipGlacierObjects) {
|
354
|
-
Exec.getLogger("AbstractS3FileInputPlugin.class").warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
|
355
|
-
continue;
|
356
|
-
}
|
357
|
-
else {
|
358
|
-
throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
|
359
|
-
}
|
360
|
-
}
|
361
|
-
if (s.getSize() > 0) {
|
362
|
-
builder.add(s.getKey(), s.getSize());
|
363
|
-
if (!builder.needsMore()) {
|
364
|
-
LOGGER.warn("Too many files matched, stop listing file");
|
365
|
-
return;
|
366
|
-
}
|
367
|
-
}
|
368
|
-
}
|
369
|
-
lastKey = ol.getNextMarker();
|
370
|
-
} while (lastKey != null);
|
371
|
-
}
|
372
|
-
|
373
323
|
@Override
|
374
324
|
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
375
325
|
{
|
@@ -441,6 +391,14 @@ public abstract class AbstractS3FileInputPlugin
|
|
441
391
|
}
|
442
392
|
}
|
443
393
|
|
394
|
+
@VisibleForTesting
|
395
|
+
static void errorIfInternalParamsAreSet(PluginTask task)
|
396
|
+
{
|
397
|
+
if (task.getEndModifiedTime().isPresent()) {
|
398
|
+
throw new ConfigException("'__end_modified_time' must not be set.");
|
399
|
+
}
|
400
|
+
}
|
401
|
+
|
444
402
|
// TODO create single-file InputStreamFileInput utility
|
445
403
|
private class SingleFileProvider
|
446
404
|
implements InputStreamFileInput.Provider
|
@@ -476,6 +434,9 @@ public abstract class AbstractS3FileInputPlugin
|
|
476
434
|
}.executeWithCheckedException(retryExec, IOException.class);
|
477
435
|
|
478
436
|
long objectSize = object.getObjectMetadata().getContentLength();
|
437
|
+
// Some plugin users are parsing this output to get file list.
|
438
|
+
// Keep it for now but might be removed in the future.
|
439
|
+
LOGGER.info("Open S3Object with bucket [{}], key [{}], with size [{}]", bucket, key, objectSize);
|
479
440
|
InputStream inputStream = new ResumableInputStream(object.getObjectContent(), new S3InputStreamReopener(client, request, objectSize, retryExec));
|
480
441
|
return new InputStreamWithHints(inputStream, String.format("s3://%s/%s", bucket, key));
|
481
442
|
}
|
@@ -19,7 +19,7 @@ import static org.embulk.spi.util.RetryExecutor.Retryable;
|
|
19
19
|
* Retryable utility, regardless the occurred exceptions,
|
20
20
|
* Also provide a default approach for exception propagation.
|
21
21
|
*/
|
22
|
-
class DefaultRetryable<T> implements Retryable<T>
|
22
|
+
public class DefaultRetryable<T> implements Retryable<T>
|
23
23
|
{
|
24
24
|
private static final Logger log = Exec.getLogger(DefaultRetryable.class);
|
25
25
|
private static final Set<Integer> NONRETRYABLE_STATUS_CODES = new HashSet<Integer>(2);
|
@@ -11,7 +11,7 @@ public interface RetrySupportPluginTask extends Task
|
|
11
11
|
int getMaximumRetries();
|
12
12
|
|
13
13
|
@Config("initial_retry_interval_millis")
|
14
|
-
@ConfigDefault("
|
14
|
+
@ConfigDefault("2000")
|
15
15
|
int getInitialRetryIntervalMillis();
|
16
16
|
|
17
17
|
@Config("maximum_retry_interval_millis")
|
@@ -0,0 +1,21 @@
|
|
1
|
+
package org.embulk.input.s3.explorer;
|
2
|
+
|
3
|
+
import com.amazonaws.services.s3.AmazonS3;
|
4
|
+
import org.embulk.input.s3.FileList;
|
5
|
+
import org.embulk.spi.util.RetryExecutor;
|
6
|
+
|
7
|
+
public abstract class S3FileExplorer
|
8
|
+
{
|
9
|
+
protected String bucketName;
|
10
|
+
protected AmazonS3 s3Client;
|
11
|
+
protected RetryExecutor retryExecutor;
|
12
|
+
|
13
|
+
public S3FileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor)
|
14
|
+
{
|
15
|
+
this.bucketName = bucketName;
|
16
|
+
this.s3Client = s3Client;
|
17
|
+
this.retryExecutor = retryExecutor;
|
18
|
+
}
|
19
|
+
|
20
|
+
public abstract void addToBuilder(FileList.Builder builder);
|
21
|
+
}
|
@@ -0,0 +1,45 @@
|
|
1
|
+
package org.embulk.input.s3.explorer;
|
2
|
+
|
3
|
+
import com.amazonaws.services.s3.AmazonS3;
|
4
|
+
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
5
|
+
import com.amazonaws.services.s3.model.ObjectListing;
|
6
|
+
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
7
|
+
import org.embulk.input.s3.DefaultRetryable;
|
8
|
+
import org.embulk.spi.util.RetryExecutor;
|
9
|
+
|
10
|
+
import java.util.List;
|
11
|
+
|
12
|
+
public class S3NameOrderPrefixFileExplorer extends S3PrefixFileExplorer
|
13
|
+
{
|
14
|
+
private String lastPath;
|
15
|
+
|
16
|
+
public S3NameOrderPrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor,
|
17
|
+
final String pathPrefix, final boolean skipGlacierObjects, final String lastPath)
|
18
|
+
{
|
19
|
+
super(bucketName, s3Client, retryExecutor, pathPrefix, skipGlacierObjects);
|
20
|
+
this.lastPath = lastPath;
|
21
|
+
}
|
22
|
+
|
23
|
+
@Override
|
24
|
+
protected List<S3ObjectSummary> fetch()
|
25
|
+
{
|
26
|
+
final ListObjectsRequest req = new ListObjectsRequest(bucketName, pathPrefix, lastPath, null, 1024);
|
27
|
+
final ObjectListing ol = new DefaultRetryable<ObjectListing>("Listing objects")
|
28
|
+
{
|
29
|
+
@Override
|
30
|
+
public ObjectListing call()
|
31
|
+
{
|
32
|
+
return s3Client.listObjects(req);
|
33
|
+
}
|
34
|
+
}.executeWith(retryExecutor);
|
35
|
+
lastPath = ol.getNextMarker();
|
36
|
+
|
37
|
+
return ol.getObjectSummaries();
|
38
|
+
}
|
39
|
+
|
40
|
+
@Override
|
41
|
+
protected boolean hasNext()
|
42
|
+
{
|
43
|
+
return lastPath != null;
|
44
|
+
}
|
45
|
+
}
|
@@ -0,0 +1,57 @@
|
|
1
|
+
package org.embulk.input.s3.explorer;
|
2
|
+
|
3
|
+
import com.amazonaws.services.s3.AmazonS3;
|
4
|
+
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
5
|
+
import com.amazonaws.services.s3.model.StorageClass;
|
6
|
+
import org.embulk.config.ConfigException;
|
7
|
+
import org.embulk.input.s3.FileList;
|
8
|
+
import org.embulk.spi.Exec;
|
9
|
+
import org.embulk.spi.util.RetryExecutor;
|
10
|
+
import org.slf4j.Logger;
|
11
|
+
|
12
|
+
import java.util.List;
|
13
|
+
|
14
|
+
public abstract class S3PrefixFileExplorer extends S3FileExplorer
|
15
|
+
{
|
16
|
+
private static final Logger LOGGER = Exec.getLogger(S3PrefixFileExplorer.class);
|
17
|
+
|
18
|
+
protected String pathPrefix;
|
19
|
+
|
20
|
+
private final boolean skipGlacierObjects;
|
21
|
+
|
22
|
+
public S3PrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor, final String pathPrefix, final boolean skipGlacierObjects)
|
23
|
+
{
|
24
|
+
super(bucketName, s3Client, retryExecutor);
|
25
|
+
this.pathPrefix = pathPrefix;
|
26
|
+
this.skipGlacierObjects = skipGlacierObjects;
|
27
|
+
}
|
28
|
+
|
29
|
+
@Override
|
30
|
+
public void addToBuilder(final FileList.Builder builder)
|
31
|
+
{
|
32
|
+
do {
|
33
|
+
final List<S3ObjectSummary> s3ObjectSummaries = fetch();
|
34
|
+
|
35
|
+
for (final S3ObjectSummary s : s3ObjectSummaries) {
|
36
|
+
if (s.getStorageClass().equals(StorageClass.Glacier.toString())) {
|
37
|
+
if (skipGlacierObjects) {
|
38
|
+
LOGGER.warn("Skipped \"s3://{}/{}\" that stored at Glacier.", bucketName, s.getKey());
|
39
|
+
continue;
|
40
|
+
}
|
41
|
+
throw new ConfigException("Detected an object stored at Glacier. Set \"skip_glacier_objects\" option to \"true\" to skip this.");
|
42
|
+
}
|
43
|
+
if (s.getSize() > 0) {
|
44
|
+
builder.add(s.getKey(), s.getSize());
|
45
|
+
if (!builder.needsMore()) {
|
46
|
+
LOGGER.warn("Too many files matched, stop listing file");
|
47
|
+
return;
|
48
|
+
}
|
49
|
+
}
|
50
|
+
}
|
51
|
+
} while (hasNext());
|
52
|
+
}
|
53
|
+
|
54
|
+
protected abstract List<S3ObjectSummary> fetch();
|
55
|
+
|
56
|
+
protected abstract boolean hasNext();
|
57
|
+
}
|