embulk-input-s3 0.3.0 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22):
  1. checksums.yaml +4 -4
  2. data/classpath/aws-java-sdk-sts-1.11.466.jar +0 -0
  3. data/classpath/embulk-input-s3-0.3.5.jar +0 -0
  4. data/classpath/embulk-util-aws-credentials-0.3.5.jar +0 -0
  5. data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +78 -117
  6. data/src/main/java/org/embulk/input/s3/DefaultRetryable.java +1 -1
  7. data/src/main/java/org/embulk/input/s3/RetrySupportPluginTask.java +1 -1
  8. data/src/main/java/org/embulk/input/s3/explorer/S3FileExplorer.java +21 -0
  9. data/src/main/java/org/embulk/input/s3/explorer/S3NameOrderPrefixFileExplorer.java +45 -0
  10. data/src/main/java/org/embulk/input/s3/explorer/S3PrefixFileExplorer.java +57 -0
  11. data/src/main/java/org/embulk/input/s3/explorer/S3SingleFileExplorer.java +35 -0
  12. data/src/main/java/org/embulk/input/s3/explorer/S3TimeOrderPrefixFileExplorer.java +70 -0
  13. data/src/main/java/org/embulk/input/s3/utils/DateUtils.java +28 -0
  14. data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +0 -53
  15. data/src/test/java/org/embulk/input/s3/explorer/TestS3NameOrderPrefixFileExplorer.java +67 -0
  16. data/src/test/java/org/embulk/input/s3/explorer/TestS3PrefixFileExplorer.java +128 -0
  17. data/src/test/java/org/embulk/input/s3/explorer/TestS3SingleFileExplorer.java +56 -0
  18. data/src/test/java/org/embulk/input/s3/explorer/TestS3TimeOrderPrefixFileExplorer.java +112 -0
  19. metadata +15 -5
  20. data/classpath/embulk-input-s3-0.3.0.jar +0 -0
  21. data/classpath/embulk-util-aws-credentials-0.3.0.jar +0 -0
  22. data/src/test/java/org/embulk/input/s3/TestAbstractS3FileInputPlugin.java +0 -164
@@ -0,0 +1,35 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
5
+ import com.amazonaws.services.s3.model.ObjectMetadata;
6
+ import org.embulk.input.s3.DefaultRetryable;
7
+ import org.embulk.input.s3.FileList;
8
+ import org.embulk.spi.util.RetryExecutor;
9
+
10
+ public class S3SingleFileExplorer extends S3FileExplorer
11
+ {
12
+ private final String path;
13
+
14
+ public S3SingleFileExplorer(final String bucket, final AmazonS3 client, final RetryExecutor retryExecutor, final String path)
15
+ {
16
+ super(bucket, client, retryExecutor);
17
+ this.path = path;
18
+ }
19
+
20
+ @Override
21
+ public void addToBuilder(final FileList.Builder builder)
22
+ {
23
+ final GetObjectMetadataRequest objectMetadataRequest = new GetObjectMetadataRequest(bucketName, path);
24
+
25
+ final ObjectMetadata objectMetadata = new DefaultRetryable<ObjectMetadata>("Looking up for a single object") {
26
+ @Override
27
+ public ObjectMetadata call()
28
+ {
29
+ return s3Client.getObjectMetadata(objectMetadataRequest);
30
+ }
31
+ }.executeWith(retryExecutor);
32
+
33
+ builder.add(path, objectMetadata.getContentLength());
34
+ }
35
+ }
@@ -0,0 +1,70 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
5
+ import com.amazonaws.services.s3.model.ObjectListing;
6
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
7
+ import org.apache.commons.lang3.StringUtils;
8
+ import org.embulk.input.s3.DefaultRetryable;
9
+ import org.embulk.spi.Exec;
10
+ import org.embulk.spi.util.RetryExecutor;
11
+ import org.slf4j.Logger;
12
+
13
+ import java.util.Date;
14
+ import java.util.List;
15
+ import java.util.Optional;
16
+ import java.util.stream.Collectors;
17
+
18
+ public class S3TimeOrderPrefixFileExplorer extends S3PrefixFileExplorer
19
+ {
20
+ private static final Logger LOGGER = Exec.getLogger(S3TimeOrderPrefixFileExplorer.class);
21
+
22
+ private final Optional<Date> from;
23
+ private final Date to;
24
+
25
+ private String lastPath;
26
+
27
+ private int numOfReq = 0;
28
+
29
+ public S3TimeOrderPrefixFileExplorer(final String bucket, final AmazonS3 client, final RetryExecutor retryExecutor,
30
+ final String pathPrefix, final boolean skipGlacierObjects, final Optional<Date> from, final Date to)
31
+ {
32
+ super(bucket, client, retryExecutor, pathPrefix, skipGlacierObjects);
33
+ this.from = from;
34
+ this.to = to;
35
+ }
36
+
37
+ @Override
38
+ public List<S3ObjectSummary> fetch()
39
+ {
40
+ ++numOfReq;
41
+
42
+ final ListObjectsRequest req = new ListObjectsRequest(bucketName, pathPrefix, lastPath, null, 1024);
43
+ final ObjectListing objectListing = new DefaultRetryable<ObjectListing>("Listing objects")
44
+ {
45
+ @Override
46
+ public ObjectListing call()
47
+ {
48
+ return s3Client.listObjects(req);
49
+ }
50
+ }.executeWith(retryExecutor);
51
+ lastPath = objectListing.getNextMarker();
52
+
53
+ return objectListing.getObjectSummaries()
54
+ .stream()
55
+ .filter(s3ObjectSummary -> s3ObjectSummary.getLastModified().before(to)
56
+ && (!from.isPresent() || s3ObjectSummary.getLastModified().equals(from.get()) || s3ObjectSummary.getLastModified().after(from.get())))
57
+ .collect(Collectors.toList());
58
+ }
59
+
60
+ @Override
61
+ public boolean hasNext()
62
+ {
63
+ if (lastPath == null) {
64
+ LOGGER.info("The total number of LIST requests is {}{}.", numOfReq,
65
+ numOfReq < 10 ? StringUtils.EMPTY : ". Clean up your s3 bucket to reduce the number of requests and improve the ingesting performance");
66
+ return false;
67
+ }
68
+ return true;
69
+ }
70
+ }
@@ -0,0 +1,28 @@
1
+ package org.embulk.input.s3.utils;
2
+
3
+ import com.google.common.base.Joiner;
4
+ import org.embulk.config.ConfigException;
5
+ import org.joda.time.format.DateTimeFormat;
6
+
7
+ import java.util.Date;
8
+ import java.util.List;
9
+
10
+ public class DateUtils
11
+ {
12
+ public static Date parse(final String value, final List<String> supportedFormats)
13
+ throws ConfigException
14
+ {
15
+ for (final String fmt : supportedFormats) {
16
+ try {
17
+ return DateTimeFormat.forPattern(fmt).parseDateTime(value).toDate();
18
+ } catch (final IllegalArgumentException e) {
19
+ // ignorable exception
20
+ }
21
+ }
22
+ throw new ConfigException("Unsupported DateTime value: '" + value + "', supported formats: [" + Joiner.on(",").join(supportedFormats) + "]");
23
+ }
24
+
25
+ private DateUtils()
26
+ {
27
+ }
28
+ }
@@ -1,16 +1,11 @@
1
1
  package org.embulk.input.s3;
2
2
 
3
3
  import com.amazonaws.services.s3.AmazonS3;
4
- import com.amazonaws.services.s3.model.ListObjectsRequest;
5
- import com.amazonaws.services.s3.model.ObjectListing;
6
4
  import com.amazonaws.services.s3.model.Region;
7
- import com.amazonaws.services.s3.model.S3ObjectSummary;
8
- import com.amazonaws.services.s3.model.StorageClass;
9
5
  import com.google.common.collect.ImmutableList;
10
6
  import com.google.common.collect.ImmutableMap;
11
7
  import org.embulk.EmbulkTestRuntime;
12
8
  import org.embulk.config.ConfigDiff;
13
- import org.embulk.config.ConfigException;
14
9
  import org.embulk.config.ConfigSource;
15
10
  import org.embulk.config.TaskReport;
16
11
  import org.embulk.config.TaskSource;
@@ -25,21 +20,15 @@ import org.junit.Before;
25
20
  import org.junit.BeforeClass;
26
21
  import org.junit.Rule;
27
22
  import org.junit.Test;
28
- import org.mockito.Mockito;
29
23
 
30
- import java.lang.reflect.Field;
31
24
  import java.util.ArrayList;
32
25
  import java.util.List;
33
- import java.util.Optional;
34
26
 
35
27
  import static org.embulk.input.s3.S3FileInputPlugin.S3PluginTask;
36
28
  import static org.junit.Assert.assertEquals;
37
29
  import static org.junit.Assert.assertFalse;
38
30
  import static org.junit.Assert.assertNull;
39
31
  import static org.junit.Assume.assumeNotNull;
40
- import static org.mockito.Matchers.any;
41
- import static org.mockito.Mockito.doReturn;
42
- import static org.mockito.Mockito.mock;
43
32
 
44
33
  public class TestS3FileInputPlugin
45
34
  {
@@ -97,7 +86,6 @@ public class TestS3FileInputPlugin
97
86
 
98
87
  @Test
99
88
  public void useLastPath()
100
- throws Exception
101
89
  {
102
90
  ConfigSource config = this.config.deepCopy().set("last_path", EMBULK_S3_TEST_PATH_PREFIX + "/sample_01.csv");
103
91
  ConfigDiff configDiff = runner.transaction(config, new Control(runner, output));
@@ -117,7 +105,6 @@ public class TestS3FileInputPlugin
117
105
 
118
106
  @Test
119
107
  public void emptyFilesWithLastPath()
120
- throws Exception
121
108
  {
122
109
  ConfigSource config = this.config.deepCopy()
123
110
  .set("path_prefix", "empty_files_prefix")
@@ -130,7 +117,6 @@ public class TestS3FileInputPlugin
130
117
 
131
118
  @Test
132
119
  public void useTotalFileCountLimit()
133
- throws Exception
134
120
  {
135
121
  ConfigSource config = this.config.deepCopy().set("total_file_count_limit", 0);
136
122
  ConfigDiff configDiff = runner.transaction(config, new Control(runner, output));
@@ -141,7 +127,6 @@ public class TestS3FileInputPlugin
141
127
 
142
128
  @Test
143
129
  public void usePathMatchPattern()
144
- throws Exception
145
130
  {
146
131
  { // match pattern
147
132
  ConfigSource config = this.config.deepCopy().set("path_match_pattern", "/sample_01");
@@ -227,44 +212,6 @@ public class TestS3FileInputPlugin
227
212
  assertEquals(s3Client.getRegion(), Region.US_Standard);
228
213
  }
229
214
 
230
- @Test(expected = ConfigException.class)
231
- public void useSkipGlacierObjects() throws Exception
232
- {
233
- AmazonS3 client;
234
- client = mock(AmazonS3.class);
235
- doReturn(s3objectList("in/aa/a", StorageClass.Glacier)).when(client).listObjects(any(ListObjectsRequest.class));
236
-
237
- AbstractS3FileInputPlugin plugin = Mockito.mock(AbstractS3FileInputPlugin.class, Mockito.CALLS_REAL_METHODS);
238
- plugin.listS3FilesByPrefix(newFileList(config, "sample_00", 100L), client, "test_bucket", "test_prefix", Optional.empty(), false);
239
- }
240
-
241
- private FileList.Builder newFileList(ConfigSource config, Object... nameAndSize)
242
- {
243
- FileList.Builder builder = new FileList.Builder(config);
244
- for (int i = 0; i < nameAndSize.length; i += 2) {
245
- builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
246
- }
247
- return builder;
248
- }
249
-
250
- private ObjectListing s3objectList(String key, StorageClass storageClass) throws Exception
251
- {
252
- ObjectListing list = new ObjectListing();
253
-
254
- S3ObjectSummary element = new S3ObjectSummary();
255
- element.setKey(key);
256
- element.setStorageClass(storageClass.toString());
257
-
258
- List<S3ObjectSummary> objectSummaries = new ArrayList<>();
259
- objectSummaries.add(element);
260
-
261
- Field field = list.getClass().getDeclaredField("objectSummaries");
262
- field.setAccessible(true);
263
- field.set(list, objectSummaries);
264
-
265
- return list;
266
- }
267
-
268
215
  static class Control
269
216
  implements InputPlugin.Control
270
217
  {
@@ -0,0 +1,67 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
5
+ import com.amazonaws.services.s3.model.ObjectListing;
6
+ import org.embulk.EmbulkTestRuntime;
7
+ import org.junit.Before;
8
+ import org.junit.Rule;
9
+ import org.junit.Test;
10
+ import org.junit.runner.RunWith;
11
+ import org.mockito.ArgumentCaptor;
12
+ import org.mockito.Mock;
13
+ import org.mockito.internal.util.reflection.FieldSetter;
14
+ import org.mockito.runners.MockitoJUnitRunner;
15
+
16
+ import static org.junit.Assert.assertEquals;
17
+ import static org.junit.Assert.assertFalse;
18
+ import static org.mockito.Matchers.any;
19
+ import static org.mockito.Mockito.mock;
20
+ import static org.mockito.Mockito.verify;
21
+ import static org.mockito.Mockito.when;
22
+
23
+ @RunWith(MockitoJUnitRunner.class)
24
+ public class TestS3NameOrderPrefixFileExplorer
25
+ {
26
+ private static final String BUCKET_NAME = "bucket_name";
27
+ private static final String PATH_PREFIX = "path_prefix";
28
+ private static final String LAST_PATH = "last_path";
29
+
30
+ @Rule
31
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
32
+
33
+ @Mock
34
+ private AmazonS3 s3Client;
35
+
36
+ private S3NameOrderPrefixFileExplorer s3NameOrderPrefixFileExplorer;
37
+
38
+ @Before
39
+ public void setUp()
40
+ {
41
+ s3NameOrderPrefixFileExplorer = new S3NameOrderPrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, false, LAST_PATH);
42
+ }
43
+
44
+ @Test
45
+ public void fetch_should_return_list_objects()
46
+ {
47
+ final ObjectListing ol = mock(ObjectListing.class);
48
+ when(s3Client.listObjects(any(ListObjectsRequest.class))).thenReturn(ol);
49
+
50
+ s3NameOrderPrefixFileExplorer.fetch();
51
+ final ArgumentCaptor<ListObjectsRequest> listObjectsRequestCaptor = ArgumentCaptor.forClass(ListObjectsRequest.class);
52
+
53
+ verify(ol).getNextMarker();
54
+ verify(s3Client).listObjects(listObjectsRequestCaptor.capture());
55
+ final ListObjectsRequest listObjectsRequest = listObjectsRequestCaptor.getValue();
56
+ assertEquals(BUCKET_NAME, listObjectsRequest.getBucketName());
57
+ assertEquals(PATH_PREFIX, listObjectsRequest.getPrefix());
58
+ assertEquals(LAST_PATH, listObjectsRequest.getMarker());
59
+ }
60
+
61
+ @Test
62
+ public void hasNext_should_return_false_if_no_lastpath() throws NoSuchFieldException
63
+ {
64
+ new FieldSetter(s3NameOrderPrefixFileExplorer, s3NameOrderPrefixFileExplorer.getClass().getDeclaredField("lastPath")).set(null);
65
+ assertFalse(s3NameOrderPrefixFileExplorer.hasNext());
66
+ }
67
+ }
@@ -0,0 +1,128 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
5
+ import com.amazonaws.services.s3.model.StorageClass;
6
+ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
7
+ import org.embulk.EmbulkTestRuntime;
8
+ import org.embulk.config.ConfigException;
9
+ import org.embulk.input.s3.FileList;
10
+ import org.embulk.spi.util.RetryExecutor;
11
+ import org.junit.Before;
12
+ import org.junit.Rule;
13
+ import org.junit.Test;
14
+ import org.junit.runner.RunWith;
15
+ import org.mockito.Mock;
16
+ import org.mockito.runners.MockitoJUnitRunner;
17
+
18
+ import java.util.Collections;
19
+ import java.util.List;
20
+
21
+ import static org.mockito.Mockito.doReturn;
22
+ import static org.mockito.Mockito.never;
23
+ import static org.mockito.Mockito.spy;
24
+ import static org.mockito.Mockito.times;
25
+ import static org.mockito.Mockito.verify;
26
+ import static org.mockito.Mockito.when;
27
+
28
+ @RunWith(MockitoJUnitRunner.class)
29
+ public class TestS3PrefixFileExplorer
30
+ {
31
+ private static final String PATH_PREFIX = "path_prefix";
32
+ private static final String BUCKET_NAME = "bucket_name";
33
+ private static final String OBJECT_KEY = "key";
34
+
35
+ @SuppressFBWarnings("URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
36
+ @Rule
37
+ public EmbulkTestRuntime embulkTestRuntime = new EmbulkTestRuntime();
38
+
39
+ @Mock
40
+ private AmazonS3 s3Client;
41
+
42
+ @Mock
43
+ private FileList.Builder builder;
44
+
45
+ @Mock
46
+ private S3ObjectSummary s3ObjectSummary;
47
+
48
+ private S3PrefixFileExplorer s3PrefixFileExplorer;
49
+
50
+ @Before
51
+ public void setUp()
52
+ {
53
+ s3PrefixFileExplorer = spyS3PrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, false);
54
+ doReturn(Collections.singletonList(s3ObjectSummary)).when(s3PrefixFileExplorer).fetch();
55
+ }
56
+
57
+ @Test(expected = ConfigException.class)
58
+ public void addToBuilder_should_throw_exception_if_notskipped_glacier_storage()
59
+ {
60
+ when(s3ObjectSummary.getStorageClass()).thenReturn(StorageClass.Glacier.toString());
61
+ s3PrefixFileExplorer.addToBuilder(builder);
62
+ }
63
+
64
+ @Test
65
+ public void addToBuilder_should_skip_glacier_storage_if_allowed()
66
+ {
67
+ when(s3ObjectSummary.getStorageClass()).thenReturn(StorageClass.Glacier.toString());
68
+ // override spied object for changing `skipGlacierObjects`
69
+ s3PrefixFileExplorer = spyS3PrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, true);
70
+ doReturn(false).when(s3PrefixFileExplorer).hasNext();
71
+ doReturn(Collections.singletonList(s3ObjectSummary)).when(s3PrefixFileExplorer).fetch();
72
+ s3PrefixFileExplorer.addToBuilder(builder);
73
+
74
+ verify(s3PrefixFileExplorer).hasNext();
75
+ verify(s3ObjectSummary, never()).getSize();
76
+ }
77
+
78
+ @Test
79
+ public void addToBuilder_should_loop_till_nothing_left()
80
+ {
81
+ // There are 3 loops totally but only 2 keys have been imported because the first key is in Glacier storage class and is skipped
82
+ when(builder.needsMore()).thenReturn(true);
83
+ // override spied object for changing `skipGlacierObjects`
84
+ s3PrefixFileExplorer = spyS3PrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, true);
85
+ when(s3ObjectSummary.getStorageClass())
86
+ .thenReturn(StorageClass.Glacier.toString())
87
+ .thenReturn(StorageClass.Standard.toString());
88
+ when(s3ObjectSummary.getSize()).thenReturn(1L);
89
+ when(s3ObjectSummary.getKey()).thenReturn(PATH_PREFIX + OBJECT_KEY);
90
+ doReturn(Collections.singletonList(s3ObjectSummary)).when(s3PrefixFileExplorer).fetch();
91
+ doReturn(true).doReturn(true).doReturn(false).when(s3PrefixFileExplorer).hasNext();
92
+
93
+ s3PrefixFileExplorer.addToBuilder(builder);
94
+ verify(builder, times(2)).add(PATH_PREFIX + OBJECT_KEY, 1);
95
+ }
96
+
97
+ @Test
98
+ public void addToBuilder_should_stop_import_if_too_many_files()
99
+ {
100
+ when(builder.needsMore()).thenReturn(false);
101
+ when(s3ObjectSummary.getStorageClass()).thenReturn(StorageClass.Standard.toString());
102
+ when(s3ObjectSummary.getKey()).thenReturn(PATH_PREFIX + OBJECT_KEY);
103
+ when(s3ObjectSummary.getSize()).thenReturn(1L);
104
+ doReturn(true).when(s3PrefixFileExplorer).hasNext();
105
+ s3PrefixFileExplorer.addToBuilder(builder);
106
+
107
+ verify(builder).add(PATH_PREFIX + OBJECT_KEY, 1);
108
+ verify(s3PrefixFileExplorer, never()).hasNext();
109
+ }
110
+
111
+ private S3PrefixFileExplorer spyS3PrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor, final String pathPrefix, final boolean skipGlacierObjects)
112
+ {
113
+ return spy(new S3PrefixFileExplorer(bucketName, s3Client, retryExecutor, pathPrefix, skipGlacierObjects)
114
+ {
115
+ @Override
116
+ protected List<S3ObjectSummary> fetch()
117
+ {
118
+ return null;
119
+ }
120
+
121
+ @Override
122
+ protected boolean hasNext()
123
+ {
124
+ return false;
125
+ }
126
+ });
127
+ }
128
+ }