embulk-input-s3 0.3.0 → 0.3.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (22)
  1. checksums.yaml +4 -4
  2. data/classpath/aws-java-sdk-sts-1.11.466.jar +0 -0
  3. data/classpath/embulk-input-s3-0.3.5.jar +0 -0
  4. data/classpath/embulk-util-aws-credentials-0.3.5.jar +0 -0
  5. data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +78 -117
  6. data/src/main/java/org/embulk/input/s3/DefaultRetryable.java +1 -1
  7. data/src/main/java/org/embulk/input/s3/RetrySupportPluginTask.java +1 -1
  8. data/src/main/java/org/embulk/input/s3/explorer/S3FileExplorer.java +21 -0
  9. data/src/main/java/org/embulk/input/s3/explorer/S3NameOrderPrefixFileExplorer.java +45 -0
  10. data/src/main/java/org/embulk/input/s3/explorer/S3PrefixFileExplorer.java +57 -0
  11. data/src/main/java/org/embulk/input/s3/explorer/S3SingleFileExplorer.java +35 -0
  12. data/src/main/java/org/embulk/input/s3/explorer/S3TimeOrderPrefixFileExplorer.java +70 -0
  13. data/src/main/java/org/embulk/input/s3/utils/DateUtils.java +28 -0
  14. data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +0 -53
  15. data/src/test/java/org/embulk/input/s3/explorer/TestS3NameOrderPrefixFileExplorer.java +67 -0
  16. data/src/test/java/org/embulk/input/s3/explorer/TestS3PrefixFileExplorer.java +128 -0
  17. data/src/test/java/org/embulk/input/s3/explorer/TestS3SingleFileExplorer.java +56 -0
  18. data/src/test/java/org/embulk/input/s3/explorer/TestS3TimeOrderPrefixFileExplorer.java +112 -0
  19. metadata +15 -5
  20. data/classpath/embulk-input-s3-0.3.0.jar +0 -0
  21. data/classpath/embulk-util-aws-credentials-0.3.0.jar +0 -0
  22. data/src/test/java/org/embulk/input/s3/TestAbstractS3FileInputPlugin.java +0 -164
@@ -0,0 +1,35 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
5
+ import com.amazonaws.services.s3.model.ObjectMetadata;
6
+ import org.embulk.input.s3.DefaultRetryable;
7
+ import org.embulk.input.s3.FileList;
8
+ import org.embulk.spi.util.RetryExecutor;
9
+
10
+ public class S3SingleFileExplorer extends S3FileExplorer
11
+ {
12
+ private final String path;
13
+
14
+ public S3SingleFileExplorer(final String bucket, final AmazonS3 client, final RetryExecutor retryExecutor, final String path)
15
+ {
16
+ super(bucket, client, retryExecutor);
17
+ this.path = path;
18
+ }
19
+
20
+ @Override
21
+ public void addToBuilder(final FileList.Builder builder)
22
+ {
23
+ final GetObjectMetadataRequest objectMetadataRequest = new GetObjectMetadataRequest(bucketName, path);
24
+
25
+ final ObjectMetadata objectMetadata = new DefaultRetryable<ObjectMetadata>("Looking up for a single object") {
26
+ @Override
27
+ public ObjectMetadata call()
28
+ {
29
+ return s3Client.getObjectMetadata(objectMetadataRequest);
30
+ }
31
+ }.executeWith(retryExecutor);
32
+
33
+ builder.add(path, objectMetadata.getContentLength());
34
+ }
35
+ }
@@ -0,0 +1,70 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
5
+ import com.amazonaws.services.s3.model.ObjectListing;
6
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
7
+ import org.apache.commons.lang3.StringUtils;
8
+ import org.embulk.input.s3.DefaultRetryable;
9
+ import org.embulk.spi.Exec;
10
+ import org.embulk.spi.util.RetryExecutor;
11
+ import org.slf4j.Logger;
12
+
13
+ import java.util.Date;
14
+ import java.util.List;
15
+ import java.util.Optional;
16
+ import java.util.stream.Collectors;
17
+
18
+ public class S3TimeOrderPrefixFileExplorer extends S3PrefixFileExplorer
19
+ {
20
+ private static final Logger LOGGER = Exec.getLogger(S3TimeOrderPrefixFileExplorer.class);
21
+
22
+ private final Optional<Date> from;
23
+ private final Date to;
24
+
25
+ private String lastPath;
26
+
27
+ private int numOfReq = 0;
28
+
29
+ public S3TimeOrderPrefixFileExplorer(final String bucket, final AmazonS3 client, final RetryExecutor retryExecutor,
30
+ final String pathPrefix, final boolean skipGlacierObjects, final Optional<Date> from, final Date to)
31
+ {
32
+ super(bucket, client, retryExecutor, pathPrefix, skipGlacierObjects);
33
+ this.from = from;
34
+ this.to = to;
35
+ }
36
+
37
+ @Override
38
+ public List<S3ObjectSummary> fetch()
39
+ {
40
+ ++numOfReq;
41
+
42
+ final ListObjectsRequest req = new ListObjectsRequest(bucketName, pathPrefix, lastPath, null, 1024);
43
+ final ObjectListing objectListing = new DefaultRetryable<ObjectListing>("Listing objects")
44
+ {
45
+ @Override
46
+ public ObjectListing call()
47
+ {
48
+ return s3Client.listObjects(req);
49
+ }
50
+ }.executeWith(retryExecutor);
51
+ lastPath = objectListing.getNextMarker();
52
+
53
+ return objectListing.getObjectSummaries()
54
+ .stream()
55
+ .filter(s3ObjectSummary -> s3ObjectSummary.getLastModified().before(to)
56
+ && (!from.isPresent() || s3ObjectSummary.getLastModified().equals(from.get()) || s3ObjectSummary.getLastModified().after(from.get())))
57
+ .collect(Collectors.toList());
58
+ }
59
+
60
+ @Override
61
+ public boolean hasNext()
62
+ {
63
+ if (lastPath == null) {
64
+ LOGGER.info("The total number of LIST requests is {}{}.", numOfReq,
65
+ numOfReq < 10 ? StringUtils.EMPTY : ". Clean up your s3 bucket to reduce the number of requests and improve the ingesting performance");
66
+ return false;
67
+ }
68
+ return true;
69
+ }
70
+ }
@@ -0,0 +1,28 @@
1
+ package org.embulk.input.s3.utils;
2
+
3
+ import com.google.common.base.Joiner;
4
+ import org.embulk.config.ConfigException;
5
+ import org.joda.time.format.DateTimeFormat;
6
+
7
+ import java.util.Date;
8
+ import java.util.List;
9
+
10
+ public class DateUtils
11
+ {
12
+ public static Date parse(final String value, final List<String> supportedFormats)
13
+ throws ConfigException
14
+ {
15
+ for (final String fmt : supportedFormats) {
16
+ try {
17
+ return DateTimeFormat.forPattern(fmt).parseDateTime(value).toDate();
18
+ } catch (final IllegalArgumentException e) {
19
+ // ignorable exception
20
+ }
21
+ }
22
+ throw new ConfigException("Unsupported DateTime value: '" + value + "', supported formats: [" + Joiner.on(",").join(supportedFormats) + "]");
23
+ }
24
+
25
+ private DateUtils()
26
+ {
27
+ }
28
+ }
@@ -1,16 +1,11 @@
1
1
  package org.embulk.input.s3;
2
2
 
3
3
  import com.amazonaws.services.s3.AmazonS3;
4
- import com.amazonaws.services.s3.model.ListObjectsRequest;
5
- import com.amazonaws.services.s3.model.ObjectListing;
6
4
  import com.amazonaws.services.s3.model.Region;
7
- import com.amazonaws.services.s3.model.S3ObjectSummary;
8
- import com.amazonaws.services.s3.model.StorageClass;
9
5
  import com.google.common.collect.ImmutableList;
10
6
  import com.google.common.collect.ImmutableMap;
11
7
  import org.embulk.EmbulkTestRuntime;
12
8
  import org.embulk.config.ConfigDiff;
13
- import org.embulk.config.ConfigException;
14
9
  import org.embulk.config.ConfigSource;
15
10
  import org.embulk.config.TaskReport;
16
11
  import org.embulk.config.TaskSource;
@@ -25,21 +20,15 @@ import org.junit.Before;
25
20
  import org.junit.BeforeClass;
26
21
  import org.junit.Rule;
27
22
  import org.junit.Test;
28
- import org.mockito.Mockito;
29
23
 
30
- import java.lang.reflect.Field;
31
24
  import java.util.ArrayList;
32
25
  import java.util.List;
33
- import java.util.Optional;
34
26
 
35
27
  import static org.embulk.input.s3.S3FileInputPlugin.S3PluginTask;
36
28
  import static org.junit.Assert.assertEquals;
37
29
  import static org.junit.Assert.assertFalse;
38
30
  import static org.junit.Assert.assertNull;
39
31
  import static org.junit.Assume.assumeNotNull;
40
- import static org.mockito.Matchers.any;
41
- import static org.mockito.Mockito.doReturn;
42
- import static org.mockito.Mockito.mock;
43
32
 
44
33
  public class TestS3FileInputPlugin
45
34
  {
@@ -97,7 +86,6 @@ public class TestS3FileInputPlugin
97
86
 
98
87
  @Test
99
88
  public void useLastPath()
100
- throws Exception
101
89
  {
102
90
  ConfigSource config = this.config.deepCopy().set("last_path", EMBULK_S3_TEST_PATH_PREFIX + "/sample_01.csv");
103
91
  ConfigDiff configDiff = runner.transaction(config, new Control(runner, output));
@@ -117,7 +105,6 @@ public class TestS3FileInputPlugin
117
105
 
118
106
  @Test
119
107
  public void emptyFilesWithLastPath()
120
- throws Exception
121
108
  {
122
109
  ConfigSource config = this.config.deepCopy()
123
110
  .set("path_prefix", "empty_files_prefix")
@@ -130,7 +117,6 @@ public class TestS3FileInputPlugin
130
117
 
131
118
  @Test
132
119
  public void useTotalFileCountLimit()
133
- throws Exception
134
120
  {
135
121
  ConfigSource config = this.config.deepCopy().set("total_file_count_limit", 0);
136
122
  ConfigDiff configDiff = runner.transaction(config, new Control(runner, output));
@@ -141,7 +127,6 @@ public class TestS3FileInputPlugin
141
127
 
142
128
  @Test
143
129
  public void usePathMatchPattern()
144
- throws Exception
145
130
  {
146
131
  { // match pattern
147
132
  ConfigSource config = this.config.deepCopy().set("path_match_pattern", "/sample_01");
@@ -227,44 +212,6 @@ public class TestS3FileInputPlugin
227
212
  assertEquals(s3Client.getRegion(), Region.US_Standard);
228
213
  }
229
214
 
230
- @Test(expected = ConfigException.class)
231
- public void useSkipGlacierObjects() throws Exception
232
- {
233
- AmazonS3 client;
234
- client = mock(AmazonS3.class);
235
- doReturn(s3objectList("in/aa/a", StorageClass.Glacier)).when(client).listObjects(any(ListObjectsRequest.class));
236
-
237
- AbstractS3FileInputPlugin plugin = Mockito.mock(AbstractS3FileInputPlugin.class, Mockito.CALLS_REAL_METHODS);
238
- plugin.listS3FilesByPrefix(newFileList(config, "sample_00", 100L), client, "test_bucket", "test_prefix", Optional.empty(), false);
239
- }
240
-
241
- private FileList.Builder newFileList(ConfigSource config, Object... nameAndSize)
242
- {
243
- FileList.Builder builder = new FileList.Builder(config);
244
- for (int i = 0; i < nameAndSize.length; i += 2) {
245
- builder.add((String) nameAndSize[i], (long) nameAndSize[i + 1]);
246
- }
247
- return builder;
248
- }
249
-
250
- private ObjectListing s3objectList(String key, StorageClass storageClass) throws Exception
251
- {
252
- ObjectListing list = new ObjectListing();
253
-
254
- S3ObjectSummary element = new S3ObjectSummary();
255
- element.setKey(key);
256
- element.setStorageClass(storageClass.toString());
257
-
258
- List<S3ObjectSummary> objectSummaries = new ArrayList<>();
259
- objectSummaries.add(element);
260
-
261
- Field field = list.getClass().getDeclaredField("objectSummaries");
262
- field.setAccessible(true);
263
- field.set(list, objectSummaries);
264
-
265
- return list;
266
- }
267
-
268
215
  static class Control
269
216
  implements InputPlugin.Control
270
217
  {
@@ -0,0 +1,67 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
5
+ import com.amazonaws.services.s3.model.ObjectListing;
6
+ import org.embulk.EmbulkTestRuntime;
7
+ import org.junit.Before;
8
+ import org.junit.Rule;
9
+ import org.junit.Test;
10
+ import org.junit.runner.RunWith;
11
+ import org.mockito.ArgumentCaptor;
12
+ import org.mockito.Mock;
13
+ import org.mockito.internal.util.reflection.FieldSetter;
14
+ import org.mockito.runners.MockitoJUnitRunner;
15
+
16
+ import static org.junit.Assert.assertEquals;
17
+ import static org.junit.Assert.assertFalse;
18
+ import static org.mockito.Matchers.any;
19
+ import static org.mockito.Mockito.mock;
20
+ import static org.mockito.Mockito.verify;
21
+ import static org.mockito.Mockito.when;
22
+
23
+ @RunWith(MockitoJUnitRunner.class)
24
+ public class TestS3NameOrderPrefixFileExplorer
25
+ {
26
+ private static final String BUCKET_NAME = "bucket_name";
27
+ private static final String PATH_PREFIX = "path_prefix";
28
+ private static final String LAST_PATH = "last_path";
29
+
30
+ @Rule
31
+ public EmbulkTestRuntime runtime = new EmbulkTestRuntime();
32
+
33
+ @Mock
34
+ private AmazonS3 s3Client;
35
+
36
+ private S3NameOrderPrefixFileExplorer s3NameOrderPrefixFileExplorer;
37
+
38
+ @Before
39
+ public void setUp()
40
+ {
41
+ s3NameOrderPrefixFileExplorer = new S3NameOrderPrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, false, LAST_PATH);
42
+ }
43
+
44
+ @Test
45
+ public void fetch_should_return_list_objects()
46
+ {
47
+ final ObjectListing ol = mock(ObjectListing.class);
48
+ when(s3Client.listObjects(any(ListObjectsRequest.class))).thenReturn(ol);
49
+
50
+ s3NameOrderPrefixFileExplorer.fetch();
51
+ final ArgumentCaptor<ListObjectsRequest> listObjectsRequestCaptor = ArgumentCaptor.forClass(ListObjectsRequest.class);
52
+
53
+ verify(ol).getNextMarker();
54
+ verify(s3Client).listObjects(listObjectsRequestCaptor.capture());
55
+ final ListObjectsRequest listObjectsRequest = listObjectsRequestCaptor.getValue();
56
+ assertEquals(BUCKET_NAME, listObjectsRequest.getBucketName());
57
+ assertEquals(PATH_PREFIX, listObjectsRequest.getPrefix());
58
+ assertEquals(LAST_PATH, listObjectsRequest.getMarker());
59
+ }
60
+
61
+ @Test
62
+ public void hasNext_should_return_false_if_no_lastpath() throws NoSuchFieldException
63
+ {
64
+ new FieldSetter(s3NameOrderPrefixFileExplorer, s3NameOrderPrefixFileExplorer.getClass().getDeclaredField("lastPath")).set(null);
65
+ assertFalse(s3NameOrderPrefixFileExplorer.hasNext());
66
+ }
67
+ }
@@ -0,0 +1,128 @@
1
+ package org.embulk.input.s3.explorer;
2
+
3
+ import com.amazonaws.services.s3.AmazonS3;
4
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
5
+ import com.amazonaws.services.s3.model.StorageClass;
6
+ import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
7
+ import org.embulk.EmbulkTestRuntime;
8
+ import org.embulk.config.ConfigException;
9
+ import org.embulk.input.s3.FileList;
10
+ import org.embulk.spi.util.RetryExecutor;
11
+ import org.junit.Before;
12
+ import org.junit.Rule;
13
+ import org.junit.Test;
14
+ import org.junit.runner.RunWith;
15
+ import org.mockito.Mock;
16
+ import org.mockito.runners.MockitoJUnitRunner;
17
+
18
+ import java.util.Collections;
19
+ import java.util.List;
20
+
21
+ import static org.mockito.Mockito.doReturn;
22
+ import static org.mockito.Mockito.never;
23
+ import static org.mockito.Mockito.spy;
24
+ import static org.mockito.Mockito.times;
25
+ import static org.mockito.Mockito.verify;
26
+ import static org.mockito.Mockito.when;
27
+
28
+ @RunWith(MockitoJUnitRunner.class)
29
+ public class TestS3PrefixFileExplorer
30
+ {
31
+ private static final String PATH_PREFIX = "path_prefix";
32
+ private static final String BUCKET_NAME = "bucket_name";
33
+ private static final String OBJECT_KEY = "key";
34
+
35
+ @SuppressFBWarnings("URF_UNREAD_PUBLIC_OR_PROTECTED_FIELD")
36
+ @Rule
37
+ public EmbulkTestRuntime embulkTestRuntime = new EmbulkTestRuntime();
38
+
39
+ @Mock
40
+ private AmazonS3 s3Client;
41
+
42
+ @Mock
43
+ private FileList.Builder builder;
44
+
45
+ @Mock
46
+ private S3ObjectSummary s3ObjectSummary;
47
+
48
+ private S3PrefixFileExplorer s3PrefixFileExplorer;
49
+
50
+ @Before
51
+ public void setUp()
52
+ {
53
+ s3PrefixFileExplorer = spyS3PrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, false);
54
+ doReturn(Collections.singletonList(s3ObjectSummary)).when(s3PrefixFileExplorer).fetch();
55
+ }
56
+
57
+ @Test(expected = ConfigException.class)
58
+ public void addToBuilder_should_throw_exception_if_notskipped_glacier_storage()
59
+ {
60
+ when(s3ObjectSummary.getStorageClass()).thenReturn(StorageClass.Glacier.toString());
61
+ s3PrefixFileExplorer.addToBuilder(builder);
62
+ }
63
+
64
+ @Test
65
+ public void addToBuilder_should_skip_glacier_storage_if_allowed()
66
+ {
67
+ when(s3ObjectSummary.getStorageClass()).thenReturn(StorageClass.Glacier.toString());
68
+ // override spied object for changing `skipGlacierObjects`
69
+ s3PrefixFileExplorer = spyS3PrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, true);
70
+ doReturn(false).when(s3PrefixFileExplorer).hasNext();
71
+ doReturn(Collections.singletonList(s3ObjectSummary)).when(s3PrefixFileExplorer).fetch();
72
+ s3PrefixFileExplorer.addToBuilder(builder);
73
+
74
+ verify(s3PrefixFileExplorer).hasNext();
75
+ verify(s3ObjectSummary, never()).getSize();
76
+ }
77
+
78
+ @Test
79
+ public void addToBuilder_should_loop_till_nothing_left()
80
+ {
81
+ // There are 3 loops totally but only 2 keys have been imported because the first key is in Glacier storage class and is skipped
82
+ when(builder.needsMore()).thenReturn(true);
83
+ // override spied object for changing `skipGlacierObjects`
84
+ s3PrefixFileExplorer = spyS3PrefixFileExplorer(BUCKET_NAME, s3Client, null, PATH_PREFIX, true);
85
+ when(s3ObjectSummary.getStorageClass())
86
+ .thenReturn(StorageClass.Glacier.toString())
87
+ .thenReturn(StorageClass.Standard.toString());
88
+ when(s3ObjectSummary.getSize()).thenReturn(1L);
89
+ when(s3ObjectSummary.getKey()).thenReturn(PATH_PREFIX + OBJECT_KEY);
90
+ doReturn(Collections.singletonList(s3ObjectSummary)).when(s3PrefixFileExplorer).fetch();
91
+ doReturn(true).doReturn(true).doReturn(false).when(s3PrefixFileExplorer).hasNext();
92
+
93
+ s3PrefixFileExplorer.addToBuilder(builder);
94
+ verify(builder, times(2)).add(PATH_PREFIX + OBJECT_KEY, 1);
95
+ }
96
+
97
+ @Test
98
+ public void addToBuilder_should_stop_import_if_too_many_files()
99
+ {
100
+ when(builder.needsMore()).thenReturn(false);
101
+ when(s3ObjectSummary.getStorageClass()).thenReturn(StorageClass.Standard.toString());
102
+ when(s3ObjectSummary.getKey()).thenReturn(PATH_PREFIX + OBJECT_KEY);
103
+ when(s3ObjectSummary.getSize()).thenReturn(1L);
104
+ doReturn(true).when(s3PrefixFileExplorer).hasNext();
105
+ s3PrefixFileExplorer.addToBuilder(builder);
106
+
107
+ verify(builder).add(PATH_PREFIX + OBJECT_KEY, 1);
108
+ verify(s3PrefixFileExplorer, never()).hasNext();
109
+ }
110
+
111
+ private S3PrefixFileExplorer spyS3PrefixFileExplorer(final String bucketName, final AmazonS3 s3Client, final RetryExecutor retryExecutor, final String pathPrefix, final boolean skipGlacierObjects)
112
+ {
113
+ return spy(new S3PrefixFileExplorer(bucketName, s3Client, retryExecutor, pathPrefix, skipGlacierObjects)
114
+ {
115
+ @Override
116
+ protected List<S3ObjectSummary> fetch()
117
+ {
118
+ return null;
119
+ }
120
+
121
+ @Override
122
+ protected boolean hasNext()
123
+ {
124
+ return false;
125
+ }
126
+ });
127
+ }
128
+ }