embulk-input-s3 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/classpath/{aws-java-sdk-core-1.9.22.jar → aws-java-sdk-core-1.10.33.jar} +0 -0
- data/classpath/aws-java-sdk-kms-1.10.33.jar +0 -0
- data/classpath/aws-java-sdk-s3-1.10.33.jar +0 -0
- data/classpath/embulk-input-s3-0.2.4.jar +0 -0
- data/classpath/{httpclient-4.3.4.jar → httpclient-4.3.6.jar} +0 -0
- data/classpath/{httpcore-4.3.2.jar → httpcore-4.3.3.jar} +0 -0
- data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +23 -55
- data/src/main/java/org/embulk/input/s3/AwsCredentials.java +179 -0
- data/src/main/java/org/embulk/input/s3/AwsCredentialsTask.java +39 -0
- data/src/main/java/org/embulk/input/s3/FileList.java +289 -0
- data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +21 -6
- metadata +11 -9
- data/classpath/aws-java-sdk-kms-1.9.22.jar +0 -0
- data/classpath/aws-java-sdk-s3-1.9.22.jar +0 -0
- data/classpath/embulk-input-s3-0.2.3.jar +0 -0
- data/classpath/joda-time-2.8.2.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: db7314fde9364e1d4ec9edbb67f90cfc5c93dbe7
|
4
|
+
data.tar.gz: b3133c6d3ea81cef907d8cd974b5471186e086b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cd7e8ce269c322e7a03262267ae23370a9cff12877b9f33b8bebf1769ebdf093896babe8193324a3f4aa5b37ccae6a67b380e94367ab65fcc381b3d0e9a848e
|
7
|
+
data.tar.gz: 0e934e0baf997bfb29b3e3ed337daa28133a38a513050931902dc5f3d2cd03f652f6b2cb31e7d8fecd5b7e3dfccab869ede056053892af9e3bab2dc8cbd1ae4b
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -3,6 +3,7 @@ package org.embulk.input.s3;
|
|
3
3
|
import java.util.List;
|
4
4
|
import java.util.ArrayList;
|
5
5
|
import java.util.Collections;
|
6
|
+
import java.util.Iterator;
|
6
7
|
import java.io.IOException;
|
7
8
|
import java.io.InterruptedIOException;
|
8
9
|
import java.io.InputStream;
|
@@ -14,8 +15,6 @@ import com.google.common.base.Throwables;
|
|
14
15
|
import org.slf4j.Logger;
|
15
16
|
import com.amazonaws.auth.AWSCredentials;
|
16
17
|
import com.amazonaws.auth.AWSCredentialsProvider;
|
17
|
-
import com.amazonaws.auth.BasicAWSCredentials;
|
18
|
-
import com.amazonaws.auth.AnonymousAWSCredentials;
|
19
18
|
import com.amazonaws.services.s3.AmazonS3Client;
|
20
19
|
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
21
20
|
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
@@ -48,7 +47,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
48
47
|
private final Logger log = Exec.getLogger(S3FileInputPlugin.class);
|
49
48
|
|
50
49
|
public interface PluginTask
|
51
|
-
extends Task
|
50
|
+
extends AwsCredentialsTask, FileList.Task, Task
|
52
51
|
{
|
53
52
|
@Config("bucket")
|
54
53
|
public String getBucket();
|
@@ -64,16 +63,10 @@ public abstract class AbstractS3FileInputPlugin
|
|
64
63
|
@ConfigDefault("null")
|
65
64
|
public Optional<String> getAccessKeyId();
|
66
65
|
|
67
|
-
@Config("secret_access_key")
|
68
|
-
@ConfigDefault("null")
|
69
|
-
public Optional<String> getSecretAccessKey();
|
70
|
-
|
71
66
|
// TODO timeout, ssl, etc
|
72
67
|
|
73
|
-
|
74
|
-
|
75
|
-
public List<String> getFiles();
|
76
|
-
public void setFiles(List<String> files);
|
68
|
+
public FileList getFiles();
|
69
|
+
public void setFiles(FileList files);
|
77
70
|
|
78
71
|
@ConfigInject
|
79
72
|
public BufferAllocator getBufferAllocator();
|
@@ -90,7 +83,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
90
83
|
task.setFiles(listFiles(task));
|
91
84
|
|
92
85
|
// number of processors is same with number of files
|
93
|
-
return resume(task.dump(), task.getFiles().size(), control);
|
86
|
+
return resume(task.dump(), task.getFiles().getTaskCount(), control);
|
94
87
|
}
|
95
88
|
|
96
89
|
@Override
|
@@ -109,16 +102,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
109
102
|
ConfigDiff configDiff = Exec.newConfigDiff();
|
110
103
|
|
111
104
|
// last_path
|
112
|
-
|
113
|
-
// keep the last value
|
114
|
-
if (task.getLastPath().isPresent()) {
|
115
|
-
configDiff.set("last_path", task.getLastPath().get());
|
116
|
-
}
|
117
|
-
} else {
|
118
|
-
List<String> files = new ArrayList<String>(task.getFiles());
|
119
|
-
Collections.sort(files);
|
120
|
-
configDiff.set("last_path", files.get(files.size() - 1));
|
121
|
-
}
|
105
|
+
configDiff.set("last_path", task.getFiles().getLastPath(task.getLastPath()));
|
122
106
|
|
123
107
|
return configDiff;
|
124
108
|
}
|
@@ -138,24 +122,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
138
122
|
|
139
123
|
protected AWSCredentialsProvider getCredentialsProvider(PluginTask task)
|
140
124
|
{
|
141
|
-
|
142
|
-
if (task.getAccessKeyId().isPresent()) {
|
143
|
-
cred = new BasicAWSCredentials(
|
144
|
-
task.getAccessKeyId().get(),
|
145
|
-
task.getSecretAccessKey().get());
|
146
|
-
} else {
|
147
|
-
cred = new AnonymousAWSCredentials();
|
148
|
-
}
|
149
|
-
return new AWSCredentialsProvider() {
|
150
|
-
public AWSCredentials getCredentials()
|
151
|
-
{
|
152
|
-
return cred;
|
153
|
-
}
|
154
|
-
|
155
|
-
public void refresh()
|
156
|
-
{
|
157
|
-
}
|
158
|
-
};
|
125
|
+
return AwsCredentials.getAWSCredentialsProvider(task);
|
159
126
|
}
|
160
127
|
|
161
128
|
protected ClientConfiguration getClientConfiguration(PluginTask task)
|
@@ -170,7 +137,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
170
137
|
return clientConfig;
|
171
138
|
}
|
172
139
|
|
173
|
-
private List<String> listFiles(PluginTask task)
|
140
|
+
private FileList listFiles(PluginTask task)
|
174
141
|
{
|
175
142
|
AmazonS3Client client = newS3Client(task);
|
176
143
|
String bucketName = task.getBucket();
|
@@ -179,7 +146,10 @@ public abstract class AbstractS3FileInputPlugin
|
|
179
146
|
log.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
180
147
|
}
|
181
148
|
|
182
|
-
|
149
|
+
FileList.Builder builder = new FileList.Builder(task);
|
150
|
+
listS3FilesByPrefix(builder, client, bucketName,
|
151
|
+
task.getPathPrefix(), task.getLastPath());
|
152
|
+
return builder.build();
|
183
153
|
}
|
184
154
|
|
185
155
|
/**
|
@@ -187,24 +157,24 @@ public abstract class AbstractS3FileInputPlugin
|
|
187
157
|
*
|
188
158
|
* The resulting list does not include the file that's size == 0.
|
189
159
|
*/
|
190
|
-
public static List<String> listS3FilesByPrefix(AmazonS3Client client, String bucketName,
|
160
|
+
public static void listS3FilesByPrefix(FileList.Builder builder,
|
161
|
+
AmazonS3Client client, String bucketName,
|
191
162
|
String prefix, Optional<String> lastPath)
|
192
163
|
{
|
193
|
-
ImmutableList.Builder<String> builder = ImmutableList.builder();
|
194
|
-
|
195
164
|
String lastKey = lastPath.orNull();
|
196
165
|
do {
|
197
166
|
ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, lastKey, null, 1024);
|
198
167
|
ObjectListing ol = client.listObjects(req);
|
199
|
-
for(S3ObjectSummary s : ol.getObjectSummaries()) {
|
168
|
+
for (S3ObjectSummary s : ol.getObjectSummaries()) {
|
200
169
|
if (s.getSize() > 0) {
|
201
|
-
builder.add(s.getKey());
|
170
|
+
builder.add(s.getKey(), s.getSize());
|
171
|
+
if (!builder.needsMore()) {
|
172
|
+
return;
|
173
|
+
}
|
202
174
|
}
|
203
175
|
}
|
204
176
|
lastKey = ol.getNextMarker();
|
205
177
|
} while(lastKey != null);
|
206
|
-
|
207
|
-
return builder.build();
|
208
178
|
}
|
209
179
|
|
210
180
|
@Override
|
@@ -308,24 +278,22 @@ public abstract class AbstractS3FileInputPlugin
|
|
308
278
|
{
|
309
279
|
private AmazonS3Client client;
|
310
280
|
private final String bucket;
|
311
|
-
private final String key;
|
312
|
-
private boolean opened = false;
|
281
|
+
private final Iterator<String> iterator;
|
313
282
|
|
314
283
|
public SingleFileProvider(PluginTask task, int taskIndex)
|
315
284
|
{
|
316
285
|
this.client = newS3Client(task);
|
317
286
|
this.bucket = task.getBucket();
|
318
|
-
this.key = task.getFiles().get(taskIndex);
|
287
|
+
this.iterator = task.getFiles().get(taskIndex).iterator();
|
319
288
|
}
|
320
289
|
|
321
290
|
@Override
|
322
291
|
public InputStream openNext() throws IOException
|
323
292
|
{
|
324
|
-
if (opened) {
|
293
|
+
if (!iterator.hasNext()) {
|
325
294
|
return null;
|
326
295
|
}
|
327
|
-
|
328
|
-
GetObjectRequest request = new GetObjectRequest(bucket, key);
|
296
|
+
GetObjectRequest request = new GetObjectRequest(bucket, iterator.next());
|
329
297
|
S3Object obj = client.getObject(request);
|
330
298
|
return new ResumableInputStream(obj.getObjectContent(), new S3InputStreamReopener(client, request, obj.getObjectMetadata().getContentLength()));
|
331
299
|
}
|
@@ -0,0 +1,179 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import com.google.common.base.Optional;
|
4
|
+
import com.amazonaws.auth.AWSCredentials;
|
5
|
+
import com.amazonaws.auth.AWSCredentialsProvider;
|
6
|
+
import com.amazonaws.auth.AWSSessionCredentials;
|
7
|
+
import com.amazonaws.auth.AWSSessionCredentialsProvider;
|
8
|
+
import com.amazonaws.auth.AnonymousAWSCredentials;
|
9
|
+
import com.amazonaws.auth.BasicAWSCredentials;
|
10
|
+
import com.amazonaws.auth.BasicSessionCredentials;
|
11
|
+
import com.amazonaws.auth.EnvironmentVariableCredentialsProvider;
|
12
|
+
import com.amazonaws.auth.InstanceProfileCredentialsProvider;
|
13
|
+
import com.amazonaws.auth.SystemPropertiesCredentialsProvider;
|
14
|
+
import com.amazonaws.auth.profile.ProfileCredentialsProvider;
|
15
|
+
import com.amazonaws.auth.profile.ProfilesConfigFile;
|
16
|
+
import org.embulk.config.ConfigException;
|
17
|
+
import org.embulk.spi.Exec;
|
18
|
+
import org.embulk.spi.unit.LocalFile;
|
19
|
+
import org.slf4j.Logger;
|
20
|
+
|
21
|
+
public abstract class AwsCredentials
|
22
|
+
{
|
23
|
+
private AwsCredentials() { }
|
24
|
+
|
25
|
+
public static AWSCredentialsProvider getAWSCredentialsProvider(AwsCredentialsTask task)
|
26
|
+
{
|
27
|
+
switch (task.getAuthMethod()) {
|
28
|
+
case "basic":
|
29
|
+
// for backward compatibility
|
30
|
+
if (!task.getAccessKeyId().isPresent() && !task.getAccessKeyId().isPresent()) {
|
31
|
+
final Logger log = Exec.getLogger(AwsCredentials.class);
|
32
|
+
log.warn("Both access_key_id and secret_access_key are not set. Assuming that 'auth_method: anonymous' option is set.");
|
33
|
+
log.warn("If you intentionally use anonymous authentication, please set 'auth_method: anonymous' option.");
|
34
|
+
log.warn("This behavior will be removed in a futurte release.");
|
35
|
+
reject(task.getSessionToken(), "session_token");
|
36
|
+
reject(task.getProfileFile(), "profile_file");
|
37
|
+
reject(task.getProfileName(), "profile_name");
|
38
|
+
return new AWSCredentialsProvider() {
|
39
|
+
public AWSCredentials getCredentials()
|
40
|
+
{
|
41
|
+
return new AnonymousAWSCredentials();
|
42
|
+
}
|
43
|
+
|
44
|
+
public void refresh() { }
|
45
|
+
};
|
46
|
+
}
|
47
|
+
else {
|
48
|
+
String accessKeyId = require(task.getAccessKeyId(), "'access_key_id', 'secret_access_key'");
|
49
|
+
String secretAccessKey = require(task.getSecretAccessKey(), "'secret_access_key'");
|
50
|
+
reject(task.getSessionToken(), "session_token");
|
51
|
+
reject(task.getProfileFile(), "profile_file");
|
52
|
+
reject(task.getProfileName(), "profile_name");
|
53
|
+
final BasicAWSCredentials creds = new BasicAWSCredentials(accessKeyId, secretAccessKey);
|
54
|
+
return new AWSCredentialsProvider() {
|
55
|
+
public AWSCredentials getCredentials()
|
56
|
+
{
|
57
|
+
return creds;
|
58
|
+
}
|
59
|
+
|
60
|
+
public void refresh() { }
|
61
|
+
};
|
62
|
+
}
|
63
|
+
|
64
|
+
case "env":
|
65
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
66
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
67
|
+
reject(task.getSessionToken(), "session_token");
|
68
|
+
reject(task.getProfileFile(), "profile_file");
|
69
|
+
reject(task.getProfileName(), "profile_name");
|
70
|
+
return overwriteBasicCredentials(task, new EnvironmentVariableCredentialsProvider().getCredentials());
|
71
|
+
|
72
|
+
case "instance":
|
73
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
74
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
75
|
+
reject(task.getSessionToken(), "session_token");
|
76
|
+
reject(task.getProfileFile(), "profile_file");
|
77
|
+
reject(task.getProfileName(), "profile_name");
|
78
|
+
return new InstanceProfileCredentialsProvider();
|
79
|
+
|
80
|
+
case "profile":
|
81
|
+
{
|
82
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
83
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
84
|
+
reject(task.getSessionToken(), "session_token");
|
85
|
+
|
86
|
+
String profileName = task.getProfileName().or("default");
|
87
|
+
ProfileCredentialsProvider provider;
|
88
|
+
if (task.getProfileFile().isPresent()) {
|
89
|
+
ProfilesConfigFile file = new ProfilesConfigFile(task.getProfileFile().get().getFile());
|
90
|
+
provider = new ProfileCredentialsProvider(file, profileName);
|
91
|
+
}
|
92
|
+
else {
|
93
|
+
provider = new ProfileCredentialsProvider(profileName);
|
94
|
+
}
|
95
|
+
task.setProfileName(Optional.<String>absent());
|
96
|
+
task.setProfileFile(Optional.<LocalFile>absent());
|
97
|
+
|
98
|
+
return overwriteBasicCredentials(task, provider.getCredentials());
|
99
|
+
}
|
100
|
+
|
101
|
+
case "properties":
|
102
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
103
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
104
|
+
reject(task.getSessionToken(), "session_token");
|
105
|
+
reject(task.getProfileFile(), "profile_file");
|
106
|
+
reject(task.getProfileName(), "profile_name");
|
107
|
+
return overwriteBasicCredentials(task, new SystemPropertiesCredentialsProvider().getCredentials());
|
108
|
+
|
109
|
+
case "anonymous":
|
110
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
111
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
112
|
+
reject(task.getSessionToken(), "session_token");
|
113
|
+
reject(task.getProfileFile(), "profile_file");
|
114
|
+
reject(task.getProfileName(), "profile_name");
|
115
|
+
return new AWSCredentialsProvider() {
|
116
|
+
public AWSCredentials getCredentials()
|
117
|
+
{
|
118
|
+
return new AnonymousAWSCredentials();
|
119
|
+
}
|
120
|
+
|
121
|
+
public void refresh() { }
|
122
|
+
};
|
123
|
+
|
124
|
+
case "session":
|
125
|
+
{
|
126
|
+
String accessKeyId = require(task.getAccessKeyId(), "'access_key_id', 'secret_access_key', 'session_token'");
|
127
|
+
String secretAccessKey = require(task.getSecretAccessKey(), "'secret_access_key', 'session_token'");
|
128
|
+
String sessionToken = require(task.getSessionToken(), "'session_token'");
|
129
|
+
reject(task.getProfileFile(), "profile_file");
|
130
|
+
reject(task.getProfileName(), "profile_name");
|
131
|
+
final AWSSessionCredentials creds = new BasicSessionCredentials(accessKeyId, secretAccessKey, sessionToken);
|
132
|
+
return new AWSSessionCredentialsProvider() {
|
133
|
+
public AWSSessionCredentials getCredentials()
|
134
|
+
{
|
135
|
+
return creds;
|
136
|
+
}
|
137
|
+
|
138
|
+
public void refresh() { }
|
139
|
+
};
|
140
|
+
}
|
141
|
+
|
142
|
+
default:
|
143
|
+
throw new ConfigException(String.format("Unknwon auth_method '%s'. Supported methods are basic, instance, profile, properties, anonymous, and session.",
|
144
|
+
task.getAuthMethod()));
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
148
|
+
private static AWSCredentialsProvider overwriteBasicCredentials(AwsCredentialsTask task, final AWSCredentials creds)
|
149
|
+
{
|
150
|
+
task.setAuthMethod("basic");
|
151
|
+
task.setAccessKeyId(Optional.of(creds.getAWSAccessKeyId()));
|
152
|
+
task.setSecretAccessKey(Optional.of(creds.getAWSSecretKey()));
|
153
|
+
return new AWSCredentialsProvider() {
|
154
|
+
public AWSCredentials getCredentials()
|
155
|
+
{
|
156
|
+
return creds;
|
157
|
+
}
|
158
|
+
|
159
|
+
public void refresh() { }
|
160
|
+
};
|
161
|
+
}
|
162
|
+
|
163
|
+
private static <T> T require(Optional<T> value, String message)
|
164
|
+
{
|
165
|
+
if (value.isPresent()) {
|
166
|
+
return value.get();
|
167
|
+
}
|
168
|
+
else {
|
169
|
+
throw new ConfigException("Required option is not set: " + message);
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
private static <T> void reject(Optional<T> value, String message)
|
174
|
+
{
|
175
|
+
if (value.isPresent()) {
|
176
|
+
throw new ConfigException("Invalid option is set: " + message);
|
177
|
+
}
|
178
|
+
}
|
179
|
+
}
|
@@ -0,0 +1,39 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import com.google.common.base.Optional;
|
4
|
+
import org.embulk.config.Config;
|
5
|
+
import org.embulk.config.ConfigDefault;
|
6
|
+
import org.embulk.spi.unit.LocalFile;
|
7
|
+
|
8
|
+
public interface AwsCredentialsTask
|
9
|
+
{
|
10
|
+
@Config("auth_method")
|
11
|
+
@ConfigDefault("\"basic\"")
|
12
|
+
String getAuthMethod();
|
13
|
+
void setAuthMethod(String method);
|
14
|
+
|
15
|
+
@Config("access_key_id")
|
16
|
+
@ConfigDefault("null")
|
17
|
+
Optional<String> getAccessKeyId();
|
18
|
+
void setAccessKeyId(Optional<String> value);
|
19
|
+
|
20
|
+
@Config("secret_access_key")
|
21
|
+
@ConfigDefault("null")
|
22
|
+
Optional<String> getSecretAccessKey();
|
23
|
+
void setSecretAccessKey(Optional<String> value);
|
24
|
+
|
25
|
+
@Config("session_token")
|
26
|
+
@ConfigDefault("null")
|
27
|
+
Optional<String> getSessionToken();
|
28
|
+
void setSessionToken(Optional<String> value);
|
29
|
+
|
30
|
+
@Config("profile_file")
|
31
|
+
@ConfigDefault("null")
|
32
|
+
Optional<LocalFile> getProfileFile();
|
33
|
+
void setProfileFile(Optional<LocalFile> value);
|
34
|
+
|
35
|
+
@Config("profile_name")
|
36
|
+
@ConfigDefault("null")
|
37
|
+
Optional<String> getProfileName();
|
38
|
+
void setProfileName(Optional<String> value);
|
39
|
+
}
|
@@ -0,0 +1,289 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.util.AbstractList;
|
5
|
+
import java.util.ArrayList;
|
6
|
+
import java.util.zip.GZIPInputStream;
|
7
|
+
import java.util.zip.GZIPOutputStream;
|
8
|
+
import java.io.InputStream;
|
9
|
+
import java.io.OutputStream;
|
10
|
+
import java.io.BufferedOutputStream;
|
11
|
+
import java.io.BufferedInputStream;
|
12
|
+
import java.io.ByteArrayInputStream;
|
13
|
+
import java.io.ByteArrayOutputStream;
|
14
|
+
import java.io.IOException;
|
15
|
+
import java.nio.ByteBuffer;
|
16
|
+
import java.nio.charset.StandardCharsets;
|
17
|
+
import org.embulk.config.Config;
|
18
|
+
import org.embulk.config.ConfigDefault;
|
19
|
+
import org.embulk.config.ConfigSource;
|
20
|
+
import com.google.common.base.Throwables;
|
21
|
+
import com.google.common.base.Optional;
|
22
|
+
import com.google.common.collect.ImmutableList;
|
23
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
24
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
25
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
26
|
+
|
27
|
+
// this class should be moved to embulk-core
|
28
|
+
public class FileList
|
29
|
+
{
|
30
|
+
public interface Task
|
31
|
+
{
|
32
|
+
@Config("total_file_count_limit")
|
33
|
+
@ConfigDefault("2147483647")
|
34
|
+
int getTotalFileCountLimit();
|
35
|
+
}
|
36
|
+
|
37
|
+
public static class Entry
|
38
|
+
{
|
39
|
+
private int index;
|
40
|
+
private long size;
|
41
|
+
|
42
|
+
@JsonCreator
|
43
|
+
public Entry(
|
44
|
+
@JsonProperty("index") int index,
|
45
|
+
@JsonProperty("size") long size)
|
46
|
+
{
|
47
|
+
this.index = index;
|
48
|
+
this.size = size;
|
49
|
+
}
|
50
|
+
|
51
|
+
@JsonProperty("index")
|
52
|
+
public int getIndex() { return index; }
|
53
|
+
|
54
|
+
@JsonProperty("size")
|
55
|
+
public long getSize() { return size; }
|
56
|
+
}
|
57
|
+
|
58
|
+
public static class Builder
|
59
|
+
{
|
60
|
+
private final ByteArrayOutputStream binary;
|
61
|
+
private final OutputStream stream;
|
62
|
+
private final List<Entry> entries = new ArrayList<>();
|
63
|
+
private String last = null;
|
64
|
+
|
65
|
+
private int limitCount = Integer.MAX_VALUE;
|
66
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
67
|
+
|
68
|
+
public Builder(Task task)
|
69
|
+
{
|
70
|
+
this();
|
71
|
+
this.limitCount = task.getTotalFileCountLimit();
|
72
|
+
}
|
73
|
+
|
74
|
+
public Builder(ConfigSource config)
|
75
|
+
{
|
76
|
+
this();
|
77
|
+
this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
|
78
|
+
}
|
79
|
+
|
80
|
+
public Builder()
|
81
|
+
{
|
82
|
+
binary = new ByteArrayOutputStream();
|
83
|
+
try {
|
84
|
+
stream = new BufferedOutputStream(new GZIPOutputStream(binary));
|
85
|
+
}
|
86
|
+
catch (IOException ex) {
|
87
|
+
throw Throwables.propagate(ex);
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
public Builder limitTotalFileCount(int limitCount)
|
92
|
+
{
|
93
|
+
this.limitCount = limitCount;
|
94
|
+
return this;
|
95
|
+
}
|
96
|
+
|
97
|
+
public int size()
|
98
|
+
{
|
99
|
+
return entries.size();
|
100
|
+
}
|
101
|
+
|
102
|
+
public boolean needsMore()
|
103
|
+
{
|
104
|
+
return size() < limitCount;
|
105
|
+
}
|
106
|
+
|
107
|
+
public synchronized boolean add(String path, long size)
|
108
|
+
{
|
109
|
+
// TODO throw IllegalStateException if stream is already closed
|
110
|
+
|
111
|
+
if (!needsMore()) {
|
112
|
+
return false;
|
113
|
+
}
|
114
|
+
|
115
|
+
// TODO in the future, support some other filtering parameters (file name suffix filter, regex filter, etc)
|
116
|
+
// and return false if filtered out.
|
117
|
+
|
118
|
+
int index = entries.size();
|
119
|
+
entries.add(new Entry(index, size));
|
120
|
+
|
121
|
+
byte[] data = path.getBytes(StandardCharsets.UTF_8);
|
122
|
+
castBuffer.putInt(0, data.length);
|
123
|
+
try {
|
124
|
+
stream.write(castBuffer.array());
|
125
|
+
stream.write(data);
|
126
|
+
}
|
127
|
+
catch (IOException ex) {
|
128
|
+
throw Throwables.propagate(ex);
|
129
|
+
}
|
130
|
+
|
131
|
+
last = path;
|
132
|
+
return true;
|
133
|
+
}
|
134
|
+
|
135
|
+
public FileList build()
|
136
|
+
{
|
137
|
+
try {
|
138
|
+
stream.close();
|
139
|
+
}
|
140
|
+
catch (IOException ex) {
|
141
|
+
throw Throwables.propagate(ex);
|
142
|
+
}
|
143
|
+
return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
|
144
|
+
}
|
145
|
+
|
146
|
+
private List<List<Entry>> getSplits(List<Entry> all)
|
147
|
+
{
|
148
|
+
// TODO combine multiple entries into one task using some configuration parameters
|
149
|
+
List<List<Entry>> tasks = new ArrayList<>();
|
150
|
+
for (Entry entry : all) {
|
151
|
+
tasks.add(ImmutableList.of(entry));
|
152
|
+
}
|
153
|
+
return tasks;
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
157
|
+
private final byte[] data;
|
158
|
+
private final List<List<Entry>> tasks;
|
159
|
+
private final Optional<String> last;
|
160
|
+
|
161
|
+
@JsonCreator
|
162
|
+
@Deprecated
|
163
|
+
public FileList(
|
164
|
+
@JsonProperty("data") byte[] data,
|
165
|
+
@JsonProperty("tasks") List<List<Entry>> tasks,
|
166
|
+
@JsonProperty("last") Optional<String> last)
|
167
|
+
{
|
168
|
+
this.data = data;
|
169
|
+
this.tasks = tasks;
|
170
|
+
this.last = last;
|
171
|
+
}
|
172
|
+
|
173
|
+
@JsonIgnore
|
174
|
+
public Optional<String> getLastPath(Optional<String> lastLastPath)
|
175
|
+
{
|
176
|
+
if (last.isPresent()) {
|
177
|
+
return last;
|
178
|
+
}
|
179
|
+
return lastLastPath;
|
180
|
+
}
|
181
|
+
|
182
|
+
@JsonIgnore
|
183
|
+
public int getTaskCount()
|
184
|
+
{
|
185
|
+
return tasks.size();
|
186
|
+
}
|
187
|
+
|
188
|
+
@JsonIgnore
|
189
|
+
public List<String> get(int i)
|
190
|
+
{
|
191
|
+
return new EntryList(data, tasks.get(i));
|
192
|
+
}
|
193
|
+
|
194
|
+
@JsonProperty("data")
|
195
|
+
@Deprecated
|
196
|
+
public byte[] getData()
|
197
|
+
{
|
198
|
+
return data;
|
199
|
+
}
|
200
|
+
|
201
|
+
@JsonProperty("tasks")
|
202
|
+
@Deprecated
|
203
|
+
public List<List<Entry>> getTasks()
|
204
|
+
{
|
205
|
+
return tasks;
|
206
|
+
}
|
207
|
+
|
208
|
+
@JsonProperty("last")
|
209
|
+
@Deprecated
|
210
|
+
public Optional<String> getLast()
|
211
|
+
{
|
212
|
+
return last;
|
213
|
+
}
|
214
|
+
|
215
|
+
private class EntryList
|
216
|
+
extends AbstractList<String>
|
217
|
+
{
|
218
|
+
private final byte[] data;
|
219
|
+
private final List<Entry> entries;
|
220
|
+
private InputStream stream;
|
221
|
+
private int current;
|
222
|
+
|
223
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
224
|
+
|
225
|
+
public EntryList(byte[] data, List<Entry> entries)
|
226
|
+
{
|
227
|
+
this.data = data;
|
228
|
+
this.entries = entries;
|
229
|
+
try {
|
230
|
+
this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
231
|
+
}
|
232
|
+
catch (IOException ex) {
|
233
|
+
throw Throwables.propagate(ex);
|
234
|
+
}
|
235
|
+
this.current = 0;
|
236
|
+
}
|
237
|
+
|
238
|
+
@Override
|
239
|
+
public synchronized String get(int i)
|
240
|
+
{
|
241
|
+
Entry e = entries.get(i);
|
242
|
+
if (e.getIndex() < current) {
|
243
|
+
// rewind to the head
|
244
|
+
try {
|
245
|
+
stream.close();
|
246
|
+
stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
247
|
+
}
|
248
|
+
catch (IOException ex) {
|
249
|
+
throw Throwables.propagate(ex);
|
250
|
+
}
|
251
|
+
current = 0;
|
252
|
+
}
|
253
|
+
|
254
|
+
while (current < e.getIndex()) {
|
255
|
+
readNext();
|
256
|
+
}
|
257
|
+
// now current == e.getIndex()
|
258
|
+
return readNextString();
|
259
|
+
}
|
260
|
+
|
261
|
+
@Override
|
262
|
+
public int size()
|
263
|
+
{
|
264
|
+
return entries.size();
|
265
|
+
}
|
266
|
+
|
267
|
+
private byte[] readNext()
|
268
|
+
{
|
269
|
+
try {
|
270
|
+
stream.read(castBuffer.array());
|
271
|
+
int n = castBuffer.getInt(0);
|
272
|
+
byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
|
273
|
+
stream.read(b);
|
274
|
+
|
275
|
+
current++;
|
276
|
+
|
277
|
+
return b;
|
278
|
+
}
|
279
|
+
catch (IOException ex) {
|
280
|
+
throw Throwables.propagate(ex);
|
281
|
+
}
|
282
|
+
}
|
283
|
+
|
284
|
+
private String readNextString()
|
285
|
+
{
|
286
|
+
return new String(readNext(), StandardCharsets.UTF_8);
|
287
|
+
}
|
288
|
+
}
|
289
|
+
}
|
@@ -72,7 +72,9 @@ public class TestS3FileInputPlugin
|
|
72
72
|
doReturn("in/file/").doReturn(null).when(ol).getNextMarker();
|
73
73
|
|
74
74
|
// It counts only size != 0 files.
|
75
|
-
|
75
|
+
FileList.Builder builder = new FileList.Builder();
|
76
|
+
S3FileInputPlugin.listS3FilesByPrefix(builder, client, "bucketName", "prefix", Optional.<String>absent());
|
77
|
+
assertEquals(1, builder.size());
|
76
78
|
}
|
77
79
|
|
78
80
|
@Test
|
@@ -90,7 +92,7 @@ public class TestS3FileInputPlugin
|
|
90
92
|
public List<TaskReport> run(TaskSource taskSource, int taskCount)
|
91
93
|
{
|
92
94
|
assertEquals(3, taskCount);
|
93
|
-
List<String> files = taskSource.loadTask(S3PluginTask.class).getFiles();
|
95
|
+
List<String> files = fileListToList(taskSource.loadTask(S3PluginTask.class).getFiles());
|
94
96
|
assertArrayEquals(new String[]{"in/aa/a", "in/aa/b", "in/aa/c"}, files.toArray(new String[files.size()]));
|
95
97
|
return emptyTaskReports(taskCount);
|
96
98
|
}
|
@@ -108,12 +110,12 @@ public class TestS3FileInputPlugin
|
|
108
110
|
public List<TaskReport> run(TaskSource taskSource, int taskCount)
|
109
111
|
{
|
110
112
|
assertEquals(0, taskCount);
|
111
|
-
assertTrue(taskSource.loadTask(S3PluginTask.class).getFiles().isEmpty());
|
113
|
+
assertTrue(fileListToList(taskSource.loadTask(S3PluginTask.class).getFiles()).isEmpty());
|
112
114
|
return emptyTaskReports(taskCount);
|
113
115
|
}
|
114
116
|
});
|
115
117
|
|
116
|
-
|
118
|
+
assertEquals(null, configDiff.get(String.class, "last_path", null));
|
117
119
|
}
|
118
120
|
|
119
121
|
{ // if files are empty, keep the previous last_path.
|
@@ -126,7 +128,7 @@ public class TestS3FileInputPlugin
|
|
126
128
|
@Override
|
127
129
|
public List<TaskReport> run(TaskSource taskSource, int taskCount) {
|
128
130
|
assertEquals(0, taskCount);
|
129
|
-
assertTrue(taskSource.loadTask(S3PluginTask.class).getFiles().isEmpty());
|
131
|
+
assertTrue(fileListToList(taskSource.loadTask(S3PluginTask.class).getFiles()).isEmpty());
|
130
132
|
return emptyTaskReports(taskCount);
|
131
133
|
}
|
132
134
|
});
|
@@ -143,7 +145,9 @@ public class TestS3FileInputPlugin
|
|
143
145
|
doReturn(client).when(plugin).newS3Client(any(PluginTask.class));
|
144
146
|
|
145
147
|
PluginTask task = config.loadConfig(plugin.getTaskClass());
|
146
|
-
|
148
|
+
FileList.Builder builder = new FileList.Builder();
|
149
|
+
builder.add("in/aa/a", 100);
|
150
|
+
task.setFiles(builder.build());
|
147
151
|
|
148
152
|
StringBuilder sbuf = new StringBuilder();
|
149
153
|
try (S3FileInput input = (S3FileInput) plugin.open(task.dump(), 0)) {
|
@@ -218,4 +222,15 @@ public class TestS3FileInputPlugin
|
|
218
222
|
}
|
219
223
|
return reports.build();
|
220
224
|
}
|
225
|
+
|
226
|
+
private static List<String> fileListToList(FileList list)
|
227
|
+
{
|
228
|
+
ImmutableList.Builder<String> builder = ImmutableList.builder();
|
229
|
+
for (int i=0; i < list.getTaskCount(); i++) {
|
230
|
+
for (String path : list.get(i)) {
|
231
|
+
builder.add(path);
|
232
|
+
}
|
233
|
+
}
|
234
|
+
return builder.build();
|
235
|
+
}
|
221
236
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-s3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -48,18 +48,20 @@ files:
|
|
48
48
|
- build.gradle
|
49
49
|
- lib/embulk/input/s3.rb
|
50
50
|
- src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java
|
51
|
+
- src/main/java/org/embulk/input/s3/AwsCredentials.java
|
52
|
+
- src/main/java/org/embulk/input/s3/AwsCredentialsTask.java
|
53
|
+
- src/main/java/org/embulk/input/s3/FileList.java
|
51
54
|
- src/main/java/org/embulk/input/s3/S3FileInputPlugin.java
|
52
55
|
- src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java
|
53
56
|
- src/test/java/org/embulk/input/s3/TestS3InputStreamReopener.java
|
54
|
-
- classpath/aws-java-sdk-core-1.9.22.jar
|
55
|
-
- classpath/aws-java-sdk-kms-1.9.22.jar
|
56
|
-
- classpath/aws-java-sdk-s3-1.9.22.jar
|
57
|
+
- classpath/aws-java-sdk-core-1.10.33.jar
|
58
|
+
- classpath/aws-java-sdk-kms-1.10.33.jar
|
59
|
+
- classpath/aws-java-sdk-s3-1.10.33.jar
|
57
60
|
- classpath/commons-codec-1.6.jar
|
58
|
-
- classpath/embulk-input-s3-0.2.3.jar
|
59
|
-
- classpath/httpclient-4.3.4.jar
|
60
|
-
- classpath/httpcore-4.3.2.jar
|
61
|
+
- classpath/embulk-input-s3-0.2.4.jar
|
62
|
+
- classpath/httpclient-4.3.6.jar
|
63
|
+
- classpath/httpcore-4.3.3.jar
|
61
64
|
- classpath/jcl-over-slf4j-1.7.12.jar
|
62
|
-
- classpath/joda-time-2.8.2.jar
|
63
65
|
homepage: https://github.com/embulk/embulk-input-s3
|
64
66
|
licenses:
|
65
67
|
- Apache 2.0
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|