embulk-input-s3 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/classpath/{aws-java-sdk-core-1.9.22.jar → aws-java-sdk-core-1.10.33.jar} +0 -0
- data/classpath/aws-java-sdk-kms-1.10.33.jar +0 -0
- data/classpath/aws-java-sdk-s3-1.10.33.jar +0 -0
- data/classpath/embulk-input-s3-0.2.4.jar +0 -0
- data/classpath/{httpclient-4.3.4.jar → httpclient-4.3.6.jar} +0 -0
- data/classpath/{httpcore-4.3.2.jar → httpcore-4.3.3.jar} +0 -0
- data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +23 -55
- data/src/main/java/org/embulk/input/s3/AwsCredentials.java +179 -0
- data/src/main/java/org/embulk/input/s3/AwsCredentialsTask.java +39 -0
- data/src/main/java/org/embulk/input/s3/FileList.java +289 -0
- data/src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java +21 -6
- metadata +11 -9
- data/classpath/aws-java-sdk-kms-1.9.22.jar +0 -0
- data/classpath/aws-java-sdk-s3-1.9.22.jar +0 -0
- data/classpath/embulk-input-s3-0.2.3.jar +0 -0
- data/classpath/joda-time-2.8.2.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: db7314fde9364e1d4ec9edbb67f90cfc5c93dbe7
|
4
|
+
data.tar.gz: b3133c6d3ea81cef907d8cd974b5471186e086b6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0cd7e8ce269c322e7a03262267ae23370a9cff12877b9f33b8bebf1769ebdf093896babe8193324a3f4aa5b37ccae6a67b380e94367ab65fcc381b3d0e9a848e
|
7
|
+
data.tar.gz: 0e934e0baf997bfb29b3e3ed337daa28133a38a513050931902dc5f3d2cd03f652f6b2cb31e7d8fecd5b7e3dfccab869ede056053892af9e3bab2dc8cbd1ae4b
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -3,6 +3,7 @@ package org.embulk.input.s3;
|
|
3
3
|
import java.util.List;
|
4
4
|
import java.util.ArrayList;
|
5
5
|
import java.util.Collections;
|
6
|
+
import java.util.Iterator;
|
6
7
|
import java.io.IOException;
|
7
8
|
import java.io.InterruptedIOException;
|
8
9
|
import java.io.InputStream;
|
@@ -14,8 +15,6 @@ import com.google.common.base.Throwables;
|
|
14
15
|
import org.slf4j.Logger;
|
15
16
|
import com.amazonaws.auth.AWSCredentials;
|
16
17
|
import com.amazonaws.auth.AWSCredentialsProvider;
|
17
|
-
import com.amazonaws.auth.BasicAWSCredentials;
|
18
|
-
import com.amazonaws.auth.AnonymousAWSCredentials;
|
19
18
|
import com.amazonaws.services.s3.AmazonS3Client;
|
20
19
|
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
21
20
|
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
@@ -48,7 +47,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
48
47
|
private final Logger log = Exec.getLogger(S3FileInputPlugin.class);
|
49
48
|
|
50
49
|
public interface PluginTask
|
51
|
-
extends Task
|
50
|
+
extends AwsCredentialsTask, FileList.Task, Task
|
52
51
|
{
|
53
52
|
@Config("bucket")
|
54
53
|
public String getBucket();
|
@@ -64,16 +63,10 @@ public abstract class AbstractS3FileInputPlugin
|
|
64
63
|
@ConfigDefault("null")
|
65
64
|
public Optional<String> getAccessKeyId();
|
66
65
|
|
67
|
-
@Config("secret_access_key")
|
68
|
-
@ConfigDefault("null")
|
69
|
-
public Optional<String> getSecretAccessKey();
|
70
|
-
|
71
66
|
// TODO timeout, ssl, etc
|
72
67
|
|
73
|
-
|
74
|
-
|
75
|
-
public List<String> getFiles();
|
76
|
-
public void setFiles(List<String> files);
|
68
|
+
public FileList getFiles();
|
69
|
+
public void setFiles(FileList files);
|
77
70
|
|
78
71
|
@ConfigInject
|
79
72
|
public BufferAllocator getBufferAllocator();
|
@@ -90,7 +83,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
90
83
|
task.setFiles(listFiles(task));
|
91
84
|
|
92
85
|
// number of processors is same with number of files
|
93
|
-
return resume(task.dump(), task.getFiles().
|
86
|
+
return resume(task.dump(), task.getFiles().getTaskCount(), control);
|
94
87
|
}
|
95
88
|
|
96
89
|
@Override
|
@@ -109,16 +102,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
109
102
|
ConfigDiff configDiff = Exec.newConfigDiff();
|
110
103
|
|
111
104
|
// last_path
|
112
|
-
|
113
|
-
// keep the last value
|
114
|
-
if (task.getLastPath().isPresent()) {
|
115
|
-
configDiff.set("last_path", task.getLastPath().get());
|
116
|
-
}
|
117
|
-
} else {
|
118
|
-
List<String> files = new ArrayList<String>(task.getFiles());
|
119
|
-
Collections.sort(files);
|
120
|
-
configDiff.set("last_path", files.get(files.size() - 1));
|
121
|
-
}
|
105
|
+
configDiff.set("last_path", task.getFiles().getLastPath(task.getLastPath()));
|
122
106
|
|
123
107
|
return configDiff;
|
124
108
|
}
|
@@ -138,24 +122,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
138
122
|
|
139
123
|
protected AWSCredentialsProvider getCredentialsProvider(PluginTask task)
|
140
124
|
{
|
141
|
-
|
142
|
-
if (task.getAccessKeyId().isPresent()) {
|
143
|
-
cred = new BasicAWSCredentials(
|
144
|
-
task.getAccessKeyId().get(),
|
145
|
-
task.getSecretAccessKey().get());
|
146
|
-
} else {
|
147
|
-
cred = new AnonymousAWSCredentials();
|
148
|
-
}
|
149
|
-
return new AWSCredentialsProvider() {
|
150
|
-
public AWSCredentials getCredentials()
|
151
|
-
{
|
152
|
-
return cred;
|
153
|
-
}
|
154
|
-
|
155
|
-
public void refresh()
|
156
|
-
{
|
157
|
-
}
|
158
|
-
};
|
125
|
+
return AwsCredentials.getAWSCredentialsProvider(task);
|
159
126
|
}
|
160
127
|
|
161
128
|
protected ClientConfiguration getClientConfiguration(PluginTask task)
|
@@ -170,7 +137,7 @@ public abstract class AbstractS3FileInputPlugin
|
|
170
137
|
return clientConfig;
|
171
138
|
}
|
172
139
|
|
173
|
-
private
|
140
|
+
private FileList listFiles(PluginTask task)
|
174
141
|
{
|
175
142
|
AmazonS3Client client = newS3Client(task);
|
176
143
|
String bucketName = task.getBucket();
|
@@ -179,7 +146,10 @@ public abstract class AbstractS3FileInputPlugin
|
|
179
146
|
log.info("Listing files with prefix \"/\". This doesn't mean all files in a bucket. If you intend to read all files, use \"path_prefix: ''\" (empty string) instead.");
|
180
147
|
}
|
181
148
|
|
182
|
-
|
149
|
+
FileList.Builder builder = new FileList.Builder(task);
|
150
|
+
listS3FilesByPrefix(builder, client, bucketName,
|
151
|
+
task.getPathPrefix(), task.getLastPath());
|
152
|
+
return builder.build();
|
183
153
|
}
|
184
154
|
|
185
155
|
/**
|
@@ -187,24 +157,24 @@ public abstract class AbstractS3FileInputPlugin
|
|
187
157
|
*
|
188
158
|
* The resulting list does not include the file that's size == 0.
|
189
159
|
*/
|
190
|
-
public static
|
160
|
+
public static void listS3FilesByPrefix(FileList.Builder builder,
|
161
|
+
AmazonS3Client client, String bucketName,
|
191
162
|
String prefix, Optional<String> lastPath)
|
192
163
|
{
|
193
|
-
ImmutableList.Builder<String> builder = ImmutableList.builder();
|
194
|
-
|
195
164
|
String lastKey = lastPath.orNull();
|
196
165
|
do {
|
197
166
|
ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, lastKey, null, 1024);
|
198
167
|
ObjectListing ol = client.listObjects(req);
|
199
|
-
for(S3ObjectSummary s : ol.getObjectSummaries()) {
|
168
|
+
for (S3ObjectSummary s : ol.getObjectSummaries()) {
|
200
169
|
if (s.getSize() > 0) {
|
201
|
-
builder.add(s.getKey());
|
170
|
+
builder.add(s.getKey(), s.getSize());
|
171
|
+
if (!builder.needsMore()) {
|
172
|
+
return;
|
173
|
+
}
|
202
174
|
}
|
203
175
|
}
|
204
176
|
lastKey = ol.getNextMarker();
|
205
177
|
} while(lastKey != null);
|
206
|
-
|
207
|
-
return builder.build();
|
208
178
|
}
|
209
179
|
|
210
180
|
@Override
|
@@ -308,24 +278,22 @@ public abstract class AbstractS3FileInputPlugin
|
|
308
278
|
{
|
309
279
|
private AmazonS3Client client;
|
310
280
|
private final String bucket;
|
311
|
-
private final String
|
312
|
-
private boolean opened = false;
|
281
|
+
private final Iterator<String> iterator;
|
313
282
|
|
314
283
|
public SingleFileProvider(PluginTask task, int taskIndex)
|
315
284
|
{
|
316
285
|
this.client = newS3Client(task);
|
317
286
|
this.bucket = task.getBucket();
|
318
|
-
this.
|
287
|
+
this.iterator = task.getFiles().get(taskIndex).iterator();
|
319
288
|
}
|
320
289
|
|
321
290
|
@Override
|
322
291
|
public InputStream openNext() throws IOException
|
323
292
|
{
|
324
|
-
if (
|
293
|
+
if (!iterator.hasNext()) {
|
325
294
|
return null;
|
326
295
|
}
|
327
|
-
|
328
|
-
GetObjectRequest request = new GetObjectRequest(bucket, key);
|
296
|
+
GetObjectRequest request = new GetObjectRequest(bucket, iterator.next());
|
329
297
|
S3Object obj = client.getObject(request);
|
330
298
|
return new ResumableInputStream(obj.getObjectContent(), new S3InputStreamReopener(client, request, obj.getObjectMetadata().getContentLength()));
|
331
299
|
}
|
@@ -0,0 +1,179 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import com.google.common.base.Optional;
|
4
|
+
import com.amazonaws.auth.AWSCredentials;
|
5
|
+
import com.amazonaws.auth.AWSCredentialsProvider;
|
6
|
+
import com.amazonaws.auth.AWSSessionCredentials;
|
7
|
+
import com.amazonaws.auth.AWSSessionCredentialsProvider;
|
8
|
+
import com.amazonaws.auth.AnonymousAWSCredentials;
|
9
|
+
import com.amazonaws.auth.BasicAWSCredentials;
|
10
|
+
import com.amazonaws.auth.BasicSessionCredentials;
|
11
|
+
import com.amazonaws.auth.EnvironmentVariableCredentialsProvider;
|
12
|
+
import com.amazonaws.auth.InstanceProfileCredentialsProvider;
|
13
|
+
import com.amazonaws.auth.SystemPropertiesCredentialsProvider;
|
14
|
+
import com.amazonaws.auth.profile.ProfileCredentialsProvider;
|
15
|
+
import com.amazonaws.auth.profile.ProfilesConfigFile;
|
16
|
+
import org.embulk.config.ConfigException;
|
17
|
+
import org.embulk.spi.Exec;
|
18
|
+
import org.embulk.spi.unit.LocalFile;
|
19
|
+
import org.slf4j.Logger;
|
20
|
+
|
21
|
+
public abstract class AwsCredentials
|
22
|
+
{
|
23
|
+
private AwsCredentials() { }
|
24
|
+
|
25
|
+
public static AWSCredentialsProvider getAWSCredentialsProvider(AwsCredentialsTask task)
|
26
|
+
{
|
27
|
+
switch (task.getAuthMethod()) {
|
28
|
+
case "basic":
|
29
|
+
// for backward compatibility
|
30
|
+
if (!task.getAccessKeyId().isPresent() && !task.getAccessKeyId().isPresent()) {
|
31
|
+
final Logger log = Exec.getLogger(AwsCredentials.class);
|
32
|
+
log.warn("Both access_key_id and secret_access_key are not set. Assuming that 'auth_method: anonymous' option is set.");
|
33
|
+
log.warn("If you intentionally use anonymous authentication, please set 'auth_method: anonymous' option.");
|
34
|
+
log.warn("This behavior will be removed in a futurte release.");
|
35
|
+
reject(task.getSessionToken(), "session_token");
|
36
|
+
reject(task.getProfileFile(), "profile_file");
|
37
|
+
reject(task.getProfileName(), "profile_name");
|
38
|
+
return new AWSCredentialsProvider() {
|
39
|
+
public AWSCredentials getCredentials()
|
40
|
+
{
|
41
|
+
return new AnonymousAWSCredentials();
|
42
|
+
}
|
43
|
+
|
44
|
+
public void refresh() { }
|
45
|
+
};
|
46
|
+
}
|
47
|
+
else {
|
48
|
+
String accessKeyId = require(task.getAccessKeyId(), "'access_key_id', 'secret_access_key'");
|
49
|
+
String secretAccessKey = require(task.getSecretAccessKey(), "'secret_access_key'");
|
50
|
+
reject(task.getSessionToken(), "session_token");
|
51
|
+
reject(task.getProfileFile(), "profile_file");
|
52
|
+
reject(task.getProfileName(), "profile_name");
|
53
|
+
final BasicAWSCredentials creds = new BasicAWSCredentials(accessKeyId, secretAccessKey);
|
54
|
+
return new AWSCredentialsProvider() {
|
55
|
+
public AWSCredentials getCredentials()
|
56
|
+
{
|
57
|
+
return creds;
|
58
|
+
}
|
59
|
+
|
60
|
+
public void refresh() { }
|
61
|
+
};
|
62
|
+
}
|
63
|
+
|
64
|
+
case "env":
|
65
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
66
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
67
|
+
reject(task.getSessionToken(), "session_token");
|
68
|
+
reject(task.getProfileFile(), "profile_file");
|
69
|
+
reject(task.getProfileName(), "profile_name");
|
70
|
+
return overwriteBasicCredentials(task, new EnvironmentVariableCredentialsProvider().getCredentials());
|
71
|
+
|
72
|
+
case "instance":
|
73
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
74
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
75
|
+
reject(task.getSessionToken(), "session_token");
|
76
|
+
reject(task.getProfileFile(), "profile_file");
|
77
|
+
reject(task.getProfileName(), "profile_name");
|
78
|
+
return new InstanceProfileCredentialsProvider();
|
79
|
+
|
80
|
+
case "profile":
|
81
|
+
{
|
82
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
83
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
84
|
+
reject(task.getSessionToken(), "session_token");
|
85
|
+
|
86
|
+
String profileName = task.getProfileName().or("default");
|
87
|
+
ProfileCredentialsProvider provider;
|
88
|
+
if (task.getProfileFile().isPresent()) {
|
89
|
+
ProfilesConfigFile file = new ProfilesConfigFile(task.getProfileFile().get().getFile());
|
90
|
+
provider = new ProfileCredentialsProvider(file, profileName);
|
91
|
+
}
|
92
|
+
else {
|
93
|
+
provider = new ProfileCredentialsProvider(profileName);
|
94
|
+
}
|
95
|
+
task.setProfileName(Optional.<String>absent());
|
96
|
+
task.setProfileFile(Optional.<LocalFile>absent());
|
97
|
+
|
98
|
+
return overwriteBasicCredentials(task, provider.getCredentials());
|
99
|
+
}
|
100
|
+
|
101
|
+
case "properties":
|
102
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
103
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
104
|
+
reject(task.getSessionToken(), "session_token");
|
105
|
+
reject(task.getProfileFile(), "profile_file");
|
106
|
+
reject(task.getProfileName(), "profile_name");
|
107
|
+
return overwriteBasicCredentials(task, new SystemPropertiesCredentialsProvider().getCredentials());
|
108
|
+
|
109
|
+
case "anonymous":
|
110
|
+
reject(task.getAccessKeyId(), "access_key_id");
|
111
|
+
reject(task.getSecretAccessKey(), "secret_access_key");
|
112
|
+
reject(task.getSessionToken(), "session_token");
|
113
|
+
reject(task.getProfileFile(), "profile_file");
|
114
|
+
reject(task.getProfileName(), "profile_name");
|
115
|
+
return new AWSCredentialsProvider() {
|
116
|
+
public AWSCredentials getCredentials()
|
117
|
+
{
|
118
|
+
return new AnonymousAWSCredentials();
|
119
|
+
}
|
120
|
+
|
121
|
+
public void refresh() { }
|
122
|
+
};
|
123
|
+
|
124
|
+
case "session":
|
125
|
+
{
|
126
|
+
String accessKeyId = require(task.getAccessKeyId(), "'access_key_id', 'secret_access_key', 'session_token'");
|
127
|
+
String secretAccessKey = require(task.getSecretAccessKey(), "'secret_access_key', 'session_token'");
|
128
|
+
String sessionToken = require(task.getSessionToken(), "'session_token'");
|
129
|
+
reject(task.getProfileFile(), "profile_file");
|
130
|
+
reject(task.getProfileName(), "profile_name");
|
131
|
+
final AWSSessionCredentials creds = new BasicSessionCredentials(accessKeyId, secretAccessKey, sessionToken);
|
132
|
+
return new AWSSessionCredentialsProvider() {
|
133
|
+
public AWSSessionCredentials getCredentials()
|
134
|
+
{
|
135
|
+
return creds;
|
136
|
+
}
|
137
|
+
|
138
|
+
public void refresh() { }
|
139
|
+
};
|
140
|
+
}
|
141
|
+
|
142
|
+
default:
|
143
|
+
throw new ConfigException(String.format("Unknwon auth_method '%s'. Supported methods are basic, instance, profile, properties, anonymous, and session.",
|
144
|
+
task.getAuthMethod()));
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
148
|
+
private static AWSCredentialsProvider overwriteBasicCredentials(AwsCredentialsTask task, final AWSCredentials creds)
|
149
|
+
{
|
150
|
+
task.setAuthMethod("basic");
|
151
|
+
task.setAccessKeyId(Optional.of(creds.getAWSAccessKeyId()));
|
152
|
+
task.setSecretAccessKey(Optional.of(creds.getAWSSecretKey()));
|
153
|
+
return new AWSCredentialsProvider() {
|
154
|
+
public AWSCredentials getCredentials()
|
155
|
+
{
|
156
|
+
return creds;
|
157
|
+
}
|
158
|
+
|
159
|
+
public void refresh() { }
|
160
|
+
};
|
161
|
+
}
|
162
|
+
|
163
|
+
private static <T> T require(Optional<T> value, String message)
|
164
|
+
{
|
165
|
+
if (value.isPresent()) {
|
166
|
+
return value.get();
|
167
|
+
}
|
168
|
+
else {
|
169
|
+
throw new ConfigException("Required option is not set: " + message);
|
170
|
+
}
|
171
|
+
}
|
172
|
+
|
173
|
+
private static <T> void reject(Optional<T> value, String message)
|
174
|
+
{
|
175
|
+
if (value.isPresent()) {
|
176
|
+
throw new ConfigException("Invalid option is set: " + message);
|
177
|
+
}
|
178
|
+
}
|
179
|
+
}
|
@@ -0,0 +1,39 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import com.google.common.base.Optional;
|
4
|
+
import org.embulk.config.Config;
|
5
|
+
import org.embulk.config.ConfigDefault;
|
6
|
+
import org.embulk.spi.unit.LocalFile;
|
7
|
+
|
8
|
+
public interface AwsCredentialsTask
|
9
|
+
{
|
10
|
+
@Config("auth_method")
|
11
|
+
@ConfigDefault("\"basic\"")
|
12
|
+
String getAuthMethod();
|
13
|
+
void setAuthMethod(String method);
|
14
|
+
|
15
|
+
@Config("access_key_id")
|
16
|
+
@ConfigDefault("null")
|
17
|
+
Optional<String> getAccessKeyId();
|
18
|
+
void setAccessKeyId(Optional<String> value);
|
19
|
+
|
20
|
+
@Config("secret_access_key")
|
21
|
+
@ConfigDefault("null")
|
22
|
+
Optional<String> getSecretAccessKey();
|
23
|
+
void setSecretAccessKey(Optional<String> value);
|
24
|
+
|
25
|
+
@Config("session_token")
|
26
|
+
@ConfigDefault("null")
|
27
|
+
Optional<String> getSessionToken();
|
28
|
+
void setSessionToken(Optional<String> value);
|
29
|
+
|
30
|
+
@Config("profile_file")
|
31
|
+
@ConfigDefault("null")
|
32
|
+
Optional<LocalFile> getProfileFile();
|
33
|
+
void setProfileFile(Optional<LocalFile> value);
|
34
|
+
|
35
|
+
@Config("profile_name")
|
36
|
+
@ConfigDefault("null")
|
37
|
+
Optional<String> getProfileName();
|
38
|
+
void setProfileName(Optional<String> value);
|
39
|
+
}
|
@@ -0,0 +1,289 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.util.AbstractList;
|
5
|
+
import java.util.ArrayList;
|
6
|
+
import java.util.zip.GZIPInputStream;
|
7
|
+
import java.util.zip.GZIPOutputStream;
|
8
|
+
import java.io.InputStream;
|
9
|
+
import java.io.OutputStream;
|
10
|
+
import java.io.BufferedOutputStream;
|
11
|
+
import java.io.BufferedInputStream;
|
12
|
+
import java.io.ByteArrayInputStream;
|
13
|
+
import java.io.ByteArrayOutputStream;
|
14
|
+
import java.io.IOException;
|
15
|
+
import java.nio.ByteBuffer;
|
16
|
+
import java.nio.charset.StandardCharsets;
|
17
|
+
import org.embulk.config.Config;
|
18
|
+
import org.embulk.config.ConfigDefault;
|
19
|
+
import org.embulk.config.ConfigSource;
|
20
|
+
import com.google.common.base.Throwables;
|
21
|
+
import com.google.common.base.Optional;
|
22
|
+
import com.google.common.collect.ImmutableList;
|
23
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
24
|
+
import com.fasterxml.jackson.annotation.JsonIgnore;
|
25
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
26
|
+
|
27
|
+
// this class should be moved to embulk-core
|
28
|
+
public class FileList
|
29
|
+
{
|
30
|
+
public interface Task
|
31
|
+
{
|
32
|
+
@Config("total_file_count_limit")
|
33
|
+
@ConfigDefault("2147483647")
|
34
|
+
int getTotalFileCountLimit();
|
35
|
+
}
|
36
|
+
|
37
|
+
public static class Entry
|
38
|
+
{
|
39
|
+
private int index;
|
40
|
+
private long size;
|
41
|
+
|
42
|
+
@JsonCreator
|
43
|
+
public Entry(
|
44
|
+
@JsonProperty("index") int index,
|
45
|
+
@JsonProperty("size") long size)
|
46
|
+
{
|
47
|
+
this.index = index;
|
48
|
+
this.size = size;
|
49
|
+
}
|
50
|
+
|
51
|
+
@JsonProperty("index")
|
52
|
+
public int getIndex() { return index; }
|
53
|
+
|
54
|
+
@JsonProperty("size")
|
55
|
+
public long getSize() { return size; }
|
56
|
+
}
|
57
|
+
|
58
|
+
public static class Builder
|
59
|
+
{
|
60
|
+
private final ByteArrayOutputStream binary;
|
61
|
+
private final OutputStream stream;
|
62
|
+
private final List<Entry> entries = new ArrayList<>();
|
63
|
+
private String last = null;
|
64
|
+
|
65
|
+
private int limitCount = Integer.MAX_VALUE;
|
66
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
67
|
+
|
68
|
+
public Builder(Task task)
|
69
|
+
{
|
70
|
+
this();
|
71
|
+
this.limitCount = task.getTotalFileCountLimit();
|
72
|
+
}
|
73
|
+
|
74
|
+
public Builder(ConfigSource config)
|
75
|
+
{
|
76
|
+
this();
|
77
|
+
this.limitCount = config.get(int.class, "total_file_count_limit", Integer.MAX_VALUE);
|
78
|
+
}
|
79
|
+
|
80
|
+
public Builder()
|
81
|
+
{
|
82
|
+
binary = new ByteArrayOutputStream();
|
83
|
+
try {
|
84
|
+
stream = new BufferedOutputStream(new GZIPOutputStream(binary));
|
85
|
+
}
|
86
|
+
catch (IOException ex) {
|
87
|
+
throw Throwables.propagate(ex);
|
88
|
+
}
|
89
|
+
}
|
90
|
+
|
91
|
+
public Builder limitTotalFileCount(int limitCount)
|
92
|
+
{
|
93
|
+
this.limitCount = limitCount;
|
94
|
+
return this;
|
95
|
+
}
|
96
|
+
|
97
|
+
public int size()
|
98
|
+
{
|
99
|
+
return entries.size();
|
100
|
+
}
|
101
|
+
|
102
|
+
public boolean needsMore()
|
103
|
+
{
|
104
|
+
return size() < limitCount;
|
105
|
+
}
|
106
|
+
|
107
|
+
public synchronized boolean add(String path, long size)
|
108
|
+
{
|
109
|
+
// TODO throw IllegalStateException if stream is already closed
|
110
|
+
|
111
|
+
if (!needsMore()) {
|
112
|
+
return false;
|
113
|
+
}
|
114
|
+
|
115
|
+
// TODO in the future, support some other filtering parameters (file name suffix filter, regex filter, etc)
|
116
|
+
// and return false if filtered out.
|
117
|
+
|
118
|
+
int index = entries.size();
|
119
|
+
entries.add(new Entry(index, size));
|
120
|
+
|
121
|
+
byte[] data = path.getBytes(StandardCharsets.UTF_8);
|
122
|
+
castBuffer.putInt(0, data.length);
|
123
|
+
try {
|
124
|
+
stream.write(castBuffer.array());
|
125
|
+
stream.write(data);
|
126
|
+
}
|
127
|
+
catch (IOException ex) {
|
128
|
+
throw Throwables.propagate(ex);
|
129
|
+
}
|
130
|
+
|
131
|
+
last = path;
|
132
|
+
return true;
|
133
|
+
}
|
134
|
+
|
135
|
+
public FileList build()
|
136
|
+
{
|
137
|
+
try {
|
138
|
+
stream.close();
|
139
|
+
}
|
140
|
+
catch (IOException ex) {
|
141
|
+
throw Throwables.propagate(ex);
|
142
|
+
}
|
143
|
+
return new FileList(binary.toByteArray(), getSplits(entries), Optional.fromNullable(last));
|
144
|
+
}
|
145
|
+
|
146
|
+
private List<List<Entry>> getSplits(List<Entry> all)
|
147
|
+
{
|
148
|
+
// TODO combine multiple entries into one task using some configuration parameters
|
149
|
+
List<List<Entry>> tasks = new ArrayList<>();
|
150
|
+
for (Entry entry : all) {
|
151
|
+
tasks.add(ImmutableList.of(entry));
|
152
|
+
}
|
153
|
+
return tasks;
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
157
|
+
private final byte[] data;
|
158
|
+
private final List<List<Entry>> tasks;
|
159
|
+
private final Optional<String> last;
|
160
|
+
|
161
|
+
@JsonCreator
|
162
|
+
@Deprecated
|
163
|
+
public FileList(
|
164
|
+
@JsonProperty("data") byte[] data,
|
165
|
+
@JsonProperty("tasks") List<List<Entry>> tasks,
|
166
|
+
@JsonProperty("last") Optional<String> last)
|
167
|
+
{
|
168
|
+
this.data = data;
|
169
|
+
this.tasks = tasks;
|
170
|
+
this.last = last;
|
171
|
+
}
|
172
|
+
|
173
|
+
@JsonIgnore
|
174
|
+
public Optional<String> getLastPath(Optional<String> lastLastPath)
|
175
|
+
{
|
176
|
+
if (last.isPresent()) {
|
177
|
+
return last;
|
178
|
+
}
|
179
|
+
return lastLastPath;
|
180
|
+
}
|
181
|
+
|
182
|
+
@JsonIgnore
|
183
|
+
public int getTaskCount()
|
184
|
+
{
|
185
|
+
return tasks.size();
|
186
|
+
}
|
187
|
+
|
188
|
+
@JsonIgnore
|
189
|
+
public List<String> get(int i)
|
190
|
+
{
|
191
|
+
return new EntryList(data, tasks.get(i));
|
192
|
+
}
|
193
|
+
|
194
|
+
@JsonProperty("data")
|
195
|
+
@Deprecated
|
196
|
+
public byte[] getData()
|
197
|
+
{
|
198
|
+
return data;
|
199
|
+
}
|
200
|
+
|
201
|
+
@JsonProperty("tasks")
|
202
|
+
@Deprecated
|
203
|
+
public List<List<Entry>> getTasks()
|
204
|
+
{
|
205
|
+
return tasks;
|
206
|
+
}
|
207
|
+
|
208
|
+
@JsonProperty("last")
|
209
|
+
@Deprecated
|
210
|
+
public Optional<String> getLast()
|
211
|
+
{
|
212
|
+
return last;
|
213
|
+
}
|
214
|
+
|
215
|
+
private class EntryList
|
216
|
+
extends AbstractList<String>
|
217
|
+
{
|
218
|
+
private final byte[] data;
|
219
|
+
private final List<Entry> entries;
|
220
|
+
private InputStream stream;
|
221
|
+
private int current;
|
222
|
+
|
223
|
+
private final ByteBuffer castBuffer = ByteBuffer.allocate(4);
|
224
|
+
|
225
|
+
public EntryList(byte[] data, List<Entry> entries)
|
226
|
+
{
|
227
|
+
this.data = data;
|
228
|
+
this.entries = entries;
|
229
|
+
try {
|
230
|
+
this.stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
231
|
+
}
|
232
|
+
catch (IOException ex) {
|
233
|
+
throw Throwables.propagate(ex);
|
234
|
+
}
|
235
|
+
this.current = 0;
|
236
|
+
}
|
237
|
+
|
238
|
+
@Override
|
239
|
+
public synchronized String get(int i)
|
240
|
+
{
|
241
|
+
Entry e = entries.get(i);
|
242
|
+
if (e.getIndex() < current) {
|
243
|
+
// rewind to the head
|
244
|
+
try {
|
245
|
+
stream.close();
|
246
|
+
stream = new BufferedInputStream(new GZIPInputStream(new ByteArrayInputStream(data)));
|
247
|
+
}
|
248
|
+
catch (IOException ex) {
|
249
|
+
throw Throwables.propagate(ex);
|
250
|
+
}
|
251
|
+
current = 0;
|
252
|
+
}
|
253
|
+
|
254
|
+
while (current < e.getIndex()) {
|
255
|
+
readNext();
|
256
|
+
}
|
257
|
+
// now current == e.getIndex()
|
258
|
+
return readNextString();
|
259
|
+
}
|
260
|
+
|
261
|
+
@Override
|
262
|
+
public int size()
|
263
|
+
{
|
264
|
+
return entries.size();
|
265
|
+
}
|
266
|
+
|
267
|
+
private byte[] readNext()
|
268
|
+
{
|
269
|
+
try {
|
270
|
+
stream.read(castBuffer.array());
|
271
|
+
int n = castBuffer.getInt(0);
|
272
|
+
byte[] b = new byte[n]; // here should be able to use a pooled buffer because read data is ignored if readNextString doesn't call this method
|
273
|
+
stream.read(b);
|
274
|
+
|
275
|
+
current++;
|
276
|
+
|
277
|
+
return b;
|
278
|
+
}
|
279
|
+
catch (IOException ex) {
|
280
|
+
throw Throwables.propagate(ex);
|
281
|
+
}
|
282
|
+
}
|
283
|
+
|
284
|
+
private String readNextString()
|
285
|
+
{
|
286
|
+
return new String(readNext(), StandardCharsets.UTF_8);
|
287
|
+
}
|
288
|
+
}
|
289
|
+
}
|
@@ -72,7 +72,9 @@ public class TestS3FileInputPlugin
|
|
72
72
|
doReturn("in/file/").doReturn(null).when(ol).getNextMarker();
|
73
73
|
|
74
74
|
// It counts only size != 0 files.
|
75
|
-
|
75
|
+
FileList.Builder builder = new FileList.Builder();
|
76
|
+
S3FileInputPlugin.listS3FilesByPrefix(builder, client, "bucketName", "prefix", Optional.<String>absent());
|
77
|
+
assertEquals(1, builder.size());
|
76
78
|
}
|
77
79
|
|
78
80
|
@Test
|
@@ -90,7 +92,7 @@ public class TestS3FileInputPlugin
|
|
90
92
|
public List<TaskReport> run(TaskSource taskSource, int taskCount)
|
91
93
|
{
|
92
94
|
assertEquals(3, taskCount);
|
93
|
-
List<String> files = taskSource.loadTask(S3PluginTask.class).getFiles();
|
95
|
+
List<String> files = fileListToList(taskSource.loadTask(S3PluginTask.class).getFiles());
|
94
96
|
assertArrayEquals(new String[]{"in/aa/a", "in/aa/b", "in/aa/c"}, files.toArray(new String[files.size()]));
|
95
97
|
return emptyTaskReports(taskCount);
|
96
98
|
}
|
@@ -108,12 +110,12 @@ public class TestS3FileInputPlugin
|
|
108
110
|
public List<TaskReport> run(TaskSource taskSource, int taskCount)
|
109
111
|
{
|
110
112
|
assertEquals(0, taskCount);
|
111
|
-
assertTrue(taskSource.loadTask(S3PluginTask.class).getFiles().isEmpty());
|
113
|
+
assertTrue(fileListToList(taskSource.loadTask(S3PluginTask.class).getFiles()).isEmpty());
|
112
114
|
return emptyTaskReports(taskCount);
|
113
115
|
}
|
114
116
|
});
|
115
117
|
|
116
|
-
|
118
|
+
assertEquals(null, configDiff.get(String.class, "last_path", null));
|
117
119
|
}
|
118
120
|
|
119
121
|
{ // if files are empty, keep the previous last_path.
|
@@ -126,7 +128,7 @@ public class TestS3FileInputPlugin
|
|
126
128
|
@Override
|
127
129
|
public List<TaskReport> run(TaskSource taskSource, int taskCount) {
|
128
130
|
assertEquals(0, taskCount);
|
129
|
-
assertTrue(taskSource.loadTask(S3PluginTask.class).getFiles().isEmpty());
|
131
|
+
assertTrue(fileListToList(taskSource.loadTask(S3PluginTask.class).getFiles()).isEmpty());
|
130
132
|
return emptyTaskReports(taskCount);
|
131
133
|
}
|
132
134
|
});
|
@@ -143,7 +145,9 @@ public class TestS3FileInputPlugin
|
|
143
145
|
doReturn(client).when(plugin).newS3Client(any(PluginTask.class));
|
144
146
|
|
145
147
|
PluginTask task = config.loadConfig(plugin.getTaskClass());
|
146
|
-
|
148
|
+
FileList.Builder builder = new FileList.Builder();
|
149
|
+
builder.add("in/aa/a", 100);
|
150
|
+
task.setFiles(builder.build());
|
147
151
|
|
148
152
|
StringBuilder sbuf = new StringBuilder();
|
149
153
|
try (S3FileInput input = (S3FileInput) plugin.open(task.dump(), 0)) {
|
@@ -218,4 +222,15 @@ public class TestS3FileInputPlugin
|
|
218
222
|
}
|
219
223
|
return reports.build();
|
220
224
|
}
|
225
|
+
|
226
|
+
private static List<String> fileListToList(FileList list)
|
227
|
+
{
|
228
|
+
ImmutableList.Builder<String> builder = ImmutableList.builder();
|
229
|
+
for (int i=0; i < list.getTaskCount(); i++) {
|
230
|
+
for (String path : list.get(i)) {
|
231
|
+
builder.add(path);
|
232
|
+
}
|
233
|
+
}
|
234
|
+
return builder.build();
|
235
|
+
}
|
221
236
|
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-s3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -48,18 +48,20 @@ files:
|
|
48
48
|
- build.gradle
|
49
49
|
- lib/embulk/input/s3.rb
|
50
50
|
- src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java
|
51
|
+
- src/main/java/org/embulk/input/s3/AwsCredentials.java
|
52
|
+
- src/main/java/org/embulk/input/s3/AwsCredentialsTask.java
|
53
|
+
- src/main/java/org/embulk/input/s3/FileList.java
|
51
54
|
- src/main/java/org/embulk/input/s3/S3FileInputPlugin.java
|
52
55
|
- src/test/java/org/embulk/input/s3/TestS3FileInputPlugin.java
|
53
56
|
- src/test/java/org/embulk/input/s3/TestS3InputStreamReopener.java
|
54
|
-
- classpath/aws-java-sdk-core-1.
|
55
|
-
- classpath/aws-java-sdk-kms-1.
|
56
|
-
- classpath/aws-java-sdk-s3-1.
|
57
|
+
- classpath/aws-java-sdk-core-1.10.33.jar
|
58
|
+
- classpath/aws-java-sdk-kms-1.10.33.jar
|
59
|
+
- classpath/aws-java-sdk-s3-1.10.33.jar
|
57
60
|
- classpath/commons-codec-1.6.jar
|
58
|
-
- classpath/embulk-input-s3-0.2.
|
59
|
-
- classpath/httpclient-4.3.
|
60
|
-
- classpath/httpcore-4.3.
|
61
|
+
- classpath/embulk-input-s3-0.2.4.jar
|
62
|
+
- classpath/httpclient-4.3.6.jar
|
63
|
+
- classpath/httpcore-4.3.3.jar
|
61
64
|
- classpath/jcl-over-slf4j-1.7.12.jar
|
62
|
-
- classpath/joda-time-2.8.2.jar
|
63
65
|
homepage: https://github.com/embulk/embulk-input-s3
|
64
66
|
licenses:
|
65
67
|
- Apache 2.0
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|