embulk-input-s3 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/build.gradle +0 -38
- data/classpath/aws-java-sdk-core-1.9.22.jar +0 -0
- data/classpath/aws-java-sdk-kms-1.9.22.jar +0 -0
- data/classpath/aws-java-sdk-s3-1.9.22.jar +0 -0
- data/classpath/commons-codec-1.6.jar +0 -0
- data/classpath/commons-logging-1.1.3.jar +0 -0
- data/classpath/embulk-input-s3-0.2.0.jar +0 -0
- data/classpath/httpclient-4.3.4.jar +0 -0
- data/classpath/httpcore-4.3.2.jar +0 -0
- data/classpath/joda-time-2.8.1.jar +0 -0
- data/src/main/java/org/embulk/input/s3/AbstractS3FileInputPlugin.java +318 -0
- data/src/main/java/org/embulk/input/s3/S3FileInputPlugin.java +11 -299
- metadata +5 -12
- data/.gitignore +0 -6
- data/ChangeLog +0 -25
- data/README.md +0 -34
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +0 -6
- data/gradlew +0 -164
- data/gradlew.bat +0 -90
- data/src/main/java/org/embulk/input/s3/RetryExecutor.java +0 -130
- data/src/main/java/org/embulk/input/s3/RetryableInputStream.java +0 -128
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 02cd2ac88b83d92846715d194928f38a32693ebe
|
4
|
+
data.tar.gz: 145815003f894fc2a2a224266e00908dae89cf79
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e9d81495964337639a2a6e7de2ce9ca3c3e5b39ca869ff1873e5ed119de95ac8ae778fe3450d85068dcd7b773bbfd8345835b46ab226ad73f1117c2fb614c0bc
|
7
|
+
data.tar.gz: 5f50eb790d085fb427a23e3819bdada69bb0cafba84e6e4175a8a317e7cf7e37e0a675866a87b893600c6ccfeb59654662e452c56d93a556d5559319c2760543
|
data/build.gradle
CHANGED
@@ -1,41 +1,3 @@
|
|
1
|
-
plugins {
|
2
|
-
id "com.jfrog.bintray" version "1.1"
|
3
|
-
id "com.github.jruby-gradle.base" version "0.1.5"
|
4
|
-
id "java"
|
5
|
-
}
|
6
|
-
import com.github.jrubygradle.JRubyExec
|
7
|
-
repositories {
|
8
|
-
mavenCentral()
|
9
|
-
mavenLocal()
|
10
|
-
jcenter()
|
11
|
-
}
|
12
|
-
configurations {
|
13
|
-
provided
|
14
|
-
}
|
15
|
-
|
16
|
-
version = "0.1.7"
|
17
|
-
|
18
|
-
dependencies {
|
19
|
-
compile "org.embulk:embulk-core:0.5.3"
|
20
|
-
provided "org.embulk:embulk-core:0.5.3"
|
21
|
-
compile "com.amazonaws:aws-java-sdk-s3:1.9.22"
|
22
|
-
testCompile "junit:junit:4.+"
|
23
|
-
testCompile "org.mockito:mockito-core:1.+"
|
24
|
-
}
|
25
|
-
|
26
|
-
task classpath(type: Copy, dependsOn: ["jar"]) {
|
27
|
-
doFirst { file("classpath").deleteDir() }
|
28
|
-
from (configurations.runtime - configurations.provided + files(jar.archivePath))
|
29
|
-
into "classpath"
|
30
|
-
}
|
31
|
-
clean { delete 'classpath' }
|
32
|
-
|
33
|
-
task gem(type: JRubyExec, dependsOn: ["build", "gemspec", "classpath"]) {
|
34
|
-
jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
|
35
|
-
script "build/gemspec"
|
36
|
-
doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
|
37
|
-
}
|
38
|
-
|
39
1
|
task gemspec << { file("build/gemspec").write($/
|
40
2
|
Gem::Specification.new do |spec|
|
41
3
|
spec.name = "${project.name}"
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,318 @@
|
|
1
|
+
package org.embulk.input.s3;
|
2
|
+
|
3
|
+
import java.util.List;
|
4
|
+
import java.util.ArrayList;
|
5
|
+
import java.util.Collections;
|
6
|
+
import java.io.IOException;
|
7
|
+
import java.io.InterruptedIOException;
|
8
|
+
import java.io.InputStream;
|
9
|
+
import com.google.common.collect.ImmutableList;
|
10
|
+
import com.google.common.base.Optional;
|
11
|
+
import com.google.common.base.Throwables;
|
12
|
+
import org.slf4j.Logger;
|
13
|
+
import com.amazonaws.auth.AWSCredentials;
|
14
|
+
import com.amazonaws.auth.AWSCredentialsProvider;
|
15
|
+
import com.amazonaws.auth.BasicAWSCredentials;
|
16
|
+
import com.amazonaws.services.s3.AmazonS3Client;
|
17
|
+
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
18
|
+
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
19
|
+
import com.amazonaws.services.s3.model.ObjectListing;
|
20
|
+
import com.amazonaws.services.s3.model.GetObjectRequest;
|
21
|
+
import com.amazonaws.services.s3.model.S3Object;
|
22
|
+
import com.amazonaws.ClientConfiguration;
|
23
|
+
import com.amazonaws.Protocol;
|
24
|
+
import org.embulk.config.Config;
|
25
|
+
import org.embulk.config.ConfigInject;
|
26
|
+
import org.embulk.config.ConfigDefault;
|
27
|
+
import org.embulk.config.Task;
|
28
|
+
import org.embulk.config.TaskSource;
|
29
|
+
import org.embulk.config.ConfigSource;
|
30
|
+
import org.embulk.config.ConfigDiff;
|
31
|
+
import org.embulk.config.CommitReport;
|
32
|
+
import org.embulk.spi.BufferAllocator;
|
33
|
+
import org.embulk.spi.Exec;
|
34
|
+
import org.embulk.spi.FileInputPlugin;
|
35
|
+
import org.embulk.spi.TransactionalFileInput;
|
36
|
+
import org.embulk.spi.util.InputStreamFileInput;
|
37
|
+
import org.embulk.spi.util.ResumableInputStream;
|
38
|
+
import org.embulk.spi.util.RetryExecutor.Retryable;
|
39
|
+
import org.embulk.spi.util.RetryExecutor.RetryGiveupException;
|
40
|
+
import static org.embulk.spi.util.RetryExecutor.retryExecutor;
|
41
|
+
|
42
|
+
public abstract class AbstractS3FileInputPlugin
|
43
|
+
implements FileInputPlugin
|
44
|
+
{
|
45
|
+
public interface PluginTask
|
46
|
+
extends Task
|
47
|
+
{
|
48
|
+
@Config("bucket")
|
49
|
+
public String getBucket();
|
50
|
+
|
51
|
+
@Config("path_prefix")
|
52
|
+
public String getPathPrefix();
|
53
|
+
|
54
|
+
@Config("last_path")
|
55
|
+
@ConfigDefault("null")
|
56
|
+
public Optional<String> getLastPath();
|
57
|
+
|
58
|
+
@Config("access_key_id")
|
59
|
+
public String getAccessKeyId();
|
60
|
+
|
61
|
+
@Config("secret_access_key")
|
62
|
+
public String getSecretAccessKey();
|
63
|
+
|
64
|
+
// TODO timeout, ssl, etc
|
65
|
+
|
66
|
+
// TODO support more options such as STS
|
67
|
+
|
68
|
+
public List<String> getFiles();
|
69
|
+
public void setFiles(List<String> files);
|
70
|
+
|
71
|
+
@ConfigInject
|
72
|
+
public BufferAllocator getBufferAllocator();
|
73
|
+
}
|
74
|
+
|
75
|
+
protected abstract Class<? extends PluginTask> getTaskClass();
|
76
|
+
|
77
|
+
@Override
|
78
|
+
public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
|
79
|
+
{
|
80
|
+
PluginTask task = config.loadConfig(getTaskClass());
|
81
|
+
|
82
|
+
// list files recursively
|
83
|
+
task.setFiles(listFiles(task));
|
84
|
+
|
85
|
+
// number of processors is same with number of files
|
86
|
+
return resume(task.dump(), task.getFiles().size(), control);
|
87
|
+
}
|
88
|
+
|
89
|
+
@Override
|
90
|
+
public ConfigDiff resume(TaskSource taskSource,
|
91
|
+
int taskCount,
|
92
|
+
FileInputPlugin.Control control)
|
93
|
+
{
|
94
|
+
PluginTask task = taskSource.loadTask(getTaskClass());
|
95
|
+
|
96
|
+
// validate task
|
97
|
+
newS3Client(task);
|
98
|
+
|
99
|
+
control.run(taskSource, taskCount);
|
100
|
+
|
101
|
+
// build next config
|
102
|
+
ConfigDiff configDiff = Exec.newConfigDiff();
|
103
|
+
|
104
|
+
// last_path
|
105
|
+
if (task.getFiles().isEmpty()) {
|
106
|
+
// keep the last value
|
107
|
+
if (task.getLastPath().isPresent()) {
|
108
|
+
configDiff.set("last_path", task.getLastPath().get());
|
109
|
+
}
|
110
|
+
} else {
|
111
|
+
List<String> files = new ArrayList<String>(task.getFiles());
|
112
|
+
Collections.sort(files);
|
113
|
+
configDiff.set("last_path", files.get(files.size() - 1));
|
114
|
+
}
|
115
|
+
|
116
|
+
return configDiff;
|
117
|
+
}
|
118
|
+
|
119
|
+
@Override
|
120
|
+
public void cleanup(TaskSource taskSource,
|
121
|
+
int taskCount,
|
122
|
+
List<CommitReport> successCommitReports)
|
123
|
+
{
|
124
|
+
// do nothing
|
125
|
+
}
|
126
|
+
|
127
|
+
protected AmazonS3Client newS3Client(PluginTask task)
|
128
|
+
{
|
129
|
+
return new AmazonS3Client(getCredentialsProvider(task), getClientConfiguration(task));
|
130
|
+
}
|
131
|
+
|
132
|
+
protected AWSCredentialsProvider getCredentialsProvider(PluginTask task)
|
133
|
+
{
|
134
|
+
final AWSCredentials cred = new BasicAWSCredentials(
|
135
|
+
task.getAccessKeyId(), task.getSecretAccessKey());
|
136
|
+
return new AWSCredentialsProvider() {
|
137
|
+
public AWSCredentials getCredentials()
|
138
|
+
{
|
139
|
+
return cred;
|
140
|
+
}
|
141
|
+
|
142
|
+
public void refresh()
|
143
|
+
{
|
144
|
+
}
|
145
|
+
};
|
146
|
+
}
|
147
|
+
|
148
|
+
protected ClientConfiguration getClientConfiguration(PluginTask task)
|
149
|
+
{
|
150
|
+
ClientConfiguration clientConfig = new ClientConfiguration();
|
151
|
+
|
152
|
+
//clientConfig.setProtocol(Protocol.HTTP);
|
153
|
+
clientConfig.setMaxConnections(50); // SDK default: 50
|
154
|
+
clientConfig.setMaxErrorRetry(3); // SDK default: 3
|
155
|
+
clientConfig.setSocketTimeout(8*60*1000); // SDK default: 50*1000
|
156
|
+
|
157
|
+
return clientConfig;
|
158
|
+
}
|
159
|
+
|
160
|
+
private List<String> listFiles(PluginTask task)
|
161
|
+
{
|
162
|
+
AmazonS3Client client = newS3Client(task);
|
163
|
+
String bucketName = task.getBucket();
|
164
|
+
|
165
|
+
return listS3FilesByPrefix(client, bucketName, task.getPathPrefix(), task.getLastPath());
|
166
|
+
}
|
167
|
+
|
168
|
+
/**
|
169
|
+
* Lists S3 filenames filtered by prefix.
|
170
|
+
*
|
171
|
+
* The resulting list does not include the file that's size == 0.
|
172
|
+
*/
|
173
|
+
public static List<String> listS3FilesByPrefix(AmazonS3Client client, String bucketName,
|
174
|
+
String prefix, Optional<String> lastPath)
|
175
|
+
{
|
176
|
+
ImmutableList.Builder<String> builder = ImmutableList.builder();
|
177
|
+
|
178
|
+
String lastKey = lastPath.orNull();
|
179
|
+
do {
|
180
|
+
ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, lastKey, null, 1024);
|
181
|
+
ObjectListing ol = client.listObjects(req);
|
182
|
+
for(S3ObjectSummary s : ol.getObjectSummaries()) {
|
183
|
+
if (s.getSize() > 0) {
|
184
|
+
builder.add(s.getKey());
|
185
|
+
}
|
186
|
+
}
|
187
|
+
lastKey = ol.getNextMarker();
|
188
|
+
} while(lastKey != null);
|
189
|
+
|
190
|
+
return builder.build();
|
191
|
+
}
|
192
|
+
|
193
|
+
@Override
|
194
|
+
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
195
|
+
{
|
196
|
+
PluginTask task = taskSource.loadTask(getTaskClass());
|
197
|
+
return new S3FileInput(task, taskIndex);
|
198
|
+
}
|
199
|
+
|
200
|
+
private static class S3InputStreamReopener
|
201
|
+
implements ResumableInputStream.Reopener
|
202
|
+
{
|
203
|
+
private final Logger log = Exec.getLogger(S3InputStreamReopener.class);
|
204
|
+
|
205
|
+
private final AmazonS3Client client;
|
206
|
+
private final GetObjectRequest request;
|
207
|
+
private final long contentLength;
|
208
|
+
|
209
|
+
public S3InputStreamReopener(AmazonS3Client client, GetObjectRequest request, long contentLength)
|
210
|
+
{
|
211
|
+
this.client = client;
|
212
|
+
this.request = request;
|
213
|
+
this.contentLength = contentLength;
|
214
|
+
}
|
215
|
+
|
216
|
+
@Override
|
217
|
+
public InputStream reopen(final long offset, final Exception closedCause) throws IOException
|
218
|
+
{
|
219
|
+
try {
|
220
|
+
return retryExecutor()
|
221
|
+
.withRetryLimit(3)
|
222
|
+
.withInitialRetryWait(500)
|
223
|
+
.withMaxRetryWait(30*1000)
|
224
|
+
.runInterruptible(new Retryable<InputStream>() {
|
225
|
+
@Override
|
226
|
+
public InputStream call() throws InterruptedIOException
|
227
|
+
{
|
228
|
+
log.warn(String.format("S3 read failed. Retrying GET request with %,d bytes offset", offset), closedCause);
|
229
|
+
request.setRange(offset, contentLength - 1); // [first, last]
|
230
|
+
return client.getObject(request).getObjectContent();
|
231
|
+
}
|
232
|
+
|
233
|
+
@Override
|
234
|
+
public boolean isRetryableException(Exception exception)
|
235
|
+
{
|
236
|
+
return true; // TODO
|
237
|
+
}
|
238
|
+
|
239
|
+
@Override
|
240
|
+
public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
|
241
|
+
throws RetryGiveupException
|
242
|
+
{
|
243
|
+
String message = String.format("S3 GET request failed. Retrying %d/%d after %d seconds. Message: %s",
|
244
|
+
retryCount, retryLimit, retryWait/1000, exception.getMessage());
|
245
|
+
if (retryCount % 3 == 0) {
|
246
|
+
log.warn(message, exception);
|
247
|
+
} else {
|
248
|
+
log.warn(message);
|
249
|
+
}
|
250
|
+
}
|
251
|
+
|
252
|
+
@Override
|
253
|
+
public void onGiveup(Exception firstException, Exception lastException)
|
254
|
+
throws RetryGiveupException
|
255
|
+
{
|
256
|
+
}
|
257
|
+
});
|
258
|
+
} catch (RetryGiveupException ex) {
|
259
|
+
Throwables.propagateIfInstanceOf(ex.getCause(), IOException.class);
|
260
|
+
throw Throwables.propagate(ex.getCause());
|
261
|
+
} catch (InterruptedException ex) {
|
262
|
+
throw new InterruptedIOException();
|
263
|
+
}
|
264
|
+
}
|
265
|
+
}
|
266
|
+
|
267
|
+
public class S3FileInput
|
268
|
+
extends InputStreamFileInput
|
269
|
+
implements TransactionalFileInput
|
270
|
+
{
|
271
|
+
public S3FileInput(PluginTask task, int taskIndex)
|
272
|
+
{
|
273
|
+
super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
|
274
|
+
}
|
275
|
+
|
276
|
+
public void abort() { }
|
277
|
+
|
278
|
+
public CommitReport commit()
|
279
|
+
{
|
280
|
+
return Exec.newCommitReport();
|
281
|
+
}
|
282
|
+
|
283
|
+
@Override
|
284
|
+
public void close() { }
|
285
|
+
}
|
286
|
+
|
287
|
+
// TODO create single-file InputStreamFileInput utility
|
288
|
+
private class SingleFileProvider
|
289
|
+
implements InputStreamFileInput.Provider
|
290
|
+
{
|
291
|
+
private AmazonS3Client client;
|
292
|
+
private final String bucket;
|
293
|
+
private final String key;
|
294
|
+
private boolean opened = false;
|
295
|
+
|
296
|
+
public SingleFileProvider(PluginTask task, int taskIndex)
|
297
|
+
{
|
298
|
+
this.client = newS3Client(task);
|
299
|
+
this.bucket = task.getBucket();
|
300
|
+
this.key = task.getFiles().get(taskIndex);
|
301
|
+
}
|
302
|
+
|
303
|
+
@Override
|
304
|
+
public InputStream openNext() throws IOException
|
305
|
+
{
|
306
|
+
if (opened) {
|
307
|
+
return null;
|
308
|
+
}
|
309
|
+
opened = true;
|
310
|
+
GetObjectRequest request = new GetObjectRequest(bucket, key);
|
311
|
+
S3Object obj = client.getObject(request);
|
312
|
+
return new ResumableInputStream(obj.getObjectContent(), new S3InputStreamReopener(client, request, obj.getObjectMetadata().getContentLength()));
|
313
|
+
}
|
314
|
+
|
315
|
+
@Override
|
316
|
+
public void close() { }
|
317
|
+
}
|
318
|
+
}
|
@@ -1,327 +1,39 @@
|
|
1
1
|
package org.embulk.input.s3;
|
2
2
|
|
3
|
-
import java.util.List;
|
4
|
-
import java.util.ArrayList;
|
5
|
-
import java.util.Collections;
|
6
|
-
import java.io.IOException;
|
7
|
-
import java.io.InterruptedIOException;
|
8
|
-
import java.io.InputStream;
|
9
|
-
import com.google.common.collect.ImmutableList;
|
10
3
|
import com.google.common.base.Optional;
|
11
|
-
import com.google.common.base.Throwables;
|
12
|
-
import org.slf4j.Logger;
|
13
|
-
import com.amazonaws.auth.AWSCredentials;
|
14
|
-
import com.amazonaws.auth.AWSCredentialsProvider;
|
15
|
-
import com.amazonaws.auth.BasicAWSCredentials;
|
16
4
|
import com.amazonaws.services.s3.AmazonS3Client;
|
17
|
-
import com.amazonaws.services.s3.model.ListObjectsRequest;
|
18
|
-
import com.amazonaws.services.s3.model.S3ObjectSummary;
|
19
|
-
import com.amazonaws.services.s3.model.ObjectListing;
|
20
|
-
import com.amazonaws.services.s3.model.GetObjectRequest;
|
21
|
-
import com.amazonaws.services.s3.model.S3Object;
|
22
|
-
import com.amazonaws.ClientConfiguration;
|
23
|
-
import com.amazonaws.Protocol;
|
24
5
|
import org.embulk.config.Config;
|
25
|
-
import org.embulk.config.ConfigInject;
|
26
6
|
import org.embulk.config.ConfigDefault;
|
27
|
-
import org.embulk.config.Task;
|
28
|
-
import org.embulk.config.TaskSource;
|
29
|
-
import org.embulk.config.ConfigSource;
|
30
|
-
import org.embulk.config.ConfigDiff;
|
31
|
-
import org.embulk.config.CommitReport;
|
32
|
-
import org.embulk.spi.BufferAllocator;
|
33
|
-
import org.embulk.spi.Exec;
|
34
|
-
import org.embulk.spi.FileInputPlugin;
|
35
|
-
import org.embulk.spi.TransactionalFileInput;
|
36
|
-
import org.embulk.spi.util.InputStreamFileInput;
|
37
|
-
import org.embulk.input.s3.RetryExecutor.Retryable;
|
38
|
-
import org.embulk.input.s3.RetryExecutor.RetryGiveupException;
|
39
|
-
import static org.embulk.input.s3.RetryExecutor.retryExecutor;
|
7
|
+
import org.embulk.input.s3.AbstractS3FileInputPlugin;
|
40
8
|
|
41
9
|
public class S3FileInputPlugin
|
42
|
-
|
10
|
+
extends AbstractS3FileInputPlugin
|
43
11
|
{
|
44
|
-
public interface PluginTask
|
45
|
-
extends Task
|
12
|
+
public interface S3PluginTask
|
13
|
+
extends PluginTask
|
46
14
|
{
|
47
|
-
@Config("bucket")
|
48
|
-
public String getBucket();
|
49
|
-
|
50
|
-
@Config("path_prefix")
|
51
|
-
public String getPathPrefix();
|
52
|
-
|
53
|
-
@Config("last_path")
|
54
|
-
@ConfigDefault("null")
|
55
|
-
public Optional<String> getLastPath();
|
56
|
-
|
57
15
|
@Config("endpoint")
|
58
16
|
@ConfigDefault("null")
|
59
17
|
public Optional<String> getEndpoint();
|
60
|
-
|
61
|
-
// TODO timeout, ssl, etc
|
62
|
-
|
63
|
-
@Config("access_key_id")
|
64
|
-
public String getAccessKeyId();
|
65
|
-
|
66
|
-
@Config("secret_access_key")
|
67
|
-
public String getSecretAccessKey();
|
68
|
-
|
69
|
-
// TODO support more options such as STS
|
70
|
-
|
71
|
-
public List<String> getFiles();
|
72
|
-
public void setFiles(List<String> files);
|
73
|
-
|
74
|
-
@ConfigInject
|
75
|
-
public BufferAllocator getBufferAllocator();
|
76
18
|
}
|
77
19
|
|
78
20
|
@Override
|
79
|
-
|
21
|
+
protected Class<? extends PluginTask> getTaskClass()
|
80
22
|
{
|
81
|
-
|
82
|
-
|
83
|
-
// list files recursively
|
84
|
-
task.setFiles(listFiles(task));
|
85
|
-
|
86
|
-
// TODO what if task.getFiles().isEmpty()?
|
87
|
-
|
88
|
-
// number of processors is same with number of files
|
89
|
-
return resume(task.dump(), task.getFiles().size(), control);
|
23
|
+
return S3PluginTask.class;
|
90
24
|
}
|
91
25
|
|
92
26
|
@Override
|
93
|
-
|
94
|
-
int taskCount,
|
95
|
-
FileInputPlugin.Control control)
|
27
|
+
protected AmazonS3Client newS3Client(PluginTask task)
|
96
28
|
{
|
97
|
-
|
98
|
-
|
99
|
-
control.run(taskSource, taskCount);
|
29
|
+
S3PluginTask t = (S3PluginTask) task;
|
100
30
|
|
101
|
-
|
102
|
-
ConfigDiff configDiff = Exec.newConfigDiff();
|
31
|
+
AmazonS3Client client = super.newS3Client(t);
|
103
32
|
|
104
|
-
|
105
|
-
|
106
|
-
// keep the last value
|
107
|
-
if (task.getLastPath().isPresent()) {
|
108
|
-
configDiff.set("last_path", task.getLastPath().get());
|
109
|
-
}
|
110
|
-
} else {
|
111
|
-
List<String> files = new ArrayList<String>(task.getFiles());
|
112
|
-
Collections.sort(files);
|
113
|
-
configDiff.set("last_path", files.get(files.size() - 1));
|
33
|
+
if (t.getEndpoint().isPresent()) {
|
34
|
+
client.setEndpoint(t.getEndpoint().get());
|
114
35
|
}
|
115
36
|
|
116
|
-
return configDiff;
|
117
|
-
}
|
118
|
-
|
119
|
-
@Override
|
120
|
-
public void cleanup(TaskSource taskSource,
|
121
|
-
int taskCount,
|
122
|
-
List<CommitReport> successCommitReports)
|
123
|
-
{
|
124
|
-
// do nothing
|
125
|
-
}
|
126
|
-
|
127
|
-
public static AWSCredentialsProvider getCredentialsProvider(PluginTask task)
|
128
|
-
{
|
129
|
-
final AWSCredentials cred = new BasicAWSCredentials(
|
130
|
-
task.getAccessKeyId(), task.getSecretAccessKey());
|
131
|
-
return new AWSCredentialsProvider() {
|
132
|
-
public AWSCredentials getCredentials()
|
133
|
-
{
|
134
|
-
return cred;
|
135
|
-
}
|
136
|
-
|
137
|
-
public void refresh()
|
138
|
-
{
|
139
|
-
}
|
140
|
-
};
|
141
|
-
}
|
142
|
-
|
143
|
-
private static AmazonS3Client newS3Client(PluginTask task)
|
144
|
-
{
|
145
|
-
AWSCredentialsProvider credentials = getCredentialsProvider(task);
|
146
|
-
AmazonS3Client client = newS3Client(credentials, task.getEndpoint());
|
147
37
|
return client;
|
148
38
|
}
|
149
|
-
|
150
|
-
private static AmazonS3Client newS3Client(AWSCredentialsProvider credentials,
|
151
|
-
Optional<String> endpoint)
|
152
|
-
{
|
153
|
-
// TODO get config from AmazonS3Task
|
154
|
-
ClientConfiguration clientConfig = new ClientConfiguration();
|
155
|
-
//clientConfig.setProtocol(Protocol.HTTP);
|
156
|
-
clientConfig.setMaxConnections(50); // SDK default: 50
|
157
|
-
clientConfig.setMaxErrorRetry(3); // SDK default: 3
|
158
|
-
clientConfig.setSocketTimeout(8*60*1000); // SDK default: 50*1000
|
159
|
-
|
160
|
-
AmazonS3Client client = new AmazonS3Client(credentials, clientConfig);
|
161
|
-
|
162
|
-
if (endpoint.isPresent()) {
|
163
|
-
client.setEndpoint(endpoint.get());
|
164
|
-
}
|
165
|
-
|
166
|
-
return client;
|
167
|
-
}
|
168
|
-
|
169
|
-
public List<String> listFiles(PluginTask task)
|
170
|
-
{
|
171
|
-
AmazonS3Client client = newS3Client(task);
|
172
|
-
String bucketName = task.getBucket();
|
173
|
-
|
174
|
-
return listS3FilesByPrefix(client, bucketName, task.getPathPrefix(), task.getLastPath());
|
175
|
-
}
|
176
|
-
|
177
|
-
/**
|
178
|
-
* Lists S3 filenames filtered by prefix.
|
179
|
-
*
|
180
|
-
* The resulting list does not include the file that's size == 0.
|
181
|
-
*/
|
182
|
-
public static List<String> listS3FilesByPrefix(AmazonS3Client client, String bucketName,
|
183
|
-
String prefix, Optional<String> lastPath)
|
184
|
-
{
|
185
|
-
ImmutableList.Builder<String> builder = ImmutableList.builder();
|
186
|
-
|
187
|
-
String lastKey = lastPath.orNull();
|
188
|
-
do {
|
189
|
-
ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, lastKey, null, 1024);
|
190
|
-
ObjectListing ol = client.listObjects(req);
|
191
|
-
for(S3ObjectSummary s : ol.getObjectSummaries()) {
|
192
|
-
if (s.getSize() > 0) {
|
193
|
-
builder.add(s.getKey());
|
194
|
-
}
|
195
|
-
}
|
196
|
-
lastKey = ol.getNextMarker();
|
197
|
-
} while(lastKey != null);
|
198
|
-
|
199
|
-
return builder.build();
|
200
|
-
}
|
201
|
-
|
202
|
-
@Override
|
203
|
-
public TransactionalFileInput open(TaskSource taskSource, int taskIndex)
|
204
|
-
{
|
205
|
-
PluginTask task = taskSource.loadTask(PluginTask.class);
|
206
|
-
return new S3FileInput(task, taskIndex);
|
207
|
-
}
|
208
|
-
|
209
|
-
private static class S3RetryableOpener
|
210
|
-
implements RetryableInputStream.Opener
|
211
|
-
{
|
212
|
-
private final Logger log = Exec.getLogger(S3FileInputPlugin.class);
|
213
|
-
|
214
|
-
private final AmazonS3Client client;
|
215
|
-
private final GetObjectRequest request;
|
216
|
-
private final long contentLength;
|
217
|
-
|
218
|
-
public S3RetryableOpener(AmazonS3Client client, GetObjectRequest request, long contentLength)
|
219
|
-
{
|
220
|
-
this.client = client;
|
221
|
-
this.request = request;
|
222
|
-
this.contentLength = contentLength;
|
223
|
-
}
|
224
|
-
|
225
|
-
@Override
|
226
|
-
public InputStream open(final long offset, final Exception exception) throws IOException
|
227
|
-
{
|
228
|
-
try {
|
229
|
-
return retryExecutor()
|
230
|
-
.withRetryLimit(3)
|
231
|
-
.withInitialRetryWait(500)
|
232
|
-
.withMaxRetryWait(30*1000)
|
233
|
-
.runInterruptible(new Retryable<InputStream>() {
|
234
|
-
@Override
|
235
|
-
public InputStream call() throws InterruptedIOException
|
236
|
-
{
|
237
|
-
log.warn(String.format("S3 read failed. Retrying GET request with %,d bytes offset", offset), exception);
|
238
|
-
request.setRange(offset, contentLength - 1); // [first, last]
|
239
|
-
return client.getObject(request).getObjectContent();
|
240
|
-
}
|
241
|
-
|
242
|
-
@Override
|
243
|
-
public boolean isRetryableException(Exception exception)
|
244
|
-
{
|
245
|
-
return true; // TODO
|
246
|
-
}
|
247
|
-
|
248
|
-
@Override
|
249
|
-
public void onRetry(Exception exception, int retryCount, int retryLimit, int retryWait)
|
250
|
-
throws RetryGiveupException
|
251
|
-
{
|
252
|
-
String message = String.format("S3 GET request failed. Retrying %d/%d after %d seconds. Message: %s",
|
253
|
-
retryCount, retryLimit, retryWait/1000, exception.getMessage());
|
254
|
-
if (retryCount % 3 == 0) {
|
255
|
-
log.warn(message, exception);
|
256
|
-
} else {
|
257
|
-
log.warn(message);
|
258
|
-
}
|
259
|
-
}
|
260
|
-
|
261
|
-
@Override
|
262
|
-
public void onGiveup(Exception firstException, Exception lastException)
|
263
|
-
throws RetryGiveupException
|
264
|
-
{
|
265
|
-
}
|
266
|
-
});
|
267
|
-
} catch (RetryGiveupException ex) {
|
268
|
-
Throwables.propagateIfInstanceOf(ex.getCause(), IOException.class);
|
269
|
-
throw Throwables.propagate(ex.getCause());
|
270
|
-
} catch (InterruptedException ex) {
|
271
|
-
throw new InterruptedIOException();
|
272
|
-
}
|
273
|
-
}
|
274
|
-
}
|
275
|
-
|
276
|
-
public static class S3FileInput
|
277
|
-
extends InputStreamFileInput
|
278
|
-
implements TransactionalFileInput
|
279
|
-
{
|
280
|
-
// TODO create single-file InputStreamFileInput utility
|
281
|
-
private static class SingleFileProvider
|
282
|
-
implements InputStreamFileInput.Provider
|
283
|
-
{
|
284
|
-
private AmazonS3Client client;
|
285
|
-
private final String bucket;
|
286
|
-
private final String key;
|
287
|
-
private boolean opened = false;
|
288
|
-
|
289
|
-
public SingleFileProvider(PluginTask task, int taskIndex)
|
290
|
-
{
|
291
|
-
this.client = newS3Client(task);
|
292
|
-
this.bucket = task.getBucket();
|
293
|
-
this.key = task.getFiles().get(taskIndex);
|
294
|
-
}
|
295
|
-
|
296
|
-
@Override
|
297
|
-
public InputStream openNext() throws IOException
|
298
|
-
{
|
299
|
-
if (opened) {
|
300
|
-
return null;
|
301
|
-
}
|
302
|
-
opened = true;
|
303
|
-
GetObjectRequest request = new GetObjectRequest(bucket, key);
|
304
|
-
S3Object obj = client.getObject(request);
|
305
|
-
return new RetryableInputStream(obj.getObjectContent(), new S3RetryableOpener(client, request, obj.getObjectMetadata().getContentLength()));
|
306
|
-
}
|
307
|
-
|
308
|
-
@Override
|
309
|
-
public void close() { }
|
310
|
-
}
|
311
|
-
|
312
|
-
public S3FileInput(PluginTask task, int taskIndex)
|
313
|
-
{
|
314
|
-
super(task.getBufferAllocator(), new SingleFileProvider(task, taskIndex));
|
315
|
-
}
|
316
|
-
|
317
|
-
public void abort() { }
|
318
|
-
|
319
|
-
public CommitReport commit()
|
320
|
-
{
|
321
|
-
return Exec.newCommitReport();
|
322
|
-
}
|
323
|
-
|
324
|
-
@Override
|
325
|
-
public void close() { }
|
326
|
-
}
|
327
39
|
}
|