embulk-input-s3 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a189f02204b02562ddf626857a6fd8a0d07d41f1
4
+ data.tar.gz: bc27649cbbf48e4052cbac15f548e451e990e36f
5
+ SHA512:
6
+ metadata.gz: abef90a11eafb493f993812835d04cfbd268ea805408c9aaffd82b12723b5b5360b7f840575f38653f64d1caad5140c0397c44a0344990c79df5efe083b0eb13
7
+ data.tar.gz: 8e489005081d3efd5d6cb2b9c1e1342f0172323705ab6728938ab2a9fe98c4a948c127848ec003dd83bb96418225b4f06bde35e5449a368304ed438b9cfef173
@@ -0,0 +1,6 @@
1
+ *~
2
+ .idea
3
+ build/
4
+ /classpath/
5
+ /.gradle
6
+ /*.gem
@@ -0,0 +1,34 @@
1
+ # S3 file input plugin for Embulk
2
+
3
+ ## Overview
4
+
5
+ * Plugin type: **file input**
6
+ * Rollback supported: **yes**
7
+ * Resume supported: **yes**
8
+ * Cleanup supported: **yes**
9
+
10
+ ## Configuration
11
+
12
+ - **bucket** S3 bucket name (string, required)
13
+ - **path_prefix** prefix of target keys (string, required)
14
+ - **endpoint** S3 endpoint host name, e.g. `s3-us-west-1.amazonaws.com` (string, optional)
15
+ - **access_key_id** AWS access key id (string, required)
16
+ - **secret_access_key** AWS secret key (string, required)
17
+
18
+ ## Example
19
+
20
+ ```yaml
21
+ in:
22
+ type: s3
23
+   bucket: my-s3-bucket
+   path_prefix: logs/csv-
24
+ endpoint: s3-us-west-1.amazonaws.com
25
+ access_key_id: ABCXYZ123ABCXYZ123
26
+ secret_access_key: AbCxYz123aBcXyZ123
27
+ ```
28
+
29
+ ## Build
30
+
31
+ ```
32
+ ./gradlew gem
33
+ ```
34
+
@@ -0,0 +1,62 @@
1
+ plugins {
2
+ id "com.jfrog.bintray" version "1.1"
3
+ id "com.github.jruby-gradle.base" version "0.1.5"
4
+ id "java"
5
+ }
6
+ import com.github.jrubygradle.JRubyExec
7
+ repositories {
8
+ mavenCentral()
9
+ mavenLocal()
10
+ jcenter()
11
+ }
12
+ configurations {
13
+ provided
14
+ }
15
+
16
+ version = "0.1.0"
17
+
18
+ dependencies {
19
+ compile "org.embulk:embulk-core:0.4.0"
20
+ provided "org.embulk:embulk-core:0.4.0"
21
+ compile "com.amazonaws:aws-java-sdk-s3:1.9.17"
22
+ testCompile "junit:junit:4.+"
23
+ testCompile "org.mockito:mockito-core:1.+"
24
+ }
25
+
26
+ task classpath(type: Copy, dependsOn: ["jar"]) {
27
+ doFirst { file("classpath").deleteDir() }
28
+ from (configurations.runtime - configurations.provided + files(jar.archivePath))
29
+ into "classpath"
30
+ }
31
+ clean { delete 'classpath' }
32
+
33
+ task gem(type: JRubyExec, dependsOn: ["build", "gemspec", "classpath"]) {
34
+ jrubyArgs "-rrubygems/gem_runner", "-eGem::GemRunner.new.run(ARGV)", "build"
35
+ script "build/gemspec"
36
+ doLast { ant.move(file: "${project.name}-${project.version}.gem", todir: "pkg") }
37
+ }
38
+
39
+ task gemspec << { file("build/gemspec").write($/
40
+ Gem::Specification.new do |gem|
41
+ gem.summary = "S3 file input plugin for Embulk, a plugin-based parallel bulk data loader"
42
+
43
+ gem.name = "${project.name}"
44
+ gem.version = "${project.version}"
45
+ gem.description = gem.summary
46
+ gem.authors = ["Sadayuki Furuhashi"]
47
+ gem.email = ["frsyuki@gmail.com"]
48
+ gem.license = "Apache 2.0"
49
+ gem.homepage = "https://github.com/embulk/embulk-input-s3"
50
+
51
+ gem.files = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
52
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
53
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
54
+ gem.require_paths = ["lib"]
55
+ gem.has_rdoc = false
56
+
57
+ gem.add_development_dependency "bundler", [">= 1.0"]
58
+ gem.add_development_dependency "rake", [">= 0.10.0"]
59
+ gem.add_development_dependency "test-unit", ["~> 3.0.2"]
60
+ end
61
+ /$)
62
+ }
@@ -0,0 +1,6 @@
1
+ #Wed Feb 04 13:46:12 PST 2015
2
+ distributionBase=GRADLE_USER_HOME
3
+ distributionPath=wrapper/dists
4
+ zipStoreBase=GRADLE_USER_HOME
5
+ zipStorePath=wrapper/dists
6
+ distributionUrl=https\://services.gradle.org/distributions/gradle-2.2.1-bin.zip
data/gradlew ADDED
@@ -0,0 +1,164 @@
1
+ #!/usr/bin/env bash
2
+
3
+ ##############################################################################
4
+ ##
5
+ ## Gradle start up script for UN*X
6
+ ##
7
+ ##############################################################################
8
+
9
+ # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
10
+ DEFAULT_JVM_OPTS=""
11
+
12
+ APP_NAME="Gradle"
13
+ APP_BASE_NAME=`basename "$0"`
14
+
15
+ # Use the maximum available, or set MAX_FD != -1 to use that value.
16
+ MAX_FD="maximum"
17
+
18
+ warn ( ) {
19
+ echo "$*"
20
+ }
21
+
22
+ die ( ) {
23
+ echo
24
+ echo "$*"
25
+ echo
26
+ exit 1
27
+ }
28
+
29
+ # OS specific support (must be 'true' or 'false').
30
+ cygwin=false
31
+ msys=false
32
+ darwin=false
33
+ case "`uname`" in
34
+ CYGWIN* )
35
+ cygwin=true
36
+ ;;
37
+ Darwin* )
38
+ darwin=true
39
+ ;;
40
+ MINGW* )
41
+ msys=true
42
+ ;;
43
+ esac
44
+
45
+ # For Cygwin, ensure paths are in UNIX format before anything is touched.
46
+ if $cygwin ; then
47
+ [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"`
48
+ fi
49
+
50
# Attempt to set APP_HOME: the physical directory that contains this script.
# Resolve links: $0 may be a link
PRG="$0"
# Follow the link chain by hand; a relative link target is interpreted
# relative to the directory of the link itself.
while [ -h "$PRG" ] ; do
    ls=`ls -ld "$PRG"`
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        PRG="$link"
    else
        PRG=`dirname "$PRG"`"/$link"
    fi
done
# Remember where we were so the working directory can be restored afterwards.
SAVED="`pwd`"
# cd may print the directory it changed to (e.g. when CDPATH is set). Discard
# that output via /dev/null instead of closing stdout with ">&-": with stdout
# closed, the attempted write can make cd itself report an error.
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
67
+
68
+ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
69
+
70
+ # Determine the Java command to use to start the JVM.
71
+ if [ -n "$JAVA_HOME" ] ; then
72
+ if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
73
+ # IBM's JDK on AIX uses strange locations for the executables
74
+ JAVACMD="$JAVA_HOME/jre/sh/java"
75
+ else
76
+ JAVACMD="$JAVA_HOME/bin/java"
77
+ fi
78
+ if [ ! -x "$JAVACMD" ] ; then
79
+ die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
80
+
81
+ Please set the JAVA_HOME variable in your environment to match the
82
+ location of your Java installation."
83
+ fi
84
+ else
85
+ JAVACMD="java"
86
+ which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
87
+
88
+ Please set the JAVA_HOME variable in your environment to match the
89
+ location of your Java installation."
90
+ fi
91
+
92
+ # Increase the maximum file descriptors if we can.
93
+ if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
94
+ MAX_FD_LIMIT=`ulimit -H -n`
95
+ if [ $? -eq 0 ] ; then
96
+ if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
97
+ MAX_FD="$MAX_FD_LIMIT"
98
+ fi
99
+ ulimit -n $MAX_FD
100
+ if [ $? -ne 0 ] ; then
101
+ warn "Could not set maximum file descriptor limit: $MAX_FD"
102
+ fi
103
+ else
104
+ warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
105
+ fi
106
+ fi
107
+
108
+ # For Darwin, add options to specify how the application appears in the dock
109
+ if $darwin; then
110
+ GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
111
+ fi
112
+
113
+ # For Cygwin, switch paths to Windows format before running java
114
+ if $cygwin ; then
115
+ APP_HOME=`cygpath --path --mixed "$APP_HOME"`
116
+ CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
117
+
118
+ # We build the pattern for arguments to be converted via cygpath
119
+ ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120
+ SEP=""
121
+ for dir in $ROOTDIRSRAW ; do
122
+ ROOTDIRS="$ROOTDIRS$SEP$dir"
123
+ SEP="|"
124
+ done
125
+ OURCYGPATTERN="(^($ROOTDIRS))"
126
+ # Add a user-defined pattern to the cygpath arguments
127
+ if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128
+ OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129
+ fi
130
+ # Now convert the arguments - kludge to limit ourselves to /bin/sh
131
+ i=0
132
+ for arg in "$@" ; do
133
+ CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134
+ CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135
+
136
+ if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137
+ eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138
+ else
139
+ eval `echo args$i`="\"$arg\""
140
+ fi
141
+ i=$((i+1))
142
+ done
143
+ case $i in
144
+ (0) set -- ;;
145
+ (1) set -- "$args0" ;;
146
+ (2) set -- "$args0" "$args1" ;;
147
+ (3) set -- "$args0" "$args1" "$args2" ;;
148
+ (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149
+ (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150
+ (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151
+ (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152
+ (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153
+ (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154
+ esac
155
+ fi
156
+
157
+ # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
158
+ function splitJvmOpts() {
159
+ JVM_OPTS=("$@")
160
+ }
161
+ eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
162
+ JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"
163
+
164
+ exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windowz variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,3 @@
1
# Registers the Java-implemented input plugin with Embulk under the type :s3.
# The last argument is the "classpath" directory, resolved relative to this
# file's location.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_input(
  :s3,
  "org.embulk.input.s3.S3FileInputPlugin",
  classpath_dir)
@@ -0,0 +1,268 @@
1
+ package org.embulk.input.s3;
2
+
3
+ import java.util.List;
4
+ import java.io.File;
5
+ import java.io.FileInputStream;
6
+ import java.io.InputStream;
7
+ import java.io.IOException;
8
+ import java.nio.file.Path;
9
+ import java.nio.file.Paths;
10
+ import java.nio.file.Files;
11
+ import java.nio.file.SimpleFileVisitor;
12
+ import java.nio.file.FileVisitResult;
13
+ import java.nio.file.attribute.BasicFileAttributes;
14
+ import javax.validation.constraints.NotNull;
15
+ import com.google.common.collect.ImmutableList;
16
+ import com.fasterxml.jackson.annotation.JacksonInject;
17
+ import org.embulk.config.Config;
18
+ import org.embulk.config.Task;
19
+ import org.embulk.config.TaskSource;
20
+ import org.embulk.config.ConfigSource;
21
+ import org.embulk.config.ConfigDiff;
22
+ import org.embulk.config.CommitReport;
23
+ import org.embulk.spi.BufferAllocator;
24
+ import org.embulk.spi.Exec;
25
+ import org.embulk.spi.FileInputPlugin;
26
+ import org.embulk.spi.TransactionalFileInput;
27
+ import org.embulk.spi.util.InputStreamFileInput;
28
+ import org.slf4j.Logger;
29
+
30
+ import static org.embulk.spi.util.Inputs.formatPath;
31
+
32
+ import java.util.List;
33
+ import java.io.IOException;
34
+ import java.io.InputStream;
35
+ import com.google.common.collect.ImmutableList;
36
+ import com.google.common.base.Optional;
37
+ import com.fasterxml.jackson.annotation.JacksonInject;
38
+ import com.amazonaws.auth.AWSCredentials;
39
+ import com.amazonaws.auth.AWSCredentialsProvider;
40
+ import com.amazonaws.auth.BasicAWSCredentials;
41
+ import com.amazonaws.services.s3.AmazonS3Client;
42
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
43
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
44
+ import com.amazonaws.services.s3.model.ObjectListing;
45
+ import com.amazonaws.services.s3.model.GetObjectRequest;
46
+ import com.amazonaws.services.s3.model.S3Object;
47
+ import com.amazonaws.ClientConfiguration;
48
+ import com.amazonaws.Protocol;
49
+ import org.embulk.config.Config;
50
+ import org.embulk.config.ConfigDefault;
51
+ import org.embulk.config.Task;
52
+ import org.embulk.config.TaskSource;
53
+ import org.embulk.config.ConfigSource;
54
+ import org.embulk.config.ConfigDiff;
55
+ import org.embulk.config.CommitReport;
56
+ import org.embulk.spi.BufferAllocator;
57
+ import org.embulk.spi.Exec;
58
+ import org.embulk.spi.FileInputPlugin;
59
+ import org.embulk.spi.TransactionalFileInput;
60
+ import org.embulk.spi.util.InputStreamFileInput;
61
+
62
+ public class S3FileInputPlugin
63
+ implements FileInputPlugin
64
+ {
65
+ public interface PluginTask
66
+ extends Task
67
+ {
68
+ @Config("bucket")
69
+ public String getBucket();
70
+
71
+ @Config("path_prefix")
72
+ public String getPathPrefix();
73
+
74
+ @Config("last_path")
75
+ @ConfigDefault("null")
76
+ public Optional<String> getLastPath();
77
+
78
+ @Config("endpoint")
79
+ @ConfigDefault("null")
80
+ public Optional<String> getEndpoint();
81
+
82
+ // TODO timeout, ssl, etc
83
+
84
+ @Config("access_key_id")
85
+ public String getAccessKeyId();
86
+
87
+ @Config("secret_access_key")
88
+ public String getSecretAccessKey();
89
+
90
+ // TODO support more options such as STS
91
+
92
+ public List<String> getFiles();
93
+ public void setFiles(List<String> files);
94
+
95
+ @JacksonInject
96
+ public BufferAllocator getBufferAllocator();
97
+ }
98
+
99
+ @Override
100
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control)
101
+ {
102
+ PluginTask task = config.loadConfig(PluginTask.class);
103
+
104
+ // list files recursively
105
+ task.setFiles(listFiles(task));
106
+
107
+ // number of processors is same with number of files
108
+ return resume(task.dump(), task.getFiles().size(), control);
109
+ }
110
+
111
+ @Override
112
+ public ConfigDiff resume(TaskSource taskSource,
113
+ int processorCount,
114
+ FileInputPlugin.Control control)
115
+ {
116
+ control.run(taskSource, processorCount);
117
+ return Exec.newConfigDiff();
118
+ }
119
+
120
+ @Override
121
+ public void cleanup(TaskSource taskSource,
122
+ int processorCount,
123
+ List<CommitReport> successCommitReports)
124
+ {
125
+ // do nothing
126
+ }
127
+
128
+ public static AWSCredentialsProvider getCredentialsProvider(PluginTask task)
129
+ {
130
+ final AWSCredentials cred = new BasicAWSCredentials(
131
+ task.getAccessKeyId(), task.getSecretAccessKey());
132
+ return new AWSCredentialsProvider() {
133
+ public AWSCredentials getCredentials()
134
+ {
135
+ return cred;
136
+ }
137
+
138
+ public void refresh()
139
+ {
140
+ }
141
+ };
142
+ }
143
+
144
+ private static AmazonS3Client newS3Client(PluginTask task)
145
+ {
146
+ AWSCredentialsProvider credentials = getCredentialsProvider(task);
147
+ AmazonS3Client client = newS3Client(credentials, task.getEndpoint());
148
+ return client;
149
+ }
150
+
151
+ private static AmazonS3Client newS3Client(AWSCredentialsProvider credentials,
152
+ Optional<String> endpoint)
153
+ {
154
+ // TODO get config from AmazonS3Task
155
+ ClientConfiguration clientConfig = new ClientConfiguration();
156
+ //clientConfig.setProtocol(Protocol.HTTP);
157
+ clientConfig.setMaxConnections(50); // SDK default: 50
158
+ clientConfig.setMaxErrorRetry(3); // SDK default: 3
159
+ clientConfig.setSocketTimeout(8*60*1000); // SDK default: 50*1000
160
+
161
+ AmazonS3Client client = new AmazonS3Client(credentials, clientConfig);
162
+
163
+ if (endpoint.isPresent()) {
164
+ client.setEndpoint(endpoint.get());
165
+ }
166
+
167
+ return client;
168
+ }
169
+
170
+ public List<String> listFiles(PluginTask task)
171
+ {
172
+ AmazonS3Client client = newS3Client(task);
173
+ String bucketName = task.getBucket();
174
+
175
+ return listS3FilesByPrefix(client, bucketName, task.getPathPrefix(), task.getLastPath());
176
+ }
177
+
178
+ /**
179
+ * Lists S3 filenames filtered by prefix.
180
+ *
181
+ * The resulting list does not include the file that's size == 0.
182
+ */
183
+ public static List<String> listS3FilesByPrefix(AmazonS3Client client, String bucketName,
184
+ String prefix, Optional<String> lastPath)
185
+ {
186
+ ImmutableList.Builder<String> builder = ImmutableList.builder();
187
+
188
+ String lastKey = lastPath.orNull();
189
+ do {
190
+ ListObjectsRequest req = new ListObjectsRequest(bucketName, prefix, lastKey, null, 1024);
191
+ ObjectListing ol = client.listObjects(req);
192
+ for(S3ObjectSummary s : ol.getObjectSummaries()) {
193
+ if (s.getSize() > 0) {
194
+ builder.add(s.getKey());
195
+ }
196
+ }
197
+ lastKey = ol.getNextMarker();
198
+ } while(lastKey != null);
199
+
200
+ return builder.build();
201
+ }
202
+
203
+ @Override
204
+ public TransactionalFileInput open(TaskSource taskSource, int processorIndex)
205
+ {
206
+ PluginTask task = taskSource.loadTask(PluginTask.class);
207
+ return new S3FileInput(task, processorIndex);
208
+ }
209
+
210
+ public static class S3FileInput
211
+ extends InputStreamFileInput
212
+ implements TransactionalFileInput
213
+ {
214
+ // TODO create single-file InputStreamFileInput utility
215
+ private static class SingleFileProvider
216
+ implements InputStreamFileInput.Provider
217
+ {
218
+ private AmazonS3Client client;
219
+ private final String bucket;
220
+ private final String key;
221
+ private boolean opened = false;
222
+
223
+ public SingleFileProvider(PluginTask task, int processorIndex)
224
+ {
225
+ this.client = newS3Client(task);
226
+ this.bucket = task.getBucket();
227
+ this.key = task.getFiles().get(processorIndex);
228
+ }
229
+
230
+ @Override
231
+ public InputStream openNext() throws IOException
232
+ {
233
+ if (opened) {
234
+ return null;
235
+ }
236
+ opened = true;
237
+ GetObjectRequest request = new GetObjectRequest(bucket, key);
238
+ //if (pos > 0) {
239
+ // request.setRange(pos, contentLength);
240
+ //}
241
+ S3Object obj = client.getObject(request);
242
+ //if (pos <= 0) {
243
+ // // first call
244
+ // contentLength = obj.getObjectMetadata().getContentLength();
245
+ //}
246
+ return obj.getObjectContent();
247
+ }
248
+
249
+ @Override
250
+ public void close() { }
251
+ }
252
+
253
+ public S3FileInput(PluginTask task, int processorIndex)
254
+ {
255
+ super(task.getBufferAllocator(), new SingleFileProvider(task, processorIndex));
256
+ }
257
+
258
+ public void abort() { }
259
+
260
+ public CommitReport commit()
261
+ {
262
+ return Exec.newCommitReport();
263
+ }
264
+
265
+ @Override
266
+ public void close() { }
267
+ }
268
+ }
@@ -0,0 +1,42 @@
1
+ package org.embulk.plugin.s3;
2
+
3
+ import static org.junit.Assert.*;
4
+ import java.util.List;
5
+ import org.junit.Test;
6
+ import org.mockito.Mockito;
7
+
8
+ import com.google.common.base.Optional;
9
+ import com.google.common.collect.ImmutableList;
10
+ import com.amazonaws.services.s3.AmazonS3Client;
11
+ import com.amazonaws.services.s3.model.ListObjectsRequest;
12
+ import com.amazonaws.services.s3.model.ObjectListing;
13
+ import com.amazonaws.services.s3.model.S3ObjectSummary;
14
+
15
+ public class TestS3FileInputPlugin
16
+ {
17
+ @Test
18
+ public void listS3FilesByPrefix()
19
+ {
20
+ // AWSS3Client returns list1 for the first iteration and list2 next.
21
+ List<S3ObjectSummary> list1 = ImmutableList.<S3ObjectSummary> of(bucket("in/", 0), bucket("in/file/", 0),
22
+ bucket("in/file/sample.csv.gz", 12345));
23
+ List<S3ObjectSummary> list2 = ImmutableList.<S3ObjectSummary> of(bucket("sample2.csv.gz", 0));
24
+ ObjectListing ol = Mockito.mock(ObjectListing.class);
25
+
26
+ Mockito.doReturn(list1).doReturn(list2).when(ol).getObjectSummaries();
27
+ AmazonS3Client client = Mockito.mock(AmazonS3Client.class);
28
+ Mockito.doReturn(ol).when(client).listObjects(Mockito.any(ListObjectsRequest.class));
29
+ Mockito.doReturn("in/file/").doReturn(null).when(ol).getNextMarker();
30
+
31
+ // It counts only size != 0 files.
32
+ assertEquals(1, S3FileInputPlugin.listS3FilesByPrefix(client, "bucketName", "prefix", Optional.<String>absent()).size());
33
+ }
34
+
35
+ private S3ObjectSummary bucket(String key, long size)
36
+ {
37
+ S3ObjectSummary bucket = new S3ObjectSummary();
38
+ bucket.setKey(key);
39
+ bucket.setSize(size);
40
+ return bucket;
41
+ }
42
+ }
metadata ADDED
@@ -0,0 +1,104 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-input-s3
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Sadayuki Furuhashi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ version_requirements: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ requirement: !ruby/object:Gem::Requirement
21
+ requirements:
22
+ - - '>='
23
+ - !ruby/object:Gem::Version
24
+ version: '1.0'
25
+ prerelease: false
26
+ type: :development
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: 0.10.0
34
+ requirement: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: 0.10.0
39
+ prerelease: false
40
+ type: :development
41
+ - !ruby/object:Gem::Dependency
42
+ name: test-unit
43
+ version_requirements: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: 3.0.2
48
+ requirement: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ~>
51
+ - !ruby/object:Gem::Version
52
+ version: 3.0.2
53
+ prerelease: false
54
+ type: :development
55
+ description: S3 file input plugin for Embulk, a plugin-based parallel bulk data loader
56
+ email:
57
+ - frsyuki@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - .gitignore
63
+ - README.md
64
+ - build.gradle
65
+ - gradle/wrapper/gradle-wrapper.jar
66
+ - gradle/wrapper/gradle-wrapper.properties
67
+ - gradlew
68
+ - gradlew.bat
69
+ - lib/embulk/input/s3.rb
70
+ - src/main/java/org/embulk/input/s3/S3FileInputPlugin.java
71
+ - src/test/java/org/embulk/plugin/s3/TestS3FileInputPlugin.java
72
+ - classpath/aws-java-sdk-core-1.9.17.jar
73
+ - classpath/aws-java-sdk-kms-1.9.17.jar
74
+ - classpath/aws-java-sdk-s3-1.9.17.jar
75
+ - classpath/commons-codec-1.6.jar
76
+ - classpath/commons-logging-1.1.3.jar
77
+ - classpath/embulk-input-s3-0.1.0.jar
78
+ - classpath/httpclient-4.3.4.jar
79
+ - classpath/httpcore-4.3.2.jar
80
+ homepage: https://github.com/embulk/embulk-input-s3
81
+ licenses:
82
+ - Apache 2.0
83
+ metadata: {}
84
+ post_install_message:
85
+ rdoc_options: []
86
+ require_paths:
87
+ - lib
88
+ required_ruby_version: !ruby/object:Gem::Requirement
89
+ requirements:
90
+ - - '>='
91
+ - !ruby/object:Gem::Version
92
+ version: '0'
93
+ required_rubygems_version: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ requirements: []
99
+ rubyforge_project:
100
+ rubygems_version: 2.1.9
101
+ signing_key:
102
+ specification_version: 4
103
+ summary: S3 file input plugin for Embulk, a plugin-based parallel bulk data loader
104
+ test_files: []