embulk-output-redshift 0.5.0 → 0.5.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -4
- data/classpath/{embulk-output-jdbc-0.5.0.jar → embulk-output-jdbc-0.5.1.jar} +0 -0
- data/classpath/{embulk-output-postgresql-0.5.0.jar → embulk-output-postgresql-0.5.1.jar} +0 -0
- data/classpath/embulk-output-redshift-0.5.1.jar +0 -0
- data/src/main/java/org/embulk/output/RedshiftOutputPlugin.java +1 -0
- data/src/main/java/org/embulk/output/redshift/RedshiftCopyBatchInsert.java +121 -58
- metadata +5 -5
- data/classpath/embulk-output-redshift-0.5.0.jar +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c8c2e9f95d662d1860ccba4b210c6dae2d1cc44b
+  data.tar.gz: cd0afa3f4352bc9732e27d1fdd132398114356c6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 15a7eb5c206c87822f163dd105af21444cf3d925f01e01e0eb7e4ab0e499646b814e9f9b377b49df677b79cb7a559ac97674f7179f699f2dc3cd96523e99216c
+  data.tar.gz: dd060ac7a977ee44b20e5927c3e121d2b930e81b18282b0407a658921db8c7f9fb392fdb277c2078b9dbf08f13de3d987337cb2064c7637205726b82035f85f9
data/README.md
CHANGED
@@ -5,8 +5,8 @@ Redshift output plugins for Embulk loads records to Redshift.
 ## Overview
 
 * **Plugin type**: output
-* **Load all or nothing**: depends on the mode. see
-* **Resume supported**: depends on the mode. see
+* **Load all or nothing**: depends on the mode. see below.
+* **Resume supported**: depends on the mode. see below.
 
 ## Configuration
 
@@ -19,11 +19,11 @@ Redshift output plugins for Embulk loads records to Redshift.
 - **table**: destination table name (string, required)
 - **access_key_id**: access key id for AWS
 - **secret_access_key**: secret access key for AWS
-- **iam_user_name**: IAM user name for uploading temporary files to S3. The user should have permissions of `s3:GetObject`, `s3:PutObject`, `s3:ListBucket` and `sts:GetFederationToken`.
+- **iam_user_name**: IAM user name for uploading temporary files to S3. The user should have permissions of `s3:GetObject`, `s3:PutObject`, `s3:DeleteObject`, `s3:ListBucket` and `sts:GetFederationToken`. (string, default: "", but we strongly recommend that you use an IAM user for security reasons. see below.)
 - **s3_bucket**: S3 bucket name for temporary files
 - **s3_key_prefix**: S3 key prefix for temporary files (string, default: "")
 - **options**: extra connection properties (hash, default: {})
-- **mode**: "
+- **mode**: "insert", "insert_direct", "truncate_insert", or "replace". See below. (string, required)
 - **batch_size**: size of a single batch insert (integer, default: 16777216)
 - **default_timezone**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp into a SQL string. This default_timezone option is used to control the timezone. You can overwrite the timezone for each column using the column_options option. (string, default: `UTC`)
 - **column_options**: advanced: key-value pairs where key is a column name and value is options for the column.
@@ -32,6 +32,7 @@ Redshift output plugins for Embulk loads records to Redshift.
 - **timestamp_format**: If input column type (embulk type) is timestamp and value_type is `string` or `nstring`, this plugin needs to format the timestamp value into a string. This timestamp_format option is used to control the format of the timestamp. (string, default: `%Y-%m-%d %H:%M:%S.%6N`)
 - **timezone**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp value into a SQL string. In this case, the timezone option is used to control the timezone. (string, value of default_timezone option is used by default)
 
+
 ### Modes
 
 * **insert**:
@@ -98,3 +99,8 @@ out:
 ```
 $ ./gradlew gem
 ```
+
+### Security
+This plugin requires AWS access credentials so that it can write temporary files to S3. There are two security options, Standard and Federated.
+To use Standard security, give **access_key_id** and **secret_access_key**. To use Federated mode, also give the **iam_user_name** field.
+Federated mode really means temporary credentials: a man-in-the-middle attacker will only see AWS credentials that are valid for 1 calendar day after the transaction.
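The Federated option described here corresponds to the `GetFederationTokenRequest` flow added to `RedshiftCopyBatchInsert` in the source diff further down: the plugin asks STS for a federation token whose inline policy only allows reading the one temporary S3 object. A minimal sketch of that flow, assuming AWS SDK for Java v1 (which the gem bundles); the class name, method name, and bucket/key arguments are illustrative placeholders, not part of the plugin:

```
import com.amazonaws.auth.BasicSessionCredentials;
import com.amazonaws.auth.policy.Policy;
import com.amazonaws.auth.policy.Resource;
import com.amazonaws.auth.policy.Statement;
import com.amazonaws.auth.policy.Statement.Effect;
import com.amazonaws.auth.policy.actions.S3Actions;
import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
import com.amazonaws.services.securitytoken.model.Credentials;
import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;

public class FederatedReaderCredentialsSketch
{
    public static BasicSessionCredentials issue(AWSSecurityTokenServiceClient sts,
            String iamUserName, String bucket, String key)
    {
        // Scope the token to reading exactly one temporary object.
        Policy policy = new Policy().withStatements(
                new Statement(Effect.Allow)
                        .withActions(S3Actions.GetObject)
                        .withResources(new Resource("arn:aws:s3:::" + bucket + "/" + key)));

        GetFederationTokenRequest req = new GetFederationTokenRequest();
        req.setDurationSeconds(86400);  // valid range per the source comment: 3600 - 129600
        req.setName(iamUserName);
        req.setPolicy(policy.toJson());

        Credentials c = sts.getFederationToken(req).getCredentials();
        return new BasicSessionCredentials(
                c.getAccessKeyId(), c.getSecretAccessKey(), c.getSessionToken());
    }
}
```

The 86400-second duration matches the "1 calendar day" exposure window mentioned above.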
data/classpath/embulk-output-jdbc-0.5.1.jar (renamed from embulk-output-jdbc-0.5.0.jar)
Binary file
data/classpath/embulk-output-postgresql-0.5.1.jar (renamed from embulk-output-postgresql-0.5.0.jar)
Binary file
data/classpath/embulk-output-redshift-0.5.1.jar (added)
Binary file
data/src/main/java/org/embulk/output/redshift/RedshiftCopyBatchInsert.java
CHANGED
@@ -1,15 +1,27 @@
 package org.embulk.output.redshift;
 
-import java.
-import java.util.concurrent.Callable;
-import java.util.UUID;
+import java.io.BufferedWriter;
 import java.io.File;
-import java.io.IOException;
 import java.io.FileOutputStream;
+import java.io.IOException;
 import java.io.OutputStreamWriter;
-import java.io.Closeable;
-import java.io.BufferedWriter;
 import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.zip.GZIPOutputStream;
+
+import org.embulk.output.jdbc.JdbcSchema;
+import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
+import org.embulk.spi.Exec;
+import org.slf4j.Logger;
+
 import com.amazonaws.auth.AWSCredentialsProvider;
 import com.amazonaws.auth.BasicSessionCredentials;
 import com.amazonaws.auth.policy.Policy;
@@ -19,13 +31,9 @@ import com.amazonaws.auth.policy.Statement.Effect;
 import com.amazonaws.auth.policy.actions.S3Actions;
 import com.amazonaws.services.s3.AmazonS3Client;
 import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
+import com.amazonaws.services.securitytoken.model.Credentials;
 import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;
 import com.amazonaws.services.securitytoken.model.GetFederationTokenResult;
-import com.amazonaws.services.securitytoken.model.Credentials;
-import org.slf4j.Logger;
-import org.embulk.spi.Exec;
-import org.embulk.output.jdbc.JdbcSchema;
-import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
 
 public class RedshiftCopyBatchInsert
         extends AbstractPostgreSQLCopyBatchInsert
@@ -35,13 +43,16 @@ public class RedshiftCopyBatchInsert
     private final String s3BucketName;
     private final String s3KeyPrefix;
     private final String iamReaderUserName;
+    private final AWSCredentialsProvider credentialsProvider;
     private final AmazonS3Client s3;
     private final AWSSecurityTokenServiceClient sts;
+    private final ExecutorService executorService;
 
     private RedshiftOutputConnection connection = null;
     private String copySqlBeforeFrom = null;
     private long totalRows;
     private int fileCount;
+    private List<Future<Void>> uploadAndCopyFutures;
 
     public static final String COPY_AFTER_FROM = "GZIP DELIMITER '\\t' NULL '\\\\N' ESCAPE TRUNCATECOLUMNS ACCEPTINVCHARS STATUPDATE OFF COMPUPDATE OFF";
 
@@ -58,8 +69,12 @@ public class RedshiftCopyBatchInsert
             this.s3KeyPrefix = s3KeyPrefix + "/";
         }
         this.iamReaderUserName = iamReaderUserName;
+        this.credentialsProvider = credentialsProvider;
         this.s3 = new AmazonS3Client(credentialsProvider);  // TODO options
         this.sts = new AWSSecurityTokenServiceClient(credentialsProvider);  // options
+
+        this.executorService = Executors.newCachedThreadPool();
+        this.uploadAndCopyFutures = new ArrayList<Future<Void>>();
     }
 
     @Override
@@ -86,28 +101,51 @@ public class RedshiftCopyBatchInsert
     {
         File file = closeCurrentFile();  // flush buffered data in writer
 
-
-        new
-
+        String s3KeyName = s3KeyPrefix + UUID.randomUUID().toString();
+        UploadTask uploadTask = new UploadTask(file, batchRows, s3KeyName);
+        Future<Void> uploadFuture = executorService.submit(uploadTask);
+        uploadAndCopyFutures.add(uploadFuture);
+
+        CopyTask copyTask = new CopyTask(uploadFuture, s3KeyName);
+        uploadAndCopyFutures.add(executorService.submit(copyTask));
 
         fileCount++;
         totalRows += batchRows;
         batchRows = 0;
 
         openNewFile();
-        file.delete();
     }
 
     @Override
     public void finish() throws IOException, SQLException
     {
         super.finish();
+
+        for (Future<Void> uploadAndCopyFuture : uploadAndCopyFutures) {
+            try {
+                uploadAndCopyFuture.get();
+
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            } catch (ExecutionException e) {
+                if (e.getCause() instanceof SQLException) {
+                    throw (SQLException)e.getCause();
+                }
+                throw new RuntimeException(e);
+            }
+        }
+
         logger.info("Loaded {} files.", fileCount);
     }
 
     @Override
     public void close() throws IOException, SQLException
     {
+        executorService.shutdownNow();
+        try {
+            executorService.awaitTermination(60, TimeUnit.SECONDS);
+        } catch (InterruptedException e) {}
+
         s3.shutdown();
         closeCurrentFile().delete();
         if (connection != null) {
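Taken together, this hunk restructures flush(): instead of uploading the batch file and running COPY inline, it now assigns a random UUID key, schedules an UploadTask, then schedules a CopyTask that waits on the upload's Future, and returns immediately so the next batch file can be written while earlier batches are still loading. finish() then drains uploadAndCopyFutures, unwrapping ExecutionException so a failed COPY surfaces as the original SQLException. A minimal standalone sketch of that chaining pattern, with placeholder task bodies rather than the plugin's real S3/JDBC calls:

```
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

public class ChainedUploadCopySketch
{
    private final ExecutorService executor = Executors.newCachedThreadPool();
    private final List<Future<Void>> futures = new ArrayList<Future<Void>>();

    // Called once per flushed batch file: schedule the upload, then a COPY
    // task that blocks on that upload's Future before it starts.
    public void flushBatch(final String key)
    {
        final Future<Void> upload = executor.submit(new Callable<Void>() {
            public Void call() {
                System.out.println("upload " + key);  // stand-in for s3.putObject(...)
                return null;
            }
        });
        futures.add(upload);
        futures.add(executor.submit(new Callable<Void>() {
            public Void call() throws Exception {
                upload.get();  // wait only for this batch's upload; other batches run in parallel
                System.out.println("copy " + key);  // stand-in for con.runCopy(...)
                return null;
            }
        }));
    }

    // Called once at the end: block until all work is done and surface failures.
    public void finish() throws InterruptedException, ExecutionException
    {
        for (Future<Void> f : futures) {
            f.get();
        }
        executor.shutdown();
    }
}
```

Executors.newCachedThreadPool() creates threads on demand and reuses idle ones, so several uploads and COPY statements can be in flight at once; the new close() bounds teardown with shutdownNow() plus a 60-second awaitTermination.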
@@ -127,60 +165,97 @@ public class RedshiftCopyBatchInsert
                 .withActions(S3Actions.GetObject)
                 .withResources(new Resource("arn:aws:s3:::"+s3BucketName+"/"+s3KeyName))  // TODO encode file name using percent encoding
             );
-
-
-
-
-
-
-
-
-
-
-
-
+        if (iamReaderUserName != null && iamReaderUserName.length() > 0) {
+            GetFederationTokenRequest req = new GetFederationTokenRequest();
+            req.setDurationSeconds(86400);  // 3600 - 129600
+            req.setName(iamReaderUserName);
+            req.setPolicy(policy.toJson());
+
+            GetFederationTokenResult res = sts.getFederationToken(req);
+            Credentials c = res.getCredentials();
+
+            return new BasicSessionCredentials(
+                c.getAccessKeyId(),
+                c.getSecretAccessKey(),
+                c.getSessionToken());
+        } else {
+            return new BasicSessionCredentials(credentialsProvider.getCredentials().getAWSAccessKeyId(),
+                    credentialsProvider.getCredentials().getAWSSecretKey(), null);
+        }
     }
 
-    private class
+    private class UploadTask implements Callable<Void>
     {
         private final File file;
         private final int batchRows;
         private final String s3KeyName;
 
-        public
+        public UploadTask(File file, int batchRows, String s3KeyName)
         {
             this.file = file;
             this.batchRows = batchRows;
             this.s3KeyName = s3KeyName;
         }
 
-        public Void call()
+        public Void call() {
             logger.info(String.format("Uploading file id %s to S3 (%,d bytes %,d rows)",
                     s3KeyName, file.length(), batchRows));
-            s3.putObject(s3BucketName, s3KeyName, file);
 
-            RedshiftOutputConnection con = connector.connect(true);
             try {
-                logger.info("Running COPY from file {}", s3KeyName);
-
-                // create temporary credential right before COPY operation because
-                // it has timeout.
-                // TODO skip this step if iamReaderUserName is not set
-                BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
-
                 long startTime = System.currentTimeMillis();
-
+                s3.putObject(s3BucketName, s3KeyName, file);
                 double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
 
-                logger.info(String.format("
+                logger.info(String.format("Uploaded file %s (%.2f seconds)", s3KeyName, seconds));
+            } finally {
+                file.delete();
+            }
+
+            return null;
+        }
+    }
 
+    private class CopyTask implements Callable<Void>
+    {
+        private final Future<Void> uploadFuture;
+        private final String s3KeyName;
+
+        public CopyTask(Future<Void> uploadFuture, String s3KeyName)
+        {
+            this.uploadFuture = uploadFuture;
+            this.s3KeyName = s3KeyName;
+        }
+
+        public Void call() throws SQLException, InterruptedException, ExecutionException {
+            try {
+                uploadFuture.get();
+
+                RedshiftOutputConnection con = connector.connect(true);
+                try {
+                    logger.info("Running COPY from file {}", s3KeyName);
+
+                    // create temporary credential right before COPY operation because
+                    // it has timeout.
+                    BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
+
+                    long startTime = System.currentTimeMillis();
+                    con.runCopy(buildCopySQL(creds));
+                    double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
+
+                    logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
+
+                } finally {
+                    con.close();
+                }
             } finally {
-
+                s3.deleteObject(s3BucketName, s3KeyName);
             }
 
             return null;
         }
     }
+
+
     private String buildCopySQL(BasicSessionCredentials creds)
     {
         StringBuilder sb = new StringBuilder();
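These two tasks also take over resource cleanup: UploadTask deletes the local batch file in its own finally block, and CopyTask deletes the uploaded S3 object in its finally block once COPY completes or fails. That is why the README above now lists `s3:DeleteObject` among the required permissions, and why the DeleteFileFinalizer helper is removed in the next hunk. A minimal sketch of that ownership pattern, with a placeholder body rather than the plugin's real upload:

```
import java.io.File;
import java.util.concurrent.Callable;

// Sketch: each task owns exactly one resource and releases it in finally,
// so no separate finalizer object (like the removed DeleteFileFinalizer)
// has to outlive the task.
class CleanupOwnershipSketch implements Callable<Void>
{
    private final File file;

    CleanupOwnershipSketch(File file) { this.file = file; }

    @Override
    public Void call()
    {
        try {
            System.out.println("uploading " + file);  // stand-in for s3.putObject(...)
        } finally {
            file.delete();  // always release, even if the upload throws
        }
        return null;
    }
}
```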
@@ -194,25 +269,13 @@ public class RedshiftCopyBatchInsert
         sb.append(creds.getAWSAccessKeyId());
         sb.append(";aws_secret_access_key=");
         sb.append(creds.getAWSSecretKey());
-
-
+        if (creds.getSessionToken() != null) {
+            sb.append(";token=");
+            sb.append(creds.getSessionToken());
+        }
         sb.append("' ");
         sb.append(COPY_AFTER_FROM);
         return sb.toString();
     }
 }
-
-    private static class DeleteFileFinalizer implements Closeable
-    {
-        private File file;
-
-        public DeleteFileFinalizer(File file) {
-            this.file = file;
-        }
-
-        @Override
-        public void close() throws IOException {
-            file.delete();
-        }
-    }
 }
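The token branch is what makes Federated mode work end to end: Redshift's COPY only accepts temporary (STS) credentials when the session token is included in the CREDENTIALS string. A sketch of the clause buildCopySQL assembles, using hypothetical placeholder values and an abbreviated options tail, and assuming the unchanged copySqlBeforeFrom logic supplies the COPY ... FROM 's3://...' CREDENTIALS ' prefix:

```
public class CopyCredentialsClauseSketch
{
    public static void main(String[] args)
    {
        // Hypothetical placeholder values; real ones come from BasicSessionCredentials.
        String accessKeyId = "AKIAEXAMPLE";
        String secretKey = "secretExample";
        String sessionToken = "tokenExample";  // null when Standard (non-temporary) credentials are used

        StringBuilder sb = new StringBuilder();
        sb.append("COPY \"target_table\" FROM 's3://example-bucket/example-key' CREDENTIALS '");
        sb.append("aws_access_key_id=").append(accessKeyId);
        sb.append(";aws_secret_access_key=").append(secretKey);
        if (sessionToken != null) {
            // Redshift rejects temporary credentials unless the token is appended.
            sb.append(";token=").append(sessionToken);
        }
        sb.append("' ");
        sb.append("GZIP DELIMITER '\\t'");  // abbreviated stand-in for COPY_AFTER_FROM
        System.out.println(sb.toString());
    }
}
```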
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-redshift
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - Sadayuki Furuhashi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-03-29 00:00:00.000000000 Z
 dependencies: []
 description: Inserts or updates records to a table.
 email:
@@ -30,9 +30,9 @@ files:
 - classpath/aws-java-sdk-sts-1.10.33.jar
 - classpath/commons-codec-1.6.jar
 - classpath/commons-logging-1.1.3.jar
-- classpath/embulk-output-jdbc-0.5.0.jar
-- classpath/embulk-output-postgresql-0.5.0.jar
-- classpath/embulk-output-redshift-0.5.0.jar
+- classpath/embulk-output-jdbc-0.5.1.jar
+- classpath/embulk-output-postgresql-0.5.1.jar
+- classpath/embulk-output-redshift-0.5.1.jar
 - classpath/httpclient-4.3.6.jar
 - classpath/httpcore-4.3.3.jar
 - classpath/postgresql-9.4-1205-jdbc41.jar
data/classpath/embulk-output-redshift-0.5.0.jar (removed)
Binary file