embulk-output-redshift 0.5.0 → 0.5.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: fae200c792daa5799eea798de528bfe89716cad7
- data.tar.gz: eaf7d9294216da01c956b3498165fbb876f80a11
+ metadata.gz: c8c2e9f95d662d1860ccba4b210c6dae2d1cc44b
+ data.tar.gz: cd0afa3f4352bc9732e27d1fdd132398114356c6
  SHA512:
- metadata.gz: c47c00911b4bf7a34a38994afef62e21a3695c8ea990e4c1c45b2db2102eb890b3480e9f62677c60a622a47b796e67816adfeddfb8dd601b7586e60e9fb1c8e9
- data.tar.gz: 21d1dcec368cc4be9cf3e29e001605bf06e7bd727873d8685c409faa257599c6e041cc77aebf161466be8f5bbb3ee0bf054db860bf5fbcd17046b42ff72a4891
+ metadata.gz: 15a7eb5c206c87822f163dd105af21444cf3d925f01e01e0eb7e4ab0e499646b814e9f9b377b49df677b79cb7a559ac97674f7179f699f2dc3cd96523e99216c
+ data.tar.gz: dd060ac7a977ee44b20e5927c3e121d2b930e81b18282b0407a658921db8c7f9fb392fdb277c2078b9dbf08f13de3d987337cb2064c7637205726b82035f85f9
data/README.md CHANGED
@@ -5,8 +5,8 @@ Redshift output plugins for Embulk loads records to Redshift.
  ## Overview
 
  * **Plugin type**: output
- * **Load all or nothing**: depnds on the mode. see bellow.
- * **Resume supported**: depnds on the mode. see bellow.
+ * **Load all or nothing**: depends on the mode. see below.
+ * **Resume supported**: depends on the mode. see below.
 
  ## Configuration
 
@@ -19,11 +19,11 @@ Redshift output plugins for Embulk loads records to Redshift.
  - **table**: destination table name (string, required)
  - **access_key_id**: access key id for AWS
  - **secret_access_key**: secret access key for AWS
- - **iam_user_name**: IAM user name for uploading temporary files to S3. The user should have permissions of `s3:GetObject`, `s3:PutObject`, `s3:ListBucket` and `sts:GetFederationToken`.
+ - **iam_user_name**: IAM user name for uploading temporary files to S3. The user should have permissions of `s3:GetObject`, `s3:PutObject`, `s3:DeleteObject`, `s3:ListBucket` and `sts:GetFederationToken`. (string, default: "", but we strongly recommend that you use an IAM user for security reasons. see below.)
  - **s3_bucket**: S3 bucket name for temporary files
  - **s3_key_prefix**: S3 key prefix for temporary files (string, default:"")
  - **options**: extra connection properties (hash, default: {})
- - **mode**: "replace" or "insert" (string, required)
+ - **mode**: "insert", "insert_direct", "truncate_insert", or "replace". See below. (string, required)
  - **batch_size**: size of a single batch insert (integer, default: 16777216)
  - **default_timezone**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp into a SQL string. This default_timezone option is used to control the timezone. You can overwrite timezone for each columns using column_options option. (string, default: `UTC`)
  - **column_options**: advanced: a key-value pairs where key is a column name and value is options for the column.
@@ -32,6 +32,7 @@ Redshift output plugins for Embulk loads records to Redshift.
  - **timestamp_format**: If input column type (embulk type) is timestamp and value_type is `string` or `nstring`, this plugin needs to format the timestamp value into a string. This timestamp_format option is used to control the format of the timestamp. (string, default: `%Y-%m-%d %H:%M:%S.%6N`)
  - **timezone**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp value into a SQL string. In this cases, this timezone option is used to control the timezone. (string, value of default_timezone option is used by default)
 
+
  ### Modes
 
  * **insert**:
@@ -98,3 +99,8 @@ out:
  ```
  $ ./gradlew gem
  ```
+
+ ### Security
+ This plugin requires AWS access credentials so that it may write temporary files to S3. There are two security options, Standard and Federated.
+ To use Standard security, give **access_key_id** and **secret_access_key**. To use Federated mode, also give the **iam_user_name** field.
+ Federated mode uses temporary credentials, so a man-in-the-middle attacker would only see AWS credentials that remain valid for one day after the transaction.
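To make the two security options concrete, here is a minimal sketch of an `out:` section using the options documented above. All values (cluster endpoint, database, credentials, bucket, key prefix) are placeholders, and the connection settings (`host`, `user`, `password`, `database`) are assumed to follow the plugin's standard README example; they are not part of this diff.

```yaml
out:
  type: redshift
  # connection settings (placeholders; assumed standard options, not shown in this diff)
  host: examplecluster.abc123xyz789.us-east-1.redshift.amazonaws.com
  user: embulk_user
  password: secret
  database: mydb
  table: my_table
  # Standard security: static AWS credentials used to stage temporary files on S3
  access_key_id: AKIAXXXXXXXXXXXXXXXX
  secret_access_key: YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY
  s3_bucket: my-redshift-transfer-bucket
  s3_key_prefix: temp/redshift
  # Federated security: additionally name an IAM user holding the S3/STS
  # permissions listed above; COPY then runs with a short-lived federation token
  iam_user_name: embulk-redshift-loader
  mode: insert
```

With `iam_user_name` left at its default empty string, the plugin falls back to the static credentials, which is the Standard option described above.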
@@ -53,6 +53,7 @@ public class RedshiftOutputPlugin
  public String getSecretAccessKey();
 
  @Config("iam_user_name")
+ @ConfigDefault("\"\"")
  public String getIamUserName();
 
  @Config("s3_bucket")
@@ -1,15 +1,27 @@
  package org.embulk.output.redshift;
 
- import java.util.zip.GZIPOutputStream;
- import java.util.concurrent.Callable;
- import java.util.UUID;
+ import java.io.BufferedWriter;
  import java.io.File;
- import java.io.IOException;
  import java.io.FileOutputStream;
+ import java.io.IOException;
  import java.io.OutputStreamWriter;
- import java.io.Closeable;
- import java.io.BufferedWriter;
  import java.sql.SQLException;
+ import java.util.ArrayList;
+ import java.util.List;
+ import java.util.UUID;
+ import java.util.concurrent.Callable;
+ import java.util.concurrent.ExecutionException;
+ import java.util.concurrent.ExecutorService;
+ import java.util.concurrent.Executors;
+ import java.util.concurrent.Future;
+ import java.util.concurrent.TimeUnit;
+ import java.util.zip.GZIPOutputStream;
+
+ import org.embulk.output.jdbc.JdbcSchema;
+ import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
  import com.amazonaws.auth.AWSCredentialsProvider;
  import com.amazonaws.auth.BasicSessionCredentials;
  import com.amazonaws.auth.policy.Policy;
@@ -19,13 +31,9 @@ import com.amazonaws.auth.policy.Statement.Effect;
  import com.amazonaws.auth.policy.actions.S3Actions;
  import com.amazonaws.services.s3.AmazonS3Client;
  import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
+ import com.amazonaws.services.securitytoken.model.Credentials;
  import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;
  import com.amazonaws.services.securitytoken.model.GetFederationTokenResult;
- import com.amazonaws.services.securitytoken.model.Credentials;
- import org.slf4j.Logger;
- import org.embulk.spi.Exec;
- import org.embulk.output.jdbc.JdbcSchema;
- import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
 
  public class RedshiftCopyBatchInsert
      extends AbstractPostgreSQLCopyBatchInsert
@@ -35,13 +43,16 @@ public class RedshiftCopyBatchInsert
  private final String s3BucketName;
  private final String s3KeyPrefix;
  private final String iamReaderUserName;
+ private final AWSCredentialsProvider credentialsProvider;
  private final AmazonS3Client s3;
  private final AWSSecurityTokenServiceClient sts;
+ private final ExecutorService executorService;
 
  private RedshiftOutputConnection connection = null;
  private String copySqlBeforeFrom = null;
  private long totalRows;
  private int fileCount;
+ private List<Future<Void>> uploadAndCopyFutures;
 
  public static final String COPY_AFTER_FROM = "GZIP DELIMITER '\\t' NULL '\\\\N' ESCAPE TRUNCATECOLUMNS ACCEPTINVCHARS STATUPDATE OFF COMPUPDATE OFF";
 
@@ -58,8 +69,12 @@ public class RedshiftCopyBatchInsert
  this.s3KeyPrefix = s3KeyPrefix + "/";
  }
  this.iamReaderUserName = iamReaderUserName;
+ this.credentialsProvider = credentialsProvider;
  this.s3 = new AmazonS3Client(credentialsProvider); // TODO options
  this.sts = new AWSSecurityTokenServiceClient(credentialsProvider); // options
+
+ this.executorService = Executors.newCachedThreadPool();
+ this.uploadAndCopyFutures = new ArrayList<Future<Void>>();
  }
 
  @Override
@@ -86,28 +101,51 @@ public class RedshiftCopyBatchInsert
  {
  File file = closeCurrentFile(); // flush buffered data in writer
 
- // TODO multi-threading
- new UploadAndCopyTask(file, batchRows, s3KeyPrefix + UUID.randomUUID().toString()).call();
- new DeleteFileFinalizer(file).close();
+ String s3KeyName = s3KeyPrefix + UUID.randomUUID().toString();
+ UploadTask uploadTask = new UploadTask(file, batchRows, s3KeyName);
+ Future<Void> uploadFuture = executorService.submit(uploadTask);
+ uploadAndCopyFutures.add(uploadFuture);
+
+ CopyTask copyTask = new CopyTask(uploadFuture, s3KeyName);
+ uploadAndCopyFutures.add(executorService.submit(copyTask));
 
  fileCount++;
  totalRows += batchRows;
  batchRows = 0;
 
  openNewFile();
- file.delete();
  }
 
  @Override
  public void finish() throws IOException, SQLException
  {
  super.finish();
+
+ for (Future<Void> uploadAndCopyFuture : uploadAndCopyFutures) {
+ try {
+ uploadAndCopyFuture.get();
+
+ } catch (InterruptedException e) {
+ throw new RuntimeException(e);
+ } catch (ExecutionException e) {
+ if (e.getCause() instanceof SQLException) {
+ throw (SQLException)e.getCause();
+ }
+ throw new RuntimeException(e);
+ }
+ }
+
  logger.info("Loaded {} files.", fileCount);
  }
 
  @Override
  public void close() throws IOException, SQLException
  {
+ executorService.shutdownNow();
+ try {
+ executorService.awaitTermination(60, TimeUnit.SECONDS);
+ } catch (InterruptedException e) {}
+
  s3.shutdown();
  closeCurrentFile().delete();
  if (connection != null) {
@@ -127,60 +165,97 @@ public class RedshiftCopyBatchInsert
  .withActions(S3Actions.GetObject)
  .withResources(new Resource("arn:aws:s3:::"+s3BucketName+"/"+s3KeyName)) // TODO encode file name using percent encoding
  );
- GetFederationTokenRequest req = new GetFederationTokenRequest();
- req.setDurationSeconds(86400); // 3600 - 129600
- req.setName(iamReaderUserName);
- req.setPolicy(policy.toJson());
-
- GetFederationTokenResult res = sts.getFederationToken(req);
- Credentials c = res.getCredentials();
-
- return new BasicSessionCredentials(
- c.getAccessKeyId(),
- c.getSecretAccessKey(),
- c.getSessionToken());
+ if (iamReaderUserName != null && iamReaderUserName.length() > 0) {
+ GetFederationTokenRequest req = new GetFederationTokenRequest();
+ req.setDurationSeconds(86400); // 3600 - 129600
+ req.setName(iamReaderUserName);
+ req.setPolicy(policy.toJson());
+
+ GetFederationTokenResult res = sts.getFederationToken(req);
+ Credentials c = res.getCredentials();
+
+ return new BasicSessionCredentials(
+ c.getAccessKeyId(),
+ c.getSecretAccessKey(),
+ c.getSessionToken());
+ } else {
+ return new BasicSessionCredentials(credentialsProvider.getCredentials().getAWSAccessKeyId(),
+ credentialsProvider.getCredentials().getAWSSecretKey(), null);
+ }
  }
 
- private class UploadAndCopyTask implements Callable<Void>
+ private class UploadTask implements Callable<Void>
  {
  private final File file;
  private final int batchRows;
  private final String s3KeyName;
 
- public UploadAndCopyTask(File file, int batchRows, String s3KeyName)
+ public UploadTask(File file, int batchRows, String s3KeyName)
  {
  this.file = file;
  this.batchRows = batchRows;
  this.s3KeyName = s3KeyName;
  }
 
- public Void call() throws SQLException {
+ public Void call() {
  logger.info(String.format("Uploading file id %s to S3 (%,d bytes %,d rows)",
  s3KeyName, file.length(), batchRows));
- s3.putObject(s3BucketName, s3KeyName, file);
 
- RedshiftOutputConnection con = connector.connect(true);
  try {
- logger.info("Running COPY from file {}", s3KeyName);
-
- // create temporary credential right before COPY operation because
- // it has timeout.
- // TODO skip this step if iamReaderUserName is not set
- BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
-
  long startTime = System.currentTimeMillis();
- con.runCopy(buildCopySQL(creds));
+ s3.putObject(s3BucketName, s3KeyName, file);
  double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
 
- logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
+ logger.info(String.format("Uploaded file %s (%.2f seconds)", s3KeyName, seconds));
+ } finally {
+ file.delete();
+ }
+
+ return null;
+ }
+ }
 
+ private class CopyTask implements Callable<Void>
+ {
+ private final Future<Void> uploadFuture;
+ private final String s3KeyName;
+
+ public CopyTask(Future<Void> uploadFuture, String s3KeyName)
+ {
+ this.uploadFuture = uploadFuture;
+ this.s3KeyName = s3KeyName;
+ }
+
+ public Void call() throws SQLException, InterruptedException, ExecutionException {
+ try {
+ uploadFuture.get();
+
+ RedshiftOutputConnection con = connector.connect(true);
+ try {
+ logger.info("Running COPY from file {}", s3KeyName);
+
+ // create temporary credential right before COPY operation because
+ // it has timeout.
+ BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
+
+ long startTime = System.currentTimeMillis();
+ con.runCopy(buildCopySQL(creds));
+ double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
+
+ logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
+
+ } finally {
+ con.close();
+ }
  } finally {
- con.close();
+ s3.deleteObject(s3BucketName, s3KeyName);
  }
 
  return null;
  }
 
+
+
  private String buildCopySQL(BasicSessionCredentials creds)
  {
  StringBuilder sb = new StringBuilder();
@@ -194,25 +269,13 @@ public class RedshiftCopyBatchInsert
  sb.append(creds.getAWSAccessKeyId());
  sb.append(";aws_secret_access_key=");
  sb.append(creds.getAWSSecretKey());
- sb.append(";token=");
- sb.append(creds.getSessionToken());
+ if (creds.getSessionToken() != null) {
+ sb.append(";token=");
+ sb.append(creds.getSessionToken());
+ }
  sb.append("' ");
  sb.append(COPY_AFTER_FROM);
  return sb.toString();
  }
  }
-
- private static class DeleteFileFinalizer implements Closeable
- {
- private File file;
-
- public DeleteFileFinalizer(File file) {
- this.file = file;
- }
-
- @Override
- public void close() throws IOException {
- file.delete();
- }
- }
  }
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-redshift
  version: !ruby/object:Gem::Version
- version: 0.5.0
+ version: 0.5.1
  platform: ruby
  authors:
  - Sadayuki Furuhashi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-01-15 00:00:00.000000000 Z
+ date: 2016-03-29 00:00:00.000000000 Z
  dependencies: []
  description: Inserts or updates records to a table.
  email:
@@ -30,9 +30,9 @@ files:
  - classpath/aws-java-sdk-sts-1.10.33.jar
  - classpath/commons-codec-1.6.jar
  - classpath/commons-logging-1.1.3.jar
- - classpath/embulk-output-jdbc-0.5.0.jar
- - classpath/embulk-output-postgresql-0.5.0.jar
- - classpath/embulk-output-redshift-0.5.0.jar
+ - classpath/embulk-output-jdbc-0.5.1.jar
+ - classpath/embulk-output-postgresql-0.5.1.jar
+ - classpath/embulk-output-redshift-0.5.1.jar
  - classpath/httpclient-4.3.6.jar
  - classpath/httpcore-4.3.3.jar
  - classpath/postgresql-9.4-1205-jdbc41.jar