embulk-output-redshift 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: fae200c792daa5799eea798de528bfe89716cad7
4
- data.tar.gz: eaf7d9294216da01c956b3498165fbb876f80a11
3
+ metadata.gz: c8c2e9f95d662d1860ccba4b210c6dae2d1cc44b
4
+ data.tar.gz: cd0afa3f4352bc9732e27d1fdd132398114356c6
5
5
  SHA512:
6
- metadata.gz: c47c00911b4bf7a34a38994afef62e21a3695c8ea990e4c1c45b2db2102eb890b3480e9f62677c60a622a47b796e67816adfeddfb8dd601b7586e60e9fb1c8e9
7
- data.tar.gz: 21d1dcec368cc4be9cf3e29e001605bf06e7bd727873d8685c409faa257599c6e041cc77aebf161466be8f5bbb3ee0bf054db860bf5fbcd17046b42ff72a4891
6
+ metadata.gz: 15a7eb5c206c87822f163dd105af21444cf3d925f01e01e0eb7e4ab0e499646b814e9f9b377b49df677b79cb7a559ac97674f7179f699f2dc3cd96523e99216c
7
+ data.tar.gz: dd060ac7a977ee44b20e5927c3e121d2b930e81b18282b0407a658921db8c7f9fb392fdb277c2078b9dbf08f13de3d987337cb2064c7637205726b82035f85f9
data/README.md CHANGED
@@ -5,8 +5,8 @@ Redshift output plugins for Embulk loads records to Redshift.
5
5
  ## Overview
6
6
 
7
7
  * **Plugin type**: output
8
- * **Load all or nothing**: depnds on the mode. see bellow.
9
- * **Resume supported**: depnds on the mode. see bellow.
8
+ * **Load all or nothing**: depends on the mode. See below.
9
+ * **Resume supported**: depends on the mode. See below.
10
10
 
11
11
  ## Configuration
12
12
 
@@ -19,11 +19,11 @@ Redshift output plugins for Embulk loads records to Redshift.
19
19
  - **table**: destination table name (string, required)
20
20
  - **access_key_id**: access key id for AWS
21
21
  - **secret_access_key**: secret access key for AWS
22
- - **iam_user_name**: IAM user name for uploading temporary files to S3. The user should have permissions of `s3:GetObject`, `s3:PutObject`, `s3:ListBucket` and `sts:GetFederationToken`.
22
+ - **iam_user_name**: IAM user name for uploading temporary files to S3. The user should have permissions of `s3:GetObject`, `s3:PutObject`, `s3:DeleteObject`, `s3:ListBucket` and `sts:GetFederationToken`. (string, default: "", but we strongly recommend that you use an IAM user for security reasons. See below.)
23
23
  - **s3_bucket**: S3 bucket name for temporary files
24
24
  - **s3_key_prefix**: S3 key prefix for temporary files (string, default:"")
25
25
  - **options**: extra connection properties (hash, default: {})
26
- - **mode**: "replace" or "insert" (string, required)
26
+ - **mode**: "insert", "insert_direct", "truncate_insert", or "replace". See below. (string, required)
27
27
  - **batch_size**: size of a single batch insert (integer, default: 16777216)
28
28
  - **default_timezone**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp into a SQL string. This default_timezone option is used to control the timezone. You can overwrite the timezone for each column using the column_options option. (string, default: `UTC`)
29
29
  - **column_options**: advanced: key-value pairs where the key is a column name and the value is options for that column.
@@ -32,6 +32,7 @@ Redshift output plugins for Embulk loads records to Redshift.
32
32
  - **timestamp_format**: If input column type (embulk type) is timestamp and value_type is `string` or `nstring`, this plugin needs to format the timestamp value into a string. This timestamp_format option is used to control the format of the timestamp. (string, default: `%Y-%m-%d %H:%M:%S.%6N`)
33
33
  - **timezone**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp value into a SQL string. In this case, this timezone option is used to control the timezone. (string, value of default_timezone option is used by default)
34
34
 
35
+
35
36
  ### Modes
36
37
 
37
38
  * **insert**:
@@ -98,3 +99,8 @@ out:
98
99
  ```
99
100
  $ ./gradlew gem
100
101
  ```
102
+
103
+ ### Security
104
+ This plugin requires AWS access credentials so that it can write temporary files to S3. There are two security options, Standard and Federated.
105
+ To use Standard security, give **access_key_id** and **secret_access_key**. To use Federated mode, also give the **iam_user_name** option.
106
+ Federated mode uses temporary credentials, so a man-in-the-middle attacker would only see AWS credentials that remain valid for one calendar day after the transaction.
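
The sketch below illustrates the Federated mechanism described above, adapted from the `RedshiftCopyBatchInsert` changes further down in this diff; the class and method names (`FederatedReaderExample`, `readerCredentialsFor`) are illustrative only and not part of the plugin's API. The federation token is scoped to a single S3 object and expires after one day, which is why an intercepted COPY statement exposes only short-lived, read-only credentials.

```java
import com.amazonaws.auth.BasicSessionCredentials;
import com.amazonaws.auth.policy.Policy;
import com.amazonaws.auth.policy.Resource;
import com.amazonaws.auth.policy.Statement;
import com.amazonaws.auth.policy.Statement.Effect;
import com.amazonaws.auth.policy.actions.S3Actions;
import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
import com.amazonaws.services.securitytoken.model.Credentials;
import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;

class FederatedReaderExample
{
    // Returns credentials that can only GET the one temporary object the
    // COPY statement needs, and that expire after 24 hours.
    static BasicSessionCredentials readerCredentialsFor(
            AWSSecurityTokenServiceClient sts,  // built from access_key_id / secret_access_key
            String iamUserName,                 // the iam_user_name option
            String bucket, String key)
    {
        Policy policy = new Policy().withStatements(
                new Statement(Effect.Allow)
                        .withActions(S3Actions.GetObject)
                        .withResources(new Resource("arn:aws:s3:::" + bucket + "/" + key)));

        GetFederationTokenRequest req = new GetFederationTokenRequest();
        req.setDurationSeconds(86400);          // allowed range is 3600 - 129600 seconds
        req.setName(iamUserName);
        req.setPolicy(policy.toJson());

        Credentials c = sts.getFederationToken(req).getCredentials();
        return new BasicSessionCredentials(
                c.getAccessKeyId(), c.getSecretAccessKey(), c.getSessionToken());
    }
}
```

When **iam_user_name** is left at its empty-string default, the plugin skips this STS call and signs COPY with the raw **access_key_id** / **secret_access_key** pair instead, as the code changes below show.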
@@ -53,6 +53,7 @@ public class RedshiftOutputPlugin
53
53
  public String getSecretAccessKey();
54
54
 
55
55
  @Config("iam_user_name")
56
+ @ConfigDefault("\"\"")
56
57
  public String getIamUserName();
57
58
 
58
59
  @Config("s3_bucket")
@@ -1,15 +1,27 @@
1
1
  package org.embulk.output.redshift;
2
2
 
3
- import java.util.zip.GZIPOutputStream;
4
- import java.util.concurrent.Callable;
5
- import java.util.UUID;
3
+ import java.io.BufferedWriter;
6
4
  import java.io.File;
7
- import java.io.IOException;
8
5
  import java.io.FileOutputStream;
6
+ import java.io.IOException;
9
7
  import java.io.OutputStreamWriter;
10
- import java.io.Closeable;
11
- import java.io.BufferedWriter;
12
8
  import java.sql.SQLException;
9
+ import java.util.ArrayList;
10
+ import java.util.List;
11
+ import java.util.UUID;
12
+ import java.util.concurrent.Callable;
13
+ import java.util.concurrent.ExecutionException;
14
+ import java.util.concurrent.ExecutorService;
15
+ import java.util.concurrent.Executors;
16
+ import java.util.concurrent.Future;
17
+ import java.util.concurrent.TimeUnit;
18
+ import java.util.zip.GZIPOutputStream;
19
+
20
+ import org.embulk.output.jdbc.JdbcSchema;
21
+ import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
22
+ import org.embulk.spi.Exec;
23
+ import org.slf4j.Logger;
24
+
13
25
  import com.amazonaws.auth.AWSCredentialsProvider;
14
26
  import com.amazonaws.auth.BasicSessionCredentials;
15
27
  import com.amazonaws.auth.policy.Policy;
@@ -19,13 +31,9 @@ import com.amazonaws.auth.policy.Statement.Effect;
19
31
  import com.amazonaws.auth.policy.actions.S3Actions;
20
32
  import com.amazonaws.services.s3.AmazonS3Client;
21
33
  import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
34
+ import com.amazonaws.services.securitytoken.model.Credentials;
22
35
  import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;
23
36
  import com.amazonaws.services.securitytoken.model.GetFederationTokenResult;
24
- import com.amazonaws.services.securitytoken.model.Credentials;
25
- import org.slf4j.Logger;
26
- import org.embulk.spi.Exec;
27
- import org.embulk.output.jdbc.JdbcSchema;
28
- import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
29
37
 
30
38
  public class RedshiftCopyBatchInsert
31
39
  extends AbstractPostgreSQLCopyBatchInsert
@@ -35,13 +43,16 @@ public class RedshiftCopyBatchInsert
35
43
  private final String s3BucketName;
36
44
  private final String s3KeyPrefix;
37
45
  private final String iamReaderUserName;
46
+ private final AWSCredentialsProvider credentialsProvider;
38
47
  private final AmazonS3Client s3;
39
48
  private final AWSSecurityTokenServiceClient sts;
49
+ private final ExecutorService executorService;
40
50
 
41
51
  private RedshiftOutputConnection connection = null;
42
52
  private String copySqlBeforeFrom = null;
43
53
  private long totalRows;
44
54
  private int fileCount;
55
+ private List<Future<Void>> uploadAndCopyFutures;
45
56
 
46
57
  public static final String COPY_AFTER_FROM = "GZIP DELIMITER '\\t' NULL '\\\\N' ESCAPE TRUNCATECOLUMNS ACCEPTINVCHARS STATUPDATE OFF COMPUPDATE OFF";
47
58
 
@@ -58,8 +69,12 @@ public class RedshiftCopyBatchInsert
58
69
  this.s3KeyPrefix = s3KeyPrefix + "/";
59
70
  }
60
71
  this.iamReaderUserName = iamReaderUserName;
72
+ this.credentialsProvider = credentialsProvider;
61
73
  this.s3 = new AmazonS3Client(credentialsProvider); // TODO options
62
74
  this.sts = new AWSSecurityTokenServiceClient(credentialsProvider); // options
75
+
76
+ this.executorService = Executors.newCachedThreadPool();
77
+ this.uploadAndCopyFutures = new ArrayList<Future<Void>>();
63
78
  }
64
79
 
65
80
  @Override
@@ -86,28 +101,51 @@ public class RedshiftCopyBatchInsert
86
101
  {
87
102
  File file = closeCurrentFile(); // flush buffered data in writer
88
103
 
89
- // TODO multi-threading
90
- new UploadAndCopyTask(file, batchRows, s3KeyPrefix + UUID.randomUUID().toString()).call();
91
- new DeleteFileFinalizer(file).close();
104
+ String s3KeyName = s3KeyPrefix + UUID.randomUUID().toString();
105
+ UploadTask uploadTask = new UploadTask(file, batchRows, s3KeyName);
106
+ Future<Void> uploadFuture = executorService.submit(uploadTask);
107
+ uploadAndCopyFutures.add(uploadFuture);
108
+
109
+ CopyTask copyTask = new CopyTask(uploadFuture, s3KeyName);
110
+ uploadAndCopyFutures.add(executorService.submit(copyTask));
92
111
 
93
112
  fileCount++;
94
113
  totalRows += batchRows;
95
114
  batchRows = 0;
96
115
 
97
116
  openNewFile();
98
- file.delete();
99
117
  }
100
118
 
101
119
  @Override
102
120
  public void finish() throws IOException, SQLException
103
121
  {
104
122
  super.finish();
123
+
124
+ for (Future<Void> uploadAndCopyFuture : uploadAndCopyFutures) {
125
+ try {
126
+ uploadAndCopyFuture.get();
127
+
128
+ } catch (InterruptedException e) {
129
+ throw new RuntimeException(e);
130
+ } catch (ExecutionException e) {
131
+ if (e.getCause() instanceof SQLException) {
132
+ throw (SQLException)e.getCause();
133
+ }
134
+ throw new RuntimeException(e);
135
+ }
136
+ }
137
+
105
138
  logger.info("Loaded {} files.", fileCount);
106
139
  }
107
140
 
108
141
  @Override
109
142
  public void close() throws IOException, SQLException
110
143
  {
144
+ executorService.shutdownNow();
145
+ try {
146
+ executorService.awaitTermination(60, TimeUnit.SECONDS);
147
+ } catch (InterruptedException e) {}
148
+
111
149
  s3.shutdown();
112
150
  closeCurrentFile().delete();
113
151
  if (connection != null) {
@@ -127,60 +165,97 @@ public class RedshiftCopyBatchInsert
127
165
  .withActions(S3Actions.GetObject)
128
166
  .withResources(new Resource("arn:aws:s3:::"+s3BucketName+"/"+s3KeyName)) // TODO encode file name using percent encoding
129
167
  );
130
- GetFederationTokenRequest req = new GetFederationTokenRequest();
131
- req.setDurationSeconds(86400); // 3600 - 129600
132
- req.setName(iamReaderUserName);
133
- req.setPolicy(policy.toJson());
134
-
135
- GetFederationTokenResult res = sts.getFederationToken(req);
136
- Credentials c = res.getCredentials();
137
-
138
- return new BasicSessionCredentials(
139
- c.getAccessKeyId(),
140
- c.getSecretAccessKey(),
141
- c.getSessionToken());
168
+ if (iamReaderUserName != null && iamReaderUserName.length() > 0) {
169
+ GetFederationTokenRequest req = new GetFederationTokenRequest();
170
+ req.setDurationSeconds(86400); // 3600 - 129600
171
+ req.setName(iamReaderUserName);
172
+ req.setPolicy(policy.toJson());
173
+
174
+ GetFederationTokenResult res = sts.getFederationToken(req);
175
+ Credentials c = res.getCredentials();
176
+
177
+ return new BasicSessionCredentials(
178
+ c.getAccessKeyId(),
179
+ c.getSecretAccessKey(),
180
+ c.getSessionToken());
181
+ } else {
182
+ return new BasicSessionCredentials(credentialsProvider.getCredentials().getAWSAccessKeyId(),
183
+ credentialsProvider.getCredentials().getAWSSecretKey(), null);
184
+ }
142
185
  }
143
186
 
144
- private class UploadAndCopyTask implements Callable<Void>
187
+ private class UploadTask implements Callable<Void>
145
188
  {
146
189
  private final File file;
147
190
  private final int batchRows;
148
191
  private final String s3KeyName;
149
192
 
150
- public UploadAndCopyTask(File file, int batchRows, String s3KeyName)
193
+ public UploadTask(File file, int batchRows, String s3KeyName)
151
194
  {
152
195
  this.file = file;
153
196
  this.batchRows = batchRows;
154
197
  this.s3KeyName = s3KeyName;
155
198
  }
156
199
 
157
- public Void call() throws SQLException {
200
+ public Void call() {
158
201
  logger.info(String.format("Uploading file id %s to S3 (%,d bytes %,d rows)",
159
202
  s3KeyName, file.length(), batchRows));
160
- s3.putObject(s3BucketName, s3KeyName, file);
161
203
 
162
- RedshiftOutputConnection con = connector.connect(true);
163
204
  try {
164
- logger.info("Running COPY from file {}", s3KeyName);
165
-
166
- // create temporary credential right before COPY operation because
167
- // it has timeout.
168
- // TODO skip this step if iamReaderUserName is not set
169
- BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
170
-
171
205
  long startTime = System.currentTimeMillis();
172
- con.runCopy(buildCopySQL(creds));
206
+ s3.putObject(s3BucketName, s3KeyName, file);
173
207
  double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
174
208
 
175
- logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
209
+ logger.info(String.format("Uploaded file %s (%.2f seconds)", s3KeyName, seconds));
210
+ } finally {
211
+ file.delete();
212
+ }
213
+
214
+ return null;
215
+ }
216
+ }
176
217
 
218
+ private class CopyTask implements Callable<Void>
219
+ {
220
+ private final Future<Void> uploadFuture;
221
+ private final String s3KeyName;
222
+
223
+ public CopyTask(Future<Void> uploadFuture, String s3KeyName)
224
+ {
225
+ this.uploadFuture = uploadFuture;
226
+ this.s3KeyName = s3KeyName;
227
+ }
228
+
229
+ public Void call() throws SQLException, InterruptedException, ExecutionException {
230
+ try {
231
+ uploadFuture.get();
232
+
233
+ RedshiftOutputConnection con = connector.connect(true);
234
+ try {
235
+ logger.info("Running COPY from file {}", s3KeyName);
236
+
237
+ // create temporary credential right before COPY operation because
238
+ // it has timeout.
239
+ BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
240
+
241
+ long startTime = System.currentTimeMillis();
242
+ con.runCopy(buildCopySQL(creds));
243
+ double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
244
+
245
+ logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
246
+
247
+ } finally {
248
+ con.close();
249
+ }
177
250
  } finally {
178
- con.close();
251
+ s3.deleteObject(s3BucketName, s3KeyName);
179
252
  }
180
253
 
181
254
  return null;
182
255
  }
183
256
 
257
+
258
+
184
259
  private String buildCopySQL(BasicSessionCredentials creds)
185
260
  {
186
261
  StringBuilder sb = new StringBuilder();
@@ -194,25 +269,13 @@ public class RedshiftCopyBatchInsert
194
269
  sb.append(creds.getAWSAccessKeyId());
195
270
  sb.append(";aws_secret_access_key=");
196
271
  sb.append(creds.getAWSSecretKey());
197
- sb.append(";token=");
198
- sb.append(creds.getSessionToken());
272
+ if (creds.getSessionToken() != null) {
273
+ sb.append(";token=");
274
+ sb.append(creds.getSessionToken());
275
+ }
199
276
  sb.append("' ");
200
277
  sb.append(COPY_AFTER_FROM);
201
278
  return sb.toString();
202
279
  }
203
280
  }
204
-
205
- private static class DeleteFileFinalizer implements Closeable
206
- {
207
- private File file;
208
-
209
- public DeleteFileFinalizer(File file) {
210
- this.file = file;
211
- }
212
-
213
- @Override
214
- public void close() throws IOException {
215
- file.delete();
216
- }
217
- }
218
281
  }
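
Taken together, the `RedshiftCopyBatchInsert` changes above replace the single-threaded `UploadAndCopyTask` with a pipelined pair of tasks: every flushed batch file is uploaded to S3 on a background thread, a matching COPY task is queued right behind it and blocks on that upload's `Future`, and `finish()` drains all futures so that any upload or COPY failure is rethrown. A minimal sketch of that pattern, using hypothetical names (`PipelineSketch`, `flushBatch`) rather than the plugin's own classes:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

class PipelineSketch
{
    private final ExecutorService executor = Executors.newCachedThreadPool();
    private final List<Future<Void>> futures = new ArrayList<Future<Void>>();

    // Called once per flushed batch file.
    void flushBatch(final Runnable upload, final Runnable copy)
    {
        // 1. Upload the gzipped batch file to S3 in the background.
        final Future<Void> uploadFuture = executor.submit(new Callable<Void>() {
            public Void call() { upload.run(); return null; }
        });
        futures.add(uploadFuture);

        // 2. Run COPY only after the matching upload has finished; other
        //    uploads keep running concurrently in the meantime.
        futures.add(executor.submit(new Callable<Void>() {
            public Void call() throws Exception {
                uploadFuture.get();
                copy.run();
                return null;
            }
        }));
    }

    // Called once at the end of the transaction: surfaces any failure.
    void finish() throws ExecutionException, InterruptedException
    {
        for (Future<Void> f : futures) {
            f.get();
        }
        executor.shutdownNow();   // the plugin shuts the pool down in close()
    }
}
```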
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-redshift
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-15 00:00:00.000000000 Z
11
+ date: 2016-03-29 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Inserts or updates records to a table.
14
14
  email:
@@ -30,9 +30,9 @@ files:
30
30
  - classpath/aws-java-sdk-sts-1.10.33.jar
31
31
  - classpath/commons-codec-1.6.jar
32
32
  - classpath/commons-logging-1.1.3.jar
33
- - classpath/embulk-output-jdbc-0.5.0.jar
34
- - classpath/embulk-output-postgresql-0.5.0.jar
35
- - classpath/embulk-output-redshift-0.5.0.jar
33
+ - classpath/embulk-output-jdbc-0.5.1.jar
34
+ - classpath/embulk-output-postgresql-0.5.1.jar
35
+ - classpath/embulk-output-redshift-0.5.1.jar
36
36
  - classpath/httpclient-4.3.6.jar
37
37
  - classpath/httpcore-4.3.3.jar
38
38
  - classpath/postgresql-9.4-1205-jdbc41.jar