embulk-output-redshift 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 58358c6f921e03a15a58929b8241a40cc97f5ff1
4
- data.tar.gz: a7b29d7fdd1247da37dac5687c132eef280e6b30
3
+ metadata.gz: f739710245a2663409cf49beb31c8e18cb148684
4
+ data.tar.gz: fe5268568d22eed5d8fc40e9117684ba5989a715
5
5
  SHA512:
6
- metadata.gz: e2f87d3cc3f9a1d5aa1c94b8a9b00fabc8deef097a2c9e6b8da7d3ddf81a3a5907094ef22ec675f49d4567d274a848ce6161424f9ba5edbbd885d98a9a6940ec
7
- data.tar.gz: 37db160a1993e53804837b379530ffb746f9c77e535e4d5f6f723a060e0135be1854e8c85dde3ea7ba23351791cf9ac4a1d704c8631fbdf2674fd5bd2ebf203b
6
+ metadata.gz: 9d8808f711394ed62b840faa26d1472f871e7c7e71b069b9279a5342b1580b9d496c8d1103984f97e942a715d16f91bd402a2baed03c3dc68625d0727761064b
7
+ data.tar.gz: a65ba3a389a4f3e80cc179c18d5ed0d8e0e62a550776dd73b0888ae4b2832ccd96f9b3e680d202723ab667d4104aaa62af25dc7e9e51699634dc767d0f578701
data/README.md CHANGED
@@ -1,47 +1,97 @@
1
- # Redshift output plugins for Embulk
2
-
3
- Redshift output plugins for Embulk loads records to Redshift.
4
-
5
- ## Overview
6
-
7
- * **Plugin type**: output
8
- * **Load all or nothing**: depnds on the mode:
9
- * **insert**: no
10
- * **replace**: yes
11
- * **Resume supported**: no
12
-
13
- ## Configuration
14
-
15
- - **host**: database host name (string, required)
16
- - **port**: database port number (integer, default: 5439)
17
- - **user**: database login user name (string, required)
18
- - **password**: database login password (string, default: "")
19
- - **database**: destination database name (string, required)
20
- - **schema**: destination schema name (string, default: "public")
21
- - **table**: destination table name (string, required)
22
- - **mode**: "replace" or "insert" (string, required)
23
- - **batch_size**: size of a single batch insert (integer, default: 16777216)
24
- - **options**: extra connection properties (hash, default: {})
25
-
26
- ### Example
27
-
28
- ```yaml
29
- out:
30
- type: redshift
31
- host: myinstance.us-west-2.redshift.amazonaws.com
32
- user: pg
33
- password: ""
34
- database: my_database
35
- table: my_table
36
- access_key_id: ABCXYZ123ABCXYZ123
37
- secret_access_key: AbCxYz123aBcXyZ123
38
- s3_bucket: my-redshift-transfer-bucket
39
- iam_user_name: my-s3-read-only
40
- mode: insert
41
- ```
42
-
43
- ### Build
44
-
45
- ```
46
- $ ./gradlew gem
47
- ```
1
+ # Redshift output plugins for Embulk
2
+
3
+ Redshift output plugins for Embulk loads records to Redshift.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: output
8
+ * **Load all or nothing**: depends on the mode. See below.
9
+ * **Resume supported**: depends on the mode. See below.
10
+
11
+ ## Configuration
12
+
13
+ - **host**: database host name (string, required)
14
+ - **port**: database port number (integer, default: 5439)
15
+ - **user**: database login user name (string, required)
16
+ - **password**: database login password (string, default: "")
17
+ - **database**: destination database name (string, required)
18
+ - **schema**: destination schema name (string, default: "public")
19
+ - **table**: destination table name (string, required)
20
+ - **options**: extra connection properties (hash, default: {})
21
+ - **mode**: "replace" or "insert" (string, required)
22
+ - **batch_size**: size of a single batch insert (integer, default: 16777216)
23
+ - **default_timezone**: If input column type (embulk type) is timestamp and destination column type is `string` or `nstring`, this plugin needs to format the timestamp into a string. This default_timezone option is used to control the timezone. You can overwrite the timezone for each column using the column_options option. (string, default: `UTC`)
24
+ - **column_options**: advanced: a key-value pairs where key is a column name and value is options for the column.
25
+ - **type**: type of a column when this plugin creates new tables (e.g. `VARCHAR(255)`, `INTEGER NOT NULL UNIQUE`). This is used when this plugin creates intermediate tables (insert, truncate_insert and merge modes), when it creates the target table (insert_direct and replace modes), and when it creates a nonexistent target table automatically. (string, default: depends on input column type. `BIGINT` if input column type is long, `BOOLEAN` if boolean, `DOUBLE PRECISION` if double, `CLOB` if string, `TIMESTAMP` if timestamp)
26
+ - **value_type**: This plugin converts input column type (embulk type) into a database type to build an INSERT statement. This value_type option controls the type of the value in an INSERT statement. (string, default: depends on input column type. Available values are: `byte`, `short`, `int`, `long`, `double`, `float`, `boolean`, `string`, `nstring`, `date`, `time`, `timestamp`, `decimal`, `null`, `pass`)
27
+ - **timestamp_format**: If input column type (embulk type) is timestamp and value_type is `string` or `nstring`, this plugin needs to format the timestamp value into a string. This timestamp_format option is used to control the format of the timestamp. (string, default: `%Y-%m-%d %H:%M:%S.%6N`)
28
+ - **timezone**: If input column type (embulk type) is timestamp and value_type is `string` or `nstring`, this plugin needs to format the timestamp value into a string. And if the input column type is timestamp and value_type is `date`, this plugin needs to consider timezone. In those cases, this timezone option is used to control the timezone. (string, value of default_timezone option is used by default)
29
+
30
+ ### Modes
31
+
32
+ * **insert**:
33
+ * Behavior: This mode writes rows to some intermediate tables first. If all those tasks run correctly, runs `INSERT INTO <target_table> SELECT * FROM <intermediate_table_1> UNION ALL SELECT * FROM <intermediate_table_2> UNION ALL ...` query.
34
+ * Transactional: Yes. This mode successfully writes all rows, or fails with writing zero rows.
35
+ * Resumable: Yes.
36
+ * **insert_direct**:
37
+ * Behavior: This mode inserts rows to the target table directly.
38
+ * Transactional: No. If fails, the target table could have some rows inserted.
39
+ * Resumable: No.
40
+ * **truncate_insert**:
41
+ * Behavior: Same with `insert` mode excepting that it truncates the target table right before the last `INSERT ...` query.
42
+ * Transactional: Yes.
43
+ * Resumable: Yes.
44
+ * **merge**:
45
+ * Behavior: This mode writes rows to some intermediate tables first. If all those tasks run correctly, runs `INSERT INTO <target_table> SELECT * FROM <intermediate_table_1> UNION ALL SELECT * FROM <intermediate_table_2> UNION ALL ... ON DUPLICATE KEY UPDATE ...` query.
46
+ * Transactional: Yes.
47
+ * Resumable: Yes.
48
+ * **replace**:
49
+ * Behavior: Same with `insert` mode excepting that it truncates the target table right before the last `INSERT ...` query.
50
+ * Transactional: Yes.
51
+ * Resumable: No.
52
+
53
+ ### Example
54
+
55
+ ```yaml
56
+ out:
57
+ type: redshift
58
+ host: myinstance.us-west-2.redshift.amazonaws.com
59
+ user: pg
60
+ password: ""
61
+ database: my_database
62
+ table: my_table
63
+ access_key_id: ABCXYZ123ABCXYZ123
64
+ secret_access_key: AbCxYz123aBcXyZ123
65
+ s3_bucket: my-redshift-transfer-bucket
66
+ iam_user_name: my-s3-read-only
67
+ mode: insert
68
+ ```
69
+
70
+ Advanced configuration:
71
+
72
+ ```yaml
73
+ out:
74
+ type: redshift
75
+ host: myinstance.us-west-2.redshift.amazonaws.com
76
+ user: pg
77
+ password: ""
78
+ database: my_database
79
+ table: my_table
80
+ access_key_id: ABCXYZ123ABCXYZ123
81
+ secret_access_key: AbCxYz123aBcXyZ123
82
+ s3_bucket: my-redshift-transfer-bucket
83
+ iam_user_name: my-s3-read-only
84
+ options: {loglevel: 2}
85
+ mode: insert_direct
86
+ column_options:
87
+ my_col_1: {type: 'VARCHAR(255)'}
88
+ my_col_3: {type: 'INT NOT NULL'}
89
+ my_col_4: {value_type: string, timestamp_format: '%Y-%m-%d %H:%M:%S %z', timezone: '-0700'}
90
+ my_col_5: {type: 'DECIMAL(18,9)', value_type: pass}
91
+ ```
92
+
93
+ ### Build
94
+
95
+ ```
96
+ $ ./gradlew gem
97
+ ```
data/build.gradle CHANGED
@@ -1,9 +1,9 @@
1
- dependencies {
2
- compile project(':embulk-output-jdbc')
3
- compile project(':embulk-output-postgresql')
4
-
5
- compile "com.amazonaws:aws-java-sdk-s3:1.9.17"
6
- compile "com.amazonaws:aws-java-sdk-sts:1.9.17"
7
-
8
- testCompile project(':embulk-output-jdbc').sourceSets.test.output
9
- }
1
+ dependencies {
2
+ compile project(':embulk-output-jdbc')
3
+ compile project(':embulk-output-postgresql')
4
+
5
+ compile "com.amazonaws:aws-java-sdk-s3:1.9.17"
6
+ compile "com.amazonaws:aws-java-sdk-sts:1.9.17"
7
+
8
+ testCompile project(':embulk-output-jdbc').sourceSets.test.output
9
+ }
@@ -1,3 +1,3 @@
1
- Embulk::JavaPlugin.register_output(
2
- :redshift, "org.embulk.output.RedshiftOutputPlugin",
3
- File.expand_path('../../../../classpath', __FILE__))
1
+ Embulk::JavaPlugin.register_output(
2
+ :redshift, "org.embulk.output.RedshiftOutputPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -1,10 +1,14 @@
1
1
  package org.embulk.output;
2
2
 
3
+ import java.util.List;
3
4
  import java.util.Properties;
4
5
  import java.io.IOException;
5
6
  import java.sql.SQLException;
6
7
  import org.slf4j.Logger;
8
+ import com.google.common.base.Optional;
9
+ import com.google.common.collect.ImmutableSet;
7
10
  import com.amazonaws.auth.AWSCredentials;
11
+ import com.amazonaws.auth.AWSCredentialsProvider;
8
12
  import com.amazonaws.auth.BasicAWSCredentials;
9
13
  import org.embulk.spi.Exec;
10
14
  import org.embulk.config.Config;
@@ -61,6 +65,15 @@ public class RedshiftOutputPlugin
61
65
  return RedshiftPluginTask.class;
62
66
  }
63
67
 
68
+ @Override
69
+ protected Features getFeatures(PluginTask task)
70
+ {
71
+ return new Features()
72
+ .setMaxTableNameLength(30)
73
+ .setSupportedModes(ImmutableSet.of(Mode.INSERT, Mode.INSERT_DIRECT, Mode.MERGE, Mode.TRUNCATE_INSERT, Mode.REPLACE))
74
+ .setIgnoreMergeKeys(false);
75
+ }
76
+
64
77
  @Override
65
78
  protected RedshiftOutputConnector getConnector(PluginTask task, boolean retryableMetadataOperation)
66
79
  {
@@ -70,8 +83,6 @@ public class RedshiftOutputPlugin
70
83
  t.getHost(), t.getPort(), t.getDatabase());
71
84
 
72
85
  Properties props = new Properties();
73
- props.setProperty("user", t.getUser());
74
- props.setProperty("password", t.getPassword());
75
86
  props.setProperty("loginTimeout", "300"); // seconds
76
87
  props.setProperty("socketTimeout", "1800"); // seconds
77
88
 
@@ -98,19 +109,39 @@ public class RedshiftOutputPlugin
98
109
 
99
110
  props.putAll(t.getOptions());
100
111
 
112
+ props.setProperty("user", t.getUser());
113
+ logger.info("Connecting to {} options {}", url, props);
114
+ props.setProperty("password", t.getPassword());
115
+
101
116
  return new RedshiftOutputConnector(url, props, t.getSchema());
102
117
  }
103
118
 
119
+ private static AWSCredentialsProvider getAWSCredentialsProvider(RedshiftPluginTask task)
120
+ {
121
+ final AWSCredentials creds = new BasicAWSCredentials(
122
+ task.getAccessKeyId(), task.getSecretAccessKey());
123
+ return new AWSCredentialsProvider() {
124
+ @Override
125
+ public AWSCredentials getCredentials()
126
+ {
127
+ return creds;
128
+ }
129
+
130
+ @Override
131
+ public void refresh()
132
+ {
133
+ }
134
+ };
135
+ }
136
+
104
137
  @Override
105
- protected BatchInsert newBatchInsert(PluginTask task) throws IOException, SQLException
138
+ protected BatchInsert newBatchInsert(PluginTask task, Optional<List<String>> mergeKeys) throws IOException, SQLException
106
139
  {
107
- if (task.getMode().isMerge()) {
108
- throw new UnsupportedOperationException("mode 'merge' is not implemented for this type");
140
+ if (mergeKeys.isPresent()) {
141
+ throw new UnsupportedOperationException("Redshift output plugin doesn't support 'merge_direct' mode. Use 'merge' mode instead.");
109
142
  }
110
143
  RedshiftPluginTask t = (RedshiftPluginTask) task;
111
- AWSCredentials creds = new BasicAWSCredentials(
112
- t.getAccessKeyId(), t.getSecretAccessKey());
113
144
  return new RedshiftCopyBatchInsert(getConnector(task, true),
114
- creds, t.getS3Bucket(), t.getIamUserName());
145
+ getAWSCredentialsProvider(t), t.getS3Bucket(), t.getIamUserName());
115
146
  }
116
147
  }
@@ -1,216 +1,214 @@
1
- package org.embulk.output.redshift;
2
-
3
- import java.util.zip.GZIPOutputStream;
4
- import java.util.concurrent.Callable;
5
- import java.util.UUID;
6
- import java.io.File;
7
- import java.io.IOException;
8
- import java.io.FileOutputStream;
9
- import java.io.OutputStreamWriter;
10
- import java.io.Closeable;
11
- import java.io.Writer;
12
- import java.io.BufferedWriter;
13
- import java.sql.Connection;
14
- import java.sql.SQLException;
15
- import com.amazonaws.auth.AWSCredentials;
16
- import com.amazonaws.auth.BasicSessionCredentials;
17
- import com.amazonaws.auth.policy.Policy;
18
- import com.amazonaws.auth.policy.Resource;
19
- import com.amazonaws.auth.policy.Statement;
20
- import com.amazonaws.auth.policy.Statement.Effect;
21
- import com.amazonaws.auth.policy.actions.S3Actions;
22
- import com.amazonaws.services.s3.AmazonS3Client;
23
- import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
24
- import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;
25
- import com.amazonaws.services.securitytoken.model.GetFederationTokenResult;
26
- import com.amazonaws.services.securitytoken.model.Credentials;
27
- import org.slf4j.Logger;
28
- import org.embulk.spi.Exec;
29
- import org.embulk.output.jdbc.JdbcSchema;
30
- import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
31
-
32
- public class RedshiftCopyBatchInsert
33
- extends AbstractPostgreSQLCopyBatchInsert
34
- {
35
- private final Logger logger = Exec.getLogger(RedshiftCopyBatchInsert.class);
36
- private final RedshiftOutputConnector connector;
37
- private final AWSCredentials awsCredentials;
38
- private final String s3BucketName;
39
- private final String iamReaderUserName;
40
- private final AmazonS3Client s3;
41
- private final AWSSecurityTokenServiceClient sts;
42
-
43
- private RedshiftOutputConnection connection = null;
44
- private String copySqlBeforeFrom = null;
45
- private long totalRows;
46
- private int fileCount;
47
-
48
- public static final String COPY_AFTER_FROM = "GZIP DELIMITER '\\t' NULL '\\N' ESCAPE TRUNCATECOLUMNS ACCEPTINVCHARS STATUPDATE OFF COMPUPDATE OFF";
49
-
50
- public RedshiftCopyBatchInsert(RedshiftOutputConnector connector,
51
- AWSCredentials awsCredentials, String s3BucketName,
52
- String iamReaderUserName) throws IOException, SQLException
53
- {
54
- super();
55
- this.connector = connector;
56
- this.awsCredentials = awsCredentials;
57
- this.s3BucketName = s3BucketName;
58
- this.iamReaderUserName = iamReaderUserName;
59
- this.s3 = new AmazonS3Client(awsCredentials); // TODO options
60
- this.sts = new AWSSecurityTokenServiceClient(awsCredentials); // options
61
- }
62
-
63
- @Override
64
- public void prepare(String loadTable, JdbcSchema insertSchema) throws SQLException
65
- {
66
- this.connection = connector.connect(true);
67
- this.copySqlBeforeFrom = connection.buildCopySQLBeforeFrom(loadTable, insertSchema);
68
- logger.info("Copy SQL: "+copySqlBeforeFrom+" ? "+COPY_AFTER_FROM);
69
- }
70
-
71
- @Override
72
- protected BufferedWriter openWriter(File newFile) throws IOException
73
- {
74
- // Redshift supports gzip
75
- return new BufferedWriter(
76
- new OutputStreamWriter(
77
- new GZIPOutputStream(new FileOutputStream(newFile)),
78
- FILE_CHARSET)
79
- );
80
- }
81
-
82
- @Override
83
- public void flush() throws IOException, SQLException
84
- {
85
- File file = closeCurrentFile(); // flush buffered data in writer
86
-
87
- // TODO multi-threading
88
- new UploadAndCopyTask(file, batchRows, UUID.randomUUID().toString()).call();
89
- new DeleteFileFinalizer(file).close();
90
-
91
- fileCount++;
92
- totalRows += batchRows;
93
- batchRows = 0;
94
-
95
- openNewFile();
96
- file.delete();
97
- }
98
-
99
- @Override
100
- public void finish() throws IOException, SQLException
101
- {
102
- super.finish();
103
- logger.info("Loaded {} files.", fileCount);
104
- }
105
-
106
- @Override
107
- public void close() throws IOException, SQLException
108
- {
109
- s3.shutdown();
110
- closeCurrentFile().delete();
111
- if (connection != null) {
112
- connection.close();
113
- connection = null;
114
- }
115
- }
116
-
117
- private BasicSessionCredentials generateReaderSessionCredentials(String s3KeyName)
118
- {
119
- Policy policy = new Policy()
120
- .withStatements(
121
- new Statement(Effect.Allow)
122
- .withActions(S3Actions.ListObjects)
123
- .withResources(new Resource("arn:aws:s3:::"+s3BucketName)),
124
- new Statement(Effect.Allow)
125
- .withActions(S3Actions.GetObject)
126
- .withResources(new Resource("arn:aws:s3:::"+s3BucketName+"/"+s3KeyName)) // TODO encode file name using percent encoding
127
- );
128
- GetFederationTokenRequest req = new GetFederationTokenRequest();
129
- req.setDurationSeconds(86400); // 3600 - 129600
130
- req.setName(iamReaderUserName);
131
- req.setPolicy(policy.toJson());
132
-
133
- GetFederationTokenResult res = sts.getFederationToken(req);
134
- Credentials c = res.getCredentials();
135
-
136
- return new BasicSessionCredentials(
137
- c.getAccessKeyId(),
138
- c.getSecretAccessKey(),
139
- c.getSessionToken());
140
- }
141
-
142
- private class UploadAndCopyTask implements Callable<Void>
143
- {
144
- private final File file;
145
- private final int batchRows;
146
- private final String s3KeyName;
147
-
148
- public UploadAndCopyTask(File file, int batchRows, String s3KeyName)
149
- {
150
- this.file = file;
151
- this.batchRows = batchRows;
152
- this.s3KeyName = s3KeyName;
153
- }
154
-
155
- public Void call() throws SQLException {
156
- logger.info(String.format("Uploading file id %s to S3 (%,d bytes %,d rows)",
157
- s3KeyName, file.length(), batchRows));
158
- s3.putObject(s3BucketName, s3KeyName, file);
159
-
160
- RedshiftOutputConnection con = connector.connect(true);
161
- try {
162
- logger.info("Running COPY from file {}", s3KeyName);
163
-
164
- // create temporary credential right before COPY operation because
165
- // it has timeout.
166
- // TODO skip this step if iamReaderUserName is not set
167
- BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
168
-
169
- long startTime = System.currentTimeMillis();
170
- con.runCopy(buildCopySQL(creds));
171
- double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
172
-
173
- logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
174
-
175
- } finally {
176
- con.close();
177
- }
178
-
179
- return null;
180
- }
181
-
182
- private String buildCopySQL(BasicSessionCredentials creds)
183
- {
184
- StringBuilder sb = new StringBuilder();
185
- sb.append(copySqlBeforeFrom);
186
- sb.append(" FROM 's3://");
187
- sb.append(s3BucketName);
188
- sb.append("/");
189
- sb.append(s3KeyName);
190
- sb.append("' CREDENTIALS '");
191
- sb.append("aws_access_key_id=");
192
- sb.append(creds.getAWSAccessKeyId());
193
- sb.append(";aws_secret_access_key=");
194
- sb.append(creds.getAWSSecretKey());
195
- sb.append(";token=");
196
- sb.append(creds.getSessionToken());
197
- sb.append("' ");
198
- sb.append(COPY_AFTER_FROM);
199
- return sb.toString();
200
- }
201
- }
202
-
203
- private static class DeleteFileFinalizer implements Closeable
204
- {
205
- private File file;
206
-
207
- public DeleteFileFinalizer(File file) {
208
- this.file = file;
209
- }
210
-
211
- @Override
212
- public void close() throws IOException {
213
- file.delete();
214
- }
215
- }
216
- }
1
+ package org.embulk.output.redshift;
2
+
3
+ import java.util.zip.GZIPOutputStream;
4
+ import java.util.concurrent.Callable;
5
+ import java.util.UUID;
6
+ import java.io.File;
7
+ import java.io.IOException;
8
+ import java.io.FileOutputStream;
9
+ import java.io.OutputStreamWriter;
10
+ import java.io.Closeable;
11
+ import java.io.Writer;
12
+ import java.io.BufferedWriter;
13
+ import java.sql.Connection;
14
+ import java.sql.SQLException;
15
+ import com.amazonaws.auth.AWSCredentialsProvider;
16
+ import com.amazonaws.auth.BasicSessionCredentials;
17
+ import com.amazonaws.auth.policy.Policy;
18
+ import com.amazonaws.auth.policy.Resource;
19
+ import com.amazonaws.auth.policy.Statement;
20
+ import com.amazonaws.auth.policy.Statement.Effect;
21
+ import com.amazonaws.auth.policy.actions.S3Actions;
22
+ import com.amazonaws.services.s3.AmazonS3Client;
23
+ import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
24
+ import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;
25
+ import com.amazonaws.services.securitytoken.model.GetFederationTokenResult;
26
+ import com.amazonaws.services.securitytoken.model.Credentials;
27
+ import org.slf4j.Logger;
28
+ import org.embulk.spi.Exec;
29
+ import org.embulk.output.jdbc.JdbcSchema;
30
+ import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
31
+
32
+ public class RedshiftCopyBatchInsert
33
+ extends AbstractPostgreSQLCopyBatchInsert
34
+ {
35
+ private final Logger logger = Exec.getLogger(RedshiftCopyBatchInsert.class);
36
+ private final RedshiftOutputConnector connector;
37
+ private final String s3BucketName;
38
+ private final String iamReaderUserName;
39
+ private final AmazonS3Client s3;
40
+ private final AWSSecurityTokenServiceClient sts;
41
+
42
+ private RedshiftOutputConnection connection = null;
43
+ private String copySqlBeforeFrom = null;
44
+ private long totalRows;
45
+ private int fileCount;
46
+
47
+ public static final String COPY_AFTER_FROM = "GZIP DELIMITER '\\t' NULL '\\N' ESCAPE TRUNCATECOLUMNS ACCEPTINVCHARS STATUPDATE OFF COMPUPDATE OFF";
48
+
49
+ public RedshiftCopyBatchInsert(RedshiftOutputConnector connector,
50
+ AWSCredentialsProvider credentialsProvider, String s3BucketName,
51
+ String iamReaderUserName) throws IOException, SQLException
52
+ {
53
+ super();
54
+ this.connector = connector;
55
+ this.s3BucketName = s3BucketName;
56
+ this.iamReaderUserName = iamReaderUserName;
57
+ this.s3 = new AmazonS3Client(credentialsProvider); // TODO options
58
+ this.sts = new AWSSecurityTokenServiceClient(credentialsProvider); // options
59
+ }
60
+
61
+ @Override
62
+ public void prepare(String loadTable, JdbcSchema insertSchema) throws SQLException
63
+ {
64
+ this.connection = connector.connect(true);
65
+ this.copySqlBeforeFrom = connection.buildCopySQLBeforeFrom(loadTable, insertSchema);
66
+ logger.info("Copy SQL: "+copySqlBeforeFrom+" ? "+COPY_AFTER_FROM);
67
+ }
68
+
69
+ @Override
70
+ protected BufferedWriter openWriter(File newFile) throws IOException
71
+ {
72
+ // Redshift supports gzip
73
+ return new BufferedWriter(
74
+ new OutputStreamWriter(
75
+ new GZIPOutputStream(new FileOutputStream(newFile)),
76
+ FILE_CHARSET)
77
+ );
78
+ }
79
+
80
+ @Override
81
+ public void flush() throws IOException, SQLException
82
+ {
83
+ File file = closeCurrentFile(); // flush buffered data in writer
84
+
85
+ // TODO multi-threading
86
+ new UploadAndCopyTask(file, batchRows, UUID.randomUUID().toString()).call();
87
+ new DeleteFileFinalizer(file).close();
88
+
89
+ fileCount++;
90
+ totalRows += batchRows;
91
+ batchRows = 0;
92
+
93
+ openNewFile();
94
+ file.delete();
95
+ }
96
+
97
+ @Override
98
+ public void finish() throws IOException, SQLException
99
+ {
100
+ super.finish();
101
+ logger.info("Loaded {} files.", fileCount);
102
+ }
103
+
104
+ @Override
105
+ public void close() throws IOException, SQLException
106
+ {
107
+ s3.shutdown();
108
+ closeCurrentFile().delete();
109
+ if (connection != null) {
110
+ connection.close();
111
+ connection = null;
112
+ }
113
+ }
114
+
115
+ private BasicSessionCredentials generateReaderSessionCredentials(String s3KeyName)
116
+ {
117
+ Policy policy = new Policy()
118
+ .withStatements(
119
+ new Statement(Effect.Allow)
120
+ .withActions(S3Actions.ListObjects)
121
+ .withResources(new Resource("arn:aws:s3:::"+s3BucketName)),
122
+ new Statement(Effect.Allow)
123
+ .withActions(S3Actions.GetObject)
124
+ .withResources(new Resource("arn:aws:s3:::"+s3BucketName+"/"+s3KeyName)) // TODO encode file name using percent encoding
125
+ );
126
+ GetFederationTokenRequest req = new GetFederationTokenRequest();
127
+ req.setDurationSeconds(86400); // 3600 - 129600
128
+ req.setName(iamReaderUserName);
129
+ req.setPolicy(policy.toJson());
130
+
131
+ GetFederationTokenResult res = sts.getFederationToken(req);
132
+ Credentials c = res.getCredentials();
133
+
134
+ return new BasicSessionCredentials(
135
+ c.getAccessKeyId(),
136
+ c.getSecretAccessKey(),
137
+ c.getSessionToken());
138
+ }
139
+
140
+ private class UploadAndCopyTask implements Callable<Void>
141
+ {
142
+ private final File file;
143
+ private final int batchRows;
144
+ private final String s3KeyName;
145
+
146
+ public UploadAndCopyTask(File file, int batchRows, String s3KeyName)
147
+ {
148
+ this.file = file;
149
+ this.batchRows = batchRows;
150
+ this.s3KeyName = s3KeyName;
151
+ }
152
+
153
+ public Void call() throws SQLException {
154
+ logger.info(String.format("Uploading file id %s to S3 (%,d bytes %,d rows)",
155
+ s3KeyName, file.length(), batchRows));
156
+ s3.putObject(s3BucketName, s3KeyName, file);
157
+
158
+ RedshiftOutputConnection con = connector.connect(true);
159
+ try {
160
+ logger.info("Running COPY from file {}", s3KeyName);
161
+
162
+ // create temporary credential right before COPY operation because
163
+ // it has timeout.
164
+ // TODO skip this step if iamReaderUserName is not set
165
+ BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
166
+
167
+ long startTime = System.currentTimeMillis();
168
+ con.runCopy(buildCopySQL(creds));
169
+ double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
170
+
171
+ logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
172
+
173
+ } finally {
174
+ con.close();
175
+ }
176
+
177
+ return null;
178
+ }
179
+
180
+ private String buildCopySQL(BasicSessionCredentials creds)
181
+ {
182
+ StringBuilder sb = new StringBuilder();
183
+ sb.append(copySqlBeforeFrom);
184
+ sb.append(" FROM 's3://");
185
+ sb.append(s3BucketName);
186
+ sb.append("/");
187
+ sb.append(s3KeyName);
188
+ sb.append("' CREDENTIALS '");
189
+ sb.append("aws_access_key_id=");
190
+ sb.append(creds.getAWSAccessKeyId());
191
+ sb.append(";aws_secret_access_key=");
192
+ sb.append(creds.getAWSSecretKey());
193
+ sb.append(";token=");
194
+ sb.append(creds.getSessionToken());
195
+ sb.append("' ");
196
+ sb.append(COPY_AFTER_FROM);
197
+ return sb.toString();
198
+ }
199
+ }
200
+
201
+ private static class DeleteFileFinalizer implements Closeable
202
+ {
203
+ private File file;
204
+
205
+ public DeleteFileFinalizer(File file) {
206
+ this.file = file;
207
+ }
208
+
209
+ @Override
210
+ public void close() throws IOException {
211
+ file.delete();
212
+ }
213
+ }
214
+ }
@@ -1,122 +1,122 @@
1
- package org.embulk.output.redshift;
2
-
3
- import java.sql.Connection;
4
- import java.sql.SQLException;
5
- import java.sql.Statement;
6
- import org.slf4j.Logger;
7
- import org.embulk.spi.Exec;
8
- import org.embulk.output.jdbc.JdbcOutputConnection;
9
- import org.embulk.output.jdbc.JdbcColumn;
10
- import org.embulk.output.jdbc.JdbcSchema;
11
-
12
- public class RedshiftOutputConnection
13
- extends JdbcOutputConnection
14
- {
15
- private final Logger logger = Exec.getLogger(RedshiftOutputConnection.class);
16
-
17
- public RedshiftOutputConnection(Connection connection, String schemaName, boolean autoCommit)
18
- throws SQLException
19
- {
20
- super(connection, schemaName);
21
- connection.setAutoCommit(autoCommit);
22
- }
23
-
24
- // Redshift does not support DROP TABLE IF EXISTS.
25
- // Here runs DROP TABLE and ignores errors.
26
- @Override
27
- public void dropTableIfExists(String tableName) throws SQLException
28
- {
29
- Statement stmt = connection.createStatement();
30
- try {
31
- String sql = String.format("DROP TABLE IF EXISTS %s", quoteIdentifierString(tableName));
32
- executeUpdate(stmt, sql);
33
- commitIfNecessary(connection);
34
- } catch (SQLException ex) {
35
- // ignore errors.
36
- // TODO here should ignore only 'table "XXX" does not exist' errors.
37
- SQLException ignored = safeRollback(connection, ex);
38
- } finally {
39
- stmt.close();
40
- }
41
- }
42
-
43
- // Redshift does not support DROP TABLE IF EXISTS.
44
- // Dropping part runs DROP TABLE and ignores errors.
45
- @Override
46
- public void replaceTable(String fromTable, JdbcSchema schema, String toTable) throws SQLException
47
- {
48
- Statement stmt = connection.createStatement();
49
- try {
50
- try {
51
- StringBuilder sb = new StringBuilder();
52
- sb.append("DROP TABLE ");
53
- quoteIdentifierString(sb, toTable);
54
- String sql = sb.toString();
55
- executeUpdate(stmt, sql);
56
- } catch (SQLException ex) {
57
- // ignore errors.
58
- // TODO here should ignore only 'table "XXX" does not exist' errors.
59
- // rollback or comimt is required to recover failed transaction
60
- SQLException ignored = safeRollback(connection, ex);
61
- }
62
-
63
- {
64
- StringBuilder sb = new StringBuilder();
65
- sb.append("ALTER TABLE ");
66
- quoteIdentifierString(sb, fromTable);
67
- sb.append(" RENAME TO ");
68
- quoteIdentifierString(sb, toTable);
69
- String sql = sb.toString();
70
- executeUpdate(stmt, sql);
71
- }
72
-
73
- commitIfNecessary(connection);
74
- } catch (SQLException ex) {
75
- throw safeRollback(connection, ex);
76
- } finally {
77
- stmt.close();
78
- }
79
- }
80
-
81
- @Override
82
- protected String convertTypeName(String typeName)
83
- {
84
- // Redshift does not support TEXT type.
85
- switch(typeName) {
86
- case "CLOB":
87
- return "VARCHAR(65535)";
88
- case "TEXT":
89
- return "VARCHAR(65535)";
90
- case "BLOB":
91
- return "BYTEA";
92
- default:
93
- return typeName;
94
- }
95
- }
96
-
97
- public String buildCopySQLBeforeFrom(String tableName, JdbcSchema tableSchema)
98
- {
99
- StringBuilder sb = new StringBuilder();
100
-
101
- sb.append("COPY ");
102
- quoteIdentifierString(sb, tableName);
103
- sb.append(" (");
104
- for(int i=0; i < tableSchema.getCount(); i++) {
105
- if(i != 0) { sb.append(", "); }
106
- quoteIdentifierString(sb, tableSchema.getColumnName(i));
107
- }
108
- sb.append(")");
109
-
110
- return sb.toString();
111
- }
112
-
113
- public void runCopy(String sql) throws SQLException
114
- {
115
- Statement stmt = connection.createStatement();
116
- try {
117
- stmt.executeUpdate(sql);
118
- } finally {
119
- stmt.close();
120
- }
121
- }
122
- }
1
+ package org.embulk.output.redshift;
2
+
3
+ import java.sql.Connection;
4
+ import java.sql.SQLException;
5
+ import java.sql.Statement;
6
+ import org.slf4j.Logger;
7
+ import org.embulk.spi.Exec;
8
+ import org.embulk.output.jdbc.JdbcOutputConnection;
9
+ import org.embulk.output.jdbc.JdbcColumn;
10
+ import org.embulk.output.jdbc.JdbcSchema;
11
+
12
+ public class RedshiftOutputConnection
13
+ extends JdbcOutputConnection
14
+ {
15
+ private final Logger logger = Exec.getLogger(RedshiftOutputConnection.class);
16
+
17
+ public RedshiftOutputConnection(Connection connection, String schemaName, boolean autoCommit)
18
+ throws SQLException
19
+ {
20
+ super(connection, schemaName);
21
+ connection.setAutoCommit(autoCommit);
22
+ }
23
+
24
+ // Redshift does not support DROP TABLE IF EXISTS.
25
+ // Here runs DROP TABLE and ignores errors.
26
+ @Override
27
+ public void dropTableIfExists(String tableName) throws SQLException
28
+ {
29
+ Statement stmt = connection.createStatement();
30
+ try {
31
+ String sql = String.format("DROP TABLE IF EXISTS %s", quoteIdentifierString(tableName));
32
+ executeUpdate(stmt, sql);
33
+ commitIfNecessary(connection);
34
+ } catch (SQLException ex) {
35
+ // ignore errors.
36
+ // TODO here should ignore only 'table "XXX" does not exist' errors.
37
+ SQLException ignored = safeRollback(connection, ex);
38
+ } finally {
39
+ stmt.close();
40
+ }
41
+ }
42
+
43
+ // Redshift does not support DROP TABLE IF EXISTS.
44
+ // Dropping part runs DROP TABLE and ignores errors.
45
+ @Override
46
+ public void replaceTable(String fromTable, JdbcSchema schema, String toTable) throws SQLException
47
+ {
48
+ Statement stmt = connection.createStatement();
49
+ try {
50
+ try {
51
+ StringBuilder sb = new StringBuilder();
52
+ sb.append("DROP TABLE ");
53
+ quoteIdentifierString(sb, toTable);
54
+ String sql = sb.toString();
55
+ executeUpdate(stmt, sql);
56
+ } catch (SQLException ex) {
57
+ // ignore errors.
58
+ // TODO here should ignore only 'table "XXX" does not exist' errors.
59
+ // rollback or comimt is required to recover failed transaction
60
+ SQLException ignored = safeRollback(connection, ex);
61
+ }
62
+
63
+ {
64
+ StringBuilder sb = new StringBuilder();
65
+ sb.append("ALTER TABLE ");
66
+ quoteIdentifierString(sb, fromTable);
67
+ sb.append(" RENAME TO ");
68
+ quoteIdentifierString(sb, toTable);
69
+ String sql = sb.toString();
70
+ executeUpdate(stmt, sql);
71
+ }
72
+
73
+ commitIfNecessary(connection);
74
+ } catch (SQLException ex) {
75
+ throw safeRollback(connection, ex);
76
+ } finally {
77
+ stmt.close();
78
+ }
79
+ }
80
+
81
+ @Override
82
+ protected String buildColumnTypeName(JdbcColumn c)
83
+ {
84
+ // Redshift does not support TEXT type.
85
+ switch(c.getSimpleTypeName()) {
86
+ case "CLOB":
87
+ return "VARCHAR(65535)";
88
+ case "TEXT":
89
+ return "VARCHAR(65535)";
90
+ case "BLOB":
91
+ return "BYTEA";
92
+ default:
93
+ return super.buildColumnTypeName(c);
94
+ }
95
+ }
96
+
97
+ public String buildCopySQLBeforeFrom(String tableName, JdbcSchema tableSchema)
98
+ {
99
+ StringBuilder sb = new StringBuilder();
100
+
101
+ sb.append("COPY ");
102
+ quoteIdentifierString(sb, tableName);
103
+ sb.append(" (");
104
+ for(int i=0; i < tableSchema.getCount(); i++) {
105
+ if(i != 0) { sb.append(", "); }
106
+ quoteIdentifierString(sb, tableSchema.getColumnName(i));
107
+ }
108
+ sb.append(")");
109
+
110
+ return sb.toString();
111
+ }
112
+
113
+ public void runCopy(String sql) throws SQLException
114
+ {
115
+ Statement stmt = connection.createStatement();
116
+ try {
117
+ stmt.executeUpdate(sql);
118
+ } finally {
119
+ stmt.close();
120
+ }
121
+ }
122
+ }
@@ -1,40 +1,40 @@
1
- package org.embulk.output.redshift;
2
-
3
- import java.util.Properties;
4
- import java.sql.Driver;
5
- import java.sql.Connection;
6
- import java.sql.SQLException;
7
- import org.embulk.output.jdbc.JdbcOutputConnector;
8
- import org.embulk.output.jdbc.JdbcOutputConnection;
9
-
10
- public class RedshiftOutputConnector
11
- implements JdbcOutputConnector
12
- {
13
- private static final Driver driver = new org.postgresql.Driver();
14
-
15
- private final String url;
16
- private final Properties properties;
17
- private final String schemaName;
18
-
19
- public RedshiftOutputConnector(String url, Properties properties, String schemaName)
20
- {
21
- this.url = url;
22
- this.properties = properties;
23
- this.schemaName = schemaName;
24
- }
25
-
26
- @Override
27
- public RedshiftOutputConnection connect(boolean autoCommit) throws SQLException
28
- {
29
- Connection c = driver.connect(url, properties);
30
- try {
31
- RedshiftOutputConnection con = new RedshiftOutputConnection(c, schemaName, autoCommit);
32
- c = null;
33
- return con;
34
- } finally {
35
- if (c != null) {
36
- c.close();
37
- }
38
- }
39
- }
40
- }
1
+ package org.embulk.output.redshift;
2
+
3
+ import java.util.Properties;
4
+ import java.sql.Driver;
5
+ import java.sql.Connection;
6
+ import java.sql.SQLException;
7
+ import org.embulk.output.jdbc.JdbcOutputConnector;
8
+ import org.embulk.output.jdbc.JdbcOutputConnection;
9
+
10
+ public class RedshiftOutputConnector
11
+ implements JdbcOutputConnector
12
+ {
13
+ private static final Driver driver = new org.postgresql.Driver();
14
+
15
+ private final String url;
16
+ private final Properties properties;
17
+ private final String schemaName;
18
+
19
+ public RedshiftOutputConnector(String url, Properties properties, String schemaName)
20
+ {
21
+ this.url = url;
22
+ this.properties = properties;
23
+ this.schemaName = schemaName;
24
+ }
25
+
26
+ @Override
27
+ public RedshiftOutputConnection connect(boolean autoCommit) throws SQLException
28
+ {
29
+ Connection c = driver.connect(url, properties);
30
+ try {
31
+ RedshiftOutputConnection con = new RedshiftOutputConnection(c, schemaName, autoCommit);
32
+ c = null;
33
+ return con;
34
+ } finally {
35
+ if (c != null) {
36
+ c.close();
37
+ }
38
+ }
39
+ }
40
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-redshift
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-12 00:00:00.000000000 Z
11
+ date: 2015-05-19 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Inserts or updates records to a table.
14
14
  email:
@@ -30,9 +30,9 @@ files:
30
30
  - classpath/aws-java-sdk-sts-1.9.17.jar
31
31
  - classpath/commons-codec-1.6.jar
32
32
  - classpath/commons-logging-1.1.3.jar
33
- - classpath/embulk-output-jdbc-0.2.4.jar
34
- - classpath/embulk-output-postgresql-0.2.4.jar
35
- - classpath/embulk-output-redshift-0.2.4.jar
33
+ - classpath/embulk-output-jdbc-0.3.0.jar
34
+ - classpath/embulk-output-postgresql-0.3.0.jar
35
+ - classpath/embulk-output-redshift-0.3.0.jar
36
36
  - classpath/httpclient-4.3.4.jar
37
37
  - classpath/httpcore-4.3.2.jar
38
38
  - classpath/jna-4.1.0.jar
Binary file