embulk-output-redshift 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 58358c6f921e03a15a58929b8241a40cc97f5ff1
-   data.tar.gz: a7b29d7fdd1247da37dac5687c132eef280e6b30
+   metadata.gz: f739710245a2663409cf49beb31c8e18cb148684
+   data.tar.gz: fe5268568d22eed5d8fc40e9117684ba5989a715
  SHA512:
-   metadata.gz: e2f87d3cc3f9a1d5aa1c94b8a9b00fabc8deef097a2c9e6b8da7d3ddf81a3a5907094ef22ec675f49d4567d274a848ce6161424f9ba5edbbd885d98a9a6940ec
-   data.tar.gz: 37db160a1993e53804837b379530ffb746f9c77e535e4d5f6f723a060e0135be1854e8c85dde3ea7ba23351791cf9ac4a1d704c8631fbdf2674fd5bd2ebf203b
+   metadata.gz: 9d8808f711394ed62b840faa26d1472f871e7c7e71b069b9279a5342b1580b9d496c8d1103984f97e942a715d16f91bd402a2baed03c3dc68625d0727761064b
+   data.tar.gz: a65ba3a389a4f3e80cc179c18d5ed0d8e0e62a550776dd73b0888ae4b2832ccd96f9b3e680d202723ab667d4104aaa62af25dc7e9e51699634dc767d0f578701
data/README.md CHANGED
@@ -1,47 +1,97 @@
- # Redshift output plugins for Embulk
-
- Redshift output plugins for Embulk loads records to Redshift.
-
- ## Overview
-
- * **Plugin type**: output
- * **Load all or nothing**: depnds on the mode:
-   * **insert**: no
-   * **replace**: yes
- * **Resume supported**: no
-
- ## Configuration
-
- - **host**: database host name (string, required)
- - **port**: database port number (integer, default: 5439)
- - **user**: database login user name (string, required)
- - **password**: database login password (string, default: "")
- - **database**: destination database name (string, required)
- - **schema**: destination schema name (string, default: "public")
- - **table**: destination table name (string, required)
- - **mode**: "replace" or "insert" (string, required)
- - **batch_size**: size of a single batch insert (integer, default: 16777216)
- - **options**: extra connection properties (hash, default: {})
-
- ### Example
-
- ```yaml
- out:
-   type: redshift
-   host: myinstance.us-west-2.redshift.amazonaws.com
-   user: pg
-   password: ""
-   database: my_database
-   table: my_table
-   access_key_id: ABCXYZ123ABCXYZ123
-   secret_access_key: AbCxYz123aBcXyZ123
-   s3_bucket: my-redshift-transfer-bucket
-   iam_user_name: my-s3-read-only
-   mode: insert
- ```
-
- ### Build
-
- ```
- $ ./gradlew gem
- ```
+ # Redshift output plugins for Embulk
+
+ Redshift output plugins for Embulk load records to Redshift.
+
+ ## Overview
+
+ * **Plugin type**: output
+ * **Load all or nothing**: depends on the mode. See below.
+ * **Resume supported**: depends on the mode. See below.
+
+ ## Configuration
+
+ - **host**: database host name (string, required)
+ - **port**: database port number (integer, default: 5439)
+ - **user**: database login user name (string, required)
+ - **password**: database login password (string, default: "")
+ - **database**: destination database name (string, required)
+ - **schema**: destination schema name (string, default: "public")
+ - **table**: destination table name (string, required)
+ - **options**: extra connection properties (hash, default: {})
+ - **mode**: "replace" or "insert" (string, required)
+ - **batch_size**: size of a single batch insert (integer, default: 16777216)
+ - **default_timezone**: If the input column type (embulk type) is timestamp and the destination column type is `string` or `nstring`, this plugin needs to format the timestamp into a string. This default_timezone option is used to control the timezone. You can overwrite the timezone for each column using the column_options option. (string, default: `UTC`)
+ - **column_options**: advanced: key-value pairs where the key is a column name and the value is options for that column.
+   - **type**: type of the column when this plugin creates new tables (e.g. `VARCHAR(255)`, `INTEGER NOT NULL UNIQUE`). This is used when the plugin creates intermediate tables (insert, truncate_insert and merge modes), when it creates the target table (insert_direct and replace modes), and when it creates a nonexistent target table automatically. (string, default: depends on the input column type. `BIGINT` if the input column type is long, `BOOLEAN` if boolean, `DOUBLE PRECISION` if double, `CLOB` if string, `TIMESTAMP` if timestamp)
+   - **value_type**: This plugin converts the input column type (embulk type) into a database type to build an INSERT statement. This value_type option controls the type of the value in the INSERT statement. (string, default: depends on the input column type. Available values are: `byte`, `short`, `int`, `long`, `double`, `float`, `boolean`, `string`, `nstring`, `date`, `time`, `timestamp`, `decimal`, `null`, `pass`)
+   - **timestamp_format**: If the input column type (embulk type) is timestamp and value_type is `string` or `nstring`, this plugin needs to format the timestamp value into a string. This timestamp_format option is used to control the format of the timestamp. (string, default: `%Y-%m-%d %H:%M:%S.%6N`)
+   - **timezone**: If the input column type (embulk type) is timestamp and value_type is `string` or `nstring`, this plugin needs to format the timestamp value into a string. And if the input column type is timestamp and value_type is `date`, this plugin needs to consider the timezone. In those cases, this timezone option is used to control the timezone. (string, the value of the default_timezone option is used by default)
+
+ ### Modes
+
+ * **insert**:
+   * Behavior: This mode writes rows to some intermediate tables first. If all those tasks run correctly, it runs an `INSERT INTO <target_table> SELECT * FROM <intermediate_table_1> UNION ALL SELECT * FROM <intermediate_table_2> UNION ALL ...` query.
+   * Transactional: Yes. This mode either writes all rows successfully or fails having written zero rows.
+   * Resumable: Yes.
+ * **insert_direct**:
+   * Behavior: This mode inserts rows into the target table directly.
+   * Transactional: No. If it fails, the target table could have some rows inserted.
+   * Resumable: No.
+ * **truncate_insert**:
+   * Behavior: Same as `insert` mode except that it truncates the target table right before the last `INSERT ...` query.
+   * Transactional: Yes.
+   * Resumable: Yes.
+ * **merge**:
+   * Behavior: This mode writes rows to some intermediate tables first. If all those tasks run correctly, it runs an `INSERT INTO <target_table> SELECT * FROM <intermediate_table_1> UNION ALL SELECT * FROM <intermediate_table_2> UNION ALL ... ON DUPLICATE KEY UPDATE ...` query.
+   * Transactional: Yes.
+   * Resumable: Yes.
+ * **replace**:
+   * Behavior: Same as `insert` mode except that it truncates the target table right before the last `INSERT ...` query.
+   * Transactional: Yes.
+   * Resumable: No.
+
+ ### Example
+
+ ```yaml
+ out:
+   type: redshift
+   host: myinstance.us-west-2.redshift.amazonaws.com
+   user: pg
+   password: ""
+   database: my_database
+   table: my_table
+   access_key_id: ABCXYZ123ABCXYZ123
+   secret_access_key: AbCxYz123aBcXyZ123
+   s3_bucket: my-redshift-transfer-bucket
+   iam_user_name: my-s3-read-only
+   mode: insert
+ ```
+
+ Advanced configuration:
+
+ ```yaml
+ out:
+   type: redshift
+   host: myinstance.us-west-2.redshift.amazonaws.com
+   user: pg
+   password: ""
+   database: my_database
+   table: my_table
+   access_key_id: ABCXYZ123ABCXYZ123
+   secret_access_key: AbCxYz123aBcXyZ123
+   s3_bucket: my-redshift-transfer-bucket
+   iam_user_name: my-s3-read-only
+   options: {loglevel: 2}
+   mode: insert_direct
+   column_options:
+     my_col_1: {type: 'VARCHAR(255)'}
+     my_col_3: {type: 'INT NOT NULL'}
+     my_col_4: {value_type: string, timestamp_format: '%Y-%m-%d %H:%M:%S %z', timezone: '-0700'}
+     my_col_5: {type: 'DECIMAL(18,9)', value_type: pass}
+ ```
+
+ ### Build
+
+ ```
+ $ ./gradlew gem
+ ```
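The README above documents the new `merge` mode, but both bundled examples use `insert` or `insert_direct`. As a minimal sketch (not part of the package), a merge-mode job would only change the `mode` value; every key below comes from the configuration documented above, and the host, credential, and bucket values are placeholders:

```yaml
out:
  type: redshift
  host: myinstance.us-west-2.redshift.amazonaws.com
  user: pg
  password: ""
  database: my_database
  table: my_table
  access_key_id: ABCXYZ123ABCXYZ123
  secret_access_key: AbCxYz123aBcXyZ123
  s3_bucket: my-redshift-transfer-bucket
  iam_user_name: my-s3-read-only
  mode: merge
```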
data/build.gradle CHANGED
@@ -1,9 +1,9 @@
- dependencies {
-     compile project(':embulk-output-jdbc')
-     compile project(':embulk-output-postgresql')
-
-     compile "com.amazonaws:aws-java-sdk-s3:1.9.17"
-     compile "com.amazonaws:aws-java-sdk-sts:1.9.17"
-
-     testCompile project(':embulk-output-jdbc').sourceSets.test.output
- }
+ dependencies {
+     compile project(':embulk-output-jdbc')
+     compile project(':embulk-output-postgresql')
+
+     compile "com.amazonaws:aws-java-sdk-s3:1.9.17"
+     compile "com.amazonaws:aws-java-sdk-sts:1.9.17"
+
+     testCompile project(':embulk-output-jdbc').sourceSets.test.output
+ }
@@ -1,3 +1,3 @@
- Embulk::JavaPlugin.register_output(
-   :redshift, "org.embulk.output.RedshiftOutputPlugin",
-   File.expand_path('../../../../classpath', __FILE__))
+ Embulk::JavaPlugin.register_output(
+   :redshift, "org.embulk.output.RedshiftOutputPlugin",
+   File.expand_path('../../../../classpath', __FILE__))
@@ -1,10 +1,14 @@
  package org.embulk.output;

+ import java.util.List;
  import java.util.Properties;
  import java.io.IOException;
  import java.sql.SQLException;
  import org.slf4j.Logger;
+ import com.google.common.base.Optional;
+ import com.google.common.collect.ImmutableSet;
  import com.amazonaws.auth.AWSCredentials;
+ import com.amazonaws.auth.AWSCredentialsProvider;
  import com.amazonaws.auth.BasicAWSCredentials;
  import org.embulk.spi.Exec;
  import org.embulk.config.Config;
@@ -61,6 +65,15 @@ public class RedshiftOutputPlugin
          return RedshiftPluginTask.class;
      }

+     @Override
+     protected Features getFeatures(PluginTask task)
+     {
+         return new Features()
+             .setMaxTableNameLength(30)
+             .setSupportedModes(ImmutableSet.of(Mode.INSERT, Mode.INSERT_DIRECT, Mode.MERGE, Mode.TRUNCATE_INSERT, Mode.REPLACE))
+             .setIgnoreMergeKeys(false);
+     }
+
      @Override
      protected RedshiftOutputConnector getConnector(PluginTask task, boolean retryableMetadataOperation)
      {
@@ -70,8 +83,6 @@ public class RedshiftOutputPlugin
              t.getHost(), t.getPort(), t.getDatabase());

          Properties props = new Properties();
-         props.setProperty("user", t.getUser());
-         props.setProperty("password", t.getPassword());
          props.setProperty("loginTimeout", "300"); // seconds
          props.setProperty("socketTimeout", "1800"); // seconds

@@ -98,19 +109,39 @@ public class RedshiftOutputPlugin

          props.putAll(t.getOptions());

+         props.setProperty("user", t.getUser());
+         logger.info("Connecting to {} options {}", url, props);
+         props.setProperty("password", t.getPassword());
+
          return new RedshiftOutputConnector(url, props, t.getSchema());
      }

+     private static AWSCredentialsProvider getAWSCredentialsProvider(RedshiftPluginTask task)
+     {
+         final AWSCredentials creds = new BasicAWSCredentials(
+                 task.getAccessKeyId(), task.getSecretAccessKey());
+         return new AWSCredentialsProvider() {
+             @Override
+             public AWSCredentials getCredentials()
+             {
+                 return creds;
+             }
+
+             @Override
+             public void refresh()
+             {
+             }
+         };
+     }
+
      @Override
-     protected BatchInsert newBatchInsert(PluginTask task) throws IOException, SQLException
+     protected BatchInsert newBatchInsert(PluginTask task, Optional<List<String>> mergeKeys) throws IOException, SQLException
      {
-         if (task.getMode().isMerge()) {
-             throw new UnsupportedOperationException("mode 'merge' is not implemented for this type");
+         if (mergeKeys.isPresent()) {
+             throw new UnsupportedOperationException("Redshift output plugin doesn't support 'merge_direct' mode. Use 'merge' mode instead.");
          }
          RedshiftPluginTask t = (RedshiftPluginTask) task;
-         AWSCredentials creds = new BasicAWSCredentials(
-                 t.getAccessKeyId(), t.getSecretAccessKey());
          return new RedshiftCopyBatchInsert(getConnector(task, true),
-                 creds, t.getS3Bucket(), t.getIamUserName());
+                 getAWSCredentialsProvider(t), t.getS3Bucket(), t.getIamUserName());
      }
  }
@@ -1,216 +1,214 @@
- package org.embulk.output.redshift;
-
- import java.util.zip.GZIPOutputStream;
- import java.util.concurrent.Callable;
- import java.util.UUID;
- import java.io.File;
- import java.io.IOException;
- import java.io.FileOutputStream;
- import java.io.OutputStreamWriter;
- import java.io.Closeable;
- import java.io.Writer;
- import java.io.BufferedWriter;
- import java.sql.Connection;
- import java.sql.SQLException;
- import com.amazonaws.auth.AWSCredentials;
- import com.amazonaws.auth.BasicSessionCredentials;
- import com.amazonaws.auth.policy.Policy;
- import com.amazonaws.auth.policy.Resource;
- import com.amazonaws.auth.policy.Statement;
- import com.amazonaws.auth.policy.Statement.Effect;
- import com.amazonaws.auth.policy.actions.S3Actions;
- import com.amazonaws.services.s3.AmazonS3Client;
- import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
- import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;
- import com.amazonaws.services.securitytoken.model.GetFederationTokenResult;
- import com.amazonaws.services.securitytoken.model.Credentials;
- import org.slf4j.Logger;
- import org.embulk.spi.Exec;
- import org.embulk.output.jdbc.JdbcSchema;
- import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
-
- public class RedshiftCopyBatchInsert
-         extends AbstractPostgreSQLCopyBatchInsert
- {
-     private final Logger logger = Exec.getLogger(RedshiftCopyBatchInsert.class);
-     private final RedshiftOutputConnector connector;
-     private final AWSCredentials awsCredentials;
-     private final String s3BucketName;
-     private final String iamReaderUserName;
-     private final AmazonS3Client s3;
-     private final AWSSecurityTokenServiceClient sts;
-
-     private RedshiftOutputConnection connection = null;
-     private String copySqlBeforeFrom = null;
-     private long totalRows;
-     private int fileCount;
-
-     public static final String COPY_AFTER_FROM = "GZIP DELIMITER '\\t' NULL '\\N' ESCAPE TRUNCATECOLUMNS ACCEPTINVCHARS STATUPDATE OFF COMPUPDATE OFF";
-
-     public RedshiftCopyBatchInsert(RedshiftOutputConnector connector,
-             AWSCredentials awsCredentials, String s3BucketName,
-             String iamReaderUserName) throws IOException, SQLException
-     {
-         super();
-         this.connector = connector;
-         this.awsCredentials = awsCredentials;
-         this.s3BucketName = s3BucketName;
-         this.iamReaderUserName = iamReaderUserName;
-         this.s3 = new AmazonS3Client(awsCredentials); // TODO options
-         this.sts = new AWSSecurityTokenServiceClient(awsCredentials); // options
-     }
-
-     @Override
-     public void prepare(String loadTable, JdbcSchema insertSchema) throws SQLException
-     {
-         this.connection = connector.connect(true);
-         this.copySqlBeforeFrom = connection.buildCopySQLBeforeFrom(loadTable, insertSchema);
-         logger.info("Copy SQL: "+copySqlBeforeFrom+" ? "+COPY_AFTER_FROM);
-     }
-
-     @Override
-     protected BufferedWriter openWriter(File newFile) throws IOException
-     {
-         // Redshift supports gzip
-         return new BufferedWriter(
-                 new OutputStreamWriter(
-                     new GZIPOutputStream(new FileOutputStream(newFile)),
-                     FILE_CHARSET)
-                 );
-     }
-
-     @Override
-     public void flush() throws IOException, SQLException
-     {
-         File file = closeCurrentFile(); // flush buffered data in writer
-
-         // TODO multi-threading
-         new UploadAndCopyTask(file, batchRows, UUID.randomUUID().toString()).call();
-         new DeleteFileFinalizer(file).close();
-
-         fileCount++;
-         totalRows += batchRows;
-         batchRows = 0;
-
-         openNewFile();
-         file.delete();
-     }
-
-     @Override
-     public void finish() throws IOException, SQLException
-     {
-         super.finish();
-         logger.info("Loaded {} files.", fileCount);
-     }
-
-     @Override
-     public void close() throws IOException, SQLException
-     {
-         s3.shutdown();
-         closeCurrentFile().delete();
-         if (connection != null) {
-             connection.close();
-             connection = null;
-         }
-     }
-
-     private BasicSessionCredentials generateReaderSessionCredentials(String s3KeyName)
-     {
-         Policy policy = new Policy()
-             .withStatements(
-                     new Statement(Effect.Allow)
-                         .withActions(S3Actions.ListObjects)
-                         .withResources(new Resource("arn:aws:s3:::"+s3BucketName)),
-                     new Statement(Effect.Allow)
-                         .withActions(S3Actions.GetObject)
-                         .withResources(new Resource("arn:aws:s3:::"+s3BucketName+"/"+s3KeyName)) // TODO encode file name using percent encoding
-                     );
-         GetFederationTokenRequest req = new GetFederationTokenRequest();
-         req.setDurationSeconds(86400); // 3600 - 129600
-         req.setName(iamReaderUserName);
-         req.setPolicy(policy.toJson());
-
-         GetFederationTokenResult res = sts.getFederationToken(req);
-         Credentials c = res.getCredentials();
-
-         return new BasicSessionCredentials(
-                 c.getAccessKeyId(),
-                 c.getSecretAccessKey(),
-                 c.getSessionToken());
-     }
-
-     private class UploadAndCopyTask implements Callable<Void>
-     {
-         private final File file;
-         private final int batchRows;
-         private final String s3KeyName;
-
-         public UploadAndCopyTask(File file, int batchRows, String s3KeyName)
-         {
-             this.file = file;
-             this.batchRows = batchRows;
-             this.s3KeyName = s3KeyName;
-         }
-
-         public Void call() throws SQLException {
-             logger.info(String.format("Uploading file id %s to S3 (%,d bytes %,d rows)",
-                         s3KeyName, file.length(), batchRows));
-             s3.putObject(s3BucketName, s3KeyName, file);
-
-             RedshiftOutputConnection con = connector.connect(true);
-             try {
-                 logger.info("Running COPY from file {}", s3KeyName);
-
-                 // create temporary credential right before COPY operation because
-                 // it has timeout.
-                 // TODO skip this step if iamReaderUserName is not set
-                 BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
-
-                 long startTime = System.currentTimeMillis();
-                 con.runCopy(buildCopySQL(creds));
-                 double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
-
-                 logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
-
-             } finally {
-                 con.close();
-             }
-
-             return null;
-         }
-
-         private String buildCopySQL(BasicSessionCredentials creds)
-         {
-             StringBuilder sb = new StringBuilder();
-             sb.append(copySqlBeforeFrom);
-             sb.append(" FROM 's3://");
-             sb.append(s3BucketName);
-             sb.append("/");
-             sb.append(s3KeyName);
-             sb.append("' CREDENTIALS '");
-             sb.append("aws_access_key_id=");
-             sb.append(creds.getAWSAccessKeyId());
-             sb.append(";aws_secret_access_key=");
-             sb.append(creds.getAWSSecretKey());
-             sb.append(";token=");
-             sb.append(creds.getSessionToken());
-             sb.append("' ");
-             sb.append(COPY_AFTER_FROM);
-             return sb.toString();
-         }
-     }
-
-     private static class DeleteFileFinalizer implements Closeable
-     {
-         private File file;
-
-         public DeleteFileFinalizer(File file) {
-             this.file = file;
-         }
-
-         @Override
-         public void close() throws IOException {
-             file.delete();
-         }
-     }
- }
+ package org.embulk.output.redshift;
+
+ import java.util.zip.GZIPOutputStream;
+ import java.util.concurrent.Callable;
+ import java.util.UUID;
+ import java.io.File;
+ import java.io.IOException;
+ import java.io.FileOutputStream;
+ import java.io.OutputStreamWriter;
+ import java.io.Closeable;
+ import java.io.Writer;
+ import java.io.BufferedWriter;
+ import java.sql.Connection;
+ import java.sql.SQLException;
+ import com.amazonaws.auth.AWSCredentialsProvider;
+ import com.amazonaws.auth.BasicSessionCredentials;
+ import com.amazonaws.auth.policy.Policy;
+ import com.amazonaws.auth.policy.Resource;
+ import com.amazonaws.auth.policy.Statement;
+ import com.amazonaws.auth.policy.Statement.Effect;
+ import com.amazonaws.auth.policy.actions.S3Actions;
+ import com.amazonaws.services.s3.AmazonS3Client;
+ import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClient;
+ import com.amazonaws.services.securitytoken.model.GetFederationTokenRequest;
+ import com.amazonaws.services.securitytoken.model.GetFederationTokenResult;
+ import com.amazonaws.services.securitytoken.model.Credentials;
+ import org.slf4j.Logger;
+ import org.embulk.spi.Exec;
+ import org.embulk.output.jdbc.JdbcSchema;
+ import org.embulk.output.postgresql.AbstractPostgreSQLCopyBatchInsert;
+
+ public class RedshiftCopyBatchInsert
+         extends AbstractPostgreSQLCopyBatchInsert
+ {
+     private final Logger logger = Exec.getLogger(RedshiftCopyBatchInsert.class);
+     private final RedshiftOutputConnector connector;
+     private final String s3BucketName;
+     private final String iamReaderUserName;
+     private final AmazonS3Client s3;
+     private final AWSSecurityTokenServiceClient sts;
+
+     private RedshiftOutputConnection connection = null;
+     private String copySqlBeforeFrom = null;
+     private long totalRows;
+     private int fileCount;
+
+     public static final String COPY_AFTER_FROM = "GZIP DELIMITER '\\t' NULL '\\N' ESCAPE TRUNCATECOLUMNS ACCEPTINVCHARS STATUPDATE OFF COMPUPDATE OFF";
+
+     public RedshiftCopyBatchInsert(RedshiftOutputConnector connector,
+             AWSCredentialsProvider credentialsProvider, String s3BucketName,
+             String iamReaderUserName) throws IOException, SQLException
+     {
+         super();
+         this.connector = connector;
+         this.s3BucketName = s3BucketName;
+         this.iamReaderUserName = iamReaderUserName;
+         this.s3 = new AmazonS3Client(credentialsProvider); // TODO options
+         this.sts = new AWSSecurityTokenServiceClient(credentialsProvider); // options
+     }
+
+     @Override
+     public void prepare(String loadTable, JdbcSchema insertSchema) throws SQLException
+     {
+         this.connection = connector.connect(true);
+         this.copySqlBeforeFrom = connection.buildCopySQLBeforeFrom(loadTable, insertSchema);
+         logger.info("Copy SQL: "+copySqlBeforeFrom+" ? "+COPY_AFTER_FROM);
+     }
+
+     @Override
+     protected BufferedWriter openWriter(File newFile) throws IOException
+     {
+         // Redshift supports gzip
+         return new BufferedWriter(
+                 new OutputStreamWriter(
+                     new GZIPOutputStream(new FileOutputStream(newFile)),
+                     FILE_CHARSET)
+                 );
+     }
+
+     @Override
+     public void flush() throws IOException, SQLException
+     {
+         File file = closeCurrentFile(); // flush buffered data in writer
+
+         // TODO multi-threading
+         new UploadAndCopyTask(file, batchRows, UUID.randomUUID().toString()).call();
+         new DeleteFileFinalizer(file).close();
+
+         fileCount++;
+         totalRows += batchRows;
+         batchRows = 0;
+
+         openNewFile();
+         file.delete();
+     }
+
+     @Override
+     public void finish() throws IOException, SQLException
+     {
+         super.finish();
+         logger.info("Loaded {} files.", fileCount);
+     }
+
+     @Override
+     public void close() throws IOException, SQLException
+     {
+         s3.shutdown();
+         closeCurrentFile().delete();
+         if (connection != null) {
+             connection.close();
+             connection = null;
+         }
+     }
+
+     private BasicSessionCredentials generateReaderSessionCredentials(String s3KeyName)
+     {
+         Policy policy = new Policy()
+             .withStatements(
+                     new Statement(Effect.Allow)
+                         .withActions(S3Actions.ListObjects)
+                         .withResources(new Resource("arn:aws:s3:::"+s3BucketName)),
+                     new Statement(Effect.Allow)
+                         .withActions(S3Actions.GetObject)
+                         .withResources(new Resource("arn:aws:s3:::"+s3BucketName+"/"+s3KeyName)) // TODO encode file name using percent encoding
+                     );
+         GetFederationTokenRequest req = new GetFederationTokenRequest();
+         req.setDurationSeconds(86400); // 3600 - 129600
+         req.setName(iamReaderUserName);
+         req.setPolicy(policy.toJson());
+
+         GetFederationTokenResult res = sts.getFederationToken(req);
+         Credentials c = res.getCredentials();
+
+         return new BasicSessionCredentials(
+                 c.getAccessKeyId(),
+                 c.getSecretAccessKey(),
+                 c.getSessionToken());
+     }
+
+     private class UploadAndCopyTask implements Callable<Void>
+     {
+         private final File file;
+         private final int batchRows;
+         private final String s3KeyName;
+
+         public UploadAndCopyTask(File file, int batchRows, String s3KeyName)
+         {
+             this.file = file;
+             this.batchRows = batchRows;
+             this.s3KeyName = s3KeyName;
+         }
+
+         public Void call() throws SQLException {
+             logger.info(String.format("Uploading file id %s to S3 (%,d bytes %,d rows)",
+                         s3KeyName, file.length(), batchRows));
+             s3.putObject(s3BucketName, s3KeyName, file);
+
+             RedshiftOutputConnection con = connector.connect(true);
+             try {
+                 logger.info("Running COPY from file {}", s3KeyName);
+
+                 // create temporary credential right before COPY operation because
+                 // it has timeout.
+                 // TODO skip this step if iamReaderUserName is not set
+                 BasicSessionCredentials creds = generateReaderSessionCredentials(s3KeyName);
+
+                 long startTime = System.currentTimeMillis();
+                 con.runCopy(buildCopySQL(creds));
+                 double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
+
+                 logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", s3KeyName, seconds));
+
+             } finally {
+                 con.close();
+             }
+
+             return null;
+         }
+
+         private String buildCopySQL(BasicSessionCredentials creds)
+         {
+             StringBuilder sb = new StringBuilder();
+             sb.append(copySqlBeforeFrom);
+             sb.append(" FROM 's3://");
+             sb.append(s3BucketName);
+             sb.append("/");
+             sb.append(s3KeyName);
+             sb.append("' CREDENTIALS '");
+             sb.append("aws_access_key_id=");
+             sb.append(creds.getAWSAccessKeyId());
+             sb.append(";aws_secret_access_key=");
+             sb.append(creds.getAWSSecretKey());
+             sb.append(";token=");
+             sb.append(creds.getSessionToken());
+             sb.append("' ");
+             sb.append(COPY_AFTER_FROM);
+             return sb.toString();
+         }
+     }
+
+     private static class DeleteFileFinalizer implements Closeable
+     {
+         private File file;
+
+         public DeleteFileFinalizer(File file) {
+             this.file = file;
+         }
+
+         @Override
+         public void close() throws IOException {
+             file.delete();
+         }
+     }
+ }
@@ -1,122 +1,122 @@
- package org.embulk.output.redshift;
-
- import java.sql.Connection;
- import java.sql.SQLException;
- import java.sql.Statement;
- import org.slf4j.Logger;
- import org.embulk.spi.Exec;
- import org.embulk.output.jdbc.JdbcOutputConnection;
- import org.embulk.output.jdbc.JdbcColumn;
- import org.embulk.output.jdbc.JdbcSchema;
-
- public class RedshiftOutputConnection
-         extends JdbcOutputConnection
- {
-     private final Logger logger = Exec.getLogger(RedshiftOutputConnection.class);
-
-     public RedshiftOutputConnection(Connection connection, String schemaName, boolean autoCommit)
-             throws SQLException
-     {
-         super(connection, schemaName);
-         connection.setAutoCommit(autoCommit);
-     }
-
-     // Redshift does not support DROP TABLE IF EXISTS.
-     // Here runs DROP TABLE and ignores errors.
-     @Override
-     public void dropTableIfExists(String tableName) throws SQLException
-     {
-         Statement stmt = connection.createStatement();
-         try {
-             String sql = String.format("DROP TABLE IF EXISTS %s", quoteIdentifierString(tableName));
-             executeUpdate(stmt, sql);
-             commitIfNecessary(connection);
-         } catch (SQLException ex) {
-             // ignore errors.
-             // TODO here should ignore only 'table "XXX" does not exist' errors.
-             SQLException ignored = safeRollback(connection, ex);
-         } finally {
-             stmt.close();
-         }
-     }
-
-     // Redshift does not support DROP TABLE IF EXISTS.
-     // Dropping part runs DROP TABLE and ignores errors.
-     @Override
-     public void replaceTable(String fromTable, JdbcSchema schema, String toTable) throws SQLException
-     {
-         Statement stmt = connection.createStatement();
-         try {
-             try {
-                 StringBuilder sb = new StringBuilder();
-                 sb.append("DROP TABLE ");
-                 quoteIdentifierString(sb, toTable);
-                 String sql = sb.toString();
-                 executeUpdate(stmt, sql);
-             } catch (SQLException ex) {
-                 // ignore errors.
-                 // TODO here should ignore only 'table "XXX" does not exist' errors.
-                 // rollback or comimt is required to recover failed transaction
-                 SQLException ignored = safeRollback(connection, ex);
-             }
-
-             {
-                 StringBuilder sb = new StringBuilder();
-                 sb.append("ALTER TABLE ");
-                 quoteIdentifierString(sb, fromTable);
-                 sb.append(" RENAME TO ");
-                 quoteIdentifierString(sb, toTable);
-                 String sql = sb.toString();
-                 executeUpdate(stmt, sql);
-             }
-
-             commitIfNecessary(connection);
-         } catch (SQLException ex) {
-             throw safeRollback(connection, ex);
-         } finally {
-             stmt.close();
-         }
-     }
-
-     @Override
-     protected String convertTypeName(String typeName)
-     {
-         // Redshift does not support TEXT type.
-         switch(typeName) {
-         case "CLOB":
-             return "VARCHAR(65535)";
-         case "TEXT":
-             return "VARCHAR(65535)";
-         case "BLOB":
-             return "BYTEA";
-         default:
-             return typeName;
-         }
-     }
-
-     public String buildCopySQLBeforeFrom(String tableName, JdbcSchema tableSchema)
-     {
-         StringBuilder sb = new StringBuilder();
-
-         sb.append("COPY ");
-         quoteIdentifierString(sb, tableName);
-         sb.append(" (");
-         for(int i=0; i < tableSchema.getCount(); i++) {
-             if(i != 0) { sb.append(", "); }
-             quoteIdentifierString(sb, tableSchema.getColumnName(i));
-         }
-         sb.append(")");
-
-         return sb.toString();
-     }
-
-     public void runCopy(String sql) throws SQLException
-     {
-         Statement stmt = connection.createStatement();
-         try {
-             stmt.executeUpdate(sql);
-         } finally {
-             stmt.close();
-         }
-     }
- }
+ package org.embulk.output.redshift;
+
+ import java.sql.Connection;
+ import java.sql.SQLException;
+ import java.sql.Statement;
+ import org.slf4j.Logger;
+ import org.embulk.spi.Exec;
+ import org.embulk.output.jdbc.JdbcOutputConnection;
+ import org.embulk.output.jdbc.JdbcColumn;
+ import org.embulk.output.jdbc.JdbcSchema;
+
+ public class RedshiftOutputConnection
+         extends JdbcOutputConnection
+ {
+     private final Logger logger = Exec.getLogger(RedshiftOutputConnection.class);
+
+     public RedshiftOutputConnection(Connection connection, String schemaName, boolean autoCommit)
+             throws SQLException
+     {
+         super(connection, schemaName);
+         connection.setAutoCommit(autoCommit);
+     }
+
+     // Redshift does not support DROP TABLE IF EXISTS.
+     // Here runs DROP TABLE and ignores errors.
+     @Override
+     public void dropTableIfExists(String tableName) throws SQLException
+     {
+         Statement stmt = connection.createStatement();
+         try {
+             String sql = String.format("DROP TABLE IF EXISTS %s", quoteIdentifierString(tableName));
+             executeUpdate(stmt, sql);
+             commitIfNecessary(connection);
+         } catch (SQLException ex) {
+             // ignore errors.
+             // TODO here should ignore only 'table "XXX" does not exist' errors.
+             SQLException ignored = safeRollback(connection, ex);
+         } finally {
+             stmt.close();
+         }
+     }
+
+     // Redshift does not support DROP TABLE IF EXISTS.
+     // Dropping part runs DROP TABLE and ignores errors.
+     @Override
+     public void replaceTable(String fromTable, JdbcSchema schema, String toTable) throws SQLException
+     {
+         Statement stmt = connection.createStatement();
+         try {
+             try {
+                 StringBuilder sb = new StringBuilder();
+                 sb.append("DROP TABLE ");
+                 quoteIdentifierString(sb, toTable);
+                 String sql = sb.toString();
+                 executeUpdate(stmt, sql);
+             } catch (SQLException ex) {
+                 // ignore errors.
+                 // TODO here should ignore only 'table "XXX" does not exist' errors.
+                 // rollback or comimt is required to recover failed transaction
+                 SQLException ignored = safeRollback(connection, ex);
+             }
+
+             {
+                 StringBuilder sb = new StringBuilder();
+                 sb.append("ALTER TABLE ");
+                 quoteIdentifierString(sb, fromTable);
+                 sb.append(" RENAME TO ");
+                 quoteIdentifierString(sb, toTable);
+                 String sql = sb.toString();
+                 executeUpdate(stmt, sql);
+             }
+
+             commitIfNecessary(connection);
+         } catch (SQLException ex) {
+             throw safeRollback(connection, ex);
+         } finally {
+             stmt.close();
+         }
+     }
+
+     @Override
+     protected String buildColumnTypeName(JdbcColumn c)
+     {
+         // Redshift does not support TEXT type.
+         switch(c.getSimpleTypeName()) {
+         case "CLOB":
+             return "VARCHAR(65535)";
+         case "TEXT":
+             return "VARCHAR(65535)";
+         case "BLOB":
+             return "BYTEA";
+         default:
+             return super.buildColumnTypeName(c);
+         }
+     }
+
+     public String buildCopySQLBeforeFrom(String tableName, JdbcSchema tableSchema)
+     {
+         StringBuilder sb = new StringBuilder();
+
+         sb.append("COPY ");
+         quoteIdentifierString(sb, tableName);
+         sb.append(" (");
+         for(int i=0; i < tableSchema.getCount(); i++) {
+             if(i != 0) { sb.append(", "); }
+             quoteIdentifierString(sb, tableSchema.getColumnName(i));
+         }
+         sb.append(")");
+
+         return sb.toString();
+     }
+
+     public void runCopy(String sql) throws SQLException
+     {
+         Statement stmt = connection.createStatement();
+         try {
+             stmt.executeUpdate(sql);
+         } finally {
+             stmt.close();
+         }
+     }
+ }
@@ -1,40 +1,40 @@
- package org.embulk.output.redshift;
-
- import java.util.Properties;
- import java.sql.Driver;
- import java.sql.Connection;
- import java.sql.SQLException;
- import org.embulk.output.jdbc.JdbcOutputConnector;
- import org.embulk.output.jdbc.JdbcOutputConnection;
-
- public class RedshiftOutputConnector
-         implements JdbcOutputConnector
- {
-     private static final Driver driver = new org.postgresql.Driver();
-
-     private final String url;
-     private final Properties properties;
-     private final String schemaName;
-
-     public RedshiftOutputConnector(String url, Properties properties, String schemaName)
-     {
-         this.url = url;
-         this.properties = properties;
-         this.schemaName = schemaName;
-     }
-
-     @Override
-     public RedshiftOutputConnection connect(boolean autoCommit) throws SQLException
-     {
-         Connection c = driver.connect(url, properties);
-         try {
-             RedshiftOutputConnection con = new RedshiftOutputConnection(c, schemaName, autoCommit);
-             c = null;
-             return con;
-         } finally {
-             if (c != null) {
-                 c.close();
-             }
-         }
-     }
- }
+ package org.embulk.output.redshift;
+
+ import java.util.Properties;
+ import java.sql.Driver;
+ import java.sql.Connection;
+ import java.sql.SQLException;
+ import org.embulk.output.jdbc.JdbcOutputConnector;
+ import org.embulk.output.jdbc.JdbcOutputConnection;
+
+ public class RedshiftOutputConnector
+         implements JdbcOutputConnector
+ {
+     private static final Driver driver = new org.postgresql.Driver();
+
+     private final String url;
+     private final Properties properties;
+     private final String schemaName;
+
+     public RedshiftOutputConnector(String url, Properties properties, String schemaName)
+     {
+         this.url = url;
+         this.properties = properties;
+         this.schemaName = schemaName;
+     }
+
+     @Override
+     public RedshiftOutputConnection connect(boolean autoCommit) throws SQLException
+     {
+         Connection c = driver.connect(url, properties);
+         try {
+             RedshiftOutputConnection con = new RedshiftOutputConnection(c, schemaName, autoCommit);
+             c = null;
+             return con;
+         } finally {
+             if (c != null) {
+                 c.close();
+             }
+         }
+     }
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-redshift
  version: !ruby/object:Gem::Version
-   version: 0.2.4
+   version: 0.3.0
  platform: ruby
  authors:
  - Sadayuki Furuhashi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-05-12 00:00:00.000000000 Z
+ date: 2015-05-19 00:00:00.000000000 Z
  dependencies: []
  description: Inserts or updates records to a table.
  email:
@@ -30,9 +30,9 @@ files:
  - classpath/aws-java-sdk-sts-1.9.17.jar
  - classpath/commons-codec-1.6.jar
  - classpath/commons-logging-1.1.3.jar
- - classpath/embulk-output-jdbc-0.2.4.jar
- - classpath/embulk-output-postgresql-0.2.4.jar
- - classpath/embulk-output-redshift-0.2.4.jar
+ - classpath/embulk-output-jdbc-0.3.0.jar
+ - classpath/embulk-output-postgresql-0.3.0.jar
+ - classpath/embulk-output-redshift-0.3.0.jar
  - classpath/httpclient-4.3.4.jar
  - classpath/httpcore-4.3.2.jar
  - classpath/jna-4.1.0.jar
Binary file