embulk-output-snowflake 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,373 @@
1
+ package org.embulk.output.snowflake;
2
+
3
+ import org.embulk.output.jdbc.BatchInsert;
4
+ import org.embulk.output.jdbc.JdbcOutputConnector;
5
+ import org.embulk.output.jdbc.JdbcSchema;
6
+ import org.embulk.output.jdbc.TableIdentifier;
7
+ import org.embulk.spi.time.Timestamp;
8
+ import org.slf4j.Logger;
9
+
10
+ import java.io.*;
11
+ import java.math.BigDecimal;
12
+ import java.nio.charset.Charset;
13
+ import java.sql.SQLException;
14
+ import java.util.*;
15
+ import java.util.concurrent.*;
16
+ import java.util.zip.GZIPOutputStream;
17
+ import org.slf4j.LoggerFactory;
18
+
19
+ public class SnowflakeCopyBatchInsert implements BatchInsert {
20
+ private final Logger logger = LoggerFactory.getLogger(SnowflakeCopyBatchInsert.class);
21
+ private final JdbcOutputConnector connector;
22
+ protected static final Charset FILE_CHARSET = Charset.forName("UTF-8");
23
+ private final ExecutorService executorService;
24
+ private final StageIdentifier stageIdentifier;
25
+ private final boolean deleteStageFile;
26
+
27
+ protected static final String nullString = "\\N";
28
+ protected static final String newLineString = "\n";
29
+ protected static final String delimiterString = "\t";
30
+
31
+ private SnowflakeOutputConnection connection = null;
32
+ private TableIdentifier tableIdentifier = null;
33
+ protected File currentFile;
34
+ protected BufferedWriter writer;
35
+ protected int index;
36
+ protected int batchRows;
37
+ private long totalRows;
38
+ private int fileCount;
39
+ private List<Future<Void>> uploadAndCopyFutures;
40
+
41
+ public SnowflakeCopyBatchInsert(JdbcOutputConnector connector, StageIdentifier stageIdentifier,
42
+ boolean deleteStageFile) throws IOException {
43
+ this.index = 0;
44
+ openNewFile();
45
+ this.connector = connector;
46
+ this.stageIdentifier = stageIdentifier;
47
+ this.executorService = Executors.newCachedThreadPool();
48
+ this.deleteStageFile = deleteStageFile;
49
+ this.uploadAndCopyFutures = new ArrayList();
50
+ }
51
+
52
+ @Override
53
+ public void prepare(TableIdentifier loadTable, JdbcSchema insertSchema) throws SQLException {
54
+ this.connection = (SnowflakeOutputConnection) connector.connect(true);
55
+ this.connection.runCreateStage(stageIdentifier);
56
+ this.tableIdentifier = loadTable;
57
+ }
58
+
59
+ private File createTempFile() throws IOException {
60
+ return File.createTempFile("embulk-output-snowflake-copy-", ".tsv.tmp"); // TODO configurable temporary file path
61
+ }
62
+
63
+ protected File openNewFile() throws IOException {
64
+ File newFile = createTempFile();
65
+ File oldFile = closeCurrentFile();
66
+ this.writer = openWriter(newFile);
67
+ currentFile = newFile;
68
+ return oldFile;
69
+ }
70
+
71
+ protected File closeCurrentFile() throws IOException {
72
+ if (writer != null) {
73
+ writer.close();
74
+ writer = null;
75
+ }
76
+ return currentFile;
77
+ }
78
+
79
+ protected BufferedWriter openWriter(File newFile) throws IOException {
80
+ // Snowflake supports gzip
81
+ return new BufferedWriter(
82
+ new OutputStreamWriter(
83
+ new GZIPOutputStream(new FileOutputStream(newFile)),
84
+ FILE_CHARSET)
85
+ );
86
+ }
87
+
88
+ public int getBatchWeight() {
89
+ long fsize = currentFile.length();
90
+ if (fsize > Integer.MAX_VALUE) {
91
+ return Integer.MAX_VALUE;
92
+ } else {
93
+ return (int) fsize;
94
+ }
95
+ }
96
+
97
+ public void add() throws IOException {
98
+ writer.write(newLineString);
99
+ batchRows++;
100
+ index = 0;
101
+ }
102
+
103
+ private void appendDelimiter() throws IOException {
104
+ if (index != 0) {
105
+ writer.write(delimiterString);
106
+ }
107
+ index++;
108
+ }
109
+
110
+ public void setNull(int sqlType) throws IOException {
111
+ appendDelimiter();
112
+ writer.write(nullString);
113
+ }
114
+
115
+ public void setBoolean(boolean v) throws IOException {
116
+ appendDelimiter();
117
+ writer.write(String.valueOf(v));
118
+ }
119
+
120
+ public void setByte(byte v) throws IOException {
121
+ appendDelimiter();
122
+ setEscapedString(String.valueOf(v));
123
+ }
124
+
125
+ public void setShort(short v) throws IOException {
126
+ appendDelimiter();
127
+ writer.write(String.valueOf(v));
128
+ }
129
+
130
+ public void setInt(int v) throws IOException {
131
+ appendDelimiter();
132
+ writer.write(String.valueOf(v));
133
+ }
134
+
135
+ public void setLong(long v) throws IOException {
136
+ appendDelimiter();
137
+ writer.write(String.valueOf(v));
138
+ }
139
+
140
+ public void setFloat(float v) throws IOException {
141
+ appendDelimiter();
142
+ writer.write(String.valueOf(v));
143
+ }
144
+
145
+ public void setDouble(double v) throws IOException {
146
+ appendDelimiter();
147
+ writer.write(String.valueOf(v));
148
+ }
149
+
150
+ public void setBigDecimal(BigDecimal v) throws IOException {
151
+ appendDelimiter();
152
+ writer.write(String.valueOf(v));
153
+ }
154
+
155
+ public void setString(String v) throws IOException {
156
+ appendDelimiter();
157
+ setEscapedString(v);
158
+ }
159
+
160
+ public void setNString(String v) throws IOException {
161
+ appendDelimiter();
162
+ setEscapedString(v);
163
+ }
164
+
165
+ public void setBytes(byte[] v) throws IOException {
166
+ appendDelimiter();
167
+ setEscapedString(String.valueOf(v));
168
+ }
169
+
170
+ public void setSqlDate(Timestamp v, Calendar cal) throws IOException {
171
+ appendDelimiter();
172
+ cal.setTimeInMillis(v.getEpochSecond() * 1000);
173
+ String f = String.format(Locale.ENGLISH, "%04d-%02d-%02d",
174
+ cal.get(Calendar.YEAR),
175
+ cal.get(Calendar.MONTH) + 1,
176
+ cal.get(Calendar.DAY_OF_MONTH));
177
+ writer.write(f);
178
+ }
179
+
180
+ public void setSqlTime(Timestamp v, Calendar cal) throws IOException {
181
+ appendDelimiter();
182
+ cal.setTimeInMillis(v.getEpochSecond() * 1000);
183
+ String f = String.format(Locale.ENGLISH, "%02d:%02d:%02d.%06d",
184
+ cal.get(Calendar.HOUR_OF_DAY),
185
+ cal.get(Calendar.MINUTE),
186
+ cal.get(Calendar.SECOND),
187
+ v.getNano() / 1000);
188
+ writer.write(f);
189
+ }
190
+
191
+ public void setSqlTimestamp(Timestamp v, Calendar cal) throws IOException {
192
+ appendDelimiter();
193
+ cal.setTimeInMillis(v.getEpochSecond() * 1000);
194
+ int zoneOffset = cal.get(Calendar.ZONE_OFFSET) / 1000 / 60; // zone offset considering DST in minute
195
+ String offset;
196
+ if (zoneOffset >= 0) {
197
+ offset = String.format(Locale.ENGLISH, "+%02d%02d", zoneOffset / 60, zoneOffset % 60);
198
+ } else {
199
+ offset = String.format(Locale.ENGLISH, "-%02d%02d", -zoneOffset / 60, -zoneOffset % 60);
200
+ }
201
+ String f = String.format(Locale.ENGLISH, "%d-%02d-%02d %02d:%02d:%02d.%06d%s",
202
+ cal.get(Calendar.YEAR),
203
+ cal.get(Calendar.MONTH) + 1,
204
+ cal.get(Calendar.DAY_OF_MONTH),
205
+ cal.get(Calendar.HOUR_OF_DAY),
206
+ cal.get(Calendar.MINUTE),
207
+ cal.get(Calendar.SECOND),
208
+ v.getNano() / 1000,
209
+ offset);
210
+ writer.write(f);
211
+ }
212
+
213
+ private void setEscapedString(String v) throws IOException {
214
+ for (char c : v.toCharArray()) {
215
+ writer.write(escape(c));
216
+ }
217
+ }
218
+
219
+ @Override
220
+ public void flush() throws IOException, SQLException {
221
+ File file = closeCurrentFile(); // flush buffered data in writer
222
+
223
+ String snowflakeStageFileName = "embulk_snowflake_" + SnowflakeUtils.randomString(8);
224
+
225
+ UploadTask uploadTask = new UploadTask(file, batchRows, stageIdentifier, snowflakeStageFileName);
226
+ Future<Void> uploadFuture = executorService.submit(uploadTask);
227
+ uploadAndCopyFutures.add(uploadFuture);
228
+
229
+ CopyTask copyTask = new CopyTask(uploadFuture, snowflakeStageFileName);
230
+ uploadAndCopyFutures.add(executorService.submit(copyTask));
231
+
232
+ fileCount++;
233
+ totalRows += batchRows;
234
+ batchRows = 0;
235
+
236
+ openNewFile();
237
+ }
238
+
239
+ public void close() throws IOException, SQLException {
240
+ executorService.shutdownNow();
241
+
242
+ try {
243
+ executorService.awaitTermination(60, TimeUnit.SECONDS);
244
+ } catch (InterruptedException e) {
245
+ }
246
+
247
+ closeCurrentFile().delete();
248
+ if (connection != null) {
249
+ connection.close();
250
+ connection = null;
251
+ }
252
+ }
253
+
254
+ @Override
255
+ public void finish() throws IOException, SQLException
256
+ {
257
+ for (Future<Void> uploadAndCopyFuture : uploadAndCopyFutures) {
258
+ try {
259
+ uploadAndCopyFuture.get();
260
+
261
+ } catch (InterruptedException e) {
262
+ throw new RuntimeException(e);
263
+ } catch (ExecutionException e) {
264
+ if (e.getCause() instanceof SQLException) {
265
+ throw (SQLException)e.getCause();
266
+ }
267
+ throw new RuntimeException(e);
268
+ }
269
+ }
270
+
271
+ logger.info("Loaded {} files.", fileCount);
272
+ }
273
+
274
+ @Override
275
+ public int[] getLastUpdateCounts()
276
+ {
277
+ // need not be implemented because SnowflakeCopyBatchInsert won't retry.
278
+ return new int[]{};
279
+ }
280
+
281
+ // Escape \, \n, \t, \r
282
+ // Remove \0
283
+ protected String escape(char c) {
284
+ switch (c) {
285
+ case '\\':
286
+ return "\\\\";
287
+ case '\n':
288
+ return "\\n";
289
+ case '\t':
290
+ return "\\t";
291
+ case '\r':
292
+ return "\\r";
293
+ case 0:
294
+ return "";
295
+ default:
296
+ return String.valueOf(c);
297
+ }
298
+ }
299
+
300
+ private class UploadTask implements Callable<Void> {
301
+ private final File file;
302
+ private final int batchRows;
303
+ private final String snowflakeStageFileName;
304
+ private final StageIdentifier stageIdentifier;
305
+
306
+ public UploadTask(File file, int batchRows, StageIdentifier stageIdentifier, String snowflakeStageFileName) {
307
+ this.file = file;
308
+ this.batchRows = batchRows;
309
+ this.snowflakeStageFileName = snowflakeStageFileName;
310
+ this.stageIdentifier = stageIdentifier;
311
+ }
312
+
313
+ public Void call() throws IOException, SQLException {
314
+ logger.info(String.format("Uploading file id %s to Snowflake (%,d bytes %,d rows)",
315
+ snowflakeStageFileName, file.length(), batchRows));
316
+
317
+ try {
318
+ long startTime = System.currentTimeMillis();
319
+ // put file to snowflake internal storage
320
+ SnowflakeOutputConnection con = (SnowflakeOutputConnection) connector.connect(true);
321
+
322
+ FileInputStream fileInputStream = new FileInputStream(file);
323
+ con.runUploadFile(stageIdentifier, snowflakeStageFileName, fileInputStream);
324
+
325
+ double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
326
+
327
+ logger.info(String.format("Uploaded file %s (%.2f seconds)", snowflakeStageFileName, seconds));
328
+ } finally {
329
+ file.delete();
330
+ }
331
+
332
+ return null;
333
+ }
334
+ }
335
+
336
+
337
+ private class CopyTask implements Callable<Void> {
338
+ private final Future<Void> uploadFuture;
339
+ private final String snowflakeStageFileName;
340
+
341
+ public CopyTask(Future<Void> uploadFuture, String snowflakeStageFileName) {
342
+ this.uploadFuture = uploadFuture;
343
+ this.snowflakeStageFileName = snowflakeStageFileName;
344
+ }
345
+
346
+ public Void call() throws SQLException, InterruptedException, ExecutionException {
347
+ try {
348
+ uploadFuture.get();
349
+
350
+ SnowflakeOutputConnection con = (SnowflakeOutputConnection) connector.connect(true);
351
+ try {
352
+ logger.info("Running COPY from file {}", snowflakeStageFileName);
353
+
354
+ long startTime = System.currentTimeMillis();
355
+ con.runCopy(tableIdentifier, stageIdentifier, snowflakeStageFileName, delimiterString);
356
+
357
+ double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
358
+
359
+ logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", snowflakeStageFileName, seconds));
360
+
361
+ } finally {
362
+ con.close();
363
+ }
364
+ } finally {
365
+ if (deleteStageFile) {
366
+ connection.runDeleteStageFile(stageIdentifier, snowflakeStageFileName);
367
+ }
368
+ }
369
+
370
+ return null;
371
+ }
372
+ }
373
+ }
@@ -1,51 +1,122 @@
1
1
  package org.embulk.output.snowflake;
2
2
 
3
+ import net.snowflake.client.jdbc.SnowflakeConnection;
4
+ import org.embulk.output.jdbc.JdbcColumn;
5
+ import org.embulk.output.jdbc.JdbcOutputConnection;
6
+ import org.embulk.output.jdbc.TableIdentifier;
7
+
8
+ import java.io.FileInputStream;
3
9
  import java.sql.Connection;
4
- import java.sql.ResultSet;
5
10
  import java.sql.SQLException;
6
11
  import java.sql.Statement;
7
12
 
8
- import org.embulk.output.jdbc.JdbcOutputConnection;
9
- import org.embulk.output.jdbc.JdbcUtils;
10
- import org.embulk.output.jdbc.TableIdentifier;
11
-
12
- public class SnowflakeOutputConnection
13
- extends JdbcOutputConnection
14
- {
15
- public SnowflakeOutputConnection(Connection connection, String schemaName)
16
- throws SQLException
13
+ public class SnowflakeOutputConnection extends JdbcOutputConnection {
14
+ public SnowflakeOutputConnection(Connection connection)
15
+ throws SQLException
17
16
  {
18
- super(connection, schemaName);
17
+ super(connection, null);
19
18
  }
20
19
 
21
- @Override
22
- public boolean tableExists(TableIdentifier table) throws SQLException
20
+ public void runCopy(TableIdentifier tableIdentifier, StageIdentifier stageIdentifier, String filename, String delimiterString) throws SQLException
23
21
  {
24
- String schemaName = JdbcUtils.escapeSearchString(table.getSchemaName(), connection.getMetaData().getSearchStringEscape());
25
- String database = connection.getCatalog();
26
- try (ResultSet rs = connection.getMetaData().getTables(database, schemaName, table.getTableName(), null)) {
27
- return rs.next();
28
- }
22
+ String sql = buildCopySQL(tableIdentifier, stageIdentifier, filename, delimiterString);
23
+ runUpdate(sql);
24
+ }
25
+ public void runCreateStage(StageIdentifier stageIdentifier) throws SQLException {
26
+ String sql = buildCreateStageSQL(stageIdentifier);
27
+ runUpdate(sql);
29
28
  }
30
29
 
31
- @Override
32
- public boolean tableExists(String tableName) throws SQLException
33
- {
34
- return tableExists(new TableIdentifier(connection.getCatalog(), schemaName, tableName));
30
+ public void runDropStage(StageIdentifier stageIdentifier) throws SQLException {
31
+ String sql = buildDropStageSQL(stageIdentifier);
32
+ runUpdate(sql);
35
33
  }
36
34
 
37
- @Override
38
- protected void setSearchPath(String schema) throws SQLException
35
+ public void runUploadFile(StageIdentifier stageIdentifier, String filename ,FileInputStream fileInputStream) throws SQLException{
36
+ connection.unwrap(SnowflakeConnection.class).uploadStream(stageIdentifier.getStageName(), stageIdentifier.getDestPrefix().or("/"),
37
+ fileInputStream, filename + ".csv.gz", false);
38
+ }
39
+
40
+ public void runDeleteStageFile(StageIdentifier stageIdentifier, String filename) throws SQLException{
41
+ String sql = buildDeleteStageFileSQL(stageIdentifier, filename);
42
+ runUpdate(sql);
43
+ }
44
+
45
+ protected void runUpdate(String sql) throws SQLException
39
46
  {
40
47
  Statement stmt = connection.createStatement();
41
48
  try {
42
- String sql = "USE SCHEMA " + quoteIdentifierString(schema);
43
- executeUpdate(stmt, sql);
44
- commitIfNecessary(connection);
49
+ stmt.executeUpdate(sql);
45
50
  } finally {
46
51
  stmt.close();
47
52
  }
48
53
  }
49
54
 
55
+ @Override
56
+ protected String buildColumnTypeName(JdbcColumn c) {
57
+ switch(c.getSimpleTypeName()) {
58
+ case "CLOB":
59
+ return "VARCHAR(65535)";
60
+ default:
61
+ return super.buildColumnTypeName(c);
62
+ }
63
+ }
64
+
65
+ protected String buildCreateStageSQL(StageIdentifier stageIdentifier){
66
+ StringBuilder sb = new StringBuilder();
67
+ sb.append("CREATE STAGE IF NOT EXISTS ");
68
+ quoteStageIdentifier(sb, stageIdentifier);
69
+ sb.append(";");
70
+ return sb.toString();
71
+ }
72
+
73
+ protected String buildDropStageSQL(StageIdentifier stageIdentifier){
74
+ StringBuilder sb = new StringBuilder();
75
+ sb.append("DROP STAGE ");
76
+ quoteStageIdentifier(sb, stageIdentifier);
77
+ sb.append(";");
78
+ return sb.toString();
79
+ }
50
80
 
81
+ protected void quoteStageIdentifier(StringBuilder sb, StageIdentifier stageIdentifier){
82
+ sb.append(stageIdentifier.getDatabase());
83
+ sb.append(".");
84
+ sb.append(stageIdentifier.getSchemaName());
85
+ sb.append(".");
86
+ sb.append(stageIdentifier.getStageName());
87
+ }
88
+
89
+ protected String buildCopySQL(TableIdentifier tableIdentifier, StageIdentifier stageIdentifier, String snowflakeStageFileName, String delimiterString){
90
+ StringBuilder sb = new StringBuilder();
91
+ sb.append("COPY INTO ");
92
+ quoteTableIdentifier(sb, tableIdentifier);
93
+ sb.append(" FROM ");
94
+ quoteInternalStoragePath(sb, stageIdentifier, snowflakeStageFileName);
95
+ sb.append(" FILE_FORMAT = ( TYPE = CSV FIELD_DELIMITER = '");
96
+ sb.append(delimiterString);
97
+ sb.append("');");
98
+ return sb.toString();
99
+ }
100
+
101
+ protected String buildDeleteStageFileSQL(StageIdentifier stageIdentifier, String snowflakeStageFileName){
102
+ StringBuilder sb = new StringBuilder();
103
+ sb.append("REMOVE ");
104
+ quoteInternalStoragePath(sb, stageIdentifier, snowflakeStageFileName);
105
+ sb.append(';');
106
+ return sb.toString();
107
+ }
108
+
109
+ protected String quoteInternalStoragePath(StringBuilder sb, StageIdentifier stageIdentifier,
110
+ String snowflakeStageFileName){
111
+ sb.append("@");
112
+ quoteStageIdentifier(sb, stageIdentifier);
113
+ if (stageIdentifier.getDestPrefix().isPresent()){
114
+ sb.append("/");
115
+ sb.append(stageIdentifier.getDestPrefix().get());
116
+ }
117
+ sb.append("/");
118
+ sb.append(snowflakeStageFileName);
119
+ sb.append(".csv.gz");
120
+ return sb.toString();
121
+ }
51
122
  }