embulk-output-snowflake 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,373 @@
1
+ package org.embulk.output.snowflake;
2
+
3
+ import org.embulk.output.jdbc.BatchInsert;
4
+ import org.embulk.output.jdbc.JdbcOutputConnector;
5
+ import org.embulk.output.jdbc.JdbcSchema;
6
+ import org.embulk.output.jdbc.TableIdentifier;
7
+ import org.embulk.spi.time.Timestamp;
8
+ import org.slf4j.Logger;
9
+
10
+ import java.io.*;
11
+ import java.math.BigDecimal;
12
+ import java.nio.charset.Charset;
13
+ import java.sql.SQLException;
14
+ import java.util.*;
15
+ import java.util.concurrent.*;
16
+ import java.util.zip.GZIPOutputStream;
17
+ import org.slf4j.LoggerFactory;
18
+
19
+ public class SnowflakeCopyBatchInsert implements BatchInsert {
20
+ private final Logger logger = LoggerFactory.getLogger(SnowflakeCopyBatchInsert.class);
21
+ private final JdbcOutputConnector connector;
22
+ protected static final Charset FILE_CHARSET = Charset.forName("UTF-8");
23
+ private final ExecutorService executorService;
24
+ private final StageIdentifier stageIdentifier;
25
+ private final boolean deleteStageFile;
26
+
27
+ protected static final String nullString = "\\N";
28
+ protected static final String newLineString = "\n";
29
+ protected static final String delimiterString = "\t";
30
+
31
+ private SnowflakeOutputConnection connection = null;
32
+ private TableIdentifier tableIdentifier = null;
33
+ protected File currentFile;
34
+ protected BufferedWriter writer;
35
+ protected int index;
36
+ protected int batchRows;
37
+ private long totalRows;
38
+ private int fileCount;
39
+ private List<Future<Void>> uploadAndCopyFutures;
40
+
41
/**
 * Creates a batch inserter and immediately opens the first temporary file.
 *
 * @param connector       factory for the JDBC connections used by prepare() and the background tasks
 * @param stageIdentifier stage that files are uploaded to
 * @param deleteStageFile whether staged files are removed after their COPY task ends
 * @throws IOException if the first temporary file cannot be created or opened
 */
public SnowflakeCopyBatchInsert(JdbcOutputConnector connector, StageIdentifier stageIdentifier,
        boolean deleteStageFile) throws IOException {
    this.index = 0;
    openNewFile();
    this.connector = connector;
    this.stageIdentifier = stageIdentifier;
    this.executorService = Executors.newCachedThreadPool();
    this.deleteStageFile = deleteStageFile;
    // Diamond instead of the raw ArrayList so the list stays typed as List<Future<Void>>.
    this.uploadAndCopyFutures = new ArrayList<>();
}
51
+
52
+ @Override
53
+ public void prepare(TableIdentifier loadTable, JdbcSchema insertSchema) throws SQLException {
54
+ this.connection = (SnowflakeOutputConnection) connector.connect(true);
55
+ this.connection.runCreateStage(stageIdentifier);
56
+ this.tableIdentifier = loadTable;
57
+ }
58
+
59
+ private File createTempFile() throws IOException {
60
+ return File.createTempFile("embulk-output-snowflake-copy-", ".tsv.tmp"); // TODO configurable temporary file path
61
+ }
62
+
63
+ protected File openNewFile() throws IOException {
64
+ File newFile = createTempFile();
65
+ File oldFile = closeCurrentFile();
66
+ this.writer = openWriter(newFile);
67
+ currentFile = newFile;
68
+ return oldFile;
69
+ }
70
+
71
+ protected File closeCurrentFile() throws IOException {
72
+ if (writer != null) {
73
+ writer.close();
74
+ writer = null;
75
+ }
76
+ return currentFile;
77
+ }
78
+
79
+ protected BufferedWriter openWriter(File newFile) throws IOException {
80
+ // Snowflake supports gzip
81
+ return new BufferedWriter(
82
+ new OutputStreamWriter(
83
+ new GZIPOutputStream(new FileOutputStream(newFile)),
84
+ FILE_CHARSET)
85
+ );
86
+ }
87
+
88
+ public int getBatchWeight() {
89
+ long fsize = currentFile.length();
90
+ if (fsize > Integer.MAX_VALUE) {
91
+ return Integer.MAX_VALUE;
92
+ } else {
93
+ return (int) fsize;
94
+ }
95
+ }
96
+
97
+ public void add() throws IOException {
98
+ writer.write(newLineString);
99
+ batchRows++;
100
+ index = 0;
101
+ }
102
+
103
+ private void appendDelimiter() throws IOException {
104
+ if (index != 0) {
105
+ writer.write(delimiterString);
106
+ }
107
+ index++;
108
+ }
109
+
110
+ public void setNull(int sqlType) throws IOException {
111
+ appendDelimiter();
112
+ writer.write(nullString);
113
+ }
114
+
115
+ public void setBoolean(boolean v) throws IOException {
116
+ appendDelimiter();
117
+ writer.write(String.valueOf(v));
118
+ }
119
+
120
+ public void setByte(byte v) throws IOException {
121
+ appendDelimiter();
122
+ setEscapedString(String.valueOf(v));
123
+ }
124
+
125
+ public void setShort(short v) throws IOException {
126
+ appendDelimiter();
127
+ writer.write(String.valueOf(v));
128
+ }
129
+
130
+ public void setInt(int v) throws IOException {
131
+ appendDelimiter();
132
+ writer.write(String.valueOf(v));
133
+ }
134
+
135
+ public void setLong(long v) throws IOException {
136
+ appendDelimiter();
137
+ writer.write(String.valueOf(v));
138
+ }
139
+
140
+ public void setFloat(float v) throws IOException {
141
+ appendDelimiter();
142
+ writer.write(String.valueOf(v));
143
+ }
144
+
145
+ public void setDouble(double v) throws IOException {
146
+ appendDelimiter();
147
+ writer.write(String.valueOf(v));
148
+ }
149
+
150
+ public void setBigDecimal(BigDecimal v) throws IOException {
151
+ appendDelimiter();
152
+ writer.write(String.valueOf(v));
153
+ }
154
+
155
+ public void setString(String v) throws IOException {
156
+ appendDelimiter();
157
+ setEscapedString(v);
158
+ }
159
+
160
+ public void setNString(String v) throws IOException {
161
+ appendDelimiter();
162
+ setEscapedString(v);
163
+ }
164
+
165
+ public void setBytes(byte[] v) throws IOException {
166
+ appendDelimiter();
167
+ setEscapedString(String.valueOf(v));
168
+ }
169
+
170
+ public void setSqlDate(Timestamp v, Calendar cal) throws IOException {
171
+ appendDelimiter();
172
+ cal.setTimeInMillis(v.getEpochSecond() * 1000);
173
+ String f = String.format(Locale.ENGLISH, "%04d-%02d-%02d",
174
+ cal.get(Calendar.YEAR),
175
+ cal.get(Calendar.MONTH) + 1,
176
+ cal.get(Calendar.DAY_OF_MONTH));
177
+ writer.write(f);
178
+ }
179
+
180
+ public void setSqlTime(Timestamp v, Calendar cal) throws IOException {
181
+ appendDelimiter();
182
+ cal.setTimeInMillis(v.getEpochSecond() * 1000);
183
+ String f = String.format(Locale.ENGLISH, "%02d:%02d:%02d.%06d",
184
+ cal.get(Calendar.HOUR_OF_DAY),
185
+ cal.get(Calendar.MINUTE),
186
+ cal.get(Calendar.SECOND),
187
+ v.getNano() / 1000);
188
+ writer.write(f);
189
+ }
190
+
191
+ public void setSqlTimestamp(Timestamp v, Calendar cal) throws IOException {
192
+ appendDelimiter();
193
+ cal.setTimeInMillis(v.getEpochSecond() * 1000);
194
+ int zoneOffset = cal.get(Calendar.ZONE_OFFSET) / 1000 / 60; // zone offset considering DST in minute
195
+ String offset;
196
+ if (zoneOffset >= 0) {
197
+ offset = String.format(Locale.ENGLISH, "+%02d%02d", zoneOffset / 60, zoneOffset % 60);
198
+ } else {
199
+ offset = String.format(Locale.ENGLISH, "-%02d%02d", -zoneOffset / 60, -zoneOffset % 60);
200
+ }
201
+ String f = String.format(Locale.ENGLISH, "%d-%02d-%02d %02d:%02d:%02d.%06d%s",
202
+ cal.get(Calendar.YEAR),
203
+ cal.get(Calendar.MONTH) + 1,
204
+ cal.get(Calendar.DAY_OF_MONTH),
205
+ cal.get(Calendar.HOUR_OF_DAY),
206
+ cal.get(Calendar.MINUTE),
207
+ cal.get(Calendar.SECOND),
208
+ v.getNano() / 1000,
209
+ offset);
210
+ writer.write(f);
211
+ }
212
+
213
+ private void setEscapedString(String v) throws IOException {
214
+ for (char c : v.toCharArray()) {
215
+ writer.write(escape(c));
216
+ }
217
+ }
218
+
219
+ @Override
220
+ public void flush() throws IOException, SQLException {
221
+ File file = closeCurrentFile(); // flush buffered data in writer
222
+
223
+ String snowflakeStageFileName = "embulk_snowflake_" + SnowflakeUtils.randomString(8);
224
+
225
+ UploadTask uploadTask = new UploadTask(file, batchRows, stageIdentifier, snowflakeStageFileName);
226
+ Future<Void> uploadFuture = executorService.submit(uploadTask);
227
+ uploadAndCopyFutures.add(uploadFuture);
228
+
229
+ CopyTask copyTask = new CopyTask(uploadFuture, snowflakeStageFileName);
230
+ uploadAndCopyFutures.add(executorService.submit(copyTask));
231
+
232
+ fileCount++;
233
+ totalRows += batchRows;
234
+ batchRows = 0;
235
+
236
+ openNewFile();
237
+ }
238
+
239
+ public void close() throws IOException, SQLException {
240
+ executorService.shutdownNow();
241
+
242
+ try {
243
+ executorService.awaitTermination(60, TimeUnit.SECONDS);
244
+ } catch (InterruptedException e) {
245
+ }
246
+
247
+ closeCurrentFile().delete();
248
+ if (connection != null) {
249
+ connection.close();
250
+ connection = null;
251
+ }
252
+ }
253
+
254
+ @Override
255
+ public void finish() throws IOException, SQLException
256
+ {
257
+ for (Future<Void> uploadAndCopyFuture : uploadAndCopyFutures) {
258
+ try {
259
+ uploadAndCopyFuture.get();
260
+
261
+ } catch (InterruptedException e) {
262
+ throw new RuntimeException(e);
263
+ } catch (ExecutionException e) {
264
+ if (e.getCause() instanceof SQLException) {
265
+ throw (SQLException)e.getCause();
266
+ }
267
+ throw new RuntimeException(e);
268
+ }
269
+ }
270
+
271
+ logger.info("Loaded {} files.", fileCount);
272
+ }
273
+
274
+ @Override
275
+ public int[] getLastUpdateCounts()
276
+ {
277
+ // need not be implemented because SnowflakeCopyBatchInsert won't retry.
278
+ return new int[]{};
279
+ }
280
+
281
+ // Escape \, \n, \t, \r
282
+ // Remove \0
283
+ protected String escape(char c) {
284
+ switch (c) {
285
+ case '\\':
286
+ return "\\\\";
287
+ case '\n':
288
+ return "\\n";
289
+ case '\t':
290
+ return "\\t";
291
+ case '\r':
292
+ return "\\r";
293
+ case 0:
294
+ return "";
295
+ default:
296
+ return String.valueOf(c);
297
+ }
298
+ }
299
+
300
+ private class UploadTask implements Callable<Void> {
301
+ private final File file;
302
+ private final int batchRows;
303
+ private final String snowflakeStageFileName;
304
+ private final StageIdentifier stageIdentifier;
305
+
306
+ public UploadTask(File file, int batchRows, StageIdentifier stageIdentifier, String snowflakeStageFileName) {
307
+ this.file = file;
308
+ this.batchRows = batchRows;
309
+ this.snowflakeStageFileName = snowflakeStageFileName;
310
+ this.stageIdentifier = stageIdentifier;
311
+ }
312
+
313
+ public Void call() throws IOException, SQLException {
314
+ logger.info(String.format("Uploading file id %s to Snowflake (%,d bytes %,d rows)",
315
+ snowflakeStageFileName, file.length(), batchRows));
316
+
317
+ try {
318
+ long startTime = System.currentTimeMillis();
319
+ // put file to snowflake internal storage
320
+ SnowflakeOutputConnection con = (SnowflakeOutputConnection) connector.connect(true);
321
+
322
+ FileInputStream fileInputStream = new FileInputStream(file);
323
+ con.runUploadFile(stageIdentifier, snowflakeStageFileName, fileInputStream);
324
+
325
+ double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
326
+
327
+ logger.info(String.format("Uploaded file %s (%.2f seconds)", snowflakeStageFileName, seconds));
328
+ } finally {
329
+ file.delete();
330
+ }
331
+
332
+ return null;
333
+ }
334
+ }
335
+
336
+
337
+ private class CopyTask implements Callable<Void> {
338
+ private final Future<Void> uploadFuture;
339
+ private final String snowflakeStageFileName;
340
+
341
+ public CopyTask(Future<Void> uploadFuture, String snowflakeStageFileName) {
342
+ this.uploadFuture = uploadFuture;
343
+ this.snowflakeStageFileName = snowflakeStageFileName;
344
+ }
345
+
346
+ public Void call() throws SQLException, InterruptedException, ExecutionException {
347
+ try {
348
+ uploadFuture.get();
349
+
350
+ SnowflakeOutputConnection con = (SnowflakeOutputConnection) connector.connect(true);
351
+ try {
352
+ logger.info("Running COPY from file {}", snowflakeStageFileName);
353
+
354
+ long startTime = System.currentTimeMillis();
355
+ con.runCopy(tableIdentifier, stageIdentifier, snowflakeStageFileName, delimiterString);
356
+
357
+ double seconds = (System.currentTimeMillis() - startTime) / 1000.0;
358
+
359
+ logger.info(String.format("Loaded file %s (%.2f seconds for COPY)", snowflakeStageFileName, seconds));
360
+
361
+ } finally {
362
+ con.close();
363
+ }
364
+ } finally {
365
+ if (deleteStageFile) {
366
+ connection.runDeleteStageFile(stageIdentifier, snowflakeStageFileName);
367
+ }
368
+ }
369
+
370
+ return null;
371
+ }
372
+ }
373
+ }
@@ -1,51 +1,122 @@
1
1
  package org.embulk.output.snowflake;
2
2
 
3
+ import net.snowflake.client.jdbc.SnowflakeConnection;
4
+ import org.embulk.output.jdbc.JdbcColumn;
5
+ import org.embulk.output.jdbc.JdbcOutputConnection;
6
+ import org.embulk.output.jdbc.TableIdentifier;
7
+
8
+ import java.io.FileInputStream;
3
9
  import java.sql.Connection;
4
- import java.sql.ResultSet;
5
10
  import java.sql.SQLException;
6
11
  import java.sql.Statement;
7
12
 
8
- import org.embulk.output.jdbc.JdbcOutputConnection;
9
- import org.embulk.output.jdbc.JdbcUtils;
10
- import org.embulk.output.jdbc.TableIdentifier;
11
-
12
- public class SnowflakeOutputConnection
13
- extends JdbcOutputConnection
14
- {
15
- public SnowflakeOutputConnection(Connection connection, String schemaName)
16
- throws SQLException
13
+ public class SnowflakeOutputConnection extends JdbcOutputConnection {
14
/**
 * Wraps an open JDBC connection; no schema name is passed to the base class.
 *
 * @param connection open JDBC connection to wrap
 * @throws SQLException if the base class constructor fails
 */
public SnowflakeOutputConnection(Connection connection) throws SQLException {
    super(connection, null);
}
20
19
 
21
- @Override
22
- public boolean tableExists(TableIdentifier table) throws SQLException
20
+ public void runCopy(TableIdentifier tableIdentifier, StageIdentifier stageIdentifier, String filename, String delimiterString) throws SQLException
23
21
  {
24
- String schemaName = JdbcUtils.escapeSearchString(table.getSchemaName(), connection.getMetaData().getSearchStringEscape());
25
- String database = connection.getCatalog();
26
- try (ResultSet rs = connection.getMetaData().getTables(database, schemaName, table.getTableName(), null)) {
27
- return rs.next();
28
- }
22
+ String sql = buildCopySQL(tableIdentifier, stageIdentifier, filename, delimiterString);
23
+ runUpdate(sql);
24
+ }
25
+ public void runCreateStage(StageIdentifier stageIdentifier) throws SQLException {
26
+ String sql = buildCreateStageSQL(stageIdentifier);
27
+ runUpdate(sql);
29
28
  }
30
29
 
31
- @Override
32
- public boolean tableExists(String tableName) throws SQLException
33
- {
34
- return tableExists(new TableIdentifier(connection.getCatalog(), schemaName, tableName));
30
+ public void runDropStage(StageIdentifier stageIdentifier) throws SQLException {
31
+ String sql = buildDropStageSQL(stageIdentifier);
32
+ runUpdate(sql);
35
33
  }
36
34
 
37
- @Override
38
- protected void setSearchPath(String schema) throws SQLException
35
+ public void runUploadFile(StageIdentifier stageIdentifier, String filename ,FileInputStream fileInputStream) throws SQLException{
36
+ connection.unwrap(SnowflakeConnection.class).uploadStream(stageIdentifier.getStageName(), stageIdentifier.getDestPrefix().or("/"),
37
+ fileInputStream, filename + ".csv.gz", false);
38
+ }
39
+
40
+ public void runDeleteStageFile(StageIdentifier stageIdentifier, String filename) throws SQLException{
41
+ String sql = buildDeleteStageFileSQL(stageIdentifier, filename);
42
+ runUpdate(sql);
43
+ }
44
+
45
+ protected void runUpdate(String sql) throws SQLException
39
46
  {
40
47
  Statement stmt = connection.createStatement();
41
48
  try {
42
- String sql = "USE SCHEMA " + quoteIdentifierString(schema);
43
- executeUpdate(stmt, sql);
44
- commitIfNecessary(connection);
49
+ stmt.executeUpdate(sql);
45
50
  } finally {
46
51
  stmt.close();
47
52
  }
48
53
  }
49
54
 
55
+ @Override
56
+ protected String buildColumnTypeName(JdbcColumn c) {
57
+ switch(c.getSimpleTypeName()) {
58
+ case "CLOB":
59
+ return "VARCHAR(65535)";
60
+ default:
61
+ return super.buildColumnTypeName(c);
62
+ }
63
+ }
64
+
65
+ protected String buildCreateStageSQL(StageIdentifier stageIdentifier){
66
+ StringBuilder sb = new StringBuilder();
67
+ sb.append("CREATE STAGE IF NOT EXISTS ");
68
+ quoteStageIdentifier(sb, stageIdentifier);
69
+ sb.append(";");
70
+ return sb.toString();
71
+ }
72
+
73
+ protected String buildDropStageSQL(StageIdentifier stageIdentifier){
74
+ StringBuilder sb = new StringBuilder();
75
+ sb.append("DROP STAGE ");
76
+ quoteStageIdentifier(sb, stageIdentifier);
77
+ sb.append(";");
78
+ return sb.toString();
79
+ }
50
80
 
81
+ protected void quoteStageIdentifier(StringBuilder sb, StageIdentifier stageIdentifier){
82
+ sb.append(stageIdentifier.getDatabase());
83
+ sb.append(".");
84
+ sb.append(stageIdentifier.getSchemaName());
85
+ sb.append(".");
86
+ sb.append(stageIdentifier.getStageName());
87
+ }
88
+
89
+ protected String buildCopySQL(TableIdentifier tableIdentifier, StageIdentifier stageIdentifier, String snowflakeStageFileName, String delimiterString){
90
+ StringBuilder sb = new StringBuilder();
91
+ sb.append("COPY INTO ");
92
+ quoteTableIdentifier(sb, tableIdentifier);
93
+ sb.append(" FROM ");
94
+ quoteInternalStoragePath(sb, stageIdentifier, snowflakeStageFileName);
95
+ sb.append(" FILE_FORMAT = ( TYPE = CSV FIELD_DELIMITER = '");
96
+ sb.append(delimiterString);
97
+ sb.append("');");
98
+ return sb.toString();
99
+ }
100
+
101
+ protected String buildDeleteStageFileSQL(StageIdentifier stageIdentifier, String snowflakeStageFileName){
102
+ StringBuilder sb = new StringBuilder();
103
+ sb.append("REMOVE ");
104
+ quoteInternalStoragePath(sb, stageIdentifier, snowflakeStageFileName);
105
+ sb.append(';');
106
+ return sb.toString();
107
+ }
108
+
109
+ protected String quoteInternalStoragePath(StringBuilder sb, StageIdentifier stageIdentifier,
110
+ String snowflakeStageFileName){
111
+ sb.append("@");
112
+ quoteStageIdentifier(sb, stageIdentifier);
113
+ if (stageIdentifier.getDestPrefix().isPresent()){
114
+ sb.append("/");
115
+ sb.append(stageIdentifier.getDestPrefix().get());
116
+ }
117
+ sb.append("/");
118
+ sb.append(snowflakeStageFileName);
119
+ sb.append(".csv.gz");
120
+ return sb.toString();
121
+ }
51
122
  }