embulk-input-bigquery_extract_files 0.0.7 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,495 +1,519 @@
1
- package org.embulk.input.bigquery_export_gcs;
2
-
3
- import java.io.File;
4
- import java.io.FileInputStream;
5
- import java.io.FileNotFoundException;
6
- import java.io.FileOutputStream;
7
- import java.io.IOException;
8
- import java.io.InputStream;
9
- import java.nio.file.FileSystems;
10
- import java.nio.file.Path;
11
- import java.util.Date;
12
- import java.util.List;
13
- import java.util.UUID;
14
- import java.util.regex.Matcher;
15
- import java.util.regex.Pattern;
16
-
17
- import org.apache.commons.lang3.StringUtils;
18
- import org.apache.commons.lang3.time.FastDateFormat;
19
- import org.embulk.input.bigquery_export_gcs.BigqueryExportGcsFileInputPlugin.PluginTask;
20
- import org.embulk.spi.Exec;
21
- import org.embulk.spi.Schema;
22
- import org.embulk.spi.type.Types;
23
- import org.slf4j.Logger;
24
-
25
- import com.fasterxml.jackson.core.JsonProcessingException;
26
- import com.fasterxml.jackson.databind.ObjectMapper;
27
- import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
28
- import com.google.api.client.googleapis.json.GoogleJsonResponseException;
29
- import com.google.api.client.http.HttpTransport;
30
- import com.google.api.client.http.javanet.NetHttpTransport;
31
- import com.google.api.client.json.JsonFactory;
32
- import com.google.api.client.json.jackson2.JacksonFactory;
33
- import com.google.api.client.repackaged.com.google.common.base.Strings;
34
- import com.google.api.services.bigquery.Bigquery;
35
- import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
36
- import com.google.api.services.bigquery.Bigquery.Tables.Delete;
37
- import com.google.api.services.bigquery.BigqueryScopes;
38
- import com.google.api.services.bigquery.model.Job;
39
- import com.google.api.services.bigquery.model.JobConfiguration;
40
- import com.google.api.services.bigquery.model.JobConfigurationExtract;
41
- import com.google.api.services.bigquery.model.JobConfigurationQuery;
42
- import com.google.api.services.bigquery.model.JobReference;
43
- import com.google.api.services.bigquery.model.Table;
44
- import com.google.api.services.bigquery.model.TableFieldSchema;
45
- import com.google.api.services.bigquery.model.TableReference;
46
- import com.google.api.services.bigquery.model.TableSchema;
47
- import com.google.api.services.storage.Storage;
48
- import com.google.api.services.storage.StorageScopes;
49
- import com.google.api.services.storage.model.Bucket;
50
- import com.google.api.services.storage.model.Objects;
51
- import com.google.api.services.storage.model.StorageObject;
52
- import com.google.common.base.Optional;
53
- import com.google.common.collect.ImmutableList;
54
- import com.google.common.collect.Lists;
55
-
56
- import io.airlift.slice.RuntimeIOException;
57
-
58
- /**
59
- *
60
- *
61
- *
62
- * #reference :
63
- *
64
- * # https://github.com/embulk/embulk
65
- * # https://github.com/embulk/embulk-input-s3
66
- * # https://github.com/embulk/embulk-input-gcs
67
- * # https://github.com/embulk/embulk-input-jdbc
68
- * # https://github.com/GoogleCloudPlatform/java-docs-samples/blob/master/storage/json-api/src/main/java/StorageSample.java
69
- *
70
- *
71
- * @author george 2017. 11. 16.
72
- *
73
- */
74
- public class BigqueryExportUtils
75
- {
76
- private static final Logger log = Exec.getLogger(BigqueryExportUtils.class);
77
-
78
-
79
- public static String parseQueryToBaseTableName(String query){
80
- if( query == null){
81
- return null;
82
- }
83
-
84
- Pattern p = Pattern.compile(" from [\\[]?([^ \\$\\[\\]]+)[\\]]?", Pattern.CASE_INSENSITIVE);
85
- Matcher m = p.matcher(query);
86
- if(m.find() && m.groupCount() > 0){
87
- return Strings.nullToEmpty(m.group(1)).replaceAll(".*\\.","").replaceAll("[^\\w\\s]","");
88
- }else{
89
- return null;
90
- }
91
- }
92
-
93
- public static String generateTempTableName(String query){
94
- return generateTempTableName(query, null);
95
- }
96
-
97
- public static String generateTempTableName(String query, String tablename){
98
-
99
- String tname = tablename;
100
-
101
- if (tname == null){
102
- tname = parseQueryToBaseTableName(query);
103
- if(tname == null){
104
- tname = "temp";
105
- }
106
- }
107
-
108
- return "embulk_" + tname + "_" + FastDateFormat.getInstance("yyyyMMdd_HHmmss").format(new Date()) + "_" + UUID.randomUUID().toString().replaceAll("-", "");
109
- }
110
-
111
-
112
- public static void executeQueryToDestinationWorkTable(Bigquery bigquery, PluginTask task) throws IOException, InterruptedException {
113
-
114
- log.info("execute Query to Table ");
115
- log.info("# Query # {}",task.getQuery().get());
116
- log.info("# Table # {}.{} ",task.getWorkDataset(), task.getWorkTable());
117
-
118
- JobConfigurationQuery queryConfig = new JobConfigurationQuery();
119
- queryConfig.setQuery(task.getQuery().get());
120
- queryConfig.setDestinationTable(new TableReference()
121
- .setProjectId(task.getProject())
122
- .setDatasetId(task.getWorkDataset())
123
- .setTableId(task.getWorkTable()));
124
- queryConfig.setUseLegacySql(task.getUseLegacySql());
125
- queryConfig.setCreateDisposition(task.getCreateDisposition());
126
- queryConfig.setWriteDisposition(task.getWriteDisposition());
127
- queryConfig.setUseQueryCache(task.getQueryCache());
128
- queryConfig.setAllowLargeResults(true);
129
-
130
- com.google.api.services.bigquery.Bigquery.Jobs.Insert insert = bigquery.jobs().insert(task.getProject(),
131
- new Job().setConfiguration(new JobConfiguration().setQuery(queryConfig))
132
- );
133
- Job jobRes = insert.execute(); // ~~~~~~~~~~~~~~~~~~~~~ API CALL
134
-
135
- JobReference jobRef = jobRes.getJobReference();
136
- String jobId = jobRef.getJobId();
137
-
138
- log.info("query to Table jobId : {} : waiting for job end...",jobId);
139
-
140
- Job lastJob = waitForJob(bigquery, task.getProject(), jobId, task.getBigqueryJobWaitingSecond().get());
141
-
142
- log.debug("waiting for job end....... {}", lastJob.toPrettyString());
143
- }
144
-
145
- public static void parseGcsUri(PluginTask task){
146
-
147
- if(StringUtils.isEmpty(task.getGcsUri()) || false == task.getGcsUri().matches("gs://[^/]+/.+") ){
148
- throw new RuntimeException("gcs_uri not found : " + task.getGcsUri());
149
- }
150
-
151
- task.setGcsBucket(task.getGcsUri().replaceAll("gs://([^/]+)/.+", "$1"));
152
- task.setGcsBlobNamePrefix(task.getGcsUri().replaceAll("gs://[^/]+/(.+)", "$1").replaceAll("[\\*]*$", ""));
153
-
154
- }
155
-
156
-
157
- /***
158
- *
159
- * google cloud sdk
160
- *
161
- * @param task
162
- * @throws IOException
163
- * @throws FileNotFoundException
164
- */
165
- public static Bigquery newBigqueryClient(PluginTask task){
166
- log.debug("# Starting Google BigQuery API ... ");
167
- try {
168
- GoogleCredentialSet set = googleCredential(task);
169
- return new Bigquery.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquey-export-gcs").build();
170
- } catch (Exception e) {
171
- throw new RuntimeException("bigquery connect fail",e);
172
- }
173
-
174
- }
175
-
176
- public static Storage newGcsClient(PluginTask task) throws FileNotFoundException, IOException{
177
- log.debug("# Starting Google Cloud Storage ... ");
178
- GoogleCredentialSet set = googleCredential(task);
179
- return new Storage.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquey-export-gcs").build();
180
- }
181
-
182
-
183
- public static class GoogleCredentialSet {
184
- public GoogleCredential googleCredential = null;
185
- public HttpTransport transport = new NetHttpTransport();
186
- public JsonFactory jsonFactory = new JacksonFactory();
187
- }
188
-
189
- public static GoogleCredentialSet googleCredential(PluginTask task) throws IOException {
190
- GoogleCredentialSet ret = new GoogleCredentialSet();
191
-
192
- log.debug("### init googleCredentialFile : {} ",task.getJsonKeyfile());
193
-
194
- ret.transport = new NetHttpTransport();
195
- ret.jsonFactory = new JacksonFactory();
196
-
197
- GoogleCredential credential = GoogleCredential.fromStream(new FileInputStream( task.getJsonKeyfile() ), ret.transport, ret.jsonFactory);
198
- if (credential.createScopedRequired()) {
199
- credential = credential.createScoped(BigqueryScopes.all()).createScoped(StorageScopes.all());
200
- }
201
- ret.googleCredential = credential;
202
- return ret;
203
- }
204
-
205
-
206
- public static List<String> getFileListFromGcs(PluginTask task) throws FileNotFoundException, IOException{
207
- Storage gcs = newGcsClient(task);
208
- return getFileListFromGcs(gcs, task.getGcsBucket(), task.getGcsBlobNamePrefix());
209
- }
210
-
211
- public static List<String> getFileListFromGcs(Storage gcs, String bucket, String blobName) throws IOException{
212
- ImmutableList.Builder<String> builder = ImmutableList.builder();
213
- Storage.Objects.List listRequest = gcs.objects().list(bucket).setPrefix(blobName);
214
- Objects objects;
215
-
216
- do {
217
- objects = listRequest.execute();
218
- if(objects.getItems() == null){
219
- log.error("file not found in gs://{}/{}",bucket,blobName);
220
- return builder.build();
221
- }
222
- for(StorageObject obj : objects.getItems()){
223
- builder.add(obj.getName());
224
- }
225
- listRequest.setPageToken(objects.getNextPageToken());
226
- } while (null != objects.getNextPageToken());
227
-
228
- return builder.build().asList();
229
- }
230
-
231
- public static final String TYPE_INTEGER = "INTEGER";
232
- public static final String TYPE_STRING = "STRING";
233
- public static final String TYPE_FLOAT = "FLOAT";
234
- public static final String TYPE_TIMESTAMP = "TIMESTAMP";
235
-
236
- public static Schema convertTableSchemaToEmbulkSchema(Table table){
237
- Schema.Builder builder = Schema.builder();
238
- TableSchema ts = table.getSchema();
239
- for( TableFieldSchema field : ts.getFields() ){
240
- String name = field.getName();
241
- org.embulk.spi.type.Type type = Types.JSON;
242
- switch(field.getType()){
243
- case "INTEGER":
244
- builder.add(name, Types.LONG);
245
- break;
246
- case "FLOAT":
247
- builder.add(name, Types.DOUBLE);
248
- break;
249
- case "TIMESTAMP":
250
- builder.add(name, Types.TIMESTAMP);
251
- break;
252
- default:
253
- builder.add(name, Types.STRING);
254
- break;
255
- }
256
- }
257
- return builder.build();
258
- }
259
-
260
- public static PHASE initTask(PluginTask task) {
261
-
262
- if(task.getQuery().isPresent()){
263
- task.setWorkId(generateTempTableName(task.getQuery().get()));
264
-
265
- if(task.getTempTable().isPresent() == false){
266
- task.setTempTable(Optional.of(task.getWorkId()));
267
- }
268
- if(task.getTempDataset().isPresent() == false && task.getDataset().isPresent()){
269
- task.setTempDataset(Optional.of(task.getDataset().get()));
270
- }
271
-
272
- // actual target table setting
273
- task.setWorkDataset(task.getTempDataset().get());
274
- task.setWorkTable(task.getTempTable().get());
275
-
276
- return PHASE.QUERY;
277
- }else if(task.getTable().isPresent() && task.getDataset().isPresent()){
278
- task.setWorkId(generateTempTableName(null, task.getTable().get()));
279
- // actual target table setting
280
- task.setWorkDataset(task.getDataset().get());
281
- task.setWorkTable(task.getTable().get());
282
-
283
- return PHASE.TABLE;
284
-
285
- }else{
286
- throw new RuntimeException("please set config file [dataset]+[table] or [query]");
287
- }
288
- }
289
-
290
- public static Schema extractWorkTable(Bigquery bigquery, PluginTask task) throws FileNotFoundException, IOException, InterruptedException{
291
-
292
- Table table = bigquery.tables().get(task.getProject(), task.getWorkDataset(), task.getWorkTable()).execute();
293
-
294
- Schema embulkSchema = convertTableSchemaToEmbulkSchema(table);
295
- //task.setSchame(embulkSchema);
296
- log.debug("Table Schema : {}", table.getSchema());
297
-
298
- //Tabledata. req = bigquery.tabledata().list(projectId, dataset, table);
299
-
300
- log.info("start table extract [{}.{}] to {} ...", task.getWorkDataset(), task.getWorkTable(), task.getGcsUri());
301
-
302
- Job jobReq = new Job();
303
- JobConfigurationExtract extract = new JobConfigurationExtract();
304
- extract.setDestinationFormat(task.getFileFormat().get());
305
- extract.setCompression(task.getCompression().get());
306
- extract.setDestinationUris(Lists.newArrayList(task.getGcsUri()));
307
- extract.setSourceTable(table.getTableReference());
308
- jobReq.setConfiguration(new JobConfiguration().setExtract(extract));
309
-
310
- Insert jobInsert = bigquery.jobs().insert(task.getProject(), jobReq);
311
- Job res = jobInsert.execute();
312
-
313
- JobReference jobRef = res.getJobReference();
314
- String jobId = jobRef.getJobId();
315
- log.info("extract jobId : {}",jobId);
316
- log.debug("waiting for job end....... ");
317
-
318
- Job lastJob = waitForJob(bigquery, task.getProject(), jobId, task.getBigqueryJobWaitingSecond().get());
319
-
320
- log.info("table extract result : {}",lastJob.toPrettyString());
321
-
322
- return embulkSchema;
323
- }
324
-
325
- public static Job waitForJob(Bigquery bigquery, String project, String jobId, int bigqueryJobWaitingSecond) throws IOException, InterruptedException{
326
- int maxAttempts = bigqueryJobWaitingSecond;
327
- int initialRetryDelay = 1000; // ms
328
- Job pollingJob = null;
329
- log.info("waiting for job end : {}",jobId);
330
- int tryCnt = 0;
331
- for (tryCnt=0; tryCnt < maxAttempts; tryCnt++){
332
- pollingJob = bigquery.jobs().get(project, jobId).execute();
333
- String state = pollingJob.getStatus().getState();
334
- log.debug("Job Status {} : {}",jobId, state);
335
-
336
- if (pollingJob.getStatus().getState().equals("DONE")) {
337
- break;
338
- }
339
- log.info("waiting {} ... ",tryCnt);
340
- Thread.sleep(initialRetryDelay);
341
- }
342
- if(tryCnt + 1 == maxAttempts){
343
- log.error("Bigquery Job Waiting exceed : over {} second...", bigqueryJobWaitingSecond);
344
- }
345
-
346
- return pollingJob;
347
- }
348
-
349
- public static Schema predictSchema(Bigquery bigquery){
350
- Schema schema = Schema.builder().add("", org.embulk.spi.type.Types.LONG).build();
351
- return schema;
352
- }
353
-
354
- /**
355
- *
356
- * https://github.com/google/google-api-java-client-samples/blob/master/storage-cmdline-sample/src/main/java/com/google/api/services/samples/storage/examples/ObjectsDownloadExample.java
357
- *
358
- */
359
- public static InputStream openInputStream(PluginTask task, String file)
360
- {
361
- try {
362
-
363
-
364
- Storage gcs = newGcsClient(task);
365
-
366
-
367
- Path fullLocalFilePath = getFullPath(task, file);
368
-
369
- log.info("Start download : gs://{}/{} ...to ... {} ",task.getGcsBucket(), file, task.getTempLocalPath());
370
-
371
- Storage.Objects.Get getObject = gcs.objects().get(task.getGcsBucket(), file);
372
- getObject.getMediaHttpDownloader().setDirectDownloadEnabled(true);
373
-
374
- // return getObject.executeMediaAsInputStream() // direct InputStream ?? I Think this is faster then temp file. but ...
375
-
376
- try(FileOutputStream s = new FileOutputStream(fullLocalFilePath.toFile())){
377
- getObject.executeMediaAndDownloadTo(s);
378
- }
379
- return new FileInputStream(fullLocalFilePath.toFile());
380
-
381
- } catch (FileNotFoundException e) {
382
- log.error("gcs file not found error",e);
383
- return null;
384
- } catch(IOException e){
385
- log.error("gcs file read error",e);
386
- return null;
387
- }
388
- }
389
-
390
-
391
- public static Path getFullPath(PluginTask task, String file){
392
- String baseName = file.replaceFirst(".*/", "");
393
- Path fullLocalFilePath = FileSystems.getDefault().getPath(task.getTempLocalPath(), baseName);
394
- return fullLocalFilePath ;
395
- }
396
-
397
- public enum SCHEMA_TYPE{
398
- EMBULK,
399
- AVRO
400
- }
401
-
402
- public static Schema decnodeSchemaJson(String json) {
403
- ObjectMapper mapper = new ObjectMapper();
404
- try {
405
- Schema schema = mapper.readValue(json, Schema.class);
406
- return schema;
407
- } catch (Exception e) {
408
- log.error("error when parse schema object : " + json,e);
409
- return null;
410
- }
411
- }
412
-
413
- public static void writeSchemaFile(Schema schema, String schemaType, File file) {
414
- ObjectMapper mapper = new ObjectMapper();
415
- try {
416
- mapper.writeValue(file, schema);
417
- } catch (Exception e) {
418
- log.error("error when create schema json {}",file);
419
- throw new RuntimeException(e);
420
- }
421
- }
422
-
423
- public static String generateSchemaJson(Schema schema, String schemaType) {
424
- SCHEMA_TYPE tp = SCHEMA_TYPE.EMBULK;
425
- if(schemaType != null) {
426
- tp.valueOf(schemaType);
427
- }
428
-
429
- ObjectMapper mapper = new ObjectMapper();
430
- try {
431
- String jsonString = mapper.writeValueAsString(schema);
432
- return jsonString;
433
- } catch (JsonProcessingException e) {
434
- log.error("error when create schema json",e);
435
- return null;
436
- }
437
- //for(Column col : schema.getColumns()) {
438
- }
439
-
440
- public static String toPrettyString(Object obj){
441
- try {
442
- ObjectMapper mapper = new ObjectMapper();
443
- String str = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj);
444
- return str;
445
- } catch (Exception e) {
446
- log.error("JSON format error",e);
447
- return java.util.Objects.toString(obj);
448
- }
449
- }
450
-
451
- /**
452
- *
453
- * @param task
454
- */
455
- public static void removeTempTable(PluginTask task){
456
- try {
457
- log.info("Remove temp table {}.{}",task.getTempDataset().get(), task.getTempTable().get());
458
- Bigquery bigquery = newBigqueryClient(task);
459
- Delete del = bigquery.tables().delete(task.getProject(), task.getTempDataset().get(), task.getTempTable().get());
460
- del.execute();
461
- } catch (Exception e) {
462
- log.error("# Remove temp table FAIL : " + task.getTempDataset().orNull() + "." + task.getTempTable().orNull(),e);
463
- }
464
- }
465
-
466
- public static void removeGcsFilesBeforeExecuting(PluginTask task){
467
- try {
468
- log.info("start cleanup gs://{}/{} ... ",task.getGcsBucket(), task.getGcsBlobNamePrefix());
469
- Storage gcs = BigqueryExportUtils.newGcsClient(task);
470
- List<String> fileList = getFileListFromGcs(gcs, task.getGcsBucket(), task.getGcsBlobNamePrefix());
471
- for(String f : fileList){
472
- log.info("cleanup gs://{}/{} ... ",task.getGcsBucket(), f);
473
- gcs.objects().delete(task.getGcsBucket(), f).execute();
474
- }
475
- } catch (GoogleJsonResponseException e) {
476
- if(e.getStatusCode() == 404){
477
- log.info("file not found in gs://{}/{} :: it's ok ",task.getGcsBucket(), task.getGcsBlobNamePrefix());
478
- }else{
479
- throw new RuntimeException("# Remove GCS files gs://" + task.getGcsBucket() + "/" + task.getGcsBlobNamePrefix(),e);
480
- }
481
- } catch (Exception e) {
482
- throw new RuntimeException("# Remove GCS files gs://" + task.getGcsBucket() + "/" + task.getGcsBlobNamePrefix(),e);
483
- }
484
- }
485
-
486
- public static void removeTempGcsFiles(PluginTask task, String file){
487
- try {
488
- Storage gcs = BigqueryExportUtils.newGcsClient(task);
489
- log.info("delete finish file gs://{}/{}", task.getGcsBucket(), file);
490
- gcs.objects().delete(task.getGcsBucket(), file).execute();
491
- } catch (Exception e) {
492
- log.error("# Remove temp gcs file FAIL : " + file,e);
493
- }
494
- }
495
- }
1
+ package org.embulk.input.bigquery_export_gcs;
2
+
3
+ import java.io.File;
4
+ import java.io.FileInputStream;
5
+ import java.io.FileNotFoundException;
6
+ import java.io.FileOutputStream;
7
+ import java.io.IOException;
8
+ import java.io.InputStream;
9
+ import java.math.BigInteger;
10
+ import java.nio.file.FileSystems;
11
+ import java.nio.file.Path;
12
+ import java.util.Date;
13
+ import java.util.List;
14
+ import java.util.UUID;
15
+ import java.util.regex.Matcher;
16
+ import java.util.regex.Pattern;
17
+
18
+ import org.apache.commons.lang3.StringUtils;
19
+ import org.apache.commons.lang3.time.FastDateFormat;
20
+ import org.embulk.input.bigquery_export_gcs.BigqueryExportGcsFileInputPlugin.PluginTask;
21
+ import org.embulk.spi.ColumnConfig;
22
+ import org.embulk.spi.Exec;
23
+ import org.embulk.spi.Schema;
24
+ import org.embulk.spi.SchemaConfig;
25
+ import org.embulk.spi.type.Types;
26
+ import org.slf4j.Logger;
27
+
28
+ import com.fasterxml.jackson.core.JsonProcessingException;
29
+ import com.fasterxml.jackson.databind.ObjectMapper;
30
+ import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
31
+ import com.google.api.client.googleapis.json.GoogleJsonResponseException;
32
+ import com.google.api.client.http.HttpTransport;
33
+ import com.google.api.client.http.javanet.NetHttpTransport;
34
+ import com.google.api.client.json.JsonFactory;
35
+ import com.google.api.client.json.jackson2.JacksonFactory;
36
+ import com.google.api.client.repackaged.com.google.common.base.Strings;
37
+ import com.google.api.services.bigquery.Bigquery;
38
+ import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
39
+ import com.google.api.services.bigquery.Bigquery.Tabledata;
40
+ import com.google.api.services.bigquery.Bigquery.Tables.Delete;
41
+ import com.google.api.services.bigquery.BigqueryScopes;
42
+ import com.google.api.services.bigquery.model.Job;
43
+ import com.google.api.services.bigquery.model.JobConfiguration;
44
+ import com.google.api.services.bigquery.model.JobConfigurationExtract;
45
+ import com.google.api.services.bigquery.model.JobConfigurationQuery;
46
+ import com.google.api.services.bigquery.model.JobReference;
47
+ import com.google.api.services.bigquery.model.Table;
48
+ import com.google.api.services.bigquery.model.TableDataList;
49
+ import com.google.api.services.bigquery.model.TableFieldSchema;
50
+ import com.google.api.services.bigquery.model.TableReference;
51
+ import com.google.api.services.bigquery.model.TableRow;
52
+ import com.google.api.services.bigquery.model.TableSchema;
53
+ import com.google.api.services.storage.Storage;
54
+ import com.google.api.services.storage.StorageScopes;
55
+ import com.google.api.services.storage.model.Bucket;
56
+ import com.google.api.services.storage.model.Objects;
57
+ import com.google.api.services.storage.model.StorageObject;
58
+ import com.google.common.base.Optional;
59
+ import com.google.common.collect.ImmutableList;
60
+ import com.google.common.collect.Lists;
61
+
62
+ import io.airlift.slice.RuntimeIOException;
63
+
64
+ /**
65
+ *
66
+ *
67
+ *
68
+ * #reference :
69
+ *
70
+ * # https://github.com/embulk/embulk
71
+ * # https://github.com/embulk/embulk-input-s3
72
+ * # https://github.com/embulk/embulk-input-gcs
73
+ * # https://github.com/embulk/embulk-input-jdbc
74
+ * # https://github.com/GoogleCloudPlatform/java-docs-samples/blob/master/storage/json-api/src/main/java/StorageSample.java
75
+ *
76
+ *
77
+ * @author george 2017. 11. 16.
78
+ *
79
+ */
80
+ public class BigqueryExportUtils
81
+ {
82
+ private static final Logger log = Exec.getLogger(BigqueryExportUtils.class);
83
+
84
+
85
+ public static String parseQueryToBaseTableName(String query){
86
+ if( query == null){
87
+ return null;
88
+ }
89
+
90
+ Pattern p = Pattern.compile(" from [\\[]?([^ \\$\\[\\]]+)[\\]]?", Pattern.CASE_INSENSITIVE);
91
+ Matcher m = p.matcher(query);
92
+ if(m.find() && m.groupCount() > 0){
93
+ return Strings.nullToEmpty(m.group(1)).replaceAll(".*\\.","").replaceAll("[^\\w\\s]","");
94
+ }else{
95
+ return null;
96
+ }
97
+ }
98
+
99
+ public static String generateTempTableName(String query){
100
+ return generateTempTableName(query, null);
101
+ }
102
+
103
+ public static String generateTempTableName(String query, String tablename){
104
+
105
+ String tname = tablename;
106
+
107
+ if (tname == null){
108
+ tname = parseQueryToBaseTableName(query);
109
+ if(tname == null){
110
+ tname = "temp";
111
+ }
112
+ }
113
+
114
+ return "embulk_" + tname + "_" + FastDateFormat.getInstance("yyyyMMdd_HHmmss").format(new Date()) + "_" + UUID.randomUUID().toString().replaceAll("-", "");
115
+ }
116
+
117
+
118
+ public static void executeQueryToDestinationWorkTable(Bigquery bigquery, PluginTask task) throws IOException, InterruptedException {
119
+
120
+ log.info("execute Query to Table ");
121
+ log.info("# Query # {}",task.getQuery().get());
122
+ log.info("# Table # {}.{} ",task.getWorkDataset(), task.getWorkTable());
123
+
124
+ JobConfigurationQuery queryConfig = new JobConfigurationQuery();
125
+ queryConfig.setQuery(task.getQuery().get());
126
+ queryConfig.setDestinationTable(new TableReference()
127
+ .setProjectId(task.getProject())
128
+ .setDatasetId(task.getWorkDataset())
129
+ .setTableId(task.getWorkTable()));
130
+ queryConfig.setUseLegacySql(task.getUseLegacySql());
131
+ queryConfig.setCreateDisposition(task.getCreateDisposition());
132
+ queryConfig.setWriteDisposition(task.getWriteDisposition());
133
+ queryConfig.setUseQueryCache(task.getQueryCache());
134
+ queryConfig.setAllowLargeResults(true);
135
+
136
+ com.google.api.services.bigquery.Bigquery.Jobs.Insert insert = bigquery.jobs().insert(task.getProject(),
137
+ new Job().setConfiguration(new JobConfiguration().setQuery(queryConfig))
138
+ );
139
+ Job jobRes = insert.execute(); // ~~~~~~~~~~~~~~~~~~~~~ API CALL
140
+
141
+ JobReference jobRef = jobRes.getJobReference();
142
+ String jobId = jobRef.getJobId();
143
+
144
+ log.info("query to Table jobId : {} : waiting for job end...",jobId);
145
+
146
+ Job lastJob = waitForJob(bigquery, task.getProject(), jobId, task.getBigqueryJobWaitingSecond().get());
147
+
148
+ log.debug("waiting for job end....... {}", lastJob.toPrettyString());
149
+ }
150
+
151
+ public static void parseGcsUri(PluginTask task){
152
+
153
+ if(StringUtils.isEmpty(task.getGcsUri()) || false == task.getGcsUri().matches("gs://[^/]+/.+") ){
154
+ throw new RuntimeException("gcs_uri not found : " + task.getGcsUri());
155
+ }
156
+
157
+ task.setGcsBucket(task.getGcsUri().replaceAll("gs://([^/]+)/.+", "$1"));
158
+ task.setGcsBlobNamePrefix(task.getGcsUri().replaceAll("gs://[^/]+/(.+)", "$1").replaceAll("[\\*]*$", ""));
159
+
160
+ }
161
+
162
+
163
+ /***
164
+ *
165
+ * google cloud sdk
166
+ *
167
+ * @param task
168
+ * @throws IOException
169
+ * @throws FileNotFoundException
170
+ */
171
+ public static Bigquery newBigqueryClient(PluginTask task){
172
+ log.debug("# Starting Google BigQuery API ... ");
173
+ try {
174
+ GoogleCredentialSet set = googleCredential(task);
175
+ return new Bigquery.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquey-export-gcs").build();
176
+ } catch (Exception e) {
177
+ throw new RuntimeException("bigquery connect fail",e);
178
+ }
179
+
180
+ }
181
+
182
+ public static Storage newGcsClient(PluginTask task) throws FileNotFoundException, IOException{
183
+ log.debug("# Starting Google Cloud Storage ... ");
184
+ GoogleCredentialSet set = googleCredential(task);
185
+ return new Storage.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquey-export-gcs").build();
186
+ }
187
+
188
+
189
+ public static class GoogleCredentialSet {
190
+ public GoogleCredential googleCredential = null;
191
+ public HttpTransport transport = new NetHttpTransport();
192
+ public JsonFactory jsonFactory = new JacksonFactory();
193
+ }
194
+
195
+ public static GoogleCredentialSet googleCredential(PluginTask task) throws IOException {
196
+ GoogleCredentialSet ret = new GoogleCredentialSet();
197
+
198
+ log.debug("### init googleCredentialFile : {} ",task.getJsonKeyfile());
199
+
200
+ ret.transport = new NetHttpTransport();
201
+ ret.jsonFactory = new JacksonFactory();
202
+
203
+ GoogleCredential credential = GoogleCredential.fromStream(new FileInputStream( task.getJsonKeyfile() ), ret.transport, ret.jsonFactory);
204
+ if (credential.createScopedRequired()) {
205
+ credential = credential.createScoped(BigqueryScopes.all()).createScoped(StorageScopes.all());
206
+ }
207
+ ret.googleCredential = credential;
208
+ return ret;
209
+ }
210
+
211
+
212
+ public static List<String> getFileListFromGcs(PluginTask task) throws FileNotFoundException, IOException{
213
+ Storage gcs = newGcsClient(task);
214
+ return getFileListFromGcs(gcs, task.getGcsBucket(), task.getGcsBlobNamePrefix());
215
+ }
216
+
217
+ public static List<String> getFileListFromGcs(Storage gcs, String bucket, String blobName) throws IOException{
218
+ ImmutableList.Builder<String> builder = ImmutableList.builder();
219
+ Storage.Objects.List listRequest = gcs.objects().list(bucket).setPrefix(blobName);
220
+ Objects objects;
221
+
222
+ do {
223
+ objects = listRequest.execute();
224
+ if(objects.getItems() == null){
225
+ log.error("file not found in gs://{}/{}",bucket,blobName);
226
+ return builder.build();
227
+ }
228
+ for(StorageObject obj : objects.getItems()){
229
+ builder.add(obj.getName());
230
+ }
231
+ listRequest.setPageToken(objects.getNextPageToken());
232
+ } while (null != objects.getNextPageToken());
233
+
234
+ return builder.build().asList();
235
+ }
236
+
237
+ public static final String TYPE_INTEGER = "INTEGER";
238
+ public static final String TYPE_STRING = "STRING";
239
+ public static final String TYPE_FLOAT = "FLOAT";
240
+ public static final String TYPE_TIMESTAMP = "TIMESTAMP";
241
+
242
+ public static SchemaConfig getSchemaWithGuess(Bigquery bigquery, PluginTask task, Table table, Schema schema) throws IOException{
243
+ List<ColumnConfig> columns = Lists.newArrayList();
244
+
245
+ com.google.api.services.bigquery.Bigquery.Tabledata.List req = bigquery.tabledata().list(task.getProject(), task.getDataset().get(), table.getTableReference().getTableId());
246
+
247
+ req = req.setMaxResults(new Long(1));
248
+
249
+ TableDataList list = req.execute();
250
+
251
+ for(TableRow row : list.getRows()){
252
+ //row.get(name)
253
+ }
254
+ return new SchemaConfig(columns);
255
+ }
256
+
257
+ public static Schema convertTableSchemaToEmbulkSchema(Table table){
258
+ Schema.Builder builder = Schema.builder();
259
+ TableSchema ts = table.getSchema();
260
+ for( TableFieldSchema field : ts.getFields() ){
261
+ String name = field.getName();
262
+ org.embulk.spi.type.Type type = Types.JSON;
263
+ switch(field.getType()){
264
+ case "INTEGER":
265
+ builder.add(name, Types.LONG);
266
+ break;
267
+ case "FLOAT":
268
+ builder.add(name, Types.DOUBLE);
269
+ break;
270
+ case "TIMESTAMP":
271
+ builder.add(name, Types.TIMESTAMP);
272
+ break;
273
+ default:
274
+ builder.add(name, Types.STRING);
275
+ break;
276
+ }
277
+ }
278
+ return builder.build();
279
+ }
280
+
281
+ public static PHASE initTask(PluginTask task) {
282
+
283
+ if(task.getQuery().isPresent()){
284
+ task.setWorkId(generateTempTableName(task.getQuery().get()));
285
+
286
+ if(task.getTempTable().isPresent() == false){
287
+ task.setTempTable(Optional.of(task.getWorkId()));
288
+ }
289
+ if(task.getTempDataset().isPresent() == false && task.getDataset().isPresent()){
290
+ task.setTempDataset(Optional.of(task.getDataset().get()));
291
+ }
292
+
293
+ // actual target table setting
294
+ task.setWorkDataset(task.getTempDataset().get());
295
+ task.setWorkTable(task.getTempTable().get());
296
+
297
+ return PHASE.QUERY;
298
+ }else if(task.getTable().isPresent() && task.getDataset().isPresent()){
299
+ task.setWorkId(generateTempTableName(null, task.getTable().get()));
300
+ // actual target table setting
301
+ task.setWorkDataset(task.getDataset().get());
302
+ task.setWorkTable(task.getTable().get());
303
+
304
+ return PHASE.TABLE;
305
+
306
+ }else{
307
+ throw new RuntimeException("please set config file [dataset]+[table] or [query]");
308
+ }
309
+ }
310
+
311
+ public static Schema extractWorkTable(Bigquery bigquery, PluginTask task) throws FileNotFoundException, IOException, InterruptedException{
312
+
313
+ Table table = bigquery.tables().get(task.getProject(), task.getWorkDataset(), task.getWorkTable()).execute();
314
+
315
+ Schema embulkSchema = convertTableSchemaToEmbulkSchema(table);
316
+
317
+
318
+ //task.setSchame(embulkSchema);
319
+
320
+ log.debug("Table Schema : {}", table.getSchema());
321
+
322
+ //Tabledata. req = bigquery.tabledata().list(projectId, dataset, table);
323
+
324
+ log.info("start table extract [{}.{}] to {} ...", task.getWorkDataset(), task.getWorkTable(), task.getGcsUri());
325
+
326
+ Job jobReq = new Job();
327
+ JobConfigurationExtract extract = new JobConfigurationExtract();
328
+ extract.setDestinationFormat(task.getFileFormat().get());
329
+ extract.setCompression(task.getCompression().get());
330
+ extract.setDestinationUris(Lists.newArrayList(task.getGcsUri()));
331
+ extract.setSourceTable(table.getTableReference());
332
+ jobReq.setConfiguration(new JobConfiguration().setExtract(extract));
333
+
334
+ Insert jobInsert = bigquery.jobs().insert(task.getProject(), jobReq);
335
+ Job res = jobInsert.execute();
336
+
337
+ JobReference jobRef = res.getJobReference();
338
+ String jobId = jobRef.getJobId();
339
+ log.info("extract jobId : {}",jobId);
340
+ log.debug("waiting for job end....... ");
341
+
342
+ Job lastJob = waitForJob(bigquery, task.getProject(), jobId, task.getBigqueryJobWaitingSecond().get());
343
+
344
+ log.info("table extract result : {}",lastJob.toPrettyString());
345
+
346
+ return embulkSchema;
347
+ }
348
+
349
+ public static Job waitForJob(Bigquery bigquery, String project, String jobId, int bigqueryJobWaitingSecond) throws IOException, InterruptedException{
350
+ int maxAttempts = bigqueryJobWaitingSecond;
351
+ int initialRetryDelay = 1000; // ms
352
+ Job pollingJob = null;
353
+ log.info("waiting for job end : {}",jobId);
354
+ int tryCnt = 0;
355
+ for (tryCnt=0; tryCnt < maxAttempts; tryCnt++){
356
+ pollingJob = bigquery.jobs().get(project, jobId).execute();
357
+ String state = pollingJob.getStatus().getState();
358
+ log.debug("Job Status {} : {}",jobId, state);
359
+
360
+ if (pollingJob.getStatus().getState().equals("DONE")) {
361
+ break;
362
+ }
363
+ log.info("waiting {} ... ",tryCnt);
364
+ Thread.sleep(initialRetryDelay);
365
+ }
366
+ if(tryCnt + 1 == maxAttempts){
367
+ log.error("Bigquery Job Waiting exceed : over {} second...", bigqueryJobWaitingSecond);
368
+ }
369
+
370
+ return pollingJob;
371
+ }
372
+
373
+ public static Schema predictSchema(Bigquery bigquery){
374
+ Schema schema = Schema.builder().add("", org.embulk.spi.type.Types.LONG).build();
375
+ return schema;
376
+ }
377
+
378
+ /**
379
+ *
380
+ * https://github.com/google/google-api-java-client-samples/blob/master/storage-cmdline-sample/src/main/java/com/google/api/services/samples/storage/examples/ObjectsDownloadExample.java
381
+ *
382
+ */
383
+ public static InputStream openInputStream(PluginTask task, String file)
384
+ {
385
+ try {
386
+
387
+
388
+ Storage gcs = newGcsClient(task);
389
+
390
+
391
+ Path fullLocalFilePath = getFullPath(task, file);
392
+
393
+ log.info("Start download : gs://{}/{} ...to ... {} ",task.getGcsBucket(), file, task.getTempLocalPath());
394
+
395
+ Storage.Objects.Get getObject = gcs.objects().get(task.getGcsBucket(), file);
396
+ getObject.getMediaHttpDownloader().setDirectDownloadEnabled(true);
397
+
398
+ // return getObject.executeMediaAsInputStream() // direct InputStream ?? I Think this is faster then temp file. but ...
399
+
400
+ try(FileOutputStream s = new FileOutputStream(fullLocalFilePath.toFile())){
401
+ getObject.executeMediaAndDownloadTo(s);
402
+ }
403
+ return new FileInputStream(fullLocalFilePath.toFile());
404
+
405
+ } catch (FileNotFoundException e) {
406
+ log.error("gcs file not found error",e);
407
+ return null;
408
+ } catch(IOException e){
409
+ log.error("gcs file read error",e);
410
+ return null;
411
+ }
412
+ }
413
+
414
+
415
+ public static Path getFullPath(PluginTask task, String file){
416
+ String baseName = file.replaceFirst(".*/", "");
417
+ Path fullLocalFilePath = FileSystems.getDefault().getPath(task.getTempLocalPath(), baseName);
418
+ return fullLocalFilePath ;
419
+ }
420
+
421
/**
 * Flavor of schema serialization: Embulk's own schema JSON, or an
 * Avro-style schema.
 */
public enum SCHEMA_TYPE{
    EMBULK,
    AVRO
}
425
+
426
+ public static Schema decnodeSchemaJson(String json) {
427
+ ObjectMapper mapper = new ObjectMapper();
428
+ try {
429
+ Schema schema = mapper.readValue(json, Schema.class);
430
+ return schema;
431
+ } catch (Exception e) {
432
+ log.error("error when parse schema object : " + json,e);
433
+ return null;
434
+ }
435
+ }
436
+
437
+ public static void writeSchemaFile(Schema schema, String schemaType, File file) {
438
+ ObjectMapper mapper = new ObjectMapper();
439
+ try {
440
+ mapper.writeValue(file, schema);
441
+ } catch (Exception e) {
442
+ log.error("error when create schema json {}",file);
443
+ throw new RuntimeException(e);
444
+ }
445
+ }
446
+
447
+ public static String generateSchemaJson(Schema schema, String schemaType) {
448
+ SCHEMA_TYPE tp = SCHEMA_TYPE.EMBULK;
449
+ if(schemaType != null) {
450
+ tp.valueOf(schemaType);
451
+ }
452
+
453
+ ObjectMapper mapper = new ObjectMapper();
454
+ try {
455
+ String jsonString = mapper.writeValueAsString(schema);
456
+ return jsonString;
457
+ } catch (JsonProcessingException e) {
458
+ log.error("error when create schema json",e);
459
+ return null;
460
+ }
461
+ //for(Column col : schema.getColumns()) {
462
+ }
463
+
464
+ public static String toPrettyString(Object obj){
465
+ try {
466
+ ObjectMapper mapper = new ObjectMapper();
467
+ String str = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj);
468
+ return str;
469
+ } catch (Exception e) {
470
+ log.error("JSON format error",e);
471
+ return java.util.Objects.toString(obj);
472
+ }
473
+ }
474
+
475
+ /**
476
+ *
477
+ * @param task
478
+ */
479
+ public static void removeTempTable(PluginTask task){
480
+ try {
481
+ log.info("Remove temp table {}.{}",task.getTempDataset().get(), task.getTempTable().get());
482
+ Bigquery bigquery = newBigqueryClient(task);
483
+ Delete del = bigquery.tables().delete(task.getProject(), task.getTempDataset().get(), task.getTempTable().get());
484
+ del.execute();
485
+ } catch (Exception e) {
486
+ log.error("# Remove temp table FAIL : " + task.getTempDataset().orNull() + "." + task.getTempTable().orNull(),e);
487
+ }
488
+ }
489
+
490
+ public static void removeGcsFilesBeforeExecuting(PluginTask task){
491
+ try {
492
+ log.info("start cleanup gs://{}/{} ... ",task.getGcsBucket(), task.getGcsBlobNamePrefix());
493
+ Storage gcs = BigqueryExportUtils.newGcsClient(task);
494
+ List<String> fileList = getFileListFromGcs(gcs, task.getGcsBucket(), task.getGcsBlobNamePrefix());
495
+ for(String f : fileList){
496
+ log.info("cleanup gs://{}/{} ... ",task.getGcsBucket(), f);
497
+ gcs.objects().delete(task.getGcsBucket(), f).execute();
498
+ }
499
+ } catch (GoogleJsonResponseException e) {
500
+ if(e.getStatusCode() == 404){
501
+ log.info("file not found in gs://{}/{} :: it's ok ",task.getGcsBucket(), task.getGcsBlobNamePrefix());
502
+ }else{
503
+ throw new RuntimeException("# Remove GCS files gs://" + task.getGcsBucket() + "/" + task.getGcsBlobNamePrefix(),e);
504
+ }
505
+ } catch (Exception e) {
506
+ throw new RuntimeException("# Remove GCS files gs://" + task.getGcsBucket() + "/" + task.getGcsBlobNamePrefix(),e);
507
+ }
508
+ }
509
+
510
+ public static void removeTempGcsFiles(PluginTask task, String file){
511
+ try {
512
+ Storage gcs = BigqueryExportUtils.newGcsClient(task);
513
+ log.info("delete finish file gs://{}/{}", task.getGcsBucket(), file);
514
+ gcs.objects().delete(task.getGcsBucket(), file).execute();
515
+ } catch (Exception e) {
516
+ log.error("# Remove temp gcs file FAIL : " + file,e);
517
+ }
518
+ }
519
+ }