embulk-input-bigquery_extract_files 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,444 @@
+ package org.embulk.input.bigquery_export_gcs;
+
+ import java.io.File;
+ import java.io.FileInputStream;
+ import java.io.FileNotFoundException;
+ import java.io.FileOutputStream;
+ import java.io.IOException;
+ import java.io.InputStream;
+ import java.nio.file.FileSystems;
+ import java.nio.file.Path;
+ import java.util.Date;
+ import java.util.List;
+ import java.util.UUID;
+ import java.util.regex.Matcher;
+ import java.util.regex.Pattern;
+
+ import org.apache.commons.lang3.StringUtils;
+ import org.apache.commons.lang3.time.FastDateFormat;
+ import org.embulk.input.bigquery_export_gcs.BigqueryExportGcsFileInputPlugin.PluginTask;
+ import org.embulk.spi.Exec;
+ import org.embulk.spi.Schema;
+ import org.embulk.spi.type.Types;
+ import org.slf4j.Logger;
+
+ import com.fasterxml.jackson.core.JsonProcessingException;
+ import com.fasterxml.jackson.databind.ObjectMapper;
+ import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
+ import com.google.api.client.http.HttpTransport;
+ import com.google.api.client.http.javanet.NetHttpTransport;
+ import com.google.api.client.json.JsonFactory;
+ import com.google.api.client.json.jackson2.JacksonFactory;
+ import com.google.api.client.repackaged.com.google.common.base.Strings;
+ import com.google.api.services.bigquery.Bigquery;
+ import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
+ import com.google.api.services.bigquery.Bigquery.Tables.Delete;
+ import com.google.api.services.bigquery.BigqueryScopes;
+ import com.google.api.services.bigquery.model.Job;
+ import com.google.api.services.bigquery.model.JobConfiguration;
+ import com.google.api.services.bigquery.model.JobConfigurationExtract;
+ import com.google.api.services.bigquery.model.JobConfigurationQuery;
+ import com.google.api.services.bigquery.model.JobReference;
+ import com.google.api.services.bigquery.model.Table;
+ import com.google.api.services.bigquery.model.TableFieldSchema;
+ import com.google.api.services.bigquery.model.TableReference;
+ import com.google.api.services.bigquery.model.TableSchema;
+ import com.google.api.services.storage.Storage;
+ import com.google.api.services.storage.StorageScopes;
+ import com.google.api.services.storage.model.Objects;
+ import com.google.api.services.storage.model.StorageObject;
+ import com.google.common.base.Optional;
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.Lists;
+
+ /**
+  * Utility methods that run a BigQuery query into a work table, extract the
+  * table to GCS, and read the exported files back for the input plugin.
+  *
+  * reference:
+  *  https://github.com/embulk/embulk
+  *  https://github.com/embulk/embulk-input-s3
+  *  https://github.com/embulk/embulk-input-gcs
+  *  https://github.com/embulk/embulk-input-jdbc
+  *  https://github.com/GoogleCloudPlatform/java-docs-samples/blob/master/storage/json-api/src/main/java/StorageSample.java
+  *
+  * @author george 2017. 11. 16.
+  */
+ public class BigqueryExportUtils
+ {
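+     /*
+      * Hypothetical usage sketch (illustration only; key names are assumed from
+      * the PluginTask getters via Embulk's usual snake_case mapping and are not
+      * taken from the project README):
+      *
+      *   in:
+      *     type: bigquery_extract_files
+      *     project: my-project
+      *     json_keyfile: /path/to/keyfile.json
+      *     query: select * from dataset.events
+      *     temp_dataset: temp_dataset
+      *     gcs_uri: gs://my-bucket/embulk-temp/export_*
+      *     temp_local_path: /tmp/embulk
+      */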
+
+     private static final Logger log = Exec.getLogger(BigqueryExportUtils.class);
+
+     // extracts the bare table name following the first "from" clause, stripping
+     // any project/dataset qualifier, [] quoting, and $partition suffix
+     public static String parseQueryToBaseTableName(String query) {
+         if (query == null) {
+             return null;
+         }
+
+         Pattern p = Pattern.compile(" from [\\[]?([^ \\$\\[\\]]+)[\\]]?", Pattern.CASE_INSENSITIVE);
+         Matcher m = p.matcher(query);
+         if (m.find() && m.groupCount() > 0) {
+             return Strings.nullToEmpty(m.group(1)).replaceAll(".*\\.", "").replaceAll("[^\\w\\s]", "");
+         } else {
+             return null;
+         }
+     }
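+     // for example (behavior read off the regex above):
+     //   parseQueryToBaseTableName("select * from [my-project:ds.events$20171116] limit 1")  -> "events"
+     //   parseQueryToBaseTableName("select 1")                                               -> null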
+
+     public static String generateTempTableName(String query) {
+         return generateTempTableName(query, null);
+     }
+
+     public static String generateTempTableName(String query, String tablename) {
+         String tname = tablename;
+         if (tname == null) {
+             tname = parseQueryToBaseTableName(query);
+             if (tname == null) {
+                 tname = "temp";
+             }
+         }
+         return "embulk_" + tname + "_" + FastDateFormat.getInstance("yyyyMMdd_HHmmss").format(new Date()) + "_" + UUID.randomUUID().toString().replaceAll("-", "");
+     }
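+     // produces names like "embulk_events_20171116_103050_<32 hex chars>";
+     // the timestamp plus random UUID keeps concurrent runs from colliding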
+
+     public static void executeQueryToDestinationWorkTable(Bigquery bigquery, PluginTask task) throws IOException, InterruptedException {
+         log.info("extract query result {} => {}.{} ", task.getQuery().get(), task.getWorkDataset(), task.getWorkTable());
+
+         JobConfigurationQuery queryConfig = new JobConfigurationQuery();
+         queryConfig.setQuery(task.getQuery().get());
+         queryConfig.setDestinationTable(new TableReference()
+                 .setProjectId(task.getProject())
+                 .setDatasetId(task.getWorkDataset())
+                 .setTableId(task.getWorkTable()));
+         queryConfig.setUseLegacySql(task.getUseLegacySql());
+         queryConfig.setCreateDisposition(task.getCreateDisposition());
+         queryConfig.setWriteDisposition(task.getWriteDisposition());
+         queryConfig.setUseQueryCache(task.getQueryCache());
+         queryConfig.setAllowLargeResults(true);
+
+         Insert insert = bigquery.jobs().insert(task.getProject(),
+                 new Job().setConfiguration(new JobConfiguration().setQuery(queryConfig)));
+         Job jobRes = insert.execute(); // API call
+
+         JobReference jobRef = jobRes.getJobReference();
+         String jobId = jobRef.getJobId();
+
+         log.info("query-to-table jobId : {}", jobId);
+         log.info("waiting for job to finish ...");
+
+         Job lastJob = waitForJob(bigquery, task.getProject(), jobId);
+         log.debug("job finished : {}", lastJob.toPrettyString());
+     }
+
+     public static void parseGcsUri(PluginTask task) {
+         if (StringUtils.isEmpty(task.getGcsUri()) || false == task.getGcsUri().matches("gs://[^/]+/.+")) {
+             throw new RuntimeException("invalid gcs_uri : " + task.getGcsUri());
+         }
+
+         task.setGcsBucket(task.getGcsUri().replaceAll("gs://([^/]+)/.+", "$1"));
+         task.setGcsBlobNamePrefix(task.getGcsUri().replaceAll("gs://[^/]+/(.+)", "$1").replaceAll("[\\*]*$", ""));
+     }
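+     // e.g. gcs_uri "gs://my-bucket/embulk-temp/export_*" splits into
+     //   gcsBucket = "my-bucket", gcsBlobNamePrefix = "embulk-temp/export_"
+     // (trailing "*" wildcards are stripped so the prefix suits objects().list())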
+
+     /**
+      * Builds an authorized BigQuery client from the task's JSON keyfile.
+      *
+      * @param task
+      * @throws IOException
+      * @throws FileNotFoundException
+      */
+     public static Bigquery newBigqueryClient(PluginTask task) throws FileNotFoundException, IOException {
+         log.debug("# Starting Google BigQuery API ... ");
+         GoogleCredentialSet set = googleCredential(task);
+         return new Bigquery.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquery-export-gcs").build();
+     }
+
+     public static Storage newGcsClient(PluginTask task) throws FileNotFoundException, IOException {
+         log.debug("# Starting Google Cloud Storage ... ");
+         GoogleCredentialSet set = googleCredential(task);
+         return new Storage.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquery-export-gcs").build();
+     }
+
+     public static class GoogleCredentialSet {
+         public GoogleCredential googleCredential = null;
+         public HttpTransport transport = new NetHttpTransport();
+         public JsonFactory jsonFactory = new JacksonFactory();
+     }
+
+     public static GoogleCredentialSet googleCredential(PluginTask task) throws IOException {
+         GoogleCredentialSet ret = new GoogleCredentialSet();
+
+         log.debug("### init googleCredentialFile : {} ", task.getJsonKeyfile());
+
+         GoogleCredential credential = GoogleCredential.fromStream(new FileInputStream(task.getJsonKeyfile()), ret.transport, ret.jsonFactory);
+         if (credential.createScopedRequired()) {
+             // createScoped() replaces the scope set instead of appending to it,
+             // so request the BigQuery and Storage scopes in a single call
+             List<String> scopes = Lists.newArrayList();
+             scopes.addAll(BigqueryScopes.all());
+             scopes.addAll(StorageScopes.all());
+             credential = credential.createScoped(scopes);
+         }
+         ret.googleCredential = credential;
+         return ret;
+     }
+
+     public static List<String> getFileListFromGcs(PluginTask task) throws FileNotFoundException, IOException {
+         Storage gcs = newGcsClient(task);
+         return getFileListFromGcs(gcs, task.getGcsBucket(), task.getGcsBlobNamePrefix());
+     }
+
+     public static List<String> getFileListFromGcs(Storage gcs, String bucket, String blobName) throws IOException {
+         ImmutableList.Builder<String> builder = ImmutableList.builder();
+         Storage.Objects.List listRequest = gcs.objects().list(bucket).setPrefix(blobName);
+         Objects objects;
+
+         do {
+             objects = listRequest.execute();
+             // getItems() is null when nothing matches the prefix
+             if (objects.getItems() != null) {
+                 for (StorageObject obj : objects.getItems()) {
+                     builder.add(obj.getName());
+                 }
+             }
+             listRequest.setPageToken(objects.getNextPageToken());
+         } while (null != objects.getNextPageToken());
+
+         return builder.build();
+     }
+
+     public static final String TYPE_INTEGER = "INTEGER";
+     public static final String TYPE_STRING = "STRING";
+     public static final String TYPE_FLOAT = "FLOAT";
+     public static final String TYPE_TIMESTAMP = "TIMESTAMP";
+
+     public static Schema convertTableSchemaToEmbulkSchema(Table table) {
+         Schema.Builder builder = Schema.builder();
+         TableSchema ts = table.getSchema();
+         for (TableFieldSchema field : ts.getFields()) {
+             String name = field.getName();
+             switch (field.getType()) {
+             case TYPE_INTEGER:
+                 builder.add(name, Types.LONG);
+                 break;
+             case TYPE_FLOAT:
+                 builder.add(name, Types.DOUBLE);
+                 break;
+             case TYPE_TIMESTAMP:
+                 builder.add(name, Types.TIMESTAMP);
+                 break;
+             default:
+                 builder.add(name, Types.STRING);
+                 break;
+             }
+         }
+         return builder.build();
+     }
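+     // mapping summary: INTEGER -> long, FLOAT -> double, TIMESTAMP -> timestamp;
+     // everything else (STRING, BOOLEAN, RECORD, ...) falls back to string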
+
+     public static void initWorkTableWithExecuteQuery(Bigquery bigquery, PluginTask task) throws FileNotFoundException, IOException, InterruptedException {
+         if (task.getQuery().isPresent()) {
+             task.setWorkId(generateTempTableName(task.getQuery().get()));
+
+             if (task.getTempTable().isPresent() == false) {
+                 task.setTempTable(Optional.of(task.getWorkId()));
+             }
+             if (task.getTempDataset().isPresent() == false && task.getDataset().isPresent()) {
+                 task.setTempDataset(Optional.of(task.getDataset().get()));
+             }
+
+             // actual target table setting
+             task.setWorkDataset(task.getTempDataset().get());
+             task.setWorkTable(task.getTempTable().get());
+
+             // call google api
+             executeQueryToDestinationWorkTable(bigquery, task);
+         } else if (task.getTable().isPresent() && task.getDataset().isPresent()) {
+             task.setWorkId(generateTempTableName(null, task.getTable().get()));
+             // actual target table setting
+             task.setWorkDataset(task.getDataset().get());
+             task.setWorkTable(task.getTable().get());
+         } else {
+             throw new IOException("config must set either [dataset] + [table] or [query]");
+         }
+     }
+
+     public static Schema extractWorkTable(Bigquery bigquery, PluginTask task) throws FileNotFoundException, IOException, InterruptedException {
+         Table table = bigquery.tables().get(task.getProject(), task.getWorkDataset(), task.getWorkTable()).execute();
+
+         Schema embulkSchema = convertTableSchemaToEmbulkSchema(table);
+         log.debug("Table Schema : {}", table.getSchema());
+
+         log.info("start table extract [{}.{}] to {} ...", task.getWorkDataset(), task.getWorkTable(), task.getGcsUri());
+
+         Job jobReq = new Job();
+         JobConfigurationExtract extract = new JobConfigurationExtract();
+         extract.setDestinationFormat(task.getFileFormat().get());
+         extract.setCompression(task.getCompression().get());
+         extract.setDestinationUris(Lists.newArrayList(task.getGcsUri()));
+         extract.setSourceTable(table.getTableReference());
+         jobReq.setConfiguration(new JobConfiguration().setExtract(extract));
+
+         Insert jobInsert = bigquery.jobs().insert(task.getProject(), jobReq);
+         Job res = jobInsert.execute();
+
+         JobReference jobRef = res.getJobReference();
+         String jobId = jobRef.getJobId();
+         log.info("extract jobId : {}", jobId);
+         log.debug("waiting for job to finish ...");
+
+         waitForJob(bigquery, task.getProject(), jobId);
+         return embulkSchema;
+     }
+
+     public static Job waitForJob(Bigquery bigquery, String project, String jobId) throws IOException, InterruptedException {
+         int maxAttempts = 20;
+         int retryDelayMillis = 1000;
+         Job pollingJob = null;
+         for (int i = 0; i < maxAttempts; i++) {
+             pollingJob = bigquery.jobs().get(project, jobId).execute();
+             String state = pollingJob.getStatus().getState();
+             log.debug("Job Status {} : {}", jobId, state);
+             if ("DONE".equals(state)) {
+                 break;
+             }
+             log.debug("waiting 1 second before polling again ...");
+             Thread.sleep(retryDelayMillis);
+         }
+         return pollingJob;
+     }
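+     // note: a job gets at most maxAttempts * retryDelayMillis = 20 seconds to
+     // finish; after that the last polled Job is returned even if still RUNNING,
+     // so callers needing a hard guarantee should re-check getStatus() themselves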
+
+     // apparently unused stub: always returns a single unnamed long column
+     public static Schema predictSchema(Bigquery bigquery) {
+         return Schema.builder().add("", Types.LONG).build();
+     }
+
+     /**
+      * see: https://github.com/google/google-api-java-client-samples/blob/master/storage-cmdline-sample/src/main/java/com/google/api/services/samples/storage/examples/ObjectsDownloadExample.java
+      */
+     public static InputStream openInputStream(PluginTask task, String file) {
+         try {
+             Storage gcs = newGcsClient(task);
+
+             Path fullLocalFilePath = getFullPath(task, file);
+
+             log.info("Start download : gs://{}/{} ...to ... {} ", task.getGcsBucket(), file, task.getTempLocalPath());
+
+             Storage.Objects.Get getObject = gcs.objects().get(task.getGcsBucket(), file);
+             getObject.getMediaHttpDownloader().setDirectDownloadEnabled(true);
+
+             // download to a local temp file first, then stream back from disk
+             try (FileOutputStream s = new FileOutputStream(fullLocalFilePath.toFile())) {
+                 getObject.executeMediaAndDownloadTo(s);
+             }
+             return new FileInputStream(fullLocalFilePath.toFile());
+         } catch (FileNotFoundException e) {
+             log.error("gcs file not found error", e);
+             return null; // callers must handle a null stream
+         } catch (IOException e) {
+             log.error("gcs file read error", e);
+             return null; // callers must handle a null stream
+         }
+     }
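+     // the temp-file hop could be skipped by streaming straight from GCS, e.g.
+     // (untested sketch using the same request object as above):
+     //
+     //     return gcs.objects().get(task.getGcsBucket(), file).executeMediaAsInputStream();
+     //
+     // at the cost of losing the local copy if the read needs to be retried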
+
+     public static Path getFullPath(PluginTask task, String file) {
+         String baseName = file.replaceFirst(".*/", "");
+         return FileSystems.getDefault().getPath(task.getTempLocalPath(), baseName);
+     }
+
+     public enum SCHEMA_TYPE {
+         EMBULK,
+         AVRO
+     }
+
+     public static Schema decodeSchemaJson(String json) {
+         ObjectMapper mapper = new ObjectMapper();
+         try {
+             return mapper.readValue(json, Schema.class);
+         } catch (Exception e) {
+             log.error("error while parsing schema json : " + json, e);
+             return null;
+         }
+     }
+
+     public static void writeSchemaFile(Schema schema, String schemaType, File file) {
+         // schemaType is currently unused; only the Embulk schema layout is written
+         ObjectMapper mapper = new ObjectMapper();
+         try {
+             mapper.writeValue(file, schema);
+         } catch (Exception e) {
+             log.error("error while writing schema json {}", file);
+             throw new RuntimeException(e);
+         }
+     }
+
+     public static String generateSchemaJson(Schema schema, String schemaType) {
+         SCHEMA_TYPE tp = SCHEMA_TYPE.EMBULK;
+         if (schemaType != null) {
+             tp = SCHEMA_TYPE.valueOf(schemaType); // assign the result; also validates the name
+         }
+         // tp only validates the requested type for now; both EMBULK and AVRO
+         // serialize the Embulk schema as-is
+         ObjectMapper mapper = new ObjectMapper();
+         try {
+             return mapper.writeValueAsString(schema);
+         } catch (JsonProcessingException e) {
+             log.error("error while creating schema json", e);
+             return null;
+         }
+     }
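+     // for a schema of (id:long, name:string) the JSON would look roughly like
+     // (shape assumed from Embulk's Jackson annotations on Schema/Column, not verified):
+     //   [{"index":0,"name":"id","type":"long"},{"index":1,"name":"name","type":"string"}]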
+
+     /**
+      * Deletes the temporary work table created for a [query] run.
+      *
+      * @param task
+      */
+     public static void removeTempTable(PluginTask task) {
+         try {
+             log.info("Remove temp table {}.{}", task.getTempDataset().get(), task.getTempTable().get());
+             Bigquery bigquery = newBigqueryClient(task);
+             Delete del = bigquery.tables().delete(task.getProject(), task.getTempDataset().get(), task.getTempTable().get());
+             del.execute();
+         } catch (Exception e) {
+             log.error("# Remove temp table FAIL : " + task.getTempDataset().orNull() + "." + task.getTempTable().orNull(), e);
+         }
+     }
+
+     public static void removeTempGcsFiles(PluginTask task, String file) {
+         try {
+             Storage gcs = BigqueryExportUtils.newGcsClient(task);
+             // delete the temp object this task exported to GCS
+             log.info("Remove temp gcs file gs://{}/{}", task.getGcsBucket(), file);
+             gcs.objects().delete(task.getGcsBucket(), file).execute();
+         } catch (Exception e) {
+             log.error("# Remove temp gcs file FAIL : gs://" + task.getGcsBucket() + "/" + file, e);
+         }
+     }
+ }
@@ -0,0 +1,33 @@
+ package org.embulk.input.bigquery_export_gcs;
+
+ import java.io.FileNotFoundException;
+ import java.io.IOException;
+ import java.io.InputStream;
+
+ import org.junit.Test;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+
+ public class TestGoogleCloudAccessData extends UnitTestInitializer
+ {
+     private static final Logger log = LoggerFactory.getLogger(TestGoogleCloudAccessData.class);
+
+     @Test
+     public void envTest() {
+         // smoke check only: logs the configured project, asserts nothing
+         log.info("{}", System.getenv("GCP_PROJECT"));
+     }
+
+     @Test
+     public void testGcsInputStreamOpen() throws FileNotFoundException, IOException
+     {
+         BigqueryExportGcsFileInputPlugin.PluginTask task = config.loadConfig(BigqueryExportGcsFileInputPlugin.PluginTask.class);
+
+         plugin.executeBigqueryApi(task);
+
+         // close the stream once read, instead of leaking the file handle
+         try (InputStream ins = BigqueryExportUtils.openInputStream(task, task.getFiles().get(0))) {
+             log.info("file size : {}", org.apache.commons.compress.utils.IOUtils.toByteArray(ins).length);
+         }
+     }
+ }