embulk-input-bigquery_extract_files 0.0.7 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +16 -16
- data/LICENSE.txt +21 -21
- data/README.md +168 -144
- data/build.gradle +104 -102
- data/config.yml +31 -31
- data/config/checkstyle/checkstyle.xml +128 -128
- data/config/checkstyle/default.xml +108 -108
- data/gradle/wrapper/gradle-wrapper.properties +5 -5
- data/gradlew +172 -172
- data/lib/embulk/input/bigquery_extract_files.rb +3 -3
- data/src/main/java/org/embulk/input/bigquery_export_gcs/BigqueryExportGcsFileInputPlugin.java +378 -383
- data/src/main/java/org/embulk/input/bigquery_export_gcs/BigqueryExportUtils.java +519 -495
- data/src/main/java/org/embulk/input/bigquery_export_gcs/PHASE.java +7 -7
- data/src/test/java/org/embulk/input/bigquery_export_gcs/TestGoogleCloudAccessData.java +33 -33
- data/src/test/java/org/embulk/input/bigquery_export_gcs/TestPluginFunctions.java +56 -56
- data/src/test/java/org/embulk/input/bigquery_export_gcs/UnitTestInitializer.java +96 -96
- metadata +18 -15
@@ -1,495 +1,519 @@
|
|
1
|
-
package org.embulk.input.bigquery_export_gcs;
|
2
|
-
|
3
|
-
import java.io.File;
|
4
|
-
import java.io.FileInputStream;
|
5
|
-
import java.io.FileNotFoundException;
|
6
|
-
import java.io.FileOutputStream;
|
7
|
-
import java.io.IOException;
|
8
|
-
import java.io.InputStream;
|
9
|
-
import java.
|
10
|
-
import java.nio.file.
|
11
|
-
import java.
|
12
|
-
import java.util.
|
13
|
-
import java.util.
|
14
|
-
import java.util.
|
15
|
-
import java.util.regex.
|
16
|
-
|
17
|
-
|
18
|
-
import org.apache.commons.lang3.
|
19
|
-
import org.
|
20
|
-
import org.embulk.
|
21
|
-
import org.embulk.spi.
|
22
|
-
import org.embulk.spi.
|
23
|
-
import org.
|
24
|
-
|
25
|
-
import
|
26
|
-
import
|
27
|
-
|
28
|
-
import com.
|
29
|
-
import com.
|
30
|
-
import com.google.api.client.
|
31
|
-
import com.google.api.client.json.
|
32
|
-
import com.google.api.client.
|
33
|
-
import com.google.api.client.
|
34
|
-
import com.google.api.
|
35
|
-
import com.google.api.
|
36
|
-
import com.google.api.
|
37
|
-
import com.google.api.services.bigquery.
|
38
|
-
import com.google.api.services.bigquery.
|
39
|
-
import com.google.api.services.bigquery.
|
40
|
-
import com.google.api.services.bigquery.
|
41
|
-
import com.google.api.services.bigquery.
|
42
|
-
import com.google.api.services.bigquery.model.
|
43
|
-
import com.google.api.services.bigquery.model.
|
44
|
-
import com.google.api.services.bigquery.model.
|
45
|
-
import com.google.api.services.bigquery.model.
|
46
|
-
import com.google.api.services.bigquery.model.
|
47
|
-
import com.google.api.services.
|
48
|
-
import com.google.api.services.
|
49
|
-
import com.google.api.services.
|
50
|
-
import com.google.api.services.
|
51
|
-
import com.google.api.services.
|
52
|
-
import com.google.
|
53
|
-
import com.google.
|
54
|
-
import com.google.
|
55
|
-
|
56
|
-
import
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
*
|
66
|
-
*
|
67
|
-
*
|
68
|
-
* #
|
69
|
-
*
|
70
|
-
*
|
71
|
-
*
|
72
|
-
*
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
if(
|
87
|
-
return
|
88
|
-
}
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
queryConfig
|
125
|
-
queryConfig.
|
126
|
-
queryConfig.
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
}
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
ret.
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
Storage
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
log.
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
}
|
348
|
-
|
349
|
-
public static
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
log.
|
493
|
-
|
494
|
-
|
495
|
-
|
1
|
+
package org.embulk.input.bigquery_export_gcs;
|
2
|
+
|
3
|
+
import java.io.File;
|
4
|
+
import java.io.FileInputStream;
|
5
|
+
import java.io.FileNotFoundException;
|
6
|
+
import java.io.FileOutputStream;
|
7
|
+
import java.io.IOException;
|
8
|
+
import java.io.InputStream;
|
9
|
+
import java.math.BigInteger;
|
10
|
+
import java.nio.file.FileSystems;
|
11
|
+
import java.nio.file.Path;
|
12
|
+
import java.util.Date;
|
13
|
+
import java.util.List;
|
14
|
+
import java.util.UUID;
|
15
|
+
import java.util.regex.Matcher;
|
16
|
+
import java.util.regex.Pattern;
|
17
|
+
|
18
|
+
import org.apache.commons.lang3.StringUtils;
|
19
|
+
import org.apache.commons.lang3.time.FastDateFormat;
|
20
|
+
import org.embulk.input.bigquery_export_gcs.BigqueryExportGcsFileInputPlugin.PluginTask;
|
21
|
+
import org.embulk.spi.ColumnConfig;
|
22
|
+
import org.embulk.spi.Exec;
|
23
|
+
import org.embulk.spi.Schema;
|
24
|
+
import org.embulk.spi.SchemaConfig;
|
25
|
+
import org.embulk.spi.type.Types;
|
26
|
+
import org.slf4j.Logger;
|
27
|
+
|
28
|
+
import com.fasterxml.jackson.core.JsonProcessingException;
|
29
|
+
import com.fasterxml.jackson.databind.ObjectMapper;
|
30
|
+
import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
|
31
|
+
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
|
32
|
+
import com.google.api.client.http.HttpTransport;
|
33
|
+
import com.google.api.client.http.javanet.NetHttpTransport;
|
34
|
+
import com.google.api.client.json.JsonFactory;
|
35
|
+
import com.google.api.client.json.jackson2.JacksonFactory;
|
36
|
+
import com.google.api.client.repackaged.com.google.common.base.Strings;
|
37
|
+
import com.google.api.services.bigquery.Bigquery;
|
38
|
+
import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
|
39
|
+
import com.google.api.services.bigquery.Bigquery.Tabledata;
|
40
|
+
import com.google.api.services.bigquery.Bigquery.Tables.Delete;
|
41
|
+
import com.google.api.services.bigquery.BigqueryScopes;
|
42
|
+
import com.google.api.services.bigquery.model.Job;
|
43
|
+
import com.google.api.services.bigquery.model.JobConfiguration;
|
44
|
+
import com.google.api.services.bigquery.model.JobConfigurationExtract;
|
45
|
+
import com.google.api.services.bigquery.model.JobConfigurationQuery;
|
46
|
+
import com.google.api.services.bigquery.model.JobReference;
|
47
|
+
import com.google.api.services.bigquery.model.Table;
|
48
|
+
import com.google.api.services.bigquery.model.TableDataList;
|
49
|
+
import com.google.api.services.bigquery.model.TableFieldSchema;
|
50
|
+
import com.google.api.services.bigquery.model.TableReference;
|
51
|
+
import com.google.api.services.bigquery.model.TableRow;
|
52
|
+
import com.google.api.services.bigquery.model.TableSchema;
|
53
|
+
import com.google.api.services.storage.Storage;
|
54
|
+
import com.google.api.services.storage.StorageScopes;
|
55
|
+
import com.google.api.services.storage.model.Bucket;
|
56
|
+
import com.google.api.services.storage.model.Objects;
|
57
|
+
import com.google.api.services.storage.model.StorageObject;
|
58
|
+
import com.google.common.base.Optional;
|
59
|
+
import com.google.common.collect.ImmutableList;
|
60
|
+
import com.google.common.collect.Lists;
|
61
|
+
|
62
|
+
import io.airlift.slice.RuntimeIOException;
|
63
|
+
|
64
|
+
/**
|
65
|
+
*
|
66
|
+
*
|
67
|
+
*
|
68
|
+
* #reference :
|
69
|
+
*
|
70
|
+
* # https://github.com/embulk/embulk
|
71
|
+
* # https://github.com/embulk/embulk-input-s3
|
72
|
+
* # https://github.com/embulk/embulk-input-gcs
|
73
|
+
* # https://github.com/embulk/embulk-input-jdbc
|
74
|
+
* # https://github.com/GoogleCloudPlatform/java-docs-samples/blob/master/storage/json-api/src/main/java/StorageSample.java
|
75
|
+
*
|
76
|
+
*
|
77
|
+
* @author george 2017. 11. 16.
|
78
|
+
*
|
79
|
+
*/
|
80
|
+
public class BigqueryExportUtils
|
81
|
+
{
|
82
|
+
private static final Logger log = Exec.getLogger(BigqueryExportUtils.class);
|
83
|
+
|
84
|
+
|
85
|
+
public static String parseQueryToBaseTableName(String query){
|
86
|
+
if( query == null){
|
87
|
+
return null;
|
88
|
+
}
|
89
|
+
|
90
|
+
Pattern p = Pattern.compile(" from [\\[]?([^ \\$\\[\\]]+)[\\]]?", Pattern.CASE_INSENSITIVE);
|
91
|
+
Matcher m = p.matcher(query);
|
92
|
+
if(m.find() && m.groupCount() > 0){
|
93
|
+
return Strings.nullToEmpty(m.group(1)).replaceAll(".*\\.","").replaceAll("[^\\w\\s]","");
|
94
|
+
}else{
|
95
|
+
return null;
|
96
|
+
}
|
97
|
+
}
|
98
|
+
|
99
|
+
public static String generateTempTableName(String query){
|
100
|
+
return generateTempTableName(query, null);
|
101
|
+
}
|
102
|
+
|
103
|
+
public static String generateTempTableName(String query, String tablename){
|
104
|
+
|
105
|
+
String tname = tablename;
|
106
|
+
|
107
|
+
if (tname == null){
|
108
|
+
tname = parseQueryToBaseTableName(query);
|
109
|
+
if(tname == null){
|
110
|
+
tname = "temp";
|
111
|
+
}
|
112
|
+
}
|
113
|
+
|
114
|
+
return "embulk_" + tname + "_" + FastDateFormat.getInstance("yyyyMMdd_HHmmss").format(new Date()) + "_" + UUID.randomUUID().toString().replaceAll("-", "");
|
115
|
+
}
|
116
|
+
|
117
|
+
|
118
|
+
public static void executeQueryToDestinationWorkTable(Bigquery bigquery, PluginTask task) throws IOException, InterruptedException {
|
119
|
+
|
120
|
+
log.info("execute Query to Table ");
|
121
|
+
log.info("# Query # {}",task.getQuery().get());
|
122
|
+
log.info("# Table # {}.{} ",task.getWorkDataset(), task.getWorkTable());
|
123
|
+
|
124
|
+
JobConfigurationQuery queryConfig = new JobConfigurationQuery();
|
125
|
+
queryConfig.setQuery(task.getQuery().get());
|
126
|
+
queryConfig.setDestinationTable(new TableReference()
|
127
|
+
.setProjectId(task.getProject())
|
128
|
+
.setDatasetId(task.getWorkDataset())
|
129
|
+
.setTableId(task.getWorkTable()));
|
130
|
+
queryConfig.setUseLegacySql(task.getUseLegacySql());
|
131
|
+
queryConfig.setCreateDisposition(task.getCreateDisposition());
|
132
|
+
queryConfig.setWriteDisposition(task.getWriteDisposition());
|
133
|
+
queryConfig.setUseQueryCache(task.getQueryCache());
|
134
|
+
queryConfig.setAllowLargeResults(true);
|
135
|
+
|
136
|
+
com.google.api.services.bigquery.Bigquery.Jobs.Insert insert = bigquery.jobs().insert(task.getProject(),
|
137
|
+
new Job().setConfiguration(new JobConfiguration().setQuery(queryConfig))
|
138
|
+
);
|
139
|
+
Job jobRes = insert.execute(); // ~~~~~~~~~~~~~~~~~~~~~ API CALL
|
140
|
+
|
141
|
+
JobReference jobRef = jobRes.getJobReference();
|
142
|
+
String jobId = jobRef.getJobId();
|
143
|
+
|
144
|
+
log.info("query to Table jobId : {} : waiting for job end...",jobId);
|
145
|
+
|
146
|
+
Job lastJob = waitForJob(bigquery, task.getProject(), jobId, task.getBigqueryJobWaitingSecond().get());
|
147
|
+
|
148
|
+
log.debug("waiting for job end....... {}", lastJob.toPrettyString());
|
149
|
+
}
|
150
|
+
|
151
|
+
public static void parseGcsUri(PluginTask task){
|
152
|
+
|
153
|
+
if(StringUtils.isEmpty(task.getGcsUri()) || false == task.getGcsUri().matches("gs://[^/]+/.+") ){
|
154
|
+
throw new RuntimeException("gcs_uri not found : " + task.getGcsUri());
|
155
|
+
}
|
156
|
+
|
157
|
+
task.setGcsBucket(task.getGcsUri().replaceAll("gs://([^/]+)/.+", "$1"));
|
158
|
+
task.setGcsBlobNamePrefix(task.getGcsUri().replaceAll("gs://[^/]+/(.+)", "$1").replaceAll("[\\*]*$", ""));
|
159
|
+
|
160
|
+
}
|
161
|
+
|
162
|
+
|
163
|
+
/***
|
164
|
+
*
|
165
|
+
* google cloud sdk
|
166
|
+
*
|
167
|
+
* @param task
|
168
|
+
* @throws IOException
|
169
|
+
* @throws FileNotFoundException
|
170
|
+
*/
|
171
|
+
public static Bigquery newBigqueryClient(PluginTask task){
|
172
|
+
log.debug("# Starting Google BigQuery API ... ");
|
173
|
+
try {
|
174
|
+
GoogleCredentialSet set = googleCredential(task);
|
175
|
+
return new Bigquery.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquey-export-gcs").build();
|
176
|
+
} catch (Exception e) {
|
177
|
+
throw new RuntimeException("bigquery connect fail",e);
|
178
|
+
}
|
179
|
+
|
180
|
+
}
|
181
|
+
|
182
|
+
public static Storage newGcsClient(PluginTask task) throws FileNotFoundException, IOException{
|
183
|
+
log.debug("# Starting Google Cloud Storage ... ");
|
184
|
+
GoogleCredentialSet set = googleCredential(task);
|
185
|
+
return new Storage.Builder(set.transport, set.jsonFactory, set.googleCredential).setApplicationName("embulk-input-bigquey-export-gcs").build();
|
186
|
+
}
|
187
|
+
|
188
|
+
|
189
|
+
public static class GoogleCredentialSet {
|
190
|
+
public GoogleCredential googleCredential = null;
|
191
|
+
public HttpTransport transport = new NetHttpTransport();
|
192
|
+
public JsonFactory jsonFactory = new JacksonFactory();
|
193
|
+
}
|
194
|
+
|
195
|
+
public static GoogleCredentialSet googleCredential(PluginTask task) throws IOException {
|
196
|
+
GoogleCredentialSet ret = new GoogleCredentialSet();
|
197
|
+
|
198
|
+
log.debug("### init googleCredentialFile : {} ",task.getJsonKeyfile());
|
199
|
+
|
200
|
+
ret.transport = new NetHttpTransport();
|
201
|
+
ret.jsonFactory = new JacksonFactory();
|
202
|
+
|
203
|
+
GoogleCredential credential = GoogleCredential.fromStream(new FileInputStream( task.getJsonKeyfile() ), ret.transport, ret.jsonFactory);
|
204
|
+
if (credential.createScopedRequired()) {
|
205
|
+
credential = credential.createScoped(BigqueryScopes.all()).createScoped(StorageScopes.all());
|
206
|
+
}
|
207
|
+
ret.googleCredential = credential;
|
208
|
+
return ret;
|
209
|
+
}
|
210
|
+
|
211
|
+
|
212
|
+
public static List<String> getFileListFromGcs(PluginTask task) throws FileNotFoundException, IOException{
|
213
|
+
Storage gcs = newGcsClient(task);
|
214
|
+
return getFileListFromGcs(gcs, task.getGcsBucket(), task.getGcsBlobNamePrefix());
|
215
|
+
}
|
216
|
+
|
217
|
+
public static List<String> getFileListFromGcs(Storage gcs, String bucket, String blobName) throws IOException{
|
218
|
+
ImmutableList.Builder<String> builder = ImmutableList.builder();
|
219
|
+
Storage.Objects.List listRequest = gcs.objects().list(bucket).setPrefix(blobName);
|
220
|
+
Objects objects;
|
221
|
+
|
222
|
+
do {
|
223
|
+
objects = listRequest.execute();
|
224
|
+
if(objects.getItems() == null){
|
225
|
+
log.error("file not found in gs://{}/{}",bucket,blobName);
|
226
|
+
return builder.build();
|
227
|
+
}
|
228
|
+
for(StorageObject obj : objects.getItems()){
|
229
|
+
builder.add(obj.getName());
|
230
|
+
}
|
231
|
+
listRequest.setPageToken(objects.getNextPageToken());
|
232
|
+
} while (null != objects.getNextPageToken());
|
233
|
+
|
234
|
+
return builder.build().asList();
|
235
|
+
}
|
236
|
+
|
237
|
+
public static final String TYPE_INTEGER = "INTEGER";
|
238
|
+
public static final String TYPE_STRING = "STRING";
|
239
|
+
public static final String TYPE_FLOAT = "FLOAT";
|
240
|
+
public static final String TYPE_TIMESTAMP = "TIMESTAMP";
|
241
|
+
|
242
|
+
public static SchemaConfig getSchemaWithGuess(Bigquery bigquery, PluginTask task, Table table, Schema schema) throws IOException{
|
243
|
+
List<ColumnConfig> columns = Lists.newArrayList();
|
244
|
+
|
245
|
+
com.google.api.services.bigquery.Bigquery.Tabledata.List req = bigquery.tabledata().list(task.getProject(), task.getDataset().get(), table.getTableReference().getTableId());
|
246
|
+
|
247
|
+
req = req.setMaxResults(new Long(1));
|
248
|
+
|
249
|
+
TableDataList list = req.execute();
|
250
|
+
|
251
|
+
for(TableRow row : list.getRows()){
|
252
|
+
//row.get(name)
|
253
|
+
}
|
254
|
+
return new SchemaConfig(columns);
|
255
|
+
}
|
256
|
+
|
257
|
+
public static Schema convertTableSchemaToEmbulkSchema(Table table){
|
258
|
+
Schema.Builder builder = Schema.builder();
|
259
|
+
TableSchema ts = table.getSchema();
|
260
|
+
for( TableFieldSchema field : ts.getFields() ){
|
261
|
+
String name = field.getName();
|
262
|
+
org.embulk.spi.type.Type type = Types.JSON;
|
263
|
+
switch(field.getType()){
|
264
|
+
case "INTEGER":
|
265
|
+
builder.add(name, Types.LONG);
|
266
|
+
break;
|
267
|
+
case "FLOAT":
|
268
|
+
builder.add(name, Types.DOUBLE);
|
269
|
+
break;
|
270
|
+
case "TIMESTAMP":
|
271
|
+
builder.add(name, Types.TIMESTAMP);
|
272
|
+
break;
|
273
|
+
default:
|
274
|
+
builder.add(name, Types.STRING);
|
275
|
+
break;
|
276
|
+
}
|
277
|
+
}
|
278
|
+
return builder.build();
|
279
|
+
}
|
280
|
+
|
281
|
+
public static PHASE initTask(PluginTask task) {
|
282
|
+
|
283
|
+
if(task.getQuery().isPresent()){
|
284
|
+
task.setWorkId(generateTempTableName(task.getQuery().get()));
|
285
|
+
|
286
|
+
if(task.getTempTable().isPresent() == false){
|
287
|
+
task.setTempTable(Optional.of(task.getWorkId()));
|
288
|
+
}
|
289
|
+
if(task.getTempDataset().isPresent() == false && task.getDataset().isPresent()){
|
290
|
+
task.setTempDataset(Optional.of(task.getDataset().get()));
|
291
|
+
}
|
292
|
+
|
293
|
+
// actual target table setting
|
294
|
+
task.setWorkDataset(task.getTempDataset().get());
|
295
|
+
task.setWorkTable(task.getTempTable().get());
|
296
|
+
|
297
|
+
return PHASE.QUERY;
|
298
|
+
}else if(task.getTable().isPresent() && task.getDataset().isPresent()){
|
299
|
+
task.setWorkId(generateTempTableName(null, task.getTable().get()));
|
300
|
+
// actual target table setting
|
301
|
+
task.setWorkDataset(task.getDataset().get());
|
302
|
+
task.setWorkTable(task.getTable().get());
|
303
|
+
|
304
|
+
return PHASE.TABLE;
|
305
|
+
|
306
|
+
}else{
|
307
|
+
throw new RuntimeException("please set config file [dataset]+[table] or [query]");
|
308
|
+
}
|
309
|
+
}
|
310
|
+
|
311
|
+
public static Schema extractWorkTable(Bigquery bigquery, PluginTask task) throws FileNotFoundException, IOException, InterruptedException{
|
312
|
+
|
313
|
+
Table table = bigquery.tables().get(task.getProject(), task.getWorkDataset(), task.getWorkTable()).execute();
|
314
|
+
|
315
|
+
Schema embulkSchema = convertTableSchemaToEmbulkSchema(table);
|
316
|
+
|
317
|
+
|
318
|
+
//task.setSchame(embulkSchema);
|
319
|
+
|
320
|
+
log.debug("Table Schema : {}", table.getSchema());
|
321
|
+
|
322
|
+
//Tabledata. req = bigquery.tabledata().list(projectId, dataset, table);
|
323
|
+
|
324
|
+
log.info("start table extract [{}.{}] to {} ...", task.getWorkDataset(), task.getWorkTable(), task.getGcsUri());
|
325
|
+
|
326
|
+
Job jobReq = new Job();
|
327
|
+
JobConfigurationExtract extract = new JobConfigurationExtract();
|
328
|
+
extract.setDestinationFormat(task.getFileFormat().get());
|
329
|
+
extract.setCompression(task.getCompression().get());
|
330
|
+
extract.setDestinationUris(Lists.newArrayList(task.getGcsUri()));
|
331
|
+
extract.setSourceTable(table.getTableReference());
|
332
|
+
jobReq.setConfiguration(new JobConfiguration().setExtract(extract));
|
333
|
+
|
334
|
+
Insert jobInsert = bigquery.jobs().insert(task.getProject(), jobReq);
|
335
|
+
Job res = jobInsert.execute();
|
336
|
+
|
337
|
+
JobReference jobRef = res.getJobReference();
|
338
|
+
String jobId = jobRef.getJobId();
|
339
|
+
log.info("extract jobId : {}",jobId);
|
340
|
+
log.debug("waiting for job end....... ");
|
341
|
+
|
342
|
+
Job lastJob = waitForJob(bigquery, task.getProject(), jobId, task.getBigqueryJobWaitingSecond().get());
|
343
|
+
|
344
|
+
log.info("table extract result : {}",lastJob.toPrettyString());
|
345
|
+
|
346
|
+
return embulkSchema;
|
347
|
+
}
|
348
|
+
|
349
|
+
public static Job waitForJob(Bigquery bigquery, String project, String jobId, int bigqueryJobWaitingSecond) throws IOException, InterruptedException{
|
350
|
+
int maxAttempts = bigqueryJobWaitingSecond;
|
351
|
+
int initialRetryDelay = 1000; // ms
|
352
|
+
Job pollingJob = null;
|
353
|
+
log.info("waiting for job end : {}",jobId);
|
354
|
+
int tryCnt = 0;
|
355
|
+
for (tryCnt=0; tryCnt < maxAttempts; tryCnt++){
|
356
|
+
pollingJob = bigquery.jobs().get(project, jobId).execute();
|
357
|
+
String state = pollingJob.getStatus().getState();
|
358
|
+
log.debug("Job Status {} : {}",jobId, state);
|
359
|
+
|
360
|
+
if (pollingJob.getStatus().getState().equals("DONE")) {
|
361
|
+
break;
|
362
|
+
}
|
363
|
+
log.info("waiting {} ... ",tryCnt);
|
364
|
+
Thread.sleep(initialRetryDelay);
|
365
|
+
}
|
366
|
+
if(tryCnt + 1 == maxAttempts){
|
367
|
+
log.error("Bigquery Job Waiting exceed : over {} second...", bigqueryJobWaitingSecond);
|
368
|
+
}
|
369
|
+
|
370
|
+
return pollingJob;
|
371
|
+
}
|
372
|
+
|
373
|
+
public static Schema predictSchema(Bigquery bigquery){
|
374
|
+
Schema schema = Schema.builder().add("", org.embulk.spi.type.Types.LONG).build();
|
375
|
+
return schema;
|
376
|
+
}
|
377
|
+
|
378
|
+
/**
|
379
|
+
*
|
380
|
+
* https://github.com/google/google-api-java-client-samples/blob/master/storage-cmdline-sample/src/main/java/com/google/api/services/samples/storage/examples/ObjectsDownloadExample.java
|
381
|
+
*
|
382
|
+
*/
|
383
|
+
public static InputStream openInputStream(PluginTask task, String file)
|
384
|
+
{
|
385
|
+
try {
|
386
|
+
|
387
|
+
|
388
|
+
Storage gcs = newGcsClient(task);
|
389
|
+
|
390
|
+
|
391
|
+
Path fullLocalFilePath = getFullPath(task, file);
|
392
|
+
|
393
|
+
log.info("Start download : gs://{}/{} ...to ... {} ",task.getGcsBucket(), file, task.getTempLocalPath());
|
394
|
+
|
395
|
+
Storage.Objects.Get getObject = gcs.objects().get(task.getGcsBucket(), file);
|
396
|
+
getObject.getMediaHttpDownloader().setDirectDownloadEnabled(true);
|
397
|
+
|
398
|
+
// return getObject.executeMediaAsInputStream() // direct InputStream ?? I Think this is faster then temp file. but ...
|
399
|
+
|
400
|
+
try(FileOutputStream s = new FileOutputStream(fullLocalFilePath.toFile())){
|
401
|
+
getObject.executeMediaAndDownloadTo(s);
|
402
|
+
}
|
403
|
+
return new FileInputStream(fullLocalFilePath.toFile());
|
404
|
+
|
405
|
+
} catch (FileNotFoundException e) {
|
406
|
+
log.error("gcs file not found error",e);
|
407
|
+
return null;
|
408
|
+
} catch(IOException e){
|
409
|
+
log.error("gcs file read error",e);
|
410
|
+
return null;
|
411
|
+
}
|
412
|
+
}
|
413
|
+
|
414
|
+
|
415
|
+
public static Path getFullPath(PluginTask task, String file){
|
416
|
+
String baseName = file.replaceFirst(".*/", "");
|
417
|
+
Path fullLocalFilePath = FileSystems.getDefault().getPath(task.getTempLocalPath(), baseName);
|
418
|
+
return fullLocalFilePath ;
|
419
|
+
}
|
420
|
+
|
421
|
+
public enum SCHEMA_TYPE{
|
422
|
+
EMBULK,
|
423
|
+
AVRO
|
424
|
+
}
|
425
|
+
|
426
|
+
public static Schema decnodeSchemaJson(String json) {
|
427
|
+
ObjectMapper mapper = new ObjectMapper();
|
428
|
+
try {
|
429
|
+
Schema schema = mapper.readValue(json, Schema.class);
|
430
|
+
return schema;
|
431
|
+
} catch (Exception e) {
|
432
|
+
log.error("error when parse schema object : " + json,e);
|
433
|
+
return null;
|
434
|
+
}
|
435
|
+
}
|
436
|
+
|
437
|
+
public static void writeSchemaFile(Schema schema, String schemaType, File file) {
|
438
|
+
ObjectMapper mapper = new ObjectMapper();
|
439
|
+
try {
|
440
|
+
mapper.writeValue(file, schema);
|
441
|
+
} catch (Exception e) {
|
442
|
+
log.error("error when create schema json {}",file);
|
443
|
+
throw new RuntimeException(e);
|
444
|
+
}
|
445
|
+
}
|
446
|
+
|
447
|
+
public static String generateSchemaJson(Schema schema, String schemaType) {
|
448
|
+
SCHEMA_TYPE tp = SCHEMA_TYPE.EMBULK;
|
449
|
+
if(schemaType != null) {
|
450
|
+
tp.valueOf(schemaType);
|
451
|
+
}
|
452
|
+
|
453
|
+
ObjectMapper mapper = new ObjectMapper();
|
454
|
+
try {
|
455
|
+
String jsonString = mapper.writeValueAsString(schema);
|
456
|
+
return jsonString;
|
457
|
+
} catch (JsonProcessingException e) {
|
458
|
+
log.error("error when create schema json",e);
|
459
|
+
return null;
|
460
|
+
}
|
461
|
+
//for(Column col : schema.getColumns()) {
|
462
|
+
}
|
463
|
+
|
464
|
+
public static String toPrettyString(Object obj){
|
465
|
+
try {
|
466
|
+
ObjectMapper mapper = new ObjectMapper();
|
467
|
+
String str = mapper.writerWithDefaultPrettyPrinter().writeValueAsString(obj);
|
468
|
+
return str;
|
469
|
+
} catch (Exception e) {
|
470
|
+
log.error("JSON format error",e);
|
471
|
+
return java.util.Objects.toString(obj);
|
472
|
+
}
|
473
|
+
}
|
474
|
+
|
475
|
+
/**
|
476
|
+
*
|
477
|
+
* @param task
|
478
|
+
*/
|
479
|
+
public static void removeTempTable(PluginTask task){
|
480
|
+
try {
|
481
|
+
log.info("Remove temp table {}.{}",task.getTempDataset().get(), task.getTempTable().get());
|
482
|
+
Bigquery bigquery = newBigqueryClient(task);
|
483
|
+
Delete del = bigquery.tables().delete(task.getProject(), task.getTempDataset().get(), task.getTempTable().get());
|
484
|
+
del.execute();
|
485
|
+
} catch (Exception e) {
|
486
|
+
log.error("# Remove temp table FAIL : " + task.getTempDataset().orNull() + "." + task.getTempTable().orNull(),e);
|
487
|
+
}
|
488
|
+
}
|
489
|
+
|
490
|
+
public static void removeGcsFilesBeforeExecuting(PluginTask task){
|
491
|
+
try {
|
492
|
+
log.info("start cleanup gs://{}/{} ... ",task.getGcsBucket(), task.getGcsBlobNamePrefix());
|
493
|
+
Storage gcs = BigqueryExportUtils.newGcsClient(task);
|
494
|
+
List<String> fileList = getFileListFromGcs(gcs, task.getGcsBucket(), task.getGcsBlobNamePrefix());
|
495
|
+
for(String f : fileList){
|
496
|
+
log.info("cleanup gs://{}/{} ... ",task.getGcsBucket(), f);
|
497
|
+
gcs.objects().delete(task.getGcsBucket(), f).execute();
|
498
|
+
}
|
499
|
+
} catch (GoogleJsonResponseException e) {
|
500
|
+
if(e.getStatusCode() == 404){
|
501
|
+
log.info("file not found in gs://{}/{} :: it's ok ",task.getGcsBucket(), task.getGcsBlobNamePrefix());
|
502
|
+
}else{
|
503
|
+
throw new RuntimeException("# Remove GCS files gs://" + task.getGcsBucket() + "/" + task.getGcsBlobNamePrefix(),e);
|
504
|
+
}
|
505
|
+
} catch (Exception e) {
|
506
|
+
throw new RuntimeException("# Remove GCS files gs://" + task.getGcsBucket() + "/" + task.getGcsBlobNamePrefix(),e);
|
507
|
+
}
|
508
|
+
}
|
509
|
+
|
510
|
+
public static void removeTempGcsFiles(PluginTask task, String file){
|
511
|
+
try {
|
512
|
+
Storage gcs = BigqueryExportUtils.newGcsClient(task);
|
513
|
+
log.info("delete finish file gs://{}/{}", task.getGcsBucket(), file);
|
514
|
+
gcs.objects().delete(task.getGcsBucket(), file).execute();
|
515
|
+
} catch (Exception e) {
|
516
|
+
log.error("# Remove temp gcs file FAIL : " + file,e);
|
517
|
+
}
|
518
|
+
}
|
519
|
+
}
|