embulk-output-bigquery 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/LICENSE.txt +21 -0
- data/README.md +87 -0
- data/build.gradle +64 -0
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +6 -0
- data/gradlew +164 -0
- data/gradlew.bat +90 -0
- data/lib/embulk/output/bigquery.rb +3 -0
- data/settings.gradle +2 -0
- data/src/main/java/org/embulk/output/BigqueryAuthentication.java +99 -0
- data/src/main/java/org/embulk/output/BigqueryGcsWriter.java +201 -0
- data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +293 -0
- data/src/main/java/org/embulk/output/BigqueryWriter.java +432 -0
- data/src/test/java/org/embulk/output/TestBigqueryAuthentication.java +5 -0
- data/src/test/java/org/embulk/output/TestBigqueryGcsWriter.java +5 -0
- data/src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java +5 -0
- data/src/test/java/org/embulk/output/TestBigqueryWriter.java +5 -0
- metadata +104 -0
data/src/main/java/org/embulk/output/BigqueryWriter.java
@@ -0,0 +1,432 @@
package org.embulk.output;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
import java.util.HashMap;
import java.util.IllegalFormatException;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeoutException;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableSet;
import java.security.GeneralSecurityException;

import org.embulk.spi.Exec;
import org.slf4j.Logger;

import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.BigqueryScopes;
import com.google.api.services.bigquery.Bigquery.Datasets;
import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
import com.google.api.services.bigquery.Bigquery.Jobs.GetQueryResults;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.DatasetList;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableCell;
import com.google.api.services.bigquery.model.TableRow;

public class BigqueryWriter
{

    private final Logger log = Exec.getLogger(BigqueryWriter.class);
    private final String project;
    private final String dataset;
    private final String table;
    private final boolean autoCreateTable;
    private final Optional<String> schemaPath;
    private final String bucket;
    private final String sourceFormat;
    private final String fieldDelimiter;
    private final int maxBadrecords;
    private final long jobStatusMaxPollingTime;
    private final long jobStatusPollingInterval;
    private final boolean isSkipJobResultCheck;
    private final Bigquery bigQueryClient;
    private final EmbulkBigqueryTask writerTask;

    public BigqueryWriter(Builder builder) throws IOException, GeneralSecurityException
    {
        this.project = builder.project;
        this.dataset = builder.dataset;
        this.table = builder.table;
        this.autoCreateTable = builder.autoCreateTable;
        this.schemaPath = builder.schemaPath;
        this.bucket = builder.bucket;
        this.sourceFormat = builder.sourceFormat.toUpperCase();
        this.fieldDelimiter = builder.fieldDelimiter;
        this.maxBadrecords = builder.maxBadrecords;
        this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
        this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
        this.isSkipJobResultCheck = builder.isSkipJobResultCheck;

        BigqueryAuthentication auth = new BigqueryAuthentication(builder.serviceAccountEmail, builder.p12KeyFilePath, builder.applicationName);
        this.bigQueryClient = auth.getBigqueryClient();
        this.writerTask = new EmbulkBigqueryTask();
    }
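
    // Fetches the current state of a load job ("PENDING", "RUNNING", or "DONE").
    // A reported error result is surfaced as JobFailedException; transient I/O
    // errors are logged and mapped to "UNKNOWN" so the caller can poll again.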
    private String getJobStatus(JobReference jobRef) throws JobFailedException
    {
        try {
            Job job = bigQueryClient.jobs().get(project, jobRef.getJobId()).execute();
            if (job.getStatus().getErrorResult() != null) {
                throw new JobFailedException(String.format("Job failed. job id:[%s] reason:[%s] status:[FAILED]", jobRef.getJobId(), job.getStatus().getErrorResult().getMessage()));
            }
            String jobStatus = job.getStatus().getState();
            if (jobStatus.equals("DONE")) {
                JobStatistics statistics = job.getStatistics();
                //log.info(String.format("Job end. create:[%s] end:[%s]", statistics.getCreationTime(), statistics.getEndTime()));
                log.info(String.format("Job statistics [%s]", statistics.getLoad()));
            }
            return jobStatus;
        } catch (IOException ex) {
            log.warn(ex.getMessage());
            return "UNKNOWN";
        }
    }
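
    // Polls getJobStatus() every jobStatusPollingInterval seconds until the job
    // reaches "DONE", raising TimeoutException once more than
    // jobStatusMaxPollingTime seconds have elapsed. Both limits are configured
    // in seconds via the Builder.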
    private void getJobStatusUntilDone(JobReference jobRef) throws TimeoutException, JobFailedException
    {
        long startTime = System.currentTimeMillis();
        long elapsedTime;

        try {
            while (true) {
                String jobStatus = getJobStatus(jobRef);
                elapsedTime = System.currentTimeMillis() - startTime;
                if (jobStatus.equals("DONE")) {
                    log.info(String.format("Job completed successfully. job_id:[%s] elapsed_time:%dms status:[%s]", jobRef.getJobId(), elapsedTime, "SUCCESS"));
                    break;
                } else if (elapsedTime > jobStatusMaxPollingTime * 1000) {
                    throw new TimeoutException(String.format("Checking job status...Timeout. job_id:[%s] elapsed_time:%dms status:[%s]", jobRef.getJobId(), elapsedTime, "TIMEOUT"));
                } else {
                    log.info(String.format("Checking job status... job_id:[%s] elapsed_time:%dms status:[%s]", jobRef.getJobId(), elapsedTime, jobStatus));
                }
                Thread.sleep(jobStatusPollingInterval * 1000);
            }
        } catch (InterruptedException ex) {
            log.warn(ex.getMessage());
        }
    }
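
    // Splits the accumulated task files into per-job bundles and runs each
    // load job in turn. Jobs currently run serially; see the TODO below on
    // multi-threading.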
    public void executeJob() throws IOException, TimeoutException, JobFailedException
    {
        // TODO: refactor
        ArrayList<ArrayList<HashMap<String, String>>> taskList = writerTask.createJobList();
        for (ArrayList<HashMap<String, String>> task : taskList) {
            Job job = createJob(task);
            // TODO: multi-threading
            new EmbulkBigqueryJob(job).call();
        }
    }
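
    // Builds the load-job configuration. With auto-create enabled the job may
    // create the table but writes only if it is empty (WRITE_EMPTY /
    // CREATE_IF_NEEDED); otherwise rows are appended to an existing table
    // (WRITE_APPEND / CREATE_NEVER).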
    private Job createJob(ArrayList<HashMap<String, String>> task)
    {
        log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));

        Job job = new Job();
        JobConfiguration jobConfig = new JobConfiguration();
        JobConfigurationLoad loadConfig = new JobConfigurationLoad();
        jobConfig.setLoad(loadConfig);
        job.setConfiguration(jobConfig);

        loadConfig.setAllowQuotedNewlines(false);
        if (sourceFormat.equals("NEWLINE_DELIMITED_JSON")) {
            loadConfig.setSourceFormat("NEWLINE_DELIMITED_JSON");
        } else {
            loadConfig.setFieldDelimiter(fieldDelimiter);
        }
        if (autoCreateTable) {
            loadConfig.setSchema(getTableSchema());
            loadConfig.setWriteDisposition("WRITE_EMPTY");
            loadConfig.setCreateDisposition("CREATE_IF_NEEDED");
            log.info(String.format("table:[%s] will be created.", table));
        } else {
            loadConfig.setWriteDisposition("WRITE_APPEND");
            loadConfig.setCreateDisposition("CREATE_NEVER");
        }
        loadConfig.setMaxBadRecords(maxBadrecords);

        List<String> sources = new ArrayList<String>();
        for (HashMap<String, String> file : task) {
            String sourceFile;
            String remotePath = getRemotePath(file.get("remote_path"), file.get("file_name"));
            sourceFile = "gs://" + remotePath;
            log.info(String.format("Add source file to job [%s]", sourceFile));
            sources.add(sourceFile);
        }
        loadConfig.setSourceUris(sources);
        loadConfig.setDestinationTable(getTableReference());

        return job;
    }

    private TableReference getTableReference()
    {
        return new TableReference()
                .setProjectId(project)
                .setDatasetId(dataset)
                .setTableId(table);
    }
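
    // Placeholder: returns an empty schema until loading field definitions
    // from the JSON file given by schemaPath is implemented (see TODO below).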
    private TableSchema getTableSchema()
    {
        TableSchema tableSchema = new TableSchema();
        List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
        TableFieldSchema tableField;
        // TODO import from json file
        /*
        for () {
            tableField = new TableFieldSchema()
                    .setName(name)
                    .setType(type);
            fields.add(tableField);
        }
        */

        tableSchema.setFields(fields);
        return tableSchema;
    }
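
    // Joins bucket, optional remote path, and file name into the object path
    // used for a "gs://" source URI, normalizing any leading, trailing, or
    // doubled slashes in the remote path.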
    private String getRemotePath(String remotePath, String fileName)
    {
        String[] pathList = StringUtils.split(remotePath, '/');
        String path;
        if (remotePath.isEmpty()) {
            path = bucket + "/" + fileName;
        } else {
            path = bucket + "/" + StringUtils.join(pathList, '/') + "/" + fileName;
        }
        return path;
    }
    public void addTask(Optional<String> remotePath, String fileName, long fileSize)
    {
        writerTask.addTaskFile(remotePath, fileName, fileSize);
    }

    public ArrayList<HashMap<String, String>> getFileList()
    {
        return writerTask.getFileList();
    }
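
    // Wraps a single load job submission as a Callable so that the
    // multi-threading TODO above could later hand jobs to an executor; for
    // now call() is invoked inline.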
    private class EmbulkBigqueryJob implements Callable<Void>
    {
        private final Job job;

        public EmbulkBigqueryJob(Job job)
        {
            this.job = job;
        }

        public Void call() throws IOException, TimeoutException, JobFailedException
        {
            Insert insert = bigQueryClient.jobs().insert(project, job);
            insert.setProjectId(project);
            JobReference jobRef = insert.execute().getJobReference();
            log.info(String.format("Job executed. job id:[%s]", jobRef.getJobId()));
            if (isSkipJobResultCheck) {
                log.info(String.format("Skip job status check. job id:[%s]", jobRef.getJobId()));
            } else {
                getJobStatusUntilDone(jobRef);
            }
            return null;
        }
    }
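
    // Accumulates task files and groups them into bundles that stay within
    // the per-job load quotas referenced below (1TB of input and 10,000
    // source files per load job).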
    private class EmbulkBigqueryTask
    {
        // https://cloud.google.com/bigquery/loading-data-into-bigquery#quota
        private final long MAX_SIZE_PER_LOAD_JOB = 1000 * 1024 * 1024 * 1024L; // 1TB
        private final int MAX_NUMBER_OF_FILES_PER_LOAD_JOB = 10000;

        private final ArrayList<HashMap<String, String>> taskList = new ArrayList<HashMap<String, String>>();
        private final ArrayList<ArrayList<HashMap<String, String>>> jobList = new ArrayList<ArrayList<HashMap<String, String>>>();

        public void addTaskFile(Optional<String> remotePath, String fileName, long fileSize)
        {
            HashMap<String, String> task = new HashMap<String, String>();
            if (remotePath.isPresent()) {
                task.put("remote_path", remotePath.get());
            } else {
                task.put("remote_path", "");
            }
            task.put("file_name", fileName);
            task.put("file_size", String.valueOf(fileSize));
            taskList.add(task);
        }
        public ArrayList<ArrayList<HashMap<String, String>>> createJobList()
        {
            long currentBundleSize = 0;
            int currentFileCount = 0;
            ArrayList<HashMap<String, String>> job = new ArrayList<HashMap<String, String>>();
            for (HashMap<String, String> task : taskList) {
                boolean isNeedNextJobList = false;
                long fileSize = Long.valueOf(task.get("file_size")).longValue();

                if (currentBundleSize + fileSize > MAX_SIZE_PER_LOAD_JOB) {
                    isNeedNextJobList = true;
                }

                if (currentFileCount >= MAX_NUMBER_OF_FILES_PER_LOAD_JOB) {
                    isNeedNextJobList = true;
                }

                if (isNeedNextJobList) {
                    jobList.add(job);
                    job = new ArrayList<HashMap<String, String>>();
                    job.add(task);
                    // start a new bundle: reset both the size and the file counter
                    currentBundleSize = 0;
                    currentFileCount = 0;
                } else {
                    job.add(task);
                }
                currentBundleSize += fileSize;
                currentFileCount++;

                log.debug(String.format("currentBundleSize:%s currentFileCount:%s", currentBundleSize, currentFileCount));
                log.debug(String.format("fileSize:%s, MAX_SIZE_PER_LOAD_JOB:%s MAX_NUMBER_OF_FILES_PER_LOAD_JOB:%s",
                        fileSize, MAX_SIZE_PER_LOAD_JOB, MAX_NUMBER_OF_FILES_PER_LOAD_JOB));

            }
            if (job.size() > 0) {
                jobList.add(job);
            }
            return jobList;
        }

        public ArrayList<HashMap<String, String>> getFileList()
        {
            return taskList;
        }
    }

    public static class Builder
    {
        private final String serviceAccountEmail;
        private String p12KeyFilePath;
        private String applicationName;
        private String project;
        private String dataset;
        private String table;
        private boolean autoCreateTable;
        private Optional<String> schemaPath;
        private String bucket;
        private String sourceFormat;
        private String fieldDelimiter;
        private int maxBadrecords;
        private int jobStatusMaxPollingTime;
        private int jobStatusPollingInterval;
        private boolean isSkipJobResultCheck;

        public Builder(String serviceAccountEmail)
        {
            this.serviceAccountEmail = serviceAccountEmail;
        }

        public Builder setP12KeyFilePath(String p12KeyFilePath)
        {
            this.p12KeyFilePath = p12KeyFilePath;
            return this;
        }

        public Builder setApplicationName(String applicationName)
        {
            this.applicationName = applicationName;
            return this;
        }

        public Builder setProject(String project)
        {
            this.project = project;
            return this;
        }

        public Builder setDataset(String dataset)
        {
            this.dataset = dataset;
            return this;
        }

        public Builder setTable(String table)
        {
            this.table = table;
            return this;
        }

        public Builder setAutoCreateTable(boolean autoCreateTable)
        {
            this.autoCreateTable = autoCreateTable;
            return this;
        }

        public Builder setSchemaPath(Optional<String> schemaPath)
        {
            this.schemaPath = schemaPath;
            return this;
        }

        public Builder setBucket(String bucket)
        {
            this.bucket = bucket;
            return this;
        }

        public Builder setSourceFormat(String sourceFormat)
        {
            this.sourceFormat = sourceFormat;
            return this;
        }

        public Builder setFieldDelimiter(String fieldDelimiter)
        {
            this.fieldDelimiter = fieldDelimiter;
            return this;
        }

        public Builder setMaxBadrecords(int maxBadrecords)
        {
            this.maxBadrecords = maxBadrecords;
            return this;
        }

        public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
        {
            this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
            return this;
        }

        public Builder setJobStatusPollingInterval(int jobStatusPollingInterval)
        {
            this.jobStatusPollingInterval = jobStatusPollingInterval;
            return this;
        }

        public Builder setIsSkipJobResultCheck(boolean isSkipJobResultCheck)
        {
            this.isSkipJobResultCheck = isSkipJobResultCheck;
            return this;
        }

        public BigqueryWriter build() throws IOException, GeneralSecurityException
        {
            return new BigqueryWriter(this);
        }
    }

    public class JobFailedException extends Exception
    {
        public JobFailedException(String message) {
            super(message);
        }
    }
}
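
For reference, a minimal sketch of how a caller could drive this writer. The driver class name and all configuration values below are hypothetical placeholders (they are not part of the released gem), and the source file is assumed to have already been uploaded to GCS by BigqueryGcsWriter:

package org.embulk.output;

import com.google.common.base.Optional;

// Hypothetical driver class; every literal below is a placeholder value.
public class ExampleBigqueryLoad
{
    public static void main(String[] args) throws Exception
    {
        BigqueryWriter writer = new BigqueryWriter.Builder("loader@example-project.iam.gserviceaccount.com")
                .setP12KeyFilePath("/path/to/private_key.p12")
                .setApplicationName("Embulk BigQuery plugin")
                .setProject("example-project")
                .setDataset("example_dataset")
                .setTable("example_table")
                .setAutoCreateTable(false)
                .setSchemaPath(Optional.<String>absent())
                .setBucket("example-bucket")
                .setSourceFormat("CSV")
                .setFieldDelimiter(",")
                .setMaxBadrecords(0)
                .setJobStatusMaxPollingTime(3600)   // seconds until polling gives up
                .setJobStatusPollingInterval(10)    // seconds between status checks
                .setIsSkipJobResultCheck(false)
                .build();

        // Register a file that already exists at gs://example-bucket/logs/sample_000.csv.gz,
        // then submit the load job(s) and poll until they complete.
        writer.addTask(Optional.of("logs"), "sample_000.csv.gz", 1024L);
        writer.executeJob();
    }
}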