embulk-output-bigquery 0.1.0

@@ -0,0 +1,432 @@
+ package org.embulk.output;
+
+ import java.io.IOException;
+ import java.security.GeneralSecurityException;
+ import java.util.ArrayList;
+ import java.util.HashMap;
+ import java.util.List;
+ import java.util.concurrent.Callable;
+ import java.util.concurrent.TimeoutException;
+
+ import org.apache.commons.lang3.StringUtils;
+ import com.google.common.base.Optional;
+
+ import org.embulk.spi.Exec;
+ import org.slf4j.Logger;
+
+ import com.google.api.services.bigquery.Bigquery;
+ import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
+ import com.google.api.services.bigquery.model.Job;
+ import com.google.api.services.bigquery.model.JobConfiguration;
+ import com.google.api.services.bigquery.model.JobConfigurationLoad;
+ import com.google.api.services.bigquery.model.JobReference;
+ import com.google.api.services.bigquery.model.JobStatistics;
+ import com.google.api.services.bigquery.model.TableFieldSchema;
+ import com.google.api.services.bigquery.model.TableReference;
+ import com.google.api.services.bigquery.model.TableSchema;
+
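+ /**
+  * Loads files that have already been uploaded to Google Cloud Storage into
+  * BigQuery by creating and polling load jobs. Instances are configured and
+  * constructed through the nested {@link Builder}.
+  */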
+ public class BigqueryWriter
+ {
+     private final Logger log = Exec.getLogger(BigqueryWriter.class);
+     private final String project;
+     private final String dataset;
+     private final String table;
+     private final boolean autoCreateTable;
+     private final Optional<String> schemaPath;
+     private final String bucket;
+     private final String sourceFormat;
+     private final String fieldDelimiter;
+     private final int maxBadrecords;
+     private final long jobStatusMaxPollingTime;
+     private final long jobStatusPollingInterval;
+     private final boolean isSkipJobResultCheck;
+     private final Bigquery bigQueryClient;
+     private final EmbulkBigqueryTask writerTask;
+
+     public BigqueryWriter(Builder builder) throws IOException, GeneralSecurityException
+     {
+         this.project = builder.project;
+         this.dataset = builder.dataset;
+         this.table = builder.table;
+         this.autoCreateTable = builder.autoCreateTable;
+         this.schemaPath = builder.schemaPath;
+         this.bucket = builder.bucket;
+         this.sourceFormat = builder.sourceFormat.toUpperCase();
+         this.fieldDelimiter = builder.fieldDelimiter;
+         this.maxBadrecords = builder.maxBadrecords;
+         this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
+         this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
+         this.isSkipJobResultCheck = builder.isSkipJobResultCheck;
+
+         BigqueryAuthentication auth = new BigqueryAuthentication(builder.serviceAccountEmail, builder.p12KeyFilePath, builder.applicationName);
+         this.bigQueryClient = auth.getBigqueryClient();
+         this.writerTask = new EmbulkBigqueryTask();
+     }
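+
+     // Fetches the job's current server-side state once. Throws JobFailedException
+     // as soon as BigQuery reports an error result; an IOException while polling
+     // is logged and mapped to "UNKNOWN" so the polling loop simply retries.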
+     private String getJobStatus(JobReference jobRef) throws JobFailedException
+     {
+         try {
+             Job job = bigQueryClient.jobs().get(project, jobRef.getJobId()).execute();
+             if (job.getStatus().getErrorResult() != null) {
+                 throw new JobFailedException(String.format("Job failed. job id:[%s] reason:[%s] status:[FAILED]", jobRef.getJobId(), job.getStatus().getErrorResult().getMessage()));
+             }
+             String jobStatus = job.getStatus().getState();
+             if (jobStatus.equals("DONE")) {
+                 JobStatistics statistics = job.getStatistics();
+                 log.info(String.format("Job statistics [%s]", statistics.getLoad()));
+             }
+             return jobStatus;
+         } catch (IOException ex) {
+             log.warn(ex.getMessage());
+             return "UNKNOWN";
+         }
+     }
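+
+     // Polls getJobStatus() every jobStatusPollingInterval seconds until the job
+     // reaches DONE, raising TimeoutException after jobStatusMaxPollingTime seconds.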
+     private void getJobStatusUntilDone(JobReference jobRef) throws TimeoutException, JobFailedException
+     {
+         long startTime = System.currentTimeMillis();
+         long elapsedTime;
+
+         try {
+             while (true) {
+                 String jobStatus = getJobStatus(jobRef);
+                 elapsedTime = System.currentTimeMillis() - startTime;
+                 if (jobStatus.equals("DONE")) {
+                     log.info(String.format("Job completed successfully. job_id:[%s] elapsed_time:%dms status:[%s]", jobRef.getJobId(), elapsedTime, "SUCCESS"));
+                     break;
+                 } else if (elapsedTime > jobStatusMaxPollingTime * 1000) {
+                     throw new TimeoutException(String.format("Checking job status... Timeout. job_id:[%s] elapsed_time:%dms status:[%s]", jobRef.getJobId(), elapsedTime, "TIMEOUT"));
+                 } else {
+                     log.info(String.format("Checking job status... job_id:[%s] elapsed_time:%dms status:[%s]", jobRef.getJobId(), elapsedTime, jobStatus));
+                 }
+                 Thread.sleep(jobStatusPollingInterval * 1000);
+             }
+         } catch (InterruptedException ex) {
+             // Restore the interrupt flag so callers can tell the wait was cut short.
+             Thread.currentThread().interrupt();
+             log.warn(ex.getMessage());
+         }
+     }
+
+     public void executeJob() throws IOException, TimeoutException, JobFailedException
+     {
+         // TODO: refactor
+         ArrayList<ArrayList<HashMap<String, String>>> jobList = writerTask.createJobList();
+         for (ArrayList<HashMap<String, String>> jobFiles : jobList) {
+             Job job = createJob(jobFiles);
+             // TODO: multi-threading
+             new EmbulkBigqueryJob(job).call();
+         }
+     }
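+
+     // Builds the load-job description. With auto_create_table the table may be
+     // created on demand (CREATE_IF_NEEDED) but existing data is never overwritten
+     // (WRITE_EMPTY); otherwise rows are appended to an existing table only
+     // (WRITE_APPEND + CREATE_NEVER).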
+     private Job createJob(ArrayList<HashMap<String, String>> files)
+     {
+         log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));
+
+         Job job = new Job();
+         JobConfiguration jobConfig = new JobConfiguration();
+         JobConfigurationLoad loadConfig = new JobConfigurationLoad();
+         jobConfig.setLoad(loadConfig);
+         job.setConfiguration(jobConfig);
+
+         loadConfig.setAllowQuotedNewlines(false);
+         if (sourceFormat.equals("NEWLINE_DELIMITED_JSON")) {
+             loadConfig.setSourceFormat("NEWLINE_DELIMITED_JSON");
+         } else {
+             loadConfig.setFieldDelimiter(fieldDelimiter);
+         }
+         if (autoCreateTable) {
+             loadConfig.setSchema(getTableSchema());
+             loadConfig.setWriteDisposition("WRITE_EMPTY");
+             loadConfig.setCreateDisposition("CREATE_IF_NEEDED");
+             log.info(String.format("table:[%s] will be created.", table));
+         } else {
+             loadConfig.setWriteDisposition("WRITE_APPEND");
+             loadConfig.setCreateDisposition("CREATE_NEVER");
+         }
+         loadConfig.setMaxBadRecords(maxBadrecords);
+
+         List<String> sources = new ArrayList<String>();
+         for (HashMap<String, String> file : files) {
+             String sourceFile = "gs://" + getRemotePath(file.get("remote_path"), file.get("file_name"));
+             log.info(String.format("Add source file to job [%s]", sourceFile));
+             sources.add(sourceFile);
+         }
+         loadConfig.setSourceUris(sources);
+         loadConfig.setDestinationTable(getTableReference());
+
+         return job;
+     }
+
+     private TableReference getTableReference()
+     {
+         return new TableReference()
+                 .setProjectId(project)
+                 .setDatasetId(dataset)
+                 .setTableId(table);
+     }
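+
+     // Placeholder: returns an empty schema until loading the field definitions
+     // from the JSON file given as schema_path is implemented (see TODO below).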
+     private TableSchema getTableSchema()
+     {
+         TableSchema tableSchema = new TableSchema();
+         List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
+         // TODO: import the field definitions from the JSON file given as schema_path
+         /*
+         for (...) {
+             TableFieldSchema tableField = new TableFieldSchema()
+                     .setName(name)
+                     .setType(type);
+             fields.add(tableField);
+         }
+         */
+
+         tableSchema.setFields(fields);
+         return tableSchema;
+     }
+
+     private String getRemotePath(String remotePath, String fileName)
+     {
+         if (remotePath.isEmpty()) {
+             return bucket + "/" + fileName;
+         }
+         // Re-join the path segments with '/' so that leading, trailing, or
+         // duplicated slashes in remote_path cannot produce a malformed GCS URI.
+         String[] pathList = StringUtils.split(remotePath, '/');
+         return bucket + "/" + StringUtils.join(pathList, '/') + "/" + fileName;
+     }
+
+     public void addTask(Optional<String> remotePath, String fileName, long fileSize)
+     {
+         writerTask.addTaskFile(remotePath, fileName, fileSize);
+     }
+
+     public ArrayList<HashMap<String, String>> getFileList()
+     {
+         return writerTask.getFileList();
+     }
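+
+     // Wraps the submission of one load job. Implemented as a Callable so that
+     // the multi-threading TODO in executeJob() can later hand it to an executor.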
+     private class EmbulkBigqueryJob implements Callable<Void>
+     {
+         private final Job job;
+
+         public EmbulkBigqueryJob(Job job)
+         {
+             this.job = job;
+         }
+
+         @Override
+         public Void call() throws IOException, TimeoutException, JobFailedException
+         {
+             Insert insert = bigQueryClient.jobs().insert(project, job);
+             insert.setProjectId(project);
+             JobReference jobRef = insert.execute().getJobReference();
+             log.info(String.format("Job executed. job id:[%s]", jobRef.getJobId()));
+             if (isSkipJobResultCheck) {
+                 log.info(String.format("Skip job status check. job id:[%s]", jobRef.getJobId()));
+             } else {
+                 getJobStatusUntilDone(jobRef);
+             }
+             return null;
+         }
+     }
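+
+     // Accumulates task files and groups them into load jobs that respect
+     // BigQuery's documented per-job quotas (see the link below). For example,
+     // three 600 GB files become three single-file jobs, since any two of them
+     // together would exceed the 1 TB size cap.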
+     private class EmbulkBigqueryTask
+     {
+         // https://cloud.google.com/bigquery/loading-data-into-bigquery#quota
+         private final long MAX_SIZE_PER_LOAD_JOB = 1000 * 1024 * 1024 * 1024L; // 1TB
+         private final int MAX_NUMBER_OF_FILES_PER_LOAD_JOB = 10000;
+
+         private final ArrayList<HashMap<String, String>> taskList = new ArrayList<HashMap<String, String>>();
+         private final ArrayList<ArrayList<HashMap<String, String>>> jobList = new ArrayList<ArrayList<HashMap<String, String>>>();
+
+         public void addTaskFile(Optional<String> remotePath, String fileName, long fileSize)
+         {
+             HashMap<String, String> task = new HashMap<String, String>();
+             task.put("remote_path", remotePath.or(""));
+             task.put("file_name", fileName);
+             task.put("file_size", String.valueOf(fileSize));
+             taskList.add(task);
+         }
+
+         public ArrayList<ArrayList<HashMap<String, String>>> createJobList()
+         {
+             long currentBundleSize = 0;
+             int currentFileCount = 0;
+             ArrayList<HashMap<String, String>> job = new ArrayList<HashMap<String, String>>();
+             for (HashMap<String, String> task : taskList) {
+                 long fileSize = Long.parseLong(task.get("file_size"));
+                 boolean isNeedNextJobList = currentBundleSize + fileSize > MAX_SIZE_PER_LOAD_JOB
+                         || currentFileCount >= MAX_NUMBER_OF_FILES_PER_LOAD_JOB;
+
+                 if (isNeedNextJobList) {
+                     // Close the current bundle and start a new one. Both the running
+                     // size and the file counter must be reset here.
+                     jobList.add(job);
+                     job = new ArrayList<HashMap<String, String>>();
+                     currentBundleSize = 0;
+                     currentFileCount = 0;
+                 }
+                 job.add(task);
+                 currentBundleSize += fileSize;
+                 currentFileCount++;
+
+                 log.debug(String.format("currentBundleSize:%s currentFileCount:%s", currentBundleSize, currentFileCount));
+                 log.debug(String.format("fileSize:%s, MAX_SIZE_PER_LOAD_JOB:%s MAX_NUMBER_OF_FILES_PER_LOAD_JOB:%s",
+                         fileSize, MAX_SIZE_PER_LOAD_JOB, MAX_NUMBER_OF_FILES_PER_LOAD_JOB));
+             }
+             if (job.size() > 0) {
+                 jobList.add(job);
+             }
+             return jobList;
+         }
+
+         public ArrayList<HashMap<String, String>> getFileList()
+         {
+             return taskList;
+         }
+     }
+
+     public static class Builder
+     {
+         private final String serviceAccountEmail;
+         private String p12KeyFilePath;
+         private String applicationName;
+         private String project;
+         private String dataset;
+         private String table;
+         private boolean autoCreateTable;
+         private Optional<String> schemaPath;
+         private String bucket;
+         private String sourceFormat;
+         private String fieldDelimiter;
+         private int maxBadrecords;
+         private int jobStatusMaxPollingTime;
+         private int jobStatusPollingInterval;
+         private boolean isSkipJobResultCheck;
+
+         public Builder(String serviceAccountEmail)
+         {
+             this.serviceAccountEmail = serviceAccountEmail;
+         }
+
+         public Builder setP12KeyFilePath(String p12KeyFilePath)
+         {
+             this.p12KeyFilePath = p12KeyFilePath;
+             return this;
+         }
+
+         public Builder setApplicationName(String applicationName)
+         {
+             this.applicationName = applicationName;
+             return this;
+         }
+
+         public Builder setProject(String project)
+         {
+             this.project = project;
+             return this;
+         }
+
+         public Builder setDataset(String dataset)
+         {
+             this.dataset = dataset;
+             return this;
+         }
+
+         public Builder setTable(String table)
+         {
+             this.table = table;
+             return this;
+         }
+
+         public Builder setAutoCreateTable(boolean autoCreateTable)
+         {
+             this.autoCreateTable = autoCreateTable;
+             return this;
+         }
+
+         public Builder setSchemaPath(Optional<String> schemaPath)
+         {
+             this.schemaPath = schemaPath;
+             return this;
+         }
+
+         public Builder setBucket(String bucket)
+         {
+             this.bucket = bucket;
+             return this;
+         }
+
+         public Builder setSourceFormat(String sourceFormat)
+         {
+             this.sourceFormat = sourceFormat;
+             return this;
+         }
+
+         public Builder setFieldDelimiter(String fieldDelimiter)
+         {
+             this.fieldDelimiter = fieldDelimiter;
+             return this;
+         }
+
+         public Builder setMaxBadrecords(int maxBadrecords)
+         {
+             this.maxBadrecords = maxBadrecords;
+             return this;
+         }
+
+         public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
+         {
+             this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
+             return this;
+         }
+
+         public Builder setJobStatusPollingInterval(int jobStatusPollingInterval)
+         {
+             this.jobStatusPollingInterval = jobStatusPollingInterval;
+             return this;
+         }
+
+         public Builder setIsSkipJobResultCheck(boolean isSkipJobResultCheck)
+         {
+             this.isSkipJobResultCheck = isSkipJobResultCheck;
+             return this;
+         }
+
+         public BigqueryWriter build() throws IOException, GeneralSecurityException
+         {
+             return new BigqueryWriter(this);
+         }
+     }
+
+     public static class JobFailedException extends Exception
+     {
+         public JobFailedException(String message)
+         {
+             super(message);
+         }
+     }
+ }
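
For orientation, here is a minimal sketch of how a caller might drive this writer end to end. The method names come from the class above; the literal configuration values are placeholders rather than plugin defaults, and the checked exceptions (IOException, GeneralSecurityException, TimeoutException, JobFailedException) are left to the surrounding code:

    BigqueryWriter writer = new BigqueryWriter.Builder("loader@example-project.iam.gserviceaccount.com")
            .setP12KeyFilePath("/path/to/private_key.p12")
            .setApplicationName("embulk-output-bigquery")
            .setProject("example-project")
            .setDataset("example_dataset")
            .setTable("example_table")
            .setBucket("example-bucket")
            .setSourceFormat("CSV")
            .setFieldDelimiter(",")
            .setMaxBadrecords(0)
            .setJobStatusMaxPollingTime(3600)
            .setJobStatusPollingInterval(10)
            .setIsSkipJobResultCheck(false)
            .setAutoCreateTable(false)
            .build();

    // Register files already uploaded to GCS, then submit and poll the load jobs.
    writer.addTask(com.google.common.base.Optional.of("logs/2015-02-27"), "part-000.csv", 123456L);
    writer.executeJob();
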
@@ -0,0 +1,5 @@
+ package org.embulk.output;
+
+ public class TestBigqueryAuthentication
+ {
+ }
@@ -0,0 +1,5 @@
+ package org.embulk.output;
+
+ public class TestBigqueryGcsWriter
+ {
+ }
@@ -0,0 +1,5 @@
+ package org.embulk.output;
+
+ public class TestBigqueryOutputPlugin
+ {
+ }