embulk-output-bigquery 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,3 @@
1
# Register the Java-implemented output plugin under the name "bigquery".
# The classpath directory is resolved relative to this file.
plugin_classpath = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_output(
  "bigquery",
  "org.embulk.output.BigqueryOutputPlugin",
  plugin_classpath
)
data/settings.gradle ADDED
@@ -0,0 +1,2 @@
1
// Name of the root Gradle project for this build.
rootProject.name = "embulk-output-bigquery"
2
+
@@ -0,0 +1,99 @@
1
+ package org.embulk.output;
2
+
3
+ import java.io.File;
4
+ import java.io.FileNotFoundException;
5
+ import java.io.FileInputStream;
6
+ import java.io.IOException;
7
+ import java.util.ArrayList;
8
+ import java.util.List;
9
+ import java.util.IllegalFormatException;
10
+ import com.google.api.client.auth.oauth2.Credential;
11
+ import com.google.api.client.auth.oauth2.CredentialRefreshListener;
12
+ import com.google.api.client.auth.oauth2.TokenErrorResponse;
13
+ import com.google.api.client.auth.oauth2.TokenResponse;
14
+ import com.google.common.collect.ImmutableList;
15
+ import java.security.GeneralSecurityException;
16
+
17
+ import org.embulk.spi.Exec;
18
+ import org.slf4j.Logger;
19
+
20
+ import com.google.api.client.googleapis.auth.oauth2.GoogleCredential;
21
+ import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
22
+ import com.google.api.client.http.HttpTransport;
23
+ import com.google.api.client.http.InputStreamContent;
24
+ import com.google.api.client.json.JsonFactory;
25
+ import com.google.api.client.json.jackson2.JacksonFactory;
26
+ import com.google.api.services.storage.Storage;
27
+ import com.google.api.services.storage.StorageScopes;
28
+ import com.google.api.services.bigquery.Bigquery;
29
+ import com.google.api.services.bigquery.BigqueryScopes;
30
+ import com.google.api.services.bigquery.model.ProjectList;
31
+
32
+ public class BigqueryAuthentication
33
+ {
34
+
35
+ private final Logger log = Exec.getLogger(BigqueryAuthentication.class);
36
+ private final String serviceAccountEmail;
37
+ private final String p12KeyFilePath;
38
+ private final String applicationName;
39
+ private final HttpTransport httpTransport;
40
+ private final JsonFactory jsonFactory;
41
+ private final GoogleCredential credentials;
42
+
43
+ public BigqueryAuthentication(String serviceAccountEmail, String p12KeyFilePath, String applicationName) throws IOException, GeneralSecurityException
44
+ {
45
+ this.serviceAccountEmail = serviceAccountEmail;
46
+ this.p12KeyFilePath = p12KeyFilePath;
47
+ this.applicationName = applicationName;
48
+
49
+ this.httpTransport = GoogleNetHttpTransport.newTrustedTransport();
50
+ this.jsonFactory = new JacksonFactory();
51
+ this.credentials = getCredentialProvider();
52
+ }
53
+
54
+ /**
55
+ * @see https://developers.google.com/accounts/docs/OAuth2ServiceAccount#authorizingrequests
56
+ */
57
+ private GoogleCredential getCredentialProvider() throws IOException, GeneralSecurityException
58
+ {
59
+ // @see https://cloud.google.com/compute/docs/api/how-tos/authorization
60
+ // @see https://developers.google.com/resources/api-libraries/documentation/storage/v1/java/latest/com/google/api/services/storage/STORAGE_SCOPE.html
61
+ GoogleCredential cred = new GoogleCredential.Builder()
62
+ .setTransport(httpTransport)
63
+ .setJsonFactory(jsonFactory)
64
+ .setServiceAccountId(serviceAccountEmail)
65
+ .setServiceAccountScopes(
66
+ ImmutableList.of(
67
+ BigqueryScopes.DEVSTORAGE_READ_WRITE,
68
+ BigqueryScopes.BIGQUERY
69
+ )
70
+ )
71
+ .setServiceAccountPrivateKeyFromP12File(new File(p12KeyFilePath))
72
+ .build();
73
+ return cred;
74
+ }
75
+
76
+ public Bigquery getBigqueryClient() throws IOException
77
+ {
78
+ Bigquery client = new Bigquery.Builder(httpTransport, jsonFactory, credentials)
79
+ .setHttpRequestInitializer(credentials)
80
+ .setApplicationName(applicationName)
81
+ .build();
82
+
83
+ // For throw IOException when authentication is failed.
84
+ long maxResults = 1;
85
+ Bigquery.Projects.List req = client.projects().list().setMaxResults(maxResults);
86
+ ProjectList projectList = req.execute();
87
+
88
+ return client;
89
+ }
90
+
91
+ public Storage getGcsClient() throws IOException
92
+ {
93
+ Storage client = new Storage.Builder(httpTransport, jsonFactory, credentials)
94
+ .setApplicationName(applicationName)
95
+ .build();
96
+
97
+ return client;
98
+ }
99
+ }
@@ -0,0 +1,201 @@
1
+ package org.embulk.output;
2
+
3
+ import java.io.File;
4
+ import java.io.FileNotFoundException;
5
+ import java.io.FileInputStream;
6
+ import java.io.IOException;
7
+ import java.util.ArrayList;
8
+ import java.util.List;
9
+ import java.util.Collection;
10
+ import java.util.Iterator;
11
+ import java.util.IllegalFormatException;
12
+ import java.nio.charset.Charset;
13
+ import java.nio.charset.StandardCharsets;
14
+ import com.google.common.base.Optional;
15
+ import com.google.common.collect.ImmutableList;
16
+ //import eu.medsea.mimeutil.MimeType;
17
+ //import eu.medsea.mimeutil.MimeUtil;
18
+ //import eu.medsea.mimeutil.detector.MimeDetector;
19
+ import org.apache.commons.lang3.StringUtils;
20
+ import org.apache.commons.codec.binary.Base64;
21
+ import java.security.GeneralSecurityException;
22
+
23
+ import org.embulk.spi.Exec;
24
+ import org.slf4j.Logger;
25
+
26
+ import com.google.api.services.storage.Storage;
27
+ import com.google.api.services.storage.StorageScopes;
28
+ import com.google.api.services.storage.model.Bucket;
29
+ import com.google.api.services.storage.model.Objects;
30
+ import com.google.api.services.storage.model.StorageObject;
31
+
32
+ import com.google.api.client.http.InputStreamContent;
33
+
34
+ public class BigqueryGcsWriter
35
+ {
36
+
37
+ private final Logger log = Exec.getLogger(BigqueryGcsWriter.class);
38
+ private final String bucket;
39
+ private final String sourceFormat;
40
+ private final boolean isFileCompressed;
41
+ private final boolean deleteFromBucketWhenJobEnd;
42
+ private Storage storageClient;
43
+
44
+ public BigqueryGcsWriter(Builder builder) throws IOException, GeneralSecurityException
45
+ {
46
+ this.bucket = builder.bucket;
47
+ this.sourceFormat = builder.sourceFormat.toUpperCase();
48
+ this.isFileCompressed = builder.isFileCompressed;
49
+ this.deleteFromBucketWhenJobEnd = builder.deleteFromBucketWhenJobEnd;
50
+
51
+ BigqueryAuthentication auth = new BigqueryAuthentication(builder.serviceAccountEmail, builder.p12KeyFilePath, builder.applicationName);
52
+ this.storageClient = auth.getGcsClient();
53
+ }
54
+
55
+ public void uploadFile(String localFilePath, String fileName, Optional<String> remotePath) throws IOException
56
+ {
57
+ FileInputStream stream = null;
58
+
59
+ try {
60
+ String path;
61
+ if (remotePath.isPresent()) {
62
+ path = remotePath.get();
63
+ } else {
64
+ path = "";
65
+ }
66
+ String gcsPath = getRemotePath(path, fileName);
67
+ StorageObject objectMetadata = new StorageObject().setName(gcsPath);
68
+ log.info(String.format("Uploading file [%s] to [gs://%s/%s]", localFilePath, bucket, gcsPath));
69
+
70
+ File file = new File(localFilePath);
71
+ stream = new FileInputStream(file);
72
+ InputStreamContent content = new InputStreamContent(getContentType(), stream);
73
+ Storage.Objects.Insert insertObject = storageClient.objects().insert(bucket, objectMetadata, content);
74
+ insertObject.setDisableGZipContent(true);
75
+
76
+ StorageObject response = insertObject.execute();
77
+ log.info(String.format("Upload completed [%s] to [gs://%s/%s]", localFilePath, bucket, gcsPath));
78
+ } finally {
79
+ stream.close();
80
+ }
81
+ }
82
+
83
+ private String getRemotePath(String remotePath, String fileName)
84
+ {
85
+ if (remotePath.isEmpty()) {
86
+ return fileName;
87
+ }
88
+ String[] pathList = StringUtils.split(remotePath, '/');
89
+ String path = StringUtils.join(pathList) + "/";
90
+ if (!path.endsWith("/")) {
91
+ path = path + "/";
92
+ }
93
+ return path + fileName;
94
+ }
95
+
96
+ public void deleteFile(String remotePath, String fileName) throws IOException
97
+ {
98
+ String path = getRemotePath(remotePath, fileName);
99
+ storageClient.objects().delete(bucket, path).execute();
100
+ log.info(String.format("Delete remote file [gs://%s/%s]", bucket, path));
101
+ }
102
+
103
+ public boolean getDeleteFromBucketWhenJobEnd()
104
+ {
105
+ return this.deleteFromBucketWhenJobEnd;
106
+ }
107
+
108
+ private String getContentType()
109
+ {
110
+ if (isFileCompressed) {
111
+ return "application/x-gzip";
112
+ } else {
113
+ if (sourceFormat.equals("NEWLINE_DELIMITED_JSON)")) {
114
+ return "application/json";
115
+ } else {
116
+ return "text/csv";
117
+ }
118
+ }
119
+ }
120
+
121
+ /*
122
+ private void registerMimeDetector()
123
+ {
124
+ String mimeDetector = "eu.medsea.mimeutil.detector.MagicMimeMimeDetector";
125
+ MimeDetector registeredMimeDetector = MimeUtil.getMimeDetector(mimeDetector);
126
+ MimeUtil.registerMimeDetector(mimeDetector);
127
+ }
128
+
129
+ public String detectMimeType(File file)
130
+ {
131
+ try {
132
+ Collection<?> mimeTypes = MimeUtil.getMimeTypes(file);
133
+ if (!mimeTypes.isEmpty()) {
134
+ Iterator<?> iterator = mimeTypes.iterator();
135
+ MimeType mimeType = (MimeType) iterator.next();
136
+ return mimeType.getMediaType() + "/" + mimeType.getSubType();
137
+ }
138
+ } catch (Exception ex) {
139
+ }
140
+ return "application/octet-stream";
141
+ }
142
+ */
143
+
144
+ public static class Builder
145
+ {
146
+ private final String serviceAccountEmail;
147
+ private String p12KeyFilePath;
148
+ private String applicationName;
149
+ private String bucket;
150
+ private String sourceFormat;
151
+ private boolean isFileCompressed;
152
+ private boolean deleteFromBucketWhenJobEnd;
153
+ private boolean enableMd5hashCheck;
154
+
155
+ public Builder(String serviceAccountEmail)
156
+ {
157
+ this.serviceAccountEmail = serviceAccountEmail;
158
+ }
159
+
160
+ public Builder setP12KeyFilePath(String p12KeyFilePath)
161
+ {
162
+ this.p12KeyFilePath = p12KeyFilePath;
163
+ return this;
164
+ }
165
+
166
+ public Builder setApplicationName(String applicationName)
167
+ {
168
+ this.applicationName = applicationName;
169
+ return this;
170
+ }
171
+
172
+ public Builder setBucket(String bucket)
173
+ {
174
+ this.bucket = bucket;
175
+ return this;
176
+ }
177
+
178
+ public Builder setSourceFormat(String sourceFormat)
179
+ {
180
+ this.sourceFormat = sourceFormat;
181
+ return this;
182
+ }
183
+
184
+ public Builder setIsFileCompressed(boolean isFileCompressed)
185
+ {
186
+ this.isFileCompressed = isFileCompressed;
187
+ return this;
188
+ }
189
+
190
+ public Builder setDeleteFromBucketWhenJobEnd(boolean deleteFromBucketWhenJobEnd)
191
+ {
192
+ this.deleteFromBucketWhenJobEnd = deleteFromBucketWhenJobEnd;
193
+ return this;
194
+ }
195
+
196
+ public BigqueryGcsWriter build() throws IOException, GeneralSecurityException
197
+ {
198
+ return new BigqueryGcsWriter(this);
199
+ }
200
+ }
201
+ }
@@ -0,0 +1,293 @@
1
+ package org.embulk.output;
2
+
3
+ import java.io.File;
4
+ import java.io.FileWriter;
5
+ import java.io.FileNotFoundException;
6
+ import java.io.FileOutputStream;
7
+ import java.io.BufferedOutputStream;
8
+ import java.io.IOException;
9
+ import java.util.List;
10
+ import java.util.ArrayList;
11
+ import java.util.HashMap;
12
+ import java.util.concurrent.TimeoutException;
13
+ import com.google.common.base.Optional;
14
+ import com.google.common.base.Throwables;
15
+ import java.security.GeneralSecurityException;
16
+
17
+ import org.embulk.config.Config;
18
+ import org.embulk.config.ConfigException;
19
+ import org.embulk.config.ConfigDefault;
20
+ import org.embulk.config.ConfigSource;
21
+ import org.embulk.config.ConfigDiff;
22
+ import org.embulk.config.CommitReport;
23
+ import org.embulk.config.Task;
24
+ import org.embulk.config.TaskSource;
25
+ import org.embulk.spi.Buffer;
26
+ import org.embulk.spi.FileOutputPlugin;
27
+ import org.embulk.spi.TransactionalFileOutput;
28
+ import org.embulk.spi.Exec;
29
+
30
+ import org.slf4j.Logger;
31
+
32
+ public class BigqueryOutputPlugin
33
+ implements FileOutputPlugin
34
+ {
35
+ public interface PluginTask
36
+ extends Task
37
+ {
38
+ @Config("service_account_email")
39
+ public String getServiceAccountEmail();
40
+
41
+ @Config("p12_keyfile_path")
42
+ public String getP12KeyfilePath();
43
+
44
+ @Config("application_name")
45
+ @ConfigDefault("\"Embulk BigQuery plugin\"")
46
+ public String getApplicationName();
47
+
48
+ @Config("path_prefix")
49
+ public String getPathPrefix();
50
+
51
+ @Config("sequence_format")
52
+ @ConfigDefault("\".%03d.%02d\"")
53
+ public String getSequenceFormat();
54
+
55
+ @Config("file_ext")
56
+ public String getFileNameExtension();
57
+
58
+ @Config("source_format")
59
+ @ConfigDefault("\"CSV\"")
60
+ public String getSourceFormat();
61
+
62
+ @Config("is_file_compressed")
63
+ @ConfigDefault("true")
64
+ public boolean getIsFileCompressed();
65
+
66
+ @Config("field_delimiter")
67
+ @ConfigDefault("\",\"")
68
+ public String getFieldDelimiter();
69
+
70
+ @Config("max_bad_records")
71
+ @ConfigDefault("0")
72
+ public int getMaxBadrecords();
73
+
74
+ @Config("delete_from_local_when_upload_end")
75
+ @ConfigDefault("false")
76
+ public boolean getDeleteFromLocalWhenUploadEnd();
77
+
78
+ @Config("delete_from_bucket_when_job_end")
79
+ @ConfigDefault("false")
80
+ public boolean getDeleteFromBucketWhenJobEnd();
81
+
82
+ @Config("bucket")
83
+ public String getBucket();
84
+
85
+ @Config("remote_path")
86
+ @ConfigDefault("null")
87
+ public Optional<String> getRemotePath();
88
+
89
+ @Config("project")
90
+ public String getProject();
91
+
92
+ @Config("dataset")
93
+ public String getDataset();
94
+
95
+ @Config("table")
96
+ public String getTable();
97
+
98
+ @Config("auto_create_table")
99
+ @ConfigDefault("false")
100
+ public boolean getAutoCreateTable();
101
+
102
+ @Config("schema_path")
103
+ @ConfigDefault("null")
104
+ public Optional<String> getSchemaPath();
105
+
106
+ @Config("job_status_max_polling_time")
107
+ @ConfigDefault("3600")
108
+ public int getJobStatusMaxPollingTime();
109
+
110
+ @Config("job_status_polling_interval")
111
+ @ConfigDefault("10")
112
+ public int getJobStatusPollingInterval();
113
+
114
+ @Config("is_skip_job_result_check")
115
+ @ConfigDefault("0")
116
+ public boolean getIsSkipJobResultCheck();
117
+ }
118
+
119
+ private final Logger log = Exec.getLogger(BigqueryOutputPlugin.class);
120
+ private static BigqueryGcsWriter bigQueryGcsWriter;
121
+ private static BigqueryWriter bigQueryWriter;
122
+
123
+ public ConfigDiff transaction(ConfigSource config, int taskCount,
124
+ FileOutputPlugin.Control control)
125
+ {
126
+ final PluginTask task = config.loadConfig(PluginTask.class);
127
+
128
+ try {
129
+ bigQueryGcsWriter = new BigqueryGcsWriter.Builder(task.getServiceAccountEmail())
130
+ .setP12KeyFilePath(task.getP12KeyfilePath())
131
+ .setApplicationName(task.getApplicationName())
132
+ .setBucket(task.getBucket())
133
+ .setSourceFormat(task.getSourceFormat())
134
+ .setIsFileCompressed(task.getIsFileCompressed())
135
+ .setDeleteFromBucketWhenJobEnd(task.getDeleteFromBucketWhenJobEnd())
136
+ .build();
137
+
138
+ bigQueryWriter = new BigqueryWriter.Builder(task.getServiceAccountEmail())
139
+ .setP12KeyFilePath(task.getP12KeyfilePath())
140
+ .setApplicationName(task.getApplicationName())
141
+ .setProject(task.getProject())
142
+ .setDataset(task.getDataset())
143
+ .setTable(task.getTable())
144
+ .setAutoCreateTable(task.getAutoCreateTable())
145
+ .setSchemaPath(task.getSchemaPath())
146
+ .setBucket(task.getBucket())
147
+ .setSourceFormat(task.getSourceFormat())
148
+ .setFieldDelimiter(task.getFieldDelimiter())
149
+ .setMaxBadrecords(task.getMaxBadrecords())
150
+ .setJobStatusMaxPollingTime(task.getJobStatusMaxPollingTime())
151
+ .setJobStatusPollingInterval(task.getJobStatusPollingInterval())
152
+ .setIsSkipJobResultCheck(task.getIsSkipJobResultCheck())
153
+ .build();
154
+ } catch (IOException | GeneralSecurityException ex) {
155
+ log.warn("Google Authentication was failed. Please Check your configurations.");
156
+ throw new ConfigException(ex);
157
+ }
158
+ // non-retryable (non-idempotent) output:
159
+ return resume(task.dump(), taskCount, control);
160
+ }
161
+
162
+ public ConfigDiff resume(TaskSource taskSource,
163
+ int taskCount,
164
+ FileOutputPlugin.Control control)
165
+ {
166
+ control.run(taskSource);
167
+
168
+ try {
169
+ bigQueryWriter.executeJob();
170
+ // TODO refactor
171
+ if (bigQueryGcsWriter.getDeleteFromBucketWhenJobEnd()) {
172
+ ArrayList<HashMap<String, String>> fileList = bigQueryWriter.getFileList();
173
+ for (HashMap<String, String> file : fileList) {
174
+ bigQueryGcsWriter.deleteFile(file.get("remote_path"), file.get("file_name"));
175
+ }
176
+ }
177
+ } catch (IOException | TimeoutException | BigqueryWriter.JobFailedException ex) {
178
+ log.warn(ex.getMessage());
179
+ throw Throwables.propagate(ex);
180
+ }
181
+ return Exec.newConfigDiff();
182
+ }
183
+
184
+ @Override
185
+ public void cleanup(TaskSource taskSource,
186
+ int taskCount,
187
+ List<CommitReport> successCommitReports)
188
+ {
189
+ }
190
+
191
+ @Override
192
+ public TransactionalFileOutput open(TaskSource taskSource, final int taskIndex)
193
+ {
194
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
195
+
196
+ final String pathPrefix = task.getPathPrefix();
197
+ final String sequenceFormat = task.getSequenceFormat();
198
+ final String pathSuffix = task.getFileNameExtension();
199
+ final Optional<String> remotePath = task.getRemotePath();
200
+
201
+ return new TransactionalFileOutput() {
202
+ private int fileIndex = 0;
203
+ private BufferedOutputStream output = null;
204
+ private File file;
205
+ private String filePath;
206
+ private String fileName;
207
+ private long fileSize;
208
+
209
+ public void nextFile()
210
+ {
211
+ closeFile();
212
+
213
+ try {
214
+ String suffix = pathSuffix;
215
+ if (!suffix.startsWith(".")) {
216
+ suffix = "." + suffix;
217
+ }
218
+ filePath = pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + suffix;
219
+ file = new File(filePath);
220
+ fileName = file.getName();
221
+ fileSize = file.length();
222
+
223
+ String parentPath = file.getParent();
224
+ File dir = new File(parentPath);
225
+ if (!dir.exists()) {
226
+ dir.mkdir();
227
+ }
228
+ log.info(String.format("Writing file [%s]", filePath));
229
+ output = new BufferedOutputStream(new FileOutputStream(filePath));
230
+ } catch (FileNotFoundException ex) {
231
+ throw Throwables.propagate(ex);
232
+ }
233
+ fileIndex++;
234
+ }
235
+
236
+ private void closeFile()
237
+ {
238
+ if (output != null) {
239
+ try {
240
+ output.close();
241
+ } catch (IOException ex) {
242
+ throw Throwables.propagate(ex);
243
+ }
244
+ }
245
+ }
246
+
247
+ public void add(Buffer buffer)
248
+ {
249
+ try {
250
+ output.write(buffer.array(), buffer.offset(), buffer.limit());
251
+ } catch (IOException ex) {
252
+ throw Throwables.propagate(ex);
253
+ } finally {
254
+ buffer.release();
255
+ }
256
+ }
257
+
258
+ public void finish()
259
+ {
260
+ closeFile();
261
+ if (fileName != null) {
262
+ try {
263
+ bigQueryGcsWriter.uploadFile(filePath, fileName, remotePath);
264
+
265
+ if (task.getDeleteFromLocalWhenUploadEnd()) {
266
+ log.info(String.format("Delete local file [%s]", filePath));
267
+ file.delete();
268
+ }
269
+
270
+ bigQueryWriter.addTask(remotePath, fileName, fileSize);
271
+ } catch (IOException ex) {
272
+ throw Throwables.propagate(ex);
273
+ }
274
+ }
275
+ }
276
+
277
+ public void close()
278
+ {
279
+ closeFile();
280
+ }
281
+
282
+ public void abort()
283
+ {
284
+ }
285
+
286
+ public CommitReport commit()
287
+ {
288
+ CommitReport report = Exec.newCommitReport();
289
+ return report;
290
+ }
291
+ };
292
+ }
293
+ }