embulk-output-bigquery 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 76284d5061944ac0c9f2f315cb46bc681b81dce0
4
- data.tar.gz: f75c285ff585d5177273fd70897e320c6375c6f3
3
+ metadata.gz: 46c61dd1c73ff99c3c69bd217ca772f07b2e1127
4
+ data.tar.gz: ba184360972884260c1fe90264af7d5386791804
5
5
  SHA512:
6
- metadata.gz: 7d4b83984a02db3af2dfb769f136d650c0458b5c3e9f0ab64823a558a8ca61885845a466133cc97c6092be60f05258d1e1d24ffd61c0fa78eb7018027d63a02c
7
- data.tar.gz: 7e4296ff13ebd7c3c9ff68e6a1701a4e26214b47a08714fe4ae31889d919433ee3f42341ad65651a44e8a85d3cf43f8bc8a83238c0d67d44468e0f39b48940d5
6
+ metadata.gz: aa693e59cb4b45c2d43f07479f3d61e63242185be9964d4f00b83a4a784a0443ae270a63760f3f2f188e74deb77cbb94a89a18db49d2c5cd4621f18b73363ab3
7
+ data.tar.gz: 7c0ea783220de28befd7c565ff83ec5ff58f13af0db16b3d341a12c3e415adeacba375e5688a42fcbb26d0402a48071622ed5b161fa52fd08b1f56444faf66e1
data/README.md CHANGED
@@ -1,17 +1,17 @@
1
1
 
2
2
  # embulk-output-bigquery
3
3
 
4
- [Embulk](https://github.com/embulk/embulk/) output plugin to load/insert data into [Google BigQuery](https://cloud.google.com/bigquery/) via [GCS(Google Cloud Storage)](https://cloud.google.com/storage/)
4
+ [Embulk](https://github.com/embulk/embulk/) output plugin to load/insert data into [Google BigQuery](https://cloud.google.com/bigquery/)
5
5
 
6
6
  ## Overview
7
7
 
8
- load data into Google BigQuery as batch jobs via GCS for big amount of data
8
+ load data into Google BigQuery as batch jobs for large amounts of data
9
9
  https://developers.google.com/bigquery/loading-data-into-bigquery
10
10
 
11
11
  * **Plugin type**: output
12
12
  * **Resume supported**: no
13
13
  * **Cleanup supported**: no
14
- * **Dynamic table creating**: todo
14
+ * **Dynamic table creating**: yes
15
15
 
16
16
  ### NOT IMPLEMENTED
17
17
  * insert data over streaming inserts
@@ -30,32 +30,19 @@ OAuth flow for installed applications.
30
30
  - **sequence_format**: (string, optional, default is %03d.%02d)
31
31
  - **file_ext**: (string, required)
32
32
  - **source_format**: file type (NEWLINE_DELIMITED_JSON or CSV) (string, required, default is CSV)
33
- - **is_file_compressed**: upload file is gzip compressed or not. (boolean, optional, default is 1)
34
- - **bucket**: Google Cloud Storage output bucket name (string, required)
35
- - **remote_path**: folder name in GCS bucket (string, optional)
36
33
  - **project**: project_id (string, required)
37
34
  - **dataset**: dataset (string, required)
38
35
  - **table**: table name (string, required)
36
+ - **auto_create_table**: (boolean, optional, default is 0)
37
+ - **schema_path**: (string, optional)
39
38
  - **application_name**: application name anything you like (string, optional)
40
- - **delete_from_local_when_upload_end**: (boolean, optional, default is 0)
41
- - **delete_from_bucket_when_job_end**: (boolean, optional, default is 0)
39
+ - **delete_from_local_when_job_end**: (boolean, optional, default is 0)
42
40
  - **job_status_max_polling_time**: max job status polling time. (int, optional, default is 3600 sec)
43
41
  - **job_status_polling_interval**: job status polling interval. (int, optional, default is 10 sec)
44
42
  - **is_skip_job_result_check**: (boolean, optional, default is 0)
45
-
46
- ## Support for Google BigQuery Quota policy
47
- embulk-output-bigquery support following [Google BigQuery Quota policy](https://cloud.google.com/bigquery/loading-data-into-bigquery#quota).
48
-
49
- * Supported
50
- * Maximum size per load job: 1TB across all input files
51
- * Maximum number of files per load job: 10,000
52
- * embulk-output-bigquery divides a file into more than one job, like below.
53
- * job1: file1(1GB) file2(1GB)...file10(1GB)
54
- * job2: file11(1GB) file12(1GB)
55
-
56
- * Not Supported
57
- * Daily limit: 1,000 load jobs per table per day (including failures)
58
- * 10,000 load jobs per project per day (including failures)
43
+ - **field_delimiter**: (string, optional, default is ",")
44
+ - **max_bad_records**: (int, optional, default is 0)
45
+ - **encoding**: (UTF-8 or ISO-8859-1) (string, optional, default is "UTF-8")
59
46
 
60
47
  ## Example
61
48
 
@@ -67,10 +54,7 @@ out:
67
54
  path_prefix: /path/to/output
68
55
  file_ext: csv.gz
69
56
  source_format: CSV
70
- is_file_compressed: 1
71
57
  project: your-project-000
72
- bucket: output_bucket_name
73
- remote_path: folder_name
74
58
  dataset: your_dataset_name
75
59
  table: your_table_name
76
60
  formatter:
@@ -80,6 +64,21 @@ out:
80
64
  - {type: gzip}
81
65
  ```
82
66
 
67
+ ## Dynamic table creating
68
+
69
+ When `auto_create_table` is set to true, the plugin tries to create the table using the BigQuery API if it does not exist yet.
70
+
71
+ To describe the schema of the target table, set `schema_path` to the path of a JSON schema file.
72
+
73
+ The `table` option accepts Ruby's [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
74
+ format to construct the table name.
75
+
76
+ ```
77
+ auto_create_table: true
78
+ table: table_%Y_%m
79
+ schema_path: /path/to/schema.json
80
+ ```
81
+
83
82
  ## Build
84
83
 
85
84
  ```
data/build.gradle CHANGED
@@ -15,16 +15,14 @@ configurations {
15
15
  sourceCompatibility = 1.7
16
16
  targetCompatibility = 1.7
17
17
 
18
- version = "0.1.1"
18
+ version = "0.1.2"
19
19
 
20
20
  dependencies {
21
21
  compile "org.embulk:embulk-core:0.5.1"
22
22
  provided "org.embulk:embulk-core:0.5.1"
23
23
 
24
24
  compile "com.google.http-client:google-http-client-jackson2:1.19.0"
25
- compile ("com.google.apis:google-api-services-storage:v1-rev27-1.19.1") {exclude module: "guava-jdk5"}
26
25
  compile "com.google.apis:google-api-services-bigquery:v2-rev193-1.19.1"
27
- compile "eu.medsea.mimeutil:mime-util:2.1.3"
28
26
 
29
27
  testCompile "junit:junit:4.+"
30
28
  }
@@ -23,8 +23,6 @@ import com.google.api.client.http.HttpTransport;
23
23
  import com.google.api.client.http.InputStreamContent;
24
24
  import com.google.api.client.json.JsonFactory;
25
25
  import com.google.api.client.json.jackson2.JacksonFactory;
26
- import com.google.api.services.storage.Storage;
27
- import com.google.api.services.storage.StorageScopes;
28
26
  import com.google.api.services.bigquery.Bigquery;
29
27
  import com.google.api.services.bigquery.BigqueryScopes;
30
28
  import com.google.api.services.bigquery.model.ProjectList;
@@ -64,7 +62,6 @@ public class BigqueryAuthentication
64
62
  .setServiceAccountId(serviceAccountEmail)
65
63
  .setServiceAccountScopes(
66
64
  ImmutableList.of(
67
- BigqueryScopes.DEVSTORAGE_READ_WRITE,
68
65
  BigqueryScopes.BIGQUERY
69
66
  )
70
67
  )
@@ -87,13 +84,4 @@ public class BigqueryAuthentication
87
84
 
88
85
  return client;
89
86
  }
90
-
91
- public Storage getGcsClient() throws IOException
92
- {
93
- Storage client = new Storage.Builder(httpTransport, jsonFactory, credentials)
94
- .setApplicationName(applicationName)
95
- .build();
96
-
97
- return client;
98
- }
99
87
  }
@@ -13,6 +13,7 @@ import java.util.concurrent.TimeoutException;
13
13
  import com.google.common.base.Optional;
14
14
  import com.google.common.base.Throwables;
15
15
  import java.security.GeneralSecurityException;
16
+ import org.jruby.embed.ScriptingContainer;
16
17
 
17
18
  import org.embulk.config.Config;
18
19
  import org.embulk.config.ConfigException;
@@ -59,10 +60,6 @@ public class BigqueryOutputPlugin
59
60
  @ConfigDefault("\"CSV\"")
60
61
  public String getSourceFormat();
61
62
 
62
- @Config("is_file_compressed")
63
- @ConfigDefault("true")
64
- public boolean getIsFileCompressed();
65
-
66
63
  @Config("field_delimiter")
67
64
  @ConfigDefault("\",\"")
68
65
  public String getFieldDelimiter();
@@ -71,20 +68,13 @@ public class BigqueryOutputPlugin
71
68
  @ConfigDefault("0")
72
69
  public int getMaxBadrecords();
73
70
 
74
- @Config("delete_from_local_when_upload_end")
75
- @ConfigDefault("false")
76
- public boolean getDeleteFromLocalWhenUploadEnd();
71
+ @Config("encoding")
72
+ @ConfigDefault("\"UTF-8\"")
73
+ public String getEncoding();
77
74
 
78
- @Config("delete_from_bucket_when_job_end")
75
+ @Config("delete_from_local_when_job_end")
79
76
  @ConfigDefault("false")
80
- public boolean getDeleteFromBucketWhenJobEnd();
81
-
82
- @Config("bucket")
83
- public String getBucket();
84
-
85
- @Config("remote_path")
86
- @ConfigDefault("null")
87
- public Optional<String> getRemotePath();
77
+ public boolean getDeleteFromLocalWhenJobEnd();
88
78
 
89
79
  @Config("project")
90
80
  public String getProject();
@@ -117,7 +107,6 @@ public class BigqueryOutputPlugin
117
107
  }
118
108
 
119
109
  private final Logger log = Exec.getLogger(BigqueryOutputPlugin.class);
120
- private static BigqueryGcsWriter bigQueryGcsWriter;
121
110
  private static BigqueryWriter bigQueryWriter;
122
111
 
123
112
  public ConfigDiff transaction(ConfigSource config, int taskCount,
@@ -126,33 +115,25 @@ public class BigqueryOutputPlugin
126
115
  final PluginTask task = config.loadConfig(PluginTask.class);
127
116
 
128
117
  try {
129
- bigQueryGcsWriter = new BigqueryGcsWriter.Builder(task.getServiceAccountEmail())
130
- .setP12KeyFilePath(task.getP12KeyfilePath())
131
- .setApplicationName(task.getApplicationName())
132
- .setBucket(task.getBucket())
133
- .setSourceFormat(task.getSourceFormat())
134
- .setIsFileCompressed(task.getIsFileCompressed())
135
- .setDeleteFromBucketWhenJobEnd(task.getDeleteFromBucketWhenJobEnd())
136
- .build();
137
-
138
118
  bigQueryWriter = new BigqueryWriter.Builder(task.getServiceAccountEmail())
139
119
  .setP12KeyFilePath(task.getP12KeyfilePath())
140
120
  .setApplicationName(task.getApplicationName())
141
121
  .setProject(task.getProject())
142
122
  .setDataset(task.getDataset())
143
- .setTable(task.getTable())
123
+ .setTable(generateTableName(task.getTable()))
144
124
  .setAutoCreateTable(task.getAutoCreateTable())
145
125
  .setSchemaPath(task.getSchemaPath())
146
- .setBucket(task.getBucket())
147
126
  .setSourceFormat(task.getSourceFormat())
148
127
  .setFieldDelimiter(task.getFieldDelimiter())
149
128
  .setMaxBadrecords(task.getMaxBadrecords())
129
+ .setEncoding(task.getEncoding())
150
130
  .setJobStatusMaxPollingTime(task.getJobStatusMaxPollingTime())
151
131
  .setJobStatusPollingInterval(task.getJobStatusPollingInterval())
152
132
  .setIsSkipJobResultCheck(task.getIsSkipJobResultCheck())
153
133
  .build();
134
+ } catch (FileNotFoundException ex) {
135
+ throw new ConfigException(ex);
154
136
  } catch (IOException | GeneralSecurityException ex) {
155
- log.warn("Google Authentication was failed. Please Check your configurations.");
156
137
  throw new ConfigException(ex);
157
138
  }
158
139
  // non-retryable (non-idempotent) output:
@@ -165,19 +146,6 @@ public class BigqueryOutputPlugin
165
146
  {
166
147
  control.run(taskSource);
167
148
 
168
- try {
169
- bigQueryWriter.executeJob();
170
- // TODO refactor
171
- if (bigQueryGcsWriter.getDeleteFromBucketWhenJobEnd()) {
172
- ArrayList<HashMap<String, String>> fileList = bigQueryWriter.getFileList();
173
- for (HashMap<String, String> file : fileList) {
174
- bigQueryGcsWriter.deleteFile(file.get("remote_path"), file.get("file_name"));
175
- }
176
- }
177
- } catch (IOException | TimeoutException | BigqueryWriter.JobFailedException ex) {
178
- log.warn(ex.getMessage());
179
- throw Throwables.propagate(ex);
180
- }
181
149
  return Exec.newConfigDiff();
182
150
  }
183
151
 
@@ -196,7 +164,6 @@ public class BigqueryOutputPlugin
196
164
  final String pathPrefix = task.getPathPrefix();
197
165
  final String sequenceFormat = task.getSequenceFormat();
198
166
  final String pathSuffix = task.getFileNameExtension();
199
- final Optional<String> remotePath = task.getRemotePath();
200
167
 
201
168
  return new TransactionalFileOutput() {
202
169
  private int fileIndex = 0;
@@ -217,7 +184,6 @@ public class BigqueryOutputPlugin
217
184
  }
218
185
  filePath = pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + suffix;
219
186
  file = new File(filePath);
220
- fileName = file.getName();
221
187
 
222
188
  String parentPath = file.getParent();
223
189
  File dir = new File(parentPath);
@@ -257,18 +223,15 @@ public class BigqueryOutputPlugin
257
223
  public void finish()
258
224
  {
259
225
  closeFile();
260
- if (fileName != null) {
261
- fileSize = file.length();
226
+ if (filePath != null) {
262
227
  try {
263
- bigQueryGcsWriter.uploadFile(filePath, fileName, remotePath);
228
+ bigQueryWriter.executeLoad(filePath);
264
229
 
265
- if (task.getDeleteFromLocalWhenUploadEnd()) {
230
+ if (task.getDeleteFromLocalWhenJobEnd()) {
266
231
  log.info(String.format("Delete local file [%s]", filePath));
267
232
  file.delete();
268
233
  }
269
-
270
- bigQueryWriter.addTask(remotePath, fileName, fileSize);
271
- } catch (IOException ex) {
234
+ } catch (IOException | TimeoutException | BigqueryWriter.JobFailedException ex) {
272
235
  throw Throwables.propagate(ex);
273
236
  }
274
237
  }
@@ -290,4 +253,13 @@ public class BigqueryOutputPlugin
290
253
  }
291
254
  };
292
255
  }
256
+
257
+ // Parsing a name such as "table_%Y_%m" (which may or may not contain a strftime pattern) is difficult in plain Java, so JRuby is used instead.
258
+ public String generateTableName(String tableName)
259
+ {
260
+ ScriptingContainer jruby = new ScriptingContainer();
261
+ Object result = jruby.runScriptlet("Time.now.strftime('" + tableName + "')");
262
+
263
+ return result.toString();
264
+ }
293
265
  }
@@ -1,6 +1,11 @@
1
1
  package org.embulk.output;
2
2
 
3
+ import java.io.File;
3
4
  import java.io.IOException;
5
+ import java.io.FileNotFoundException;
6
+ import java.io.FileInputStream;
7
+ import java.io.BufferedInputStream;
8
+ import com.google.api.client.http.InputStreamContent;
4
9
  import java.util.ArrayList;
5
10
  import java.util.List;
6
11
  import java.util.Iterator;
@@ -11,14 +16,19 @@ import java.util.concurrent.TimeoutException;
11
16
  import org.apache.commons.lang3.StringUtils;
12
17
  import com.google.common.base.Optional;
13
18
  import com.google.common.collect.ImmutableSet;
19
+ import com.google.common.base.Throwables;
14
20
  import java.security.GeneralSecurityException;
15
21
 
22
+ import com.fasterxml.jackson.databind.ObjectMapper;
23
+ import com.fasterxml.jackson.core.type.TypeReference;
24
+
16
25
  import org.embulk.spi.Exec;
17
26
  import org.slf4j.Logger;
18
27
 
19
28
  import com.google.api.services.bigquery.Bigquery;
20
29
  import com.google.api.services.bigquery.BigqueryScopes;
21
30
  import com.google.api.services.bigquery.Bigquery.Datasets;
31
+ import com.google.api.services.bigquery.Bigquery.Tables;
22
32
  import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
23
33
  import com.google.api.services.bigquery.Bigquery.Jobs.GetQueryResults;
24
34
  import com.google.api.services.bigquery.model.Job;
@@ -28,11 +38,19 @@ import com.google.api.services.bigquery.model.JobStatus;
28
38
  import com.google.api.services.bigquery.model.JobStatistics;
29
39
  import com.google.api.services.bigquery.model.JobReference;
30
40
  import com.google.api.services.bigquery.model.DatasetList;
41
+ import com.google.api.services.bigquery.model.Table;
42
+ import com.google.api.services.bigquery.model.TableList;
31
43
  import com.google.api.services.bigquery.model.TableSchema;
32
44
  import com.google.api.services.bigquery.model.TableReference;
33
45
  import com.google.api.services.bigquery.model.TableFieldSchema;
34
46
  import com.google.api.services.bigquery.model.TableCell;
35
47
  import com.google.api.services.bigquery.model.TableRow;
48
+ import com.google.api.services.bigquery.model.ErrorProto;
49
+ import com.google.api.client.googleapis.json.GoogleJsonResponseException;
50
+
51
+ import com.google.api.client.googleapis.media.MediaHttpUploader;
52
+ import com.google.api.client.googleapis.media.MediaHttpUploaderProgressListener;
53
+ import com.google.api.client.googleapis.media.MediaHttpUploader.UploadState;
36
54
 
37
55
  public class BigqueryWriter
38
56
  {
@@ -43,43 +61,58 @@ public class BigqueryWriter
43
61
  private final String table;
44
62
  private final boolean autoCreateTable;
45
63
  private final Optional<String> schemaPath;
46
- private final String bucket;
64
+ private final TableSchema tableSchema;
47
65
  private final String sourceFormat;
48
66
  private final String fieldDelimiter;
49
67
  private final int maxBadrecords;
68
+ private final String encoding;
50
69
  private final long jobStatusMaxPollingTime;
51
70
  private final long jobStatusPollingInterval;
52
71
  private final boolean isSkipJobResultCheck;
53
72
  private final Bigquery bigQueryClient;
54
- private final EmbulkBigqueryTask writerTask;
55
73
 
56
- public BigqueryWriter(Builder builder) throws IOException, GeneralSecurityException
74
+ public BigqueryWriter(Builder builder) throws FileNotFoundException, IOException, GeneralSecurityException
57
75
  {
58
76
  this.project = builder.project;
59
77
  this.dataset = builder.dataset;
60
78
  this.table = builder.table;
61
79
  this.autoCreateTable = builder.autoCreateTable;
62
80
  this.schemaPath = builder.schemaPath;
63
- this.bucket = builder.bucket;
64
81
  this.sourceFormat = builder.sourceFormat.toUpperCase();
65
82
  this.fieldDelimiter = builder.fieldDelimiter;
66
83
  this.maxBadrecords = builder.maxBadrecords;
84
+ this.encoding = builder.encoding.toUpperCase();
67
85
  this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
68
86
  this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
69
87
  this.isSkipJobResultCheck = builder.isSkipJobResultCheck;
70
88
 
71
89
  BigqueryAuthentication auth = new BigqueryAuthentication(builder.serviceAccountEmail, builder.p12KeyFilePath, builder.applicationName);
72
90
  this.bigQueryClient = auth.getBigqueryClient();
73
- this.writerTask = new EmbulkBigqueryTask();
91
+
92
+ checkConfig();
93
+ if (autoCreateTable) {
94
+ this.tableSchema = createTableSchema(builder.schemaPath);
95
+ } else {
96
+ this.tableSchema = null;
97
+ }
74
98
  }
75
99
 
76
100
  private String getJobStatus(JobReference jobRef) throws JobFailedException
77
101
  {
78
102
  try {
79
103
  Job job = bigQueryClient.jobs().get(project, jobRef.getJobId()).execute();
80
- if (job.getStatus().getErrorResult() != null) {
81
- throw new JobFailedException(String.format("Job failed. job id:[%s] reason:[%s] status:[FAILED]", jobRef.getJobId(), job.getStatus().getErrorResult().getMessage()));
104
+
105
+ ErrorProto fatalError = job.getStatus().getErrorResult();
106
+ if (fatalError != null) {
107
+ throw new JobFailedException(String.format("Job failed. job id:[%s] reason:[%s][%s] status:[FAILED]", jobRef.getJobId(), fatalError.getReason(), fatalError.getMessage()));
108
+ }
109
+ List<ErrorProto> errors = job.getStatus().getErrors();
110
+ if (errors != null) {
111
+ for (ErrorProto error : errors) {
112
+ log.warn(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
113
+ }
82
114
  }
115
+
83
116
  String jobStatus = job.getStatus().getState();
84
117
  if (jobStatus.equals("DONE")) {
85
118
  JobStatistics statistics = job.getStatistics();
@@ -117,59 +150,68 @@ public class BigqueryWriter
117
150
  }
118
151
  }
119
152
 
120
- public void executeJob() throws IOException, TimeoutException, JobFailedException
121
- {
122
- // TODO: refactor
123
- ArrayList<ArrayList<HashMap<String, String>>> taskList = writerTask.createJobList();
124
- for (ArrayList<HashMap<String, String>> task : taskList) {
125
- Job job = createJob(task);
126
- // TODO: multi-threading
127
- new EmbulkBigqueryJob(job).call();
128
- }
129
- }
130
-
131
- private Job createJob(ArrayList<HashMap<String, String>> task)
153
+ public void executeLoad(String localFilePath) throws GoogleJsonResponseException, IOException, TimeoutException, JobFailedException
132
154
  {
133
155
  log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));
134
156
 
135
157
  Job job = new Job();
158
+ JobReference jobRef = null;
136
159
  JobConfiguration jobConfig = new JobConfiguration();
137
160
  JobConfigurationLoad loadConfig = new JobConfigurationLoad();
138
161
  jobConfig.setLoad(loadConfig);
139
162
  job.setConfiguration(jobConfig);
140
163
 
141
164
  loadConfig.setAllowQuotedNewlines(false);
165
+ loadConfig.setEncoding(encoding);
166
+ loadConfig.setMaxBadRecords(maxBadrecords);
142
167
  if (sourceFormat.equals("NEWLINE_DELIMITED_JSON")) {
143
168
  loadConfig.setSourceFormat("NEWLINE_DELIMITED_JSON");
144
169
  } else {
145
170
  loadConfig.setFieldDelimiter(fieldDelimiter);
146
171
  }
172
+ loadConfig.setWriteDisposition("WRITE_APPEND");
147
173
  if (autoCreateTable) {
148
- loadConfig.setSchema(getTableSchema());
149
- loadConfig.setWriteDisposition("WRITE_EMPTY");
174
+ loadConfig.setSchema(tableSchema);
150
175
  loadConfig.setCreateDisposition("CREATE_IF_NEEDED");
151
- log.info(String.format("table:[%s] will be create.", table));
176
+ log.info(String.format("table:[%s] will be create if not exists", table));
152
177
  } else {
153
- loadConfig.setWriteDisposition("WRITE_APPEND");
154
178
  loadConfig.setCreateDisposition("CREATE_NEVER");
155
179
  }
156
- loadConfig.setMaxBadRecords(maxBadrecords);
157
180
 
158
- List<String> sources = new ArrayList<String>();
159
- for (HashMap<String, String> file : task) {
160
- String sourceFile;
161
- String remotePath = getRemotePath(file.get("remote_path"), file.get("file_name"));
162
- sourceFile = "gs://" + remotePath;
163
- log.info(String.format("Add source file to job [%s]", sourceFile));
164
- sources.add(sourceFile);
165
- }
166
- loadConfig.setSourceUris(sources);
167
- loadConfig.setDestinationTable(getTableReference());
181
+ loadConfig.setDestinationTable(createTableReference());
182
+
183
+ File file = new File(localFilePath);
184
+ InputStreamContent mediaContent = new InputStreamContent("application/octet-stream",
185
+ new BufferedInputStream(
186
+ new FileInputStream(file)));
187
+ mediaContent.setLength(file.length());
168
188
 
169
- return job;
189
+ Insert insert = bigQueryClient.jobs().insert(project, job, mediaContent);
190
+ insert.setProjectId(project);
191
+ insert.setDisableGZipContent(true);
192
+
193
+ // @see https://code.google.com/p/google-api-java-client/wiki/MediaUpload
194
+ UploadProgressListener listner = new UploadProgressListener();
195
+ listner.setFileName(localFilePath);
196
+ insert.getMediaHttpUploader()
197
+ .setProgressListener(listner)
198
+ .setDirectUploadEnabled(false);
199
+
200
+ try {
201
+ jobRef = insert.execute().getJobReference();
202
+ } catch (Exception ex) {
203
+ log.warn("Job execution was failed. Please check your settings or data... like data matches schema");
204
+ throw Throwables.propagate(ex);
205
+ }
206
+ log.info(String.format("Job executed. job id:[%s] file:[%s]", jobRef.getJobId(), localFilePath));
207
+ if (isSkipJobResultCheck) {
208
+ log.info(String.format("Skip job status check. job id:[%s]", jobRef.getJobId()));
209
+ } else {
210
+ getJobStatusUntilDone(jobRef);
211
+ }
170
212
  }
171
213
 
172
- private TableReference getTableReference()
214
+ private TableReference createTableReference()
173
215
  {
174
216
  return new TableReference()
175
217
  .setProjectId(project)
@@ -177,135 +219,78 @@ public class BigqueryWriter
177
219
  .setTableId(table);
178
220
  }
179
221
 
180
- private TableSchema getTableSchema()
222
+ private TableSchema createTableSchema(Optional<String> schemaPath) throws FileNotFoundException, IOException
181
223
  {
182
- TableSchema tableSchema = new TableSchema();
183
- List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
184
- TableFieldSchema tableField;
185
- // TODO import from json file
186
- /*
187
- for () {
188
- tableField = new TableFieldSchema()
189
- .setName(name)
190
- .setType(type);
191
- fields.add(tableField);
224
+ String path = schemaPath.orNull();
225
+ File file = new File(path);
226
+ FileInputStream stream = null;
227
+ try {
228
+ stream = new FileInputStream(file);
229
+ ObjectMapper mapper = new ObjectMapper();
230
+ List<TableFieldSchema> fields = mapper.readValue(stream, new TypeReference<List<TableFieldSchema>>() {});
231
+ TableSchema tableSchema = new TableSchema().setFields(fields);
232
+ return tableSchema;
233
+ } finally {
234
+ if (stream != null) {
235
+ stream.close();
236
+ }
192
237
  }
193
- */
194
-
195
- tableSchema.setFields(fields);
196
- return tableSchema;
197
238
  }
198
239
 
199
- private String getRemotePath(String remotePath, String fileName)
240
+ public boolean isExistTable(String tableName) throws IOException
200
241
  {
201
- String[] pathList = StringUtils.split(remotePath, '/');
202
- String path;
203
- if (remotePath.isEmpty()) {
204
- path = bucket + "/" + fileName;
205
- } else {
206
- path = bucket + "/" + StringUtils.join(pathList) + "/" + fileName;
242
+ Tables tableRequest = bigQueryClient.tables();
243
+ try {
244
+ Table tableData = tableRequest.get(project, dataset, tableName).execute();
245
+ } catch (GoogleJsonResponseException ex) {
246
+ return false;
207
247
  }
208
- return path;
209
- }
210
-
211
- public void addTask(Optional<String> remotePath, String fileName, long fileSize)
212
- {
213
- writerTask.addTaskFile(remotePath, fileName, fileSize);
214
- }
215
-
216
- public ArrayList<HashMap<String, String>> getFileList()
217
- {
218
- return writerTask.getFileList();
248
+ return true;
219
249
  }
220
250
 
221
- private class EmbulkBigqueryJob implements Callable<Void>
251
+ public void checkConfig() throws FileNotFoundException, IOException
222
252
  {
223
- private final Job job;
224
-
225
- public EmbulkBigqueryJob(Job job)
226
- {
227
- this.job = job;
228
- }
229
-
230
- public Void call() throws IOException, TimeoutException, JobFailedException
231
- {
232
- Insert insert = bigQueryClient.jobs().insert(project, job);
233
- insert.setProjectId(project);
234
- JobReference jobRef = insert.execute().getJobReference();
235
- log.info(String.format("Job executed. job id:[%s]", jobRef.getJobId()));
236
- if (isSkipJobResultCheck) {
237
- log.info(String.format("Skip job status check. job id:[%s]", jobRef.getJobId()));
253
+ if (autoCreateTable) {
254
+ if (!schemaPath.isPresent()) {
255
+ throw new IOException("schema_path is empty");
238
256
  } else {
239
- getJobStatusUntilDone(jobRef);
257
+ File file = new File(schemaPath.orNull());
258
+ if (!file.exists()) {
259
+ throw new FileNotFoundException("Can not load schema file.");
260
+ }
261
+ }
262
+ } else {
263
+ if (!isExistTable(table)) {
264
+ throw new IOException(String.format("table [%s] is not exists", table));
240
265
  }
241
- return null;
242
266
  }
243
267
  }
244
268
 
245
- private class EmbulkBigqueryTask
269
+ private class UploadProgressListener implements MediaHttpUploaderProgressListener
246
270
  {
247
- // https://cloud.google.com/bigquery/loading-data-into-bigquery#quota
248
- private final long MAX_SIZE_PER_LOAD_JOB = 1000 * 1024 * 1024 * 1024L; // 1TB
249
- private final int MAX_NUMBER_OF_FILES_PER_LOAD_JOB = 10000;
271
+ private String fileName;
250
272
 
251
- private final ArrayList<HashMap<String, String>> taskList = new ArrayList<HashMap<String, String>>();
252
- private final ArrayList<ArrayList<HashMap<String, String>>> jobList = new ArrayList<ArrayList<HashMap<String, String>>>();
253
-
254
- public void addTaskFile(Optional<String> remotePath, String fileName, long fileSize)
273
+ @Override
274
+ public void progressChanged(MediaHttpUploader uploader) throws IOException
255
275
  {
256
- HashMap<String, String> task = new HashMap<String, String>();
257
- if (remotePath.isPresent()) {
258
- task.put("remote_path", remotePath.get());
259
- } else {
260
- task.put("remote_path", "");
261
- }
262
- task.put("file_name", fileName);
263
- task.put("file_size", String.valueOf(fileSize));
264
- taskList.add(task);
265
- }
266
-
267
- public ArrayList<ArrayList<HashMap<String, String>>> createJobList()
268
- {
269
- long currentBundleSize = 0;
270
- int currentFileCount = 0;
271
- ArrayList<HashMap<String, String>> job = new ArrayList<HashMap<String, String>>();
272
- for (HashMap<String, String> task : taskList) {
273
- boolean isNeedNextJobList = false;
274
- long fileSize = Long.valueOf(task.get("file_size")).longValue();
275
-
276
- if (currentBundleSize + fileSize > MAX_SIZE_PER_LOAD_JOB) {
277
- isNeedNextJobList = true;
278
- }
279
-
280
- if (currentFileCount >= MAX_NUMBER_OF_FILES_PER_LOAD_JOB) {
281
- isNeedNextJobList = true;
282
- }
283
-
284
- if (isNeedNextJobList) {
285
- jobList.add(job);
286
- job = new ArrayList<HashMap<String, String>>();
287
- job.add(task);
288
- currentBundleSize = 0;
289
- } else {
290
- job.add(task);
291
- }
292
- currentBundleSize += fileSize;
293
- currentFileCount++;
294
-
295
- log.debug(String.format("currentBundleSize:%s currentFileCount:%s", currentBundleSize, currentFileCount));
296
- log.debug(String.format("fileSize:%s, MAX_SIZE_PER_LOAD_JOB:%s MAX_NUMBER_OF_FILES_PER_LOAD_JOB:%s",
297
- fileSize, MAX_SIZE_PER_LOAD_JOB, MAX_NUMBER_OF_FILES_PER_LOAD_JOB));
298
-
299
- }
300
- if (job.size() > 0) {
301
- jobList.add(job);
276
+ switch (uploader.getUploadState()) {
277
+ case INITIATION_STARTED:
278
+ log.info(String.format("Upload start [%s]", fileName));
279
+ break;
280
+ case INITIATION_COMPLETE:
281
+ //log.info(String.format("Upload initiation completed file [%s]", fileName));
282
+ break;
283
+ case MEDIA_IN_PROGRESS:
284
+ log.debug(String.format("Uploading [%s] progress %3.0f", fileName, uploader.getProgress() * 100) + "%");
285
+ break;
286
+ case MEDIA_COMPLETE:
287
+ log.info(String.format("Upload completed [%s]", fileName));
302
288
  }
303
- return jobList;
304
289
  }
305
290
 
306
- public ArrayList<HashMap<String, String>> getFileList()
291
+ public void setFileName(String fileName)
307
292
  {
308
- return taskList;
293
+ this.fileName = fileName;
309
294
  }
310
295
  }
311
296
 
@@ -319,10 +304,10 @@ public class BigqueryWriter
319
304
  private String table;
320
305
  private boolean autoCreateTable;
321
306
  private Optional<String> schemaPath;
322
- private String bucket;
323
307
  private String sourceFormat;
324
308
  private String fieldDelimiter;
325
309
  private int maxBadrecords;
310
+ private String encoding;
326
311
  private int jobStatusMaxPollingTime;
327
312
  private int jobStatusPollingInterval;
328
313
  private boolean isSkipJobResultCheck;
@@ -375,12 +360,6 @@ public class BigqueryWriter
375
360
  return this;
376
361
  }
377
362
 
378
- public Builder setBucket(String bucket)
379
- {
380
- this.bucket = bucket;
381
- return this;
382
- }
383
-
384
363
  public Builder setSourceFormat(String sourceFormat)
385
364
  {
386
365
  this.sourceFormat = sourceFormat;
@@ -399,6 +378,12 @@ public class BigqueryWriter
399
378
  return this;
400
379
  }
401
380
 
381
+ public Builder setEncoding(String encoding)
382
+ {
383
+ this.encoding = encoding;
384
+ return this;
385
+ }
386
+
402
387
  public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
403
388
  {
404
389
  this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-17 00:00:00.000000000 Z
11
+ date: 2015-04-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -56,19 +56,16 @@ files:
56
56
  - lib/embulk/output/bigquery.rb
57
57
  - settings.gradle
58
58
  - src/main/java/org/embulk/output/BigqueryAuthentication.java
59
- - src/main/java/org/embulk/output/BigqueryGcsWriter.java
60
59
  - src/main/java/org/embulk/output/BigqueryOutputPlugin.java
61
60
  - src/main/java/org/embulk/output/BigqueryWriter.java
62
61
  - src/test/java/org/embulk/output/TestBigqueryAuthentication.java
63
- - src/test/java/org/embulk/output/TestBigqueryGcsWriter.java
64
62
  - src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java
65
63
  - src/test/java/org/embulk/output/TestBigqueryWriter.java
66
64
  - classpath/commons-codec-1.3.jar
67
65
  - classpath/commons-logging-1.1.1.jar
68
- - classpath/embulk-output-bigquery-0.1.1.jar
66
+ - classpath/embulk-output-bigquery-0.1.2.jar
69
67
  - classpath/google-api-client-1.19.1.jar
70
68
  - classpath/google-api-services-bigquery-v2-rev193-1.19.1.jar
71
- - classpath/google-api-services-storage-v1-rev27-1.19.1.jar
72
69
  - classpath/google-http-client-1.19.0.jar
73
70
  - classpath/google-http-client-jackson2-1.19.0.jar
74
71
  - classpath/google-oauth-client-1.19.0.jar
@@ -76,7 +73,6 @@ files:
76
73
  - classpath/httpclient-4.0.1.jar
77
74
  - classpath/httpcore-4.0.1.jar
78
75
  - classpath/jsr305-1.3.9.jar
79
- - classpath/mime-util-2.1.3.jar
80
76
  homepage: https://github.com/sakama/embulk-output-bigquery
81
77
  licenses:
82
78
  - Apache-2.0
@@ -1,201 +0,0 @@
1
- package org.embulk.output;
2
-
3
- import java.io.File;
4
- import java.io.FileNotFoundException;
5
- import java.io.FileInputStream;
6
- import java.io.IOException;
7
- import java.util.ArrayList;
8
- import java.util.List;
9
- import java.util.Collection;
10
- import java.util.Iterator;
11
- import java.util.IllegalFormatException;
12
- import java.nio.charset.Charset;
13
- import java.nio.charset.StandardCharsets;
14
- import com.google.common.base.Optional;
15
- import com.google.common.collect.ImmutableList;
16
- //import eu.medsea.mimeutil.MimeType;
17
- //import eu.medsea.mimeutil.MimeUtil;
18
- //import eu.medsea.mimeutil.detector.MimeDetector;
19
- import org.apache.commons.lang3.StringUtils;
20
- import org.apache.commons.codec.binary.Base64;
21
- import java.security.GeneralSecurityException;
22
-
23
- import org.embulk.spi.Exec;
24
- import org.slf4j.Logger;
25
-
26
- import com.google.api.services.storage.Storage;
27
- import com.google.api.services.storage.StorageScopes;
28
- import com.google.api.services.storage.model.Bucket;
29
- import com.google.api.services.storage.model.Objects;
30
- import com.google.api.services.storage.model.StorageObject;
31
-
32
- import com.google.api.client.http.InputStreamContent;
33
-
34
- public class BigqueryGcsWriter
35
- {
36
-
37
- private final Logger log = Exec.getLogger(BigqueryGcsWriter.class);
38
- private final String bucket;
39
- private final String sourceFormat;
40
- private final boolean isFileCompressed;
41
- private final boolean deleteFromBucketWhenJobEnd;
42
- private Storage storageClient;
43
-
44
- public BigqueryGcsWriter(Builder builder) throws IOException, GeneralSecurityException
45
- {
46
- this.bucket = builder.bucket;
47
- this.sourceFormat = builder.sourceFormat.toUpperCase();
48
- this.isFileCompressed = builder.isFileCompressed;
49
- this.deleteFromBucketWhenJobEnd = builder.deleteFromBucketWhenJobEnd;
50
-
51
- BigqueryAuthentication auth = new BigqueryAuthentication(builder.serviceAccountEmail, builder.p12KeyFilePath, builder.applicationName);
52
- this.storageClient = auth.getGcsClient();
53
- }
54
-
55
- public void uploadFile(String localFilePath, String fileName, Optional<String> remotePath) throws IOException
56
- {
57
- FileInputStream stream = null;
58
-
59
- try {
60
- String path;
61
- if (remotePath.isPresent()) {
62
- path = remotePath.get();
63
- } else {
64
- path = "";
65
- }
66
- String gcsPath = getRemotePath(path, fileName);
67
- StorageObject objectMetadata = new StorageObject().setName(gcsPath);
68
- log.info(String.format("Uploading file [%s] to [gs://%s/%s]", localFilePath, bucket, gcsPath));
69
-
70
- File file = new File(localFilePath);
71
- stream = new FileInputStream(file);
72
- InputStreamContent content = new InputStreamContent(getContentType(), stream);
73
- Storage.Objects.Insert insertObject = storageClient.objects().insert(bucket, objectMetadata, content);
74
- insertObject.setDisableGZipContent(true);
75
-
76
- StorageObject response = insertObject.execute();
77
- log.info(String.format("Upload completed [%s] to [gs://%s/%s]", localFilePath, bucket, gcsPath));
78
- } finally {
79
- stream.close();
80
- }
81
- }
82
-
83
- private String getRemotePath(String remotePath, String fileName)
84
- {
85
- if (remotePath.isEmpty()) {
86
- return fileName;
87
- }
88
- String[] pathList = StringUtils.split(remotePath, '/');
89
- String path = StringUtils.join(pathList) + "/";
90
- if (!path.endsWith("/")) {
91
- path = path + "/";
92
- }
93
- return path + fileName;
94
- }
95
-
96
- public void deleteFile(String remotePath, String fileName) throws IOException
97
- {
98
- String path = getRemotePath(remotePath, fileName);
99
- storageClient.objects().delete(bucket, path).execute();
100
- log.info(String.format("Delete remote file [gs://%s/%s]", bucket, path));
101
- }
102
-
103
- public boolean getDeleteFromBucketWhenJobEnd()
104
- {
105
- return this.deleteFromBucketWhenJobEnd;
106
- }
107
-
108
- private String getContentType()
109
- {
110
- if (isFileCompressed) {
111
- return "application/x-gzip";
112
- } else {
113
- if (sourceFormat.equals("NEWLINE_DELIMITED_JSON)")) {
114
- return "application/json";
115
- } else {
116
- return "text/csv";
117
- }
118
- }
119
- }
120
-
121
- /*
122
- private void registerMimeDetector()
123
- {
124
- String mimeDetector = "eu.medsea.mimeutil.detector.MagicMimeMimeDetector";
125
- MimeDetector registeredMimeDetector = MimeUtil.getMimeDetector(mimeDetector);
126
- MimeUtil.registerMimeDetector(mimeDetector);
127
- }
128
-
129
- public String detectMimeType(File file)
130
- {
131
- try {
132
- Collection<?> mimeTypes = MimeUtil.getMimeTypes(file);
133
- if (!mimeTypes.isEmpty()) {
134
- Iterator<?> iterator = mimeTypes.iterator();
135
- MimeType mimeType = (MimeType) iterator.next();
136
- return mimeType.getMediaType() + "/" + mimeType.getSubType();
137
- }
138
- } catch (Exception ex) {
139
- }
140
- return "application/octet-stream";
141
- }
142
- */
143
-
144
- public static class Builder
145
- {
146
- private final String serviceAccountEmail;
147
- private String p12KeyFilePath;
148
- private String applicationName;
149
- private String bucket;
150
- private String sourceFormat;
151
- private boolean isFileCompressed;
152
- private boolean deleteFromBucketWhenJobEnd;
153
- private boolean enableMd5hashCheck;
154
-
155
- public Builder(String serviceAccountEmail)
156
- {
157
- this.serviceAccountEmail = serviceAccountEmail;
158
- }
159
-
160
- public Builder setP12KeyFilePath(String p12KeyFilePath)
161
- {
162
- this.p12KeyFilePath = p12KeyFilePath;
163
- return this;
164
- }
165
-
166
- public Builder setApplicationName(String applicationName)
167
- {
168
- this.applicationName = applicationName;
169
- return this;
170
- }
171
-
172
- public Builder setBucket(String bucket)
173
- {
174
- this.bucket = bucket;
175
- return this;
176
- }
177
-
178
- public Builder setSourceFormat(String sourceFormat)
179
- {
180
- this.sourceFormat = sourceFormat;
181
- return this;
182
- }
183
-
184
- public Builder setIsFileCompressed(boolean isFileCompressed)
185
- {
186
- this.isFileCompressed = isFileCompressed;
187
- return this;
188
- }
189
-
190
- public Builder setDeleteFromBucketWhenJobEnd(boolean deleteFromBucketWhenJobEnd)
191
- {
192
- this.deleteFromBucketWhenJobEnd = deleteFromBucketWhenJobEnd;
193
- return this;
194
- }
195
-
196
- public BigqueryGcsWriter build() throws IOException, GeneralSecurityException
197
- {
198
- return new BigqueryGcsWriter(this);
199
- }
200
- }
201
- }
@@ -1,5 +0,0 @@
1
- package org.embulk.output;
2
-
3
- public class TestBigqueryGcsWriter
4
- {
5
- }