embulk-output-bigquery 0.1.1 → 0.1.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 76284d5061944ac0c9f2f315cb46bc681b81dce0
-  data.tar.gz: f75c285ff585d5177273fd70897e320c6375c6f3
+  metadata.gz: 46c61dd1c73ff99c3c69bd217ca772f07b2e1127
+  data.tar.gz: ba184360972884260c1fe90264af7d5386791804
 SHA512:
-  metadata.gz: 7d4b83984a02db3af2dfb769f136d650c0458b5c3e9f0ab64823a558a8ca61885845a466133cc97c6092be60f05258d1e1d24ffd61c0fa78eb7018027d63a02c
-  data.tar.gz: 7e4296ff13ebd7c3c9ff68e6a1701a4e26214b47a08714fe4ae31889d919433ee3f42341ad65651a44e8a85d3cf43f8bc8a83238c0d67d44468e0f39b48940d5
+  metadata.gz: aa693e59cb4b45c2d43f07479f3d61e63242185be9964d4f00b83a4a784a0443ae270a63760f3f2f188e74deb77cbb94a89a18db49d2c5cd4621f18b73363ab3
+  data.tar.gz: 7c0ea783220de28befd7c565ff83ec5ff58f13af0db16b3d341a12c3e415adeacba375e5688a42fcbb26d0402a48071622ed5b161fa52fd08b1f56444faf66e1
data/README.md CHANGED
@@ -1,17 +1,17 @@
 
 # embulk-output-bigquery
 
-[Embulk](https://github.com/embulk/embulk/) output plugin to load/insert data into [Google BigQuery](https://cloud.google.com/bigquery/) via [GCS(Google Cloud Storage)](https://cloud.google.com/storage/)
+[Embulk](https://github.com/embulk/embulk/) output plugin to load/insert data into [Google BigQuery](https://cloud.google.com/bigquery/)
 
 ## Overview
 
-load data into Google BigQuery as batch jobs via GCS for big amount of data
+load data into Google BigQuery as batch jobs for big amounts of data
 https://developers.google.com/bigquery/loading-data-into-bigquery
 
 * **Plugin type**: output
 * **Resume supported**: no
 * **Cleanup supported**: no
-* **Dynamic table creating**: todo
+* **Dynamic table creating**: yes
 
 ### NOT IMPLEMENTED
 * insert data over streaming inserts
@@ -30,32 +30,19 @@ OAuth flow for installed applications.
 - **sequence_format**: (string, optional, default is %03d.%02d)
 - **file_ext**: (string, required)
 - **source_format**: file type (NEWLINE_DELIMITED_JSON or CSV) (string, required, default is CSV)
-- **is_file_compressed**: upload file is gzip compressed or not. (boolean, optional, default is 1)
-- **bucket**: Google Cloud Storage output bucket name (string, required)
-- **remote_path**: folder name in GCS bucket (string, optional)
 - **project**: project_id (string, required)
 - **dataset**: dataset (string, required)
 - **table**: table name (string, required)
+- **auto_create_table**: (boolean, optional, default is 0)
+- **schema_path**: (string, optional)
 - **application_name**: application name anything you like (string, optional)
-- **delete_from_local_when_upload_end**: (boolean, optional, default is 0)
-- **delete_from_bucket_when_job_end**: (boolean, optional, default is 0)
+- **delete_from_local_when_job_end**: (boolean, optional, default is 0)
 - **job_status_max_polling_time**: max job status polling time. (int, optional, default is 3600 sec)
 - **job_status_polling_interval**: job status polling interval. (int, optional, default is 10 sec)
 - **is_skip_job_result_check**: (boolean, optional, default is 0)
-
-## Support for Google BigQuery Quota policy
-embulk-output-bigquery support following [Google BigQuery Quota policy](https://cloud.google.com/bigquery/loading-data-into-bigquery#quota).
-
-* Supported
-  * Maximum size per load job: 1TB across all input files
-  * Maximum number of files per load job: 10,000
-  * embulk-output-bigquery divides a file into more than one job, like below.
-    * job1: file1(1GB) file2(1GB)...file10(1GB)
-    * job2: file11(1GB) file12(1GB)
-
-* Not Supported
-  * Daily limit: 1,000 load jobs per table per day (including failures)
-  * 10,000 load jobs per project per day (including failures)
+- **field_delimiter**: (string, optional, default is ",")
+- **max_bad_records**: (int, optional, default is 0)
+- **encoding**: (UTF-8 or ISO-8859-1) (string, optional, default is "UTF-8")
 
 ## Example
 
@@ -67,10 +54,7 @@ out:
   path_prefix: /path/to/output
   file_ext: csv.gz
   source_format: CSV
-  is_file_compressed: 1
   project: your-project-000
-  bucket: output_bucket_name
-  remote_path: folder_name
   dataset: your_dataset_name
   table: your_table_name
   formatter:
@@ -80,6 +64,21 @@ out:
   - {type: gzip}
 ```
 
+## Dynamic table creating
+
+When `auto_create_table` is set to true, the plugin tries to create the table using the BigQuery API.
+
+To describe the schema of the target table, set `schema_path` to a JSON schema file.
+
+The `table` option accepts Ruby's [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
+format to construct the table name.
+
+```
+auto_create_table: true
+table: table_%Y_%m
+schema_path: /path/to/schema.json
+```
+
 ## Build
 
 ```
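A note on the `schema_path` file referenced in the new "Dynamic table creating" section: `BigqueryWriter.createTableSchema` reads it with Jackson into a `List<TableFieldSchema>`, so the file is expected to be a JSON array of BigQuery field definitions. A minimal sketch, with illustrative column names and types that are not part of the plugin itself:

```json
[
  {"name": "id", "type": "INTEGER"},
  {"name": "name", "type": "STRING"},
  {"name": "created_at", "type": "TIMESTAMP"}
]
```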
data/build.gradle CHANGED
@@ -15,16 +15,14 @@ configurations {
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
 
-version = "0.1.1"
+version = "0.1.2"
 
 dependencies {
     compile "org.embulk:embulk-core:0.5.1"
     provided "org.embulk:embulk-core:0.5.1"
 
     compile "com.google.http-client:google-http-client-jackson2:1.19.0"
-    compile ("com.google.apis:google-api-services-storage:v1-rev27-1.19.1") {exclude module: "guava-jdk5"}
     compile "com.google.apis:google-api-services-bigquery:v2-rev193-1.19.1"
-    compile "eu.medsea.mimeutil:mime-util:2.1.3"
 
     testCompile "junit:junit:4.+"
 }
data/src/main/java/org/embulk/output/BigqueryAuthentication.java CHANGED
@@ -23,8 +23,6 @@ import com.google.api.client.http.HttpTransport;
 import com.google.api.client.http.InputStreamContent;
 import com.google.api.client.json.JsonFactory;
 import com.google.api.client.json.jackson2.JacksonFactory;
-import com.google.api.services.storage.Storage;
-import com.google.api.services.storage.StorageScopes;
 import com.google.api.services.bigquery.Bigquery;
 import com.google.api.services.bigquery.BigqueryScopes;
 import com.google.api.services.bigquery.model.ProjectList;
@@ -64,7 +62,6 @@ public class BigqueryAuthentication
                 .setServiceAccountId(serviceAccountEmail)
                 .setServiceAccountScopes(
                     ImmutableList.of(
-                        BigqueryScopes.DEVSTORAGE_READ_WRITE,
                         BigqueryScopes.BIGQUERY
                     )
                 )
@@ -87,13 +84,4 @@ public class BigqueryAuthentication
 
         return client;
     }
-
-    public Storage getGcsClient() throws IOException
-    {
-        Storage client = new Storage.Builder(httpTransport, jsonFactory, credentials)
-            .setApplicationName(applicationName)
-            .build();
-
-        return client;
-    }
 }
data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java CHANGED
@@ -13,6 +13,7 @@ import java.util.concurrent.TimeoutException;
 import com.google.common.base.Optional;
 import com.google.common.base.Throwables;
 import java.security.GeneralSecurityException;
+import org.jruby.embed.ScriptingContainer;
 
 import org.embulk.config.Config;
 import org.embulk.config.ConfigException;
@@ -59,10 +60,6 @@ public class BigqueryOutputPlugin
         @ConfigDefault("\"CSV\"")
         public String getSourceFormat();
 
-        @Config("is_file_compressed")
-        @ConfigDefault("true")
-        public boolean getIsFileCompressed();
-
         @Config("field_delimiter")
         @ConfigDefault("\",\"")
         public String getFieldDelimiter();
@@ -71,20 +68,13 @@ public class BigqueryOutputPlugin
         @ConfigDefault("0")
         public int getMaxBadrecords();
 
-        @Config("delete_from_local_when_upload_end")
-        @ConfigDefault("false")
-        public boolean getDeleteFromLocalWhenUploadEnd();
+        @Config("encoding")
+        @ConfigDefault("\"UTF-8\"")
+        public String getEncoding();
 
-        @Config("delete_from_bucket_when_job_end")
+        @Config("delete_from_local_when_job_end")
         @ConfigDefault("false")
-        public boolean getDeleteFromBucketWhenJobEnd();
-
-        @Config("bucket")
-        public String getBucket();
-
-        @Config("remote_path")
-        @ConfigDefault("null")
-        public Optional<String> getRemotePath();
+        public boolean getDeleteFromLocalWhenJobEnd();
 
         @Config("project")
         public String getProject();
@@ -117,7 +107,6 @@ public class BigqueryOutputPlugin
     }
 
     private final Logger log = Exec.getLogger(BigqueryOutputPlugin.class);
-    private static BigqueryGcsWriter bigQueryGcsWriter;
    private static BigqueryWriter bigQueryWriter;
 
     public ConfigDiff transaction(ConfigSource config, int taskCount,
@@ -126,33 +115,25 @@ public class BigqueryOutputPlugin
         final PluginTask task = config.loadConfig(PluginTask.class);
 
         try {
-            bigQueryGcsWriter = new BigqueryGcsWriter.Builder(task.getServiceAccountEmail())
-                    .setP12KeyFilePath(task.getP12KeyfilePath())
-                    .setApplicationName(task.getApplicationName())
-                    .setBucket(task.getBucket())
-                    .setSourceFormat(task.getSourceFormat())
-                    .setIsFileCompressed(task.getIsFileCompressed())
-                    .setDeleteFromBucketWhenJobEnd(task.getDeleteFromBucketWhenJobEnd())
-                    .build();
-
             bigQueryWriter = new BigqueryWriter.Builder(task.getServiceAccountEmail())
                     .setP12KeyFilePath(task.getP12KeyfilePath())
                     .setApplicationName(task.getApplicationName())
                     .setProject(task.getProject())
                     .setDataset(task.getDataset())
-                    .setTable(task.getTable())
+                    .setTable(generateTableName(task.getTable()))
                     .setAutoCreateTable(task.getAutoCreateTable())
                     .setSchemaPath(task.getSchemaPath())
-                    .setBucket(task.getBucket())
                     .setSourceFormat(task.getSourceFormat())
                     .setFieldDelimiter(task.getFieldDelimiter())
                     .setMaxBadrecords(task.getMaxBadrecords())
+                    .setEncoding(task.getEncoding())
                     .setJobStatusMaxPollingTime(task.getJobStatusMaxPollingTime())
                     .setJobStatusPollingInterval(task.getJobStatusPollingInterval())
                     .setIsSkipJobResultCheck(task.getIsSkipJobResultCheck())
                     .build();
+        } catch (FileNotFoundException ex) {
+            throw new ConfigException(ex);
         } catch (IOException | GeneralSecurityException ex) {
-            log.warn("Google Authentication was failed. Please Check your configurations.");
             throw new ConfigException(ex);
         }
         // non-retryable (non-idempotent) output:
@@ -165,19 +146,6 @@ public class BigqueryOutputPlugin
     {
         control.run(taskSource);
 
-        try {
-            bigQueryWriter.executeJob();
-            // TODO refactor
-            if (bigQueryGcsWriter.getDeleteFromBucketWhenJobEnd()) {
-                ArrayList<HashMap<String, String>> fileList = bigQueryWriter.getFileList();
-                for (HashMap<String, String> file : fileList) {
-                    bigQueryGcsWriter.deleteFile(file.get("remote_path"), file.get("file_name"));
-                }
-            }
-        } catch (IOException | TimeoutException | BigqueryWriter.JobFailedException ex) {
-            log.warn(ex.getMessage());
-            throw Throwables.propagate(ex);
-        }
         return Exec.newConfigDiff();
     }
 
@@ -196,7 +164,6 @@ public class BigqueryOutputPlugin
         final String pathPrefix = task.getPathPrefix();
         final String sequenceFormat = task.getSequenceFormat();
         final String pathSuffix = task.getFileNameExtension();
-        final Optional<String> remotePath = task.getRemotePath();
 
         return new TransactionalFileOutput() {
             private int fileIndex = 0;
@@ -217,7 +184,6 @@ public class BigqueryOutputPlugin
                 }
                 filePath = pathPrefix + String.format(sequenceFormat, taskIndex, fileIndex) + suffix;
                 file = new File(filePath);
-                fileName = file.getName();
 
                 String parentPath = file.getParent();
                 File dir = new File(parentPath);
@@ -257,18 +223,15 @@ public class BigqueryOutputPlugin
             public void finish()
             {
                 closeFile();
-                if (fileName != null) {
-                    fileSize = file.length();
+                if (filePath != null) {
                     try {
-                        bigQueryGcsWriter.uploadFile(filePath, fileName, remotePath);
+                        bigQueryWriter.executeLoad(filePath);
 
-                        if (task.getDeleteFromLocalWhenUploadEnd()) {
+                        if (task.getDeleteFromLocalWhenJobEnd()) {
                             log.info(String.format("Delete local file [%s]", filePath));
                             file.delete();
                         }
-
-                        bigQueryWriter.addTask(remotePath, fileName, fileSize);
-                    } catch (IOException ex) {
+                    } catch (IOException | TimeoutException | BigqueryWriter.JobFailedException ex) {
                         throw Throwables.propagate(ex);
                     }
                 }
@@ -290,4 +253,13 @@ public class BigqueryOutputPlugin
             }
         };
     }
+
+    // Parse like "table_%Y_%m"(include pattern or not) format using Java is difficult. So use jRuby.
+    public String generateTableName(String tableName)
+    {
+        ScriptingContainer jruby = new ScriptingContainer();
+        Object result = jruby.runScriptlet("Time.now.strftime('" + tableName + "')");
+
+        return result.toString();
+    }
 }
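As a worked example of the `generateTableName` helper added above: with `table: table_%Y_%m`, the JRuby scriptlet evaluates `Time.now.strftime('table_%Y_%m')`, so a run in April 2015 would load into a table named `table_2015_04`, while a table name containing no `%` patterns passes through unchanged.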
data/src/main/java/org/embulk/output/BigqueryWriter.java CHANGED
@@ -1,6 +1,11 @@
 package org.embulk.output;
 
+import java.io.File;
 import java.io.IOException;
+import java.io.FileNotFoundException;
+import java.io.FileInputStream;
+import java.io.BufferedInputStream;
+import com.google.api.client.http.InputStreamContent;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Iterator;
@@ -11,14 +16,19 @@ import java.util.concurrent.TimeoutException;
 import org.apache.commons.lang3.StringUtils;
 import com.google.common.base.Optional;
 import com.google.common.collect.ImmutableSet;
+import com.google.common.base.Throwables;
 import java.security.GeneralSecurityException;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.core.type.TypeReference;
+
 import org.embulk.spi.Exec;
 import org.slf4j.Logger;
 
 import com.google.api.services.bigquery.Bigquery;
 import com.google.api.services.bigquery.BigqueryScopes;
 import com.google.api.services.bigquery.Bigquery.Datasets;
+import com.google.api.services.bigquery.Bigquery.Tables;
 import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
 import com.google.api.services.bigquery.Bigquery.Jobs.GetQueryResults;
 import com.google.api.services.bigquery.model.Job;
@@ -28,11 +38,19 @@ import com.google.api.services.bigquery.model.JobStatus;
 import com.google.api.services.bigquery.model.JobStatistics;
 import com.google.api.services.bigquery.model.JobReference;
 import com.google.api.services.bigquery.model.DatasetList;
+import com.google.api.services.bigquery.model.Table;
+import com.google.api.services.bigquery.model.TableList;
 import com.google.api.services.bigquery.model.TableSchema;
 import com.google.api.services.bigquery.model.TableReference;
 import com.google.api.services.bigquery.model.TableFieldSchema;
 import com.google.api.services.bigquery.model.TableCell;
 import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.bigquery.model.ErrorProto;
+import com.google.api.client.googleapis.json.GoogleJsonResponseException;
+
+import com.google.api.client.googleapis.media.MediaHttpUploader;
+import com.google.api.client.googleapis.media.MediaHttpUploaderProgressListener;
+import com.google.api.client.googleapis.media.MediaHttpUploader.UploadState;
 
 public class BigqueryWriter
 {
@@ -43,43 +61,58 @@ public class BigqueryWriter
     private final String table;
     private final boolean autoCreateTable;
     private final Optional<String> schemaPath;
-    private final String bucket;
+    private final TableSchema tableSchema;
     private final String sourceFormat;
     private final String fieldDelimiter;
     private final int maxBadrecords;
+    private final String encoding;
     private final long jobStatusMaxPollingTime;
     private final long jobStatusPollingInterval;
     private final boolean isSkipJobResultCheck;
     private final Bigquery bigQueryClient;
-    private final EmbulkBigqueryTask writerTask;
 
-    public BigqueryWriter(Builder builder) throws IOException, GeneralSecurityException
+    public BigqueryWriter(Builder builder) throws FileNotFoundException, IOException, GeneralSecurityException
     {
         this.project = builder.project;
         this.dataset = builder.dataset;
         this.table = builder.table;
         this.autoCreateTable = builder.autoCreateTable;
         this.schemaPath = builder.schemaPath;
-        this.bucket = builder.bucket;
         this.sourceFormat = builder.sourceFormat.toUpperCase();
         this.fieldDelimiter = builder.fieldDelimiter;
         this.maxBadrecords = builder.maxBadrecords;
+        this.encoding = builder.encoding.toUpperCase();
         this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
         this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
         this.isSkipJobResultCheck = builder.isSkipJobResultCheck;
 
         BigqueryAuthentication auth = new BigqueryAuthentication(builder.serviceAccountEmail, builder.p12KeyFilePath, builder.applicationName);
         this.bigQueryClient = auth.getBigqueryClient();
-        this.writerTask = new EmbulkBigqueryTask();
+
+        checkConfig();
+        if (autoCreateTable) {
+            this.tableSchema = createTableSchema(builder.schemaPath);
+        } else {
+            this.tableSchema = null;
+        }
     }
 
     private String getJobStatus(JobReference jobRef) throws JobFailedException
     {
         try {
             Job job = bigQueryClient.jobs().get(project, jobRef.getJobId()).execute();
-            if (job.getStatus().getErrorResult() != null) {
-                throw new JobFailedException(String.format("Job failed. job id:[%s] reason:[%s] status:[FAILED]", jobRef.getJobId(), job.getStatus().getErrorResult().getMessage()));
+
+            ErrorProto fatalError = job.getStatus().getErrorResult();
+            if (fatalError != null) {
+                throw new JobFailedException(String.format("Job failed. job id:[%s] reason:[%s][%s] status:[FAILED]", jobRef.getJobId(), fatalError.getReason(), fatalError.getMessage()));
+            }
+            List<ErrorProto> errors = job.getStatus().getErrors();
+            if (errors != null) {
+                for (ErrorProto error : errors) {
+                    log.warn(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
+                }
             }
+
             String jobStatus = job.getStatus().getState();
             if (jobStatus.equals("DONE")) {
                 JobStatistics statistics = job.getStatistics();
@@ -117,59 +150,68 @@ public class BigqueryWriter
         }
     }
 
-    public void executeJob() throws IOException, TimeoutException, JobFailedException
-    {
-        // TODO: refactor
-        ArrayList<ArrayList<HashMap<String, String>>> taskList = writerTask.createJobList();
-        for (ArrayList<HashMap<String, String>> task : taskList) {
-            Job job = createJob(task);
-            // TODO: multi-threading
-            new EmbulkBigqueryJob(job).call();
-        }
-    }
-
-    private Job createJob(ArrayList<HashMap<String, String>> task)
+    public void executeLoad(String localFilePath) throws GoogleJsonResponseException, IOException, TimeoutException, JobFailedException
     {
         log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));
 
         Job job = new Job();
+        JobReference jobRef = null;
         JobConfiguration jobConfig = new JobConfiguration();
         JobConfigurationLoad loadConfig = new JobConfigurationLoad();
         jobConfig.setLoad(loadConfig);
         job.setConfiguration(jobConfig);
 
         loadConfig.setAllowQuotedNewlines(false);
+        loadConfig.setEncoding(encoding);
+        loadConfig.setMaxBadRecords(maxBadrecords);
         if (sourceFormat.equals("NEWLINE_DELIMITED_JSON")) {
             loadConfig.setSourceFormat("NEWLINE_DELIMITED_JSON");
         } else {
             loadConfig.setFieldDelimiter(fieldDelimiter);
         }
+        loadConfig.setWriteDisposition("WRITE_APPEND");
         if (autoCreateTable) {
-            loadConfig.setSchema(getTableSchema());
-            loadConfig.setWriteDisposition("WRITE_EMPTY");
+            loadConfig.setSchema(tableSchema);
             loadConfig.setCreateDisposition("CREATE_IF_NEEDED");
-            log.info(String.format("table:[%s] will be create.", table));
+            log.info(String.format("table:[%s] will be create if not exists", table));
         } else {
-            loadConfig.setWriteDisposition("WRITE_APPEND");
             loadConfig.setCreateDisposition("CREATE_NEVER");
         }
-        loadConfig.setMaxBadRecords(maxBadrecords);
 
-        List<String> sources = new ArrayList<String>();
-        for (HashMap<String, String> file : task) {
-            String sourceFile;
-            String remotePath = getRemotePath(file.get("remote_path"), file.get("file_name"));
-            sourceFile = "gs://" + remotePath;
-            log.info(String.format("Add source file to job [%s]", sourceFile));
-            sources.add(sourceFile);
-        }
-        loadConfig.setSourceUris(sources);
-        loadConfig.setDestinationTable(getTableReference());
+        loadConfig.setDestinationTable(createTableReference());
+
+        File file = new File(localFilePath);
+        InputStreamContent mediaContent = new InputStreamContent("application/octet-stream",
+                new BufferedInputStream(
+                        new FileInputStream(file)));
+        mediaContent.setLength(file.length());
 
-        return job;
+        Insert insert = bigQueryClient.jobs().insert(project, job, mediaContent);
+        insert.setProjectId(project);
+        insert.setDisableGZipContent(true);
+
+        // @see https://code.google.com/p/google-api-java-client/wiki/MediaUpload
+        UploadProgressListener listner = new UploadProgressListener();
+        listner.setFileName(localFilePath);
+        insert.getMediaHttpUploader()
+                .setProgressListener(listner)
+                .setDirectUploadEnabled(false);
+
+        try {
+            jobRef = insert.execute().getJobReference();
+        } catch (Exception ex) {
+            log.warn("Job execution was failed. Please check your settings or data... like data matches schema");
+            throw Throwables.propagate(ex);
+        }
+        log.info(String.format("Job executed. job id:[%s] file:[%s]", jobRef.getJobId(), localFilePath));
+        if (isSkipJobResultCheck) {
+            log.info(String.format("Skip job status check. job id:[%s]", jobRef.getJobId()));
+        } else {
+            getJobStatusUntilDone(jobRef);
+        }
     }
 
-    private TableReference getTableReference()
+    private TableReference createTableReference()
     {
         return new TableReference()
             .setProjectId(project)
@@ -177,135 +219,78 @@ public class BigqueryWriter
             .setTableId(table);
     }
 
-    private TableSchema getTableSchema()
+    private TableSchema createTableSchema(Optional<String> schemaPath) throws FileNotFoundException, IOException
     {
-        TableSchema tableSchema = new TableSchema();
-        List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
-        TableFieldSchema tableField;
-        // TODO import from json file
-        /*
-        for () {
-            tableField = new TableFieldSchema()
-                .setName(name)
-                .setType(type);
-            fields.add(tableField);
+        String path = schemaPath.orNull();
+        File file = new File(path);
+        FileInputStream stream = null;
+        try {
+            stream = new FileInputStream(file);
+            ObjectMapper mapper = new ObjectMapper();
+            List<TableFieldSchema> fields = mapper.readValue(stream, new TypeReference<List<TableFieldSchema>>() {});
+            TableSchema tableSchema = new TableSchema().setFields(fields);
+            return tableSchema;
+        } finally {
+            if (stream != null) {
+                stream.close();
+            }
         }
-        */
-
-        tableSchema.setFields(fields);
-        return tableSchema;
     }
 
-    private String getRemotePath(String remotePath, String fileName)
+    public boolean isExistTable(String tableName) throws IOException
     {
-        String[] pathList = StringUtils.split(remotePath, '/');
-        String path;
-        if (remotePath.isEmpty()) {
-            path = bucket + "/" + fileName;
-        } else {
-            path = bucket + "/" + StringUtils.join(pathList) + "/" + fileName;
+        Tables tableRequest = bigQueryClient.tables();
+        try {
+            Table tableData = tableRequest.get(project, dataset, tableName).execute();
+        } catch (GoogleJsonResponseException ex) {
+            return false;
         }
-        return path;
-    }
-
-    public void addTask(Optional<String> remotePath, String fileName, long fileSize)
-    {
-        writerTask.addTaskFile(remotePath, fileName, fileSize);
-    }
-
-    public ArrayList<HashMap<String, String>> getFileList()
-    {
-        return writerTask.getFileList();
+        return true;
     }
 
-    private class EmbulkBigqueryJob implements Callable<Void>
+    public void checkConfig() throws FileNotFoundException, IOException
     {
-        private final Job job;
-
-        public EmbulkBigqueryJob(Job job)
-        {
-            this.job = job;
-        }
-
-        public Void call() throws IOException, TimeoutException, JobFailedException
-        {
-            Insert insert = bigQueryClient.jobs().insert(project, job);
-            insert.setProjectId(project);
-            JobReference jobRef = insert.execute().getJobReference();
-            log.info(String.format("Job executed. job id:[%s]", jobRef.getJobId()));
-            if (isSkipJobResultCheck) {
-                log.info(String.format("Skip job status check. job id:[%s]", jobRef.getJobId()));
+        if (autoCreateTable) {
+            if (!schemaPath.isPresent()) {
+                throw new IOException("schema_path is empty");
             } else {
-                getJobStatusUntilDone(jobRef);
+                File file = new File(schemaPath.orNull());
+                if (!file.exists()) {
+                    throw new FileNotFoundException("Can not load schema file.");
+                }
+            }
+        } else {
+            if (!isExistTable(table)) {
+                throw new IOException(String.format("table [%s] is not exists", table));
             }
-            return null;
         }
     }
 
-    private class EmbulkBigqueryTask
+    private class UploadProgressListener implements MediaHttpUploaderProgressListener
     {
-        // https://cloud.google.com/bigquery/loading-data-into-bigquery#quota
-        private final long MAX_SIZE_PER_LOAD_JOB = 1000 * 1024 * 1024 * 1024L; // 1TB
-        private final int MAX_NUMBER_OF_FILES_PER_LOAD_JOB = 10000;
+        private String fileName;
 
-        private final ArrayList<HashMap<String, String>> taskList = new ArrayList<HashMap<String, String>>();
-        private final ArrayList<ArrayList<HashMap<String, String>>> jobList = new ArrayList<ArrayList<HashMap<String, String>>>();
-
-        public void addTaskFile(Optional<String> remotePath, String fileName, long fileSize)
+        @Override
+        public void progressChanged(MediaHttpUploader uploader) throws IOException
         {
-            HashMap<String, String> task = new HashMap<String, String>();
-            if (remotePath.isPresent()) {
-                task.put("remote_path", remotePath.get());
-            } else {
-                task.put("remote_path", "");
-            }
-            task.put("file_name", fileName);
-            task.put("file_size", String.valueOf(fileSize));
-            taskList.add(task);
-        }
-
-        public ArrayList<ArrayList<HashMap<String, String>>> createJobList()
-        {
-            long currentBundleSize = 0;
-            int currentFileCount = 0;
-            ArrayList<HashMap<String, String>> job = new ArrayList<HashMap<String, String>>();
-            for (HashMap<String, String> task : taskList) {
-                boolean isNeedNextJobList = false;
-                long fileSize = Long.valueOf(task.get("file_size")).longValue();
-
-                if (currentBundleSize + fileSize > MAX_SIZE_PER_LOAD_JOB) {
-                    isNeedNextJobList = true;
-                }
-
-                if (currentFileCount >= MAX_NUMBER_OF_FILES_PER_LOAD_JOB) {
-                    isNeedNextJobList = true;
-                }
-
-                if (isNeedNextJobList) {
-                    jobList.add(job);
-                    job = new ArrayList<HashMap<String, String>>();
-                    job.add(task);
-                    currentBundleSize = 0;
-                } else {
-                    job.add(task);
-                }
-                currentBundleSize += fileSize;
-                currentFileCount++;
-
-                log.debug(String.format("currentBundleSize:%s currentFileCount:%s", currentBundleSize, currentFileCount));
-                log.debug(String.format("fileSize:%s, MAX_SIZE_PER_LOAD_JOB:%s MAX_NUMBER_OF_FILES_PER_LOAD_JOB:%s",
-                        fileSize, MAX_SIZE_PER_LOAD_JOB, MAX_NUMBER_OF_FILES_PER_LOAD_JOB));
-
-            }
-            if (job.size() > 0) {
-                jobList.add(job);
+            switch (uploader.getUploadState()) {
+                case INITIATION_STARTED:
+                    log.info(String.format("Upload start [%s]", fileName));
+                    break;
+                case INITIATION_COMPLETE:
+                    //log.info(String.format("Upload initiation completed file [%s]", fileName));
+                    break;
+                case MEDIA_IN_PROGRESS:
+                    log.debug(String.format("Uploading [%s] progress %3.0f", fileName, uploader.getProgress() * 100) + "%");
+                    break;
+                case MEDIA_COMPLETE:
+                    log.info(String.format("Upload completed [%s]", fileName));
             }
-            return jobList;
         }
 
-        public ArrayList<HashMap<String, String>> getFileList()
+        public void setFileName(String fileName)
         {
-            return taskList;
+            this.fileName = fileName;
         }
     }
 
@@ -319,10 +304,10 @@ public class BigqueryWriter
         private String table;
         private boolean autoCreateTable;
         private Optional<String> schemaPath;
-        private String bucket;
         private String sourceFormat;
         private String fieldDelimiter;
         private int maxBadrecords;
+        private String encoding;
         private int jobStatusMaxPollingTime;
         private int jobStatusPollingInterval;
         private boolean isSkipJobResultCheck;
@@ -375,12 +360,6 @@ public class BigqueryWriter
             return this;
         }
 
-        public Builder setBucket(String bucket)
-        {
-            this.bucket = bucket;
-            return this;
-        }
-
        public Builder setSourceFormat(String sourceFormat)
        {
            this.sourceFormat = sourceFormat;
@@ -399,6 +378,12 @@ public class BigqueryWriter
             return this;
         }
 
+        public Builder setEncoding(String encoding)
+        {
+            this.encoding = encoding;
+            return this;
+        }
+
         public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
         {
             this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Satoshi Akama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-03-17 00:00:00.000000000 Z
+date: 2015-04-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -56,19 +56,16 @@ files:
 - lib/embulk/output/bigquery.rb
 - settings.gradle
 - src/main/java/org/embulk/output/BigqueryAuthentication.java
-- src/main/java/org/embulk/output/BigqueryGcsWriter.java
 - src/main/java/org/embulk/output/BigqueryOutputPlugin.java
 - src/main/java/org/embulk/output/BigqueryWriter.java
 - src/test/java/org/embulk/output/TestBigqueryAuthentication.java
-- src/test/java/org/embulk/output/TestBigqueryGcsWriter.java
 - src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java
 - src/test/java/org/embulk/output/TestBigqueryWriter.java
 - classpath/commons-codec-1.3.jar
 - classpath/commons-logging-1.1.1.jar
-- classpath/embulk-output-bigquery-0.1.1.jar
+- classpath/embulk-output-bigquery-0.1.2.jar
 - classpath/google-api-client-1.19.1.jar
 - classpath/google-api-services-bigquery-v2-rev193-1.19.1.jar
-- classpath/google-api-services-storage-v1-rev27-1.19.1.jar
 - classpath/google-http-client-1.19.0.jar
 - classpath/google-http-client-jackson2-1.19.0.jar
 - classpath/google-oauth-client-1.19.0.jar
@@ -76,7 +73,6 @@ files:
 - classpath/httpclient-4.0.1.jar
 - classpath/httpcore-4.0.1.jar
 - classpath/jsr305-1.3.9.jar
-- classpath/mime-util-2.1.3.jar
 homepage: https://github.com/sakama/embulk-output-bigquery
 licenses:
 - Apache-2.0
data/src/main/java/org/embulk/output/BigqueryGcsWriter.java DELETED
@@ -1,201 +0,0 @@
-package org.embulk.output;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.IllegalFormatException;
-import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import com.google.common.base.Optional;
-import com.google.common.collect.ImmutableList;
-//import eu.medsea.mimeutil.MimeType;
-//import eu.medsea.mimeutil.MimeUtil;
-//import eu.medsea.mimeutil.detector.MimeDetector;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.codec.binary.Base64;
-import java.security.GeneralSecurityException;
-
-import org.embulk.spi.Exec;
-import org.slf4j.Logger;
-
-import com.google.api.services.storage.Storage;
-import com.google.api.services.storage.StorageScopes;
-import com.google.api.services.storage.model.Bucket;
-import com.google.api.services.storage.model.Objects;
-import com.google.api.services.storage.model.StorageObject;
-
-import com.google.api.client.http.InputStreamContent;
-
-public class BigqueryGcsWriter
-{
-
-    private final Logger log = Exec.getLogger(BigqueryGcsWriter.class);
-    private final String bucket;
-    private final String sourceFormat;
-    private final boolean isFileCompressed;
-    private final boolean deleteFromBucketWhenJobEnd;
-    private Storage storageClient;
-
-    public BigqueryGcsWriter(Builder builder) throws IOException, GeneralSecurityException
-    {
-        this.bucket = builder.bucket;
-        this.sourceFormat = builder.sourceFormat.toUpperCase();
-        this.isFileCompressed = builder.isFileCompressed;
-        this.deleteFromBucketWhenJobEnd = builder.deleteFromBucketWhenJobEnd;
-
-        BigqueryAuthentication auth = new BigqueryAuthentication(builder.serviceAccountEmail, builder.p12KeyFilePath, builder.applicationName);
-        this.storageClient = auth.getGcsClient();
-    }
-
-    public void uploadFile(String localFilePath, String fileName, Optional<String> remotePath) throws IOException
-    {
-        FileInputStream stream = null;
-
-        try {
-            String path;
-            if (remotePath.isPresent()) {
-                path = remotePath.get();
-            } else {
-                path = "";
-            }
-            String gcsPath = getRemotePath(path, fileName);
-            StorageObject objectMetadata = new StorageObject().setName(gcsPath);
-            log.info(String.format("Uploading file [%s] to [gs://%s/%s]", localFilePath, bucket, gcsPath));
-
-            File file = new File(localFilePath);
-            stream = new FileInputStream(file);
-            InputStreamContent content = new InputStreamContent(getContentType(), stream);
-            Storage.Objects.Insert insertObject = storageClient.objects().insert(bucket, objectMetadata, content);
-            insertObject.setDisableGZipContent(true);
-
-            StorageObject response = insertObject.execute();
-            log.info(String.format("Upload completed [%s] to [gs://%s/%s]", localFilePath, bucket, gcsPath));
-        } finally {
-            stream.close();
-        }
-    }
-
-    private String getRemotePath(String remotePath, String fileName)
-    {
-        if (remotePath.isEmpty()) {
-            return fileName;
-        }
-        String[] pathList = StringUtils.split(remotePath, '/');
-        String path = StringUtils.join(pathList) + "/";
-        if (!path.endsWith("/")) {
-            path = path + "/";
-        }
-        return path + fileName;
-    }
-
-    public void deleteFile(String remotePath, String fileName) throws IOException
-    {
-        String path = getRemotePath(remotePath, fileName);
-        storageClient.objects().delete(bucket, path).execute();
-        log.info(String.format("Delete remote file [gs://%s/%s]", bucket, path));
-    }
-
-    public boolean getDeleteFromBucketWhenJobEnd()
-    {
-        return this.deleteFromBucketWhenJobEnd;
-    }
-
-    private String getContentType()
-    {
-        if (isFileCompressed) {
-            return "application/x-gzip";
-        } else {
-            if (sourceFormat.equals("NEWLINE_DELIMITED_JSON)")) {
-                return "application/json";
-            } else {
-                return "text/csv";
-            }
-        }
-    }
-
-    /*
-    private void registerMimeDetector()
-    {
-        String mimeDetector = "eu.medsea.mimeutil.detector.MagicMimeMimeDetector";
-        MimeDetector registeredMimeDetector = MimeUtil.getMimeDetector(mimeDetector);
-        MimeUtil.registerMimeDetector(mimeDetector);
-    }
-
-    public String detectMimeType(File file)
-    {
-        try {
-            Collection<?> mimeTypes = MimeUtil.getMimeTypes(file);
-            if (!mimeTypes.isEmpty()) {
-                Iterator<?> iterator = mimeTypes.iterator();
-                MimeType mimeType = (MimeType) iterator.next();
-                return mimeType.getMediaType() + "/" + mimeType.getSubType();
-            }
-        } catch (Exception ex) {
-        }
-        return "application/octet-stream";
-    }
-    */
-
-    public static class Builder
-    {
-        private final String serviceAccountEmail;
-        private String p12KeyFilePath;
-        private String applicationName;
-        private String bucket;
-        private String sourceFormat;
-        private boolean isFileCompressed;
-        private boolean deleteFromBucketWhenJobEnd;
-        private boolean enableMd5hashCheck;
-
-        public Builder(String serviceAccountEmail)
-        {
-            this.serviceAccountEmail = serviceAccountEmail;
-        }
-
-        public Builder setP12KeyFilePath(String p12KeyFilePath)
-        {
-            this.p12KeyFilePath = p12KeyFilePath;
-            return this;
-        }
-
-        public Builder setApplicationName(String applicationName)
-        {
-            this.applicationName = applicationName;
-            return this;
-        }
-
-        public Builder setBucket(String bucket)
-        {
-            this.bucket = bucket;
-            return this;
-        }
-
-        public Builder setSourceFormat(String sourceFormat)
-        {
-            this.sourceFormat = sourceFormat;
-            return this;
-        }
-
-        public Builder setIsFileCompressed(boolean isFileCompressed)
-        {
-            this.isFileCompressed = isFileCompressed;
-            return this;
-        }
-
-        public Builder setDeleteFromBucketWhenJobEnd(boolean deleteFromBucketWhenJobEnd)
-        {
-            this.deleteFromBucketWhenJobEnd = deleteFromBucketWhenJobEnd;
-            return this;
-        }
-
-        public BigqueryGcsWriter build() throws IOException, GeneralSecurityException
-        {
-            return new BigqueryGcsWriter(this);
-        }
-    }
-}
data/src/test/java/org/embulk/output/TestBigqueryGcsWriter.java DELETED
@@ -1,5 +0,0 @@
-package org.embulk.output;
-
-public class TestBigqueryGcsWriter
-{
-}