embulk-output-bigquery 0.1.3 → 0.1.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: b37d638ca9c217221687cdcadfbd45257291aef4
-  data.tar.gz: 972bf78e9ce75972fd3f2e1f77389a7383c3d2a0
+  metadata.gz: 37643ccea137f84b59a056d75826b54c8548c0b6
+  data.tar.gz: 1eafbf6ab69c81039c22712cda93ceb099ec9d92
 SHA512:
-  metadata.gz: 6d18639e76da80f45e2852df8408ec4c9c655e06a77a776c42dfb52310cc787e8319fd8f182eef20c59ffd01e5939ccb3799b4a7bad5b20f416aa668d513b3e3
-  data.tar.gz: 5a37cded1558ba6f3fbb1d4c475c46a593151a32e831299c00c386185d32828ec72810c2334863f7e03cdc7a3b2d974e7d1ec0cf653276073e7d336398ae92ba
+  metadata.gz: 3f8e1a7b70fb1a71060338b5aa2c4676a02ee56289fe5f0470fda09081ae2cec3b5c4fece89993beab298b43fb3769b3db8d28b485a4abacca40b57fdb6f3759
+  data.tar.gz: d4b1aab992230a263642b6eb0cac08d1981d6cf54728ef76fec84da323888172ba6c6ed891f1cb7914f95dc9887ee21856d43e0fb044fa3c8e1aafd7cc8ab428
data/README.md CHANGED
@@ -36,6 +36,7 @@ OAuth flow for installed applications.
 - **table**: table name (string, required)
 - **auto_create_table**: (boolean, optional default is 0)
 - **schema_path**: (string, optional)
+- **prevent_duplicate_insert**: (boolean, optional default is 0)
 - **application_name**: application name anything you like (string, optional)
 - **delete_from_local_when_job_end**: (boolean, optional, default is 0)
 - **job_status_max_polling_time**: max job status polling time. (int, optional, default is 3600 sec)
@@ -120,6 +121,22 @@ out:
   schema_path: /path/to/schema.json
 ```
 
+### Data Consistency
+
+When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generates a job ID from the MD5 hash of the file and the other load options to prevent duplicate data insertion.
+
+`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding)`
+
+[Job IDs must be unique (including failed jobs)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency), so the same data cannot be inserted twice with the same settings.
+
+In other words, you can retry as many times as you like if an error (such as a network error) occurs before the job is inserted.
+
+```yaml
+out:
+  type: bigquery
+  prevent_duplicate_insert: true
+```
+
 ## Build
 
 ```
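
The Data Consistency section added above describes the derivation in prose; the following is a minimal, self-contained sketch of that formula. The class and helper names (`JobIdSketch`, `deriveJobId`, `md5OfFile`, `toHex`) are invented for this example and are not part of the plugin; the plugin's real implementation is `createJobId` in the BigqueryWriter diff further down, which differs slightly in how it encodes the intermediate file hash.

```java
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Illustrative only: this class is not shipped with the plugin.
public class JobIdSketch
{
    // job ID = md5(md5(file) + dataset + table + schema + source_format
    //              + file_delimiter + max_bad_records + encoding)
    public static String deriveJobId(String filePath, String... loadSettings)
            throws NoSuchAlgorithmException, IOException
    {
        StringBuilder sb = new StringBuilder(toHex(md5OfFile(filePath)));
        for (String setting : loadSettings) {  // dataset, table, schema, ...
            sb.append(setting);
        }
        MessageDigest md = MessageDigest.getInstance("MD5");
        byte[] digest = md.digest(sb.toString().getBytes(StandardCharsets.UTF_8));
        return "embulk_job_" + toHex(digest);
    }

    // Streams the file through MD5 so large load files are not read into memory.
    private static byte[] md5OfFile(String filePath) throws NoSuchAlgorithmException, IOException
    {
        MessageDigest digest = MessageDigest.getInstance("MD5");
        try (InputStream in = new FileInputStream(filePath)) {
            byte[] buffer = new byte[1024];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                digest.update(buffer, 0, bytesRead);
            }
        }
        return digest.digest();
    }

    private static String toHex(byte[] bytes)
    {
        StringBuilder sb = new StringBuilder();
        for (byte b : bytes) {
            sb.append(String.format("%02x", b));
        }
        return sb.toString();
    }
}
```

Running this twice on the same file with the same settings yields the same job ID, which BigQuery rejects as a duplicate job, so a retried load cannot double-insert the data.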
data/build.gradle CHANGED
@@ -15,14 +15,14 @@ configurations {
 sourceCompatibility = 1.7
 targetCompatibility = 1.7
 
-version = "0.1.3"
+version = "0.1.4"
 
 dependencies {
     compile "org.embulk:embulk-core:0.5.1"
     provided "org.embulk:embulk-core:0.5.1"
 
-    compile "com.google.http-client:google-http-client-jackson2:1.19.0"
-    compile "com.google.apis:google-api-services-bigquery:v2-rev193-1.19.1"
+    compile "com.google.http-client:google-http-client-jackson2:1.20.0"
+    compile "com.google.apis:google-api-services-bigquery:v2-rev205-1.20.0"
 
     testCompile "junit:junit:4.+"
 }
data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java CHANGED
@@ -5,6 +5,7 @@ import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.BufferedOutputStream;
 import java.io.IOException;
+import java.security.NoSuchAlgorithmException;
 import java.util.List;
 import java.util.concurrent.TimeoutException;
 import com.google.common.base.Optional;
@@ -96,6 +97,10 @@ public class BigqueryOutputPlugin
     @ConfigDefault("null")
     public Optional<String> getSchemaPath();
 
+    @Config("prevent_duplicate_insert")
+    @ConfigDefault("false")
+    public boolean getPreventDuplicateInsert();
+
     @Config("job_status_max_polling_time")
     @ConfigDefault("3600")
     public int getJobStatusMaxPollingTime();
@@ -105,7 +110,7 @@ public class BigqueryOutputPlugin
     public int getJobStatusPollingInterval();
 
     @Config("is_skip_job_result_check")
-    @ConfigDefault("0")
+    @ConfigDefault("false")
     public boolean getIsSkipJobResultCheck();
 }
 
@@ -131,6 +136,7 @@ public class BigqueryOutputPlugin
                 .setFieldDelimiter(task.getFieldDelimiter())
                 .setMaxBadrecords(task.getMaxBadrecords())
                 .setEncoding(task.getEncoding())
+                .setPreventDuplicateInsert(task.getPreventDuplicateInsert())
                 .setJobStatusMaxPollingTime(task.getJobStatusMaxPollingTime())
                 .setJobStatusPollingInterval(task.getJobStatusPollingInterval())
                 .setIsSkipJobResultCheck(task.getIsSkipJobResultCheck())
@@ -233,7 +239,8 @@ public class BigqueryOutputPlugin
                 log.info(String.format("Delete local file [%s]", filePath));
                 file.delete();
             }
-        } catch (IOException | TimeoutException | BigqueryWriter.JobFailedException ex) {
+        } catch (NoSuchAlgorithmException | TimeoutException | BigqueryWriter.JobFailedException | IOException ex) {
+            log.error(ex.getMessage());
             throw Throwables.propagate(ex);
         }
     }
data/src/main/java/org/embulk/output/BigqueryWriter.java CHANGED
@@ -6,14 +6,18 @@ import java.io.FileNotFoundException;
 import java.io.FileInputStream;
 import java.io.BufferedInputStream;
 import com.google.api.client.http.InputStreamContent;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 import java.util.List;
 import java.util.concurrent.TimeoutException;
 import com.google.common.base.Optional;
+import com.google.api.client.util.Base64;
 import com.google.common.base.Throwables;
 import java.security.GeneralSecurityException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.core.type.TypeReference;
 
+import org.apache.commons.codec.binary.Hex;
 import org.embulk.spi.Exec;
 import org.slf4j.Logger;
 
@@ -48,6 +52,7 @@ public class BigqueryWriter
     private final String fieldDelimiter;
     private final int maxBadrecords;
     private final String encoding;
+    private final boolean preventDuplicateInsert;
     private final long jobStatusMaxPollingTime;
     private final long jobStatusPollingInterval;
     private final boolean isSkipJobResultCheck;
@@ -64,6 +69,7 @@ public class BigqueryWriter
         this.fieldDelimiter = builder.fieldDelimiter;
         this.maxBadrecords = builder.maxBadrecords;
         this.encoding = builder.encoding.toUpperCase();
+        this.preventDuplicateInsert = builder.preventDuplicateInsert;
         this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
         this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
         this.isSkipJobResultCheck = builder.isSkipJobResultCheck;
@@ -91,7 +97,7 @@ public class BigqueryWriter
         List<ErrorProto> errors = job.getStatus().getErrors();
         if (errors != null) {
             for (ErrorProto error : errors) {
-                log.warn(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
+                log.error(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
             }
         }
 
@@ -132,17 +138,24 @@ public class BigqueryWriter
         }
     }
 
-    public void executeLoad(String localFilePath) throws GoogleJsonResponseException, IOException, TimeoutException, JobFailedException
+    public void executeLoad(String localFilePath) throws GoogleJsonResponseException, NoSuchAlgorithmException,
+            TimeoutException, JobFailedException, IOException
     {
         log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));
 
         Job job = new Job();
-        JobReference jobRef = null;
+        JobReference jobRef = new JobReference();
         JobConfiguration jobConfig = new JobConfiguration();
         JobConfigurationLoad loadConfig = new JobConfigurationLoad();
         jobConfig.setLoad(loadConfig);
         job.setConfiguration(jobConfig);
 
+        if (preventDuplicateInsert) {
+            String jobId = createJobId(localFilePath);
+            jobRef.setJobId(jobId);
+            job.setJobReference(jobRef);
+        }
+
         loadConfig.setAllowQuotedNewlines(false);
         loadConfig.setEncoding(encoding);
         loadConfig.setMaxBadRecords(maxBadrecords);
@@ -181,9 +194,8 @@ public class BigqueryWriter
 
         try {
             jobRef = insert.execute().getJobReference();
-        } catch (Exception ex) {
-            log.warn("Job execution was failed. Please check your settings or data... like data matches schema");
-            throw Throwables.propagate(ex);
+        } catch (IllegalStateException ex) {
+            throw new JobFailedException(ex.getMessage());
         }
         log.info(String.format("Job executed. job id:[%s] file:[%s]", jobRef.getJobId(), localFilePath));
         if (isSkipJobResultCheck) {
@@ -193,6 +205,25 @@ public class BigqueryWriter
         }
     }
 
+    private String createJobId(String localFilePath) throws NoSuchAlgorithmException, IOException
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append(getLocalMd5hash(localFilePath));
+        sb.append(dataset);
+        sb.append(table);
+        sb.append(tableSchema);
+        sb.append(sourceFormat);
+        sb.append(fieldDelimiter);
+        sb.append(maxBadrecords);
+        sb.append(encoding);
+
+        MessageDigest md = MessageDigest.getInstance("MD5");
+        String str = new String(sb);
+        byte[] digest = md.digest(str.getBytes());
+        String hash = new String(Hex.encodeHex(digest));
+        return "embulk_job_" + hash;
+    }
+
     private TableReference createTableReference()
     {
         return new TableReference()
@@ -248,6 +279,28 @@ public class BigqueryWriter
         }
     }
 
+    private String getLocalMd5hash(String filePath) throws NoSuchAlgorithmException, IOException
+    {
+        FileInputStream stream = null;
+        try {
+            stream = new FileInputStream(filePath);
+            MessageDigest digest = MessageDigest.getInstance("MD5");
+
+            byte[] bytesBuffer = new byte[1024];
+            int bytesRead = -1;
+
+            while ((bytesRead = stream.read(bytesBuffer)) != -1) {
+                digest.update(bytesBuffer, 0, bytesRead);
+            }
+            byte[] hashedBytes = digest.digest();
+
+            byte[] encoded = Base64.encodeBase64(hashedBytes);
+            return new String(encoded);
+        } finally {
+            stream.close();
+        }
+    }
+
     private class UploadProgressListener implements MediaHttpUploaderProgressListener
     {
         private String fileName;
@@ -291,6 +344,7 @@ public class BigqueryWriter
         private String fieldDelimiter;
         private int maxBadrecords;
         private String encoding;
+        private boolean preventDuplicateInsert;
         private int jobStatusMaxPollingTime;
         private int jobStatusPollingInterval;
         private boolean isSkipJobResultCheck;
@@ -372,6 +426,12 @@ public class BigqueryWriter
             return this;
         }
 
+        public Builder setPreventDuplicateInsert(boolean preventDuplicateInsert)
+        {
+            this.preventDuplicateInsert = preventDuplicateInsert;
+            return this;
+        }
+
         public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
         {
             this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
@@ -396,7 +456,7 @@ public class BigqueryWriter
         }
     }
 
-    public class JobFailedException extends Exception
+    public class JobFailedException extends RuntimeException
     {
         public JobFailedException(String message) {
             super(message);
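
The new option is wired through the Builder, so a caller enables it per writer instance. Below is a hypothetical usage sketch: only the setters shown in the diff above are confirmed, while the Builder's constructor arguments and the `build()` method are assumptions for illustration.

```java
// Hypothetical usage; constructor arguments and build() are assumed,
// only the setter names are confirmed by the diff above.
BigqueryWriter writer = new BigqueryWriter.Builder(/* auth and project settings */)
        .setEncoding("UTF-8")
        .setMaxBadrecords(0)
        .setPreventDuplicateInsert(true)  // new in 0.1.4: deterministic job IDs
        .setJobStatusMaxPollingTime(3600)
        .setJobStatusPollingInterval(10)
        .setIsSkipJobResultCheck(false)
        .build();
```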
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Satoshi Akama
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-06 00:00:00.000000000 Z
+date: 2015-04-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -62,12 +62,12 @@ files:
 - src/test/java/org/embulk/output/TestBigqueryWriter.java
 - classpath/commons-codec-1.3.jar
 - classpath/commons-logging-1.1.1.jar
-- classpath/embulk-output-bigquery-0.1.3.jar
-- classpath/google-api-client-1.19.1.jar
-- classpath/google-api-services-bigquery-v2-rev193-1.19.1.jar
-- classpath/google-http-client-1.19.0.jar
-- classpath/google-http-client-jackson2-1.19.0.jar
-- classpath/google-oauth-client-1.19.0.jar
+- classpath/embulk-output-bigquery-0.1.4.jar
+- classpath/google-api-client-1.20.0.jar
+- classpath/google-api-services-bigquery-v2-rev205-1.20.0.jar
+- classpath/google-http-client-1.20.0.jar
+- classpath/google-http-client-jackson2-1.20.0.jar
+- classpath/google-oauth-client-1.20.0.jar
 - classpath/guava-jdk5-13.0.jar
 - classpath/httpclient-4.0.1.jar
 - classpath/httpcore-4.0.1.jar