embulk-output-bigquery 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b37d638ca9c217221687cdcadfbd45257291aef4
4
- data.tar.gz: 972bf78e9ce75972fd3f2e1f77389a7383c3d2a0
3
+ metadata.gz: 37643ccea137f84b59a056d75826b54c8548c0b6
4
+ data.tar.gz: 1eafbf6ab69c81039c22712cda93ceb099ec9d92
5
5
  SHA512:
6
- metadata.gz: 6d18639e76da80f45e2852df8408ec4c9c655e06a77a776c42dfb52310cc787e8319fd8f182eef20c59ffd01e5939ccb3799b4a7bad5b20f416aa668d513b3e3
7
- data.tar.gz: 5a37cded1558ba6f3fbb1d4c475c46a593151a32e831299c00c386185d32828ec72810c2334863f7e03cdc7a3b2d974e7d1ec0cf653276073e7d336398ae92ba
6
+ metadata.gz: 3f8e1a7b70fb1a71060338b5aa2c4676a02ee56289fe5f0470fda09081ae2cec3b5c4fece89993beab298b43fb3769b3db8d28b485a4abacca40b57fdb6f3759
7
+ data.tar.gz: d4b1aab992230a263642b6eb0cac08d1981d6cf54728ef76fec84da323888172ba6c6ed891f1cb7914f95dc9887ee21856d43e0fb044fa3c8e1aafd7cc8ab428
data/README.md CHANGED
@@ -36,6 +36,7 @@ OAuth flow for installed applications.
36
36
  - **table**: table name (string, required)
37
37
  - **auto_create_table**: (boolean, optional default is 0)
38
38
  - **schema_path**: (string, optional)
39
+ - **prevent_duplicate_insert**: (boolean, optional default is 0)
39
40
  - **application_name**: application name anything you like (string, optional)
40
41
  - **delete_from_local_when_job_end**: (boolean, optional, default is 0)
41
42
  - **job_status_max_polling_time**: max job status polling time. (int, optional, default is 3600 sec)
@@ -120,6 +121,22 @@ out:
120
121
  schema_path: /path/to/schema.json
121
122
  ```
122
123
 
124
+ ### Data Consistency
125
+
126
+ When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generates a job ID from the MD5 hash of the file and other options to prevent duplicate data insertion.
127
+
128
+ `job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding)`
129
+
130
+ [Job IDs must be unique (including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency). So the same data can't be inserted with the same settings.
131
+
132
+ In other words, you can retry as many times as you like, in case a bad error (like a network error) happens before job insertion.
133
+
134
+ ```yaml
135
+ out:
136
+ type: bigquery
137
+ prevent_duplicate_insert: true
138
+ ```
139
+
123
140
  ## Build
124
141
 
125
142
  ```
data/build.gradle CHANGED
@@ -15,14 +15,14 @@ configurations {
15
15
  sourceCompatibility = 1.7
16
16
  targetCompatibility = 1.7
17
17
 
18
- version = "0.1.3"
18
+ version = "0.1.4"
19
19
 
20
20
  dependencies {
21
21
  compile "org.embulk:embulk-core:0.5.1"
22
22
  provided "org.embulk:embulk-core:0.5.1"
23
23
 
24
- compile "com.google.http-client:google-http-client-jackson2:1.19.0"
25
- compile "com.google.apis:google-api-services-bigquery:v2-rev193-1.19.1"
24
+ compile "com.google.http-client:google-http-client-jackson2:1.20.0"
25
+ compile "com.google.apis:google-api-services-bigquery:v2-rev205-1.20.0"
26
26
 
27
27
  testCompile "junit:junit:4.+"
28
28
  }
@@ -5,6 +5,7 @@ import java.io.FileNotFoundException;
5
5
  import java.io.FileOutputStream;
6
6
  import java.io.BufferedOutputStream;
7
7
  import java.io.IOException;
8
+ import java.security.NoSuchAlgorithmException;
8
9
  import java.util.List;
9
10
  import java.util.concurrent.TimeoutException;
10
11
  import com.google.common.base.Optional;
@@ -96,6 +97,10 @@ public class BigqueryOutputPlugin
96
97
  @ConfigDefault("null")
97
98
  public Optional<String> getSchemaPath();
98
99
 
100
+ @Config("prevent_duplicate_insert")
101
+ @ConfigDefault("false")
102
+ public boolean getPreventDuplicateInsert();
103
+
99
104
  @Config("job_status_max_polling_time")
100
105
  @ConfigDefault("3600")
101
106
  public int getJobStatusMaxPollingTime();
@@ -105,7 +110,7 @@ public class BigqueryOutputPlugin
105
110
  public int getJobStatusPollingInterval();
106
111
 
107
112
  @Config("is_skip_job_result_check")
108
- @ConfigDefault("0")
113
+ @ConfigDefault("false")
109
114
  public boolean getIsSkipJobResultCheck();
110
115
  }
111
116
 
@@ -131,6 +136,7 @@ public class BigqueryOutputPlugin
131
136
  .setFieldDelimiter(task.getFieldDelimiter())
132
137
  .setMaxBadrecords(task.getMaxBadrecords())
133
138
  .setEncoding(task.getEncoding())
139
+ .setPreventDuplicateInsert(task.getPreventDuplicateInsert())
134
140
  .setJobStatusMaxPollingTime(task.getJobStatusMaxPollingTime())
135
141
  .setJobStatusPollingInterval(task.getJobStatusPollingInterval())
136
142
  .setIsSkipJobResultCheck(task.getIsSkipJobResultCheck())
@@ -233,7 +239,8 @@ public class BigqueryOutputPlugin
233
239
  log.info(String.format("Delete local file [%s]", filePath));
234
240
  file.delete();
235
241
  }
236
- } catch (IOException | TimeoutException | BigqueryWriter.JobFailedException ex) {
242
+ } catch (NoSuchAlgorithmException | TimeoutException | BigqueryWriter.JobFailedException | IOException ex) {
243
+ log.error(ex.getMessage());
237
244
  throw Throwables.propagate(ex);
238
245
  }
239
246
  }
@@ -6,14 +6,18 @@ import java.io.FileNotFoundException;
6
6
  import java.io.FileInputStream;
7
7
  import java.io.BufferedInputStream;
8
8
  import com.google.api.client.http.InputStreamContent;
9
+ import java.security.MessageDigest;
10
+ import java.security.NoSuchAlgorithmException;
9
11
  import java.util.List;
10
12
  import java.util.concurrent.TimeoutException;
11
13
  import com.google.common.base.Optional;
14
+ import com.google.api.client.util.Base64;
12
15
  import com.google.common.base.Throwables;
13
16
  import java.security.GeneralSecurityException;
14
17
  import com.fasterxml.jackson.databind.ObjectMapper;
15
18
  import com.fasterxml.jackson.core.type.TypeReference;
16
19
 
20
+ import org.apache.commons.codec.binary.Hex;
17
21
  import org.embulk.spi.Exec;
18
22
  import org.slf4j.Logger;
19
23
 
@@ -48,6 +52,7 @@ public class BigqueryWriter
48
52
  private final String fieldDelimiter;
49
53
  private final int maxBadrecords;
50
54
  private final String encoding;
55
+ private final boolean preventDuplicateInsert;
51
56
  private final long jobStatusMaxPollingTime;
52
57
  private final long jobStatusPollingInterval;
53
58
  private final boolean isSkipJobResultCheck;
@@ -64,6 +69,7 @@ public class BigqueryWriter
64
69
  this.fieldDelimiter = builder.fieldDelimiter;
65
70
  this.maxBadrecords = builder.maxBadrecords;
66
71
  this.encoding = builder.encoding.toUpperCase();
72
+ this.preventDuplicateInsert = builder.preventDuplicateInsert;
67
73
  this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
68
74
  this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
69
75
  this.isSkipJobResultCheck = builder.isSkipJobResultCheck;
@@ -91,7 +97,7 @@ public class BigqueryWriter
91
97
  List<ErrorProto> errors = job.getStatus().getErrors();
92
98
  if (errors != null) {
93
99
  for (ErrorProto error : errors) {
94
- log.warn(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
100
+ log.error(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
95
101
  }
96
102
  }
97
103
 
@@ -132,17 +138,24 @@ public class BigqueryWriter
132
138
  }
133
139
  }
134
140
 
135
- public void executeLoad(String localFilePath) throws GoogleJsonResponseException, IOException, TimeoutException, JobFailedException
141
+ public void executeLoad(String localFilePath) throws GoogleJsonResponseException, NoSuchAlgorithmException,
142
+ TimeoutException, JobFailedException, IOException
136
143
  {
137
144
  log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));
138
145
 
139
146
  Job job = new Job();
140
- JobReference jobRef = null;
147
+ JobReference jobRef = new JobReference();
141
148
  JobConfiguration jobConfig = new JobConfiguration();
142
149
  JobConfigurationLoad loadConfig = new JobConfigurationLoad();
143
150
  jobConfig.setLoad(loadConfig);
144
151
  job.setConfiguration(jobConfig);
145
152
 
153
+ if (preventDuplicateInsert) {
154
+ String jobId = createJobId(localFilePath);
155
+ jobRef.setJobId(jobId);
156
+ job.setJobReference(jobRef);
157
+ }
158
+
146
159
  loadConfig.setAllowQuotedNewlines(false);
147
160
  loadConfig.setEncoding(encoding);
148
161
  loadConfig.setMaxBadRecords(maxBadrecords);
@@ -181,9 +194,8 @@ public class BigqueryWriter
181
194
 
182
195
  try {
183
196
  jobRef = insert.execute().getJobReference();
184
- } catch (Exception ex) {
185
- log.warn("Job execution was failed. Please check your settings or data... like data matches schema");
186
- throw Throwables.propagate(ex);
197
+ } catch (IllegalStateException ex) {
198
+ throw new JobFailedException(ex.getMessage());
187
199
  }
188
200
  log.info(String.format("Job executed. job id:[%s] file:[%s]", jobRef.getJobId(), localFilePath));
189
201
  if (isSkipJobResultCheck) {
@@ -193,6 +205,25 @@ public class BigqueryWriter
193
205
  }
194
206
  }
195
207
 
208
+ private String createJobId(String localFilePath) throws NoSuchAlgorithmException, IOException
209
+ {
210
+ StringBuilder sb = new StringBuilder();
211
+ sb.append(getLocalMd5hash(localFilePath));
212
+ sb.append(dataset);
213
+ sb.append(table);
214
+ sb.append(tableSchema);
215
+ sb.append(sourceFormat);
216
+ sb.append(fieldDelimiter);
217
+ sb.append(maxBadrecords);
218
+ sb.append(encoding);
219
+
220
+ MessageDigest md = MessageDigest.getInstance("MD5");
221
+ String str = new String(sb);
222
+ byte[] digest = md.digest(str.getBytes());
223
+ String hash = new String(Hex.encodeHex(digest));
224
+ return "embulk_job_" + hash;
225
+ }
226
+
196
227
  private TableReference createTableReference()
197
228
  {
198
229
  return new TableReference()
@@ -248,6 +279,28 @@ public class BigqueryWriter
248
279
  }
249
280
  }
250
281
 
282
+ private String getLocalMd5hash(String filePath) throws NoSuchAlgorithmException, IOException
283
+ {
284
+ FileInputStream stream = null;
285
+ try {
286
+ stream = new FileInputStream(filePath);
287
+ MessageDigest digest = MessageDigest.getInstance("MD5");
288
+
289
+ byte[] bytesBuffer = new byte[1024];
290
+ int bytesRead = -1;
291
+
292
+ while ((bytesRead = stream.read(bytesBuffer)) != -1) {
293
+ digest.update(bytesBuffer, 0, bytesRead);
294
+ }
295
+ byte[] hashedBytes = digest.digest();
296
+
297
+ byte[] encoded = (hashedBytes);
298
+ return new String(encoded);
299
+ } finally {
300
+ stream.close();
301
+ }
302
+ }
303
+
251
304
  private class UploadProgressListener implements MediaHttpUploaderProgressListener
252
305
  {
253
306
  private String fileName;
@@ -291,6 +344,7 @@ public class BigqueryWriter
291
344
  private String fieldDelimiter;
292
345
  private int maxBadrecords;
293
346
  private String encoding;
347
+ private boolean preventDuplicateInsert;
294
348
  private int jobStatusMaxPollingTime;
295
349
  private int jobStatusPollingInterval;
296
350
  private boolean isSkipJobResultCheck;
@@ -372,6 +426,12 @@ public class BigqueryWriter
372
426
  return this;
373
427
  }
374
428
 
429
+ public Builder setPreventDuplicateInsert(boolean preventDuplicateInsert)
430
+ {
431
+ this.preventDuplicateInsert = preventDuplicateInsert;
432
+ return this;
433
+ }
434
+
375
435
  public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
376
436
  {
377
437
  this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
@@ -396,7 +456,7 @@ public class BigqueryWriter
396
456
  }
397
457
  }
398
458
 
399
- public class JobFailedException extends Exception
459
+ public class JobFailedException extends RuntimeException
400
460
  {
401
461
  public JobFailedException(String message) {
402
462
  super(message);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-06 00:00:00.000000000 Z
11
+ date: 2015-04-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -62,12 +62,12 @@ files:
62
62
  - src/test/java/org/embulk/output/TestBigqueryWriter.java
63
63
  - classpath/commons-codec-1.3.jar
64
64
  - classpath/commons-logging-1.1.1.jar
65
- - classpath/embulk-output-bigquery-0.1.3.jar
66
- - classpath/google-api-client-1.19.1.jar
67
- - classpath/google-api-services-bigquery-v2-rev193-1.19.1.jar
68
- - classpath/google-http-client-1.19.0.jar
69
- - classpath/google-http-client-jackson2-1.19.0.jar
70
- - classpath/google-oauth-client-1.19.0.jar
65
+ - classpath/embulk-output-bigquery-0.1.4.jar
66
+ - classpath/google-api-client-1.20.0.jar
67
+ - classpath/google-api-services-bigquery-v2-rev205-1.20.0.jar
68
+ - classpath/google-http-client-1.20.0.jar
69
+ - classpath/google-http-client-jackson2-1.20.0.jar
70
+ - classpath/google-oauth-client-1.20.0.jar
71
71
  - classpath/guava-jdk5-13.0.jar
72
72
  - classpath/httpclient-4.0.1.jar
73
73
  - classpath/httpcore-4.0.1.jar