embulk-output-bigquery 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -0
- data/build.gradle +3 -3
- data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +9 -2
- data/src/main/java/org/embulk/output/BigqueryWriter.java +67 -7
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37643ccea137f84b59a056d75826b54c8548c0b6
|
4
|
+
data.tar.gz: 1eafbf6ab69c81039c22712cda93ceb099ec9d92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3f8e1a7b70fb1a71060338b5aa2c4676a02ee56289fe5f0470fda09081ae2cec3b5c4fece89993beab298b43fb3769b3db8d28b485a4abacca40b57fdb6f3759
|
7
|
+
data.tar.gz: d4b1aab992230a263642b6eb0cac08d1981d6cf54728ef76fec84da323888172ba6c6ed891f1cb7914f95dc9887ee21856d43e0fb044fa3c8e1aafd7cc8ab428
|
data/README.md
CHANGED
@@ -36,6 +36,7 @@ OAuth flow for installed applications.
|
|
36
36
|
- **table**: table name (string, required)
|
37
37
|
- **auto_create_table**: (boolean, optional default is 0)
|
38
38
|
- **schema_path**: (string, optional)
|
39
|
+
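- **prevent_duplicate_insert**: generate a deterministic job ID so the same file cannot be loaded twice with the same settings (boolean, optional, default is false)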
|
39
40
|
- **application_name**: application name anything you like (string, optional)
|
40
41
|
- **delete_from_local_when_job_end**: (boolean, optional, default is 0)
|
41
42
|
- **job_status_max_polling_time**: max job status polling time. (int, optional, default is 3600 sec)
|
@@ -120,6 +121,22 @@ out:
|
|
120
121
|
schema_path: /path/to/schema.json
|
121
122
|
```
|
122
123
|
|
124
|
+
### Data Consistency
|
125
|
+
|
126
|
+
When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generates the job ID from the MD5 hash of the file and other options, to prevent duplicate data insertion.
|
127
|
+
|
128
|
+
`job ID = md5(md5(file) + dataset + table + schema + source_format + field_delimiter + max_bad_records + encoding)`
|
129
|
+
|
130
|
+
[A job ID must be unique (including failed jobs)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency), so the same data cannot be inserted again with the same settings.
|
131
|
+
|
132
|
+
In other words, you can retry as many times as you like if an error (such as a network error) occurs before the job is inserted.
|
133
|
+
|
134
|
+
```yaml
|
135
|
+
out:
|
136
|
+
type: bigquery
|
137
|
+
prevent_duplicate_insert: true
|
138
|
+
```
|
139
|
+
|
123
140
|
## Build
|
124
141
|
|
125
142
|
```
|
data/build.gradle
CHANGED
@@ -15,14 +15,14 @@ configurations {
|
|
15
15
|
sourceCompatibility = 1.7
|
16
16
|
targetCompatibility = 1.7
|
17
17
|
|
18
|
-
version = "0.1.
|
18
|
+
version = "0.1.4"
|
19
19
|
|
20
20
|
dependencies {
|
21
21
|
compile "org.embulk:embulk-core:0.5.1"
|
22
22
|
provided "org.embulk:embulk-core:0.5.1"
|
23
23
|
|
24
|
-
compile "com.google.http-client:google-http-client-jackson2:1.
|
25
|
-
compile "com.google.apis:google-api-services-bigquery:v2-
|
24
|
+
compile "com.google.http-client:google-http-client-jackson2:1.20.0"
|
25
|
+
compile "com.google.apis:google-api-services-bigquery:v2-rev205-1.20.0"
|
26
26
|
|
27
27
|
testCompile "junit:junit:4.+"
|
28
28
|
}
|
@@ -5,6 +5,7 @@ import java.io.FileNotFoundException;
|
|
5
5
|
import java.io.FileOutputStream;
|
6
6
|
import java.io.BufferedOutputStream;
|
7
7
|
import java.io.IOException;
|
8
|
+
import java.security.NoSuchAlgorithmException;
|
8
9
|
import java.util.List;
|
9
10
|
import java.util.concurrent.TimeoutException;
|
10
11
|
import com.google.common.base.Optional;
|
@@ -96,6 +97,10 @@ public class BigqueryOutputPlugin
|
|
96
97
|
@ConfigDefault("null")
|
97
98
|
public Optional<String> getSchemaPath();
|
98
99
|
|
100
|
+
@Config("prevent_duplicate_insert")
|
101
|
+
@ConfigDefault("false")
|
102
|
+
public boolean getPreventDuplicateInsert();
|
103
|
+
|
99
104
|
@Config("job_status_max_polling_time")
|
100
105
|
@ConfigDefault("3600")
|
101
106
|
public int getJobStatusMaxPollingTime();
|
@@ -105,7 +110,7 @@ public class BigqueryOutputPlugin
|
|
105
110
|
public int getJobStatusPollingInterval();
|
106
111
|
|
107
112
|
@Config("is_skip_job_result_check")
|
108
|
-
@ConfigDefault("
|
113
|
+
@ConfigDefault("false")
|
109
114
|
public boolean getIsSkipJobResultCheck();
|
110
115
|
}
|
111
116
|
|
@@ -131,6 +136,7 @@ public class BigqueryOutputPlugin
|
|
131
136
|
.setFieldDelimiter(task.getFieldDelimiter())
|
132
137
|
.setMaxBadrecords(task.getMaxBadrecords())
|
133
138
|
.setEncoding(task.getEncoding())
|
139
|
+
.setPreventDuplicateInsert(task.getPreventDuplicateInsert())
|
134
140
|
.setJobStatusMaxPollingTime(task.getJobStatusMaxPollingTime())
|
135
141
|
.setJobStatusPollingInterval(task.getJobStatusPollingInterval())
|
136
142
|
.setIsSkipJobResultCheck(task.getIsSkipJobResultCheck())
|
@@ -233,7 +239,8 @@ public class BigqueryOutputPlugin
|
|
233
239
|
log.info(String.format("Delete local file [%s]", filePath));
|
234
240
|
file.delete();
|
235
241
|
}
|
236
|
-
} catch (
|
242
|
+
} catch (NoSuchAlgorithmException | TimeoutException | BigqueryWriter.JobFailedException | IOException ex) {
|
243
|
+
log.error(ex.getMessage());
|
237
244
|
throw Throwables.propagate(ex);
|
238
245
|
}
|
239
246
|
}
|
@@ -6,14 +6,18 @@ import java.io.FileNotFoundException;
|
|
6
6
|
import java.io.FileInputStream;
|
7
7
|
import java.io.BufferedInputStream;
|
8
8
|
import com.google.api.client.http.InputStreamContent;
|
9
|
+
import java.security.MessageDigest;
|
10
|
+
import java.security.NoSuchAlgorithmException;
|
9
11
|
import java.util.List;
|
10
12
|
import java.util.concurrent.TimeoutException;
|
11
13
|
import com.google.common.base.Optional;
|
14
|
+
import com.google.api.client.util.Base64;
|
12
15
|
import com.google.common.base.Throwables;
|
13
16
|
import java.security.GeneralSecurityException;
|
14
17
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
15
18
|
import com.fasterxml.jackson.core.type.TypeReference;
|
16
19
|
|
20
|
+
import org.apache.commons.codec.binary.Hex;
|
17
21
|
import org.embulk.spi.Exec;
|
18
22
|
import org.slf4j.Logger;
|
19
23
|
|
@@ -48,6 +52,7 @@ public class BigqueryWriter
|
|
48
52
|
private final String fieldDelimiter;
|
49
53
|
private final int maxBadrecords;
|
50
54
|
private final String encoding;
|
55
|
+
private final boolean preventDuplicateInsert;
|
51
56
|
private final long jobStatusMaxPollingTime;
|
52
57
|
private final long jobStatusPollingInterval;
|
53
58
|
private final boolean isSkipJobResultCheck;
|
@@ -64,6 +69,7 @@ public class BigqueryWriter
|
|
64
69
|
this.fieldDelimiter = builder.fieldDelimiter;
|
65
70
|
this.maxBadrecords = builder.maxBadrecords;
|
66
71
|
this.encoding = builder.encoding.toUpperCase();
|
72
|
+
this.preventDuplicateInsert = builder.preventDuplicateInsert;
|
67
73
|
this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
|
68
74
|
this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
|
69
75
|
this.isSkipJobResultCheck = builder.isSkipJobResultCheck;
|
@@ -91,7 +97,7 @@ public class BigqueryWriter
|
|
91
97
|
List<ErrorProto> errors = job.getStatus().getErrors();
|
92
98
|
if (errors != null) {
|
93
99
|
for (ErrorProto error : errors) {
|
94
|
-
log.
|
100
|
+
log.error(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
|
95
101
|
}
|
96
102
|
}
|
97
103
|
|
@@ -132,17 +138,24 @@ public class BigqueryWriter
|
|
132
138
|
}
|
133
139
|
}
|
134
140
|
|
135
|
-
public void executeLoad(String localFilePath) throws GoogleJsonResponseException,
|
141
|
+
public void executeLoad(String localFilePath) throws GoogleJsonResponseException, NoSuchAlgorithmException,
|
142
|
+
TimeoutException, JobFailedException, IOException
|
136
143
|
{
|
137
144
|
log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));
|
138
145
|
|
139
146
|
Job job = new Job();
|
140
|
-
JobReference jobRef =
|
147
|
+
JobReference jobRef = new JobReference();
|
141
148
|
JobConfiguration jobConfig = new JobConfiguration();
|
142
149
|
JobConfigurationLoad loadConfig = new JobConfigurationLoad();
|
143
150
|
jobConfig.setLoad(loadConfig);
|
144
151
|
job.setConfiguration(jobConfig);
|
145
152
|
|
153
|
+
if (preventDuplicateInsert) {
|
154
|
+
String jobId = createJobId(localFilePath);
|
155
|
+
jobRef.setJobId(jobId);
|
156
|
+
job.setJobReference(jobRef);
|
157
|
+
}
|
158
|
+
|
146
159
|
loadConfig.setAllowQuotedNewlines(false);
|
147
160
|
loadConfig.setEncoding(encoding);
|
148
161
|
loadConfig.setMaxBadRecords(maxBadrecords);
|
@@ -181,9 +194,8 @@ public class BigqueryWriter
|
|
181
194
|
|
182
195
|
try {
|
183
196
|
jobRef = insert.execute().getJobReference();
|
184
|
-
} catch (
|
185
|
-
|
186
|
-
throw Throwables.propagate(ex);
|
197
|
+
} catch (IllegalStateException ex) {
|
198
|
+
throw new JobFailedException(ex.getMessage());
|
187
199
|
}
|
188
200
|
log.info(String.format("Job executed. job id:[%s] file:[%s]", jobRef.getJobId(), localFilePath));
|
189
201
|
if (isSkipJobResultCheck) {
|
@@ -193,6 +205,25 @@ public class BigqueryWriter
|
|
193
205
|
}
|
194
206
|
}
|
195
207
|
|
208
|
+
private String createJobId(String localFilePath) throws NoSuchAlgorithmException, IOException
|
209
|
+
{
|
210
|
+
StringBuilder sb = new StringBuilder();
|
211
|
+
sb.append(getLocalMd5hash(localFilePath));
|
212
|
+
sb.append(dataset);
|
213
|
+
sb.append(table);
|
214
|
+
sb.append(tableSchema);
|
215
|
+
sb.append(sourceFormat);
|
216
|
+
sb.append(fieldDelimiter);
|
217
|
+
sb.append(maxBadrecords);
|
218
|
+
sb.append(encoding);
|
219
|
+
|
220
|
+
MessageDigest md = MessageDigest.getInstance("MD5");
|
221
|
+
String str = new String(sb);
|
222
|
+
byte[] digest = md.digest(str.getBytes());
|
223
|
+
String hash = new String(Hex.encodeHex(digest));
|
224
|
+
return "embulk_job_" + hash;
|
225
|
+
}
|
226
|
+
|
196
227
|
private TableReference createTableReference()
|
197
228
|
{
|
198
229
|
return new TableReference()
|
@@ -248,6 +279,28 @@ public class BigqueryWriter
|
|
248
279
|
}
|
249
280
|
}
|
250
281
|
|
282
|
+
private String getLocalMd5hash(String filePath) throws NoSuchAlgorithmException, IOException
|
283
|
+
{
|
284
|
+
FileInputStream stream = null;
|
285
|
+
try {
|
286
|
+
stream = new FileInputStream(filePath);
|
287
|
+
MessageDigest digest = MessageDigest.getInstance("MD5");
|
288
|
+
|
289
|
+
byte[] bytesBuffer = new byte[1024];
|
290
|
+
int bytesRead = -1;
|
291
|
+
|
292
|
+
while ((bytesRead = stream.read(bytesBuffer)) != -1) {
|
293
|
+
digest.update(bytesBuffer, 0, bytesRead);
|
294
|
+
}
|
295
|
+
byte[] hashedBytes = digest.digest();
|
296
|
+
|
297
|
+
byte[] encoded = (hashedBytes);
|
298
|
+
return new String(encoded);
|
299
|
+
} finally {
|
300
|
+
stream.close();
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
251
304
|
private class UploadProgressListener implements MediaHttpUploaderProgressListener
|
252
305
|
{
|
253
306
|
private String fileName;
|
@@ -291,6 +344,7 @@ public class BigqueryWriter
|
|
291
344
|
private String fieldDelimiter;
|
292
345
|
private int maxBadrecords;
|
293
346
|
private String encoding;
|
347
|
+
private boolean preventDuplicateInsert;
|
294
348
|
private int jobStatusMaxPollingTime;
|
295
349
|
private int jobStatusPollingInterval;
|
296
350
|
private boolean isSkipJobResultCheck;
|
@@ -372,6 +426,12 @@ public class BigqueryWriter
|
|
372
426
|
return this;
|
373
427
|
}
|
374
428
|
|
429
|
+
public Builder setPreventDuplicateInsert(boolean preventDuplicateInsert)
|
430
|
+
{
|
431
|
+
this.preventDuplicateInsert = preventDuplicateInsert;
|
432
|
+
return this;
|
433
|
+
}
|
434
|
+
|
375
435
|
public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
|
376
436
|
{
|
377
437
|
this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
|
@@ -396,7 +456,7 @@ public class BigqueryWriter
|
|
396
456
|
}
|
397
457
|
}
|
398
458
|
|
399
|
-
public class JobFailedException extends
|
459
|
+
public class JobFailedException extends RuntimeException
|
400
460
|
{
|
401
461
|
public JobFailedException(String message) {
|
402
462
|
super(message);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -62,12 +62,12 @@ files:
|
|
62
62
|
- src/test/java/org/embulk/output/TestBigqueryWriter.java
|
63
63
|
- classpath/commons-codec-1.3.jar
|
64
64
|
- classpath/commons-logging-1.1.1.jar
|
65
|
-
- classpath/embulk-output-bigquery-0.1.
|
66
|
-
- classpath/google-api-client-1.
|
67
|
-
- classpath/google-api-services-bigquery-v2-
|
68
|
-
- classpath/google-http-client-1.
|
69
|
-
- classpath/google-http-client-jackson2-1.
|
70
|
-
- classpath/google-oauth-client-1.
|
65
|
+
- classpath/embulk-output-bigquery-0.1.4.jar
|
66
|
+
- classpath/google-api-client-1.20.0.jar
|
67
|
+
- classpath/google-api-services-bigquery-v2-rev205-1.20.0.jar
|
68
|
+
- classpath/google-http-client-1.20.0.jar
|
69
|
+
- classpath/google-http-client-jackson2-1.20.0.jar
|
70
|
+
- classpath/google-oauth-client-1.20.0.jar
|
71
71
|
- classpath/guava-jdk5-13.0.jar
|
72
72
|
- classpath/httpclient-4.0.1.jar
|
73
73
|
- classpath/httpcore-4.0.1.jar
|