embulk-output-bigquery 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +17 -0
- data/build.gradle +3 -3
- data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +9 -2
- data/src/main/java/org/embulk/output/BigqueryWriter.java +67 -7
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37643ccea137f84b59a056d75826b54c8548c0b6
|
4
|
+
data.tar.gz: 1eafbf6ab69c81039c22712cda93ceb099ec9d92
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3f8e1a7b70fb1a71060338b5aa2c4676a02ee56289fe5f0470fda09081ae2cec3b5c4fece89993beab298b43fb3769b3db8d28b485a4abacca40b57fdb6f3759
|
7
|
+
data.tar.gz: d4b1aab992230a263642b6eb0cac08d1981d6cf54728ef76fec84da323888172ba6c6ed891f1cb7914f95dc9887ee21856d43e0fb044fa3c8e1aafd7cc8ab428
|
data/README.md
CHANGED
@@ -36,6 +36,7 @@ OAuth flow for installed applications.
|
|
36
36
|
- **table**: table name (string, required)
|
37
37
|
- **auto_create_table**: (boolean, optional default is 0)
|
38
38
|
- **schema_path**: (string, optional)
|
39
|
+
- **prevent_duplicate_insert**: (boolean, optional default is 0)
|
39
40
|
- **application_name**: application name anything you like (string, optional)
|
40
41
|
- **delete_from_local_when_job_end**: (boolean, optional, default is 0)
|
41
42
|
- **job_status_max_polling_time**: max job status polling time. (int, optional, default is 3600 sec)
|
@@ -120,6 +121,22 @@ out:
|
|
120
121
|
schema_path: /path/to/schema.json
|
121
122
|
```
|
122
123
|
|
124
|
+
### Data Consistency
|
125
|
+
|
126
|
+
When `prevent_duplicate_insert` is set to true, embulk-output-bigquery generates a job ID from the md5 hash of the file and other options to prevent duplicate data insertion.
|
127
|
+
|
128
|
+
`job ID = md5(md5(file) + dataset + table + schema + source_format + file_delimiter + max_bad_records + encoding)`
|
129
|
+
|
130
|
+
[job ID must be unique (including failures)](https://cloud.google.com/bigquery/loading-data-into-bigquery#consistency). So the same data can't be inserted with the same settings.
|
131
|
+
|
132
|
+
In other words, you can retry as many times as you like, in case a bad error (like a network error) happens before job insertion.
|
133
|
+
|
134
|
+
```yaml
|
135
|
+
out:
|
136
|
+
type: bigquery
|
137
|
+
prevent_duplicate_insert: true
|
138
|
+
```
|
139
|
+
|
123
140
|
## Build
|
124
141
|
|
125
142
|
```
|
data/build.gradle
CHANGED
@@ -15,14 +15,14 @@ configurations {
|
|
15
15
|
sourceCompatibility = 1.7
|
16
16
|
targetCompatibility = 1.7
|
17
17
|
|
18
|
-
version = "0.1.3"
|
18
|
+
version = "0.1.4"
|
19
19
|
|
20
20
|
dependencies {
|
21
21
|
compile "org.embulk:embulk-core:0.5.1"
|
22
22
|
provided "org.embulk:embulk-core:0.5.1"
|
23
23
|
|
24
|
-
compile "com.google.http-client:google-http-client-jackson2:1.19.0"
|
25
|
-
compile "com.google.apis:google-api-services-bigquery:v2-rev193-1.19.1"
|
24
|
+
compile "com.google.http-client:google-http-client-jackson2:1.20.0"
|
25
|
+
compile "com.google.apis:google-api-services-bigquery:v2-rev205-1.20.0"
|
26
26
|
|
27
27
|
testCompile "junit:junit:4.+"
|
28
28
|
}
|
@@ -5,6 +5,7 @@ import java.io.FileNotFoundException;
|
|
5
5
|
import java.io.FileOutputStream;
|
6
6
|
import java.io.BufferedOutputStream;
|
7
7
|
import java.io.IOException;
|
8
|
+
import java.security.NoSuchAlgorithmException;
|
8
9
|
import java.util.List;
|
9
10
|
import java.util.concurrent.TimeoutException;
|
10
11
|
import com.google.common.base.Optional;
|
@@ -96,6 +97,10 @@ public class BigqueryOutputPlugin
|
|
96
97
|
@ConfigDefault("null")
|
97
98
|
public Optional<String> getSchemaPath();
|
98
99
|
|
100
|
+
@Config("prevent_duplicate_insert")
|
101
|
+
@ConfigDefault("false")
|
102
|
+
public boolean getPreventDuplicateInsert();
|
103
|
+
|
99
104
|
@Config("job_status_max_polling_time")
|
100
105
|
@ConfigDefault("3600")
|
101
106
|
public int getJobStatusMaxPollingTime();
|
@@ -105,7 +110,7 @@ public class BigqueryOutputPlugin
|
|
105
110
|
public int getJobStatusPollingInterval();
|
106
111
|
|
107
112
|
@Config("is_skip_job_result_check")
|
108
|
-
@ConfigDefault("0")
|
113
|
+
@ConfigDefault("false")
|
109
114
|
public boolean getIsSkipJobResultCheck();
|
110
115
|
}
|
111
116
|
|
@@ -131,6 +136,7 @@ public class BigqueryOutputPlugin
|
|
131
136
|
.setFieldDelimiter(task.getFieldDelimiter())
|
132
137
|
.setMaxBadrecords(task.getMaxBadrecords())
|
133
138
|
.setEncoding(task.getEncoding())
|
139
|
+
.setPreventDuplicateInsert(task.getPreventDuplicateInsert())
|
134
140
|
.setJobStatusMaxPollingTime(task.getJobStatusMaxPollingTime())
|
135
141
|
.setJobStatusPollingInterval(task.getJobStatusPollingInterval())
|
136
142
|
.setIsSkipJobResultCheck(task.getIsSkipJobResultCheck())
|
@@ -233,7 +239,8 @@ public class BigqueryOutputPlugin
|
|
233
239
|
log.info(String.format("Delete local file [%s]", filePath));
|
234
240
|
file.delete();
|
235
241
|
}
|
236
|
-
} catch (TimeoutException | BigqueryWriter.JobFailedException | IOException ex) {
|
242
|
+
} catch (NoSuchAlgorithmException | TimeoutException | BigqueryWriter.JobFailedException | IOException ex) {
|
243
|
+
log.error(ex.getMessage());
|
237
244
|
throw Throwables.propagate(ex);
|
238
245
|
}
|
239
246
|
}
|
@@ -6,14 +6,18 @@ import java.io.FileNotFoundException;
|
|
6
6
|
import java.io.FileInputStream;
|
7
7
|
import java.io.BufferedInputStream;
|
8
8
|
import com.google.api.client.http.InputStreamContent;
|
9
|
+
import java.security.MessageDigest;
|
10
|
+
import java.security.NoSuchAlgorithmException;
|
9
11
|
import java.util.List;
|
10
12
|
import java.util.concurrent.TimeoutException;
|
11
13
|
import com.google.common.base.Optional;
|
14
|
+
import com.google.api.client.util.Base64;
|
12
15
|
import com.google.common.base.Throwables;
|
13
16
|
import java.security.GeneralSecurityException;
|
14
17
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
15
18
|
import com.fasterxml.jackson.core.type.TypeReference;
|
16
19
|
|
20
|
+
import org.apache.commons.codec.binary.Hex;
|
17
21
|
import org.embulk.spi.Exec;
|
18
22
|
import org.slf4j.Logger;
|
19
23
|
|
@@ -48,6 +52,7 @@ public class BigqueryWriter
|
|
48
52
|
private final String fieldDelimiter;
|
49
53
|
private final int maxBadrecords;
|
50
54
|
private final String encoding;
|
55
|
+
private final boolean preventDuplicateInsert;
|
51
56
|
private final long jobStatusMaxPollingTime;
|
52
57
|
private final long jobStatusPollingInterval;
|
53
58
|
private final boolean isSkipJobResultCheck;
|
@@ -64,6 +69,7 @@ public class BigqueryWriter
|
|
64
69
|
this.fieldDelimiter = builder.fieldDelimiter;
|
65
70
|
this.maxBadrecords = builder.maxBadrecords;
|
66
71
|
this.encoding = builder.encoding.toUpperCase();
|
72
|
+
this.preventDuplicateInsert = builder.preventDuplicateInsert;
|
67
73
|
this.jobStatusMaxPollingTime = builder.jobStatusMaxPollingTime;
|
68
74
|
this.jobStatusPollingInterval = builder.jobStatusPollingInterval;
|
69
75
|
this.isSkipJobResultCheck = builder.isSkipJobResultCheck;
|
@@ -91,7 +97,7 @@ public class BigqueryWriter
|
|
91
97
|
List<ErrorProto> errors = job.getStatus().getErrors();
|
92
98
|
if (errors != null) {
|
93
99
|
for (ErrorProto error : errors) {
|
94
|
-
log.
|
100
|
+
log.error(String.format("Error: job id:[%s] reason[%s][%s] location:[%s]", jobRef.getJobId(), error.getReason(), error.getMessage(), error.getLocation()));
|
95
101
|
}
|
96
102
|
}
|
97
103
|
|
@@ -132,17 +138,24 @@ public class BigqueryWriter
|
|
132
138
|
}
|
133
139
|
}
|
134
140
|
|
135
|
-
public void executeLoad(String localFilePath) throws GoogleJsonResponseException, TimeoutException, JobFailedException, IOException
|
141
|
+
public void executeLoad(String localFilePath) throws GoogleJsonResponseException, NoSuchAlgorithmException,
|
142
|
+
TimeoutException, JobFailedException, IOException
|
136
143
|
{
|
137
144
|
log.info(String.format("Job preparing... project:%s dataset:%s table:%s", project, dataset, table));
|
138
145
|
|
139
146
|
Job job = new Job();
|
140
|
-
JobReference jobRef = null;
|
147
|
+
JobReference jobRef = new JobReference();
|
141
148
|
JobConfiguration jobConfig = new JobConfiguration();
|
142
149
|
JobConfigurationLoad loadConfig = new JobConfigurationLoad();
|
143
150
|
jobConfig.setLoad(loadConfig);
|
144
151
|
job.setConfiguration(jobConfig);
|
145
152
|
|
153
|
+
if (preventDuplicateInsert) {
|
154
|
+
String jobId = createJobId(localFilePath);
|
155
|
+
jobRef.setJobId(jobId);
|
156
|
+
job.setJobReference(jobRef);
|
157
|
+
}
|
158
|
+
|
146
159
|
loadConfig.setAllowQuotedNewlines(false);
|
147
160
|
loadConfig.setEncoding(encoding);
|
148
161
|
loadConfig.setMaxBadRecords(maxBadrecords);
|
@@ -181,9 +194,8 @@ public class BigqueryWriter
|
|
181
194
|
|
182
195
|
try {
|
183
196
|
jobRef = insert.execute().getJobReference();
|
184
|
-
} catch (
|
185
|
-
|
186
|
-
throw Throwables.propagate(ex);
|
197
|
+
} catch (IllegalStateException ex) {
|
198
|
+
throw new JobFailedException(ex.getMessage());
|
187
199
|
}
|
188
200
|
log.info(String.format("Job executed. job id:[%s] file:[%s]", jobRef.getJobId(), localFilePath));
|
189
201
|
if (isSkipJobResultCheck) {
|
@@ -193,6 +205,25 @@ public class BigqueryWriter
|
|
193
205
|
}
|
194
206
|
}
|
195
207
|
|
208
|
+
private String createJobId(String localFilePath) throws NoSuchAlgorithmException, IOException
|
209
|
+
{
|
210
|
+
StringBuilder sb = new StringBuilder();
|
211
|
+
sb.append(getLocalMd5hash(localFilePath));
|
212
|
+
sb.append(dataset);
|
213
|
+
sb.append(table);
|
214
|
+
sb.append(tableSchema);
|
215
|
+
sb.append(sourceFormat);
|
216
|
+
sb.append(fieldDelimiter);
|
217
|
+
sb.append(maxBadrecords);
|
218
|
+
sb.append(encoding);
|
219
|
+
|
220
|
+
MessageDigest md = MessageDigest.getInstance("MD5");
|
221
|
+
String str = new String(sb);
|
222
|
+
byte[] digest = md.digest(str.getBytes());
|
223
|
+
String hash = new String(Hex.encodeHex(digest));
|
224
|
+
return "embulk_job_" + hash;
|
225
|
+
}
|
226
|
+
|
196
227
|
private TableReference createTableReference()
|
197
228
|
{
|
198
229
|
return new TableReference()
|
@@ -248,6 +279,28 @@ public class BigqueryWriter
|
|
248
279
|
}
|
249
280
|
}
|
250
281
|
|
282
|
+
private String getLocalMd5hash(String filePath) throws NoSuchAlgorithmException, IOException
|
283
|
+
{
|
284
|
+
FileInputStream stream = null;
|
285
|
+
try {
|
286
|
+
stream = new FileInputStream(filePath);
|
287
|
+
MessageDigest digest = MessageDigest.getInstance("MD5");
|
288
|
+
|
289
|
+
byte[] bytesBuffer = new byte[1024];
|
290
|
+
int bytesRead = -1;
|
291
|
+
|
292
|
+
while ((bytesRead = stream.read(bytesBuffer)) != -1) {
|
293
|
+
digest.update(bytesBuffer, 0, bytesRead);
|
294
|
+
}
|
295
|
+
byte[] hashedBytes = digest.digest();
|
296
|
+
|
297
|
+
byte[] encoded = (hashedBytes);
|
298
|
+
return new String(encoded);
|
299
|
+
} finally {
|
300
|
+
stream.close();
|
301
|
+
}
|
302
|
+
}
|
303
|
+
|
251
304
|
private class UploadProgressListener implements MediaHttpUploaderProgressListener
|
252
305
|
{
|
253
306
|
private String fileName;
|
@@ -291,6 +344,7 @@ public class BigqueryWriter
|
|
291
344
|
private String fieldDelimiter;
|
292
345
|
private int maxBadrecords;
|
293
346
|
private String encoding;
|
347
|
+
private boolean preventDuplicateInsert;
|
294
348
|
private int jobStatusMaxPollingTime;
|
295
349
|
private int jobStatusPollingInterval;
|
296
350
|
private boolean isSkipJobResultCheck;
|
@@ -372,6 +426,12 @@ public class BigqueryWriter
|
|
372
426
|
return this;
|
373
427
|
}
|
374
428
|
|
429
|
+
public Builder setPreventDuplicateInsert(boolean preventDuplicateInsert)
|
430
|
+
{
|
431
|
+
this.preventDuplicateInsert = preventDuplicateInsert;
|
432
|
+
return this;
|
433
|
+
}
|
434
|
+
|
375
435
|
public Builder setJobStatusMaxPollingTime(int jobStatusMaxPollingTime)
|
376
436
|
{
|
377
437
|
this.jobStatusMaxPollingTime = jobStatusMaxPollingTime;
|
@@ -396,7 +456,7 @@ public class BigqueryWriter
|
|
396
456
|
}
|
397
457
|
}
|
398
458
|
|
399
|
-
public class JobFailedException extends Exception
|
459
|
+
public class JobFailedException extends RuntimeException
|
400
460
|
{
|
401
461
|
public JobFailedException(String message) {
|
402
462
|
super(message);
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -62,12 +62,12 @@ files:
|
|
62
62
|
- src/test/java/org/embulk/output/TestBigqueryWriter.java
|
63
63
|
- classpath/commons-codec-1.3.jar
|
64
64
|
- classpath/commons-logging-1.1.1.jar
|
65
|
-
- classpath/embulk-output-bigquery-0.1.3.jar
|
66
|
-
- classpath/google-api-client-1.
|
67
|
-
- classpath/google-api-services-bigquery-v2-
|
68
|
-
- classpath/google-http-client-1.
|
69
|
-
- classpath/google-http-client-jackson2-1.
|
70
|
-
- classpath/google-oauth-client-1.
|
65
|
+
- classpath/embulk-output-bigquery-0.1.4.jar
|
66
|
+
- classpath/google-api-client-1.20.0.jar
|
67
|
+
- classpath/google-api-services-bigquery-v2-rev205-1.20.0.jar
|
68
|
+
- classpath/google-http-client-1.20.0.jar
|
69
|
+
- classpath/google-http-client-jackson2-1.20.0.jar
|
70
|
+
- classpath/google-oauth-client-1.20.0.jar
|
71
71
|
- classpath/guava-jdk5-13.0.jar
|
72
72
|
- classpath/httpclient-4.0.1.jar
|
73
73
|
- classpath/httpcore-4.0.1.jar
|