embulk-output-bigquery 0.3.4 → 0.3.5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b0856b220a3d9c7b78dbffe45b35edf8e10b4fba
+  data.tar.gz: 8ce985e90cfd9aa9b88c6cb7ec994d6e89184584
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 02c3432b3494df8ca2f1901dda75488380574a7047228434a800d1578975ce9b6abb0bd65a0a5fc95285077d6963d19160241ed96e2b42d28cd30920e8c66230
+  data.tar.gz: dd7aec6748550836b6195d5ee7d403ca126a52d194fe9a0a8e3bf7257bb7639931f3e6df7fde2908a1e8f610577b085ca30c50876a800040ff7b71aedb80e1f8
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
+## 0.3.5 - 2016-06-13
+
+* [enhancement] retry backendError and internalError in waiting load job
+* [enhancement] retry Broken pipe and Connection reset in inserting object to GCS
+
 ## 0.3.4 - 2016-06-01
 
 * [new feature] Add `gcs_bucket` option to load multiple files from a GCS bucket with one load job
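Both 0.3.5 entries describe the same pattern: catch a narrow set of transient failures, retry up to a configured limit, and re-raise everything else. The sketch below is a minimal, self-contained illustration of that pattern, not the gem's code; the names (MAX_RETRIES, transient?, the demo block) are invented for the example, and the gem's real implementation appears in the diffs that follow.

# Sketch of the retry-on-transient-error pattern added in 0.3.5.
MAX_RETRIES = 3  # the gem reads this limit from @task['retries']

def transient?(error)
  # The gem retries 'backendError'/'internalError' job errors and
  # 'Broken pipe'/'Connection reset' socket errors; approximated here by message.
  error.message =~ /backendError|internalError|Broken pipe|Connection reset/
end

def with_retry
  retries = 0
  begin
    yield
  rescue StandardError => e
    raise unless transient?(e) && retries < MAX_RETRIES
    retries += 1
    warn "retry \##{retries}: #{e.message}"
    retry
  end
end

# Demo: fails twice with a transient error, then succeeds on the third attempt.
attempts = 0
with_retry do
  attempts += 1
  raise 'backendError' if attempts < 3
  puts "loaded after #{attempts} attempts"
end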
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.4"
+  spec.version = "0.3.5"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -40,59 +40,77 @@ module Embulk
         self.fields
       end
 
-      def load_from_gcs(object_uris, table)
+      def with_retry_job(&block)
+        retries = 0
         begin
-          if
+          yield
+        rescue BackendError, InternalError => e
+          if retries < @task['retries']
+            retries += 1
+            Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.message}" }
+            retry
           else
+            Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.message}" }
+            raise e
           end
+        end
+      end
 
+      # @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
+      # @return [Array] responses
+      def load_from_gcs(object_uris, table)
+        with_retry_job do
+          begin
+            # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+            # we should generate job_id in client code, otherwise, retrying would cause duplication
+            if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+              job_id = Helper.create_load_job_id(@task, path, fields)
+            else
+              job_id = "embulk_load_job_#{SecureRandom.uuid}"
+            end
+            Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+
+            body = {
+              job_reference: {
+                project_id: @project,
+                job_id: job_id,
+              },
+              configuration: {
+                load: {
+                  destination_table: {
+                    project_id: @project,
+                    dataset_id: @dataset,
+                    table_id: table,
+                  },
+                  schema: {
+                    fields: fields,
+                  },
+                  write_disposition: 'WRITE_APPEND',
+                  source_format: @task['source_format'],
+                  max_bad_records: @task['max_bad_records'],
+                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                  encoding: @task['encoding'],
+                  ignore_unknown_values: @task['ignore_unknown_values'],
+                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+                  source_uris: object_uris,
+                }
               }
             }
-          opts = {}
+            opts = {}
 
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            unless @task['is_skip_job_result_check']
+              response = wait_load('Load', response)
+            end
+            [response]
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
           end
-        [response]
-      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-        response = {status_code: e.status_code, message: e.message, error_class: e.class}
-        Embulk.logger.error {
-          "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-        }
-        raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
         end
       end
 
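The comment in the hunk above explains why the job ID is generated client-side: BigQuery deduplicates jobs by ID, so a deterministic ID makes a retried insert_job idempotent, while a random ID turns every retry into a brand-new load. The standalone sketch below only illustrates that distinction; the MD5 recipe and function names are assumptions for the example, not the gem's actual Helper.create_load_job_id.

require 'digest/md5'
require 'securerandom'

# Deterministic: the same inputs always map to the same job ID, so retrying
# the same load re-submits the same job and BigQuery rejects the duplicate
# instead of loading the rows twice.
def deterministic_job_id(path, fields)
  "embulk_load_job_#{Digest::MD5.hexdigest([path, fields].inspect)}"
end

# Random: every attempt is a new job; a retry after a lost response whose
# first attempt actually succeeded server-side would load the rows twice.
def random_job_id
  "embulk_load_job_#{SecureRandom.uuid}"
end

fields = [{name: 'id', type: 'INTEGER'}]
puts deterministic_job_id('/tmp/part-0.csv', fields)  # stable across retries
puts random_job_id                                    # different every call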
@@ -126,90 +144,93 @@ module Embulk
       end
 
       def load(path, table)
+        with_retry_job do
+          begin
+            if File.exist?(path)
+              # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+              # we should generate job_id in client code, otherwise, retrying would cause duplication
+              if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+                job_id = Helper.create_load_job_id(@task, path, fields)
+              else
+                job_id = "embulk_load_job_#{SecureRandom.uuid}"
+              end
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
             else
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
+              return
             end
-          Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
-        else
-          Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
-          return
-        end
 
+            body = {
+              job_reference: {
+                project_id: @project,
+                job_id: job_id,
+              },
+              configuration: {
+                load: {
+                  destination_table: {
+                    project_id: @project,
+                    dataset_id: @dataset,
+                    table_id: table,
+                  },
+                  schema: {
+                    fields: fields,
+                  },
+                  write_disposition: 'WRITE_APPEND',
+                  source_format: @task['source_format'],
+                  max_bad_records: @task['max_bad_records'],
+                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                  encoding: @task['encoding'],
+                  ignore_unknown_values: @task['ignore_unknown_values'],
+                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+                }
               }
             }
-          }
 
+            opts = {
+              upload_source: path,
+              content_type: "application/octet-stream",
+              # options: {
+              #   retries: @task['retries'],
+              #   timeout_sec: @task['timeout_sec'],
+              #   open_timeout_sec: @task['open_timeout_sec']
+              # },
+            }
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            unless @task['is_skip_job_result_check']
+              response = wait_load('Load', response)
+            end
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
           end
-      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-        response = {status_code: e.status_code, message: e.message, error_class: e.class}
-        Embulk.logger.error {
-          "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-        }
-        raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
         end
       end
 
       def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
+        with_retry_job do
+          begin
+            destination_dataset ||= @dataset
+            job_id = "embulk_copy_job_#{SecureRandom.uuid}"
 
+            Embulk.logger.info {
+              "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
+              "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
+            }
 
+            body = {
+              job_reference: {
+                project_id: @project,
+                job_id: job_id,
+              },
+              configuration: {
+                copy: {
+                  create_deposition: 'CREATE_IF_NEEDED',
+                  write_disposition: write_disposition,
+                  source_table: {
                     project_id: @project,
                     dataset_id: @dataset,
                     table_id: source_table,
@@ -219,21 +240,22 @@ module Embulk
                     dataset_id: destination_dataset,
                     table_id: destination_table,
                   },
+                }
               }
             }
-          }
 
+            opts = {}
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            wait_load('Copy', response)
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
+                         "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+          end
         end
       end
 
@@ -273,11 +295,15 @@ module Embulk
           # `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
           # Otherwise, this returns nil.
           if _errors = _response.status.errors
-            }
+            msg = "failed during waiting a #{kind} job, get_job(#{@project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
+            if _errors.any? {|error| error.reason == 'backendError' }
+              raise BackendError, msg
+            elsif _errors.any? {|error| error.reason == 'internalError' }
+              raise InternalError, msg
+            else
+              Embulk.logger.error { "embulk-output-bigquery: #{msg}" }
+              raise Error, msg
+            end
           end
 
           Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
@@ -1,4 +1,5 @@
 require 'uri'
+require 'java'
 require 'google/apis/storage_v1'
 require_relative 'google_client'
 require_relative 'helper'
@@ -49,6 +50,7 @@ module Embulk
           object_uri = URI.join("gs://#{bucket}", object).to_s
 
           started = Time.now
+          retries = 0
           begin
             Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
             body = {
@@ -68,6 +70,21 @@ module Embulk
               "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
             }
             raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
+          rescue ::Java::Java.net.SocketException => e
+            # I encountered `java.net.SocketException: Broken pipe` and `Connection reset` serveral times
+            # I am doubting as this is caused by Google's unstable network
+            # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
+            if e.message == 'Broken pipe' || e.message == 'Connection reset'
+              if retries < @task['retries']
+                response = {message: e.message, error_class: e.class}
+                Embulk.logger.warn {
+                  "embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
+                }
+                retries += 1 # want to share with google-api-ruby-client, but it is difficult
+                retry
+              end
+            end
+            raise e
           end
         end
 
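The `::Java::Java.net.SocketException` rescue in the hunk above relies on JRuby's Java integration: the `Java::` namespace exposes Java packages, so the rescue clause matches the exception the underlying HTTP transport throws. Below is a JRuby-only sketch of the same rescue-by-message pattern; the raise is simulated for the demo, and this will not run under MRI, where `require 'java'` is unavailable.

require 'java'

begin
  # Simulate the transport failure the gem sees in production.
  raise java.net.SocketException.new('Broken pipe')
rescue ::Java::Java.net.SocketException => e
  if ['Broken pipe', 'Connection reset'].include?(e.message)
    puts "transient socket error (#{e.message}), would retry"
  else
    raise
  end
end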
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-06-01 00:00:00.000000000 Z
+date: 2016-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client