embulk-output-bigquery 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b0856b220a3d9c7b78dbffe45b35edf8e10b4fba
|
4
|
+
data.tar.gz: 8ce985e90cfd9aa9b88c6cb7ec994d6e89184584
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02c3432b3494df8ca2f1901dda75488380574a7047228434a800d1578975ce9b6abb0bd65a0a5fc95285077d6963d19160241ed96e2b42d28cd30920e8c66230
|
7
|
+
data.tar.gz: dd7aec6748550836b6195d5ee7d403ca126a52d194fe9a0a8e3bf7257bb7639931f3e6df7fde2908a1e8f610577b085ca30c50876a800040ff7b71aedb80e1f8
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,8 @@
|
|
1
|
+
## 0.3.5 - 2016-06-13
|
2
|
+
|
3
|
+
* [enhancement] retry backendError and internalError in waiting load job
|
4
|
+
* [enhancement] retry Broken pipe and Connection reset in inserting object to GCS
|
5
|
+
|
1
6
|
## 0.3.4 - 2016-06-01
|
2
7
|
|
3
8
|
* [new feature] Add `gcs_bucket` option to load multiple files from a GCS bucket with one load job
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.3.4"
|
3
|
+
spec.version = "0.3.5"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -40,59 +40,77 @@ module Embulk
|
|
40
40
|
self.fields
|
41
41
|
end
|
42
42
|
|
43
|
-
|
44
|
-
|
45
|
-
def load_from_gcs(object_uris, table)
|
43
|
+
def with_retry_job(&block)
|
44
|
+
retries = 0
|
46
45
|
begin
|
47
|
-
|
48
|
-
|
49
|
-
if
|
50
|
-
|
46
|
+
yield
|
47
|
+
rescue BackendError, InternalError => e
|
48
|
+
if retries < @task['retries']
|
49
|
+
retries += 1
|
50
|
+
Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.message}" }
|
51
|
+
retry
|
51
52
|
else
|
52
|
-
|
53
|
+
Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.message}" }
|
54
|
+
raise e
|
53
55
|
end
|
54
|
-
|
56
|
+
end
|
57
|
+
end
|
55
58
|
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
59
|
+
# @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
|
60
|
+
# @return [Array] responses
|
61
|
+
def load_from_gcs(object_uris, table)
|
62
|
+
with_retry_job do
|
63
|
+
begin
|
64
|
+
# As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
|
65
|
+
# we should generate job_id in client code, otherwise, retrying would cause duplication
|
66
|
+
if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
|
67
|
+
job_id = Helper.create_load_job_id(@task, path, fields)
|
68
|
+
else
|
69
|
+
job_id = "embulk_load_job_#{SecureRandom.uuid}"
|
70
|
+
end
|
71
|
+
Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
|
72
|
+
|
73
|
+
body = {
|
74
|
+
job_reference: {
|
75
|
+
project_id: @project,
|
76
|
+
job_id: job_id,
|
77
|
+
},
|
78
|
+
configuration: {
|
79
|
+
load: {
|
80
|
+
destination_table: {
|
81
|
+
project_id: @project,
|
82
|
+
dataset_id: @dataset,
|
83
|
+
table_id: table,
|
84
|
+
},
|
85
|
+
schema: {
|
86
|
+
fields: fields,
|
87
|
+
},
|
88
|
+
write_disposition: 'WRITE_APPEND',
|
89
|
+
source_format: @task['source_format'],
|
90
|
+
max_bad_records: @task['max_bad_records'],
|
91
|
+
field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
|
92
|
+
encoding: @task['encoding'],
|
93
|
+
ignore_unknown_values: @task['ignore_unknown_values'],
|
94
|
+
allow_quoted_newlines: @task['allow_quoted_newlines'],
|
95
|
+
source_uris: object_uris,
|
96
|
+
}
|
79
97
|
}
|
80
98
|
}
|
81
|
-
|
82
|
-
opts = {}
|
99
|
+
opts = {}
|
83
100
|
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
101
|
+
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
102
|
+
response = client.insert_job(@project, body, opts)
|
103
|
+
unless @task['is_skip_job_result_check']
|
104
|
+
response = wait_load('Load', response)
|
105
|
+
end
|
106
|
+
[response]
|
107
|
+
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
108
|
+
response = {status_code: e.status_code, message: e.message, error_class: e.class}
|
109
|
+
Embulk.logger.error {
|
110
|
+
"embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
|
111
|
+
}
|
112
|
+
raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
|
88
113
|
end
|
89
|
-
[response]
|
90
|
-
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
91
|
-
response = {status_code: e.status_code, message: e.message, error_class: e.class}
|
92
|
-
Embulk.logger.error {
|
93
|
-
"embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
|
94
|
-
}
|
95
|
-
raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
|
96
114
|
end
|
97
115
|
end
|
98
116
|
|
@@ -126,90 +144,93 @@ module Embulk
|
|
126
144
|
end
|
127
145
|
|
128
146
|
def load(path, table)
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
147
|
+
with_retry_job do
|
148
|
+
begin
|
149
|
+
if File.exist?(path)
|
150
|
+
# As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
|
151
|
+
# we should generate job_id in client code, otherwise, retrying would cause duplication
|
152
|
+
if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
|
153
|
+
job_id = Helper.create_load_job_id(@task, path, fields)
|
154
|
+
else
|
155
|
+
job_id = "embulk_load_job_#{SecureRandom.uuid}"
|
156
|
+
end
|
157
|
+
Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
|
135
158
|
else
|
136
|
-
|
159
|
+
Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
|
160
|
+
return
|
137
161
|
end
|
138
|
-
Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
|
139
|
-
else
|
140
|
-
Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
|
141
|
-
return
|
142
|
-
end
|
143
162
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
163
|
+
body = {
|
164
|
+
job_reference: {
|
165
|
+
project_id: @project,
|
166
|
+
job_id: job_id,
|
167
|
+
},
|
168
|
+
configuration: {
|
169
|
+
load: {
|
170
|
+
destination_table: {
|
171
|
+
project_id: @project,
|
172
|
+
dataset_id: @dataset,
|
173
|
+
table_id: table,
|
174
|
+
},
|
175
|
+
schema: {
|
176
|
+
fields: fields,
|
177
|
+
},
|
178
|
+
write_disposition: 'WRITE_APPEND',
|
179
|
+
source_format: @task['source_format'],
|
180
|
+
max_bad_records: @task['max_bad_records'],
|
181
|
+
field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
|
182
|
+
encoding: @task['encoding'],
|
183
|
+
ignore_unknown_values: @task['ignore_unknown_values'],
|
184
|
+
allow_quoted_newlines: @task['allow_quoted_newlines'],
|
185
|
+
}
|
166
186
|
}
|
167
187
|
}
|
168
|
-
}
|
169
188
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
189
|
+
opts = {
|
190
|
+
upload_source: path,
|
191
|
+
content_type: "application/octet-stream",
|
192
|
+
# options: {
|
193
|
+
# retries: @task['retries'],
|
194
|
+
# timeout_sec: @task['timeout_sec'],
|
195
|
+
# open_timeout_sec: @task['open_timeout_sec']
|
196
|
+
# },
|
197
|
+
}
|
198
|
+
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
199
|
+
response = client.insert_job(@project, body, opts)
|
200
|
+
unless @task['is_skip_job_result_check']
|
201
|
+
response = wait_load('Load', response)
|
202
|
+
end
|
203
|
+
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
204
|
+
response = {status_code: e.status_code, message: e.message, error_class: e.class}
|
205
|
+
Embulk.logger.error {
|
206
|
+
"embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
|
207
|
+
}
|
208
|
+
raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
|
183
209
|
end
|
184
|
-
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
185
|
-
response = {status_code: e.status_code, message: e.message, error_class: e.class}
|
186
|
-
Embulk.logger.error {
|
187
|
-
"embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
|
188
|
-
}
|
189
|
-
raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
|
190
210
|
end
|
191
211
|
end
|
192
212
|
|
193
213
|
def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
|
194
|
-
|
195
|
-
|
196
|
-
|
214
|
+
with_retry_job do
|
215
|
+
begin
|
216
|
+
destination_dataset ||= @dataset
|
217
|
+
job_id = "embulk_copy_job_#{SecureRandom.uuid}"
|
197
218
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
219
|
+
Embulk.logger.info {
|
220
|
+
"embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
|
221
|
+
"#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
|
222
|
+
}
|
202
223
|
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
224
|
+
body = {
|
225
|
+
job_reference: {
|
226
|
+
project_id: @project,
|
227
|
+
job_id: job_id,
|
228
|
+
},
|
229
|
+
configuration: {
|
230
|
+
copy: {
|
231
|
+
create_deposition: 'CREATE_IF_NEEDED',
|
232
|
+
write_disposition: write_disposition,
|
233
|
+
source_table: {
|
213
234
|
project_id: @project,
|
214
235
|
dataset_id: @dataset,
|
215
236
|
table_id: source_table,
|
@@ -219,21 +240,22 @@ module Embulk
|
|
219
240
|
dataset_id: destination_dataset,
|
220
241
|
table_id: destination_table,
|
221
242
|
},
|
243
|
+
}
|
222
244
|
}
|
223
245
|
}
|
224
|
-
}
|
225
246
|
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
247
|
+
opts = {}
|
248
|
+
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
249
|
+
response = client.insert_job(@project, body, opts)
|
250
|
+
wait_load('Copy', response)
|
251
|
+
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
252
|
+
response = {status_code: e.status_code, message: e.message, error_class: e.class}
|
253
|
+
Embulk.logger.error {
|
254
|
+
"embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
|
255
|
+
}
|
256
|
+
raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
|
257
|
+
"to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
|
258
|
+
end
|
237
259
|
end
|
238
260
|
end
|
239
261
|
|
@@ -273,11 +295,15 @@ module Embulk
|
|
273
295
|
# `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
|
274
296
|
# Otherwise, this returns nil.
|
275
297
|
if _errors = _response.status.errors
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
}
|
280
|
-
|
298
|
+
msg = "failed during waiting a #{kind} job, get_job(#{@project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
|
299
|
+
if _errors.any? {|error| error.reason == 'backendError' }
|
300
|
+
raise BackendError, msg
|
301
|
+
elsif _errors.any? {|error| error.reason == 'internalError' }
|
302
|
+
raise InternalError, msg
|
303
|
+
else
|
304
|
+
Embulk.logger.error { "embulk-output-bigquery: #{msg}" }
|
305
|
+
raise Error, msg
|
306
|
+
end
|
281
307
|
end
|
282
308
|
|
283
309
|
Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'uri'
|
2
|
+
require 'java'
|
2
3
|
require 'google/apis/storage_v1'
|
3
4
|
require_relative 'google_client'
|
4
5
|
require_relative 'helper'
|
@@ -49,6 +50,7 @@ module Embulk
|
|
49
50
|
object_uri = URI.join("gs://#{bucket}", object).to_s
|
50
51
|
|
51
52
|
started = Time.now
|
53
|
+
retries = 0
|
52
54
|
begin
|
53
55
|
Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
|
54
56
|
body = {
|
@@ -68,6 +70,21 @@ module Embulk
|
|
68
70
|
"embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
|
69
71
|
}
|
70
72
|
raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
|
73
|
+
rescue ::Java::Java.net.SocketException => e
|
74
|
+
# I encountered `java.net.SocketException: Broken pipe` and `Connection reset` serveral times
|
75
|
+
# I am doubting as this is caused by Google's unstable network
|
76
|
+
# google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
|
77
|
+
if e.message == 'Broken pipe' || e.message == 'Connection reset'
|
78
|
+
if retries < @task['retries']
|
79
|
+
response = {message: e.message, error_class: e.class}
|
80
|
+
Embulk.logger.warn {
|
81
|
+
"embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
|
82
|
+
}
|
83
|
+
retries += 1 # want to share with google-api-ruby-client, but it is difficult
|
84
|
+
retry
|
85
|
+
end
|
86
|
+
end
|
87
|
+
raise e
|
71
88
|
end
|
72
89
|
end
|
73
90
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.4
|
4
|
+
version: 0.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-06-
|
12
|
+
date: 2016-06-13 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google-api-client
|