embulk-output-bigquery 0.3.4 → 0.3.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f68ceb57a4eff6886157c585425526389623d0a2
-  data.tar.gz: b44323059a3057bb5de7fdd7b00d61ce970f3386
+  metadata.gz: b0856b220a3d9c7b78dbffe45b35edf8e10b4fba
+  data.tar.gz: 8ce985e90cfd9aa9b88c6cb7ec994d6e89184584
 SHA512:
-  metadata.gz: 5cc7b1245bda2ae8c5d581c67a09ce0685c7812658c3c47e195362290fd50c13abfb7a3e9bb2360bc01a6d6aa82009ce190bef667cfb1df2cddaeb653c162c14
-  data.tar.gz: 4f8611f292a61750568c7b15e7ae6f83bc83d09ae3f64b2359b8f6f4e4d4b7ac115e09e6e8fbb5cc2e98b89103b8d1aba0640e0d89b035dbab2e5feea0d47449
+  metadata.gz: 02c3432b3494df8ca2f1901dda75488380574a7047228434a800d1578975ce9b6abb0bd65a0a5fc95285077d6963d19160241ed96e2b42d28cd30920e8c66230
+  data.tar.gz: dd7aec6748550836b6195d5ee7d403ca126a52d194fe9a0a8e3bf7257bb7639931f3e6df7fde2908a1e8f610577b085ca30c50876a800040ff7b71aedb80e1f8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+## 0.3.5 - 2016-06-13
+
+* [enhancement] Retry `backendError` and `internalError` while waiting for a load job
+* [enhancement] Retry `Broken pipe` and `Connection reset` errors when inserting an object into GCS
+
 ## 0.3.4 - 2016-06-01
 
 * [new feature] Add `gcs_bucket` option to load multiple files from a GCS bucket with one load job
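
Both retry enhancements draw on the plugin's existing `retries` option for their retry budget. A minimal Embulk config sketch (the project, dataset, and table values are placeholders):

    out:
      type: bigquery
      mode: append
      project: your-project-id
      dataset: your_dataset
      table: your_table
      retries: 5  # budget now also used for backendError/internalError and GCS socket errors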
data/embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.4"
+  spec.version = "0.3.5"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that inserts records to Google BigQuery."
data/lib/embulk/output/bigquery/bigquery_client.rb CHANGED
@@ -40,59 +40,77 @@ module Embulk
         self.fields
       end
 
-      # @params gcs_paths [Array] array of gcs paths such as gs://bucket/path
-      # @return [Array] responses
-      def load_from_gcs(object_uris, table)
+      def with_retry_job(&block)
+        retries = 0
         begin
-          # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
-          # we should generate job_id in client code, otherwise, retrying would cause duplication
-          if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-            job_id = Helper.create_load_job_id(@task, path, fields)
+          yield
+        rescue BackendError, InternalError => e
+          if retries < @task['retries']
+            retries += 1
+            Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.message}" }
+            retry
          else
-            job_id = "embulk_load_job_#{SecureRandom.uuid}"
+            Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.message}" }
+            raise e
          end
-          Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+        end
+      end
 
-          body = {
-            job_reference: {
-              project_id: @project,
-              job_id: job_id,
-            },
-            configuration: {
-              load: {
-                destination_table: {
-                  project_id: @project,
-                  dataset_id: @dataset,
-                  table_id: table,
-                },
-                schema: {
-                  fields: fields,
-                },
-                write_disposition: 'WRITE_APPEND',
-                source_format: @task['source_format'],
-                max_bad_records: @task['max_bad_records'],
-                field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
-                encoding: @task['encoding'],
-                ignore_unknown_values: @task['ignore_unknown_values'],
-                allow_quoted_newlines: @task['allow_quoted_newlines'],
-                source_uris: object_uris,
+      # @params gcs_paths [Array] array of gcs paths such as gs://bucket/path
+      # @return [Array] responses
+      def load_from_gcs(object_uris, table)
+        with_retry_job do
+          begin
+            # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+            # we should generate job_id in client code, otherwise, retrying would cause duplication
+            if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+              job_id = Helper.create_load_job_id(@task, path, fields)
+            else
+              job_id = "embulk_load_job_#{SecureRandom.uuid}"
+            end
+            Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+
+            body = {
+              job_reference: {
+                project_id: @project,
+                job_id: job_id,
+              },
+              configuration: {
+                load: {
+                  destination_table: {
+                    project_id: @project,
+                    dataset_id: @dataset,
+                    table_id: table,
+                  },
+                  schema: {
+                    fields: fields,
+                  },
+                  write_disposition: 'WRITE_APPEND',
+                  source_format: @task['source_format'],
+                  max_bad_records: @task['max_bad_records'],
+                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                  encoding: @task['encoding'],
+                  ignore_unknown_values: @task['ignore_unknown_values'],
+                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+                  source_uris: object_uris,
+                }
              }
            }
-          }
-          opts = {}
+            opts = {}
 
-          Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-          response = client.insert_job(@project, body, opts)
-          unless @task['is_skip_job_result_check']
-            response = wait_load('Load', response)
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            unless @task['is_skip_job_result_check']
+              response = wait_load('Load', response)
+            end
+            [response]
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
          end
-          [response]
-        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-          response = {status_code: e.status_code, message: e.message, error_class: e.class}
-          Embulk.logger.error {
-            "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-          }
-          raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
        end
      end
 
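The job_id comment above carries the reasoning behind safe retries: BigQuery deduplicates jobs by job id, so a retry that resubmits the same id cannot double-insert data. The gem's real implementation is `Helper.create_load_job_id`; the following is only a minimal sketch of the idea, and the hashed inputs are assumptions for illustration:

    require 'digest/sha1'

    # Hypothetical stand-in for Helper.create_load_job_id: hash everything
    # that identifies the load, so an identical retry yields an identical
    # job id and BigQuery rejects the duplicate submission.
    def deterministic_load_job_id(task, path, fields)
      sig = Digest::SHA1.hexdigest([
        task['project'], task['dataset'], task['table'],
        path, fields.to_s,
      ].join("\n"))
      "embulk_load_job_#{sig}"
    end
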
@@ -126,90 +144,93 @@ module Embulk
       end
 
       def load(path, table)
-        begin
-          if File.exist?(path)
-            # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
-            # we should generate job_id in client code, otherwise, retrying would cause duplication
-            if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-              job_id = Helper.create_load_job_id(@task, path, fields)
+        with_retry_job do
+          begin
+            if File.exist?(path)
+              # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+              # we should generate job_id in client code, otherwise, retrying would cause duplication
+              if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+                job_id = Helper.create_load_job_id(@task, path, fields)
+              else
+                job_id = "embulk_load_job_#{SecureRandom.uuid}"
+              end
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
            else
-              job_id = "embulk_load_job_#{SecureRandom.uuid}"
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
+              return
            end
-            Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
-          else
-            Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
-            return
-          end
 
-          body = {
-            job_reference: {
-              project_id: @project,
-              job_id: job_id,
-            },
-            configuration: {
-              load: {
-                destination_table: {
-                  project_id: @project,
-                  dataset_id: @dataset,
-                  table_id: table,
-                },
-                schema: {
-                  fields: fields,
-                },
-                write_disposition: 'WRITE_APPEND',
-                source_format: @task['source_format'],
-                max_bad_records: @task['max_bad_records'],
-                field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
-                encoding: @task['encoding'],
-                ignore_unknown_values: @task['ignore_unknown_values'],
-                allow_quoted_newlines: @task['allow_quoted_newlines'],
+            body = {
+              job_reference: {
+                project_id: @project,
+                job_id: job_id,
+              },
+              configuration: {
+                load: {
+                  destination_table: {
+                    project_id: @project,
+                    dataset_id: @dataset,
+                    table_id: table,
+                  },
+                  schema: {
+                    fields: fields,
+                  },
+                  write_disposition: 'WRITE_APPEND',
+                  source_format: @task['source_format'],
+                  max_bad_records: @task['max_bad_records'],
+                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                  encoding: @task['encoding'],
+                  ignore_unknown_values: @task['ignore_unknown_values'],
+                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+                }
              }
            }
-          }
 
-          opts = {
-            upload_source: path,
-            content_type: "application/octet-stream",
-            # options: {
-            #   retries: @task['retries'],
-            #   timeout_sec: @task['timeout_sec'],
-            #   open_timeout_sec: @task['open_timeout_sec']
-            # },
-          }
-          Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-          response = client.insert_job(@project, body, opts)
-          unless @task['is_skip_job_result_check']
-            response = wait_load('Load', response)
+            opts = {
+              upload_source: path,
+              content_type: "application/octet-stream",
+              # options: {
+              #   retries: @task['retries'],
+              #   timeout_sec: @task['timeout_sec'],
+              #   open_timeout_sec: @task['open_timeout_sec']
+              # },
+            }
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            unless @task['is_skip_job_result_check']
+              response = wait_load('Load', response)
+            end
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
          end
-        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-          response = {status_code: e.status_code, message: e.message, error_class: e.class}
-          Embulk.logger.error {
-            "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-          }
-          raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
        end
      end
 
      def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
-        begin
-          destination_dataset ||= @dataset
-          job_id = "embulk_copy_job_#{SecureRandom.uuid}"
+        with_retry_job do
+          begin
+            destination_dataset ||= @dataset
+            job_id = "embulk_copy_job_#{SecureRandom.uuid}"
 
-          Embulk.logger.info {
-            "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
-            "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
-          }
+            Embulk.logger.info {
+              "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
+              "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
+            }
 
-          body = {
-            job_reference: {
-              project_id: @project,
-              job_id: job_id,
-            },
-            configuration: {
-              copy: {
-                create_disposition: 'CREATE_IF_NEEDED',
-                write_disposition: write_disposition,
-                source_table: {
+            body = {
+              job_reference: {
+                project_id: @project,
+                job_id: job_id,
+              },
+              configuration: {
+                copy: {
+                  create_disposition: 'CREATE_IF_NEEDED',
+                  write_disposition: write_disposition,
+                  source_table: {
                   project_id: @project,
                   dataset_id: @dataset,
                   table_id: source_table,
@@ -219,21 +240,22 @@ module Embulk
               dataset_id: destination_dataset,
               table_id: destination_table,
             },
+            }
           }
         }
-        }
 
-        opts = {}
-        Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-        response = client.insert_job(@project, body, opts)
-        wait_load('Copy', response)
-      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-        response = {status_code: e.status_code, message: e.message, error_class: e.class}
-        Embulk.logger.error {
-          "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-        }
-        raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
-          "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+          opts = {}
+          Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+          response = client.insert_job(@project, body, opts)
+          wait_load('Copy', response)
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+          response = {status_code: e.status_code, message: e.message, error_class: e.class}
+          Embulk.logger.error {
+            "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+          }
+          raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
+            "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+        end
      end
    end
 
@@ -273,11 +295,15 @@ module Embulk
         # `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
         # Otherwise, this returns nil.
         if _errors = _response.status.errors
-          Embulk.logger.error {
-            "embulk-output-bigquery: get_job(#{@project}, #{job_id}), " \
-            "errors:#{_errors.map(&:to_h)}"
-          }
-          raise Error, "failed while waiting for a #{kind} job, errors:#{_errors.map(&:to_h)}"
+          msg = "failed while waiting for a #{kind} job, get_job(#{@project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
+          if _errors.any? {|error| error.reason == 'backendError' }
+            raise BackendError, msg
+          elsif _errors.any? {|error| error.reason == 'internalError' }
+            raise InternalError, msg
+          else
+            Embulk.logger.error { "embulk-output-bigquery: #{msg}" }
+            raise Error, msg
+          end
         end
 
         Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
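
`wait_load` now classifies failures by the `reason` field of each error in the job status. An illustrative sketch of the status shape that jobs.get returns for a transient failure (the values are made up):

    # Shape of _response.status for a transient failure; wait_load raises
    # BackendError here, which with_retry_job rescues and retries. Reasons
    # such as 'invalid' still fall through to the non-retryable Error.
    status = {
      'state' => 'DONE',
      'errorResult' => { 'reason' => 'backendError', 'message' => 'Backend error. Job aborted.' },
      'errors' => [
        { 'reason' => 'backendError', 'message' => 'Backend error. Job aborted.' },
      ],
    }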
data/lib/embulk/output/bigquery/gcs_client.rb CHANGED
@@ -1,4 +1,5 @@
 require 'uri'
+require 'java'
 require 'google/apis/storage_v1'
 require_relative 'google_client'
 require_relative 'helper'
@@ -49,6 +50,7 @@ module Embulk
         object_uri = URI.join("gs://#{bucket}", object).to_s
 
         started = Time.now
+        retries = 0
         begin
           Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
           body = {
@@ -68,6 +70,21 @@ module Embulk
             "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
           }
           raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
+        rescue ::Java::Java.net.SocketException => e
+          # I encountered `java.net.SocketException: Broken pipe` and `Connection reset` several times.
+          # I suspect these are caused by Google's unstable network.
+          # google-api-ruby-client itself has a retry feature, but it does not retry on SocketException.
+          if e.message == 'Broken pipe' || e.message == 'Connection reset'
+            if retries < @task['retries']
+              response = {message: e.message, error_class: e.class}
+              Embulk.logger.warn {
+                "embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
+              }
+              retries += 1 # would like to share this count with google-api-ruby-client's, but that is difficult
+              retry
+            end
+          end
+          raise e
         end
       end
 
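The comment in this hunk refers to google-api-ruby-client's built-in retry support, which re-issues requests on transient HTTP failures but never sees a raw `java.net.SocketException` surfacing from JRuby's socket stack, hence the manual rescue/retry above. A sketch of the library-level knob, assuming the 0.9.x-era API this gem targets:

    require 'google/apis/storage_v1'

    storage = Google::Apis::StorageV1::StorageService.new
    # Built-in retries cover rate limits and 5xx responses at the HTTP
    # layer; a SocketException thrown inside the JVM bypasses this logic.
    storage.request_options.retries = 5
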
data/lib/embulk/output/bigquery/google_client.rb CHANGED
@@ -6,6 +6,8 @@ module Embulk
     class Error < StandardError; end
     class JobTimeoutError < Error; end
     class NotFoundError < Error; end
+    class BackendError < Error; end
+    class InternalError < Error; end
 
     class GoogleClient
       def initialize(task, scope, client_class)
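
With the two subclasses in place, callers can tell retry-worthy job failures apart from permanent ones. A usage sketch, assuming the gem's usual `Embulk::Output::Bigquery` namespace:

    begin
      client.load_from_gcs(object_uris, table)
    rescue Embulk::Output::Bigquery::BackendError,
           Embulk::Output::Bigquery::InternalError => e
      # Transient on BigQuery's side; with_retry_job already retried these,
      # so reaching this point means the retry budget was exhausted.
      Embulk.logger.error { "giving up after retries: #{e.message}" }
      raise
    rescue Embulk::Output::Bigquery::Error => e
      # Permanent failure (e.g. reason 'invalid'); retrying would not help.
      raise
    end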
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-06-01 00:00:00.000000000 Z
+date: 2016-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client