embulk-output-bigquery 0.3.4 → 0.3.5

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f68ceb57a4eff6886157c585425526389623d0a2
-  data.tar.gz: b44323059a3057bb5de7fdd7b00d61ce970f3386
+  metadata.gz: b0856b220a3d9c7b78dbffe45b35edf8e10b4fba
+  data.tar.gz: 8ce985e90cfd9aa9b88c6cb7ec994d6e89184584
 SHA512:
-  metadata.gz: 5cc7b1245bda2ae8c5d581c67a09ce0685c7812658c3c47e195362290fd50c13abfb7a3e9bb2360bc01a6d6aa82009ce190bef667cfb1df2cddaeb653c162c14
-  data.tar.gz: 4f8611f292a61750568c7b15e7ae6f83bc83d09ae3f64b2359b8f6f4e4d4b7ac115e09e6e8fbb5cc2e98b89103b8d1aba0640e0d89b035dbab2e5feea0d47449
+  metadata.gz: 02c3432b3494df8ca2f1901dda75488380574a7047228434a800d1578975ce9b6abb0bd65a0a5fc95285077d6963d19160241ed96e2b42d28cd30920e8c66230
+  data.tar.gz: dd7aec6748550836b6195d5ee7d403ca126a52d194fe9a0a8e3bf7257bb7639931f3e6df7fde2908a1e8f610577b085ca30c50876a800040ff7b71aedb80e1f8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+## 0.3.5 - 2016-06-13
+
+* [enhancement] retry backendError and internalError in waiting load job
+* [enhancement] retry Broken pipe and Connection reset in inserting object to GCS
+
 ## 0.3.4 - 2016-06-01
 
 * [new feature] Add `gcs_bucket` option to load multiple files from a GCS bucket with one load job
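
Both 0.3.5 enhancements are bounded by the plugin's existing `retries` option, which shows up as `@task['retries']` in the code changes below. As a rough illustration of how a user would cap the new retry loops from an Embulk config — every value here other than `type: bigquery` and `retries` is a placeholder:

```yaml
out:
  type: bigquery
  mode: append
  auth_method: json_key            # placeholder; any supported auth method works
  json_keyfile: /path/to/key.json  # placeholder path
  project: my-project
  dataset: my_dataset
  table: my_table
  retries: 3                       # caps both the load-job retry and the GCS insert retry
```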
data/embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.4"
+  spec.version = "0.3.5"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
data/lib/embulk/output/bigquery/bigquery_client.rb CHANGED
@@ -40,59 +40,77 @@ module Embulk
           self.fields
         end
 
-        # @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
-        # @return [Array] responses
-        def load_from_gcs(object_uris, table)
+        def with_retry_job(&block)
+          retries = 0
           begin
-            # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
-            # we should generate job_id in client code, otherwise, retrying would cause duplication
-            if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-              job_id = Helper.create_load_job_id(@task, path, fields)
+            yield
+          rescue BackendError, InternalError => e
+            if retries < @task['retries']
+              retries += 1
+              Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.message}" }
+              retry
             else
-              job_id = "embulk_load_job_#{SecureRandom.uuid}"
+              Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.message}" }
+              raise e
             end
-            Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+          end
+        end
 
-            body = {
-              job_reference: {
-                project_id: @project,
-                job_id: job_id,
-              },
-              configuration: {
-                load: {
-                  destination_table: {
-                    project_id: @project,
-                    dataset_id: @dataset,
-                    table_id: table,
-                  },
-                  schema: {
-                    fields: fields,
-                  },
-                  write_disposition: 'WRITE_APPEND',
-                  source_format: @task['source_format'],
-                  max_bad_records: @task['max_bad_records'],
-                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
-                  encoding: @task['encoding'],
-                  ignore_unknown_values: @task['ignore_unknown_values'],
-                  allow_quoted_newlines: @task['allow_quoted_newlines'],
-                  source_uris: object_uris,
+        # @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
+        # @return [Array] responses
+        def load_from_gcs(object_uris, table)
+          with_retry_job do
+            begin
+              # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+              # we should generate job_id in client code, otherwise, retrying would cause duplication
+              if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+                job_id = Helper.create_load_job_id(@task, path, fields)
+              else
+                job_id = "embulk_load_job_#{SecureRandom.uuid}"
+              end
+              Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+
+              body = {
+                job_reference: {
+                  project_id: @project,
+                  job_id: job_id,
+                },
+                configuration: {
+                  load: {
+                    destination_table: {
+                      project_id: @project,
+                      dataset_id: @dataset,
+                      table_id: table,
+                    },
+                    schema: {
+                      fields: fields,
+                    },
+                    write_disposition: 'WRITE_APPEND',
+                    source_format: @task['source_format'],
+                    max_bad_records: @task['max_bad_records'],
+                    field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                    encoding: @task['encoding'],
+                    ignore_unknown_values: @task['ignore_unknown_values'],
+                    allow_quoted_newlines: @task['allow_quoted_newlines'],
+                    source_uris: object_uris,
+                  }
                 }
               }
-            }
-            opts = {}
+              opts = {}
 
-            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-            response = client.insert_job(@project, body, opts)
-            unless @task['is_skip_job_result_check']
-              response = wait_load('Load', response)
+              Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+              response = client.insert_job(@project, body, opts)
+              unless @task['is_skip_job_result_check']
+                response = wait_load('Load', response)
+              end
+              [response]
+            rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+              response = {status_code: e.status_code, message: e.message, error_class: e.class}
+              Embulk.logger.error {
+                "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+              }
+              raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
             end
-            [response]
-          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-            response = {status_code: e.status_code, message: e.message, error_class: e.class}
-            Embulk.logger.error {
-              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-            }
-            raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
           end
         end
 
@@ -126,90 +144,93 @@ module Embulk
         end
 
         def load(path, table)
-          begin
-            if File.exist?(path)
-              # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
-              # we should generate job_id in client code, otherwise, retrying would cause duplication
-              if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
-                job_id = Helper.create_load_job_id(@task, path, fields)
+          with_retry_job do
+            begin
+              if File.exist?(path)
+                # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+                # we should generate job_id in client code, otherwise, retrying would cause duplication
+                if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+                  job_id = Helper.create_load_job_id(@task, path, fields)
+                else
+                  job_id = "embulk_load_job_#{SecureRandom.uuid}"
+                end
+                Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
              else
-                job_id = "embulk_load_job_#{SecureRandom.uuid}"
+                Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
+                return
              end
-              Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{path} => #{@project}:#{@dataset}.#{table}" }
-            else
-              Embulk.logger.info { "embulk-output-bigquery: Load job starting... #{path} does not exist, skipped" }
-              return
-            end
 
-            body = {
-              job_reference: {
-                project_id: @project,
-                job_id: job_id,
-              },
-              configuration: {
-                load: {
-                  destination_table: {
-                    project_id: @project,
-                    dataset_id: @dataset,
-                    table_id: table,
-                  },
-                  schema: {
-                    fields: fields,
-                  },
-                  write_disposition: 'WRITE_APPEND',
-                  source_format: @task['source_format'],
-                  max_bad_records: @task['max_bad_records'],
-                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
-                  encoding: @task['encoding'],
-                  ignore_unknown_values: @task['ignore_unknown_values'],
-                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+              body = {
+                job_reference: {
+                  project_id: @project,
+                  job_id: job_id,
+                },
+                configuration: {
+                  load: {
+                    destination_table: {
+                      project_id: @project,
+                      dataset_id: @dataset,
+                      table_id: table,
+                    },
+                    schema: {
+                      fields: fields,
+                    },
+                    write_disposition: 'WRITE_APPEND',
+                    source_format: @task['source_format'],
+                    max_bad_records: @task['max_bad_records'],
+                    field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                    encoding: @task['encoding'],
+                    ignore_unknown_values: @task['ignore_unknown_values'],
+                    allow_quoted_newlines: @task['allow_quoted_newlines'],
+                  }
                 }
               }
-            }
 
-            opts = {
-              upload_source: path,
-              content_type: "application/octet-stream",
-              # options: {
-              #   retries: @task['retries'],
-              #   timeout_sec: @task['timeout_sec'],
-              #   open_timeout_sec: @task['open_timeout_sec']
-              # },
-            }
-            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-            response = client.insert_job(@project, body, opts)
-            unless @task['is_skip_job_result_check']
-              response = wait_load('Load', response)
+              opts = {
+                upload_source: path,
+                content_type: "application/octet-stream",
+                # options: {
+                #   retries: @task['retries'],
+                #   timeout_sec: @task['timeout_sec'],
+                #   open_timeout_sec: @task['open_timeout_sec']
+                # },
+              }
+              Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+              response = client.insert_job(@project, body, opts)
+              unless @task['is_skip_job_result_check']
+                response = wait_load('Load', response)
+              end
+            rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+              response = {status_code: e.status_code, message: e.message, error_class: e.class}
+              Embulk.logger.error {
+                "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+              }
+              raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
            end
-          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-            response = {status_code: e.status_code, message: e.message, error_class: e.class}
-            Embulk.logger.error {
-              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-            }
-            raise Error, "failed to load #{path} to #{@project}:#{@dataset}.#{table}, response:#{response}"
          end
        end
 
        def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
-          begin
-            destination_dataset ||= @dataset
-            job_id = "embulk_copy_job_#{SecureRandom.uuid}"
+          with_retry_job do
+            begin
+              destination_dataset ||= @dataset
+              job_id = "embulk_copy_job_#{SecureRandom.uuid}"
 
-            Embulk.logger.info {
-              "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
-              "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
-            }
+              Embulk.logger.info {
+                "embulk-output-bigquery: Copy job starting... job_id:[#{job_id}] " \
+                "#{@project}:#{@dataset}.#{source_table} => #{@project}:#{destination_dataset}.#{destination_table}"
+              }
 
-            body = {
-              job_reference: {
-                project_id: @project,
-                job_id: job_id,
-              },
-              configuration: {
-                copy: {
-                  create_deposition: 'CREATE_IF_NEEDED',
-                  write_disposition: write_disposition,
-                  source_table: {
+              body = {
+                job_reference: {
+                  project_id: @project,
+                  job_id: job_id,
+                },
+                configuration: {
+                  copy: {
+                    create_deposition: 'CREATE_IF_NEEDED',
+                    write_disposition: write_disposition,
+                    source_table: {
                     project_id: @project,
                     dataset_id: @dataset,
                     table_id: source_table,
@@ -219,21 +240,22 @@ module Embulk
                    dataset_id: destination_dataset,
                    table_id: destination_table,
                  },
+                  }
                }
              }
-            }
 
-            opts = {}
-            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-            response = client.insert_job(@project, body, opts)
-            wait_load('Copy', response)
-          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-            response = {status_code: e.status_code, message: e.message, error_class: e.class}
-            Embulk.logger.error {
-              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
-            }
-            raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
-              "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+              opts = {}
+              Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+              response = client.insert_job(@project, body, opts)
+              wait_load('Copy', response)
+            rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+              response = {status_code: e.status_code, message: e.message, error_class: e.class}
+              Embulk.logger.error {
+                "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+              }
+              raise Error, "failed to copy #{@project}:#{@dataset}.#{source_table} " \
+                "to #{@project}:#{destination_dataset}.#{destination_table}, response:#{response}"
+            end
          end
        end
 
@@ -273,11 +295,15 @@ module Embulk
          # `errors` returns Array<Google::Apis::BigqueryV2::ErrorProto> if any error exists.
          # Otherwise, this returns nil.
          if _errors = _response.status.errors
-            Embulk.logger.error {
-              "embulk-output-bigquery: get_job(#{@project}, #{job_id}), " \
-              "errors:#{_errors.map(&:to_h)}"
-            }
-            raise Error, "failed during waiting a #{kind} job, errors:#{_errors.map(&:to_h)}"
+            msg = "failed during waiting a #{kind} job, get_job(#{@project}, #{job_id}), errors:#{_errors.map(&:to_h)}"
+            if _errors.any? {|error| error.reason == 'backendError' }
+              raise BackendError, msg
+            elsif _errors.any? {|error| error.reason == 'internalError' }
+              raise InternalError, msg
+            else
+              Embulk.logger.error { "embulk-output-bigquery: #{msg}" }
+              raise Error, msg
+            end
          end
 
          Embulk.logger.info { "embulk-output-bigquery: #{kind} job response... job_id:[#{job_id}] response.statistics:#{_response.statistics.to_h}" }
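
Read together, the bigquery_client.rb changes split failure handling into two layers: `wait_load` now classifies BigQuery job errors by their `reason` field and raises `BackendError` or `InternalError` for the transient ones, while `with_retry_job` re-runs the entire job block when it catches either, up to `@task['retries']` times. A condensed, runnable sketch of that flow — not the plugin's exact code; `MAX_RETRIES` stands in for `@task['retries']` and the fake job block stands in for `insert_job` plus `wait_load`:

```ruby
# Mirrors the BackendError class added in google_client.rb.
class BackendError < StandardError; end

MAX_RETRIES = 2  # stand-in for @task['retries']

def with_retry_job
  retries = 0
  begin
    yield
  rescue BackendError => e
    raise e unless retries < MAX_RETRIES  # retry budget exhausted
    retries += 1
    puts "embulk-output-bigquery: retry ##{retries}, #{e.message}"
    retry  # re-runs the begin block, i.e. the whole job
  end
end

attempts = 0
with_retry_job do
  attempts += 1
  # Simulate wait_load raising after finding reason 'backendError'
  # in response.status.errors on the first attempt.
  raise BackendError, "backendError while waiting load job" if attempts == 1
  puts "load job succeeded on attempt #{attempts}"
end
```

Because `retry` re-enters the block from the top, a fresh random `job_id` is generated on every attempt unless `prevent_duplicate_insert` is set, in which case `Helper.create_load_job_id` derives a deterministic id; that is why the job-id logic sits inside the retried block.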
data/lib/embulk/output/bigquery/gcs_client.rb CHANGED
@@ -1,4 +1,5 @@
 require 'uri'
+require 'java'
 require 'google/apis/storage_v1'
 require_relative 'google_client'
 require_relative 'helper'
@@ -49,6 +50,7 @@ module Embulk
           object_uri = URI.join("gs://#{bucket}", object).to_s
 
           started = Time.now
+          retries = 0
           begin
             Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
             body = {
@@ -68,6 +70,21 @@ module Embulk
               "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
             }
             raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
+          rescue ::Java::Java.net.SocketException => e
+            # I encountered `java.net.SocketException: Broken pipe` and `Connection reset` serveral times
+            # I am doubting as this is caused by Google's unstable network
+            # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
+            if e.message == 'Broken pipe' || e.message == 'Connection reset'
+              if retries < @task['retries']
+                response = {message: e.message, error_class: e.class}
+                Embulk.logger.warn {
+                  "embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
+                }
+                retries += 1 # want to share with google-api-ruby-client, but it is difficult
+                retry
+              end
+            end
+            raise e
           end
         end
 
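gcs_client.rb adds its own, narrower retry: under JRuby (Embulk's runtime), a flaky connection to GCS surfaces as `java.net.SocketException`, which google-api-ruby-client's built-in retry does not cover, so the plugin retries only the two messages it has observed in practice. A minimal sketch of that guard, with `TransientNetworkError` as a plain-Ruby stand-in for the JRuby exception and `max_retries` for `@task['retries']`:

```ruby
# Stand-in for JRuby's java.net.SocketException so the sketch runs on any Ruby.
class TransientNetworkError < RuntimeError; end

RETRYABLE_MESSAGES = ['Broken pipe', 'Connection reset']

def insert_object_with_retry(max_retries)
  retries = 0
  begin
    yield
  rescue TransientNetworkError => e
    # Only the two known-transient messages are retried, and only
    # while the retry budget lasts.
    if RETRYABLE_MESSAGES.include?(e.message) && retries < max_retries
      retries += 1
      puts "embulk-output-bigquery: RETRY ##{retries}: #{e.message}"
      retry
    end
    raise e  # anything else, or an exhausted budget, propagates unchanged
  end
end

calls = 0
insert_object_with_retry(2) do
  calls += 1
  raise TransientNetworkError, 'Connection reset' if calls == 1
  puts "insert_object succeeded on call #{calls}"
end
```

The string match is deliberately narrow: any other `SocketException` falls through to `raise e`, so genuinely new failure modes still surface instead of being silently retried.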
data/lib/embulk/output/bigquery/google_client.rb CHANGED
@@ -6,6 +6,8 @@ module Embulk
       class Error < StandardError; end
       class JobTimeoutError < Error; end
       class NotFoundError < Error; end
+      class BackendError < Error; end
+      class InternalError < Error; end
 
       class GoogleClient
         def initialize(task, scope, client_class)
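
The two new classes slot into the existing hierarchy, so rescue clauses can separate transient from fatal failures by class rather than by string matching, while any older `rescue Error` code keeps catching everything. A small self-contained illustration — the class hierarchy is from the diff; the `classify` helper around it is ours:

```ruby
class Error < StandardError; end
class JobTimeoutError < Error; end
class NotFoundError < Error; end
class BackendError < Error; end   # BigQuery error reason 'backendError'
class InternalError < Error; end  # BigQuery error reason 'internalError'

# Hypothetical helper: with_retry_job effectively makes this distinction
# via its rescue clause.
def classify(e)
  case e
  when BackendError, InternalError then :retryable
  when Error then :fatal
  end
end

puts classify(BackendError.new)    # => retryable
puts classify(JobTimeoutError.new) # => fatal
```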
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-06-01 00:00:00.000000000 Z
+date: 2016-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client