embulk-output-bigquery 0.3.6 → 0.3.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 75ab420c5b9cd88e768004ba69eadbfee768c042
4
- data.tar.gz: cce52b664a946840e4576cb5d66b95e1082252ae
3
+ metadata.gz: f0c3b2728451f241f860fdcea92a26470db4203d
4
+ data.tar.gz: e38b0f175e46685bd25d9a31a33358bd3220b8c6
5
5
  SHA512:
6
- metadata.gz: 4af32432153efe97e5f304c2a4b20f97ab67af7c0c626ebdcd918665fb996e8946af03cfdc34f61036ec369deb769aeb4857af18876fc1a9448c86e8dd9b0ff6
7
- data.tar.gz: 065b5243b83818a72cab0769a797dc2fa760d138eb464eb01040e99f37d553bf99b940909ea4546b7cf30854f745b493495304d858b1c985428d23f9df0850f0
6
+ metadata.gz: 4639014f1f5ba0a6e791ceb35dccde7cb68fc75162d1a6cbfa06d3e99f882fd397470a97c3dd4edfe47cef786fd884ac53e787e48d01921ca7c1bc6edea58dd1
7
+ data.tar.gz: 5f6db056c5119510db2f00edd2351bd240f9e715e7acf6ad3da7a9dde4b0f5b9fb8b187b97641532dc6215b72dcb9317150da7260ae39235ab3ed22968bd9f67
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
1
+ ## 0.3.7 - 2016-08-03
2
+
3
+ * [maintenance] Fix Thread.new to use thread local variables to avoid nil idx error (thanks to @shyouhei and @umisora)
4
+
1
5
  ## 0.3.6 - 2016-06-15
2
6
 
3
7
  * [maintenance] if `is_skip_job_result_check` is true, skip output_rows checking (thanks to @joker1007)
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-output-bigquery"
3
- spec.version = "0.3.6"
3
+ spec.version = "0.3.7"
4
4
  spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
5
5
  spec.summary = "Google BigQuery output plugin for Embulk"
6
6
  spec.description = "Embulk plugin that insert records to Google BigQuery."
@@ -40,7 +40,7 @@ module Embulk
40
40
  self.fields
41
41
  end
42
42
 
43
- def with_retry_job(&block)
43
+ def with_job_retry(&block)
44
44
  retries = 0
45
45
  begin
46
46
  yield
@@ -59,7 +59,7 @@ module Embulk
59
59
  # @params gcs_paths [Array] array of gcs paths such as gs://bucket/path
60
60
  # @return [Array] responses
61
61
  def load_from_gcs(object_uris, table)
62
- with_retry_job do
62
+ with_job_retry do
63
63
  begin
64
64
  # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
65
65
  # we should generate job_id in client code, otherwise, retrying would cause duplication
@@ -99,7 +99,7 @@ module Embulk
99
99
  opts = {}
100
100
 
101
101
  Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
102
- response = client.insert_job(@project, body, opts)
102
+ response = with_network_retry { client.insert_job(@project, body, opts) }
103
103
  unless @task['is_skip_job_result_check']
104
104
  response = wait_load('Load', response)
105
105
  end
@@ -128,7 +128,7 @@ module Embulk
128
128
  threads = []
129
129
  Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
130
130
  paths.each_with_index do |path, idx|
131
- threads << Thread.new do
131
+ threads << Thread.new(path, idx) do |path, idx|
132
132
  # I am not sure whether google-api-ruby-client is thread-safe,
133
133
  # so let me create new instances for each thread for safe
134
134
  bigquery = self.class.new(@task, @schema, fields)
@@ -138,13 +138,13 @@ module Embulk
138
138
  end
139
139
  ThreadsWait.all_waits(*threads) do |th|
140
140
  idx, response = th.value # raise errors occurred in threads
141
- responses[idx] = response if idx
141
+ responses[idx] = response
142
142
  end
143
143
  responses
144
144
  end
145
145
 
146
146
  def load(path, table)
147
- with_retry_job do
147
+ with_job_retry do
148
148
  begin
149
149
  if File.exist?(path)
150
150
  # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
@@ -196,7 +196,7 @@ module Embulk
196
196
  # },
197
197
  }
198
198
  Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
199
- response = client.insert_job(@project, body, opts)
199
+ response = with_network_retry { client.insert_job(@project, body, opts) }
200
200
  if @task['is_skip_job_result_check']
201
201
  response
202
202
  else
@@ -213,7 +213,7 @@ module Embulk
213
213
  end
214
214
 
215
215
  def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
216
- with_retry_job do
216
+ with_job_retry do
217
217
  begin
218
218
  destination_dataset ||= @dataset
219
219
  job_id = "embulk_copy_job_#{SecureRandom.uuid}"
@@ -248,7 +248,7 @@ module Embulk
248
248
 
249
249
  opts = {}
250
250
  Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
251
- response = client.insert_job(@project, body, opts)
251
+ response = with_network_retry { client.insert_job(@project, body, opts) }
252
252
  wait_load('Copy', response)
253
253
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
254
254
  response = {status_code: e.status_code, message: e.message, error_class: e.class}
@@ -289,7 +289,7 @@ module Embulk
289
289
  "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
290
290
  }
291
291
  sleep wait_interval
292
- _response = client.get_job(@project, job_id)
292
+ _response = with_network_retry { client.get_job(@project, job_id) }
293
293
  end
294
294
  end
295
295
 
@@ -330,7 +330,7 @@ module Embulk
330
330
  }.merge(hint)
331
331
  opts = {}
332
332
  Embulk.logger.debug { "embulk-output-bigquery: insert_dataset(#{@project}, #{dataset}, #{body}, #{opts})" }
333
- client.insert_dataset(@project, body, opts)
333
+ with_network_retry { client.insert_dataset(@project, body, opts) }
334
334
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
335
335
  if e.status_code == 409 && /Already Exists:/ =~ e.message
336
336
  # ignore 'Already Exists' error
@@ -349,7 +349,7 @@ module Embulk
349
349
  dataset ||= @dataset
350
350
  begin
351
351
  Embulk.logger.info { "embulk-output-bigquery: Get dataset... #{@project}:#{@dataset}" }
352
- client.get_dataset(@project, dataset)
352
+ with_network_retry { client.get_dataset(@project, dataset) }
353
353
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
354
354
  if e.status_code == 404
355
355
  raise NotFoundError, "Dataset #{@project}:#{dataset} is not found"
@@ -376,7 +376,7 @@ module Embulk
376
376
  }
377
377
  opts = {}
378
378
  Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{@dataset}, #{body}, #{opts})" }
379
- client.insert_table(@project, @dataset, body, opts)
379
+ with_network_retry { client.insert_table(@project, @dataset, body, opts) }
380
380
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
381
381
  if e.status_code == 409 && /Already Exists:/ =~ e.message
382
382
  # ignore 'Already Exists' error
@@ -394,7 +394,7 @@ module Embulk
394
394
  def delete_table(table)
395
395
  begin
396
396
  Embulk.logger.info { "embulk-output-bigquery: Delete table... #{@project}:#{@dataset}.#{table}" }
397
- client.delete_table(@project, @dataset, table)
397
+ with_network_retry { client.delete_table(@project, @dataset, table) }
398
398
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
399
399
  if e.status_code == 404 && /Not found:/ =~ e.message
400
400
  # ignore 'Not Found' error
@@ -412,7 +412,7 @@ module Embulk
412
412
  def get_table(table)
413
413
  begin
414
414
  Embulk.logger.info { "embulk-output-bigquery: Get table... #{@project}:#{@dataset}.#{table}" }
415
- client.get_table(@project, @dataset, table)
415
+ with_network_retry { client.get_table(@project, @dataset, table) }
416
416
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
417
417
  if e.status_code == 404
418
418
  raise NotFoundError, "Table #{@project}:#{@dataset}.#{table} is not found"
@@ -29,7 +29,7 @@ module Embulk
29
29
  opts = {}
30
30
 
31
31
  Embulk.logger.debug { "embulk-output-bigquery: insert_bucket(#{@project}, #{body}, #{opts})" }
32
- client.insert_bucket(@project, body, opts)
32
+ with_network_retry { client.insert_bucket(@project, body, opts) }
33
33
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
34
34
  if e.status_code == 409 && /conflict:/ =~ e.message
35
35
  # ignore 'Already Exists' error
@@ -50,7 +50,6 @@ module Embulk
50
50
  object_uri = URI.join("gs://#{bucket}", object).to_s
51
51
 
52
52
  started = Time.now
53
- retries = 0
54
53
  begin
55
54
  Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
56
55
  body = {
@@ -63,28 +62,13 @@ module Embulk
63
62
 
64
63
  Embulk.logger.debug { "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts})" }
65
64
  # memo: gcs is strongly consistent for insert (read-after-write). ref: https://cloud.google.com/storage/docs/consistency
66
- client.insert_object(bucket, body, opts)
65
+ with_network_retry { client.insert_object(bucket, body, opts) }
67
66
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
68
67
  response = {status_code: e.status_code, message: e.message, error_class: e.class}
69
68
  Embulk.logger.error {
70
69
  "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
71
70
  }
72
71
  raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
73
- rescue ::Java::Java.net.SocketException => e
74
- # I encountered `java.net.SocketException: Broken pipe` and `Connection reset` several times
75
- # I suspect this is caused by Google's unstable network
76
- # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
77
- if e.message == 'Broken pipe' || e.message == 'Connection reset'
78
- if retries < @task['retries']
79
- response = {message: e.message, error_class: e.class}
80
- Embulk.logger.warn {
81
- "embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
82
- }
83
- retries += 1 # want to share with google-api-ruby-client, but it is difficult
84
- retry
85
- end
86
- end
87
- raise e
88
72
  end
89
73
  end
90
74
 
@@ -111,7 +95,7 @@ module Embulk
111
95
  opts = {}
112
96
 
113
97
  Embulk.logger.debug { "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts})" }
114
- response = client.delete_object(bucket, object, opts)
98
+ response = with_network_retry { client.delete_object(bucket, object, opts) }
115
99
  rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
116
100
  if e.status_code == 404 # ignore 'notFound' error
117
101
  return nil
@@ -64,6 +64,27 @@ module Embulk
64
64
  @cached_client_expiration = Time.now + 1800
65
65
  @cached_client = client
66
66
  end
67
+
68
+ # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
69
+ def with_network_retry(&block)
70
+ retries = 0
71
+ begin
72
+ yield
73
+ rescue ::Java::Java.net.SocketException, ::Java::Java.net.ConnectException => e
74
+ if ['Broken pipe', 'Connection reset', 'Connection timed out'].include?(e.message)
75
+ if retries < @task['retries']
76
+ retries += 1
77
+ Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.class} #{e.message}" }
78
+ retry
79
+ else
80
+ Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.class} #{e.message}" }
81
+ raise e
82
+ end
83
+ else
84
+ raise e
85
+ end
86
+ end
87
+ end
67
88
  end
68
89
  end
69
90
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.6
4
+ version: 0.3.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-06-15 00:00:00.000000000 Z
12
+ date: 2016-08-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: google-api-client