embulk-output-bigquery 0.3.6 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 75ab420c5b9cd88e768004ba69eadbfee768c042
-  data.tar.gz: cce52b664a946840e4576cb5d66b95e1082252ae
+  metadata.gz: f0c3b2728451f241f860fdcea92a26470db4203d
+  data.tar.gz: e38b0f175e46685bd25d9a31a33358bd3220b8c6
 SHA512:
-  metadata.gz: 4af32432153efe97e5f304c2a4b20f97ab67af7c0c626ebdcd918665fb996e8946af03cfdc34f61036ec369deb769aeb4857af18876fc1a9448c86e8dd9b0ff6
-  data.tar.gz: 065b5243b83818a72cab0769a797dc2fa760d138eb464eb01040e99f37d553bf99b940909ea4546b7cf30854f745b493495304d858b1c985428d23f9df0850f0
+  metadata.gz: 4639014f1f5ba0a6e791ceb35dccde7cb68fc75162d1a6cbfa06d3e99f882fd397470a97c3dd4edfe47cef786fd884ac53e787e48d01921ca7c1bc6edea58dd1
+  data.tar.gz: 5f6db056c5119510db2f00edd2351bd240f9e715e7acf6ad3da7a9dde4b0f5b9fb8b187b97641532dc6215b72dcb9317150da7260ae39235ab3ed22968bd9f67
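Only the embedded digests change here, as expected for a new release: checksums.yaml inside a .gem archive records SHA1 and SHA512 digests of the archive's metadata.gz and data.tar.gz members. A minimal sketch for re-checking the recorded values locally, assuming the release has already been fetched and unpacked into the current directory (for example with `gem fetch embulk-output-bigquery -v 0.3.7` followed by `tar xf embulk-output-bigquery-0.3.7.gem`):

```ruby
# Recompute the digests of the unpacked .gem members and compare them with the
# values in checksums.yaml above (sketch; the file paths are assumptions about
# where the archive was unpacked).
require 'digest'

%w[metadata.gz data.tar.gz].each do |member|
  puts member
  puts "  SHA1:   #{Digest::SHA1.file(member).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(member).hexdigest}"
end
```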
data/CHANGELOG.md CHANGED
@@ -1,3 +1,7 @@
+## 0.3.7 - 2016-08-03
+
+* [maintenance] Fix Thread.new to use thread local variables to avoid nil idx error (thanks to @shyouhei and @umisora)
+
 ## 0.3.6 - 2016-06-15
 
 * [maintenance] if `is_skip_job_result_check` is true, skip output_rows checking (thanks to @joker1007)
embulk-output-bigquery.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name          = "embulk-output-bigquery"
-  spec.version       = "0.3.6"
+  spec.version       = "0.3.7"
   spec.authors       = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary       = "Google BigQuery output plugin for Embulk"
   spec.description   = "Embulk plugin that insert records to Google BigQuery."
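The gemspec change is just the version bump. For anyone tracking this release, a Gemfile used with `embulk bundle` can pin the patched version; a sketch (installing with `embulk gem install embulk-output-bigquery` and no pin also works on an Embulk 0.8-era setup):

```ruby
# Gemfile for an Embulk bundle directory (sketch); pins the 0.3.7 release.
source 'https://rubygems.org/'

gem 'embulk'
gem 'embulk-output-bigquery', '0.3.7'
```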
@@ -40,7 +40,7 @@ module Embulk
         self.fields
       end
 
-      def with_retry_job(&block)
+      def with_job_retry(&block)
         retries = 0
         begin
           yield
@@ -59,7 +59,7 @@ module Embulk
       # @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
       # @return [Array] responses
       def load_from_gcs(object_uris, table)
-        with_retry_job do
+        with_job_retry do
           begin
             # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
             # we should generate job_id in client code, otherwise, retrying would cause duplication
@@ -99,7 +99,7 @@ module Embulk
           opts = {}
 
           Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-          response = client.insert_job(@project, body, opts)
+          response = with_network_retry { client.insert_job(@project, body, opts) }
           unless @task['is_skip_job_result_check']
             response = wait_load('Load', response)
           end
@@ -128,7 +128,7 @@ module Embulk
         threads = []
         Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
         paths.each_with_index do |path, idx|
-          threads << Thread.new do
+          threads << Thread.new(path, idx) do |path, idx|
             # I am not sure whether google-api-ruby-client is thread-safe,
             # so let me create new instances for each thread for safe
             bigquery = self.class.new(@task, @schema, fields)
@@ -138,13 +138,13 @@ module Embulk
         end
         ThreadsWait.all_waits(*threads) do |th|
           idx, response = th.value # raise errors occurred in threads
-          responses[idx] = response if idx
+          responses[idx] = response
         end
         responses
       end
 
       def load(path, table)
-        with_retry_job do
+        with_job_retry do
           begin
             if File.exist?(path)
               # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
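The Thread.new change above, together with the removal of the `if idx` guard, is the release's headline fix. Following the recommendation in Ruby's Thread.new documentation, `path` and `idx` are now passed as arguments to Thread.new so each worker receives its own copies as block parameters instead of reading variables that belong to the spawning loop; with `idx` guaranteed to be set inside every thread, the defensive nil check on `responses[idx]` becomes unnecessary. A standalone illustration of the pattern (not the plugin's code):

```ruby
# Values passed to Thread.new arrive as block arguments private to that
# thread, so the worker never depends on variables the loop keeps reassigning.
paths = %w[a.csv b.csv c.csv]

threads = []
paths.each_with_index do |path, idx|
  threads << Thread.new(path, idx) do |p, i|
    [i, p.upcase]          # stand-in for "load this chunk and return [idx, response]"
  end
end

responses = []
threads.each do |th|
  i, value = th.value      # Thread#value re-raises any error from the thread
  responses[i] = value     # i is always present, so no nil guard is needed
end
p responses                # => ["A.CSV", "B.CSV", "C.CSV"]
```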
@@ -196,7 +196,7 @@ module Embulk
           #   },
         }
         Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-        response = client.insert_job(@project, body, opts)
+        response = with_network_retry { client.insert_job(@project, body, opts) }
         if @task['is_skip_job_result_check']
           response
         else
@@ -213,7 +213,7 @@ module Embulk
       end
 
       def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
-        with_retry_job do
+        with_job_retry do
           begin
             destination_dataset ||= @dataset
             job_id = "embulk_copy_job_#{SecureRandom.uuid}"
@@ -248,7 +248,7 @@ module Embulk
 
             opts = {}
             Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-            response = client.insert_job(@project, body, opts)
+            response = with_network_retry { client.insert_job(@project, body, opts) }
             wait_load('Copy', response)
           rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
             response = {status_code: e.status_code, message: e.message, error_class: e.class}
@@ -289,7 +289,7 @@ module Embulk
             "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
           }
           sleep wait_interval
-          _response = client.get_job(@project, job_id)
+          _response = with_network_retry { client.get_job(@project, job_id) }
         end
       end
 
@@ -330,7 +330,7 @@ module Embulk
         }.merge(hint)
         opts = {}
         Embulk.logger.debug { "embulk-output-bigquery: insert_dataset(#{@project}, #{dataset}, #{body}, #{opts})" }
-        client.insert_dataset(@project, body, opts)
+        with_network_retry { client.insert_dataset(@project, body, opts) }
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         if e.status_code == 409 && /Already Exists:/ =~ e.message
           # ignore 'Already Exists' error
@@ -349,7 +349,7 @@ module Embulk
         dataset ||= @dataset
         begin
           Embulk.logger.info { "embulk-output-bigquery: Get dataset... #{@project}:#{@dataset}" }
-          client.get_dataset(@project, dataset)
+          with_network_retry { client.get_dataset(@project, dataset) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           if e.status_code == 404
             raise NotFoundError, "Dataset #{@project}:#{dataset} is not found"
@@ -376,7 +376,7 @@ module Embulk
         }
         opts = {}
         Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{@dataset}, #{body}, #{opts})" }
-        client.insert_table(@project, @dataset, body, opts)
+        with_network_retry { client.insert_table(@project, @dataset, body, opts) }
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         if e.status_code == 409 && /Already Exists:/ =~ e.message
           # ignore 'Already Exists' error
@@ -394,7 +394,7 @@ module Embulk
       def delete_table(table)
         begin
           Embulk.logger.info { "embulk-output-bigquery: Delete table... #{@project}:#{@dataset}.#{table}" }
-          client.delete_table(@project, @dataset, table)
+          with_network_retry { client.delete_table(@project, @dataset, table) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           if e.status_code == 404 && /Not found:/ =~ e.message
             # ignore 'Not Found' error
@@ -412,7 +412,7 @@ module Embulk
       def get_table(table)
         begin
           Embulk.logger.info { "embulk-output-bigquery: Get table... #{@project}:#{@dataset}.#{table}" }
-          client.get_table(@project, @dataset, table)
+          with_network_retry { client.get_table(@project, @dataset, table) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           if e.status_code == 404
             raise NotFoundError, "Table #{@project}:#{@dataset}.#{table} is not found"
@@ -29,7 +29,7 @@ module Embulk
         opts = {}
 
         Embulk.logger.debug { "embulk-output-bigquery: insert_bucket(#{@project}, #{body}, #{opts})" }
-        client.insert_bucket(@project, body, opts)
+        with_network_retry { client.insert_bucket(@project, body, opts) }
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         if e.status_code == 409 && /conflict:/ =~ e.message
           # ignore 'Already Exists' error
@@ -50,7 +50,6 @@ module Embulk
         object_uri = URI.join("gs://#{bucket}", object).to_s
 
         started = Time.now
-        retries = 0
         begin
           Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
           body = {
@@ -63,28 +62,13 @@ module Embulk
 
           Embulk.logger.debug { "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts})" }
           # memo: gcs is strongly consistent for insert (read-after-write). ref: https://cloud.google.com/storage/docs/consistency
-          client.insert_object(bucket, body, opts)
+          with_network_retry { client.insert_object(bucket, body, opts) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           response = {status_code: e.status_code, message: e.message, error_class: e.class}
           Embulk.logger.error {
             "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
           }
           raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
-        rescue ::Java::Java.net.SocketException => e
-          # I encountered `java.net.SocketException: Broken pipe` and `Connection reset` serveral times
-          # I am doubting as this is caused by Google's unstable network
-          # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
-          if e.message == 'Broken pipe' || e.message == 'Connection reset'
-            if retries < @task['retries']
-              response = {message: e.message, error_class: e.class}
-              Embulk.logger.warn {
-                "embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
-              }
-              retries += 1 # want to share with google-api-ruby-client, but it is difficult
-              retry
-            end
-          end
-          raise e
         end
       end
 
@@ -111,7 +95,7 @@ module Embulk
         opts = {}
 
         Embulk.logger.debug { "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts})" }
-        response = client.delete_object(bucket, object, opts)
+        response = with_network_retry { client.delete_object(bucket, object, opts) }
       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
         if e.status_code == 404 # ignore 'notFound' error
           return nil
@@ -64,6 +64,27 @@ module Embulk
         @cached_client_expiration = Time.now + 1800
         @cached_client = client
       end
+
+      # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
+      def with_network_retry(&block)
+        retries = 0
+        begin
+          yield
+        rescue ::Java::Java.net.SocketException, ::Java::Java.net.ConnectException => e
+          if ['Broken pipe', 'Connection reset', 'Connection timed out'].include?(e.message)
+            if retries < @task['retries']
+              retries += 1
+              Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.class} #{e.message}" }
+              retry
+            else
+              Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.class} #{e.message}" }
+              raise e
+            end
+          else
+            raise e
+          end
+        end
+      end
     end
   end
 end
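The new with_network_retry helper centralizes the ad-hoc retry that previously lived only in insert_object: it retries 'Broken pipe', 'Connection reset', and 'Connection timed out' failures raised as JRuby's java.net.SocketException or ConnectException, up to @task['retries'] times, because google-api-ruby-client's built-in retry does not cover these. A minimal plain-Ruby analogue of the same pattern, using MRI's Errno classes and an explicit retry budget as stand-ins for the JRuby exception classes and @task['retries']:

```ruby
# Plain-Ruby sketch of the retry-on-transient-network-error pattern above.
# The exception classes and the retry budget are illustrative substitutions.
def with_network_retry(max_retries: 3)
  retries = 0
  begin
    yield
  rescue Errno::EPIPE, Errno::ECONNRESET, Errno::ETIMEDOUT => e
    if retries < max_retries
      retries += 1
      warn "retry ##{retries}: #{e.class} #{e.message}"
      retry
    else
      warn "retry exhausted after ##{retries} attempts: #{e.class} #{e.message}"
      raise
    end
  end
end

# Usage mirrors the call sites in this diff:
#   response = with_network_retry { client.insert_job(project, body, opts) }
```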
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 0.3.7
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-06-15 00:00:00.000000000 Z
+date: 2016-08-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client