embulk-output-bigquery 0.3.6 → 0.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f0c3b2728451f241f860fdcea92a26470db4203d
|
4
|
+
data.tar.gz: e38b0f175e46685bd25d9a31a33358bd3220b8c6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4639014f1f5ba0a6e791ceb35dccde7cb68fc75162d1a6cbfa06d3e99f882fd397470a97c3dd4edfe47cef786fd884ac53e787e48d01921ca7c1bc6edea58dd1
|
7
|
+
data.tar.gz: 5f6db056c5119510db2f00edd2351bd240f9e715e7acf6ad3da7a9dde4b0f5b9fb8b187b97641532dc6215b72dcb9317150da7260ae39235ab3ed22968bd9f67
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
## 0.3.7 - 2016-08-03
|
2
|
+
|
3
|
+
* [maintenance] Fix Thread.new to use thread local variables to avoid nil idx error (thanks to @shyouhei and @umisora)
|
4
|
+
|
1
5
|
## 0.3.6 - 2016-06-15
|
2
6
|
|
3
7
|
* [maintenance] if `is_skip_job_result_check` is true, skip output_rows checking (thanks to @joker1007)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-bigquery"
|
3
|
-
spec.version = "0.3.6"
|
3
|
+
spec.version = "0.3.7"
|
4
4
|
spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
|
5
5
|
spec.summary = "Google BigQuery output plugin for Embulk"
|
6
6
|
spec.description = "Embulk plugin that insert records to Google BigQuery."
|
@@ -40,7 +40,7 @@ module Embulk
|
|
40
40
|
self.fields
|
41
41
|
end
|
42
42
|
|
43
|
-
def
|
43
|
+
def with_job_retry(&block)
|
44
44
|
retries = 0
|
45
45
|
begin
|
46
46
|
yield
|
@@ -59,7 +59,7 @@ module Embulk
|
|
59
59
|
# @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
|
60
60
|
# @return [Array] responses
|
61
61
|
def load_from_gcs(object_uris, table)
|
62
|
-
|
62
|
+
with_job_retry do
|
63
63
|
begin
|
64
64
|
# As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
|
65
65
|
# we should generate job_id in client code, otherwise, retrying would cause duplication
|
@@ -99,7 +99,7 @@ module Embulk
|
|
99
99
|
opts = {}
|
100
100
|
|
101
101
|
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
102
|
-
response = client.insert_job(@project, body, opts)
|
102
|
+
response = with_network_retry { client.insert_job(@project, body, opts) }
|
103
103
|
unless @task['is_skip_job_result_check']
|
104
104
|
response = wait_load('Load', response)
|
105
105
|
end
|
@@ -128,7 +128,7 @@ module Embulk
|
|
128
128
|
threads = []
|
129
129
|
Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
|
130
130
|
paths.each_with_index do |path, idx|
|
131
|
-
threads << Thread.new do
|
131
|
+
threads << Thread.new(path, idx) do |path, idx|
|
132
132
|
# I am not sure whether google-api-ruby-client is thread-safe,
|
133
133
|
# so let me create new instances for each thread for safe
|
134
134
|
bigquery = self.class.new(@task, @schema, fields)
|
@@ -138,13 +138,13 @@ module Embulk
|
|
138
138
|
end
|
139
139
|
ThreadsWait.all_waits(*threads) do |th|
|
140
140
|
idx, response = th.value # raise errors occurred in threads
|
141
|
-
responses[idx] = response
|
141
|
+
responses[idx] = response
|
142
142
|
end
|
143
143
|
responses
|
144
144
|
end
|
145
145
|
|
146
146
|
def load(path, table)
|
147
|
-
|
147
|
+
with_job_retry do
|
148
148
|
begin
|
149
149
|
if File.exist?(path)
|
150
150
|
# As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
|
@@ -196,7 +196,7 @@ module Embulk
|
|
196
196
|
# },
|
197
197
|
}
|
198
198
|
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
199
|
-
response = client.insert_job(@project, body, opts)
|
199
|
+
response = with_network_retry { client.insert_job(@project, body, opts) }
|
200
200
|
if @task['is_skip_job_result_check']
|
201
201
|
response
|
202
202
|
else
|
@@ -213,7 +213,7 @@ module Embulk
|
|
213
213
|
end
|
214
214
|
|
215
215
|
def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
|
216
|
-
|
216
|
+
with_job_retry do
|
217
217
|
begin
|
218
218
|
destination_dataset ||= @dataset
|
219
219
|
job_id = "embulk_copy_job_#{SecureRandom.uuid}"
|
@@ -248,7 +248,7 @@ module Embulk
|
|
248
248
|
|
249
249
|
opts = {}
|
250
250
|
Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
|
251
|
-
response = client.insert_job(@project, body, opts)
|
251
|
+
response = with_network_retry { client.insert_job(@project, body, opts) }
|
252
252
|
wait_load('Copy', response)
|
253
253
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
254
254
|
response = {status_code: e.status_code, message: e.message, error_class: e.class}
|
@@ -289,7 +289,7 @@ module Embulk
|
|
289
289
|
"job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
|
290
290
|
}
|
291
291
|
sleep wait_interval
|
292
|
-
_response = client.get_job(@project, job_id)
|
292
|
+
_response = with_network_retry { client.get_job(@project, job_id) }
|
293
293
|
end
|
294
294
|
end
|
295
295
|
|
@@ -330,7 +330,7 @@ module Embulk
|
|
330
330
|
}.merge(hint)
|
331
331
|
opts = {}
|
332
332
|
Embulk.logger.debug { "embulk-output-bigquery: insert_dataset(#{@project}, #{dataset}, #{body}, #{opts})" }
|
333
|
-
client.insert_dataset(@project, body, opts)
|
333
|
+
with_network_retry { client.insert_dataset(@project, body, opts) }
|
334
334
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
335
335
|
if e.status_code == 409 && /Already Exists:/ =~ e.message
|
336
336
|
# ignore 'Already Exists' error
|
@@ -349,7 +349,7 @@ module Embulk
|
|
349
349
|
dataset ||= @dataset
|
350
350
|
begin
|
351
351
|
Embulk.logger.info { "embulk-output-bigquery: Get dataset... #{@project}:#{@dataset}" }
|
352
|
-
client.get_dataset(@project, dataset)
|
352
|
+
with_network_retry { client.get_dataset(@project, dataset) }
|
353
353
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
354
354
|
if e.status_code == 404
|
355
355
|
raise NotFoundError, "Dataset #{@project}:#{dataset} is not found"
|
@@ -376,7 +376,7 @@ module Embulk
|
|
376
376
|
}
|
377
377
|
opts = {}
|
378
378
|
Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{@dataset}, #{body}, #{opts})" }
|
379
|
-
client.insert_table(@project, @dataset, body, opts)
|
379
|
+
with_network_retry { client.insert_table(@project, @dataset, body, opts) }
|
380
380
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
381
381
|
if e.status_code == 409 && /Already Exists:/ =~ e.message
|
382
382
|
# ignore 'Already Exists' error
|
@@ -394,7 +394,7 @@ module Embulk
|
|
394
394
|
def delete_table(table)
|
395
395
|
begin
|
396
396
|
Embulk.logger.info { "embulk-output-bigquery: Delete table... #{@project}:#{@dataset}.#{table}" }
|
397
|
-
client.delete_table(@project, @dataset, table)
|
397
|
+
with_network_retry { client.delete_table(@project, @dataset, table) }
|
398
398
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
399
399
|
if e.status_code == 404 && /Not found:/ =~ e.message
|
400
400
|
# ignore 'Not Found' error
|
@@ -412,7 +412,7 @@ module Embulk
|
|
412
412
|
def get_table(table)
|
413
413
|
begin
|
414
414
|
Embulk.logger.info { "embulk-output-bigquery: Get table... #{@project}:#{@dataset}.#{table}" }
|
415
|
-
client.get_table(@project, @dataset, table)
|
415
|
+
with_network_retry { client.get_table(@project, @dataset, table) }
|
416
416
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
417
417
|
if e.status_code == 404
|
418
418
|
raise NotFoundError, "Table #{@project}:#{@dataset}.#{table} is not found"
|
@@ -29,7 +29,7 @@ module Embulk
|
|
29
29
|
opts = {}
|
30
30
|
|
31
31
|
Embulk.logger.debug { "embulk-output-bigquery: insert_bucket(#{@project}, #{body}, #{opts})" }
|
32
|
-
client.insert_bucket(@project, body, opts)
|
32
|
+
with_network_retry { client.insert_bucket(@project, body, opts) }
|
33
33
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
34
34
|
if e.status_code == 409 && /conflict:/ =~ e.message
|
35
35
|
# ignore 'Already Exists' error
|
@@ -50,7 +50,6 @@ module Embulk
|
|
50
50
|
object_uri = URI.join("gs://#{bucket}", object).to_s
|
51
51
|
|
52
52
|
started = Time.now
|
53
|
-
retries = 0
|
54
53
|
begin
|
55
54
|
Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
|
56
55
|
body = {
|
@@ -63,28 +62,13 @@ module Embulk
|
|
63
62
|
|
64
63
|
Embulk.logger.debug { "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts})" }
|
65
64
|
# memo: gcs is strongly consistent for insert (read-after-write). ref: https://cloud.google.com/storage/docs/consistency
|
66
|
-
client.insert_object(bucket, body, opts)
|
65
|
+
with_network_retry { client.insert_object(bucket, body, opts) }
|
67
66
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
68
67
|
response = {status_code: e.status_code, message: e.message, error_class: e.class}
|
69
68
|
Embulk.logger.error {
|
70
69
|
"embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
|
71
70
|
}
|
72
71
|
raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
|
73
|
-
rescue ::Java::Java.net.SocketException => e
|
74
|
-
# I encountered `java.net.SocketException: Broken pipe` and `Connection reset` serveral times
|
75
|
-
# I am doubting as this is caused by Google's unstable network
|
76
|
-
# google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
|
77
|
-
if e.message == 'Broken pipe' || e.message == 'Connection reset'
|
78
|
-
if retries < @task['retries']
|
79
|
-
response = {message: e.message, error_class: e.class}
|
80
|
-
Embulk.logger.warn {
|
81
|
-
"embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
|
82
|
-
}
|
83
|
-
retries += 1 # want to share with google-api-ruby-client, but it is difficult
|
84
|
-
retry
|
85
|
-
end
|
86
|
-
end
|
87
|
-
raise e
|
88
72
|
end
|
89
73
|
end
|
90
74
|
|
@@ -111,7 +95,7 @@ module Embulk
|
|
111
95
|
opts = {}
|
112
96
|
|
113
97
|
Embulk.logger.debug { "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts})" }
|
114
|
-
response = client.delete_object(bucket, object, opts)
|
98
|
+
response = with_network_retry { client.delete_object(bucket, object, opts) }
|
115
99
|
rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
|
116
100
|
if e.status_code == 404 # ignore 'notFound' error
|
117
101
|
return nil
|
@@ -64,6 +64,27 @@ module Embulk
|
|
64
64
|
@cached_client_expiration = Time.now + 1800
|
65
65
|
@cached_client = client
|
66
66
|
end
|
67
|
+
|
68
|
+
# google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
|
69
|
+
def with_network_retry(&block)
|
70
|
+
retries = 0
|
71
|
+
begin
|
72
|
+
yield
|
73
|
+
rescue ::Java::Java.net.SocketException, ::Java::Java.net.ConnectException => e
|
74
|
+
if ['Broken pipe', 'Connection reset', 'Connection timed out'].include?(e.message)
|
75
|
+
if retries < @task['retries']
|
76
|
+
retries += 1
|
77
|
+
Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.class} #{e.message}" }
|
78
|
+
retry
|
79
|
+
else
|
80
|
+
Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.class} #{e.message}" }
|
81
|
+
raise e
|
82
|
+
end
|
83
|
+
else
|
84
|
+
raise e
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
67
88
|
end
|
68
89
|
end
|
69
90
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-bigquery
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.6
|
4
|
+
version: 0.3.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Satoshi Akama
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-06-15 00:00:00.000000000 Z
|
12
|
+
date: 2016-08-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google-api-client
|