embulk-output-bigquery 0.3.6 → 0.3.7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0c3b2728451f241f860fdcea92a26470db4203d
+  data.tar.gz: e38b0f175e46685bd25d9a31a33358bd3220b8c6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4639014f1f5ba0a6e791ceb35dccde7cb68fc75162d1a6cbfa06d3e99f882fd397470a97c3dd4edfe47cef786fd884ac53e787e48d01921ca7c1bc6edea58dd1
+  data.tar.gz: 5f6db056c5119510db2f00edd2351bd240f9e715e7acf6ad3da7a9dde4b0f5b9fb8b187b97641532dc6215b72dcb9317150da7260ae39235ab3ed22968bd9f67
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,7 @@
+## 0.3.7 - 2016-08-03
+
+* [maintenance] Fix Thread.new to use thread local variables to avoid nil idx error (thanks to @shyouhei and @umisora)
+
 ## 0.3.6 - 2016-06-15

 * [maintenance] if `is_skip_job_result_check` is true, skip output_rows checking (thanks to @joker1007)
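The 0.3.7 entry above corresponds to the load_in_parallel hunk further down this page: values are now passed into Thread.new(path, idx) and received as block parameters instead of being captured from the enclosing scope. The following standalone sketch (not taken from the plugin) illustrates why that pattern is safer when a captured variable can change before a thread gets scheduled:

# Illustration only. A thread block that reads a variable from the enclosing
# scope sees whatever value that variable holds when the thread actually runs.
threads = []
shared_idx = nil
%w[a b c].each_with_index do |path, i|
  shared_idx = i
  threads << Thread.new { [shared_idx, path] }  # racy: shared_idx is shared by every thread
end
p threads.map(&:value)  # indexes may repeat, e.g. [[2, "a"], [2, "b"], [2, "c"]]

# Passing the values to Thread.new copies them into block-local parameters:
threads = []
%w[a b c].each_with_index do |path, i|
  threads << Thread.new(path, i) { |path, idx| [idx, path] }  # safe: each thread owns its copies
end
p threads.map(&:value)  # always [[0, "a"], [1, "b"], [2, "c"]]

Passing the values as Thread.new arguments makes them local to each thread's block, so later changes to outer variables cannot leak in, which is the hazard the "nil idx" fix addresses.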
embulk-output-bigquery.gemspec
CHANGED

@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.6"
+  spec.version = "0.3.7"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
lib/embulk/output/bigquery/bigquery_client.rb
CHANGED

@@ -40,7 +40,7 @@ module Embulk
           self.fields
         end

-        def
+        def with_job_retry(&block)
           retries = 0
           begin
             yield
@@ -59,7 +59,7 @@ module Embulk
         # @params gcs_patsh [Array] arary of gcs paths such as gs://bucket/path
         # @return [Array] responses
         def load_from_gcs(object_uris, table)
-
+          with_job_retry do
             begin
               # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
               # we should generate job_id in client code, otherwise, retrying would cause duplication
@@ -99,7 +99,7 @@ module Embulk
               opts = {}

               Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-              response = client.insert_job(@project, body, opts)
+              response = with_network_retry { client.insert_job(@project, body, opts) }
               unless @task['is_skip_job_result_check']
                 response = wait_load('Load', response)
               end
@@ -128,7 +128,7 @@ module Embulk
           threads = []
           Embulk.logger.debug { "embulk-output-bigquery: LOAD IN PARALLEL #{paths}" }
           paths.each_with_index do |path, idx|
-            threads << Thread.new do
+            threads << Thread.new(path, idx) do |path, idx|
               # I am not sure whether google-api-ruby-client is thread-safe,
               # so let me create new instances for each thread for safe
               bigquery = self.class.new(@task, @schema, fields)
@@ -138,13 +138,13 @@ module Embulk
           end
           ThreadsWait.all_waits(*threads) do |th|
             idx, response = th.value # raise errors occurred in threads
-            responses[idx] = response
+            responses[idx] = response
           end
           responses
         end

         def load(path, table)
-
+          with_job_retry do
             begin
               if File.exist?(path)
                 # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
@@ -196,7 +196,7 @@ module Embulk
               # },
               }
               Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-              response = client.insert_job(@project, body, opts)
+              response = with_network_retry { client.insert_job(@project, body, opts) }
               if @task['is_skip_job_result_check']
                 response
               else
@@ -213,7 +213,7 @@ module Embulk
         end

         def copy(source_table, destination_table, destination_dataset = nil, write_disposition: 'WRITE_TRUNCATE')
-
+          with_job_retry do
             begin
               destination_dataset ||= @dataset
               job_id = "embulk_copy_job_#{SecureRandom.uuid}"
@@ -248,7 +248,7 @@ module Embulk

               opts = {}
               Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
-              response = client.insert_job(@project, body, opts)
+              response = with_network_retry { client.insert_job(@project, body, opts) }
               wait_load('Copy', response)
             rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
               response = {status_code: e.status_code, message: e.message, error_class: e.class}
@@ -289,7 +289,7 @@ module Embulk
               "job_id:[#{job_id}] elapsed_time:#{elapsed.to_f}sec status:[#{status}]"
             }
             sleep wait_interval
-            _response = client.get_job(@project, job_id)
+            _response = with_network_retry { client.get_job(@project, job_id) }
           end
         end

@@ -330,7 +330,7 @@ module Embulk
           }.merge(hint)
           opts = {}
           Embulk.logger.debug { "embulk-output-bigquery: insert_dataset(#{@project}, #{dataset}, #{body}, #{opts})" }
-          client.insert_dataset(@project, body, opts)
+          with_network_retry { client.insert_dataset(@project, body, opts) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           if e.status_code == 409 && /Already Exists:/ =~ e.message
             # ignore 'Already Exists' error
@@ -349,7 +349,7 @@ module Embulk
           dataset ||= @dataset
           begin
             Embulk.logger.info { "embulk-output-bigquery: Get dataset... #{@project}:#{@dataset}" }
-            client.get_dataset(@project, dataset)
+            with_network_retry { client.get_dataset(@project, dataset) }
           rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
             if e.status_code == 404
               raise NotFoundError, "Dataset #{@project}:#{dataset} is not found"
@@ -376,7 +376,7 @@ module Embulk
           }
           opts = {}
           Embulk.logger.debug { "embulk-output-bigquery: insert_table(#{@project}, #{@dataset}, #{body}, #{opts})" }
-          client.insert_table(@project, @dataset, body, opts)
+          with_network_retry { client.insert_table(@project, @dataset, body, opts) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           if e.status_code == 409 && /Already Exists:/ =~ e.message
             # ignore 'Already Exists' error
@@ -394,7 +394,7 @@ module Embulk
         def delete_table(table)
           begin
             Embulk.logger.info { "embulk-output-bigquery: Delete table... #{@project}:#{@dataset}.#{table}" }
-            client.delete_table(@project, @dataset, table)
+            with_network_retry { client.delete_table(@project, @dataset, table) }
           rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
             if e.status_code == 404 && /Not found:/ =~ e.message
               # ignore 'Not Found' error
@@ -412,7 +412,7 @@ module Embulk
         def get_table(table)
           begin
             Embulk.logger.info { "embulk-output-bigquery: Get table... #{@project}:#{@dataset}.#{table}" }
-            client.get_table(@project, @dataset, table)
+            with_network_retry { client.get_table(@project, @dataset, table) }
           rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
             if e.status_code == 404
               raise NotFoundError, "Table #{@project}:#{@dataset}.#{table} is not found"
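Taken together, the hunks above introduce two retry layers: with_job_retry now wraps whole operations such as load_from_gcs, load, and copy, while with_network_retry wraps each individual client call (insert_job, get_job, insert_table, and so on). The bodies of those helpers are only partly visible in this diff, so the sketch below is an illustration of the layering rather than the plugin's code; the error classes, limits, and method names are invented for the example:

# Illustration only: an outer retry that re-runs a whole operation, nested
# around an inner retry that re-issues a single API call.
class TransientJobError < StandardError; end      # hypothetical job-level failure
class TransientNetworkError < StandardError; end  # hypothetical socket-level failure

def with_retries(max, error_class)
  attempts = 0
  begin
    yield
  rescue error_class
    attempts += 1
    retry if attempts < max
    raise
  end
end

def insert_job_once(path)
  @calls = (@calls || 0) + 1
  raise TransientNetworkError if @calls < 3  # pretend the first two HTTP calls drop
  "job for #{path}"
end

def load_table(path)
  with_retries(3, TransientJobError) do        # outer layer: redo the whole load
    with_retries(3, TransientNetworkError) do  # inner layer: redo one API call
      insert_job_once(path)
    end
  end
end

puts load_table("part-000.csv")  # succeeds on the third network attempt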
lib/embulk/output/bigquery/gcs_client.rb
CHANGED

@@ -29,7 +29,7 @@ module Embulk
           opts = {}

           Embulk.logger.debug { "embulk-output-bigquery: insert_bucket(#{@project}, #{body}, #{opts})" }
-          client.insert_bucket(@project, body, opts)
+          with_network_retry { client.insert_bucket(@project, body, opts) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           if e.status_code == 409 && /conflict:/ =~ e.message
             # ignore 'Already Exists' error
@@ -50,7 +50,6 @@ module Embulk
           object_uri = URI.join("gs://#{bucket}", object).to_s

           started = Time.now
-          retries = 0
           begin
             Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
             body = {
@@ -63,28 +62,13 @@ module Embulk

             Embulk.logger.debug { "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts})" }
             # memo: gcs is strongly consistent for insert (read-after-write). ref: https://cloud.google.com/storage/docs/consistency
-            client.insert_object(bucket, body, opts)
+            with_network_retry { client.insert_object(bucket, body, opts) }
           rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
             response = {status_code: e.status_code, message: e.message, error_class: e.class}
             Embulk.logger.error {
               "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
             }
             raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
-          rescue ::Java::Java.net.SocketException => e
-            # I encountered `java.net.SocketException: Broken pipe` and `Connection reset` serveral times
-            # I am doubting as this is caused by Google's unstable network
-            # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
-            if e.message == 'Broken pipe' || e.message == 'Connection reset'
-              if retries < @task['retries']
-                response = {message: e.message, error_class: e.class}
-                Embulk.logger.warn {
-                  "embulk-output-bigquery: RETRY: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
-                }
-                retries += 1 # want to share with google-api-ruby-client, but it is difficult
-                retry
-              end
-            end
-            raise e
           end
         end

@@ -111,7 +95,7 @@ module Embulk
           opts = {}

           Embulk.logger.debug { "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts})" }
-          response = client.delete_object(bucket, object, opts)
+          response = with_network_retry { client.delete_object(bucket, object, opts) }
         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
           if e.status_code == 404 # ignore 'notFound' error
             return nil
lib/embulk/output/bigquery/google_client.rb
CHANGED

@@ -64,6 +64,27 @@ module Embulk
           @cached_client_expiration = Time.now + 1800
           @cached_client = client
         end
+
+        # google-api-ruby-client itself has a retry feature, but it does not retry with SocketException
+        def with_network_retry(&block)
+          retries = 0
+          begin
+            yield
+          rescue ::Java::Java.net.SocketException, ::Java::Java.net.ConnectException => e
+            if ['Broken pipe', 'Connection reset', 'Connection timed out'].include?(e.message)
+              if retries < @task['retries']
+                retries += 1
+                Embulk.logger.warn { "embulk-output-bigquery: retry \##{retries}, #{e.class} #{e.message}" }
+                retry
+              else
+                Embulk.logger.error { "embulk-output-bigquery: retry exhausted \##{retries}, #{e.class} #{e.message}" }
+                raise e
+              end
+            else
+              raise e
+            end
+          end
+        end
       end
     end
   end
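::Java::Java.net.SocketException and ::Java::Java.net.ConnectException in the new with_network_retry are JRuby references to the underlying java.net exception classes (Embulk plugins run on JRuby, so low-level network failures surface as Java exceptions rather than Errno errors). For readers more used to CRuby, here is a rough analogue of the same wrapper; it is an illustration only, and the retried error classes and the max_retries keyword are assumptions rather than the plugin's configuration:

# Rough CRuby sketch of a network-level retry wrapper (illustration only).
def with_network_retry(max_retries: 5)
  retries = 0
  begin
    yield
  rescue Errno::EPIPE, Errno::ECONNRESET, Errno::ETIMEDOUT => e
    if retries < max_retries
      retries += 1
      warn "retry ##{retries}: #{e.class} #{e.message}"
      retry
    else
      raise
    end
  end
end

# Usage sketch: wrap a single network call, e.g.
#   with_network_retry { http.request(req) }

As in the diff, only errors that look like transient connection problems are retried; anything else propagates immediately.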
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 0.3.7
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-08-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client