embulk-output-bigquery 0.3.3 → 0.3.4
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +28 -7
- data/embulk-output-bigquery.gemspec +1 -1
- data/example/config_gcs.yml +32 -0
- data/lib/embulk/output/bigquery.rb +21 -2
- data/lib/embulk/output/bigquery/bigquery_client.rb +65 -63
- data/lib/embulk/output/bigquery/gcs_client.rb +112 -0
- data/lib/embulk/output/bigquery/google_client.rb +68 -0
- data/test/test_transaction.rb +0 -7
- metadata +5 -2
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f68ceb57a4eff6886157c585425526389623d0a2
+  data.tar.gz: b44323059a3057bb5de7fdd7b00d61ce970f3386
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5cc7b1245bda2ae8c5d581c67a09ce0685c7812658c3c47e195362290fd50c13abfb7a3e9bb2360bc01a6d6aa82009ce190bef667cfb1df2cddaeb653c162c14
+  data.tar.gz: 4f8611f292a61750568c7b15e7ae6f83bc83d09ae3f64b2359b8f6f4e4d4b7ac115e09e6e8fbb5cc2e98b89103b8d1aba0640e0d89b035dbab2e5feea0d47449
```
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
````diff
@@ -37,7 +37,7 @@ v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGE
 
 | name | type | required? | default | description |
 |:-------------------------------------|:------------|:-----------|:-------------------------|:-----------------------|
-| mode | string | optional | "append" |
+| mode | string | optional | "append" | See [Mode](#mode) |
 | auth_method | string | optional | "private_key" | `private_key` , `json_key` or `compute_engine`
 | service_account_email | string | required when auth_method is private_key | | Your Google service account email
 | p12_keyfile | string | required when auth_method is private_key | | Fullpath of private key in P12(PKCS12) format |
@@ -46,21 +46,23 @@ v0.3.x has incompatibility changes with v0.2.x. Please see [CHANGELOG.md](CHANGE
 | dataset | string | required | | dataset |
 | table | string | required | | table name |
 | auto_create_dataset | boolean | optional | false | automatically create dataset |
-| auto_create_table | boolean | optional | false |
+| auto_create_table | boolean | optional | false | See [Dynamic Table Creating](#dynamic-table-creating) |
 | schema_file | string | optional | | /path/to/schema.json |
-| template_table | string | optional | | template table name
-| prevent_duplicate_insert | boolean | optional | false |
+| template_table | string | optional | | template table name. See [Dynamic Table Creating](#dynamic-table-creating) |
+| prevent_duplicate_insert | boolean | optional | false | See [Prevent Duplication](#prevent-duplication) |
 | job_status_max_polling_time | int | optional | 3600 sec | Max job status polling time |
 | job_status_polling_interval | int | optional | 10 sec | Job status polling interval |
 | is_skip_job_result_check | boolean | optional | false | Skip waiting until the load job finishes. Available for append, or delete_in_advance mode |
 | with_rehearsal | boolean | optional | false | Load `rehearsal_counts` records as a rehearsal. A rehearsal loads into a REHEARSAL temporary table and finally deletes it. You may use this option to catch data errors at as early a stage as possible |
 | rehearsal_counts | integer | optional | 1000 | Specify the number of records to load in a rehearsal |
 | abort_on_error | boolean | optional | true if max_bad_records is 0, otherwise false | Raise an error if the number of input rows and the number of output rows do not match |
-| column_options | hash | optional | |
+| column_options | hash | optional | | See [Column Options](#column-options) |
 | default_timezone | string | optional | UTC | |
 | default_timestamp_format | string | optional | %Y-%m-%d %H:%M:%S.%6N | |
-| payload_column | string | optional | nil |
-| payload_column_index | integer | optional | nil |
+| payload_column | string | optional | nil | See [Formatter Performance Issue](#formatter-performance-issue) |
+| payload_column_index | integer | optional | nil | See [Formatter Performance Issue](#formatter-performance-issue) |
+| gcs_bucket | string | optional | nil | See [GCS Bucket](#gcs-bucket) |
+| auto_create_gcs_bucket | boolean | optional | false | See [GCS Bucket](#gcs-bucket) |
 
 Client or request options
 
@@ -345,6 +347,25 @@ out:
   prevent_duplicate_insert: true
 ```
 
+### GCS Bucket
+
+This feature is useful to reduce the number of consumed jobs, which is limited to [10,000 jobs per project per day](https://cloud.google.com/bigquery/quota-policy#import).
+
+This plugin normally loads local files into BigQuery in parallel, that is, it consumes a number of jobs, say 24 jobs on a 24-CPU-core machine for example (this depends on embulk parameters such as `min_output_tasks` and `max_threads`).
+
+BigQuery supports loading multiple files from GCS with one job (but not from local files, sigh), therefore, uploading local files to GCS and then loading from GCS into BigQuery reduces the number of consumed jobs.
+
+Using the `gcs_bucket` option enables this strategy. You may also use `auto_create_gcs_bucket` to create the specified GCS bucket automatically.
+
+```yaml
+out:
+  type: bigquery
+  gcs_bucket: bucket_name
+  auto_create_gcs_bucket: false
+```
+
+ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers into GCS.
+
 ## Development
 
 ### Run example:
````
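As a back-of-the-envelope illustration of the job savings the new GCS Bucket section describes, here is a minimal Ruby sketch; the file count and names are assumptions for illustration, not taken from the plugin:

```ruby
# Hypothetical run: 24 embulk output tasks each wrote one local file.
paths = (0...24).map { |i| "/tmp/embulk_output_task_#{i}.csv.gz" }

jobs_direct  = paths.size # direct loading consumes one BigQuery load job per file
jobs_via_gcs = 1          # one load job can list many gs:// source URIs

puts "direct: #{jobs_direct} jobs, via gcs_bucket: #{jobs_via_gcs} job"
```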
data/embulk-output-bigquery.gemspec
CHANGED
```diff
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-bigquery"
-  spec.version = "0.3.3"
+  spec.version = "0.3.4"
   spec.authors = ["Satoshi Akama", "Naotoshi Seo"]
   spec.summary = "Google BigQuery output plugin for Embulk"
   spec.description = "Embulk plugin that insert records to Google BigQuery."
```
data/example/config_gcs.yml
ADDED
```yaml
in:
  type: file
  path_prefix: example/example.csv
  parser:
    type: csv
    charset: UTF-8
    newline: CRLF
    null_string: 'NULL'
    skip_header_lines: 1
    comment_line_marker: '#'
    columns:
    - {name: date, type: string}
    - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
    - {name: "null", type: string}
    - {name: long, type: long}
    - {name: string, type: string}
    - {name: double, type: double}
    - {name: boolean, type: boolean}
out:
  type: bigquery
  mode: replace
  auth_method: json_key
  json_keyfile: example/your-project-000.json
  dataset: your_dataset_name
  table: your_table_name
  source_format: NEWLINE_DELIMITED_JSON
  compression: GZIP
  auto_create_dataset: true
  auto_create_table: true
  schema_file: example/schema.json
  gcs_bucket: your_bucket_name
  auto_create_gcs_bucket: true
```
data/lib/embulk/output/bigquery.rb
CHANGED
```diff
@@ -1,8 +1,10 @@
+require 'uri'
 require 'json'
 require 'tempfile'
 require 'fileutils'
 require 'securerandom'
 require_relative 'bigquery/bigquery_client'
+require_relative 'bigquery/gcs_client'
 require_relative 'bigquery/file_writer'
 require_relative 'bigquery/value_converter_factory'
 
@@ -73,6 +75,9 @@ module Embulk
           'skip_file_generation' => config.param('skip_file_generation', :bool, :default => false),
           'compression' => config.param('compression', :string, :default => 'NONE'),
 
+          'gcs_bucket' => config.param('gcs_bucket', :string, :default => nil),
+          'auto_create_gcs_bucket' => config.param('auto_create_gcs_bucket', :bool, :default => false),
+
           'source_format' => config.param('source_format', :string, :default => 'CSV'),
           'max_bad_records' => config.param('max_bad_records', :integer, :default => 0),
           'field_delimiter' => config.param('field_delimiter', :string, :default => ','),
@@ -312,8 +317,22 @@ module Embulk
         if task['skip_load'] # only for debug
           Embulk.logger.info { "embulk-output-bigquery: Skip load" }
         else
-          target_table = task['temp_table'] ? task['temp_table'] : task['table']
-          responses = bigquery.load_in_parallel(paths, target_table)
+          if !paths.empty?
+            target_table = task['temp_table'] ? task['temp_table'] : task['table']
+            if bucket = task['gcs_bucket']
+              gcs = GcsClient.new(task)
+              gcs.insert_bucket(bucket) if task['auto_create_gcs_bucket']
+              objects = paths.size.times.map { SecureRandom.uuid.to_s }
+              gcs.insert_objects(paths, objects: objects, bucket: bucket)
+              object_uris = objects.map {|object| URI.join("gs://#{bucket}", object).to_s }
+              responses = bigquery.load_from_gcs(object_uris, target_table)
+              objects.each {|object| gcs.delete_object(object, bucket: bucket) }
+            else
+              responses = bigquery.load_in_parallel(paths, target_table)
+            end
+          else
+            responses = []
+          end
         end
         transaction_report = self.transaction_report(task, responses)
         Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
```
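The staging branch above names each uploaded object with a random UUID and derives the gs:// URI via `URI.join`. A standalone sketch of that derivation, with an assumed bucket and file names (the plugin takes them from `task['gcs_bucket']` and the files it wrote):

```ruby
require 'uri'
require 'securerandom'

bucket = 'my-staging-bucket' # assumed; the plugin reads task['gcs_bucket']
paths  = ['/tmp/embulk_output_0.csv.gz', '/tmp/embulk_output_1.csv.gz']

# One random object name per local file, mirroring the diff above
objects     = paths.size.times.map { SecureRandom.uuid.to_s }
object_uris = objects.map { |object| URI.join("gs://#{bucket}", object).to_s }

puts object_uris # e.g. gs://my-staging-bucket/123e4567-e89b-12d3-a456-426655440000
```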
data/lib/embulk/output/bigquery/bigquery_client.rb
CHANGED
```diff
@@ -1,76 +1,22 @@
 require 'google/apis/bigquery_v2'
-require 'google/api_client/auth/key_utils'
 require 'json'
 require 'thwait'
+require_relative 'google_client'
 require_relative 'helper'
 
 module Embulk
   module Output
     class Bigquery < OutputPlugin
-      class Error < StandardError; end
-      class JobTimeoutError < Error; end
-      class NotFoundError < Error; end
-
-      class BigqueryClient
+      class BigqueryClient < GoogleClient
         def initialize(task, schema, fields = nil)
-          @task = task
-          @schema = schema
-
-          @project = task['project']
-          @dataset = task['dataset']
-
-          reset_fields(fields) if fields
-        end
-
-        def client
-          return @cached_client if @cached_client && @cached_client_expiration > Time.now
-
-          client = Google::Apis::BigqueryV2::BigqueryService.new
-          client.client_options.application_name = @task['application_name']
-          client.request_options.retries = @task['retries']
-          client.request_options.timeout_sec = @task['timeout_sec']
-          client.request_options.open_timeout_sec = @task['open_timeout_sec']
-          Embulk.logger.debug { "embulk-output-bigquery: client_options: #{client.client_options.to_h}" }
-          Embulk.logger.debug { "embulk-output-bigquery: request_options: #{client.request_options.to_h}" }
-
           scope = "https://www.googleapis.com/auth/bigquery"
+          client_class = Google::Apis::BigqueryV2::BigqueryService
+          super(task, scope, client_class)
 
-          case @task['auth_method']
-          when 'private_key'
-            private_key_passphrase = 'notasecret'
-            key = Google::APIClient::KeyUtils.load_from_pkcs12(@task['p12_keyfile'], private_key_passphrase)
-            auth = Signet::OAuth2::Client.new(
-              token_credential_uri: "https://accounts.google.com/o/oauth2/token",
-              audience: "https://accounts.google.com/o/oauth2/token",
-              scope: scope,
-              issuer: @task['service_account_email'],
-              signing_key: key)
-
-          when 'compute_engine'
-            auth = Google::Auth::GCECredentials.new
-
-          when 'json_key'
-            json_key = @task['json_keyfile']
-            if File.exist?(json_key)
-              auth = File.open(json_key) do |f|
-                Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
-              end
-            else
-              key = StringIO.new(json_key)
-              auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
-            end
-
-          when 'application_default'
-            auth = Google::Auth.get_application_default([scope])
-
-          else
-            raise ConfigError, "Unknown auth method: #{@task['auth_method']}"
-          end
-
-          client.authorization = auth
-
-          @cached_client_expiration = Time.now + 1800
-          @cached_client = client
+          @schema = schema
+          reset_fields(fields) if fields
+          @project = @task['project']
+          @dataset = @task['dataset']
         end
 
         def fields
@@ -94,6 +40,62 @@ module Embulk
           self.fields
         end
 
+        # @param gcs_paths [Array] array of gcs paths such as gs://bucket/path
+        # @return [Array] responses
+        def load_from_gcs(object_uris, table)
+          begin
+            # As https://cloud.google.com/bigquery/docs/managing_jobs_datasets_projects#managingjobs says,
+            # we should generate job_id in client code, otherwise, retrying would cause duplication
+            if @task['prevent_duplicate_insert'] and (@task['mode'] == 'append' or @task['mode'] == 'append_direct')
+              job_id = Helper.create_load_job_id(@task, path, fields)
+            else
+              job_id = "embulk_load_job_#{SecureRandom.uuid}"
+            end
+            Embulk.logger.info { "embulk-output-bigquery: Load job starting... job_id:[#{job_id}] #{object_uris} => #{@project}:#{@dataset}.#{table}" }
+
+            body = {
+              job_reference: {
+                project_id: @project,
+                job_id: job_id,
+              },
+              configuration: {
+                load: {
+                  destination_table: {
+                    project_id: @project,
+                    dataset_id: @dataset,
+                    table_id: table,
+                  },
+                  schema: {
+                    fields: fields,
+                  },
+                  write_disposition: 'WRITE_APPEND',
+                  source_format: @task['source_format'],
+                  max_bad_records: @task['max_bad_records'],
+                  field_delimiter: @task['source_format'] == 'CSV' ? @task['field_delimiter'] : nil,
+                  encoding: @task['encoding'],
+                  ignore_unknown_values: @task['ignore_unknown_values'],
+                  allow_quoted_newlines: @task['allow_quoted_newlines'],
+                  source_uris: object_uris,
+                }
+              }
+            }
+            opts = {}
+
+            Embulk.logger.debug { "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts})" }
+            response = client.insert_job(@project, body, opts)
+            unless @task['is_skip_job_result_check']
+              response = wait_load('Load', response)
+            end
+            [response]
+          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+            response = {status_code: e.status_code, message: e.message, error_class: e.class}
+            Embulk.logger.error {
+              "embulk-output-bigquery: insert_job(#{@project}, #{body}, #{opts}), response:#{response}"
+            }
+            raise Error, "failed to load #{object_uris} to #{@project}:#{@dataset}.#{table}, response:#{response}"
+          end
+        end
+
         def load_in_parallel(paths, table)
           return [] if paths.empty?
           # You may think as, load job is a background job, so sending requests in parallel
@@ -118,7 +120,7 @@ module Embulk
           end
           ThreadsWait.all_waits(*threads) do |th|
             idx, response = th.value # raise errors occurred in threads
-            responses[idx] = response
+            responses[idx] = response if idx
           end
           responses
         end
```
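The job_id branch in `load_from_gcs` follows the BigQuery advice cited in its comment: a client-generated, deterministic id makes a retried `insert_job` fail as a duplicate instead of loading the data twice. A rough sketch of that policy; the digest inputs here are an assumption for illustration, while the plugin itself delegates to `Helper.create_load_job_id`:

```ruby
require 'securerandom'
require 'digest/md5'

# Sketch only: deterministic id when deduplication is requested, random otherwise.
def sketch_load_job_id(task, object_uris)
  if task['prevent_duplicate_insert'] && %w[append append_direct].include?(task['mode'])
    # Assumed digest inputs; the real code uses Helper.create_load_job_id
    "embulk_load_job_#{Digest::MD5.hexdigest([task['dataset'], task['table'], object_uris].inspect)}"
  else
    "embulk_load_job_#{SecureRandom.uuid}"
  end
end

task = { 'mode' => 'append', 'prevent_duplicate_insert' => true,
         'dataset' => 'ds', 'table' => 't' }
puts sketch_load_job_id(task, ['gs://bucket/object']) # same inputs => same id
```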
data/lib/embulk/output/bigquery/gcs_client.rb
ADDED
```ruby
require 'uri'
require 'google/apis/storage_v1'
require_relative 'google_client'
require_relative 'helper'

# ToDo: Use https://cloud.google.com/storage/docs/streaming if google-api-ruby-client supports streaming transfers
# ToDo: Tests are not written because this implementation will probably be entirely changed on supporting streaming transfers
module Embulk
  module Output
    class Bigquery < OutputPlugin
      class GcsClient < GoogleClient
        def initialize(task)
          scope = "https://www.googleapis.com/auth/cloud-platform"
          client_class = Google::Apis::StorageV1::StorageService
          super(task, scope, client_class)

          @project = @task['project']
          @bucket = @task['gcs_bucket']
        end

        def insert_bucket(bucket = nil)
          bucket ||= @bucket
          begin
            Embulk.logger.info { "embulk-output-bigquery: Insert bucket... #{@project}:#{bucket}" }
            body = {
              name: bucket,
            }
            opts = {}

            Embulk.logger.debug { "embulk-output-bigquery: insert_bucket(#{@project}, #{body}, #{opts})" }
            client.insert_bucket(@project, body, opts)
          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
            if e.status_code == 409 && /conflict:/ =~ e.message
              # ignore 'Already Exists' error
              return nil
            end
            response = {status_code: e.status_code, message: e.message, error_class: e.class}
            Embulk.logger.error {
              "embulk-output-bigquery: insert_bucket(#{@project}, #{body}, #{opts}), response:#{response}"
            }
            raise Error, "failed to insert bucket #{@project}:#{bucket}, response:#{response}"
          end
        end

        def insert_object(path, object: nil, bucket: nil)
          bucket ||= @bucket
          object ||= path
          object = object.start_with?('/') ? object[1..-1] : object
          object_uri = URI.join("gs://#{bucket}", object).to_s

          started = Time.now
          begin
            Embulk.logger.info { "embulk-output-bigquery: Insert object... #{path} => #{@project}:#{object_uri}" }
            body = {
              name: object,
            }
            opts = {
              upload_source: path,
              content_type: 'application/octet-stream'
            }

            Embulk.logger.debug { "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts})" }
            # memo: gcs is strongly consistent for insert (read-after-write). ref: https://cloud.google.com/storage/docs/consistency
            client.insert_object(bucket, body, opts)
          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
            response = {status_code: e.status_code, message: e.message, error_class: e.class}
            Embulk.logger.error {
              "embulk-output-bigquery: insert_object(#{bucket}, #{body}, #{opts}), response:#{response}"
            }
            raise Error, "failed to insert object #{@project}:#{object_uri}, response:#{response}"
          end
        end

        def insert_objects(paths, objects: nil, bucket: nil)
          return [] if paths.empty?
          bucket ||= @bucket
          objects ||= paths
          raise "number of paths and objects are different" if paths.size != objects.size

          responses = []
          paths.each_with_index do |path, idx|
            object = objects[idx]
            responses << insert_object(path, object: object, bucket: bucket)
          end
          responses
        end

        def delete_object(object, bucket: nil)
          bucket ||= @bucket
          object = object.start_with?('/') ? object[1..-1] : object
          object_uri = URI.join("gs://#{bucket}", object).to_s
          begin
            Embulk.logger.info { "embulk-output-bigquery: Delete object... #{@project}:#{object_uri}" }
            opts = {}

            Embulk.logger.debug { "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts})" }
            response = client.delete_object(bucket, object, opts)
          rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
            if e.status_code == 404 # ignore 'notFound' error
              return nil
            end
            response = {status_code: e.status_code, message: e.message, error_class: e.class}
            Embulk.logger.error {
              "embulk-output-bigquery: delete_object(#{bucket}, #{object}, #{opts}), response:#{response}"
            }
            raise Error, "failed to delete object #{@project}:#{object_uri}, response:#{response}"
          end
        end
      end
    end
  end
end
```
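One small detail in `insert_object` and `delete_object` above: a leading `/` is stripped so the GCS object name and the derived gs:// URI stay consistent. A sketch with assumed values:

```ruby
require 'uri'

object = '/2016/06/part-0.gz' # a path-like name with a leading slash (assumed)
object = object.start_with?('/') ? object[1..-1] : object

puts object                                  # => 2016/06/part-0.gz
puts URI.join("gs://my-bucket", object).to_s # => gs://my-bucket/2016/06/part-0.gz
```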
data/lib/embulk/output/bigquery/google_client.rb
ADDED
```ruby
require 'google/api_client/auth/key_utils'

module Embulk
  module Output
    class Bigquery < OutputPlugin
      class Error < StandardError; end
      class JobTimeoutError < Error; end
      class NotFoundError < Error; end

      class GoogleClient
        def initialize(task, scope, client_class)
          @task = task
          @scope = scope
          @client_class = client_class
        end

        def client
          return @cached_client if @cached_client && @cached_client_expiration > Time.now

          client = @client_class.new
          client.client_options.application_name = @task['application_name']
          client.request_options.retries = @task['retries']
          client.request_options.timeout_sec = @task['timeout_sec']
          client.request_options.open_timeout_sec = @task['open_timeout_sec']
          Embulk.logger.debug { "embulk-output-bigquery: client_options: #{client.client_options.to_h}" }
          Embulk.logger.debug { "embulk-output-bigquery: request_options: #{client.request_options.to_h}" }

          case @task['auth_method']
          when 'private_key'
            private_key_passphrase = 'notasecret'
            key = Google::APIClient::KeyUtils.load_from_pkcs12(@task['p12_keyfile'], private_key_passphrase)
            auth = Signet::OAuth2::Client.new(
              token_credential_uri: "https://accounts.google.com/o/oauth2/token",
              audience: "https://accounts.google.com/o/oauth2/token",
              scope: @scope,
              issuer: @task['service_account_email'],
              signing_key: key)

          when 'compute_engine'
            auth = Google::Auth::GCECredentials.new

          when 'json_key'
            json_key = @task['json_keyfile']
            if File.exist?(json_key)
              auth = File.open(json_key) do |f|
                Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope)
              end
            else
              key = StringIO.new(json_key)
              auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope)
            end

          when 'application_default'
            auth = Google::Auth.get_application_default([@scope])

          else
            raise ConfigError, "Unknown auth method: #{@task['auth_method']}"
          end

          client.authorization = auth

          @cached_client_expiration = Time.now + 1800
          @cached_client = client
        end
      end
    end
  end
end
```
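`GoogleClient#client` above doubles as a 30-minute cache: calls within the window return the same authorized service object, and the first call after expiry rebuilds it. A stripped-down, runnable sketch of the pattern, with a stub standing in for the Google service class:

```ruby
# Minimal sketch of the caching in GoogleClient#client (stub object, names assumed).
class CachedClientSketch
  EXPIRY_SEC = 1800 # same 1800 seconds as above

  def client
    return @cached_client if @cached_client && @cached_client_expiration > Time.now

    @cached_client_expiration = Time.now + EXPIRY_SEC
    @cached_client = Object.new # stands in for @client_class.new plus authorization
  end
end

sketch = CachedClientSketch.new
puts sketch.client.equal?(sketch.client) # => true: cached within the expiry window
```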
data/test/test_transaction.rb
CHANGED
```diff
@@ -43,7 +43,6 @@ module Embulk
       any_instance_of(BigqueryClient) do |obj|
         mock(obj).get_dataset(config['dataset'])
         mock(obj).create_table(config['temp_table'])
-        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
         mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_APPEND')
         mock(obj).delete_table(config['temp_table'])
       end
@@ -56,7 +55,6 @@ module Embulk
       any_instance_of(BigqueryClient) do |obj|
         mock(obj).get_dataset(config['dataset'])
         mock(obj).get_table(config['table'])
-        mock(obj).load_in_parallel(anything, config['table']) { [] }
       end
       Bigquery.transaction(config, schema, processor_count, &control)
     end
@@ -66,7 +64,6 @@ module Embulk
       any_instance_of(BigqueryClient) do |obj|
         mock(obj).create_dataset(config['dataset'])
         mock(obj).create_table(config['table'])
-        mock(obj).load_in_parallel(anything, config['table']) { [] }
       end
       Bigquery.transaction(config, schema, processor_count, &control)
     end
@@ -78,7 +75,6 @@ module Embulk
         mock(obj).get_dataset(config['dataset'])
         mock(obj).delete_table(config['table'])
         mock(obj).create_table(config['table'])
-        mock(obj).load_in_parallel(anything, config['table']) { [] }
       end
       Bigquery.transaction(config, schema, processor_count, &control)
     end
@@ -88,7 +84,6 @@ module Embulk
       any_instance_of(BigqueryClient) do |obj|
         mock(obj).get_dataset(config['dataset'])
         mock(obj).create_table(config['temp_table'])
-        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
         mock(obj).copy(config['temp_table'], config['table'], write_disposition: 'WRITE_TRUNCATE')
         mock(obj).delete_table(config['temp_table'])
       end
@@ -102,7 +97,6 @@ module Embulk
         mock(obj).get_dataset(config['dataset'])
         mock(obj).get_dataset(config['dataset_old'])
         mock(obj).create_table(config['temp_table'])
-        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
 
         mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 
@@ -118,7 +112,6 @@ module Embulk
         mock(obj).create_dataset(config['dataset'])
         mock(obj).create_dataset(config['dataset_old'], reference: config['dataset'])
         mock(obj).create_table(config['temp_table'])
-        mock(obj).load_in_parallel(anything, config['temp_table']) { [] }
 
         mock(obj).copy(config['table'], config['table_old'], config['dataset_old'])
 
```
metadata
CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-bigquery
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.4
 platform: ruby
 authors:
 - Satoshi Akama
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-06-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: google-api-client
@@ -101,6 +101,7 @@ files:
 - example/config_csv.yml
 - example/config_delete_in_advance.yml
 - example/config_expose_errors.yml
+- example/config_gcs.yml
 - example/config_guess_from_embulk_schema.yml
 - example/config_guess_with_column_options.yml
 - example/config_gzip.yml
@@ -136,6 +137,8 @@ files:
 - lib/embulk/output/bigquery.rb
 - lib/embulk/output/bigquery/bigquery_client.rb
 - lib/embulk/output/bigquery/file_writer.rb
+- lib/embulk/output/bigquery/gcs_client.rb
+- lib/embulk/output/bigquery/google_client.rb
 - lib/embulk/output/bigquery/helper.rb
 - lib/embulk/output/bigquery/value_converter_factory.rb
 - test/helper.rb
```