embulk-output-bigquery 0.2.3 → 0.3.0.pre1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +6 -12
- data/CHANGELOG.md +18 -0
- data/Gemfile +8 -0
- data/LICENSE.txt +20 -0
- data/README.md +165 -39
- data/Rakefile +11 -0
- data/embulk-output-bigquery.gemspec +20 -0
- data/example/config_client_options.yml +33 -0
- data/example/config_csv.yml +30 -0
- data/example/config_delete_in_advance.yml +29 -0
- data/example/config_expose_errors.yml +30 -0
- data/example/config_guess_from_embulk_schema.yml +29 -0
- data/example/config_guess_with_column_options.yml +40 -0
- data/example/config_gzip.yml +30 -0
- data/example/config_jsonl.yml +30 -0
- data/example/config_mode_append.yml +30 -0
- data/example/config_mode_append_direct.yml +30 -0
- data/example/config_payload_column.yml +20 -0
- data/example/config_payload_column_index.yml +20 -0
- data/example/config_prevent_duplicate_insert.yml +30 -0
- data/example/config_replace.yml +30 -0
- data/example/config_replace_backup.yml +32 -0
- data/example/config_skip_file_generation.yml +32 -0
- data/example/config_table_strftime.yml +30 -0
- data/example/config_template_table.yml +21 -0
- data/example/config_uncompressed.yml +30 -0
- data/example/config_with_rehearsal.yml +32 -0
- data/example/example.csv +17 -0
- data/example/example.jsonl +16 -0
- data/example/example.yml +30 -0
- data/example/json_key.json +12 -0
- data/example/nested_example.jsonl +16 -0
- data/example/schema.json +30 -0
- data/example/schema_expose_errors.json +30 -0
- data/lib/embulk/output/bigquery.rb +388 -3
- data/lib/embulk/output/bigquery/bigquery_client.rb +396 -0
- data/lib/embulk/output/bigquery/file_writer.rb +103 -0
- data/lib/embulk/output/bigquery/helper.rb +78 -0
- data/lib/embulk/output/bigquery/value_converter_factory.rb +292 -0
- data/test/helper.rb +13 -0
- data/test/test_bigquery_client.rb +166 -0
- data/test/test_configure.rb +254 -0
- data/test/test_example.rb +34 -0
- data/test/test_file_writer.rb +129 -0
- data/test/test_helper.rb +103 -0
- data/test/test_transaction.rb +129 -0
- data/test/test_value_converter_factory.rb +316 -0
- metadata +114 -45
- data/build.gradle +0 -80
- data/config/checkstyle/checkstyle.xml +0 -128
- data/config/checkstyle/default.xml +0 -108
- data/gradle/wrapper/gradle-wrapper.jar +0 -0
- data/gradle/wrapper/gradle-wrapper.properties +0 -6
- data/gradlew +0 -164
- data/gradlew.bat +0 -90
- data/settings.gradle +0 -2
- data/src/main/java/org/embulk/output/BigqueryAuthentication.java +0 -117
- data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +0 -508
- data/src/main/java/org/embulk/output/BigqueryWriter.java +0 -575
- data/src/test/java/org/embulk/output/TestBigqueryAuthentication.java +0 -5
- data/src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java +0 -5
- data/src/test/java/org/embulk/output/TestBigqueryWriter.java +0 -5
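
Taken together, the file list shows a ground-up rewrite of the plugin: the Java implementation (build.gradle, the Gradle wrapper, the checkstyle configs, and everything under data/src/main/java) is removed, while a pure-Ruby implementation under data/lib/embulk/output/bigquery/, a set of example configs, and a Ruby test suite are added. The hunks below reproduce the new example files and the rewritten plugin entry point.
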
data/example/config_template_table.yml
ADDED
@@ -0,0 +1,21 @@
+# embulk gem install embulk-parser-none
+in:
+  type: file
+  path_prefix: example/example.jsonl
+  parser:
+    type: none
+    column_name: payload
+out:
+  type: bigquery
+  mode: replace
+  auth_method: json_key
+  json_keyfile: /tmp/your-project-000.json
+  dataset: your_dataset_name
+  table: your_table_name_%Y%m%d
+  compression: GZIP
+  source_format: NEWLINE_DELIMITED_JSON
+  auto_create_dataset: true
+  auto_create_table: true
+  template_table: your_table_name
+  payload_column: payload
+  skip_load: true  # for debug
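
This example pairs embulk-parser-none with `payload_column`: each input line arrives as a single `payload` string column and is loaded verbatim as NEWLINE_DELIMITED_JSON, with the table schema copied from `template_table`. The `%Y%m%d` in the table name is expanded with Ruby's strftime once, at configure time (see the rewritten bigquery.rb later in this diff). A minimal sketch of that expansion:

    # Table names run through Time#strftime when the transaction is
    # configured, not per record.
    table = 'your_table_name_%Y%m%d'
    puts Time.now.strftime(table)   # e.g. "your_table_name_20160120"
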
data/example/config_uncompressed.yml
ADDED
@@ -0,0 +1,30 @@
+in:
+  type: file
+  path_prefix: example/example.csv
+  parser:
+    type: csv
+    charset: UTF-8
+    newline: CRLF
+    null_string: 'NULL'
+    skip_header_lines: 1
+    comment_line_marker: '#'
+    columns:
+      - {name: date, type: string}
+      - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+      - {name: "null", type: string}
+      - {name: long, type: long}
+      - {name: string, type: string}
+      - {name: double, type: double}
+      - {name: boolean, type: boolean}
+out:
+  type: bigquery
+  mode: replace
+  auth_method: json_key
+  json_keyfile: /tmp/your-project-000.json
+  dataset: your_dataset_name
+  table: your_table_name
+  source_format: NEWLINE_DELIMITED_JSON
+  compression: NONE
+  auto_create_dataset: true
+  auto_create_table: true
+  schema_file: example/schema.json
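
When `file_ext` is not set, the plugin derives the local file extension from `source_format` and `compression` (the logic appears in `configure` in the Ruby hunk below). A standalone sketch of the same rule:

    # Mirrors the configure logic shown later in this diff: CSV -> .csv,
    # NEWLINE_DELIMITED_JSON -> .jsonl, with .gz appended under GZIP.
    def default_file_ext(source_format, compression)
      ext = source_format == 'CSV' ? '.csv' : '.jsonl'
      ext += '.gz' if compression == 'GZIP'
      ext
    end

    puts default_file_ext('NEWLINE_DELIMITED_JSON', 'NONE')  # => ".jsonl"
    puts default_file_ext('CSV', 'GZIP')                     # => ".csv.gz"
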
data/example/config_with_rehearsal.yml
ADDED
@@ -0,0 +1,32 @@
+in:
+  type: file
+  path_prefix: example/example.csv
+  parser:
+    type: csv
+    charset: UTF-8
+    newline: CRLF
+    null_string: 'NULL'
+    skip_header_lines: 1
+    comment_line_marker: '#'
+    columns:
+      - {name: date, type: string}
+      - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+      - {name: "null", type: string}
+      - {name: long, type: long}
+      - {name: string, type: string}
+      - {name: double, type: double}
+      - {name: boolean, type: boolean}
+out:
+  type: bigquery
+  mode: replace
+  auth_method: json_key
+  json_keyfile: /tmp/your-project-000.json
+  dataset: your_dataset_name
+  table: your_table_name
+  source_format: NEWLINE_DELIMITED_JSON
+  auto_create_dataset: true
+  auto_create_table: true
+  schema_file: example/schema.json
+  with_rehearsal: true
+  rehearsal_counts: 1
+  skip_load: true  # for debug
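
`with_rehearsal: true` makes the first output thread trial-load the first `rehearsal_counts` rows into a throwaway `_LOAD_REHEARSAL_` table before the real load, so schema mismatches surface early; `rehearsal_counts: 1` here keeps the trial tiny. The rehearsal (and temp) table names carry a uniqueness suffix built from the pid and the current time, as in `configure` below:

    # Sketch of the unique suffix used for temp/rehearsal table names.
    now = Time.now
    unique_name = "%08x%08x%08x" % [Process.pid, now.tv_sec, now.tv_nsec]
    puts "your_table_name_LOAD_REHEARSAL_#{unique_name}"
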
data/example/example.csv
ADDED
@@ -0,0 +1,17 @@
+date,timestamp,null,long,string,double,boolean
+2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,true
+2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,true
+2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,true
+2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,true
+2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,true
+2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,true
+2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,true
+2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,true
+2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,false
+2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,false
+2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,false
+2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,false
+2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,false
+2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,false
+2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,false
+2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,false
data/example/example.jsonl
ADDED
@@ -0,0 +1,16 @@
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"boolean":false}
data/example/example.yml
ADDED
@@ -0,0 +1,30 @@
+in:
+  type: file
+  path_prefix: example/example.csv
+  parser:
+    type: csv
+    charset: UTF-8
+    newline: CRLF
+    null_string: 'NULL'
+    skip_header_lines: 1
+    comment_line_marker: '#'
+    columns:
+      - {name: date, type: string}
+      - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+      - {name: "null", type: string}
+      - {name: long, type: long}
+      - {name: string, type: string}
+      - {name: double, type: double}
+      - {name: boolean, type: boolean}
+out:
+  type: bigquery
+  mode: replace
+  auth_method: json_key
+  json_keyfile: /tmp/your-project-000.json
+  dataset: your_dataset_name
+  table: your_table_name
+  source_format: NEWLINE_DELIMITED_JSON
+  compression: NONE
+  auto_create_dataset: true
+  auto_create_table: true
+  schema_file: example/schema.json
data/example/json_key.json
ADDED
@@ -0,0 +1,12 @@
+{
+  "type": "service_account",
+  "project_id": "your_project_name",
+  "private_key_id": "your_private_key_id",
+  "private_key": "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n",
+  "client_email": "your_service_account_email",
+  "client_id": "your_client_id",
+  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+  "token_uri": "https://accounts.google.com/o/oauth2/token",
+  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/account-3%40your_project_name.iam.gserviceaccount.com"
+}
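
With `auth_method: json_key`, `json_keyfile` may be a path (as in the examples) or inline content under a `content` key, and `project` falls back to the key file's `project_id`. A condensed sketch of that resolution, following the LocalFile class and the configure checks in the Ruby hunk below (assuming the example key file is on disk):

    require 'json'

    # Mirrors LocalFile.load below: a String is a path, a Hash carries
    # inline content (the YAML `json_keyfile: {content: ...}` form).
    def load_keyfile(v)
      v.is_a?(String) ? File.read(v) : v['content']
    end

    key = JSON.parse(load_keyfile('example/json_key.json'))
    project = key['project_id']   # default for `project` when unset
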
data/example/nested_example.jsonl
ADDED
@@ -0,0 +1,16 @@
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+{"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":false}
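
nested_example.jsonl adds a `json` column holding an object. How json-typed values are converted lives in value_converter_factory.rb, which this section does not show; a reasonable mental model (an assumption here, not quoted source) is serialization back to a JSON string:

    require 'json'

    # Assumption for illustration only: the exact conversion rules live
    # in value_converter_factory.rb (not shown in this section).
    value = { 'k1' => 'v1', 'k2' => 'v2' }
    puts value.to_json   # => {"k1":"v1","k2":"v2"}
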
data/example/schema.json
ADDED
@@ -0,0 +1,30 @@
+[
+  {
+    "name":"date",
+    "type":"STRING"
+  },
+  {
+    "name":"timestamp",
+    "type":"TIMESTAMP"
+  },
+  {
+    "name":"null",
+    "type":"STRING"
+  },
+  {
+    "name":"long",
+    "type":"INTEGER"
+  },
+  {
+    "name":"string",
+    "type":"STRING"
+  },
+  {
+    "name":"double",
+    "type":"FLOAT"
+  },
+  {
+    "name":"boolean",
+    "type":"BOOLEAN"
+  }
+]
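
The schema file spells out the BigQuery types the examples expect for each Embulk column type. Read as a mapping (derived from the column definitions in the example configs above; the plugin's own guessing logic lives in helper.rb, not shown here):

    # Embulk-to-BigQuery type pairs implied by schema.json and the
    # example column definitions.
    EMBULK_TO_BIGQUERY = {
      string:    'STRING',
      long:      'INTEGER',
      double:    'FLOAT',
      boolean:   'BOOLEAN',
      timestamp: 'TIMESTAMP',
    }
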
data/example/schema_expose_errors.json
ADDED
@@ -0,0 +1,30 @@
+[
+  {
+    "name":"dat",
+    "type":"STRING"
+  },
+  {
+    "name":"timestamp",
+    "type":"TIMESTAMP"
+  },
+  {
+    "name":"null",
+    "type":"STRING"
+  },
+  {
+    "name":"long",
+    "type":"INTEGER"
+  },
+  {
+    "name":"string",
+    "type":"STRING"
+  },
+  {
+    "name":"double",
+    "type":"FLOAT"
+  },
+  {
+    "name":"boolean",
+    "type":"BOOLEAN"
+  }
+]
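
schema_expose_errors.json is identical to schema.json except its first field is named `dat` rather than `date`; loading the example data against it provokes load errors, presumably for use with config_expose_errors.yml (listed above but not shown in this section). A quick check of the difference:

    require 'json'

    # The only difference between the two schema files is one field name.
    a = JSON.parse(File.read('example/schema.json'))
    b = JSON.parse(File.read('example/schema_expose_errors.json'))
    p a.map {|f| f['name'] } - b.map {|f| f['name'] }   # => ["date"]
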
data/lib/embulk/output/bigquery.rb
CHANGED
@@ -1,3 +1,388 @@
-
-
-
+require 'json'
+require 'tempfile'
+require_relative 'bigquery/bigquery_client'
+require_relative 'bigquery/file_writer'
+require_relative 'bigquery/value_converter_factory'
+
+module Embulk
+  module Output
+    class Bigquery < OutputPlugin
+      Plugin.register_output('bigquery', self)
+
+      class Error < StandardError; end
+
+      # To support configuration like below as org.embulk.spi.unit.LocalFile
+      #
+      # json_keyfile:
+      #   content: |
+      class LocalFile
+        # @return JSON string
+        def self.load(v)
+          if v.is_a?(String) # path
+            File.read(v)
+          elsif v.is_a?(Hash)
+            v['content']
+          end
+        end
+      end
+
+      def self.configure(config, schema, processor_count)
+        task = {
+          'mode' => config.param('mode', :string, :default => 'append'),
+          'auth_method' => config.param('auth_method', :string, :default => 'private_key'),
+          'service_account_email' => config.param('service_account_email', :string, :default => nil),
+          'p12_keyfile' => config.param('p12_keyfile', :string, :default => nil),
+          'json_keyfile' => config.param('json_keyfile', LocalFile, :default => nil),
+          'project' => config.param('project', :string, :default => nil),
+          'dataset' => config.param('dataset', :string),
+          'table' => config.param('table', :string),
+          'dataset_old' => config.param('dataset_old', :string, :default => nil),
+          'table_old' => config.param('table_old', :string, :default => nil),
+          'table_name_old' => config.param('table_name_old', :string, :default => nil), # lower version compatibility
+          'auto_create_dataset' => config.param('auto_create_dataset', :bool, :default => false),
+          'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
+          'schema_file' => config.param('schema_file', :string, :default => nil),
+          'template_table' => config.param('template_table', :string, :default => nil),
+          'delete_from_local_when_job_end' => config.param('delete_from_local_when_job_end', :bool, :default => true),
+          'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
+          'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
+          'is_skip_job_result_check' => config.param('is_skip_job_result_check', :bool, :default => false),
+          'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
+          'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
+          'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
+
+          'column_options' => config.param('column_options', :array, :default => []),
+          'default_timezone' => config.param('default_timezone', :string, :default => ValueConverterFactory::DEFAULT_TIMEZONE),
+          'default_timestamp_format' => config.param('default_timestamp_format', :string, :default => ValueConverterFactory::DEFAULT_TIMESTAMP_FORMAT),
+          'payload_column' => config.param('payload_column', :string, :default => nil),
+          'payload_column_index' => config.param('payload_column_index', :integer, :default => nil),
+
+          'timeout_sec' => config.param('timeout_sec', :integer, :default => 300),
+          'open_timeout_sec' => config.param('open_timeout_sec', :integer, :default => 300),
+          'retries' => config.param('retries', :integer, :default => 5),
+          'application_name' => config.param('application_name', :string, :default => 'Embulk BigQuery plugin'),
+
+          'path_prefix' => config.param('path_prefix', :string, :default => nil),
+          'sequence_format' => config.param('sequence_format', :string, :default => '.%d.%03d'),
+          'file_ext' => config.param('file_ext', :string, :default => nil),
+          'skip_file_generation' => config.param('skip_file_generation', :bool, :default => false),
+          'compression' => config.param('compression', :string, :default => 'NONE'),
+
+          'source_format' => config.param('source_format', :string, :default => 'CSV'),
+          'max_bad_records' => config.param('max_bad_records', :integer, :default => 0),
+          'field_delimiter' => config.param('field_delimiter', :string, :default => ','),
+          'encoding' => config.param('encoding', :string, :default => 'UTF-8'),
+          'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
+          'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
+
+          # for debug
+          'skip_load' => config.param('skip_load', :bool, :default => false),
+          'temp_table' => config.param('temp_table', :string, :default => nil),
+          'rehearsal_table' => config.param('rehearsal_table', :string, :default => nil),
+        }
+
+        now = Time.now
+
+        task['mode'] = task['mode'].downcase
+        unless %w[append append_direct replace delete_in_advance replace_backup].include?(task['mode'])
+          raise ConfigError.new "`mode` must be one of append, append_direct, replace, delete_in_advance, replace_backup"
+        end
+
+        if task['mode'] == 'replace_backup'
+          task['table_old'] ||= task['table_name_old'] # for lower version compatibility
+          if task['dataset_old'].nil? and task['table_old'].nil?
+            raise ConfigError.new "`mode replace_backup` requires either of `dataset_old` or `table_old`"
+          end
+          task['dataset_old'] ||= task['dataset']
+          task['table_old'] ||= task['table']
+        end
+
+        if task['table_old']
+          task['table_old'] = now.strftime(task['table_old'])
+        end
+        if task['table']
+          task['table'] = now.strftime(task['table'])
+        end
+
+        task['auth_method'] = task['auth_method'].downcase
+        unless %w[private_key json_key compute_engine].include?(task['auth_method'])
+          raise ConfigError.new "`auth_method` must be one of private_key, json_key, compute_engine"
+        end
+        if task['auth_method'] == 'private_key' and task['p12_keyfile'].nil?
+          raise ConfigError.new "`p12_keyfile` is required for auth_method private_key"
+        end
+        if task['auth_method'] == 'json_key' and task['json_keyfile'].nil?
+          raise ConfigError.new "`json_keyfile` is required for auth_method json_key"
+        end
+
+        jsonkey_params = nil
+        if task['json_keyfile']
+          begin
+            jsonkey_params = JSON.parse(task['json_keyfile'])
+          rescue => e
+            raise ConfigError.new "json_keyfile is not a JSON file"
+          end
+        end
+
+        if jsonkey_params
+          task['project'] ||= jsonkey_params['project_id']
+        end
+        if task['project'].nil?
+          raise ConfigError.new "Required field \"project\" is not set"
+        end
+
+        if (task['payload_column'] or task['payload_column_index']) and task['auto_create_table']
+          if task['schema_file'].nil? and task['template_table'].nil?
+            raise ConfigError.new "Cannot guess table schema from Embulk schema with `payload_column` or `payload_column_index`. Either of `schema_file` or `template_table` is required for auto_create_table true"
+          end
+        end
+
+        if task['payload_column_index']
+          if task['payload_column_index'] < 0 || schema.size <= task['payload_column_index']
+            raise ConfigError.new "payload_column_index #{task['payload_column_index']} is out of schema size"
+          end
+        elsif task['payload_column']
+          task['payload_column_index'] = schema.find_index {|c| c[:name] == task['payload_column'] }
+          if task['payload_column_index'].nil?
+            raise ConfigError.new "payload_column #{task['payload_column']} does not exist in schema"
+          end
+        end
+
+        if task['schema_file']
+          unless File.exist?(task['schema_file'])
+            raise ConfigError.new "schema_file #{task['schema_file']} is not found"
+          end
+          begin
+            JSON.parse(File.read(task['schema_file']))
+          rescue => e
+            raise ConfigError.new "schema_file #{task['schema_file']} is not a JSON file"
+          end
+        end
+
+        if task['path_prefix'].nil?
+          task['path_prefix'] = Tempfile.create('embulk_output_bigquery_') {|fp| fp.path }
+        end
+
+        task['source_format'] = task['source_format'].upcase
+        if task['source_format'] == 'JSONL'
+          task['source_format'] = 'NEWLINE_DELIMITED_JSON'
+        end
+        unless %w[CSV NEWLINE_DELIMITED_JSON].include?(task['source_format'])
+          raise ConfigError.new "`source_format` must be CSV or NEWLINE_DELIMITED_JSON (JSONL)"
+        end
+
+        task['compression'] = task['compression'].upcase
+        unless %w[GZIP NONE].include?(task['compression'])
+          raise ConfigError.new "`compression` must be GZIP or NONE"
+        end
+
+        if task['file_ext'].nil?
+          case task['source_format']
+          when 'CSV'
+            file_ext = '.csv'
+          else # newline_delimited_json
+            file_ext = '.jsonl'
+          end
+          case task['compression']
+          when 'GZIP'
+            file_ext << '.gz'
+          end
+          task['file_ext'] = file_ext
+        end
+
+        unique_name = "%08x%08x%08x" % [Process.pid, now.tv_sec, now.tv_nsec]
+
+        if %w[replace replace_backup append].include?(task['mode'])
+          task['temp_table'] ||= "#{task['table']}_LOAD_TEMP_#{unique_name}"
+        end
+
+        if task['with_rehearsal']
+          task['rehearsal_table'] ||= "#{task['table']}_LOAD_REHEARSAL_#{unique_name}"
+        end
+
+        task
+      end
+
+      def self.bigquery
+        @bigquery
+      end
+
+      def self.converters
+        @converters
+      end
+
+      def self.transaction_report(task_reports, responses)
+        transaction_report = {
+          'num_input_rows' => 0,
+          'num_output_rows' => 0,
+          'num_rejected_rows' => 0,
+        }
+        (0...task_reports.size).each do |idx|
+          task_report = task_reports[idx]
+          response = responses[idx]
+          num_input_rows = task_report['num_input_rows']
+          num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
+          num_rejected_rows = num_input_rows - num_output_rows
+          transaction_report['num_input_rows'] += num_input_rows
+          transaction_report['num_output_rows'] += num_output_rows
+          transaction_report['num_rejected_rows'] += num_rejected_rows
+        end
+        transaction_report
+      end
+
+      def self.transaction(config, schema, processor_count, &control)
+        task = self.configure(config, schema, processor_count)
+
+        @task = task
+        @schema = schema
+        @bigquery = BigqueryClient.new(task, schema)
+        @converters = ValueConverterFactory.create_converters(task, schema)
+
+        if task['auto_create_dataset']
+          bigquery.create_dataset(task['dataset'])
+        else
+          bigquery.get_dataset(task['dataset']) # raises NotFoundError
+        end
+
+        if task['mode'] == 'replace_backup' and task['dataset_old'] != task['dataset']
+          if task['auto_create_dataset']
+            bigquery.create_dataset(task['dataset_old'], reference: task['dataset'])
+          else
+            bigquery.get_dataset(task['dataset_old']) # raises NotFoundError
+          end
+        end
+
+        case task['mode']
+        when 'delete_in_advance'
+          bigquery.delete_table(task['table'])
+          bigquery.create_table(task['table'])
+        when 'replace', 'replace_backup', 'append'
+          bigquery.create_table(task['temp_table'])
+        else # append_direct
+          if task['auto_create_table']
+            bigquery.create_table(task['table'])
+          else
+            bigquery.get_table(task['table']) # raises NotFoundError
+          end
+        end
+
+        begin
+          paths = []
+          if task['skip_file_generation']
+            yield(task) # does nothing, but seems it has to be called
+            path_pattern = "#{task['path_prefix']}*#{task['file_ext']}"
+            Embulk.logger.info { "embulk-output-bigquery: Skip file generation. Get paths from `#{path_pattern}`" }
+            paths = Dir.glob(path_pattern)
+            task_reports = paths.map {|path| { 'path' => path, 'num_input_rows' => 0 } }
+          else
+            task_reports = yield(task) # generates local files
+            Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
+            paths = task_reports.map {|report| report['path'] }
+          end
+
+          if task['skip_load'] # only for debug
+            Embulk.logger.info { "embulk-output-bigquery: Skip load" }
+          else
+            target_table = task['temp_table'] ? task['temp_table'] : task['table']
+            responses = bigquery.load_in_parallel(paths, target_table)
+            transaction_report = self.transaction_report(task_reports, responses)
+            Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
+
+            if task['mode'] == 'replace_backup'
+              bigquery.copy(task['table'], task['table_old'], task['dataset_old'])
+            end
+
+            if task['temp_table']
+              if task['mode'] == 'append'
+                bigquery.copy(task['temp_table'], task['table'],
+                              write_disposition: 'WRITE_APPEND')
+              else # replace or replace_backup
+                bigquery.copy(task['temp_table'], task['table'],
+                              write_disposition: 'WRITE_TRUNCATE')
+              end
+            end
+          end
+        ensure
+          begin
+            if task['temp_table'] # replace or replace_backup
+              bigquery.delete_table(task['temp_table'])
+            end
+          ensure
+            if task['delete_from_local_when_job_end']
+              paths.each do |path|
+                Embulk.logger.info { "delete #{path}" }
+                File.unlink(path) rescue nil
+              end
+            else
+              paths.each do |path|
+                if File.exist?(path)
+                  Embulk.logger.info { "#{path} is left" }
+                end
+              end
+            end
+          end
+        end
+
+        # this is for the -c next_config option; add some parameters for the next execution if wanted
+        next_config_diff = {}
+        return next_config_diff
+      end
+
+      # instance is created on each thread
+      def initialize(task, schema, index)
+        super
+
+        if task['with_rehearsal'] and @index == 0
+          @bigquery = self.class.bigquery
+          @rehearsaled = false
+          @num_rows = 0
+        end
+
+        unless task['skip_file_generation']
+          @file_writer = FileWriter.new(task, schema, index, self.class.converters)
+        end
+      end
+
+      # called for each page in each thread
+      def close
+      end
+
+      # called for each page in each thread
+      def add(page)
+        if task['with_rehearsal'] and @index == 0 and !@rehearsaled
+          page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
+          if @num_rows > task['rehearsal_counts']
+            Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
+            begin
+              @bigquery.create_table(task['rehearsal_table'])
+              @bigquery.load(@file_writer.path, task['rehearsal_table'])
+            ensure
+              @bigquery.delete_table(task['rehearsal_table'])
+            end
+            @rehearsaled = true
+          end
+          @num_rows += page.to_a.size
+        end
+
+        unless task['skip_file_generation']
+          @file_writer.add(page)
+        end
+      end
+
+      def finish
+      end
+
+      def abort
+      end
+
+      # called after processing all pages in each thread, returns a task_report
+      def commit
+        unless task['skip_file_generation']
+          @file_writer.commit
+        else
+          {}
+        end
+      end
+    end
+  end
+end
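
For replace, replace_backup, and append, data is first loaded into a `_LOAD_TEMP_` table and then copied to the target with WRITE_TRUNCATE (the replace variants) or WRITE_APPEND (append); append_direct loads straight into the target, and delete_in_advance drops and recreates it first. `transaction_report` then reconciles input rows against what BigQuery reports. A self-contained sketch of that aggregation, with plain integers standing in for `response.statistics.load.output_rows`:

    # Mirrors the transaction_report aggregation above.
    task_reports = [{ 'num_input_rows' => 10 }, { 'num_input_rows' => 7 }]
    output_rows  = [10, 5]   # stand-ins for the load-job statistics

    report = { 'num_input_rows' => 0, 'num_output_rows' => 0, 'num_rejected_rows' => 0 }
    task_reports.each_with_index do |tr, idx|
      report['num_input_rows']    += tr['num_input_rows']
      report['num_output_rows']   += output_rows[idx]
      report['num_rejected_rows'] += tr['num_input_rows'] - output_rows[idx]
    end
    p report  # => {"num_input_rows"=>17, "num_output_rows"=>15, "num_rejected_rows"=>2}
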