embulk-output-bigquery 0.2.3 → 0.3.0.pre1

This diff shows the publicly released content of the two package versions as published to their public registry, and is provided for informational purposes only.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -12
  3. data/CHANGELOG.md +18 -0
  4. data/Gemfile +8 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.md +165 -39
  7. data/Rakefile +11 -0
  8. data/embulk-output-bigquery.gemspec +20 -0
  9. data/example/config_client_options.yml +33 -0
  10. data/example/config_csv.yml +30 -0
  11. data/example/config_delete_in_advance.yml +29 -0
  12. data/example/config_expose_errors.yml +30 -0
  13. data/example/config_guess_from_embulk_schema.yml +29 -0
  14. data/example/config_guess_with_column_options.yml +40 -0
  15. data/example/config_gzip.yml +30 -0
  16. data/example/config_jsonl.yml +30 -0
  17. data/example/config_mode_append.yml +30 -0
  18. data/example/config_mode_append_direct.yml +30 -0
  19. data/example/config_payload_column.yml +20 -0
  20. data/example/config_payload_column_index.yml +20 -0
  21. data/example/config_prevent_duplicate_insert.yml +30 -0
  22. data/example/config_replace.yml +30 -0
  23. data/example/config_replace_backup.yml +32 -0
  24. data/example/config_skip_file_generation.yml +32 -0
  25. data/example/config_table_strftime.yml +30 -0
  26. data/example/config_template_table.yml +21 -0
  27. data/example/config_uncompressed.yml +30 -0
  28. data/example/config_with_rehearsal.yml +32 -0
  29. data/example/example.csv +17 -0
  30. data/example/example.jsonl +16 -0
  31. data/example/example.yml +30 -0
  32. data/example/json_key.json +12 -0
  33. data/example/nested_example.jsonl +16 -0
  34. data/example/schema.json +30 -0
  35. data/example/schema_expose_errors.json +30 -0
  36. data/lib/embulk/output/bigquery.rb +388 -3
  37. data/lib/embulk/output/bigquery/bigquery_client.rb +396 -0
  38. data/lib/embulk/output/bigquery/file_writer.rb +103 -0
  39. data/lib/embulk/output/bigquery/helper.rb +78 -0
  40. data/lib/embulk/output/bigquery/value_converter_factory.rb +292 -0
  41. data/test/helper.rb +13 -0
  42. data/test/test_bigquery_client.rb +166 -0
  43. data/test/test_configure.rb +254 -0
  44. data/test/test_example.rb +34 -0
  45. data/test/test_file_writer.rb +129 -0
  46. data/test/test_helper.rb +103 -0
  47. data/test/test_transaction.rb +129 -0
  48. data/test/test_value_converter_factory.rb +316 -0
  49. metadata +114 -45
  50. data/build.gradle +0 -80
  51. data/config/checkstyle/checkstyle.xml +0 -128
  52. data/config/checkstyle/default.xml +0 -108
  53. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  54. data/gradle/wrapper/gradle-wrapper.properties +0 -6
  55. data/gradlew +0 -164
  56. data/gradlew.bat +0 -90
  57. data/settings.gradle +0 -2
  58. data/src/main/java/org/embulk/output/BigqueryAuthentication.java +0 -117
  59. data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +0 -508
  60. data/src/main/java/org/embulk/output/BigqueryWriter.java +0 -575
  61. data/src/test/java/org/embulk/output/TestBigqueryAuthentication.java +0 -5
  62. data/src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java +0 -5
  63. data/src/test/java/org/embulk/output/TestBigqueryWriter.java +0 -5
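Entries 50-63 remove the Gradle build files and the Java sources under data/src, while entries 36-48 add Ruby sources under data/lib and data/test: 0.3.0.pre1 replaces the plugin's Java implementation with a pure-Ruby rewrite, as the registration change at the bottom of this diff also shows.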
@@ -0,0 +1,21 @@
+ # embulk gem install embulk-parser-none
+ in:
+   type: file
+   path_prefix: example/example.jsonl
+   parser:
+     type: none
+     column_name: payload
+ out:
+   type: bigquery
+   mode: replace
+   auth_method: json_key
+   json_keyfile: /tmp/your-project-000.json
+   dataset: your_dataset_name
+   table: your_table_name_%Y%m%d
+   compression: GZIP
+   source_format: NEWLINE_DELIMITED_JSON
+   auto_create_dataset: true
+   auto_create_table: true
+   template_table: your_table_name
+   payload_column: payload
+   skip_load: true # for debug
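This 21-line hunk corresponds to example/config_template_table.yml in the list above (the only new 21-line example config). Assuming a working Embulk install and a real service-account key in place of the /tmp/your-project-000.json placeholder, it can be exercised with "embulk gem install embulk-parser-none" followed by "embulk run example/config_template_table.yml". The strftime placeholders in table (%Y%m%d) are expanded with the current time by the configure method later in this diff.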
@@ -0,0 +1,30 @@
+ in:
+   type: file
+   path_prefix: example/example.csv
+   parser:
+     type: csv
+     charset: UTF-8
+     newline: CRLF
+     null_string: 'NULL'
+     skip_header_lines: 1
+     comment_line_marker: '#'
+     columns:
+       - {name: date, type: string}
+       - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+       - {name: "null", type: string}
+       - {name: long, type: long}
+       - {name: string, type: string}
+       - {name: double, type: double}
+       - {name: boolean, type: boolean}
+ out:
+   type: bigquery
+   mode: replace
+   auth_method: json_key
+   json_keyfile: /tmp/your-project-000.json
+   dataset: your_dataset_name
+   table: your_table_name
+   source_format: NEWLINE_DELIMITED_JSON
+   compression: NONE
+   auto_create_dataset: true
+   auto_create_table: true
+   schema_file: example/schema.json
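In this config (one of the new 30-line CSV examples, such as example/config_csv.yml), note that source_format describes the intermediate file the plugin writes locally before loading, not the input format: the CSV input is re-encoded as newline-delimited JSON. The file_ext selection logic in lib/embulk/output/bigquery.rb below follows the same convention.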
@@ -0,0 +1,32 @@
+ in:
+   type: file
+   path_prefix: example/example.csv
+   parser:
+     type: csv
+     charset: UTF-8
+     newline: CRLF
+     null_string: 'NULL'
+     skip_header_lines: 1
+     comment_line_marker: '#'
+     columns:
+       - {name: date, type: string}
+       - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+       - {name: "null", type: string}
+       - {name: long, type: long}
+       - {name: string, type: string}
+       - {name: double, type: double}
+       - {name: boolean, type: boolean}
+ out:
+   type: bigquery
+   mode: replace
+   auth_method: json_key
+   json_keyfile: /tmp/your-project-000.json
+   dataset: your_dataset_name
+   table: your_table_name
+   source_format: NEWLINE_DELIMITED_JSON
+   auto_create_dataset: true
+   auto_create_table: true
+   schema_file: example/schema.json
+   with_rehearsal: true
+   rehearsal_counts: 1
+   skip_load: true # for debug
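The with_rehearsal and rehearsal_counts settings in this hunk (matching example/config_with_rehearsal.yml) make the first task load roughly the first rehearsal_counts rows into a temporary <table>_LOAD_REHEARSAL_<unique_name> table and then drop it, so schema mismatches surface before the full load; see the add method near the end of this diff.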
@@ -0,0 +1,17 @@
+ date,timestamp,null,long,string,double,boolean
+ 2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,true
+ 2015-07-13,2015-07-13 00:00:00.100000,,90,l6lTsvxd,903.4,false
+ 2015-07-13,2015-07-13 00:00:00.100000,,91,XoALSEQg,394.5,false
+ 2015-07-13,2015-07-13 00:00:00.100000,,92,0hgDRI_m,810.9,false
+ 2015-07-13,2015-07-13 00:00:00.100000,,93,KjCRAc-A,477.4,false
+ 2015-07-13,2015-07-13 00:00:00.100000,,94,fyQVGlT8,725.3,false
+ 2015-07-13,2015-07-13 00:00:00.100000,,95,FpBYRPWK,316.6,false
+ 2015-07-13,2015-07-13 00:00:00.100000,,96,9ikvnUqp,369.5,false
+ 2015-07-13,2015-07-13 00:00:00.100000,,97,RRNYDAzK,506.5,false
@@ -0,0 +1,16 @@
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"boolean":false}
@@ -0,0 +1,30 @@
+ in:
+   type: file
+   path_prefix: example/example.csv
+   parser:
+     type: csv
+     charset: UTF-8
+     newline: CRLF
+     null_string: 'NULL'
+     skip_header_lines: 1
+     comment_line_marker: '#'
+     columns:
+       - {name: date, type: string}
+       - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S.%N", timezone: "+09:00"}
+       - {name: "null", type: string}
+       - {name: long, type: long}
+       - {name: string, type: string}
+       - {name: double, type: double}
+       - {name: boolean, type: boolean}
+ out:
+   type: bigquery
+   mode: replace
+   auth_method: json_key
+   json_keyfile: /tmp/your-project-000.json
+   dataset: your_dataset_name
+   table: your_table_name
+   source_format: NEWLINE_DELIMITED_JSON
+   compression: NONE
+   auto_create_dataset: true
+   auto_create_table: true
+   schema_file: example/schema.json
@@ -0,0 +1,12 @@
+ {
+   "type": "service_account",
+   "project_id": "your_project_name",
+   "private_key_id": "your_private_key_id",
+   "private_key": "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n",
+   "client_email": "your_service_account_email",
+   "client_id": "your_client_id",
+   "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+   "token_uri": "https://accounts.google.com/o/oauth2/token",
+   "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+   "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/account-3%40your_project_name.iam.gserviceaccount.com"
+ }
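This json_key.json is a placeholder service-account key (the private_key body is empty). The plugin's json_keyfile option accepts either a path to such a file or the key content inline. The following is a minimal self-contained sketch mirroring the two branches of LocalFile.load defined later in this diff; load_keyfile is a hypothetical stand-in name, not part of the package:

require 'json'

# String argument: treat it as a path. Hash argument: take the key inline
# from its 'content' entry. Same branching as LocalFile.load below.
def load_keyfile(v)
  if v.is_a?(String)
    File.read(v)
  elsif v.is_a?(Hash)
    v['content']
  end
end

key_json = load_keyfile('content' => File.read('example/json_key.json'))
puts JSON.parse(key_json)['project_id'] # configure() defaults `project` to this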
@@ -0,0 +1,16 @@
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":true}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":90,"string":"l6lTsvxd","double":903.4,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":91,"string":"XoALSEQg","double":394.5,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":92,"string":"0hgDRI_m","double":810.9,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":93,"string":"KjCRAc-A","double":477.4,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":94,"string":"fyQVGlT8","double":725.3,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":95,"string":"FpBYRPWK","double":316.6,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":96,"string":"9ikvnUqp","double":369.5,"json":{"k1":"v1","k2":"v2"},"boolean":false}
+ {"date":"2015-07-13","timestamp":"2015-07-13 00:00:00.100000","null":null,"long":97,"string":"RRNYDAzK","double":506.5,"json":{"k1":"v1","k2":"v2"},"boolean":false}
@@ -0,0 +1,30 @@
+ [
+   {
+     "name":"date",
+     "type":"STRING"
+   },
+   {
+     "name":"timestamp",
+     "type":"TIMESTAMP"
+   },
+   {
+     "name":"null",
+     "type":"STRING"
+   },
+   {
+     "name":"long",
+     "type":"INTEGER"
+   },
+   {
+     "name":"string",
+     "type":"STRING"
+   },
+   {
+     "name":"double",
+     "type":"FLOAT"
+   },
+   {
+     "name":"boolean",
+     "type":"BOOLEAN"
+   }
+ ]
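This schema.json pairs with the CSV and JSONL examples above and shows the usual Embulk-to-BigQuery type mapping: string columns become STRING, long becomes INTEGER, double becomes FLOAT, boolean becomes BOOLEAN, and the parsed timestamp column becomes TIMESTAMP.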
@@ -0,0 +1,30 @@
+ [
+   {
+     "name":"dat",
+     "type":"STRING"
+   },
+   {
+     "name":"timestamp",
+     "type":"TIMESTAMP"
+   },
+   {
+     "name":"null",
+     "type":"STRING"
+   },
+   {
+     "name":"long",
+     "type":"INTEGER"
+   },
+   {
+     "name":"string",
+     "type":"STRING"
+   },
+   {
+     "name":"double",
+     "type":"FLOAT"
+   },
+   {
+     "name":"boolean",
+     "type":"BOOLEAN"
+   }
+ ]
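This second schema file differs from schema.json only in its first field name, "dat" instead of "date". Given the names of the added files schema_expose_errors.json and example/config_expose_errors.yml, the mismatch appears deliberate: it provokes load errors for the expose-errors example.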
@@ -1,3 +1,388 @@
- Embulk::JavaPlugin.register_output(
-   "bigquery", "org.embulk.output.BigqueryOutputPlugin",
-   File.expand_path('../../../../classpath', __FILE__))
+ require 'json'
+ require 'tempfile'
+ require_relative 'bigquery/bigquery_client'
+ require_relative 'bigquery/file_writer'
+ require_relative 'bigquery/value_converter_factory'
+
+ module Embulk
+   module Output
+     class Bigquery < OutputPlugin
+       Plugin.register_output('bigquery', self)
+
+       class Error < StandardError; end
+
+       # To support configuration like below as org.embulk.spi.unit.LocalFile
+       #
+       # json_keyfile:
+       #   content: |
+       class LocalFile
+         # @return JSON string
+         def self.load(v)
+           if v.is_a?(String) # path
+             File.read(v)
+           elsif v.is_a?(Hash)
+             v['content']
+           end
+         end
+       end
+
+       def self.configure(config, schema, processor_count)
+         task = {
+           'mode' => config.param('mode', :string, :default => 'append'),
+           'auth_method' => config.param('auth_method', :string, :default => 'private_key'),
+           'service_account_email' => config.param('service_account_email', :string, :default => nil),
+           'p12_keyfile' => config.param('p12_keyfile', :string, :default => nil),
+           'json_keyfile' => config.param('json_keyfile', LocalFile, :default => nil),
+           'project' => config.param('project', :string, :default => nil),
+           'dataset' => config.param('dataset', :string),
+           'table' => config.param('table', :string),
+           'dataset_old' => config.param('dataset_old', :string, :default => nil),
+           'table_old' => config.param('table_old', :string, :default => nil),
+           'table_name_old' => config.param('table_name_old', :string, :default => nil), # lower version compatibility
+           'auto_create_dataset' => config.param('auto_create_dataset', :bool, :default => false),
+           'auto_create_table' => config.param('auto_create_table', :bool, :default => false),
+           'schema_file' => config.param('schema_file', :string, :default => nil),
+           'template_table' => config.param('template_table', :string, :default => nil),
+           'delete_from_local_when_job_end' => config.param('delete_from_local_when_job_end', :bool, :default => true),
+           'job_status_max_polling_time' => config.param('job_status_max_polling_time', :integer, :default => 3600),
+           'job_status_polling_interval' => config.param('job_status_polling_interval', :integer, :default => 10),
+           'is_skip_job_result_check' => config.param('is_skip_job_result_check', :bool, :default => false),
+           'prevent_duplicate_insert' => config.param('prevent_duplicate_insert', :bool, :default => false),
+           'with_rehearsal' => config.param('with_rehearsal', :bool, :default => false),
+           'rehearsal_counts' => config.param('rehearsal_counts', :integer, :default => 1000),
+
+           'column_options' => config.param('column_options', :array, :default => []),
+           'default_timezone' => config.param('default_timezone', :string, :default => ValueConverterFactory::DEFAULT_TIMEZONE),
+           'default_timestamp_format' => config.param('default_timestamp_format', :string, :default => ValueConverterFactory::DEFAULT_TIMESTAMP_FORMAT),
+           'payload_column' => config.param('payload_column', :string, :default => nil),
+           'payload_column_index' => config.param('payload_column_index', :integer, :default => nil),
+
+           'timeout_sec' => config.param('timeout_sec', :integer, :default => 300),
+           'open_timeout_sec' => config.param('open_timeout_sec', :integer, :default => 300),
+           'retries' => config.param('retries', :integer, :default => 5),
+           'application_name' => config.param('application_name', :string, :default => 'Embulk BigQuery plugin'),
+
+           'path_prefix' => config.param('path_prefix', :string, :default => nil),
+           'sequence_format' => config.param('sequence_format', :string, :default => '.%d.%03d'),
+           'file_ext' => config.param('file_ext', :string, :default => nil),
+           'skip_file_generation' => config.param('skip_file_generation', :bool, :default => false),
+           'compression' => config.param('compression', :string, :default => 'NONE'),
+
+           'source_format' => config.param('source_format', :string, :default => 'CSV'),
+           'max_bad_records' => config.param('max_bad_records', :integer, :default => 0),
+           'field_delimiter' => config.param('field_delimiter', :string, :default => ','),
+           'encoding' => config.param('encoding', :string, :default => 'UTF-8'),
+           'ignore_unknown_values' => config.param('ignore_unknown_values', :bool, :default => false),
+           'allow_quoted_newlines' => config.param('allow_quoted_newlines', :bool, :default => false),
+
+           # for debug
+           'skip_load' => config.param('skip_load', :bool, :default => false),
+           'temp_table' => config.param('temp_table', :string, :default => nil),
+           'rehearsal_table' => config.param('rehearsal_table', :string, :default => nil),
+         }
+
+         now = Time.now
+
+         task['mode'] = task['mode'].downcase
+         unless %w[append append_direct replace delete_in_advance replace_backup].include?(task['mode'])
+           raise ConfigError.new "`mode` must be one of append, append_direct, replace, delete_in_advance, replace_backup"
+         end
+
+         if task['mode'] == 'replace_backup'
+           task['table_old'] ||= task['table_name_old'] # for lower version compatibility
+           if task['dataset_old'].nil? and task['table_old'].nil?
+             raise ConfigError.new "`mode replace_backup` requires either of `dataset_old` or `table_old`"
+           end
+           task['dataset_old'] ||= task['dataset']
+           task['table_old'] ||= task['table']
+         end
+
+         if task['table_old']
+           task['table_old'] = now.strftime(task['table_old'])
+         end
+         if task['table']
+           task['table'] = now.strftime(task['table'])
+         end
+
+         task['auth_method'] = task['auth_method'].downcase
+         unless %w[private_key json_key compute_engine].include?(task['auth_method'])
+           raise ConfigError.new "`auth_method` must be one of private_key, json_key, compute_engine"
+         end
+         if task['auth_method'] == 'private_key' and task['p12_keyfile'].nil?
+           raise ConfigError.new "`p12_keyfile` is required for auth_method private_key"
+         end
+         if task['auth_method'] == 'json_key' and task['json_keyfile'].nil?
+           raise ConfigError.new "`json_keyfile` is required for auth_method json_key"
+         end
+
+         jsonkey_params = nil
+         if task['json_keyfile']
+           begin
+             jsonkey_params = JSON.parse(task['json_keyfile'])
+           rescue => e
+             raise ConfigError.new "json_keyfile is not a JSON file"
+           end
+         end
+
+         if jsonkey_params
+           task['project'] ||= jsonkey_params['project_id']
+         end
+         if task['project'].nil?
+           raise ConfigError.new "Required field \"project\" is not set"
+         end
+
+         if (task['payload_column'] or task['payload_column_index']) and task['auto_create_table']
+           if task['schema_file'].nil? and task['template_table'].nil?
+             raise ConfigError.new "Cannot guess table schema from Embulk schema with `payload_column` or `payload_column_index`. Either of `schema_file` or `template_table` is required for auto_create_table true"
+           end
+         end
+
+         if task['payload_column_index']
+           if task['payload_column_index'] < 0 || schema.size <= task['payload_column_index']
+             raise ConfigError.new "payload_column_index #{task['payload_column_index']} is out of schema size"
+           end
+         elsif task['payload_column']
+           task['payload_column_index'] = schema.find_index {|c| c[:name] == task['payload_column'] }
+           if task['payload_column_index'].nil?
+             raise ConfigError.new "payload_column #{task['payload_column']} does not exist in schema"
+           end
+         end
+
+         if task['schema_file']
+           unless File.exist?(task['schema_file'])
+             raise ConfigError.new "schema_file #{task['schema_file']} is not found"
+           end
+           begin
+             JSON.parse(File.read(task['schema_file']))
+           rescue => e
+             raise ConfigError.new "schema_file #{task['schema_file']} is not a JSON file"
+           end
+         end
+
+         if task['path_prefix'].nil?
+           task['path_prefix'] = Tempfile.create('embulk_output_bigquery_') {|fp| fp.path }
+         end
+
+         task['source_format'] = task['source_format'].upcase
+         if task['source_format'] == 'JSONL'
+           task['source_format'] = 'NEWLINE_DELIMITED_JSON'
+         end
+         unless %w[CSV NEWLINE_DELIMITED_JSON].include?(task['source_format'])
+           raise ConfigError.new "`source_format` must be CSV or NEWLINE_DELIMITED_JSON (JSONL)"
+         end
+
+         task['compression'] = task['compression'].upcase
+         unless %w[GZIP NONE].include?(task['compression'])
+           raise ConfigError.new "`compression` must be GZIP or NONE"
+         end
+
+         if task['file_ext'].nil?
+           case task['source_format']
+           when 'CSV'
+             file_ext = '.csv'
+           else # NEWLINE_DELIMITED_JSON
+             file_ext = '.jsonl'
+           end
+           case task['compression']
+           when 'GZIP'
+             file_ext << '.gz'
+           end
+           task['file_ext'] = file_ext
+         end
+
+         unique_name = "%08x%08x%08x" % [Process.pid, now.tv_sec, now.tv_nsec]
+
+         if %w[replace replace_backup append].include?(task['mode'])
+           task['temp_table'] ||= "#{task['table']}_LOAD_TEMP_#{unique_name}"
+         end
+
+         if task['with_rehearsal']
+           task['rehearsal_table'] ||= "#{task['table']}_LOAD_REHEARSAL_#{unique_name}"
+         end
+
+         task
+       end
+
+       def self.bigquery
+         @bigquery
+       end
+
+       def self.converters
+         @converters
+       end
+
+       def self.transaction_report(task_reports, responses)
+         transaction_report = {
+           'num_input_rows' => 0,
+           'num_output_rows' => 0,
+           'num_rejected_rows' => 0,
+         }
+         (0...task_reports.size).each do |idx|
+           task_report = task_reports[idx]
+           response = responses[idx]
+           num_input_rows = task_report['num_input_rows']
+           num_output_rows = response ? response.statistics.load.output_rows.to_i : 0
+           num_rejected_rows = num_input_rows - num_output_rows
+           transaction_report['num_input_rows'] += num_input_rows
+           transaction_report['num_output_rows'] += num_output_rows
+           transaction_report['num_rejected_rows'] += num_rejected_rows
+         end
+         transaction_report
+       end
+
+       def self.transaction(config, schema, processor_count, &control)
+         task = self.configure(config, schema, processor_count)
+
+         @task = task
+         @schema = schema
+         @bigquery = BigqueryClient.new(task, schema)
+         @converters = ValueConverterFactory.create_converters(task, schema)
+
+         if task['auto_create_dataset']
+           bigquery.create_dataset(task['dataset'])
+         else
+           bigquery.get_dataset(task['dataset']) # raises NotFoundError
+         end
+
+         if task['mode'] == 'replace_backup' and task['dataset_old'] != task['dataset']
+           if task['auto_create_dataset']
+             bigquery.create_dataset(task['dataset_old'], reference: task['dataset'])
+           else
+             bigquery.get_dataset(task['dataset_old']) # raises NotFoundError
+           end
+         end
+
+         case task['mode']
+         when 'delete_in_advance'
+           bigquery.delete_table(task['table'])
+           bigquery.create_table(task['table'])
+         when 'replace', 'replace_backup', 'append'
+           bigquery.create_table(task['temp_table'])
+         else # append_direct
+           if task['auto_create_table']
+             bigquery.create_table(task['table'])
+           else
+             bigquery.get_table(task['table']) # raises NotFoundError
+           end
+         end
+
+         begin
+           paths = []
+           if task['skip_file_generation']
+             yield(task) # does nothing, but seems it has to be called
+             path_pattern = "#{task['path_prefix']}*#{task['file_ext']}"
+             Embulk.logger.info { "embulk-output-bigquery: Skip file generation. Get paths from `#{path_pattern}`" }
+             paths = Dir.glob(path_pattern)
+             task_reports = paths.map {|path| { 'path' => path, 'num_input_rows' => 0 } }
+           else
+             task_reports = yield(task) # generates local files
+             Embulk.logger.info { "embulk-output-bigquery: task_reports: #{task_reports.to_json}" }
+             paths = task_reports.map {|report| report['path'] }
+           end
+
+           if task['skip_load'] # only for debug
+             Embulk.logger.info { "embulk-output-bigquery: Skip load" }
+           else
+             target_table = task['temp_table'] ? task['temp_table'] : task['table']
+             responses = bigquery.load_in_parallel(paths, target_table)
+             transaction_report = self.transaction_report(task_reports, responses)
+             Embulk.logger.info { "embulk-output-bigquery: transaction_report: #{transaction_report.to_json}" }
+
+             if task['mode'] == 'replace_backup'
+               bigquery.copy(task['table'], task['table_old'], task['dataset_old'])
+             end
+
+             if task['temp_table']
+               if task['mode'] == 'append'
+                 bigquery.copy(task['temp_table'], task['table'],
+                               write_disposition: 'WRITE_APPEND')
+               else # replace or replace_backup
+                 bigquery.copy(task['temp_table'], task['table'],
+                               write_disposition: 'WRITE_TRUNCATE')
+               end
+             end
+           end
+         ensure
+           begin
+             if task['temp_table'] # replace or replace_backup
+               bigquery.delete_table(task['temp_table'])
+             end
+           ensure
+             if task['delete_from_local_when_job_end']
+               paths.each do |path|
+                 Embulk.logger.info { "delete #{path}" }
+                 File.unlink(path) rescue nil
+               end
+             else
+               paths.each do |path|
+                 if File.exist?(path)
+                   Embulk.logger.info { "#{path} is left" }
+                 end
+               end
+             end
+           end
+         end
+
+         # this is for the -c next_config option; add parameters for the next execution if wanted
+         next_config_diff = {}
+         return next_config_diff
+       end
+
+       # instance is created on each thread
+       def initialize(task, schema, index)
+         super
+
+         if task['with_rehearsal'] and @index == 0
+           @bigquery = self.class.bigquery
+           @rehearsaled = false
+           @num_rows = 0
+         end
+
+         unless task['skip_file_generation']
+           @file_writer = FileWriter.new(task, schema, index, self.class.converters)
+         end
+       end
+
+       # called once in each thread when the output is closed (not per page)
+       def close
+       end
+
+       # called for each page in each thread
+       def add(page)
+         if task['with_rehearsal'] and @index == 0 and !@rehearsaled
+           page = page.to_a # to avoid https://github.com/embulk/embulk/issues/403
+           if @num_rows > task['rehearsal_counts']
+             Embulk.logger.info { "embulk-output-bigquery: Rehearsal started" }
+             begin
+               @bigquery.create_table(task['rehearsal_table'])
+               @bigquery.load(@file_writer.path, task['rehearsal_table'])
+             ensure
+               @bigquery.delete_table(task['rehearsal_table'])
+             end
+             @rehearsaled = true
+           end
+           @num_rows += page.to_a.size
+         end
+
+         unless task['skip_file_generation']
+           @file_writer.add(page)
+         end
+       end
+
+       def finish
+       end
+
+       def abort
+       end
+
+       # called after processing all pages in each thread, returns a task_report
+       def commit
+         unless task['skip_file_generation']
+           @file_writer.commit
+         else
+           {}
+         end
+       end
+     end
+   end
+ end
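In the temp-table modes (replace, replace_backup, append), the transaction method above loads into <table>_LOAD_TEMP_<unique_name> and then copies into the target table with write_disposition WRITE_TRUNCATE or WRITE_APPEND, deleting the temp table in the ensure block. The bookkeeping in transaction_report is plain per-task subtraction; here is a self-contained sketch of the same arithmetic (the literal output_rows values stand in for response.statistics.load.output_rows):

task_reports = [{'num_input_rows' => 16}, {'num_input_rows' => 16}]
output_rows  = [16, 14] # stand-ins for BigQuery's reported loaded-row counts
report = {'num_input_rows' => 0, 'num_output_rows' => 0, 'num_rejected_rows' => 0}
task_reports.each_with_index do |task_report, idx|
  input  = task_report['num_input_rows']
  output = output_rows[idx]
  report['num_input_rows']    += input
  report['num_output_rows']   += output
  report['num_rejected_rows'] += input - output # rejected = input rows not loaded
end
p report #=> {"num_input_rows"=>32, "num_output_rows"=>30, "num_rejected_rows"=>2}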