fluent-plugin-bigquery 0.2.16 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +76 -3
- data/Rakefile +1 -0
- data/fluent-plugin-bigquery.gemspec +3 -5
- data/lib/fluent/plugin/bigquery/schema.rb +221 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +289 -0
- data/lib/fluent/plugin/out_bigquery.rb +159 -373
- data/test/helper.rb +1 -0
- data/test/plugin/test_out_bigquery.rb +470 -142
- data/test/plugin/test_record_schema.rb +173 -0
- metadata +17 -21
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77b839947c8f721f341499a0be5c9f21552833ad
+  data.tar.gz: 3efa172577cb54e19290f0df245ecbb677994869
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0dbcd7b1cd8f4462657006f9a51338e0327f0b61a172ae0d39860a4c88bae4af0ba8f299cc9657a8ac3eb44acf53f5bc1161d023a06b5ab6d133d4ab72aeba2
+  data.tar.gz: aabd97536deeeb1a6b3f55e0c71f28ddbcfbf99ced84c7d299e881409dab907a34a943c8fa69df0a3373d6c6e1eb5a83fcde157ea6c601b90d84ad7c93b816b4
```
data/README.md
CHANGED

````diff
@@ -2,6 +2,8 @@
 
 [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery.
 
+- **Plugin type**: TimeSlicedOutput
+
 * insert data over streaming inserts
 * for continuous real-time insertions
 * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
@@ -14,6 +16,59 @@ OAuth flow for installed applications.
 
 ## Configuration
 
+### Options
+
+| name                                    | type          | required?    | default                    | description              |
+| :-------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
+| method | string | no | insert | `insert` (Streaming Insert) or `load` (load job) |
+| buffer_type | string | no | lightening (insert) or file (load) | |
+| buffer_chunk_limit | integer | no | 1MB (insert) or 1GB (load) | |
+| buffer_queue_limit | integer | no | 1024 (insert) or 32 (load) | |
+| buffer_chunk_records_limit | integer | no | 500 | |
+| flush_interval | float | no | 0.25 (*insert) or default of time sliced output (load) | |
+| try_flush_interval | float | no | 0.05 (*insert) or default of time sliced output (load) | |
+| auth_method | enum | yes | private_key | `private_key` or `json_key` or `compute_engine` or `application_default` |
+| email | string | yes (private_key) | nil | GCP Service Account Email |
+| private_key_path | string | yes (private_key) | nil | GCP Private Key file path |
+| private_key_passphrase | string | yes (private_key) | nil | GCP Private Key Passphrase |
+| json_key | string | yes (json_key) | nil | GCP JSON Key file path or JSON Key string |
+| project | string | yes | nil | |
+| table | string | yes (either `tables`) | nil | |
+| tables | string | yes (either `table`) | nil | can set multiple table names separated by `,` |
+| template_suffix | string | no | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+| auto_create_table | bool | no | false | If true, creates table automatically |
+| skip_invalid_rows | bool | no | false | Only `insert` method. |
+| max_bad_records | integer | no | 0 | Only `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
+| ignore_unknown_values | bool | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+| schema_path | string | yes (either `fetch_schema`) | nil | Schema definition file path, formatted as JSON. |
+| fetch_schema | bool | yes (either `schema_path`) | false | If true, fetch table schema definition from the BigQuery table automatically. |
+| fetch_schema_table | string | no | nil | If set, fetch table schema definition from this table. If fetch_schema is false, this param is ignored. |
+| schema_cache_expire | integer | no | 600 | Value is in seconds. If current time is after expiration interval, re-fetch table schema definition. |
+| field_string | string | no | nil | see examples. |
+| field_integer | string | no | nil | see examples. |
+| field_float | string | no | nil | see examples. |
+| field_boolean | string | no | nil | see examples. |
+| field_timestamp | string | no | nil | see examples. |
+| time_field | string | no | nil | If set, the plugin writes the formatted time string to this field. |
+| time_format | string | no | nil | ex. `%s`, `%Y/%m%d %H:%M:%S` |
+| replace_record_key | bool | no | false | see examples. |
+| replace_record_key_regexp{1-10} | string | no | nil | see examples. |
+| convert_hash_to_json | bool | no | false | If true, converts Hash values of the record to JSON strings. |
+| insert_id_field | string | no | nil | Use key as `insert_id` of Streaming Insert API parameter. |
+| request_timeout_sec | integer | no | nil | BigQuery API response timeout |
+| request_open_timeout_sec | integer | no | 60 | BigQuery API connection and request timeout. If you send large data to BigQuery, set a larger value. |
+
+### Standard Options
+
+| name                                    | type          | required?    | default                    | description              |
+| :-------------------------------------- | :------------ | :----------- | :------------------------- | :----------------------- |
+| localtime | bool | no | nil | Use localtime |
+| utc | bool | no | nil | Use utc |
+
+See also http://docs.fluentd.org/articles/output-plugin-overview#time-sliced-output-parameters
+
+## Examples
+
 ### Streaming inserts
 
 Configure insert specifications with target table schema, with your credentials. This is minimum configurations:
@@ -139,7 +194,7 @@ __CAUTION: `flush_interval` default is still `0.25` even if `method` is `load` o
 
 ### Authentication
 
-There are
+There are four supported methods to fetch an access token for the service account.
 
 1. Public-Private key pair of GCP(Google Cloud Platform)'s service account
 2. JSON key of GCP(Google Cloud Platform)'s service account
@@ -301,6 +356,24 @@ For example value of `subdomain` attribute is `"bq.fluent"`, table id will be li
 - any type of attribute is allowed because stringified value will be used as replacement.
 - acceptable characters are alphabets, digits and `_`. All other characters will be removed.
 
+### Date partitioned table support
+This plugin can insert (load) into a date partitioned table.
+
+Use `%{time_slice}`.
+
+```apache
+<match dummy>
+  @type bigquery
+
+  ...
+  time_slice_format %Y%m%d
+  table accesslog$%{time_slice}
+  ...
+</match>
+```
+
+Note that dynamic table creating does not support date partitioned tables yet.
+
 ### Dynamic table creating
 
 When `auto_create_table` is set to `true`, try to create the table using BigQuery API when insertion failed with code=404 "Not Found: Table ...".
````
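As an aside on the date partitioned table support added above, here is a rough Ruby sketch (not code from the plugin) of how the `%{time_slice}` placeholder resolves together with BigQuery's `$YYYYMMDD` partition decorator, assuming `time_slice_format %Y%m%d`:

```ruby
# Illustrative only: how a chunk keyed to 2016-10-12 would resolve the table name.
time_slice = Time.utc(2016, 10, 12).strftime("%Y%m%d")                  # "20161012"
table_id   = "accesslog$%{time_slice}".sub("%{time_slice}", time_slice)
table_id  # => "accesslog$20161012"
# The "$20161012" suffix targets that day's partition of the accesslog table.
```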
````diff
@@ -398,6 +471,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
   time_field time
 
   fetch_schema true
+  # fetch_schema_table other_table # if you want to fetch schema from other table
   field_integer time
 </match>
 ```
@@ -425,8 +499,6 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 
 ## TODO
 
-* support Load API
-  * with automatically configured flush/buffer options
 * support optional data fields
 * support NULLABLE/REQUIRED/REPEATED field options in field list style of configuration
 * OAuth installed application credentials support
@@ -438,3 +510,4 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 
 * @tagomoris: First author, original version
 * KAIZEN platform Inc.: Maintainer, Since 2014.08.19
+* @joker1007
````
data/fluent-plugin-bigquery.gemspec
CHANGED

```diff
@@ -24,13 +24,11 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "test-unit-rr", "~> 1.0.3"
 
   spec.add_runtime_dependency "google-api-client", "~> 0.9.3"
-  spec.add_runtime_dependency "googleauth", ">=
+  spec.add_runtime_dependency "googleauth", ">= 0.5.0"
   spec.add_runtime_dependency "multi_json"
-  spec.add_runtime_dependency "activesupport", ">= 3.2"
-  spec.add_runtime_dependency "fluentd"
+  spec.add_runtime_dependency "activesupport", ">= 3.2", "< 5"
+  spec.add_runtime_dependency "fluentd", "~> 0.12.0"
   spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
   spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
   spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"
-
-  spec.add_development_dependency "fluent-plugin-dummydata-producer"
 end
```
data/lib/fluent/plugin/bigquery/schema.rb
ADDED

```ruby
module Fluent
  module BigQuery
    class FieldSchema
      def initialize(name, mode = :nullable)
        unless [:nullable, :required, :repeated].include?(mode)
          raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
        end
        ### https://developers.google.com/bigquery/docs/tables
        # Each field has the following properties:
        #
        # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
        #        and must start with a letter or underscore. The maximum length is 128 characters.
        #        https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
        unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
          raise ConfigError, "invalid bigquery field name: '#{name}'"
        end

        @name = name
        @mode = mode
      end

      attr_reader :name, :mode

      def format(value)
        case @mode
        when :nullable
          format_one(value) unless value.nil?
        when :required
          if value.nil?
            log.warn "Required field #{name} cannot be null"
            nil
          else
            format_one(value)
          end
        when :repeated
          value.nil? ? [] : value.each_with_object([]) { |v, arr| arr << format_one(v) if v }
        end
      end

      def format_one(value)
        raise NotImplementedError, "Must implement in a subclass"
      end

      def to_h
        {
          :name => name,
          :type => type.to_s.upcase,
          :mode => mode.to_s.upcase,
        }
      end
    end

    class StringFieldSchema < FieldSchema
      def type
        :string
      end

      def format_one(value)
        value.to_s
      end
    end

    class IntegerFieldSchema < FieldSchema
      def type
        :integer
      end

      def format_one(value)
        value.to_i
      end
    end

    class FloatFieldSchema < FieldSchema
      def type
        :float
      end

      def format_one(value)
        value.to_f
      end
    end

    class BooleanFieldSchema < FieldSchema
      def type
        :boolean
      end

      def format_one(value)
        !!value
      end
    end

    class TimestampFieldSchema < FieldSchema
      INTEGER_REGEXP = /\A-?[[:digit:]]+\z/.freeze
      FLOAT_REGEXP = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/.freeze

      def type
        :timestamp
      end

      def format_one(value)
        case value
        when Time
          value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
        when String
          if value =~ INTEGER_REGEXP
            value.to_i
          elsif value =~ FLOAT_REGEXP
            value.to_f
          else
            value
          end
        else
          value
        end
      end
    end

    class RecordSchema < FieldSchema
      FIELD_TYPES = {
        string: StringFieldSchema,
        integer: IntegerFieldSchema,
        float: FloatFieldSchema,
        boolean: BooleanFieldSchema,
        timestamp: TimestampFieldSchema,
        record: RecordSchema
      }.freeze

      def initialize(name, mode = :nullable)
        super(name, mode)
        @fields = {}
      end

      def type
        :record
      end

      def [](name)
        @fields[name]
      end

      def empty?
        @fields.empty?
      end

      def to_a
        @fields.map do |_, field_schema|
          field_schema.to_h
        end
      end

      def to_h
        {
          :name => name,
          :type => type.to_s.upcase,
          :mode => mode.to_s.upcase,
          :fields => self.to_a,
        }
      end

      def load_schema(schema, allow_overwrite=true)
        schema.each do |field|
          raise ConfigError, 'field must have type' unless field.key?('type')

          name = field['name']
          mode = (field['mode'] || 'nullable').downcase.to_sym

          type = field['type'].downcase.to_sym
          field_schema_class = FIELD_TYPES[type]
          raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class

          next if @fields.key?(name) and !allow_overwrite

          field_schema = field_schema_class.new(name, mode)
          @fields[name] = field_schema
          if type == :record
            raise ConfigError, "record field must have fields" unless field.key?('fields')
            field_schema.load_schema(field['fields'], allow_overwrite)
          end
        end
      end

      def register_field(name, type)
        if @fields.key?(name) and @fields[name].type != :timestamp
          raise ConfigError, "field #{name} is registered twice"
        end
        if name[/\./]
          recordname = $`
          fieldname = $'
          register_record_field(recordname)
          @fields[recordname].register_field(fieldname, type)
        else
          schema = FIELD_TYPES[type]
          raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
          @fields[name] = schema.new(name)
        end
      end

      def format_one(record)
        out = {}
        record.each do |key, value|
          next if value.nil?
          schema = @fields[key]
          out[key] = schema ? schema.format(value) : value
        end
        out
      end

      private
      def register_record_field(name)
        if !@fields.key?(name)
          @fields[name] = RecordSchema.new(name)
        else
          unless @fields[name].kind_of?(RecordSchema)
            raise ConfigError, "field #{name} is required to be a record but already registered as #{@field[name]}"
          end
        end
      end
    end
  end
end
```
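To make the new schema classes concrete, here is a usage sketch (not part of the gem) showing how a `schema_path`-style definition is loaded into a `RecordSchema` and how a record is coerced to the declared types. It assumes the gem is on the load path, and the field names are placeholders:

```ruby
require "fluent/plugin/bigquery/schema"  # file added in this release

# Parsed schema definition, in the same shape a schema_path JSON file yields
schema = [
  { "name" => "status",       "type" => "INTEGER" },
  { "name" => "vhost",        "type" => "STRING" },
  { "name" => "requested_at", "type" => "TIMESTAMP" },
  { "name" => "remote",       "type" => "RECORD", "fields" => [
    { "name" => "host", "type" => "STRING" },
    { "name" => "user", "type" => "STRING" }
  ] }
]

table_schema = Fluent::BigQuery::RecordSchema.new("record")
table_schema.load_schema(schema)

table_schema.to_a
# => fields array in the form passed to tables.insert or to a load job schema

table_schema.format_one(
  "status" => "200", "vhost" => :bar,
  "remote" => { "host" => "192.168.1.1", "user" => nil }
)
# => {"status"=>200, "vhost"=>"bar", "remote"=>{"host"=>"192.168.1.1"}}
```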
data/lib/fluent/plugin/bigquery/writer.rb
ADDED

```ruby
module Fluent
  module BigQuery
    class Writer
      RETRYABLE_ERROR_REASON = %w(backendError internalError rateLimitExceeded tableUnavailable).freeze

      class Error < StandardError
        attr_reader :origin

        def initialize(message, origin = nil)
          @origin = origin
          super(message || origin.message)
        end

        def method_missing(name, *args)
          if @origin
            @origin.send(name, *args)
          else
            super
          end
        end

        def reason
          @origin && @origin.respond_to?(:reason) ? @origin.reason : nil
        end

        def status_code
          @origin && @origin.respond_to?(:status_code) ? @origin.status_code : nil
        end

        def retryable?
          false
        end
      end

      class UnRetryableError < Error; end

      class RetryableError < Error
        def retryable?
          true
        end
      end

      def initialize(log, auth_method, auth_options = {})
        @auth_method = auth_method
        @scope = "https://www.googleapis.com/auth/bigquery"
        @auth_options = auth_options
        @log = log

        @cached_client_expiration = Time.now + 1800
      end

      def client
        return @client if @client && @cached_client_expiration > Time.now

        client = Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
          cl.authorization = get_auth
        end

        @cached_client_expiration = Time.now + 1800
        @client = client
      end

      def create_table(project, dataset, table_id, record_schema)
        create_table_retry_limit = 3
        create_table_retry_wait = 1
        create_table_retry_count = 0

        begin
          client.insert_table(project, dataset, {
            table_reference: {
              table_id: table_id,
            },
            schema: {
              fields: record_schema.to_a,
            }
          }, {})
          log.debug "create table", project_id: project, dataset: dataset, table: table_id
          @client = nil
        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
          @client = nil

          message = e.message
          if e.status_code == 409 && /Already Exists:/ =~ message
            log.debug "already created table", project_id: project, dataset: dataset, table: table_id
            # ignore 'Already Exists' error
            return
          end

          reason = e.respond_to?(:reason) ? e.reason : nil
          log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message, reason: reason

          if RETRYABLE_ERROR_REASON.include?(reason) && create_table_retry_count < create_table_retry_limit
            sleep create_table_retry_wait
            create_table_retry_wait *= 2
            create_table_retry_count += 1
            retry
          else
            raise UnRetryableError.new("failed to create table in bigquery", e)
          end
        end
      end

      def fetch_schema(project, dataset, table_id)
        res = client.get_table(project, dataset, table_id)
        schema = res.schema.fields.as_json
        log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}"

        schema
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        @client = nil
        message = e.message
        log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
        nil
      end

      def insert_rows(project, dataset, table_id, rows, skip_invalid_rows: false, ignore_unknown_values: false, template_suffix: nil, timeout_sec: nil, open_timeout_sec: 60)
        body = {
          rows: rows,
          skip_invalid_rows: skip_invalid_rows,
          ignore_unknown_values: ignore_unknown_values,
        }
        body.merge!(template_suffix: template_suffix) if template_suffix
        res = client.insert_all_table_data(project, dataset, table_id, body, {
          options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
        })
        log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
        log.warn "insert errors", insert_errors: res.insert_errors if res.insert_errors && !res.insert_errors.empty?
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        @client = nil

        reason = e.respond_to?(:reason) ? e.reason : nil
        log.error "tabledata.insertAll API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

        if RETRYABLE_ERROR_REASON.include?(reason)
          raise RetryableError.new(nil, e)
        else
          raise UnRetryableError.new(nil, e)
        end
      end

      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60)
        configuration = {
          configuration: {
            load: {
              destination_table: {
                project_id: project,
                dataset_id: dataset,
                table_id: table_id,
              },
              schema: {
                fields: fields.to_a,
              },
              write_disposition: "WRITE_APPEND",
              source_format: "NEWLINE_DELIMITED_JSON",
              ignore_unknown_values: ignore_unknown_values,
              max_bad_records: max_bad_records,
            }
          }
        }
        configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id

        # If target table is already exist, omit schema configuration.
        # Because schema changing is easier.
        begin
          if client.get_table(project, dataset, table_id)
            configuration[:configuration][:load].delete(:schema)
          end
        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError
          raise UnRetryableError.new("Schema is empty") if fields.empty?
        end

        res = client.insert_job(
          project,
          configuration,
          {
            upload_source: upload_source,
            content_type: "application/octet-stream",
            options: {
              timeout_sec: timeout_sec,
              open_timeout_sec: open_timeout_sec,
            }
          }
        )
        wait_load_job(project, dataset, res.job_reference.job_id, table_id)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        @client = nil

        reason = e.respond_to?(:reason) ? e.reason : nil
        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

        return wait_load_job(project, dataset, job_id, table_id, retryable: false) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job

        if RETRYABLE_ERROR_REASON.include?(reason) || e.is_a?(Google::Apis::ServerError)
          raise RetryableError.new(nil, e)
        else
          raise UnRetryableError.new(nil, e)
        end
      end

      def wait_load_job(project, dataset, job_id, table_id, retryable: true)
        wait_interval = 10
        _response = client.get_job(project, job_id)

        until _response.status.state == "DONE"
          log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
          sleep wait_interval
          _response = client.get_job(project, _response.job_reference.job_id)
        end

        errors = _response.status.errors
        if errors
          errors.each do |e|
            log.error "job.insert API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
          end
        end

        error_result = _response.status.error_result
        if error_result
          log.error "job.insert API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
          if retryable && RETRYABLE_ERROR_REASON.include?(error_result.reason)
            raise RetryableError.new("failed to load into bigquery, retry")
          else
            raise UnRetryableError.new("failed to load into bigquery, and cannot retry")
          end
        end

        log.debug "finish load job", state: _response.status.state
      end

      private

      def log
        @log
      end

      def get_auth
        case @auth_method
        when :private_key
          get_auth_from_private_key
        when :compute_engine
          get_auth_from_compute_engine
        when :json_key
          get_auth_from_json_key
        when :application_default
          get_auth_from_application_default
        else
          raise ConfigError, "Unknown auth method: #{@auth_method}"
        end
      end

      def get_auth_from_private_key
        require 'google/api_client/auth/key_utils'
        private_key_path = @auth_options[:private_key_path]
        private_key_passphrase = @auth_options[:private_key_passphrase]
        email = @auth_options[:email]

        key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
        Signet::OAuth2::Client.new(
          token_credential_uri: "https://accounts.google.com/o/oauth2/token",
          audience: "https://accounts.google.com/o/oauth2/token",
          scope: @scope,
          issuer: email,
          signing_key: key
        )
      end

      def get_auth_from_compute_engine
        Google::Auth::GCECredentials.new
      end

      def get_auth_from_json_key
        json_key = @auth_options[:json_key]

        begin
          File.open(json_key) do |f|
            Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope)
          end
        rescue Errno::ENOENT
          key = StringIO.new(json_key)
          Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope)
        end
      end

      def get_auth_from_application_default
        Google::Auth.get_application_default([@scope])
      end
    end
  end
end
```
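Closing with a rough usage sketch (not from the gem) of how the output plugin is expected to drive `Writer` for streaming inserts. The logger stub stands in for Fluentd's structured logger, the key path and table names are placeholders, and actually running it needs `google-api-client` plus valid credentials:

```ruby
require "google/apis/bigquery_v2"          # from google-api-client ~> 0.9.3
require "fluent/plugin/bigquery/writer"    # file added in this release

# Stand-in for Fluentd's logger: Writer logs a message plus structured fields.
class StdoutLog
  [:debug, :info, :warn, :error].each do |level|
    define_method(level) { |msg, **fields| puts "#{level}: #{msg} #{fields}" }
  end
end

writer = Fluent::BigQuery::Writer.new(StdoutLog.new, :json_key,
                                      json_key: "/path/to/service_account.json")

# Rows follow the tabledata.insertAll body shape: one {json: record} entry per row.
rows = [
  { json: { status: 200, vhost: "bar", requested_at: Time.now.to_i } }
]

# Raises Writer::RetryableError or Writer::UnRetryableError on API failures,
# which the caller can map onto Fluentd's retry handling.
writer.insert_rows("my-project", "my_dataset", "access_log", rows,
                   skip_invalid_rows: true, ignore_unknown_values: true)
```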