fluent-plugin-bigquery 0.2.16 → 0.3.0
- checksums.yaml +4 -4
- data/README.md +76 -3
- data/Rakefile +1 -0
- data/fluent-plugin-bigquery.gemspec +3 -5
- data/lib/fluent/plugin/bigquery/schema.rb +221 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +289 -0
- data/lib/fluent/plugin/out_bigquery.rb +159 -373
- data/test/helper.rb +1 -0
- data/test/plugin/test_out_bigquery.rb +470 -142
- data/test/plugin/test_record_schema.rb +173 -0
- metadata +17 -21
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77b839947c8f721f341499a0be5c9f21552833ad
+  data.tar.gz: 3efa172577cb54e19290f0df245ecbb677994869
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f0dbcd7b1cd8f4462657006f9a51338e0327f0b61a172ae0d39860a4c88bae4af0ba8f299cc9657a8ac3eb44acf53f5bc1161d023a06b5ab6d133d4ab72aeba2
+  data.tar.gz: aabd97536deeeb1a6b3f55e0c71f28ddbcfbf99ced84c7d299e881409dab907a34a943c8fa69df0a3373d6c6e1eb5a83fcde157ea6c601b90d84ad7c93b816b4
data/README.md
CHANGED
@@ -2,6 +2,8 @@
 
 [Fluentd](http://fluentd.org) output plugin to load/insert data into Google BigQuery.
 
+- **Plugin type**: TimeSlicedOutput
+
 * insert data over streaming inserts
 * for continuous real-time insertions
 * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
@@ -14,6 +16,59 @@ OAuth flow for installed applications.
 
 ## Configuration
 
+### Options
+
+| name | type | required? | default | description |
+| :--- | :--- | :-------- | :------ | :---------- |
+| method | string | no | insert | `insert` (Streaming Insert) or `load` (load job) |
+| buffer_type | string | no | lightening (insert) or file (load) | |
+| buffer_chunk_limit | integer | no | 1MB (insert) or 1GB (load) | |
+| buffer_queue_limit | integer | no | 1024 (insert) or 32 (load) | |
+| buffer_chunk_records_limit | integer | no | 500 | |
+| flush_interval | float | no | 0.25 (insert) or default of time sliced output (load) | |
+| try_flush_interval | float | no | 0.05 (insert) or default of time sliced output (load) | |
+| auth_method | enum | yes | private_key | `private_key`, `json_key`, `compute_engine` or `application_default` |
+| email | string | yes (private_key) | nil | GCP Service Account Email |
+| private_key_path | string | yes (private_key) | nil | GCP Private Key file path |
+| private_key_passphrase | string | yes (private_key) | nil | GCP Private Key Passphrase |
+| json_key | string | yes (json_key) | nil | GCP JSON Key file path or JSON Key string |
+| project | string | yes | nil | |
+| table | string | yes (unless `tables` is set) | nil | |
+| tables | string | yes (unless `table` is set) | nil | can set multiple table names, split by `,` |
+| template_suffix | string | no | nil | can use `%{time_slice}` placeholder replaced by `time_slice_format` |
+| auto_create_table | bool | no | false | If true, creates the table automatically |
+| skip_invalid_rows | bool | no | false | Only for the `insert` method. |
+| max_bad_records | integer | no | 0 | Only for the `load` method. If the number of bad records exceeds this value, an invalid error is returned in the job result. |
+| ignore_unknown_values | bool | no | false | Accept rows that contain values that do not match the schema. The unknown values are ignored. |
+| schema_path | string | yes (unless `fetch_schema` is set) | nil | Schema definition file path, formatted as JSON. |
+| fetch_schema | bool | yes (unless `schema_path` is set) | false | If true, fetch the table schema definition from the BigQuery table automatically. |
+| fetch_schema_table | string | no | nil | If set, fetch the table schema definition from this table; if fetch_schema is false, this param is ignored. |
+| schema_cache_expire | integer | no | 600 | Value is in seconds. Past the expiration interval, the table schema definition is re-fetched. |
+| field_string | string | no | nil | see examples. |
+| field_integer | string | no | nil | see examples. |
+| field_float | string | no | nil | see examples. |
+| field_boolean | string | no | nil | see examples. |
+| field_timestamp | string | no | nil | see examples. |
+| time_field | string | no | nil | If this param is set, the plugin sets a formatted time string to this field. |
+| time_format | string | no | nil | ex. `%s`, `%Y/%m%d %H:%M:%S` |
+| replace_record_key | bool | no | false | see examples. |
+| replace_record_key_regexp{1-10} | string | no | nil | see examples. |
+| convert_hash_to_json | bool | no | false | If true, converts Hash values of the record to JSON strings. |
+| insert_id_field | string | no | nil | Use this key as the `insert_id` parameter of the Streaming Insert API. |
+| request_timeout_sec | integer | no | nil | BigQuery API response timeout |
+| request_open_timeout_sec | integer | no | 60 | BigQuery API connection and request timeout. If you send big data to BigQuery, set a larger value. |
+
+### Standard Options
+
+| name | type | required? | default | description |
+| :--- | :--- | :-------- | :------ | :---------- |
+| localtime | bool | no | nil | Use localtime |
+| utc | bool | no | nil | Use UTC |
+
+And see http://docs.fluentd.org/articles/output-plugin-overview#time-sliced-output-parameters
+
+## Examples
+
 ### Streaming inserts
 
 Configure insert specifications with target table schema, with your credentials. These are the minimum configurations:
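For orientation, a minimal streaming-insert configuration assembled from the options in the table above might look like this (all values are placeholders; `dataset` appears in the README's full examples and is assumed here):

```apache
<match dummy>
  @type bigquery

  method insert                 # default: streaming insert
  auth_method json_key          # see the Authentication section below
  json_key /path/to/key.json

  project yourproject_id
  dataset yourdataset_id        # assumed: shown in the README's full examples
  table   tablename

  time_format %s
  time_field  time
  field_integer time,status,bytes
  field_string  vhost,path
</match>
```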
@@ -139,7 +194,7 @@ __CAUTION: `flush_interval` default is still `0.25` even if `method` is `load` o
 
 ### Authentication
 
-There are
+There are four methods supported to fetch an access token for the service account.
 
 1. Public-Private key pair of GCP(Google Cloud Platform)'s service account
 2. JSON key of GCP(Google Cloud Platform)'s service account
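Of the four methods, `json_key` is the most flexible: per the Options table it takes either a key file path or the key JSON itself, and `Writer#get_auth_from_json_key` (in the new writer.rb below) opens it as a file first and falls back to parsing the string on `Errno::ENOENT`. Both forms, with placeholder values:

```apache
# as a file path
auth_method json_key
json_key /home/fluent/service-account.json

# or as an inline JSON string (abbreviated here)
auth_method json_key
json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
```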
@@ -301,6 +356,24 @@ For example value of `subdomain` attribute is `"bq.fluent"`, table id will be li
 - any type of attribute is allowed because stringified value will be used as replacement.
 - acceptable characters are alphabets, digits and `_`. All other characters will be removed.
 
+### Date partitioned table support
+
+This plugin can insert (load) data into date-partitioned tables.
+
+Use `%{time_slice}`:
+
+```apache
+<match dummy>
+  @type bigquery
+
+  ...
+  time_slice_format %Y%m%d
+  table accesslog$%{time_slice}
+  ...
+</match>
+```
+
+Note that dynamic table creation does not support date-partitioned tables yet.
+
 ### Dynamic table creating
 
 When `auto_create_table` is set to `true`, try to create the table using BigQuery API when insertion failed with code=404 "Not Found: Table ...".
@@ -398,6 +471,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
   time_field time
 
   fetch_schema true
+  # fetch_schema_table other_table # if you want to fetch schema from other table
   field_integer time
 </match>
 ```
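`fetch_schema` and `schema_path` are alternatives: the former asks BigQuery for the table definition, the latter reads the same field list from a local JSON file. A hypothetical `schema.json` in the array-of-fields shape that `RecordSchema#load_schema` (schema.rb below) consumes:

```json
[
  {"name": "time", "type": "TIMESTAMP", "mode": "REQUIRED"},
  {"name": "vhost", "type": "STRING"},
  {"name": "request", "type": "RECORD", "fields": [
    {"name": "method", "type": "STRING"},
    {"name": "path", "type": "STRING"}
  ]}
]
```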
@@ -425,8 +499,6 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 
 ## TODO
 
-* support Load API
-* with automatically configured flush/buffer options
 * support optional data fields
 * support NULLABLE/REQUIRED/REPEATED field options in field list style of configuration
 * OAuth installed application credentials support
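The context line of this hunk mentions `insert_id_field`; combined with the Options table, deduplicated streaming inserts look roughly like this (the `uuid` field name is illustrative):

```apache
insert_id_field uuid      # value of record["uuid"] becomes the insertId
field_string    uuid,message
```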
@@ -438,3 +510,4 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
 
 * @tagomoris: First author, original version
 * KAIZEN platform Inc.: Maintainer, since 2014.08.19
+* @joker1007
data/fluent-plugin-bigquery.gemspec
CHANGED
@@ -24,13 +24,11 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "test-unit-rr", "~> 1.0.3"
 
   spec.add_runtime_dependency "google-api-client", "~> 0.9.3"
-  spec.add_runtime_dependency "googleauth", ">=
+  spec.add_runtime_dependency "googleauth", ">= 0.5.0"
   spec.add_runtime_dependency "multi_json"
-  spec.add_runtime_dependency "activesupport", ">= 3.2"
-  spec.add_runtime_dependency "fluentd"
+  spec.add_runtime_dependency "activesupport", ">= 3.2", "< 5"
+  spec.add_runtime_dependency "fluentd", "~> 0.12.0"
   spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
   spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
   spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"
-
-  spec.add_development_dependency "fluent-plugin-dummydata-producer"
 end
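The tightened constraints (`fluentd "~> 0.12.0"`, `activesupport "< 5"`) mean this release resolves only against the Fluentd v0.12 line. If you track the plugin from a Gemfile, a sketch of a compatible pin:

```ruby
# Gemfile (illustrative)
source "https://rubygems.org"

gem "fluentd", "~> 0.12.0"
gem "fluent-plugin-bigquery", "~> 0.3.0"
```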
data/lib/fluent/plugin/bigquery/schema.rb
ADDED
@@ -0,0 +1,221 @@
+module Fluent
+  module BigQuery
+    class FieldSchema
+      def initialize(name, mode = :nullable)
+        unless [:nullable, :required, :repeated].include?(mode)
+          raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
+        end
+        ### https://developers.google.com/bigquery/docs/tables
+        # Each field has the following properties:
+        #
+        # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
+        #        and must start with a letter or underscore. The maximum length is 128 characters.
+        # https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
+        unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
+          raise ConfigError, "invalid bigquery field name: '#{name}'"
+        end
+
+        @name = name
+        @mode = mode
+      end
+
+      attr_reader :name, :mode
+
+      def format(value)
+        case @mode
+        when :nullable
+          format_one(value) unless value.nil?
+        when :required
+          if value.nil?
+            log.warn "Required field #{name} cannot be null"
+            nil
+          else
+            format_one(value)
+          end
+        when :repeated
+          value.nil? ? [] : value.each_with_object([]) { |v, arr| arr << format_one(v) if v }
+        end
+      end
+
+      def format_one(value)
+        raise NotImplementedError, "Must implement in a subclass"
+      end
+
+      def to_h
+        {
+          :name => name,
+          :type => type.to_s.upcase,
+          :mode => mode.to_s.upcase,
+        }
+      end
+    end
+
+    class StringFieldSchema < FieldSchema
+      def type
+        :string
+      end
+
+      def format_one(value)
+        value.to_s
+      end
+    end
+
+    class IntegerFieldSchema < FieldSchema
+      def type
+        :integer
+      end
+
+      def format_one(value)
+        value.to_i
+      end
+    end
+
+    class FloatFieldSchema < FieldSchema
+      def type
+        :float
+      end
+
+      def format_one(value)
+        value.to_f
+      end
+    end
+
+    class BooleanFieldSchema < FieldSchema
+      def type
+        :boolean
+      end
+
+      def format_one(value)
+        !!value
+      end
+    end
+
+    class TimestampFieldSchema < FieldSchema
+      INTEGER_REGEXP = /\A-?[[:digit:]]+\z/.freeze
+      FLOAT_REGEXP = /\A-?[[:digit:]]+(\.[[:digit:]]+)\z/.freeze
+
+      def type
+        :timestamp
+      end
+
+      def format_one(value)
+        case value
+        when Time
+          value.strftime("%Y-%m-%d %H:%M:%S.%6L %:z")
+        when String
+          if value =~ INTEGER_REGEXP
+            value.to_i
+          elsif value =~ FLOAT_REGEXP
+            value.to_f
+          else
+            value
+          end
+        else
+          value
+        end
+      end
+    end
+
+    class RecordSchema < FieldSchema
+      FIELD_TYPES = {
+        string: StringFieldSchema,
+        integer: IntegerFieldSchema,
+        float: FloatFieldSchema,
+        boolean: BooleanFieldSchema,
+        timestamp: TimestampFieldSchema,
+        record: RecordSchema
+      }.freeze
+
+      def initialize(name, mode = :nullable)
+        super(name, mode)
+        @fields = {}
+      end
+
+      def type
+        :record
+      end
+
+      def [](name)
+        @fields[name]
+      end
+
+      def empty?
+        @fields.empty?
+      end
+
+      def to_a
+        @fields.map do |_, field_schema|
+          field_schema.to_h
+        end
+      end
+
+      def to_h
+        {
+          :name => name,
+          :type => type.to_s.upcase,
+          :mode => mode.to_s.upcase,
+          :fields => self.to_a,
+        }
+      end
+
+      def load_schema(schema, allow_overwrite=true)
+        schema.each do |field|
+          raise ConfigError, 'field must have type' unless field.key?('type')
+
+          name = field['name']
+          mode = (field['mode'] || 'nullable').downcase.to_sym
+
+          type = field['type'].downcase.to_sym
+          field_schema_class = FIELD_TYPES[type]
+          raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
+
+          next if @fields.key?(name) and !allow_overwrite
+
+          field_schema = field_schema_class.new(name, mode)
+          @fields[name] = field_schema
+          if type == :record
+            raise ConfigError, "record field must have fields" unless field.key?('fields')
+            field_schema.load_schema(field['fields'], allow_overwrite)
+          end
+        end
+      end
+
+      def register_field(name, type)
+        if @fields.key?(name) and @fields[name].type != :timestamp
+          raise ConfigError, "field #{name} is registered twice"
+        end
+        if name[/\./]
+          recordname = $`
+          fieldname = $'
+          register_record_field(recordname)
+          @fields[recordname].register_field(fieldname, type)
+        else
+          schema = FIELD_TYPES[type]
+          raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
+          @fields[name] = schema.new(name)
+        end
+      end
+
+      def format_one(record)
+        out = {}
+        record.each do |key, value|
+          next if value.nil?
+          schema = @fields[key]
+          out[key] = schema ? schema.format(value) : value
+        end
+        out
+      end
+
+      private
+      def register_record_field(name)
+        if !@fields.key?(name)
+          @fields[name] = RecordSchema.new(name)
+        else
+          unless @fields[name].kind_of?(RecordSchema)
+            raise ConfigError, "field #{name} is required to be a record but already registered as #{@fields[name]}"
+          end
+        end
+      end
+    end
+  end
+end
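A quick sketch of how these classes compose outside the plugin (the `require` path assumes the gem is on the load path; values are illustrative): `load_schema` ingests BigQuery-style field hashes, `register_field` expands the dotted names produced by the `field_*` options into nested RECORDs, and `format_one` coerces a record.

```ruby
require "fluent/plugin/bigquery/schema" # assumes the gem is on the load path

schema = Fluent::BigQuery::RecordSchema.new("record")
schema.load_schema([
  { "name" => "time",   "type" => "TIMESTAMP", "mode" => "REQUIRED" },
  { "name" => "status", "type" => "INTEGER" },
])
schema.register_field("request.path", :string) # dotted name => nested RECORD

row = schema.format_one(
  "time"    => Time.at(1460000000),
  "status"  => "200",                      # coerced to Integer 200
  "request" => { "path" => "/index.html" }
)
# => {"time"=>"2016-04-07 ...", "status"=>200, "request"=>{"path"=>"/index.html"}}
```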
data/lib/fluent/plugin/bigquery/writer.rb
ADDED
@@ -0,0 +1,289 @@
+module Fluent
+  module BigQuery
+    class Writer
+      RETRYABLE_ERROR_REASON = %w(backendError internalError rateLimitExceeded tableUnavailable).freeze
+
+      class Error < StandardError
+        attr_reader :origin
+
+        def initialize(message, origin = nil)
+          @origin = origin
+          super(message || origin.message)
+        end
+
+        def method_missing(name, *args)
+          if @origin
+            @origin.send(name, *args)
+          else
+            super
+          end
+        end
+
+        def reason
+          @origin && @origin.respond_to?(:reason) ? @origin.reason : nil
+        end
+
+        def status_code
+          @origin && @origin.respond_to?(:status_code) ? @origin.status_code : nil
+        end
+
+        def retryable?
+          false
+        end
+      end
+
+      class UnRetryableError < Error; end
+
+      class RetryableError < Error
+        def retryable?
+          true
+        end
+      end
+
+      def initialize(log, auth_method, auth_options = {})
+        @auth_method = auth_method
+        @scope = "https://www.googleapis.com/auth/bigquery"
+        @auth_options = auth_options
+        @log = log
+
+        @cached_client_expiration = Time.now + 1800
+      end
+
+      def client
+        return @client if @client && @cached_client_expiration > Time.now
+
+        client = Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
+          cl.authorization = get_auth
+        end
+
+        @cached_client_expiration = Time.now + 1800
+        @client = client
+      end
+
+      def create_table(project, dataset, table_id, record_schema)
+        create_table_retry_limit = 3
+        create_table_retry_wait = 1
+        create_table_retry_count = 0
+
+        begin
+          client.insert_table(project, dataset, {
+            table_reference: {
+              table_id: table_id,
+            },
+            schema: {
+              fields: record_schema.to_a,
+            }
+          }, {})
+          log.debug "create table", project_id: project, dataset: dataset, table: table_id
+          @client = nil
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+          @client = nil
+
+          message = e.message
+          if e.status_code == 409 && /Already Exists:/ =~ message
+            log.debug "already created table", project_id: project, dataset: dataset, table: table_id
+            # ignore 'Already Exists' error
+            return
+          end
+
+          reason = e.respond_to?(:reason) ? e.reason : nil
+          log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message, reason: reason
+
+          if RETRYABLE_ERROR_REASON.include?(reason) && create_table_retry_count < create_table_retry_limit
+            sleep create_table_retry_wait
+            create_table_retry_wait *= 2
+            create_table_retry_count += 1
+            retry
+          else
+            raise UnRetryableError.new("failed to create table in bigquery", e)
+          end
+        end
+      end
+
+      def fetch_schema(project, dataset, table_id)
+        res = client.get_table(project, dataset, table_id)
+        schema = res.schema.fields.as_json
+        log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}"
+
+        schema
+      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+        @client = nil
+        message = e.message
+        log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
+        nil
+      end
+
+      def insert_rows(project, dataset, table_id, rows, skip_invalid_rows: false, ignore_unknown_values: false, template_suffix: nil, timeout_sec: nil, open_timeout_sec: 60)
+        body = {
+          rows: rows,
+          skip_invalid_rows: skip_invalid_rows,
+          ignore_unknown_values: ignore_unknown_values,
+        }
+        body.merge!(template_suffix: template_suffix) if template_suffix
+        res = client.insert_all_table_data(project, dataset, table_id, body, {
+          options: {timeout_sec: timeout_sec, open_timeout_sec: open_timeout_sec}
+        })
+        log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
+        log.warn "insert errors", insert_errors: res.insert_errors if res.insert_errors && !res.insert_errors.empty?
+      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+        @client = nil
+
+        reason = e.respond_to?(:reason) ? e.reason : nil
+        log.error "tabledata.insertAll API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
+
+        if RETRYABLE_ERROR_REASON.include?(reason)
+          raise RetryableError.new(nil, e)
+        else
+          raise UnRetryableError.new(nil, e)
+        end
+      end
+
+      def create_load_job(project, dataset, table_id, upload_source, job_id, fields, ignore_unknown_values: false, max_bad_records: 0, timeout_sec: nil, open_timeout_sec: 60)
+        configuration = {
+          configuration: {
+            load: {
+              destination_table: {
+                project_id: project,
+                dataset_id: dataset,
+                table_id: table_id,
+              },
+              schema: {
+                fields: fields.to_a,
+              },
+              write_disposition: "WRITE_APPEND",
+              source_format: "NEWLINE_DELIMITED_JSON",
+              ignore_unknown_values: ignore_unknown_values,
+              max_bad_records: max_bad_records,
+            }
+          }
+        }
+        configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
+
+        # If the target table already exists, omit the schema configuration;
+        # this makes schema changes easier.
+        begin
+          if client.get_table(project, dataset, table_id)
+            configuration[:configuration][:load].delete(:schema)
+          end
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError
+          raise UnRetryableError.new("Schema is empty") if fields.empty?
+        end
+
+        res = client.insert_job(
+          project,
+          configuration,
+          {
+            upload_source: upload_source,
+            content_type: "application/octet-stream",
+            options: {
+              timeout_sec: timeout_sec,
+              open_timeout_sec: open_timeout_sec,
+            }
+          }
+        )
+        wait_load_job(project, dataset, res.job_reference.job_id, table_id)
+      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+        @client = nil
+
+        reason = e.respond_to?(:reason) ? e.reason : nil
+        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason
+
+        return wait_load_job(project, dataset, job_id, table_id, retryable: false) if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+
+        if RETRYABLE_ERROR_REASON.include?(reason) || e.is_a?(Google::Apis::ServerError)
+          raise RetryableError.new(nil, e)
+        else
+          raise UnRetryableError.new(nil, e)
+        end
+      end
+
+      def wait_load_job(project, dataset, job_id, table_id, retryable: true)
+        wait_interval = 10
+        _response = client.get_job(project, job_id)
+
+        until _response.status.state == "DONE"
+          log.debug "wait for load job finish", state: _response.status.state, job_id: _response.job_reference.job_id
+          sleep wait_interval
+          _response = client.get_job(project, _response.job_reference.job_id)
+        end
+
+        errors = _response.status.errors
+        if errors
+          errors.each do |e|
+            log.error "job.insert API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
+          end
+        end
+
+        error_result = _response.status.error_result
+        if error_result
+          log.error "job.insert API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
+          if retryable && RETRYABLE_ERROR_REASON.include?(error_result.reason)
+            raise RetryableError.new("failed to load into bigquery, retry")
+          else
+            raise UnRetryableError.new("failed to load into bigquery, and cannot retry")
+          end
+        end
+
+        log.debug "finish load job", state: _response.status.state
+      end
+
+      private
+
+      def log
+        @log
+      end
+
+      def get_auth
+        case @auth_method
+        when :private_key
+          get_auth_from_private_key
+        when :compute_engine
+          get_auth_from_compute_engine
+        when :json_key
+          get_auth_from_json_key
+        when :application_default
+          get_auth_from_application_default
+        else
+          raise ConfigError, "Unknown auth method: #{@auth_method}"
+        end
+      end
+
+      def get_auth_from_private_key
+        require 'google/api_client/auth/key_utils'
+        private_key_path = @auth_options[:private_key_path]
+        private_key_passphrase = @auth_options[:private_key_passphrase]
+        email = @auth_options[:email]
+
+        key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
+        Signet::OAuth2::Client.new(
+          token_credential_uri: "https://accounts.google.com/o/oauth2/token",
+          audience: "https://accounts.google.com/o/oauth2/token",
+          scope: @scope,
+          issuer: email,
+          signing_key: key
+        )
+      end
+
+      def get_auth_from_compute_engine
+        Google::Auth::GCECredentials.new
+      end
+
+      def get_auth_from_json_key
+        json_key = @auth_options[:json_key]
+
+        begin
+          File.open(json_key) do |f|
+            Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope)
+          end
+        rescue Errno::ENOENT
+          key = StringIO.new(json_key)
+          Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope)
+        end
+      end
+
+      def get_auth_from_application_default
+        Google::Auth.get_application_default([@scope])
+      end
+    end
+  end
+end
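And a sketch of driving the new Writer directly, roughly as out_bigquery.rb does (names are placeholders; `rows` uses the `json:` row shape that `tabledata.insertAll` expects; the logger stand-in mimics Fluentd's, whose methods accept a message plus structured fields):

```ruby
require "google/apis/bigquery_v2"
require "fluent/plugin/bigquery/writer" # assumes the gem is on the load path

# stand-in for Fluentd's $log: debug/warn/error take a message plus fields
log = Object.new
%i(trace debug info warn error fatal).each do |level|
  log.define_singleton_method(level) { |*args| puts "[#{level}] #{args.inspect}" }
end

writer = Fluent::BigQuery::Writer.new(log, :json_key,
                                      json_key: "/path/to/service-account.json")

rows = [{ json: { "time" => Time.now.to_i, "status" => 200 } }]
begin
  # performs a real tabledata.insertAll call against the given table
  writer.insert_rows("yourproject_id", "yourdataset_id", "tablename", rows,
                     skip_invalid_rows: true)
rescue Fluent::BigQuery::Writer::RetryableError => e
  # the plugin re-raises these so Fluentd's buffer retries the chunk
  log.warn("insert failed, will retry", reason: e.reason)
end
```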