fluent-plugin-bigquery 0.2.16 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +76 -3
- data/Rakefile +1 -0
- data/fluent-plugin-bigquery.gemspec +3 -5
- data/lib/fluent/plugin/bigquery/schema.rb +221 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +289 -0
- data/lib/fluent/plugin/out_bigquery.rb +159 -373
- data/test/helper.rb +1 -0
- data/test/plugin/test_out_bigquery.rb +470 -142
- data/test/plugin/test_record_schema.rb +173 -0
- metadata +17 -21
data/lib/fluent/plugin/out_bigquery.rb
@@ -5,6 +5,9 @@ require 'fluent/plugin/bigquery/version'
 require 'fluent/mixin/config_placeholders'
 require 'fluent/mixin/plaintextformatter'
 
+require 'fluent/plugin/bigquery/schema'
+require 'fluent/plugin/bigquery/writer'
+
 ## TODO: load implementation
 # require 'fluent/plugin/bigquery/load_request_body_wrapper'
 
@@ -19,36 +22,39 @@ module Fluent
     # https://developers.google.com/bigquery/browser-tool-quickstart
     # https://developers.google.com/bigquery/bigquery-api-quickstart
 
-
-
-
-    config_set_default :try_flush_interval, 0.05
+    ### default for insert
+    def configure_for_insert(conf)
+      raise ConfigError unless conf["method"] != "load"
 
-
-
-
+      conf["buffer_type"] = "lightening" unless conf["buffer_type"]
+      conf["flush_interval"] = 0.25 unless conf["flush_interval"]
+      conf["try_flush_interval"] = 0.05 unless conf["try_flush_interval"]
+      conf["buffer_chunk_limit"] = 1 * 1024 ** 2 unless conf["buffer_chunk_limit"] # 1MB
+      conf["buffer_queue_limit"] = 1024 unless conf["buffer_queue_limit"]
+      conf["buffer_chunk_records_limit"] = 500 unless conf["buffer_chunk_records_limit"]
+    end
 
-    ### for loads
-
-
-    # config_set_default :buffer_chunk_limit, 1000**4 # 1.0*10^12 < 1TB (1024^4)
+    ### default for loads
+    def configure_for_load(conf)
+      raise ConfigError unless conf["method"] == "load"
 
-
-
-
+      # buffer_type, flush_interval, try_flush_interval is TimeSlicedOutput default
+      conf["buffer_chunk_limit"] = 1 * 1024 ** 3 unless conf["buffer_chunk_limit"] # 1GB
+      conf["buffer_queue_limit"] = 32 unless conf["buffer_queue_limit"]
+    end
 
     # Available methods are:
     # * private_key -- Use service account credential from pkcs12 private key file
     # * compute_engine -- Use access token available in instances of ComputeEngine
-    # *
+    # * json_key -- Use service account credential from JSON key
     # * application_default -- Use application default credential
-    config_param :auth_method, :
+    config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
 
     ### Service Account credential
     config_param :email, :string, default: nil
     config_param :private_key_path, :string, default: nil
     config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
-    config_param :json_key, default: nil
+    config_param :json_key, default: nil, secret: true
 
     # see as simple reference
     # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -62,12 +68,32 @@ module Fluent
     # table_id
     # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
     config_param :table, :string, default: nil
-    config_param :tables, :string, default: nil
+    config_param :tables, :string, default: nil # TODO: use :array with value_type: :string
+
+    # template_suffix (only insert)
+    # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+    config_param :template_suffix, :string, default: nil
 
     config_param :auto_create_table, :bool, default: false
 
+    # skip_invalid_rows (only insert)
+    # Insert all valid rows of a request, even if invalid rows exist.
+    # The default value is false, which causes the entire request to fail if any invalid rows exist.
+    config_param :skip_invalid_rows, :bool, default: false
+    # max_bad_records (only load)
+    # The maximum number of bad records that BigQuery can ignore when running the job.
+    # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+    # The default value is 0, which requires that all records are valid.
+    config_param :max_bad_records, :integer, default: 0
+    # ignore_unknown_values
+    # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+    # Default is false, which treats unknown values as errors.
+    config_param :ignore_unknown_values, :bool, default: false
+
     config_param :schema_path, :string, default: nil
     config_param :fetch_schema, :bool, default: false
+    config_param :fetch_schema_table, :string, default: nil
+    config_param :schema_cache_expire, :time, default: 600
     config_param :field_string, :string, default: nil
     config_param :field_integer, :string, default: nil
     config_param :field_float, :string, default: nil
@@ -90,20 +116,15 @@ module Fluent
     config_param :utc, :bool, default: nil
     config_param :time_field, :string, default: nil
 
+    # insert_id_field (only insert)
     config_param :insert_id_field, :string, default: nil
+    # prevent_duplicate_load (only load)
+    config_param :prevent_duplicate_load, :bool, default: false
 
-    config_param :method, :
+    config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
 
-
-
-    # https://developers.google.com/bigquery/loading-data-into-bigquery
-    # Maximum File Sizes:
-    # File Type Compressed Uncompressed
-    # CSV 1 GB With new-lines in strings: 4 GB
-    # Without new-lines in strings: 1 TB
-    # JSON 1 GB 1 TB
-
-    config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
+    # TODO
+    # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
     # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
     # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
     ### method: ''Streaming data inserts support
@@ -114,6 +135,14 @@ module Fluent
     # If you exceed 100 rows per second for an extended period of time, throttling might occur.
     ### Toooooooooooooo short/small per inserts and row!
 
+    ## Timeout
+    # request_timeout_sec
+    # Bigquery API response timeout
+    # request_open_timeout_sec
+    # Bigquery API connection, and request timeout
+    config_param :request_timeout_sec, :time, default: nil
+    config_param :request_open_timeout_sec, :time, default: 60
+
     ### Table types
     # https://developers.google.com/bigquery/docs/tables
     #
@@ -142,34 +171,36 @@ module Fluent
       Faraday.default_connection.options.timeout = 60
     end
 
-    # Define `log` method for v0.10.42 or earlier
-    unless method_defined?(:log)
-      define_method("log") { $log }
-    end
-
     def configure(conf)
+      if conf["method"] == "load"
+        configure_for_load(conf)
+      else
+        configure_for_insert(conf)
+      end
       super
 
-
+      case @method
+      when :insert
         extend(InsertImplementation)
-
+      when :load
+        raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
         extend(LoadImplementation)
       else
-        raise
+        raise Fluent::ConfigError "'method' must be 'insert' or 'load'"
      end
 
      case @auth_method
-      when
+      when :private_key
        unless @email && @private_key_path
          raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
        end
-      when
+      when :compute_engine
        # Do nothing
-      when
+      when :json_key
        unless @json_key
          raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
        end
-      when
+      when :application_default
        # Do nothing
      else
        raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
@@ -181,7 +212,7 @@ module Fluent
 
      @tablelist = @tables ? @tables.split(',') : [@table]
 
-      @fields = RecordSchema.new('record')
+      @fields = Fluent::BigQuery::RecordSchema.new('record')
      if @schema_path
        @fields.load_schema(MultiJson.load(File.read(@schema_path)))
      end
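The schema classes referenced here now live under the Fluent::BigQuery namespace (the new data/lib/fluent/plugin/bigquery/schema.rb). A hypothetical usage sketch mirroring the configure step above, assuming the gem and multi_json are installed and a BigQuery-style schema.json exists:

    require 'multi_json'
    require 'fluent/plugin/bigquery/schema'

    # Build a record schema from a JSON schema file, as configure does with @schema_path.
    fields = Fluent::BigQuery::RecordSchema.new('record')
    fields.load_schema(MultiJson.load(File.read('schema.json')))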
@@ -232,57 +263,20 @@ module Fluent
    def start
      super
 
-      @cached_client = nil
-      @cached_client_expiration = nil
-
      @tables_queue = @tablelist.dup.shuffle
      @tables_mutex = Mutex.new
+      @fetch_schema_mutex = Mutex.new
 
-
+      @last_fetch_schema_time = 0
+      fetch_schema(false) if @fetch_schema
    end
 
-    def
-
-
-
-
-
-
-      case @auth_method
-      when 'private_key'
-        require 'google/api_client/auth/key_utils'
-        key = Google::APIClient::KeyUtils.load_from_pkcs12(@private_key_path, @private_key_passphrase)
-        auth = Signet::OAuth2::Client.new(
-                token_credential_uri: "https://accounts.google.com/o/oauth2/token",
-                audience: "https://accounts.google.com/o/oauth2/token",
-                scope: scope,
-                issuer: @email,
-                signing_key: key)
-
-      when 'compute_engine'
-        auth = Google::Auth::GCECredentials.new
-
-      when 'json_key'
-        if File.exist?(@json_key)
-          auth = File.open(@json_key) do |f|
-            Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
-          end
-        else
-          key = StringIO.new(@json_key)
-          auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
-        end
-
-      when 'application_default'
-        auth = Google::Auth.get_application_default([scope])
-
-      else
-        raise ConfigError, "Unknown auth method: #{@auth_method}"
-      end
-
-      client.authorization = auth
-
-      @cached_client_expiration = Time.now + 1800
-      @cached_client = client
+    def writer
+      @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+        private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+        email: @email,
+        json_key: @json_key,
+      })
    end
 
    def generate_table_id(table_id_format, current_time, row = nil, chunk = nil)
@@ -295,7 +289,6 @@ module Fluent
          current_time
        end
      if row && format =~ /\$\{/
-        json = row[:json]
        format.gsub!(/\$\{\s*(\w+)\s*\}/) do |m|
          row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
        end
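The surviving gsub in generate_table_id expands ${field} placeholders from the record and strips non-word characters so the result stays a valid table id. A small standalone illustration with a made-up row:

    format = "accesslog_${vhost}"
    row    = { json: { vhost: "www.example.com" } }

    format.gsub(/\$\{\s*(\w+)\s*\}/) { row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '') }
    # => "accesslog_wwwexamplecom"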
@@ -313,28 +306,6 @@ module Fluent
      end
    end
 
-    def create_table(table_id)
-      client.insert_table(@project, @dataset, {
-        table_reference: {
-          table_id: table_id,
-        },
-        schema: {
-          fields: @fields.to_a,
-        }
-      }, {})
-    rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-      # api_error? -> client cache clear
-      @cached_client = nil
-
-      message = e.message
-      if e.status_code == 409 && /Already Exists:/ =~ message
-        # ignore 'Already Exists' error
-        return
-      end
-      log.error "tables.insert API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => e.status_code, :message => message
-      raise "failed to create table in bigquery" # TODO: error class
-    end
-
    def replace_record_key(record)
      new_record = {}
      record.each do |key, _|
@@ -363,28 +334,42 @@ module Fluent
        @tables_queue.push t
        t
      end
-
+      template_suffix_format = @template_suffix
+      _write(chunk, table_id_format, template_suffix_format)
    end
 
-    def fetch_schema
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def fetch_schema(allow_overwrite = true)
+      table_id = nil
+      @fetch_schema_mutex.synchronize do
+        if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
+          table_id_format = @fetch_schema_table || @tablelist[0]
+          table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+          schema = writer.fetch_schema(@project, @dataset, table_id)
+
+          if schema
+            if allow_overwrite
+              fields = Fluent::BigQuery::RecordSchema.new("record")
+              fields.load_schema(schema, allow_overwrite)
+              @fields = fields
+            else
+              @fields.load_schema(schema, allow_overwrite)
+            end
+          else
+            if @fields.empty?
+              raise "failed to fetch schema from bigquery"
+            else
+              log.warn "#{table_id} uses previous schema"
+            end
+          end
+
+          @last_fetch_schema_time = Fluent::Engine.now
+        end
+      end
    end
 
    module InsertImplementation
      def format(tag, time, record)
-
+        fetch_schema if @template_suffix
 
        if @replace_record_key
          record = replace_record_key(record)
@@ -394,6 +379,7 @@ module Fluent
          record = convert_hash_to_json(record)
        end
 
+        buf = String.new
        row = @fields.format(@add_time_field.call(record, time))
        unless row.empty?
          row = {"json" => row}
@@ -403,44 +389,51 @@ module Fluent
        buf
      end
 
-      def _write(chunk, table_format)
+      def _write(chunk, table_format, template_suffix_format)
        rows = []
        chunk.msgpack_each do |row_object|
          # TODO: row size limit
          rows << row_object.deep_symbolize_keys
        end
 
-
-
+        now = Time.at(Fluent::Engine.now)
+        group = rows.group_by do |row|
+          [
+            generate_table_id(table_format, now, row, chunk),
+            template_suffix_format ? generate_table_id(template_suffix_format, now, row, chunk) : nil,
+          ]
+        end
+        group.each do |(table_id, template_suffix), group_rows|
+          insert(table_id, group_rows, template_suffix)
        end
      end
 
-      def insert(table_id, rows)
-
-
-
-      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-        # api_error? -> client cache clear
-        @cached_client = nil
-
-        message = e.message
-        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ message.to_s
+      def insert(table_id, rows, template_suffix)
+        writer.insert_rows(@project, @dataset, table_id, rows, skip_invalid_rows: @skip_invalid_rows, ignore_unknown_values: @ignore_unknown_values, template_suffix: template_suffix)
+      rescue Fluent::BigQuery::Writer::Error => e
+        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
          # Table Not Found: Auto Create Table
-          create_table(table_id)
+          writer.create_table(@project, @dataset, table_id, @fields)
          raise "table created. send rows next time."
        end
-
-
+
+        if e.retryable?
+          raise e # TODO: error class
+        elsif @secondary
+          flush_secondary(@secondary)
+        end
      end
    end
 
    module LoadImplementation
      def format(tag, time, record)
-
+        fetch_schema if @fetch_schema_table
 
        if @replace_record_key
          record = replace_record_key(record)
        end
+
+        buf = String.new
        row = @fields.format(@add_time_field.call(record, time))
        unless row.empty?
          buf << MultiJson.dump(row) + "\n"
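_write now groups buffered rows by their destination before calling insert, so one chunk can fan out to several tables (and template suffixes) in a single flush. A rough, self-contained illustration of the group_by step with made-up rows and table names:

    rows = [
      { json: { vhost: "a" } },
      { json: { vhost: "b" } },
      { json: { vhost: "a" } },
    ]

    # The real group key is the pair [table_id, template_suffix];
    # here the table id is faked from the vhost field.
    groups = rows.group_by { |row| ["access_#{row[:json][:vhost]}", nil] }
    groups.keys
    # => [["access_a", nil], ["access_b", nil]]
    # Each group's rows are then sent together in one insert call.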
@@ -448,53 +441,37 @@ module Fluent
        buf
      end
 
-      def _write(chunk, table_id_format)
-
+      def _write(chunk, table_id_format, _)
+        now = Time.at(Fluent::Engine.now)
+        table_id = generate_table_id(table_id_format, now, nil, chunk)
        load(chunk, table_id)
      end
 
      def load(chunk, table_id)
        res = nil
-        create_upload_source(chunk) do |upload_source|
-          res = client.insert_job(@project, {
-            configuration: {
-              load: {
-                destination_table: {
-                  project_id: @project,
-                  dataset_id: @dataset,
-                  table_id: table_id,
-                },
-                schema: {
-                  fields: @fields.to_a,
-                },
-                write_disposition: "WRITE_APPEND",
-                source_format: "NEWLINE_DELIMITED_JSON"
-              }
-            }
-          }, {upload_source: upload_source, content_type: "application/octet-stream"})
-        end
-        wait_load(res, table_id)
-      end
 
-
-
-
-
-        _response = res
-        until _response.status.state == "DONE"
-          log.debug "wait for load job finish", state: _response.status.state
-          sleep wait_interval
-          _response = client.get_job(@project, _response.job_reference.job_id)
+        if @prevent_duplicate_load
+          job_id = create_job_id(chunk, @dataset, table_id, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+        else
+          job_id = nil
        end
 
-
-
-
+        create_upload_source(chunk) do |upload_source|
+          res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
+            ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
+            timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+          })
+        end
+      rescue Fluent::BigQuery::Writer::Error => e
+        if e.retryable?
+          raise e
+        elsif @secondary
+          flush_secondary(@secondary)
        end
-
-        log.debug "finish load job", state: _response.status.state
      end
 
+      private
+
      def create_upload_source(chunk)
        chunk_is_file = @buffer_type == 'file'
        if chunk_is_file
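When prevent_duplicate_load is enabled, the load path derives a deterministic job id from the chunk and its load options, so a retried chunk re-submits the same BigQuery job id rather than starting a second load. A small sketch with made-up inputs (the real method takes the buffer chunk itself):

    require 'digest/sha1'

    def create_job_id(chunk_unique_id, dataset, table, schema, max_bad_records, ignore_unknown_values)
      "fluentd_job_" + Digest::SHA1.hexdigest(
        "#{chunk_unique_id}#{dataset}#{table}#{schema}#{max_bad_records}#{ignore_unknown_values}"
      )
    end

    create_job_id("abc123", "my_dataset", "access_20160101", "[]", 0, false)
    # Same chunk and options => same id ("fluentd_job_" followed by a 40-char SHA1 hex digest)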
@@ -511,200 +488,9 @@ module Fluent
          end
        end
      end
-    end
-
-    class FieldSchema
-      def initialize(name, mode = :nullable)
-        unless [:nullable, :required, :repeated].include?(mode)
-          raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
-        end
-        ### https://developers.google.com/bigquery/docs/tables
-        # Each field has the following properties:
-        #
-        # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
-        # and must start with a letter or underscore. The maximum length is 128 characters.
-        # https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
-        unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
-          raise Fluent::ConfigError, "invalid bigquery field name: '#{name}'"
-        end
-
-        @name = name
-        @mode = mode
-      end
-
-      attr_reader :name, :mode
-
-      def format(value)
-        case @mode
-        when :nullable
-          format_one(value) unless value.nil?
-        when :required
-          raise "Required field #{name} cannot be null" if value.nil?
-          format_one(value)
-        when :repeated
-          value.nil? ? [] : value.map {|v| format_one(v) }
-        end
-      end
-
-      def format_one(value)
-        raise NotImplementedError, "Must implement in a subclass"
-      end
-
-      def to_h
-        {
-          :name => name,
-          :type => type.to_s.upcase,
-          :mode => mode.to_s.upcase,
-        }
-      end
-    end
-
-    class StringFieldSchema < FieldSchema
-      def type
-        :string
-      end
-
-      def format_one(value)
-        value.to_s
-      end
-    end
-
-    class IntegerFieldSchema < FieldSchema
-      def type
-        :integer
-      end
-
-      def format_one(value)
-        value.to_i
-      end
-    end
-
-    class FloatFieldSchema < FieldSchema
-      def type
-        :float
-      end
-
-      def format_one(value)
-        value.to_f
-      end
-    end
-
-    class BooleanFieldSchema < FieldSchema
-      def type
-        :boolean
-      end
-
-      def format_one(value)
-        !!value
-      end
-    end
-
-    class TimestampFieldSchema < FieldSchema
-      def type
-        :timestamp
-      end
-
-      def format_one(value)
-        value
-      end
-    end
-
-    class RecordSchema < FieldSchema
-      FIELD_TYPES = {
-        string: StringFieldSchema,
-        integer: IntegerFieldSchema,
-        float: FloatFieldSchema,
-        boolean: BooleanFieldSchema,
-        timestamp: TimestampFieldSchema,
-        record: RecordSchema
-      }.freeze
-
-      def initialize(name, mode = :nullable)
-        super(name, mode)
-        @fields = {}
-      end
-
-      def type
-        :record
-      end
-
-      def [](name)
-        @fields[name]
-      end
-
-      def to_a
-        @fields.map do |_, field_schema|
-          field_schema.to_h
-        end
-      end
-
-      def to_h
-        {
-          :name => name,
-          :type => type.to_s.upcase,
-          :mode => mode.to_s.upcase,
-          :fields => self.to_a,
-        }
-      end
-
-      def load_schema(schema, allow_overwrite=true)
-        schema.each do |field|
-          raise ConfigError, 'field must have type' unless field.key?('type')
-
-          name = field['name']
-          mode = (field['mode'] || 'nullable').downcase.to_sym
-
-          type = field['type'].downcase.to_sym
-          field_schema_class = FIELD_TYPES[type]
-          raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
-
-          next if @fields.key?(name) and !allow_overwrite
-
-          field_schema = field_schema_class.new(name, mode)
-          @fields[name] = field_schema
-          if type == :record
-            raise ConfigError, "record field must have fields" unless field.key?('fields')
-            field_schema.load_schema(field['fields'], allow_overwrite)
-          end
-        end
-      end
-
-      def register_field(name, type)
-        if @fields.key?(name) and @fields[name].type != :timestamp
-          raise ConfigError, "field #{name} is registered twice"
-        end
-        if name[/\./]
-          recordname = $`
-          fieldname = $'
-          register_record_field(recordname)
-          @fields[recordname].register_field(fieldname, type)
-        else
-          schema = FIELD_TYPES[type]
-          raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
-          @fields[name] = schema.new(name)
-        end
-      end
 
-      def
-
-        @fields.each do |key, schema|
-          value = record[key]
-          formatted = schema.format(value)
-          next if formatted.nil? # field does not exists, or null value
-          out[key] = formatted
-        end
-        out
-      end
-
-      private
-      def register_record_field(name)
-        if !@fields.key?(name)
-          @fields[name] = RecordSchema.new(name)
-        else
-          unless @fields[name].kind_of?(RecordSchema)
-            raise ConfigError, "field #{name} is required to be a record but already registered as #{@field[name]}"
-          end
-        end
+      def create_job_id(chunk, dataset, table, schema, max_bad_records, ignore_unknown_values)
+        "fluentd_job_" + Digest::SHA1.hexdigest("#{chunk.unique_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
      end
    end
  end