fluent-plugin-bigquery 0.2.16 → 0.3.0
- checksums.yaml +4 -4
- data/README.md +76 -3
- data/Rakefile +1 -0
- data/fluent-plugin-bigquery.gemspec +3 -5
- data/lib/fluent/plugin/bigquery/schema.rb +221 -0
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +289 -0
- data/lib/fluent/plugin/out_bigquery.rb +159 -373
- data/test/helper.rb +1 -0
- data/test/plugin/test_out_bigquery.rb +470 -142
- data/test/plugin/test_record_schema.rb +173 -0
- metadata +17 -21
--- a/data/lib/fluent/plugin/out_bigquery.rb
+++ b/data/lib/fluent/plugin/out_bigquery.rb
@@ -5,6 +5,9 @@ require 'fluent/plugin/bigquery/version'
 require 'fluent/mixin/config_placeholders'
 require 'fluent/mixin/plaintextformatter'
 
+require 'fluent/plugin/bigquery/schema'
+require 'fluent/plugin/bigquery/writer'
+
 ## TODO: load implementation
 # require 'fluent/plugin/bigquery/load_request_body_wrapper'
 
@@ -19,36 +22,39 @@ module Fluent
     # https://developers.google.com/bigquery/browser-tool-quickstart
     # https://developers.google.com/bigquery/bigquery-api-quickstart
 
-
-
-
-    config_set_default :try_flush_interval, 0.05
+    ### default for insert
+    def configure_for_insert(conf)
+      raise ConfigError unless conf["method"] != "load"
 
-
-
-
+      conf["buffer_type"] = "lightening" unless conf["buffer_type"]
+      conf["flush_interval"] = 0.25 unless conf["flush_interval"]
+      conf["try_flush_interval"] = 0.05 unless conf["try_flush_interval"]
+      conf["buffer_chunk_limit"] = 1 * 1024 ** 2 unless conf["buffer_chunk_limit"] # 1MB
+      conf["buffer_queue_limit"] = 1024 unless conf["buffer_queue_limit"]
+      conf["buffer_chunk_records_limit"] = 500 unless conf["buffer_chunk_records_limit"]
+    end
 
-    ### for loads
-
-
-    # config_set_default :buffer_chunk_limit, 1000**4 # 1.0*10^12 < 1TB (1024^4)
+    ### default for loads
+    def configure_for_load(conf)
+      raise ConfigError unless conf["method"] == "load"
 
-
-
-
+      # buffer_type, flush_interval, try_flush_interval is TimeSlicedOutput default
+      conf["buffer_chunk_limit"] = 1 * 1024 ** 3 unless conf["buffer_chunk_limit"] # 1GB
+      conf["buffer_queue_limit"] = 32 unless conf["buffer_queue_limit"]
+    end
 
     # Available methods are:
     # * private_key -- Use service account credential from pkcs12 private key file
     # * compute_engine -- Use access token available in instances of ComputeEngine
-    # *
+    # * json_key -- Use service account credential from JSON key
     # * application_default -- Use application default credential
-    config_param :auth_method, :
+    config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
 
     ### Service Account credential
     config_param :email, :string, default: nil
     config_param :private_key_path, :string, default: nil
     config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
-    config_param :json_key, default: nil
+    config_param :json_key, default: nil, secret: true
 
     # see as simple reference
     # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
@@ -62,12 +68,32 @@ module Fluent
     # table_id
     # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
     config_param :table, :string, default: nil
-    config_param :tables, :string, default: nil
+    config_param :tables, :string, default: nil # TODO: use :array with value_type: :string
+
+    # template_suffix (only insert)
+    # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+    config_param :template_suffix, :string, default: nil
 
     config_param :auto_create_table, :bool, default: false
 
+    # skip_invalid_rows (only insert)
+    #   Insert all valid rows of a request, even if invalid rows exist.
+    #   The default value is false, which causes the entire request to fail if any invalid rows exist.
+    config_param :skip_invalid_rows, :bool, default: false
+    # max_bad_records (only load)
+    #   The maximum number of bad records that BigQuery can ignore when running the job.
+    #   If the number of bad records exceeds this value, an invalid error is returned in the job result.
+    #   The default value is 0, which requires that all records are valid.
+    config_param :max_bad_records, :integer, default: 0
+    # ignore_unknown_values
+    #   Accept rows that contain values that do not match the schema. The unknown values are ignored.
+    #   Default is false, which treats unknown values as errors.
+    config_param :ignore_unknown_values, :bool, default: false
+
     config_param :schema_path, :string, default: nil
     config_param :fetch_schema, :bool, default: false
+    config_param :fetch_schema_table, :string, default: nil
+    config_param :schema_cache_expire, :time, default: 600
     config_param :field_string, :string, default: nil
     config_param :field_integer, :string, default: nil
     config_param :field_float, :string, default: nil
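The three insert-only options added here correspond directly to fields of BigQuery's `tabledata.insertAll` request. A minimal sketch of that mapping with the `google-api-client` gem (assuming the 0.9.x generated client; the plugin itself routes the call through its new `Fluent::BigQuery::Writer`, and the project, dataset, and table names below are placeholders):

```ruby
require 'googleauth'
require 'google/apis/bigquery_v2'

# Sketch only: shows where skip_invalid_rows, ignore_unknown_values, and
# template_suffix land in the insertAll request body.
service = Google::Apis::BigqueryV2::BigqueryService.new
service.authorization = Google::Auth.get_application_default(["https://www.googleapis.com/auth/bigquery"])

row = Google::Apis::BigqueryV2::InsertAllTableDataRequest::Row.new(json: { message: "hello" })
request = Google::Apis::BigqueryV2::InsertAllTableDataRequest.new(
  rows: [row],
  skip_invalid_rows: true,      # keep the valid rows even if some rows are invalid
  ignore_unknown_values: true,  # silently drop fields absent from the table schema
  template_suffix: "_20160101"  # stream into a table instantiated from the template
)
service.insert_all_table_data("my-project", "my_dataset", "my_table", request)
```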
@@ -90,20 +116,15 @@ module Fluent
     config_param :utc, :bool, default: nil
     config_param :time_field, :string, default: nil
 
+    # insert_id_field (only insert)
     config_param :insert_id_field, :string, default: nil
+    # prevent_duplicate_load (only load)
+    config_param :prevent_duplicate_load, :bool, default: false
 
-    config_param :method, :
+    config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
 
-
-
-    # https://developers.google.com/bigquery/loading-data-into-bigquery
-    # Maximum File Sizes:
-    # File Type   Compressed   Uncompressed
-    # CSV         1 GB         With new-lines in strings: 4 GB
-    #                          Without new-lines in strings: 1 TB
-    # JSON        1 GB         1 TB
-
-    config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
+    # TODO
+    # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
     # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
     # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
     ### method: ''Streaming data inserts support
@@ -114,6 +135,14 @@ module Fluent
     # If you exceed 100 rows per second for an extended period of time, throttling might occur.
     ### Toooooooooooooo short/small per inserts and row!
 
+    ## Timeout
+    # request_timeout_sec
+    #   Bigquery API response timeout
+    # request_open_timeout_sec
+    #   Bigquery API connection, and request timeout
+    config_param :request_timeout_sec, :time, default: nil
+    config_param :request_open_timeout_sec, :time, default: 60
+
     ### Table types
     # https://developers.google.com/bigquery/docs/tables
     #
@@ -142,34 +171,36 @@ module Fluent
       Faraday.default_connection.options.timeout = 60
     end
 
-    # Define `log` method for v0.10.42 or earlier
-    unless method_defined?(:log)
-      define_method("log") { $log }
-    end
-
     def configure(conf)
+      if conf["method"] == "load"
+        configure_for_load(conf)
+      else
+        configure_for_insert(conf)
+      end
       super
 
-
+      case @method
+      when :insert
         extend(InsertImplementation)
-
+      when :load
+        raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
         extend(LoadImplementation)
       else
-        raise
+        raise Fluent::ConfigError "'method' must be 'insert' or 'load'"
       end
 
       case @auth_method
-      when
+      when :private_key
         unless @email && @private_key_path
           raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
         end
-      when
+      when :compute_engine
         # Do nothing
-      when
+      when :json_key
         unless @json_key
           raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
        end
-      when
+      when :application_default
         # Do nothing
       else
         raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
@@ -181,7 +212,7 @@ module Fluent
 
       @tablelist = @tables ? @tables.split(',') : [@table]
 
-      @fields = RecordSchema.new('record')
+      @fields = Fluent::BigQuery::RecordSchema.new('record')
       if @schema_path
         @fields.load_schema(MultiJson.load(File.read(@schema_path)))
       end
@@ -232,57 +263,20 @@ module Fluent
     def start
       super
 
-      @cached_client = nil
-      @cached_client_expiration = nil
-
       @tables_queue = @tablelist.dup.shuffle
       @tables_mutex = Mutex.new
+      @fetch_schema_mutex = Mutex.new
 
-
+      @last_fetch_schema_time = 0
+      fetch_schema(false) if @fetch_schema
     end
 
-    def
-
-
-
-
-
-
-      case @auth_method
-      when 'private_key'
-        require 'google/api_client/auth/key_utils'
-        key = Google::APIClient::KeyUtils.load_from_pkcs12(@private_key_path, @private_key_passphrase)
-        auth = Signet::OAuth2::Client.new(
-                token_credential_uri: "https://accounts.google.com/o/oauth2/token",
-                audience: "https://accounts.google.com/o/oauth2/token",
-                scope: scope,
-                issuer: @email,
-                signing_key: key)
-
-      when 'compute_engine'
-        auth = Google::Auth::GCECredentials.new
-
-      when 'json_key'
-        if File.exist?(@json_key)
-          auth = File.open(@json_key) do |f|
-            Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
-          end
-        else
-          key = StringIO.new(@json_key)
-          auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
-        end
-
-      when 'application_default'
-        auth = Google::Auth.get_application_default([scope])
-
-      else
-        raise ConfigError, "Unknown auth method: #{@auth_method}"
-      end
-
-      client.authorization = auth
-
-      @cached_client_expiration = Time.now + 1800
-      @cached_client = client
+    def writer
+      @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+        private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+        email: @email,
+        json_key: @json_key,
+      })
     end
 
     def generate_table_id(table_id_format, current_time, row = nil, chunk = nil)
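The `writer` accessor above replaces the removed `client` method and its 30-minute credential cache: all four auth flows now live inside `Fluent::BigQuery::Writer`, which is constructed once and memoized. A sketch of the call shape, matching the constructor and the `fetch_schema` call visible later in this diff (the logger, key path, and names are illustrative, not the plugin's internals):

```ruby
require 'logger'

# Illustrative usage; in the plugin, @log and the config params fill these in.
writer = Fluent::BigQuery::Writer.new(Logger.new($stdout), :json_key, {
  json_key: File.read("/path/to/service_account.json"),
})
schema = writer.fetch_schema("my-project", "my_dataset", "access_20160101")
```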
@@ -295,7 +289,6 @@ module Fluent
           current_time
         end
         if row && format =~ /\$\{/
-          json = row[:json]
           format.gsub!(/\$\{\s*(\w+)\s*\}/) do |m|
             row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
           end
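The removed `json` local was never used; the substitution reads `row[:json]` directly. For reference, a standalone sketch of how a table id format expands: `${field}` placeholders resolve from the row (stripped of non-word characters) before `strftime` resolves the time placeholders.

```ruby
require 'time'

format = "access_%Y%m%d_${service}"
row    = { json: { service: "web-front" } }
now    = Time.parse("2016-01-15 12:00:00 UTC")

expanded = format.gsub(/\$\{\s*(\w+)\s*\}/) { row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '') }
puts now.strftime(expanded) # => "access_20160115_webfront"
```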
@@ -313,28 +306,6 @@ module Fluent
       end
     end
 
-    def create_table(table_id)
-      client.insert_table(@project, @dataset, {
-        table_reference: {
-          table_id: table_id,
-        },
-        schema: {
-          fields: @fields.to_a,
-        }
-      }, {})
-    rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-      # api_error? -> client cache clear
-      @cached_client = nil
-
-      message = e.message
-      if e.status_code == 409 && /Already Exists:/ =~ message
-        # ignore 'Already Exists' error
-        return
-      end
-      log.error "tables.insert API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => e.status_code, :message => message
-      raise "failed to create table in bigquery" # TODO: error class
-    end
-
     def replace_record_key(record)
       new_record = {}
       record.each do |key, _|
@@ -363,28 +334,42 @@ module Fluent
         @tables_queue.push t
         t
       end
-
+      template_suffix_format = @template_suffix
+      _write(chunk, table_id_format, template_suffix_format)
     end
 
-    def fetch_schema
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def fetch_schema(allow_overwrite = true)
+      table_id = nil
+      @fetch_schema_mutex.synchronize do
+        if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
+          table_id_format = @fetch_schema_table || @tablelist[0]
+          table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+          schema = writer.fetch_schema(@project, @dataset, table_id)
+
+          if schema
+            if allow_overwrite
+              fields = Fluent::BigQuery::RecordSchema.new("record")
+              fields.load_schema(schema, allow_overwrite)
+              @fields = fields
+            else
+              @fields.load_schema(schema, allow_overwrite)
+            end
+          else
+            if @fields.empty?
+              raise "failed to fetch schema from bigquery"
+            else
+              log.warn "#{table_id} uses previous schema"
+            end
+          end
+
+          @last_fetch_schema_time = Fluent::Engine.now
+        end
+      end
     end
 
     module InsertImplementation
       def format(tag, time, record)
-
+        fetch_schema if @template_suffix
 
         if @replace_record_key
           record = replace_record_key(record)
@@ -394,6 +379,7 @@ module Fluent
           record = convert_hash_to_json(record)
         end
 
+        buf = String.new
         row = @fields.format(@add_time_field.call(record, time))
         unless row.empty?
           row = {"json" => row}
@@ -403,44 +389,51 @@ module Fluent
         buf
       end
 
-      def _write(chunk, table_format)
+      def _write(chunk, table_format, template_suffix_format)
         rows = []
         chunk.msgpack_each do |row_object|
           # TODO: row size limit
           rows << row_object.deep_symbolize_keys
         end
 
-
-
+        now = Time.at(Fluent::Engine.now)
+        group = rows.group_by do |row|
+          [
+            generate_table_id(table_format, now, row, chunk),
+            template_suffix_format ? generate_table_id(template_suffix_format, now, row, chunk) : nil,
+          ]
+        end
+        group.each do |(table_id, template_suffix), group_rows|
+          insert(table_id, group_rows, template_suffix)
         end
       end
 
-      def insert(table_id, rows)
-
-
-
-      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
-        # api_error? -> client cache clear
-        @cached_client = nil
-
-        message = e.message
-        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ message.to_s
+      def insert(table_id, rows, template_suffix)
+        writer.insert_rows(@project, @dataset, table_id, rows, skip_invalid_rows: @skip_invalid_rows, ignore_unknown_values: @ignore_unknown_values, template_suffix: template_suffix)
+      rescue Fluent::BigQuery::Writer::Error => e
+        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
           # Table Not Found: Auto Create Table
-          create_table(table_id)
+          writer.create_table(@project, @dataset, table_id, @fields)
           raise "table created. send rows next time."
         end
-
-
+
+        if e.retryable?
+          raise e # TODO: error class
+        elsif @secondary
+          flush_secondary(@secondary)
+        end
       end
     end
 
     module LoadImplementation
       def format(tag, time, record)
-
+        fetch_schema if @fetch_schema_table
 
         if @replace_record_key
           record = replace_record_key(record)
         end
+
+        buf = String.new
         row = @fields.format(@add_time_field.call(record, time))
         unless row.empty?
           buf << MultiJson.dump(row) + "\n"
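Streaming inserts are now batched per destination: each chunk's rows are grouped by the pair (table id, template suffix), and `insert` runs once per group, so a single `insertAll` request never spans two tables. A toy illustration of that grouping step (the table naming here is made up):

```ruby
rows = [
  { json: { service: "web" } },
  { json: { service: "api" } },
  { json: { service: "web" } },
]

groups = rows.group_by { |row| ["events_#{row[:json][:service]}", nil] }
groups.each do |(table_id, template_suffix), group_rows|
  puts "#{table_id}: #{group_rows.size} row(s), suffix=#{template_suffix.inspect}"
end
# events_web: 2 row(s), suffix=nil
# events_api: 1 row(s), suffix=nil
```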
@@ -448,53 +441,37 @@ module Fluent
         buf
       end
 
-      def _write(chunk, table_id_format)
-
+      def _write(chunk, table_id_format, _)
+        now = Time.at(Fluent::Engine.now)
+        table_id = generate_table_id(table_id_format, now, nil, chunk)
         load(chunk, table_id)
       end
 
       def load(chunk, table_id)
         res = nil
-        create_upload_source(chunk) do |upload_source|
-          res = client.insert_job(@project, {
-            configuration: {
-              load: {
-                destination_table: {
-                  project_id: @project,
-                  dataset_id: @dataset,
-                  table_id: table_id,
-                },
-                schema: {
-                  fields: @fields.to_a,
-                },
-                write_disposition: "WRITE_APPEND",
-                source_format: "NEWLINE_DELIMITED_JSON"
-              }
-            }
-          }, {upload_source: upload_source, content_type: "application/octet-stream"})
-        end
-        wait_load(res, table_id)
-      end
 
-
-
-
-
-        _response = res
-        until _response.status.state == "DONE"
-          log.debug "wait for load job finish", state: _response.status.state
-          sleep wait_interval
-          _response = client.get_job(@project, _response.job_reference.job_id)
+        if @prevent_duplicate_load
+          job_id = create_job_id(chunk, @dataset, table_id, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+        else
+          job_id = nil
         end
 
-
-
-
+        create_upload_source(chunk) do |upload_source|
+          res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
+            ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
+            timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+          })
+        end
+      rescue Fluent::BigQuery::Writer::Error => e
+        if e.retryable?
+          raise e
+        elsif @secondary
+          flush_secondary(@secondary)
         end
-
-        log.debug "finish load job", state: _response.status.state
       end
 
+      private
+
       def create_upload_source(chunk)
         chunk_is_file = @buffer_type == 'file'
         if chunk_is_file
@@ -511,200 +488,9 @@ module Fluent
           end
         end
       end
-    end
-
-    class FieldSchema
-      def initialize(name, mode = :nullable)
-        unless [:nullable, :required, :repeated].include?(mode)
-          raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
-        end
-        ### https://developers.google.com/bigquery/docs/tables
-        # Each field has the following properties:
-        #
-        # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
-        #        and must start with a letter or underscore. The maximum length is 128 characters.
-        #        https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
-        unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
-          raise Fluent::ConfigError, "invalid bigquery field name: '#{name}'"
-        end
-
-        @name = name
-        @mode = mode
-      end
-
-      attr_reader :name, :mode
-
-      def format(value)
-        case @mode
-        when :nullable
-          format_one(value) unless value.nil?
-        when :required
-          raise "Required field #{name} cannot be null" if value.nil?
-          format_one(value)
-        when :repeated
-          value.nil? ? [] : value.map {|v| format_one(v) }
-        end
-      end
-
-      def format_one(value)
-        raise NotImplementedError, "Must implement in a subclass"
-      end
-
-      def to_h
-        {
-          :name => name,
-          :type => type.to_s.upcase,
-          :mode => mode.to_s.upcase,
-        }
-      end
-    end
-
-    class StringFieldSchema < FieldSchema
-      def type
-        :string
-      end
-
-      def format_one(value)
-        value.to_s
-      end
-    end
-
-    class IntegerFieldSchema < FieldSchema
-      def type
-        :integer
-      end
-
-      def format_one(value)
-        value.to_i
-      end
-    end
-
-    class FloatFieldSchema < FieldSchema
-      def type
-        :float
-      end
-
-      def format_one(value)
-        value.to_f
-      end
-    end
-
-    class BooleanFieldSchema < FieldSchema
-      def type
-        :boolean
-      end
-
-      def format_one(value)
-        !!value
-      end
-    end
-
-    class TimestampFieldSchema < FieldSchema
-      def type
-        :timestamp
-      end
-
-      def format_one(value)
-        value
-      end
-    end
-
-    class RecordSchema < FieldSchema
-      FIELD_TYPES = {
-        string: StringFieldSchema,
-        integer: IntegerFieldSchema,
-        float: FloatFieldSchema,
-        boolean: BooleanFieldSchema,
-        timestamp: TimestampFieldSchema,
-        record: RecordSchema
-      }.freeze
-
-      def initialize(name, mode = :nullable)
-        super(name, mode)
-        @fields = {}
-      end
-
-      def type
-        :record
-      end
-
-      def [](name)
-        @fields[name]
-      end
-
-      def to_a
-        @fields.map do |_, field_schema|
-          field_schema.to_h
-        end
-      end
-
-      def to_h
-        {
-          :name => name,
-          :type => type.to_s.upcase,
-          :mode => mode.to_s.upcase,
-          :fields => self.to_a,
-        }
-      end
-
-      def load_schema(schema, allow_overwrite=true)
-        schema.each do |field|
-          raise ConfigError, 'field must have type' unless field.key?('type')
-
-          name = field['name']
-          mode = (field['mode'] || 'nullable').downcase.to_sym
-
-          type = field['type'].downcase.to_sym
-          field_schema_class = FIELD_TYPES[type]
-          raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
-
-          next if @fields.key?(name) and !allow_overwrite
-
-          field_schema = field_schema_class.new(name, mode)
-          @fields[name] = field_schema
-          if type == :record
-            raise ConfigError, "record field must have fields" unless field.key?('fields')
-            field_schema.load_schema(field['fields'], allow_overwrite)
-          end
-        end
-      end
-
-      def register_field(name, type)
-        if @fields.key?(name) and @fields[name].type != :timestamp
-          raise ConfigError, "field #{name} is registered twice"
-        end
-        if name[/\./]
-          recordname = $`
-          fieldname = $'
-          register_record_field(recordname)
-          @fields[recordname].register_field(fieldname, type)
-        else
-          schema = FIELD_TYPES[type]
-          raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
-          @fields[name] = schema.new(name)
-        end
-      end
 
-      def
-
-        @fields.each do |key, schema|
-          value = record[key]
-          formatted = schema.format(value)
-          next if formatted.nil? # field does not exists, or null value
-          out[key] = formatted
-        end
-        out
-      end
-
-      private
-      def register_record_field(name)
-        if !@fields.key?(name)
-          @fields[name] = RecordSchema.new(name)
-        else
-          unless @fields[name].kind_of?(RecordSchema)
-            raise ConfigError, "field #{name} is required to be a record but already registered as #{@field[name]}"
-          end
-        end
+      def create_job_id(chunk, dataset, table, schema, max_bad_records, ignore_unknown_values)
+        "fluentd_job_" + Digest::SHA1.hexdigest("#{chunk.unique_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
       end
     end
   end