fluent-plugin-bigquery 0.2.16 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,6 +5,9 @@ require 'fluent/plugin/bigquery/version'
  require 'fluent/mixin/config_placeholders'
  require 'fluent/mixin/plaintextformatter'
 
+ require 'fluent/plugin/bigquery/schema'
+ require 'fluent/plugin/bigquery/writer'
+
  ## TODO: load implementation
  # require 'fluent/plugin/bigquery/load_request_body_wrapper'
 
@@ -19,36 +22,39 @@ module Fluent
  # https://developers.google.com/bigquery/browser-tool-quickstart
  # https://developers.google.com/bigquery/bigquery-api-quickstart
 
- config_set_default :buffer_type, 'lightening'
-
- config_set_default :flush_interval, 0.25
- config_set_default :try_flush_interval, 0.05
+ ### default for insert
+ def configure_for_insert(conf)
+ raise ConfigError unless conf["method"] != "load"
 
- config_set_default :buffer_chunk_records_limit, 500
- config_set_default :buffer_chunk_limit, 1000000
- config_set_default :buffer_queue_limit, 1024
+ conf["buffer_type"] = "lightening" unless conf["buffer_type"]
+ conf["flush_interval"] = 0.25 unless conf["flush_interval"]
+ conf["try_flush_interval"] = 0.05 unless conf["try_flush_interval"]
+ conf["buffer_chunk_limit"] = 1 * 1024 ** 2 unless conf["buffer_chunk_limit"] # 1MB
+ conf["buffer_queue_limit"] = 1024 unless conf["buffer_queue_limit"]
+ conf["buffer_chunk_records_limit"] = 500 unless conf["buffer_chunk_records_limit"]
+ end
 
- ### for loads
- ### TODO: different default values for buffering between 'load' and insert
- # config_set_default :flush_interval, 1800 # 30min => 48 imports/day
- # config_set_default :buffer_chunk_limit, 1000**4 # 1.0*10^12 < 1TB (1024^4)
+ ### default for loads
+ def configure_for_load(conf)
+ raise ConfigError unless conf["method"] == "load"
 
- ### OAuth credential
- # config_param :client_id, :string
- # config_param :client_secret, :string
+ # buffer_type, flush_interval, try_flush_interval is TimeSlicedOutput default
+ conf["buffer_chunk_limit"] = 1 * 1024 ** 3 unless conf["buffer_chunk_limit"] # 1GB
+ conf["buffer_queue_limit"] = 32 unless conf["buffer_queue_limit"]
+ end
 
  # Available methods are:
  # * private_key -- Use service account credential from pkcs12 private key file
  # * compute_engine -- Use access token available in instances of ComputeEngine
- # * private_json_key -- Use service account credential from JSON key
+ # * json_key -- Use service account credential from JSON key
  # * application_default -- Use application default credential
- config_param :auth_method, :string, default: 'private_key'
+ config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
 
  ### Service Account credential
  config_param :email, :string, default: nil
  config_param :private_key_path, :string, default: nil
  config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
- config_param :json_key, default: nil
+ config_param :json_key, default: nil, secret: true
 
  # see as simple reference
  # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
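
0.3.0 replaces the class-wide `config_set_default` calls with `configure_for_insert` / `configure_for_load`, which fill buffer defaults only when the user has not set them, so `insert` and `load` can get different defaults. The pattern is plain conditional assignment on the conf hash; a minimal standalone sketch (the literal Hash stands in for Fluentd's config element):

```ruby
# Minimal sketch of the "set only if unset" default pattern used by
# configure_for_insert / configure_for_load above.
conf = { "flush_interval" => 1 }  # pretend the user set only this key
conf["buffer_type"]        = "lightening"  unless conf["buffer_type"]
conf["flush_interval"]     = 0.25          unless conf["flush_interval"]
conf["buffer_chunk_limit"] = 1 * 1024 ** 2 unless conf["buffer_chunk_limit"] # 1MB
conf
# => {"flush_interval"=>1, "buffer_type"=>"lightening", "buffer_chunk_limit"=>1048576}
```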
@@ -62,12 +68,32 @@ module Fluent
  # table_id
  # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
  config_param :table, :string, default: nil
- config_param :tables, :string, default: nil
+ config_param :tables, :string, default: nil # TODO: use :array with value_type: :string
+
+ # template_suffix (only insert)
+ # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+ config_param :template_suffix, :string, default: nil
 
  config_param :auto_create_table, :bool, default: false
 
+ # skip_invalid_rows (only insert)
+ # Insert all valid rows of a request, even if invalid rows exist.
+ # The default value is false, which causes the entire request to fail if any invalid rows exist.
+ config_param :skip_invalid_rows, :bool, default: false
+ # max_bad_records (only load)
+ # The maximum number of bad records that BigQuery can ignore when running the job.
+ # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+ # The default value is 0, which requires that all records are valid.
+ config_param :max_bad_records, :integer, default: 0
+ # ignore_unknown_values
+ # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+ # Default is false, which treats unknown values as errors.
+ config_param :ignore_unknown_values, :bool, default: false
+
  config_param :schema_path, :string, default: nil
  config_param :fetch_schema, :bool, default: false
+ config_param :fetch_schema_table, :string, default: nil
+ config_param :schema_cache_expire, :time, default: 600
  config_param :field_string, :string, default: nil
  config_param :field_integer, :string, default: nil
  config_param :field_float, :string, default: nil
@@ -90,20 +116,15 @@ module Fluent
  config_param :utc, :bool, default: nil
  config_param :time_field, :string, default: nil
 
+ # insert_id_field (only insert)
  config_param :insert_id_field, :string, default: nil
+ # prevent_duplicate_load (only load)
+ config_param :prevent_duplicate_load, :bool, default: false
 
- config_param :method, :string, default: 'insert' # or 'load'
+ config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
 
- config_param :load_size_limit, :integer, default: 1000**4 # < 1TB (1024^4) # TODO: not implemented now
- ### method: 'load'
- # https://developers.google.com/bigquery/loading-data-into-bigquery
- # Maximum File Sizes:
- # File Type Compressed Uncompressed
- # CSV 1 GB With new-lines in strings: 4 GB
- # Without new-lines in strings: 1 TB
- # JSON 1 GB 1 TB
-
- config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
+ # TODO
+ # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
  # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
  # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
  ### method: ''Streaming data inserts support
@@ -114,6 +135,14 @@ module Fluent
  # If you exceed 100 rows per second for an extended period of time, throttling might occur.
  ### Toooooooooooooo short/small per inserts and row!
 
+ ## Timeout
+ # request_timeout_sec
+ # Bigquery API response timeout
+ # request_open_timeout_sec
+ # Bigquery API connection, and request timeout
+ config_param :request_timeout_sec, :time, default: nil
+ config_param :request_open_timeout_sec, :time, default: 60
+
  ### Table types
  # https://developers.google.com/bigquery/docs/tables
  #
@@ -142,34 +171,36 @@ module Fluent
  Faraday.default_connection.options.timeout = 60
  end
 
- # Define `log` method for v0.10.42 or earlier
- unless method_defined?(:log)
- define_method("log") { $log }
- end
-
  def configure(conf)
+ if conf["method"] == "load"
+ configure_for_load(conf)
+ else
+ configure_for_insert(conf)
+ end
  super
 
- if @method == "insert"
+ case @method
+ when :insert
  extend(InsertImplementation)
- elsif @method == "load"
+ when :load
+ raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
  extend(LoadImplementation)
  else
- raise Fluend::ConfigError "'method' must be 'insert' or 'load'"
+ raise Fluent::ConfigError "'method' must be 'insert' or 'load'"
  end
 
  case @auth_method
- when 'private_key'
+ when :private_key
  unless @email && @private_key_path
  raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
  end
- when 'compute_engine'
+ when :compute_engine
  # Do nothing
- when 'json_key'
+ when :json_key
  unless @json_key
  raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
  end
- when 'application_default'
+ when :application_default
  # Do nothing
  else
  raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
@@ -181,7 +212,7 @@ module Fluent
 
  @tablelist = @tables ? @tables.split(',') : [@table]
 
- @fields = RecordSchema.new('record')
+ @fields = Fluent::BigQuery::RecordSchema.new('record')
  if @schema_path
  @fields.load_schema(MultiJson.load(File.read(@schema_path)))
  end
@@ -232,57 +263,20 @@ module Fluent
  def start
  super
 
- @cached_client = nil
- @cached_client_expiration = nil
-
  @tables_queue = @tablelist.dup.shuffle
  @tables_mutex = Mutex.new
+ @fetch_schema_mutex = Mutex.new
 
- fetch_schema() if @fetch_schema
+ @last_fetch_schema_time = 0
+ fetch_schema(false) if @fetch_schema
  end
 
- def client
- return @cached_client if @cached_client && @cached_client_expiration > Time.now
-
- client = Google::Apis::BigqueryV2::BigqueryService.new
-
- scope = "https://www.googleapis.com/auth/bigquery"
-
- case @auth_method
- when 'private_key'
- require 'google/api_client/auth/key_utils'
- key = Google::APIClient::KeyUtils.load_from_pkcs12(@private_key_path, @private_key_passphrase)
- auth = Signet::OAuth2::Client.new(
- token_credential_uri: "https://accounts.google.com/o/oauth2/token",
- audience: "https://accounts.google.com/o/oauth2/token",
- scope: scope,
- issuer: @email,
- signing_key: key)
-
- when 'compute_engine'
- auth = Google::Auth::GCECredentials.new
-
- when 'json_key'
- if File.exist?(@json_key)
- auth = File.open(@json_key) do |f|
- Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
- end
- else
- key = StringIO.new(@json_key)
- auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
- end
-
- when 'application_default'
- auth = Google::Auth.get_application_default([scope])
-
- else
- raise ConfigError, "Unknown auth method: #{@auth_method}"
- end
-
- client.authorization = auth
-
- @cached_client_expiration = Time.now + 1800
- @cached_client = client
+ def writer
+ @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+ private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+ email: @email,
+ json_key: @json_key,
+ })
  end
 
  def generate_table_id(table_id_format, current_time, row = nil, chunk = nil)
@@ -295,7 +289,6 @@ module Fluent
  current_time
  end
  if row && format =~ /\$\{/
- json = row[:json]
  format.gsub!(/\$\{\s*(\w+)\s*\}/) do |m|
  row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
  end
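
As a reminder of what the `${...}` expansion in `generate_table_id` does: each placeholder is replaced by the corresponding row value with non-word characters stripped, which keeps the result usable as part of a BigQuery table ID. A standalone sketch with an illustrative field name:

```ruby
# Standalone illustration of the placeholder expansion shown above
# (the `vhost` field and its value are made up for the example).
row = { json: { vhost: "www.example.com" } }
format = "accesslog_${ vhost }"
table_id = format.gsub(/\$\{\s*(\w+)\s*\}/) do
  row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
end
table_id # => "accesslog_wwwexamplecom"
```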
@@ -313,28 +306,6 @@ module Fluent
  end
  end
 
- def create_table(table_id)
- client.insert_table(@project, @dataset, {
- table_reference: {
- table_id: table_id,
- },
- schema: {
- fields: @fields.to_a,
- }
- }, {})
- rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- # api_error? -> client cache clear
- @cached_client = nil
-
- message = e.message
- if e.status_code == 409 && /Already Exists:/ =~ message
- # ignore 'Already Exists' error
- return
- end
- log.error "tables.insert API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => e.status_code, :message => message
- raise "failed to create table in bigquery" # TODO: error class
- end
-
  def replace_record_key(record)
  new_record = {}
  record.each do |key, _|
@@ -363,28 +334,42 @@ module Fluent
  @tables_queue.push t
  t
  end
- _write(chunk, table_id_format)
+ template_suffix_format = @template_suffix
+ _write(chunk, table_id_format, template_suffix_format)
  end
 
- def fetch_schema
- table_id_format = @tablelist[0]
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
- res = client.get_table(@project, @dataset, table_id)
-
- schema = res.schema.fields.as_json
- log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
- @fields.load_schema(schema, false)
- rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- # api_error? -> client cache clear
- @cached_client = nil
- message = e.message
- log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
- raise "failed to fetch schema from bigquery" # TODO: error class
+ def fetch_schema(allow_overwrite = true)
+ table_id = nil
+ @fetch_schema_mutex.synchronize do
+ if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
+ table_id_format = @fetch_schema_table || @tablelist[0]
+ table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+ schema = writer.fetch_schema(@project, @dataset, table_id)
+
+ if schema
+ if allow_overwrite
+ fields = Fluent::BigQuery::RecordSchema.new("record")
+ fields.load_schema(schema, allow_overwrite)
+ @fields = fields
+ else
+ @fields.load_schema(schema, allow_overwrite)
+ end
+ else
+ if @fields.empty?
+ raise "failed to fetch schema from bigquery"
+ else
+ log.warn "#{table_id} uses previous schema"
+ end
+ end
+
+ @last_fetch_schema_time = Fluent::Engine.now
+ end
+ end
  end
 
  module InsertImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @template_suffix
 
  if @replace_record_key
  record = replace_record_key(record)
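
`fetch_schema` is now serialized behind `@fetch_schema_mutex` and refreshed at most once per `schema_cache_expire` window; when a fetch fails it falls back to the previously loaded schema unless none exists yet. The cache guard reduces to the pattern below (a simplified sketch, not the plugin's code; `refresh_from_bigquery` is a made-up stand-in for the actual `writer.fetch_schema` call):

```ruby
# Simplified sketch of the time-based, mutex-guarded refresh used by fetch_schema.
mutex      = Mutex.new
last_fetch = 0
expire     = 600 # seconds, mirroring the schema_cache_expire default

fetch = lambda do |refresh_from_bigquery|
  mutex.synchronize do
    now = Time.now.to_i
    if now - last_fetch > expire
      refresh_from_bigquery.call   # would hit the tables.get API in the plugin
      last_fetch = now
    end
  end
end

fetch.call(-> { puts "refreshing schema" })  # refreshes on the first call
fetch.call(-> { puts "refreshing schema" })  # skipped until the window expires
```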
@@ -394,6 +379,7 @@ module Fluent
  record = convert_hash_to_json(record)
  end
 
+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  row = {"json" => row}
@@ -403,44 +389,51 @@ module Fluent
  buf
  end
 
- def _write(chunk, table_format)
+ def _write(chunk, table_format, template_suffix_format)
  rows = []
  chunk.msgpack_each do |row_object|
  # TODO: row size limit
  rows << row_object.deep_symbolize_keys
  end
 
- rows.group_by {|row| generate_table_id(table_format, Time.at(Fluent::Engine.now), row, chunk) }.each do |table_id, group|
- insert(table_id, group)
+ now = Time.at(Fluent::Engine.now)
+ group = rows.group_by do |row|
+ [
+ generate_table_id(table_format, now, row, chunk),
+ template_suffix_format ? generate_table_id(template_suffix_format, now, row, chunk) : nil,
+ ]
+ end
+ group.each do |(table_id, template_suffix), group_rows|
+ insert(table_id, group_rows, template_suffix)
  end
  end
 
- def insert(table_id, rows)
- client.insert_all_table_data(@project, @dataset, table_id, {
- rows: rows
- }, {})
- rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- # api_error? -> client cache clear
- @cached_client = nil
-
- message = e.message
- if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ message.to_s
+ def insert(table_id, rows, template_suffix)
+ writer.insert_rows(@project, @dataset, table_id, rows, skip_invalid_rows: @skip_invalid_rows, ignore_unknown_values: @ignore_unknown_values, template_suffix: template_suffix)
+ rescue Fluent::BigQuery::Writer::Error => e
+ if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
  # Table Not Found: Auto Create Table
- create_table(table_id)
+ writer.create_table(@project, @dataset, table_id, @fields)
  raise "table created. send rows next time."
  end
- log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
- raise "failed to insert into bigquery" # TODO: error class
+
+ if e.retryable?
+ raise e # TODO: error class
+ elsif @secondary
+ flush_secondary(@secondary)
+ end
  end
  end
 
  module LoadImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @fetch_schema_table
 
  if @replace_record_key
  record = replace_record_key(record)
  end
+
+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  buf << MultiJson.dump(row) + "\n"
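
The insert path now groups buffered rows by the pair `[table_id, template_suffix]`, so one `insert_rows` call is made per destination rather than one per table name only. The composite `group_by` key behaves like this standalone sketch (field names and suffix format are illustrative):

```ruby
# Standalone sketch of grouping rows by [table_id, template_suffix]:
# rows sharing both values are flushed together in a single API call.
rows = [
  { json: { user: "a", date: "20160101" } },
  { json: { user: "b", date: "20160102" } },
  { json: { user: "c", date: "20160101" } },
]
groups = rows.group_by { |row| ["access_log", "_#{row[:json][:date]}"] }
groups.each do |(table_id, suffix), group_rows|
  puts "#{table_id}#{suffix}: #{group_rows.size} rows"
end
# access_log_20160101: 2 rows
# access_log_20160102: 1 rows
```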
@@ -448,53 +441,37 @@ module Fluent
  buf
  end
 
- def _write(chunk, table_id_format)
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), nil, chunk)
+ def _write(chunk, table_id_format, _)
+ now = Time.at(Fluent::Engine.now)
+ table_id = generate_table_id(table_id_format, now, nil, chunk)
  load(chunk, table_id)
  end
 
  def load(chunk, table_id)
  res = nil
- create_upload_source(chunk) do |upload_source|
- res = client.insert_job(@project, {
- configuration: {
- load: {
- destination_table: {
- project_id: @project,
- dataset_id: @dataset,
- table_id: table_id,
- },
- schema: {
- fields: @fields.to_a,
- },
- write_disposition: "WRITE_APPEND",
- source_format: "NEWLINE_DELIMITED_JSON"
- }
- }
- }, {upload_source: upload_source, content_type: "application/octet-stream"})
- end
- wait_load(res, table_id)
- end
 
- private
-
- def wait_load(res, table_id)
- wait_interval = 10
- _response = res
- until _response.status.state == "DONE"
- log.debug "wait for load job finish", state: _response.status.state
- sleep wait_interval
- _response = client.get_job(@project, _response.job_reference.job_id)
+ if @prevent_duplicate_load
+ job_id = create_job_id(chunk, @dataset, table_id, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+ else
+ job_id = nil
  end
 
- if _response.status.error_result
- log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, message: _response.status.error_result.message
- raise "failed to load into bigquery"
+ create_upload_source(chunk) do |upload_source|
+ res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
+ ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
+ timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+ })
+ end
+ rescue Fluent::BigQuery::Writer::Error => e
+ if e.retryable?
+ raise e
+ elsif @secondary
+ flush_secondary(@secondary)
  end
-
- log.debug "finish load job", state: _response.status.state
  end
 
+ private
+
  def create_upload_source(chunk)
  chunk_is_file = @buffer_type == 'file'
  if chunk_is_file
@@ -511,200 +488,9 @@ module Fluent
  end
  end
  end
- end
-
- class FieldSchema
- def initialize(name, mode = :nullable)
- unless [:nullable, :required, :repeated].include?(mode)
- raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
- end
- ### https://developers.google.com/bigquery/docs/tables
- # Each field has the following properties:
- #
- # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
- # and must start with a letter or underscore. The maximum length is 128 characters.
- # https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
- unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
- raise Fluent::ConfigError, "invalid bigquery field name: '#{name}'"
- end
-
- @name = name
- @mode = mode
- end
-
- attr_reader :name, :mode
-
- def format(value)
- case @mode
- when :nullable
- format_one(value) unless value.nil?
- when :required
- raise "Required field #{name} cannot be null" if value.nil?
- format_one(value)
- when :repeated
- value.nil? ? [] : value.map {|v| format_one(v) }
- end
- end
-
- def format_one(value)
- raise NotImplementedError, "Must implement in a subclass"
- end
-
- def to_h
- {
- :name => name,
- :type => type.to_s.upcase,
- :mode => mode.to_s.upcase,
- }
- end
- end
-
- class StringFieldSchema < FieldSchema
- def type
- :string
- end
-
- def format_one(value)
- value.to_s
- end
- end
-
- class IntegerFieldSchema < FieldSchema
- def type
- :integer
- end
-
- def format_one(value)
- value.to_i
- end
- end
-
- class FloatFieldSchema < FieldSchema
- def type
- :float
- end
-
- def format_one(value)
- value.to_f
- end
- end
-
- class BooleanFieldSchema < FieldSchema
- def type
- :boolean
- end
-
- def format_one(value)
- !!value
- end
- end
-
- class TimestampFieldSchema < FieldSchema
- def type
- :timestamp
- end
-
- def format_one(value)
- value
- end
- end
-
- class RecordSchema < FieldSchema
- FIELD_TYPES = {
- string: StringFieldSchema,
- integer: IntegerFieldSchema,
- float: FloatFieldSchema,
- boolean: BooleanFieldSchema,
- timestamp: TimestampFieldSchema,
- record: RecordSchema
- }.freeze
-
- def initialize(name, mode = :nullable)
- super(name, mode)
- @fields = {}
- end
-
- def type
- :record
- end
-
- def [](name)
- @fields[name]
- end
-
- def to_a
- @fields.map do |_, field_schema|
- field_schema.to_h
- end
- end
-
- def to_h
- {
- :name => name,
- :type => type.to_s.upcase,
- :mode => mode.to_s.upcase,
- :fields => self.to_a,
- }
- end
-
- def load_schema(schema, allow_overwrite=true)
- schema.each do |field|
- raise ConfigError, 'field must have type' unless field.key?('type')
-
- name = field['name']
- mode = (field['mode'] || 'nullable').downcase.to_sym
-
- type = field['type'].downcase.to_sym
- field_schema_class = FIELD_TYPES[type]
- raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
-
- next if @fields.key?(name) and !allow_overwrite
-
- field_schema = field_schema_class.new(name, mode)
- @fields[name] = field_schema
- if type == :record
- raise ConfigError, "record field must have fields" unless field.key?('fields')
- field_schema.load_schema(field['fields'], allow_overwrite)
- end
- end
- end
-
- def register_field(name, type)
- if @fields.key?(name) and @fields[name].type != :timestamp
- raise ConfigError, "field #{name} is registered twice"
- end
- if name[/\./]
- recordname = $`
- fieldname = $'
- register_record_field(recordname)
- @fields[recordname].register_field(fieldname, type)
- else
- schema = FIELD_TYPES[type]
- raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
- @fields[name] = schema.new(name)
- end
- end
 
- def format_one(record)
- out = {}
- @fields.each do |key, schema|
- value = record[key]
- formatted = schema.format(value)
- next if formatted.nil? # field does not exists, or null value
- out[key] = formatted
- end
- out
- end
-
- private
- def register_record_field(name)
- if !@fields.key?(name)
- @fields[name] = RecordSchema.new(name)
- else
- unless @fields[name].kind_of?(RecordSchema)
- raise ConfigError, "field #{name} is required to be a record but already registered as #{@field[name]}"
- end
- end
+ def create_job_id(chunk, dataset, table, schema, max_bad_records, ignore_unknown_values)
+ "fluentd_job_" + Digest::SHA1.hexdigest("#{chunk.unique_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
  end
  end
  end
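
The new `prevent_duplicate_load` option relies on `create_job_id` above: because the job ID is a SHA1 over the chunk's unique ID plus everything that identifies the destination and load options, retrying the same buffer chunk produces the same ID, and BigQuery rejects the second job as a duplicate instead of loading the data twice. A standalone sketch with made-up values:

```ruby
require 'digest'

# Standalone sketch of the deterministic load-job ID: the same chunk and the
# same destination/options always hash to the same ID, so a retried load
# cannot insert the chunk a second time.
chunk_unique_id       = "abc123"  # illustrative; Fluentd supplies chunk.unique_id
dataset               = "my_dataset"
table                 = "access_log_20160101"
schema                = [{ name: "user", type: "STRING", mode: "NULLABLE" }]
max_bad_records       = 0
ignore_unknown_values = false

job_id = "fluentd_job_" + Digest::SHA1.hexdigest(
  "#{chunk_unique_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}"
)
job_id # => "fluentd_job_<40 hex chars>"
```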