fluent-plugin-bigquery 0.2.16 → 0.3.0

@@ -5,6 +5,9 @@ require 'fluent/plugin/bigquery/version'
  require 'fluent/mixin/config_placeholders'
  require 'fluent/mixin/plaintextformatter'

+ require 'fluent/plugin/bigquery/schema'
+ require 'fluent/plugin/bigquery/writer'
+
  ## TODO: load implementation
  # require 'fluent/plugin/bigquery/load_request_body_wrapper'

@@ -19,36 +22,39 @@ module Fluent
  # https://developers.google.com/bigquery/browser-tool-quickstart
  # https://developers.google.com/bigquery/bigquery-api-quickstart

- config_set_default :buffer_type, 'lightening'
-
- config_set_default :flush_interval, 0.25
- config_set_default :try_flush_interval, 0.05
+ ### default for insert
+ def configure_for_insert(conf)
+ raise ConfigError unless conf["method"] != "load"

- config_set_default :buffer_chunk_records_limit, 500
- config_set_default :buffer_chunk_limit, 1000000
- config_set_default :buffer_queue_limit, 1024
+ conf["buffer_type"] = "lightening" unless conf["buffer_type"]
+ conf["flush_interval"] = 0.25 unless conf["flush_interval"]
+ conf["try_flush_interval"] = 0.05 unless conf["try_flush_interval"]
+ conf["buffer_chunk_limit"] = 1 * 1024 ** 2 unless conf["buffer_chunk_limit"] # 1MB
+ conf["buffer_queue_limit"] = 1024 unless conf["buffer_queue_limit"]
+ conf["buffer_chunk_records_limit"] = 500 unless conf["buffer_chunk_records_limit"]
+ end

- ### for loads
- ### TODO: different default values for buffering between 'load' and insert
- # config_set_default :flush_interval, 1800 # 30min => 48 imports/day
- # config_set_default :buffer_chunk_limit, 1000**4 # 1.0*10^12 < 1TB (1024^4)
+ ### default for loads
+ def configure_for_load(conf)
+ raise ConfigError unless conf["method"] == "load"

- ### OAuth credential
- # config_param :client_id, :string
- # config_param :client_secret, :string
+ # buffer_type, flush_interval, try_flush_interval is TimeSlicedOutput default
+ conf["buffer_chunk_limit"] = 1 * 1024 ** 3 unless conf["buffer_chunk_limit"] # 1GB
+ conf["buffer_queue_limit"] = 32 unless conf["buffer_queue_limit"]
+ end

  # Available methods are:
  # * private_key -- Use service account credential from pkcs12 private key file
  # * compute_engine -- Use access token available in instances of ComputeEngine
- # * private_json_key -- Use service account credential from JSON key
+ # * json_key -- Use service account credential from JSON key
  # * application_default -- Use application default credential
- config_param :auth_method, :string, default: 'private_key'
+ config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key

  ### Service Account credential
  config_param :email, :string, default: nil
  config_param :private_key_path, :string, default: nil
  config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
- config_param :json_key, default: nil
+ config_param :json_key, default: nil, secret: true

  # see as simple reference
  # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
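
Note: the new configure_for_insert / configure_for_load helpers fill buffer defaults into the raw config hash only when the user has not set a key, so explicit settings always win. A minimal Ruby sketch of that "unless" pattern, with hypothetical values not taken from the diff:

    # The user already set flush_interval, so only the missing keys get defaults.
    conf = { "method" => "insert", "flush_interval" => "1" }
    conf["flush_interval"] = 0.25 unless conf["flush_interval"]                    # stays "1"
    conf["buffer_chunk_limit"] = 1 * 1024 ** 2 unless conf["buffer_chunk_limit"]   # becomes 1048576 (1MB)
    conf["buffer_queue_limit"] = 1024 unless conf["buffer_queue_limit"]            # becomes 1024
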
@@ -62,12 +68,32 @@ module Fluent
  # table_id
  # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
  config_param :table, :string, default: nil
- config_param :tables, :string, default: nil
+ config_param :tables, :string, default: nil # TODO: use :array with value_type: :string
+
+ # template_suffix (only insert)
+ # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+ config_param :template_suffix, :string, default: nil

  config_param :auto_create_table, :bool, default: false

+ # skip_invalid_rows (only insert)
+ # Insert all valid rows of a request, even if invalid rows exist.
+ # The default value is false, which causes the entire request to fail if any invalid rows exist.
+ config_param :skip_invalid_rows, :bool, default: false
+ # max_bad_records (only load)
+ # The maximum number of bad records that BigQuery can ignore when running the job.
+ # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+ # The default value is 0, which requires that all records are valid.
+ config_param :max_bad_records, :integer, default: 0
+ # ignore_unknown_values
+ # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+ # Default is false, which treats unknown values as errors.
+ config_param :ignore_unknown_values, :bool, default: false
+
  config_param :schema_path, :string, default: nil
  config_param :fetch_schema, :bool, default: false
+ config_param :fetch_schema_table, :string, default: nil
+ config_param :schema_cache_expire, :time, default: 600
  config_param :field_string, :string, default: nil
  config_param :field_integer, :string, default: nil
  config_param :field_float, :string, default: nil
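
Note: skip_invalid_rows and ignore_unknown_values address different failure modes of streaming inserts, as the comments above describe. A hypothetical row (invented for illustration, not from the diff) makes the distinction concrete:

    # Suppose the table schema only defines "vhost" and "status".
    row = { "json" => { "vhost" => "www", "status" => 200, "extra" => 1 } }
    # ignore_unknown_values true  -> the row is inserted and "extra" is silently dropped
    # ignore_unknown_values false -> the row is reported as invalid (unknown field)
    # skip_invalid_rows true      -> other rows in the same insertAll request are still inserted
    # skip_invalid_rows false     -> the whole request fails if any row is invalid
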
@@ -90,20 +116,15 @@ module Fluent
  config_param :utc, :bool, default: nil
  config_param :time_field, :string, default: nil

+ # insert_id_field (only insert)
  config_param :insert_id_field, :string, default: nil
+ # prevent_duplicate_load (only load)
+ config_param :prevent_duplicate_load, :bool, default: false

- config_param :method, :string, default: 'insert' # or 'load'
+ config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true

- config_param :load_size_limit, :integer, default: 1000**4 # < 1TB (1024^4) # TODO: not implemented now
- ### method: 'load'
- # https://developers.google.com/bigquery/loading-data-into-bigquery
- # Maximum File Sizes:
- # File Type   Compressed   Uncompressed
- # CSV         1 GB         With new-lines in strings: 4 GB
- #                          Without new-lines in strings: 1 TB
- # JSON        1 GB         1 TB
-
- config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
+ # TODO
+ # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
  # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
  # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
  ### method: ''Streaming data inserts support
@@ -114,6 +135,14 @@ module Fluent
  # If you exceed 100 rows per second for an extended period of time, throttling might occur.
  ### Toooooooooooooo short/small per inserts and row!

+ ## Timeout
+ # request_timeout_sec
+ #   Bigquery API response timeout
+ # request_open_timeout_sec
+ #   Bigquery API connection, and request timeout
+ config_param :request_timeout_sec, :time, default: nil
+ config_param :request_open_timeout_sec, :time, default: 60
+
  ### Table types
  # https://developers.google.com/bigquery/docs/tables
  #
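
Note: both timeout options are :time parameters, i.e. interpreted as seconds; later in this diff they are handed to writer.create_load_job as timeout_sec / open_timeout_sec. A hypothetical setting (values invented for illustration) and its intent:

    # Fail a hung BigQuery API response after 2 minutes; keep the default
    # 60 second limit for opening the connection and sending the request.
    request_timeout_sec      = 120  # default is nil: no explicit response timeout
    request_open_timeout_sec = 60   # default
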
@@ -142,34 +171,36 @@ module Fluent
  Faraday.default_connection.options.timeout = 60
  end

- # Define `log` method for v0.10.42 or earlier
- unless method_defined?(:log)
- define_method("log") { $log }
- end
-
  def configure(conf)
+ if conf["method"] == "load"
+ configure_for_load(conf)
+ else
+ configure_for_insert(conf)
+ end
  super

- if @method == "insert"
+ case @method
+ when :insert
  extend(InsertImplementation)
- elsif @method == "load"
+ when :load
+ raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
  extend(LoadImplementation)
  else
- raise Fluend::ConfigError "'method' must be 'insert' or 'load'"
+ raise Fluent::ConfigError "'method' must be 'insert' or 'load'"
  end

  case @auth_method
- when 'private_key'
+ when :private_key
  unless @email && @private_key_path
  raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
  end
- when 'compute_engine'
+ when :compute_engine
  # Do nothing
- when 'json_key'
+ when :json_key
  unless @json_key
  raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
  end
- when 'application_default'
+ when :application_default
  # Do nothing
  else
  raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
@@ -181,7 +212,7 @@ module Fluent

  @tablelist = @tables ? @tables.split(',') : [@table]

- @fields = RecordSchema.new('record')
+ @fields = Fluent::BigQuery::RecordSchema.new('record')
  if @schema_path
  @fields.load_schema(MultiJson.load(File.read(@schema_path)))
  end
@@ -232,57 +263,20 @@ module Fluent
  def start
  super

- @cached_client = nil
- @cached_client_expiration = nil
-
  @tables_queue = @tablelist.dup.shuffle
  @tables_mutex = Mutex.new
+ @fetch_schema_mutex = Mutex.new

- fetch_schema() if @fetch_schema
+ @last_fetch_schema_time = 0
+ fetch_schema(false) if @fetch_schema
  end

- def client
- return @cached_client if @cached_client && @cached_client_expiration > Time.now
-
- client = Google::Apis::BigqueryV2::BigqueryService.new
-
- scope = "https://www.googleapis.com/auth/bigquery"
-
- case @auth_method
- when 'private_key'
- require 'google/api_client/auth/key_utils'
- key = Google::APIClient::KeyUtils.load_from_pkcs12(@private_key_path, @private_key_passphrase)
- auth = Signet::OAuth2::Client.new(
- token_credential_uri: "https://accounts.google.com/o/oauth2/token",
- audience: "https://accounts.google.com/o/oauth2/token",
- scope: scope,
- issuer: @email,
- signing_key: key)
-
- when 'compute_engine'
- auth = Google::Auth::GCECredentials.new
-
- when 'json_key'
- if File.exist?(@json_key)
- auth = File.open(@json_key) do |f|
- Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
- end
- else
- key = StringIO.new(@json_key)
- auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
- end
-
- when 'application_default'
- auth = Google::Auth.get_application_default([scope])
-
- else
- raise ConfigError, "Unknown auth method: #{@auth_method}"
- end
-
- client.authorization = auth
-
- @cached_client_expiration = Time.now + 1800
- @cached_client = client
+ def writer
+ @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+ private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+ email: @email,
+ json_key: @json_key,
+ })
  end

  def generate_table_id(table_id_format, current_time, row = nil, chunk = nil)
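
Note: the hand-rolled, time-expiring client cache is gone; authentication and API calls now live in Fluent::BigQuery::Writer (added in fluent/plugin/bigquery/writer.rb, which is not shown in this diff), and the plugin memoizes a single instance with @writer ||= . The surface the plugin relies on can be read off the call sites in this diff; the following is only an inferred sketch of that interface, with signatures guessed from usage rather than copied from the new file:

    # Inferred interface only -- the real class lives in fluent/plugin/bigquery/writer.rb
    # and may differ in detail.
    module Fluent
      module BigQuery
        class Writer
          class Error < StandardError
            def retryable?; end    # plugin retries when true, otherwise flushes to <secondary>
            def status_code; end   # used for the 404 "Not Found: Table" auto-create path
          end

          def initialize(log, auth_method, options = {}); end                        # email, json_key, private_key_path, ...
          def fetch_schema(project, dataset, table_id); end                          # -> array of field hashes, or nil on failure
          def create_table(project, dataset, table_id, fields); end
          def insert_rows(project, dataset, table_id, rows, options = {}); end       # skip_invalid_rows:, ignore_unknown_values:, template_suffix:
          def create_load_job(project, dataset, table_id, upload_source, job_id, fields, options = {}); end
        end
      end
    end
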
@@ -295,7 +289,6 @@ module Fluent
  current_time
  end
  if row && format =~ /\$\{/
- json = row[:json]
  format.gsub!(/\$\{\s*(\w+)\s*\}/) do |m|
  row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
  end
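
Note: the unused json local is dropped; ${field} placeholders are still read straight from row[:json] and stripped of non-word characters. A hypothetical expansion (table name and record invented for illustration):

    row_json = { vhost: "www.example.com" }
    "access_${vhost}".gsub(/\$\{\s*(\w+)\s*\}/) do |m|
      row_json[$1.to_sym].to_s.gsub(/[^\w]/, '')
    end
    # => "access_wwwexamplecom"  (the "." characters are removed as non-word characters)
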
@@ -313,28 +306,6 @@ module Fluent
  end
  end

- def create_table(table_id)
- client.insert_table(@project, @dataset, {
- table_reference: {
- table_id: table_id,
- },
- schema: {
- fields: @fields.to_a,
- }
- }, {})
- rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- # api_error? -> client cache clear
- @cached_client = nil
-
- message = e.message
- if e.status_code == 409 && /Already Exists:/ =~ message
- # ignore 'Already Exists' error
- return
- end
- log.error "tables.insert API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => e.status_code, :message => message
- raise "failed to create table in bigquery" # TODO: error class
- end
-
  def replace_record_key(record)
  new_record = {}
  record.each do |key, _|
@@ -363,28 +334,42 @@ module Fluent
  @tables_queue.push t
  t
  end
- _write(chunk, table_id_format)
+ template_suffix_format = @template_suffix
+ _write(chunk, table_id_format, template_suffix_format)
  end

- def fetch_schema
- table_id_format = @tablelist[0]
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
- res = client.get_table(@project, @dataset, table_id)
-
- schema = res.schema.fields.as_json
- log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
- @fields.load_schema(schema, false)
- rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- # api_error? -> client cache clear
- @cached_client = nil
- message = e.message
- log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
- raise "failed to fetch schema from bigquery" # TODO: error class
+ def fetch_schema(allow_overwrite = true)
+ table_id = nil
+ @fetch_schema_mutex.synchronize do
+ if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
+ table_id_format = @fetch_schema_table || @tablelist[0]
+ table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+ schema = writer.fetch_schema(@project, @dataset, table_id)
+
+ if schema
+ if allow_overwrite
+ fields = Fluent::BigQuery::RecordSchema.new("record")
+ fields.load_schema(schema, allow_overwrite)
+ @fields = fields
+ else
+ @fields.load_schema(schema, allow_overwrite)
+ end
+ else
+ if @fields.empty?
+ raise "failed to fetch schema from bigquery"
+ else
+ log.warn "#{table_id} uses previous schema"
+ end
+ end
+
+ @last_fetch_schema_time = Fluent::Engine.now
+ end
+ end
  end

  module InsertImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @template_suffix

  if @replace_record_key
  record = replace_record_key(record)
@@ -394,6 +379,7 @@ module Fluent
  record = convert_hash_to_json(record)
  end

+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  row = {"json" => row}
@@ -403,44 +389,51 @@ module Fluent
  buf
  end

- def _write(chunk, table_format)
+ def _write(chunk, table_format, template_suffix_format)
  rows = []
  chunk.msgpack_each do |row_object|
  # TODO: row size limit
  rows << row_object.deep_symbolize_keys
  end

- rows.group_by {|row| generate_table_id(table_format, Time.at(Fluent::Engine.now), row, chunk) }.each do |table_id, group|
- insert(table_id, group)
+ now = Time.at(Fluent::Engine.now)
+ group = rows.group_by do |row|
+ [
+ generate_table_id(table_format, now, row, chunk),
+ template_suffix_format ? generate_table_id(template_suffix_format, now, row, chunk) : nil,
+ ]
+ end
+ group.each do |(table_id, template_suffix), group_rows|
+ insert(table_id, group_rows, template_suffix)
  end
  end

- def insert(table_id, rows)
- client.insert_all_table_data(@project, @dataset, table_id, {
- rows: rows
- }, {})
- rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
- # api_error? -> client cache clear
- @cached_client = nil
-
- message = e.message
- if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ message.to_s
+ def insert(table_id, rows, template_suffix)
+ writer.insert_rows(@project, @dataset, table_id, rows, skip_invalid_rows: @skip_invalid_rows, ignore_unknown_values: @ignore_unknown_values, template_suffix: template_suffix)
+ rescue Fluent::BigQuery::Writer::Error => e
+ if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
  # Table Not Found: Auto Create Table
- create_table(table_id)
+ writer.create_table(@project, @dataset, table_id, @fields)
  raise "table created. send rows next time."
  end
- log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
- raise "failed to insert into bigquery" # TODO: error class
+
+ if e.retryable?
+ raise e # TODO: error class
+ elsif @secondary
+ flush_secondary(@secondary)
+ end
  end
  end

  module LoadImplementation
  def format(tag, time, record)
- buf = ''
+ fetch_schema if @fetch_schema_table

  if @replace_record_key
  record = replace_record_key(record)
  end
+
+ buf = String.new
  row = @fields.format(@add_time_field.call(record, time))
  unless row.empty?
  buf << MultiJson.dump(row) + "\n"
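
Note: _write now groups rows by the pair [table_id, template_suffix] instead of table_id alone, so each insert_rows call targets exactly one table and one template suffix. A small illustration of the grouping key (data invented for illustration):

    rows = [
      { json: { vhost: "a" } },
      { json: { vhost: "b" } },
      { json: { vhost: "a" } },
    ]
    rows.group_by { |row| ["access_log", "_#{row[:json][:vhost]}"] }
    # => the first and third rows share the key ["access_log", "_a"],
    #    the second row gets ["access_log", "_b"], so two insert calls are made
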
@@ -448,53 +441,37 @@ module Fluent
  buf
  end

- def _write(chunk, table_id_format)
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), nil, chunk)
+ def _write(chunk, table_id_format, _)
+ now = Time.at(Fluent::Engine.now)
+ table_id = generate_table_id(table_id_format, now, nil, chunk)
  load(chunk, table_id)
  end

  def load(chunk, table_id)
  res = nil
- create_upload_source(chunk) do |upload_source|
- res = client.insert_job(@project, {
- configuration: {
- load: {
- destination_table: {
- project_id: @project,
- dataset_id: @dataset,
- table_id: table_id,
- },
- schema: {
- fields: @fields.to_a,
- },
- write_disposition: "WRITE_APPEND",
- source_format: "NEWLINE_DELIMITED_JSON"
- }
- }
- }, {upload_source: upload_source, content_type: "application/octet-stream"})
- end
- wait_load(res, table_id)
- end

- private
-
- def wait_load(res, table_id)
- wait_interval = 10
- _response = res
- until _response.status.state == "DONE"
- log.debug "wait for load job finish", state: _response.status.state
- sleep wait_interval
- _response = client.get_job(@project, _response.job_reference.job_id)
+ if @prevent_duplicate_load
+ job_id = create_job_id(chunk, @dataset, table_id, @fields.to_a, @max_bad_records, @ignore_unknown_values)
+ else
+ job_id = nil
  end

- if _response.status.error_result
- log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, message: _response.status.error_result.message
- raise "failed to load into bigquery"
+ create_upload_source(chunk) do |upload_source|
+ res = writer.create_load_job(@project, @dataset, table_id, upload_source, job_id, @fields, {
+ ignore_unknown_values: @ignore_unknown_values, max_bad_records: @max_bad_records,
+ timeout_sec: @request_timeout_sec, open_timeout_sec: @request_open_timeout_sec,
+ })
+ end
+ rescue Fluent::BigQuery::Writer::Error => e
+ if e.retryable?
+ raise e
+ elsif @secondary
+ flush_secondary(@secondary)
  end
-
- log.debug "finish load job", state: _response.status.state
  end

+ private
+
  def create_upload_source(chunk)
  chunk_is_file = @buffer_type == 'file'
  if chunk_is_file
@@ -511,200 +488,9 @@ module Fluent
  end
  end
  end
- end
-
- class FieldSchema
- def initialize(name, mode = :nullable)
- unless [:nullable, :required, :repeated].include?(mode)
- raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
- end
- ### https://developers.google.com/bigquery/docs/tables
- # Each field has the following properties:
- #
- # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
- # and must start with a letter or underscore. The maximum length is 128 characters.
- # https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
- unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
- raise Fluent::ConfigError, "invalid bigquery field name: '#{name}'"
- end
-
- @name = name
- @mode = mode
- end
-
- attr_reader :name, :mode
-
- def format(value)
- case @mode
- when :nullable
- format_one(value) unless value.nil?
- when :required
- raise "Required field #{name} cannot be null" if value.nil?
- format_one(value)
- when :repeated
- value.nil? ? [] : value.map {|v| format_one(v) }
- end
- end
-
- def format_one(value)
- raise NotImplementedError, "Must implement in a subclass"
- end
-
- def to_h
- {
- :name => name,
- :type => type.to_s.upcase,
- :mode => mode.to_s.upcase,
- }
- end
- end
-
- class StringFieldSchema < FieldSchema
- def type
- :string
- end
-
- def format_one(value)
- value.to_s
- end
- end
-
- class IntegerFieldSchema < FieldSchema
- def type
- :integer
- end
-
- def format_one(value)
- value.to_i
- end
- end
-
- class FloatFieldSchema < FieldSchema
- def type
- :float
- end
-
- def format_one(value)
- value.to_f
- end
- end
-
- class BooleanFieldSchema < FieldSchema
- def type
- :boolean
- end
-
- def format_one(value)
- !!value
- end
- end
-
- class TimestampFieldSchema < FieldSchema
- def type
- :timestamp
- end
-
- def format_one(value)
- value
- end
- end
-
- class RecordSchema < FieldSchema
- FIELD_TYPES = {
- string: StringFieldSchema,
- integer: IntegerFieldSchema,
- float: FloatFieldSchema,
- boolean: BooleanFieldSchema,
- timestamp: TimestampFieldSchema,
- record: RecordSchema
- }.freeze
-
- def initialize(name, mode = :nullable)
- super(name, mode)
- @fields = {}
- end
-
- def type
- :record
- end
-
- def [](name)
- @fields[name]
- end
-
- def to_a
- @fields.map do |_, field_schema|
- field_schema.to_h
- end
- end
-
- def to_h
- {
- :name => name,
- :type => type.to_s.upcase,
- :mode => mode.to_s.upcase,
- :fields => self.to_a,
- }
- end
-
- def load_schema(schema, allow_overwrite=true)
- schema.each do |field|
- raise ConfigError, 'field must have type' unless field.key?('type')
-
- name = field['name']
- mode = (field['mode'] || 'nullable').downcase.to_sym
-
- type = field['type'].downcase.to_sym
- field_schema_class = FIELD_TYPES[type]
- raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
-
- next if @fields.key?(name) and !allow_overwrite
-
- field_schema = field_schema_class.new(name, mode)
- @fields[name] = field_schema
- if type == :record
- raise ConfigError, "record field must have fields" unless field.key?('fields')
- field_schema.load_schema(field['fields'], allow_overwrite)
- end
- end
- end
-
- def register_field(name, type)
- if @fields.key?(name) and @fields[name].type != :timestamp
- raise ConfigError, "field #{name} is registered twice"
- end
- if name[/\./]
- recordname = $`
- fieldname = $'
- register_record_field(recordname)
- @fields[recordname].register_field(fieldname, type)
- else
- schema = FIELD_TYPES[type]
- raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
- @fields[name] = schema.new(name)
- end
- end

- def format_one(record)
- out = {}
- @fields.each do |key, schema|
- value = record[key]
- formatted = schema.format(value)
- next if formatted.nil? # field does not exists, or null value
- out[key] = formatted
- end
- out
- end
-
- private
- def register_record_field(name)
- if !@fields.key?(name)
- @fields[name] = RecordSchema.new(name)
- else
- unless @fields[name].kind_of?(RecordSchema)
- raise ConfigError, "field #{name} is required to be a record but already registered as #{@field[name]}"
- end
- end
+ def create_job_id(chunk, dataset, table, schema, max_bad_records, ignore_unknown_values)
+ "fluentd_job_" + Digest::SHA1.hexdigest("#{chunk.unique_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
  end
  end
  end
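
Note: with prevent_duplicate_load enabled, the load job id is derived with SHA-1 from the buffer chunk's unique id plus the load settings, so retrying the same chunk re-submits the same job id and BigQuery can reject it as a duplicate job instead of loading the data twice. A quick check of the determinism; the helper name and inputs below are invented for illustration, and 'digest/sha1' is required explicitly here in case it is not already loaded:

    require 'digest/sha1'

    # Mirrors the shape of create_job_id above (illustration only).
    def job_id_for(unique_id, dataset, table, schema, max_bad_records, ignore_unknown_values)
      "fluentd_job_" + Digest::SHA1.hexdigest("#{unique_id}#{dataset}#{table}#{schema.to_s}#{max_bad_records}#{ignore_unknown_values}")
    end

    a = job_id_for("abc123", "mydataset", "mytable", [], 0, false)
    b = job_id_for("abc123", "mydataset", "mytable", [], 0, false)
    a == b  # => true: the same chunk and settings always produce the same job id
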