fluent-plugin-bigquery-custom 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
+ #!/usr/bin/env rake
+ require "bundler/gem_tasks"
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/test_*.rb'
+   test.verbose = true
+ end
+
+ task :default => :test
@@ -0,0 +1,34 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'fluent/plugin/bigquery/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "fluent-plugin-bigquery-custom"
+   spec.version       = Fluent::BigQueryPlugin::VERSION
+   spec.authors       = ["Tomohiro Hashidate"]
+   spec.email         = ["kakyoin.hierophant@gmail.com"]
+   spec.description   = %q{Fluentd plugin to store data on Google BigQuery, by load, or by stream inserts}
+   spec.summary       = %q{Fluentd plugin to store data on Google BigQuery}
+   spec.homepage      = "https://github.com/joker1007/fluent-plugin-bigquery-custom"
+   spec.license       = "APLv2"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "rr"
+   spec.add_development_dependency "test-unit", "~> 3.0.2"
+   spec.add_development_dependency "test-unit-rr", "~> 1.0.3"
+
+   spec.add_runtime_dependency "google-api-client", "~> 0.9.pre5"
+   spec.add_runtime_dependency "googleauth"
+   spec.add_runtime_dependency "fluentd"
+   spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
+   spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
+   spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"
+
+   spec.add_development_dependency "fluent-plugin-dummydata-producer"
+ end
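The gemspec above is the whole dependency story: google-api-client ~> 0.9.pre5 and googleauth for the BigQuery API, plus the fluentd mixin and buffer plugins used by the output plugin below. As a minimal, illustrative sketch (not shipped in the package), a Bundler-managed fluentd setup could pull the plugin in like this; `fluent-gem install fluent-plugin-bigquery-custom` is the other common route:

  # Gemfile (illustrative only)
  source "https://rubygems.org"

  gem "fluentd"
  gem "fluent-plugin-bigquery-custom", "0.3.0"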
@@ -0,0 +1,6 @@
+ module Fluent
+   module BigQueryPlugin
+     VERSION = "0.3.0"
+   end
+ end
+
@@ -0,0 +1,727 @@
+ # -*- coding: utf-8 -*-
+
+ require 'fluent/plugin/bigquery/version'
+
+ require 'fluent/mixin/config_placeholders'
+ require 'fluent/mixin/plaintextformatter'
+
+ ## TODO: load implementation
+ # require 'fluent/plugin/bigquery/load_request_body_wrapper'
+
+ module Fluent
+   ### TODO: error classes for each api error responses
+   # class BigQueryAPIError < StandardError
+   # end
+
+   class BigQueryOutput < TimeSlicedOutput
+     Fluent::Plugin.register_output('bigquery', self)
+
+     # https://developers.google.com/bigquery/browser-tool-quickstart
+     # https://developers.google.com/bigquery/bigquery-api-quickstart
+
+     config_set_default :buffer_type, 'lightening'
+
+     config_set_default :flush_interval, 0.25
+     config_set_default :try_flush_interval, 0.05
+
+     config_set_default :buffer_chunk_records_limit, 500
+     config_set_default :buffer_chunk_limit, 1000000
+     config_set_default :buffer_queue_limit, 1024
+
+     ### for loads
+     ### TODO: different default values for buffering between 'load' and insert
+     # config_set_default :flush_interval, 1800 # 30min => 48 imports/day
+     # config_set_default :buffer_chunk_limit, 1000**4 # 1.0*10^12 < 1TB (1024^4)
+
+     ### OAuth credential
+     # config_param :client_id, :string
+     # config_param :client_secret, :string
+
+     # Available methods are:
+     # * private_key -- Use service account credential from pkcs12 private key file
+     # * compute_engine -- Use access token available in instances of ComputeEngine
+     # * json_key -- Use service account credential from JSON key
+     # * application_default -- Use application default credential
+     config_param :auth_method, :string, default: 'private_key'
+
+     ### Service Account credential
+     config_param :email, :string, default: nil
+     config_param :private_key_path, :string, default: nil
+     config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+     config_param :json_key, default: nil
+
+     # see as simple reference
+     # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+     config_param :project, :string
+
+     # dataset_name
+     #   The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+     #   but it cannot start with a number or underscore, or have spaces.
+     config_param :dataset, :string
+
+     # table_id
+     #   In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+     config_param :table, :string, default: nil
+     config_param :tables, :string, default: nil
+
+     config_param :auto_create_table, :bool, default: false
+
+     # skip_invalid_rows (only insert)
+     #   Insert all valid rows of a request, even if invalid rows exist.
+     #   The default value is false, which causes the entire request to fail if any invalid rows exist.
+     config_param :skip_invalid_rows, :bool, default: false
+     # max_bad_records (only load)
+     #   The maximum number of bad records that BigQuery can ignore when running the job.
+     #   If the number of bad records exceeds this value, an invalid error is returned in the job result.
+     #   The default value is 0, which requires that all records are valid.
+     config_param :max_bad_records, :integer, default: 0
+     # ignore_unknown_values
+     #   Accept rows that contain values that do not match the schema. The unknown values are ignored.
+     #   Default is false, which treats unknown values as errors.
+     config_param :ignore_unknown_values, :bool, default: false
+
+     config_param :schema_path, :string, default: nil
+     config_param :fetch_schema, :bool, default: false
+     config_param :field_string, :string, default: nil
+     config_param :field_integer, :string, default: nil
+     config_param :field_float, :string, default: nil
+     config_param :field_boolean, :string, default: nil
+     config_param :field_timestamp, :string, default: nil
+     ### TODO: record field stream inserts doesn't work well?
+     ###   At table creation, table type json + field type record -> field type validation fails
+     ###   At streaming inserts, schema cannot be specified
+     # config_param :field_record, :string, default: nil
+     # config_param :optional_data_field, :string, default: nil
+
+     REGEXP_MAX_NUM = 10
+     config_param :replace_record_key, :bool, default: false
+     (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
+
+     config_param :time_format, :string, default: nil
+     config_param :localtime, :bool, default: nil
+     config_param :utc, :bool, default: nil
+     config_param :time_field, :string, default: nil
+
+     config_param :insert_id_field, :string, default: nil
+
+     config_param :method, :string, default: 'insert' # or 'load'
+
+     config_param :load_size_limit, :integer, default: 1000**4 # < 1TB (1024^4) # TODO: not implemented now
+     ### method: 'load'
+     #   https://developers.google.com/bigquery/loading-data-into-bigquery
+     # Maximum File Sizes:
+     # File Type   Compressed   Uncompressed
+     # CSV         1 GB         With new-lines in strings: 4 GB
+     #                          Without new-lines in strings: 1 TB
+     # JSON        1 GB         1 TB
+
+     config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
+     # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
+     # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
+     ### method: 'insert' (streaming data inserts)
+     #   https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
+     # Maximum row size: 100 KB
+     # Maximum data size of all rows, per insert: 1 MB
+     # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
+     #                          If you exceed 100 rows per second for an extended period of time, throttling might occur.
+     ### Toooooooooooooo short/small per inserts and row!
+
+     ### Table types
+     # https://developers.google.com/bigquery/docs/tables
+     #
+     # type - The following data types are supported; see Data Formats for details on each data type:
+     #   STRING
+     #   INTEGER
+     #   FLOAT
+     #   BOOLEAN
+     #   RECORD   A JSON object, used when importing nested records. This type is only available when using JSON source files.
+     #
+     # mode - Whether a field can be null. The following values are supported:
+     #   NULLABLE - The cell can be null.
+     #   REQUIRED - The cell cannot be null.
+     #   REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
+
+     def initialize
+       super
+       require 'json'
+       require 'google/apis/bigquery_v2'
+       require 'googleauth'
+       require 'active_support/core_ext/hash'
+       require 'active_support/core_ext/object/json'
+
+       # MEMO: signet-0.6.1 depends on Faraday.default_connection
+       Faraday.default_connection.options.timeout = 60
+     end
+
+     # Define `log` method for v0.10.42 or earlier
+     unless method_defined?(:log)
+       define_method("log") { $log }
+     end
+
+     def configure(conf)
+       super
+
+       if @method == "insert"
+         extend(InsertImplementation)
+       elsif @method == "load"
+         extend(LoadImplementation)
+       else
+         raise Fluent::ConfigError, "'method' must be 'insert' or 'load'"
+       end
+
+       case @auth_method
+       when 'private_key'
+         unless @email && @private_key_path
+           raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+         end
+       when 'compute_engine'
+         # Do nothing
+       when 'json_key'
+         unless @json_key
+           raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+         end
+       when 'application_default'
+         # Do nothing
+       else
+         raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+       end
+
+       unless @table.nil? ^ @tables.nil?
+         raise Fluent::ConfigError, "exactly one of 'table' or 'tables' must be specified"
+       end
+
+       @tablelist = @tables ? @tables.split(',') : [@table]
+
+       @fields = RecordSchema.new('record')
+       if @schema_path
+         @fields.load_schema(JSON.parse(File.read(@schema_path)))
+       end
+
+       types = %w(string integer float boolean timestamp)
+       types.each do |type|
+         raw_fields = instance_variable_get("@field_#{type}")
+         next unless raw_fields
+         raw_fields.split(',').each do |field|
+           @fields.register_field field.strip, type.to_sym
+         end
+       end
+
+       @regexps = {}
+       (1..REGEXP_MAX_NUM).each do |i|
+         next unless conf["replace_record_key_regexp#{i}"]
+         regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
+         raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
+         raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
+         @regexps[regexp] = replacement
+       end
+
+       @localtime = false if @localtime.nil? && @utc
+
+       @timef = TimeFormatter.new(@time_format, @localtime)
+
+       if @time_field
+         keys = @time_field.split('.')
+         last_key = keys.pop
+         @add_time_field = ->(record, time) {
+           keys.inject(record) { |h, k| h[k] ||= {} }[last_key] = @timef.format(time)
+           record
+         }
+       else
+         @add_time_field = ->(record, time) { record }
+       end
+
+       if @insert_id_field
+         insert_id_keys = @insert_id_field.split('.')
+         @get_insert_id = ->(record) {
+           insert_id_keys.inject(record) {|h, k| h[k] }
+         }
+       else
+         @get_insert_id = nil
+       end
+     end
+
+     def start
+       super
+
+       @cached_client = nil
+       @cached_client_expiration = nil
+
+       @tables_queue = @tablelist.dup.shuffle
+       @tables_mutex = Mutex.new
+
+       fetch_schema() if @fetch_schema
+     end
+
+     def client
+       return @cached_client if @cached_client && @cached_client_expiration > Time.now
+
+       client = Google::Apis::BigqueryV2::BigqueryService.new
+
+       scope = "https://www.googleapis.com/auth/bigquery"
+
+       case @auth_method
+       when 'private_key'
+         require 'google/api_client/auth/key_utils'
+         key = Google::APIClient::KeyUtils.load_from_pkcs12(@private_key_path, @private_key_passphrase)
+         auth = Signet::OAuth2::Client.new(
+                 token_credential_uri: "https://accounts.google.com/o/oauth2/token",
+                 audience: "https://accounts.google.com/o/oauth2/token",
+                 scope: scope,
+                 issuer: @email,
+                 signing_key: key)
+
+       when 'compute_engine'
+         auth = Google::Auth::GCECredentials.new
+
+       when 'json_key'
+         if File.exist?(@json_key)
+           auth = File.open(@json_key) do |f|
+             Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
+           end
+         else
+           key = StringIO.new(@json_key)
+           auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
+         end
+
+       when 'application_default'
+         auth = Google::Auth.get_application_default([scope])
+
+       else
+         raise ConfigError, "Unknown auth method: #{@auth_method}"
+       end
+
+       client.authorization = auth
+
+       @cached_client_expiration = Time.now + 1800
+       @cached_client = client
+     end
+
+     def generate_table_id(table_id_format, current_time, chunk = nil)
+       table_id = current_time.strftime(table_id_format)
+       if chunk
+         table_id.gsub(%r(%{time_slice})) { |expr|
+           chunk.key
+         }
+       else
+         table_id.gsub(%r(%{time_slice})) { |expr|
+           current_time.strftime(@time_slice_format)
+         }
+       end
+     end
+
+     def create_table(table_id)
+       client.insert_table(@project, @dataset, {
+         table_reference: {
+           table_id: table_id,
+         },
+         schema: {
+           fields: @fields.to_a,
+         }
+       }, {})
+     rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+       # api_error? -> client cache clear
+       @cached_client = nil
+
+       message = e.message
+       if e.status_code == 409 && /Already Exists:/ =~ message
+         # ignore 'Already Exists' error
+         return
+       end
+       log.error "tables.insert API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => e.status_code, :message => message
+       raise "failed to create table in bigquery" # TODO: error class
+     end
+
+     def replace_record_key(record)
+       new_record = {}
+       record.each do |key, _|
+         new_key = key
+         @regexps.each do |regexp, replacement|
+           new_key = new_key.gsub(/#{regexp}/, replacement)
+         end
+         new_key = new_key.gsub(/\W/, '')
+         new_record.store(new_key, record[key])
+       end
+       new_record
+     end
+
+     def write(chunk)
+       table_id_format = @tables_mutex.synchronize do
+         t = @tables_queue.shift
+         @tables_queue.push t
+         t
+       end
+       table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), chunk)
+       _write(chunk, table_id)
+     end
+
+     def fetch_schema
+       table_id_format = @tablelist[0]
+       table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
+       res = client.get_table(@project, @dataset, table_id)
+
+       schema = res.schema.fields.as_json
+       log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
+       @fields.load_schema(schema, false)
+     rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+       # api_error? -> client cache clear
+       @cached_client = nil
+       message = e.message
+       log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
+       raise "failed to fetch schema from bigquery" # TODO: error class
+     end
+
+     module InsertImplementation
+       def format(tag, time, record)
+         buf = ''
+
+         if @replace_record_key
+           record = replace_record_key(record)
+         end
+         row = @fields.format(@add_time_field.call(record, time))
+         unless row.empty?
+           row = {"json" => row}
+           row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+           buf << row.to_msgpack
+         end
+         buf
+       end
+
+       def _write(chunk, table_id)
+         rows = []
+         chunk.msgpack_each do |row_object|
+           # TODO: row size limit
+           rows << row_object.deep_symbolize_keys
+         end
+
+         res = client.insert_all_table_data(@project, @dataset, table_id, {
+           rows: rows,
+           skip_invalid_rows: @skip_invalid_rows,
+           ignore_unknown_values: @ignore_unknown_values,
+         }, {})
+
+         if res.insert_errors
+           reasons = []
+           res.insert_errors.each do |i|
+             i.errors.each do |e|
+               reasons << e.reason
+               log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, message: e.message, reason: e.reason
+             end
+           end
+
+           raise "failed to insert into bigquery, retry" if reasons.find { |r| r == "backendError" }
+           return if reasons.all? { |r| r == "invalid" } && @skip_invalid_rows
+           flush_secondary(@secondary) if @secondary
+         end
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         # api_error? -> client cache clear
+         @cached_client = nil
+
+         if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+           # Table Not Found: Auto Create Table
+           create_table(table_id)
+           raise "table created. send rows next time."
+         end
+
+         log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
+         if e.reason == "backendError"
+           raise "failed to insert into bigquery, retry" # TODO: error class
+         elsif @secondary
+           flush_secondary(@secondary)
+         end
+       end
+     end
+
+     module LoadImplementation
+       def format(tag, time, record)
+         buf = ''
+
+         if @replace_record_key
+           record = replace_record_key(record)
+         end
+         row = @fields.format(@add_time_field.call(record, time))
+         unless row.empty?
+           buf << MultiJson.dump(row) + "\n"
+         end
+         buf
+       end
+
+       def _write(chunk, table_id)
+         res = nil
+         create_upload_source(chunk) do |upload_source|
+           res = client.insert_job(@project, {
+             configuration: {
+               load: {
+                 destination_table: {
+                   project_id: @project,
+                   dataset_id: @dataset,
+                   table_id: table_id,
+                 },
+                 schema: {
+                   fields: @fields.to_a,
+                 },
+                 write_disposition: "WRITE_APPEND",
+                 source_format: "NEWLINE_DELIMITED_JSON",
+                 ignore_unknown_values: @ignore_unknown_values,
+                 max_bad_records: @max_bad_records,
+               }
+             }
+           }, {upload_source: upload_source, content_type: "application/octet-stream"})
+         end
+         wait_load(res, table_id)
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         # api_error? -> client cache clear
+         @cached_client = nil
+
+         log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
+         if e.reason == "backendError"
+           raise "failed to insert into bigquery, retry" # TODO: error class
+         elsif @secondary
+           flush_secondary(@secondary)
+         end
+       end
+
+       private
+
+       def wait_load(res, table_id)
+         wait_interval = 10
+         _response = res
+         until _response.status.state == "DONE"
+           log.debug "wait for load job finish", state: _response.status.state
+           sleep wait_interval
+           _response = client.get_job(@project, _response.job_reference.job_id)
+         end
+
+         errors = _response.status.errors
+         if errors
+           errors.each do |e|
+             log.error "job.insert API (rows)", project_id: @project, dataset: @dataset, table: table_id, message: e.message, reason: e.reason
+           end
+         end
+
+         error_result = _response.status.error_result
+         if error_result
+           log.error "job.insert API (result)", project_id: @project, dataset: @dataset, table: table_id, message: error_result.message, reason: error_result.reason
+           if _response.status.error_result.reason == "backendError"
+             raise "failed to load into bigquery"
+           elsif @secondary
+             flush_secondary(@secondary)
+           end
+         end
+
+         log.debug "finish load job", state: _response.status.state
+       end
+
+       def create_upload_source(chunk)
+         chunk_is_file = @buffer_type == 'file'
+         if chunk_is_file
+           File.open(chunk.path) do |file|
+             yield file
+           end
+         else
+           Tempfile.open("chunk-tmp") do |file|
+             file.binmode
+             chunk.write_to(file)
+             file.sync
+             file.rewind
+             yield file
+           end
+         end
+       end
+     end
+
+     class FieldSchema
+       def initialize(name, mode = :nullable)
+         unless [:nullable, :required, :repeated].include?(mode)
+           raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
+         end
+         ### https://developers.google.com/bigquery/docs/tables
+         # Each field has the following properties:
+         #
+         # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
+         #        and must start with a letter or underscore. The maximum length is 128 characters.
+         #        https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
+         unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
+           raise Fluent::ConfigError, "invalid bigquery field name: '#{name}'"
+         end
+
+         @name = name
+         @mode = mode
+       end
+
+       attr_reader :name, :mode
+
+       def format(value)
+         case @mode
+         when :nullable
+           format_one(value) unless value.nil?
+         when :required
+           raise "Required field #{name} cannot be null" if value.nil?
+           format_one(value)
+         when :repeated
+           value.nil? ? [] : value.map {|v| format_one(v) }
+         end
+       end
+
+       def format_one(value)
+         raise NotImplementedError, "Must implement in a subclass"
+       end
+
+       def to_h
+         {
+           :name => name,
+           :type => type.to_s.upcase,
+           :mode => mode.to_s.upcase,
+         }
+       end
+     end
+
+     class StringFieldSchema < FieldSchema
+       def type
+         :string
+       end
+
+       def format_one(value)
+         value.to_s
+       end
+     end
+
+     class IntegerFieldSchema < FieldSchema
+       def type
+         :integer
+       end
+
+       def format_one(value)
+         value.to_i
+       end
+     end
+
+     class FloatFieldSchema < FieldSchema
+       def type
+         :float
+       end
+
+       def format_one(value)
+         value.to_f
+       end
+     end
+
+     class BooleanFieldSchema < FieldSchema
+       def type
+         :boolean
+       end
+
+       def format_one(value)
+         !!value
+       end
+     end
+
+     class TimestampFieldSchema < FieldSchema
+       def type
+         :timestamp
+       end
+
+       def format_one(value)
+         value
+       end
+     end
+
+     class RecordSchema < FieldSchema
+       FIELD_TYPES = {
+         string: StringFieldSchema,
+         integer: IntegerFieldSchema,
+         float: FloatFieldSchema,
+         boolean: BooleanFieldSchema,
+         timestamp: TimestampFieldSchema,
+         record: RecordSchema
+       }.freeze
+
+       def initialize(name, mode = :nullable)
+         super(name, mode)
+         @fields = {}
+       end
+
+       def type
+         :record
+       end
+
+       def [](name)
+         @fields[name]
+       end
+
+       def to_a
+         @fields.map do |_, field_schema|
+           field_schema.to_h
+         end
+       end
+
+       def to_h
+         {
+           :name => name,
+           :type => type.to_s.upcase,
+           :mode => mode.to_s.upcase,
+           :fields => self.to_a,
+         }
+       end
+
+       def load_schema(schema, allow_overwrite=true)
+         schema.each do |field|
+           raise ConfigError, 'field must have type' unless field.key?('type')
+
+           name = field['name']
+           mode = (field['mode'] || 'nullable').downcase.to_sym
+
+           type = field['type'].downcase.to_sym
+           field_schema_class = FIELD_TYPES[type]
+           raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class
+
+           next if @fields.key?(name) and !allow_overwrite
+
+           field_schema = field_schema_class.new(name, mode)
+           @fields[name] = field_schema
+           if type == :record
+             raise ConfigError, "record field must have fields" unless field.key?('fields')
+             field_schema.load_schema(field['fields'], allow_overwrite)
+           end
+         end
+       end
+
+       def register_field(name, type)
+         if @fields.key?(name) and @fields[name].type != :timestamp
+           raise ConfigError, "field #{name} is registered twice"
+         end
+         if name[/\./]
+           recordname = $`
+           fieldname = $'
+           register_record_field(recordname)
+           @fields[recordname].register_field(fieldname, type)
+         else
+           schema = FIELD_TYPES[type]
+           raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
+           @fields[name] = schema.new(name)
+         end
+       end
+
+       def format_one(record)
+         out = {}
+         @fields.each do |key, schema|
+           value = record[key]
+           formatted = schema.format(value)
+           next if formatted.nil? # field does not exist, or null value
+           out[key] = formatted
+         end
+         out
+       end
+
+       private
+       def register_record_field(name)
+         if !@fields.key?(name)
+           @fields[name] = RecordSchema.new(name)
+         else
+           unless @fields[name].kind_of?(RecordSchema)
+             raise ConfigError, "field #{name} is required to be a record but already registered as #{@fields[name]}"
+           end
+         end
+       end
+     end
+   end
+ end
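To tie the configuration parameters above together, here is a minimal streaming-insert match section. It is an illustrative sketch only, assuming fluentd v0.12-era configuration syntax; the tag pattern, project, dataset, table, key path, and field lists are placeholders, not values shipped in the package.

  <match dummy.bigquery.*>
    type bigquery                # output registered as 'bigquery' above
    method insert                # or 'load' to use load jobs

    auth_method private_key      # private_key / compute_engine / json_key / application_default
    email foo@example-project.iam.gserviceaccount.com
    private_key_path /path/to/key.p12

    project example-project
    dataset example_dataset
    table   accesslog_%Y_%m
    auto_create_table true

    time_format %s
    time_field  time

    field_integer time,status,bytes
    field_string  vhost,path,method,agent
  </match>

Per generate_table_id, the table value is first passed through strftime and any %{time_slice} token is then replaced with the chunk's time-slice key (or the formatted current time when no chunk is given), so date-partitioned table names like the one above are derived at each flush.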