fluent-plugin-bigquery 1.2.0 → 2.0.0.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 2.0.0.beta
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-12-08 00:00:00.000000000 Z
+date: 2018-03-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -152,9 +152,13 @@ files:
 - lib/fluent/plugin/bigquery/schema.rb
 - lib/fluent/plugin/bigquery/version.rb
 - lib/fluent/plugin/bigquery/writer.rb
-- lib/fluent/plugin/out_bigquery.rb
+- lib/fluent/plugin/out_bigquery_base.rb
+- lib/fluent/plugin/out_bigquery_insert.rb
+- lib/fluent/plugin/out_bigquery_load.rb
 - test/helper.rb
-- test/plugin/test_out_bigquery.rb
+- test/plugin/test_out_bigquery_base.rb
+- test/plugin/test_out_bigquery_insert.rb
+- test/plugin/test_out_bigquery_load.rb
 - test/plugin/test_record_schema.rb
 - test/plugin/testdata/apache.schema
 - test/plugin/testdata/json_key.json
@@ -175,9 +179,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - ">"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.6.12
@@ -186,7 +190,9 @@ specification_version: 4
 summary: Fluentd plugin to store data on Google BigQuery
 test_files:
 - test/helper.rb
-- test/plugin/test_out_bigquery.rb
+- test/plugin/test_out_bigquery_base.rb
+- test/plugin/test_out_bigquery_insert.rb
+- test/plugin/test_out_bigquery_load.rb
 - test/plugin/test_record_schema.rb
 - test/plugin/testdata/apache.schema
 - test/plugin/testdata/json_key.json
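The file list above shows the change that drives this release: the single lib/fluent/plugin/out_bigquery.rb output (deleted in full below) is replaced by out_bigquery_base.rb, out_bigquery_insert.rb, and out_bigquery_load.rb, with the test files split the same way. The new files themselves are not part of this diff, so the following Ruby sketch of the implied layout is only an assumption inferred from the file names, not the released 2.0 code; in particular the class names and the registered output types bigquery_insert and bigquery_load are guesses.

    # Hypothetical sketch only: the new files are not included in this diff, so the
    # class names and registered output types below are inferred from the gemspec
    # file names. Requires the fluentd gem to load.
    require 'fluent/plugin/output'

    module Fluent
      module Plugin
        # lib/fluent/plugin/out_bigquery_base.rb (assumed): shared auth, schema and
        # writer handling that previously lived in the monolithic BigQueryOutput.
        class BigQueryBaseOutput < Output
        end

        # lib/fluent/plugin/out_bigquery_insert.rb (assumed): the former `method insert` path.
        class BigQueryInsertOutput < BigQueryBaseOutput
          Fluent::Plugin.register_output('bigquery_insert', self)
        end

        # lib/fluent/plugin/out_bigquery_load.rb (assumed): the former `method load` path.
        class BigQueryLoadOutput < BigQueryBaseOutput
          Fluent::Plugin.register_output('bigquery_load', self)
        end
      end
    end

If that guess holds, a 1.x configuration that selected behaviour with `method insert` or `method load` under a single `@type bigquery` would instead choose between two dedicated output types in 2.0.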
lib/fluent/plugin/out_bigquery.rb DELETED
@@ -1,500 +0,0 @@
-# -*- coding: utf-8 -*-
-
-require 'fluent/plugin/output'
-
-require 'fluent/plugin/bigquery/version'
-
-require 'fluent/plugin/bigquery/helper'
-require 'fluent/plugin/bigquery/errors'
-require 'fluent/plugin/bigquery/schema'
-require 'fluent/plugin/bigquery/writer'
-
-module Fluent
-  module Plugin
-    class BigQueryOutput < Output
-      Fluent::Plugin.register_output('bigquery', self)
-
-      helpers :inject, :formatter
-
-      # https://developers.google.com/bigquery/browser-tool-quickstart
-      # https://developers.google.com/bigquery/bigquery-api-quickstart
-
-      ### default for insert
-      def configure_for_insert(conf)
-        raise ConfigError unless conf["method"].nil? || conf["method"] == "insert"
-
-        formatter_config = conf.elements("format")[0]
-        if formatter_config && formatter_config['@type'] != "json"
-          log.warn "`insert` mode supports only json formatter."
-          formatter_config['@type'] = nil
-        end
-        @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
-
-        buffer_config = conf.elements("buffer")[0]
-        if buffer_config
-          buffer_config["@type"] = "memory" unless buffer_config["@type"]
-          buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
-          buffer_config["flush_interval"] = 0.25 unless buffer_config["flush_interval"]
-          buffer_config["flush_thread_interval"] = 0.05 unless buffer_config["flush_thread_interval"]
-          buffer_config["flush_thread_burst_interval"] = 0.05 unless buffer_config["flush_thread_burst_interval"]
-          buffer_config["chunk_limit_size"] = 1 * 1024 ** 2 unless buffer_config["chunk_limit_size"] # 1MB
-          buffer_config["total_limit_size"] = 1 * 1024 ** 3 unless buffer_config["total_limit_size"] # 1GB
-          buffer_config["chunk_records_limit"] = 500 unless buffer_config["chunk_records_limit"]
-        end
-      end
-
-      ### default for loads
-      def configure_for_load(conf)
-        raise ConfigError unless conf["method"] == "load"
-
-        formatter_config = conf.elements("format")[0]
-        @formatter = formatter_create(usage: 'out_bigquery_for_load', conf: formatter_config, default_type: 'json')
-
-        buffer_config = conf.elements("buffer")[0]
-        return unless buffer_config
-        buffer_config["@type"] = "file" unless buffer_config["@type"]
-        buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
-        buffer_config["chunk_limit_size"] = 1 * 1024 ** 3 unless buffer_config["chunk_limit_size"] # 1GB
-        buffer_config["total_limit_size"] = 32 * 1024 ** 3 unless buffer_config["total_limit_size"] # 32GB
-      end
-
-      # Available methods are:
-      # * private_key -- Use service account credential from pkcs12 private key file
-      # * compute_engine -- Use access token available in instances of ComputeEngine
-      # * json_key -- Use service account credential from JSON key
-      # * application_default -- Use application default credential
-      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
-
-      ### Service Account credential
-      config_param :email, :string, default: nil
-      config_param :private_key_path, :string, default: nil
-      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
-      config_param :json_key, default: nil, secret: true
-
-      # see as simple reference
-      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
-      config_param :project, :string
-
-      # dataset_name
-      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
-      # but it cannot start with a number or underscore, or have spaces.
-      config_param :dataset, :string
-
-      # table_id
-      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
-      config_param :table, :string, default: nil
-      config_param :tables, :array, value_type: :string, default: nil
-
-      # template_suffix (only insert)
-      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
-      config_param :template_suffix, :string, default: nil
-
-      config_param :auto_create_table, :bool, default: false
-
-      config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
-
-      # skip_invalid_rows (only insert)
-      # Insert all valid rows of a request, even if invalid rows exist.
-      # The default value is false, which causes the entire request to fail if any invalid rows exist.
-      config_param :skip_invalid_rows, :bool, default: false
-      # max_bad_records (only load)
-      # The maximum number of bad records that BigQuery can ignore when running the job.
-      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
-      # The default value is 0, which requires that all records are valid.
-      config_param :max_bad_records, :integer, default: 0
-      # ignore_unknown_values
-      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
-      # Default is false, which treats unknown values as errors.
-      config_param :ignore_unknown_values, :bool, default: false
-
-      config_param :schema, :array, default: nil
-      config_param :schema_path, :string, default: nil
-      config_param :fetch_schema, :bool, default: false
-      config_param :fetch_schema_table, :string, default: nil
-      config_param :schema_cache_expire, :time, default: 600
-
-      REGEXP_MAX_NUM = 10
-      config_param :replace_record_key, :bool, default: false
-      (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
-
-      # insert_id_field (only insert)
-      config_param :insert_id_field, :string, default: nil
-      # prevent_duplicate_load (only load)
-      config_param :prevent_duplicate_load, :bool, default: false
-
-      # add_insert_timestamp (only insert)
-      # adds a timestamp just before sending the rows to bigquery, so that
-      # buffering time is not taken into account. Gives a field in bigquery
-      # which represents the insert time of the row.
-      config_param :add_insert_timestamp, :string, default: nil
-
-      config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
-
-      # allow_retry_insert_errors (only insert)
-      # If insert_id_field is not specified, true means to allow duplicate rows
-      config_param :allow_retry_insert_errors, :bool, default: false
-
-      # TODO
-      # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
-      # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
-      # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
-      ### method: ''Streaming data inserts support
-      # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
-      # Maximum row size: 100 KB
-      # Maximum data size of all rows, per insert: 1 MB
-      # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
-      # If you exceed 100 rows per second for an extended period of time, throttling might occur.
-      ### Toooooooooooooo short/small per inserts and row!
-
-      ## Timeout
-      # request_timeout_sec
-      #   Bigquery API response timeout
-      # request_open_timeout_sec
-      #   Bigquery API connection, and request timeout
-      config_param :request_timeout_sec, :time, default: nil
-      config_param :request_open_timeout_sec, :time, default: 60
-
-      ## Partitioning
-      config_param :time_partitioning_type, :enum, list: [:day], default: nil
-      config_param :time_partitioning_expiration, :time, default: nil
-
-      ## Formatter
-      config_section :format do
-        config_set_default :@type, 'json'
-      end
-
-      ### Table types
-      # https://developers.google.com/bigquery/docs/tables
-      #
-      # type - The following data types are supported; see Data Formats for details on each data type:
-      # STRING
-      # INTEGER
-      # FLOAT
-      # BOOLEAN
-      # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
-      #
-      # mode - Whether a field can be null. The following values are supported:
-      # NULLABLE - The cell can be null.
-      # REQUIRED - The cell cannot be null.
-      # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
-
-      def initialize
-        super
-        require 'multi_json'
-        require 'google/apis/bigquery_v2'
-        require 'googleauth'
-
-        # MEMO: signet-0.6.1 depend on Farady.default_connection
-        Faraday.default_connection.options.timeout = 60
-      end
-
-      def configure(conf)
-        if conf["method"] == "load"
-          configure_for_load(conf)
-        else
-          configure_for_insert(conf)
-        end
-        super
-
-        case @method
-        when :insert
-          extend(InsertImplementation)
-        when :load
-          raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
-          extend(LoadImplementation)
-        end
-
-        case @auth_method
-        when :private_key
-          unless @email && @private_key_path
-            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
-          end
-        when :compute_engine
-          # Do nothing
-        when :json_key
-          unless @json_key
-            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
-          end
-        when :application_default
-          # Do nothing
-        else
-          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
-        end
-
-        @writers = {}
-
-        unless @table.nil? ^ @tables.nil?
-          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
-        end
-
-        @tablelist = @tables ? @tables : [@table]
-
-        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
-        if @schema
-          @table_schema.load_schema(@schema)
-        end
-        if @schema_path
-          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
-        end
-
-        warn "[DEPRECATION] `replace_record_key` param is deprecated. Please use filter_record_transformer or fluent-plugin-record-reformer" if @replace_record_key
-
-        @regexps = {}
-        (1..REGEXP_MAX_NUM).each do |i|
-          next unless conf["replace_record_key_regexp#{i}"]
-          regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
-          raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
-          raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
-          @regexps[regexp] = replacement
-        end
-
-        if @insert_id_field
-          insert_id_keys = @insert_id_field.split('.')
-          @get_insert_id = ->(record) {
-            insert_id_keys.inject(record) {|h, k| h[k] }
-          }
-        else
-          @get_insert_id = nil
-        end
-
-        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
-        placeholder_validate!(:bigquery, placeholder_params)
-      end
-
-      def start
-        super
-
-        @tables_queue = @tablelist.shuffle
-        @tables_mutex = Mutex.new
-        @fetched_schemas = {}
-        @last_fetch_schema_time = Hash.new(0)
-      end
-
-      def multi_workers_ready?
-        true
-      end
-
-      def writer
-        @writers["thread-#{Thread.current.object_id}"] ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
-          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
-          email: @email,
-          json_key: @json_key,
-          source_format: @source_format,
-          skip_invalid_rows: @skip_invalid_rows,
-          ignore_unknown_values: @ignore_unknown_values,
-          max_bad_records: @max_bad_records,
-          allow_retry_insert_errors: @allow_retry_insert_errors,
-          prevent_duplicate_load: @prevent_duplicate_load,
-          auto_create_table: @auto_create_table,
-          time_partitioning_type: @time_partitioning_type,
-          time_partitioning_expiration: @time_partitioning_expiration,
-          timeout_sec: @request_timeout_sec,
-          open_timeout_sec: @request_open_timeout_sec,
-        })
-      end
-
-      def replace_record_key(record)
-        new_record = {}
-        record.each do |key, _|
-          new_key = key
-          @regexps.each do |regexp, replacement|
-            new_key = new_key.gsub(/#{regexp}/, replacement)
-          end
-          new_key = new_key.gsub(/\W/, '')
-          new_record.store(new_key, record[key])
-        end
-        new_record
-      end
-
-      def format(tag, time, record)
-        if @replace_record_key
-          record = replace_record_key(record)
-        end
-
-        record = inject_values_to_record(tag, time, record)
-
-        meta = metadata(tag, time, record)
-        schema =
-          if @fetch_schema
-            fetch_schema(meta)
-          else
-            @table_schema
-          end
-
-        begin
-          row = schema.format(record)
-          return if row.empty?
-          @formatter.format(tag, time, row)
-        rescue
-          log.error("format error", record: record, schema: schema)
-          raise
-        end
-      end
-
-      def write(chunk)
-        table_id_format = @tables_mutex.synchronize do
-          t = @tables_queue.shift
-          @tables_queue.push t
-          t
-        end
-        _write(chunk, table_id_format)
-      end
-
-      def fetch_schema(metadata)
-        table_id = nil
-        project = extract_placeholders(@project, metadata)
-        dataset = extract_placeholders(@dataset, metadata)
-        table_id = fetch_schema_target_table(metadata)
-
-        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
-          schema = writer.fetch_schema(project, dataset, table_id)
-
-          if schema
-            table_schema = Fluent::BigQuery::RecordSchema.new("record")
-            table_schema.load_schema(schema)
-            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
-          else
-            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
-              raise "failed to fetch schema from bigquery"
-            else
-              log.warn "#{table_id} uses previous schema"
-            end
-          end
-
-          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
-        end
-
-        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
-      end
-
-      def fetch_schema_target_table(metadata)
-        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
-      end
-
-      def get_schema(project, dataset, metadata)
-        if @fetch_schema
-          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
-        else
-          @table_schema
-        end
-      end
-
-      module InsertImplementation
-        def _write(chunk, table_format)
-          now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
-          rows = chunk.open do |io|
-            io.map do |line|
-              record = MultiJson.load(line)
-              record[@add_insert_timestamp] = now if @add_insert_timestamp
-              row = {"json" => record}
-              row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
-              Fluent::BigQuery::Helper.deep_symbolize_keys(row)
-            end
-          end
-
-          project = extract_placeholders(@project, chunk.metadata)
-          dataset = extract_placeholders(@dataset, chunk.metadata)
-          table_id = extract_placeholders(table_format, chunk.metadata)
-          template_suffix = @template_suffix ? extract_placeholders(@template_suffix, chunk.metadata) : nil
-
-          schema = get_schema(project, dataset, chunk.metadata)
-
-          insert(project, dataset, table_id, rows, schema, template_suffix)
-        end
-
-        def insert(project, dataset, table_id, rows, schema, template_suffix)
-          writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
-        rescue Fluent::BigQuery::Error => e
-          if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
-            # Table Not Found: Auto Create Table
-            writer.create_table(project, dataset, table_id, schema)
-            raise "table created. send rows next time."
-          end
-
-          raise if e.retryable?
-
-          if @secondary
-            # TODO: find better way
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              secondary: true, secondary_threshold: Float::EPSILON,
-              randomize: @buffer_config.retry_randomize
-            )
-          else
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              randomize: @buffer_config.retry_randomize
-            )
-          end
-
-          raise
-        end
-      end
-
-      module LoadImplementation
-        def _write(chunk, table_id_format)
-          project = extract_placeholders(@project, chunk.metadata)
-          dataset = extract_placeholders(@dataset, chunk.metadata)
-          table_id = extract_placeholders(table_id_format, chunk.metadata)
-
-          schema = get_schema(project, dataset, chunk.metadata)
-
-          load(chunk, project, dataset, table_id, schema)
-        end
-
-        def load(chunk, project, dataset, table_id, schema)
-          res = nil
-
-          create_upload_source(chunk) do |upload_source|
-            res = writer.create_load_job(chunk.unique_id, project, dataset, table_id, upload_source, schema)
-          end
-        rescue Fluent::BigQuery::Error => e
-          raise if e.retryable?
-
-          if @secondary
-            # TODO: find better way
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              secondary: true, secondary_threshold: Float::EPSILON,
-              randomize: @buffer_config.retry_randomize
-            )
-          else
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              randomize: @buffer_config.retry_randomize
-            )
-          end
-
-          raise
-        end
-
-        private
-
-        def create_upload_source(chunk)
-          chunk_is_file = @buffer_config["@type"] == 'file'
-          if chunk_is_file
-            File.open(chunk.path) do |file|
-              yield file
-            end
-          else
-            Tempfile.open("chunk-tmp") do |file|
-              file.binmode
-              chunk.write_to(file)
-              file.sync
-              file.rewind
-              yield file
-            end
-          end
-        end
-      end
-    end
-  end
-end
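Two parts of the deleted insert path are easy to misread in diff form: insert_id_field is split on "." and walked through each record to build the per-row insert_id, and replace_record_key rewrites only top-level keys with the configured regexps before stripping non-word characters. The following standalone Ruby sketch (hypothetical field names and regexp, no Fluentd or BigQuery dependencies) reproduces just that record handling:

    # Standalone sketch of the deleted plugin's per-record handling.
    # "user.id" and the "-"/"_" pair below are hypothetical examples, standing in
    # for insert_id_field and replace_record_key_regexp1 ("- _").

    # insert_id_field: dig through nested hashes, as in the deleted configure/_write.
    insert_id_keys = "user.id".split('.')
    get_insert_id = ->(record) { insert_id_keys.inject(record) { |h, k| h[k] } }

    # replace_record_key: apply each regexp to top-level keys, then drop non-word characters.
    regexps = { "-" => "_" }
    replace_record_key = lambda do |record|
      record.each_with_object({}) do |(key, value), new_record|
        new_key = regexps.inject(key) { |k, (regexp, replacement)| k.gsub(/#{regexp}/, replacement) }
        new_record[new_key.gsub(/\W/, '')] = value
      end
    end

    record = { "user" => { "id" => "abc-123" }, "remote-addr" => "127.0.0.1" }
    row = { "json" => replace_record_key.call(record), "insert_id" => get_insert_id.call(record) }
    p row
    # => {"json"=>{"user"=>{"id"=>"abc-123"}, "remote_addr"=>"127.0.0.1"}, "insert_id"=>"abc-123"}

As in the deleted code, nested keys such as "id" are left untouched by replace_record_key; only the top-level key "remote-addr" is rewritten, while insert_id is taken from the nested user.id value.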