fluent-plugin-bigquery 1.2.0 → 2.0.0.beta

metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: fluent-plugin-bigquery
  version: !ruby/object:Gem::Version
-   version: 1.2.0
+   version: 2.0.0.beta
  platform: ruby
  authors:
  - Naoya Ito
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2017-12-08 00:00:00.000000000 Z
+ date: 2018-03-29 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rake
@@ -152,9 +152,13 @@ files:
  - lib/fluent/plugin/bigquery/schema.rb
  - lib/fluent/plugin/bigquery/version.rb
  - lib/fluent/plugin/bigquery/writer.rb
- - lib/fluent/plugin/out_bigquery.rb
+ - lib/fluent/plugin/out_bigquery_base.rb
+ - lib/fluent/plugin/out_bigquery_insert.rb
+ - lib/fluent/plugin/out_bigquery_load.rb
  - test/helper.rb
- - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_out_bigquery_base.rb
+ - test/plugin/test_out_bigquery_insert.rb
+ - test/plugin/test_out_bigquery_load.rb
  - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
@@ -175,9 +179,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
        version: '0'
  required_rubygems_version: !ruby/object:Gem::Requirement
    requirements:
-   - - ">="
+   - - ">"
      - !ruby/object:Gem::Version
-       version: '0'
+       version: 1.3.1
  requirements: []
  rubyforge_project:
  rubygems_version: 2.6.12
@@ -186,7 +190,9 @@ specification_version: 4
  summary: Fluentd plugin to store data on Google BigQuery
  test_files:
  - test/helper.rb
- - test/plugin/test_out_bigquery.rb
+ - test/plugin/test_out_bigquery_base.rb
+ - test/plugin/test_out_bigquery_insert.rb
+ - test/plugin/test_out_bigquery_load.rb
  - test/plugin/test_record_schema.rb
  - test/plugin/testdata/apache.schema
  - test/plugin/testdata/json_key.json
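
The required_rubygems_version change from ">= 0" to "> 1.3.1" is the constraint RubyGems writes automatically when a gem's version string is a prerelease, as 2.0.0.beta is. The substantive change is in the file list: the single lib/fluent/plugin/out_bigquery.rb (removed in full below) is replaced by out_bigquery_base.rb, out_bigquery_insert.rb, and out_bigquery_load.rb, with the tests split the same way. A minimal sketch of the presumed new layout follows; the class names and registered plugin types are inferred from the new file names, not taken from the 2.0.0.beta source.

# Sketch only -- class names and plugin types are assumptions inferred from the
# new file names above, not verified against the 2.0.0.beta source.
require 'fluent/plugin/output'

module Fluent
  module Plugin
    # lib/fluent/plugin/out_bigquery_base.rb (assumed): shared auth, schema and
    # formatter configuration extracted from the old BigQueryOutput.
    class BigQueryBaseOutput < Output
    end

    # lib/fluent/plugin/out_bigquery_insert.rb (assumed): the streaming-insert
    # path that the old plugin selected with `method insert`.
    class BigQueryInsertOutput < BigQueryBaseOutput
      Fluent::Plugin.register_output('bigquery_insert', self) # assumed plugin type
    end

    # lib/fluent/plugin/out_bigquery_load.rb (assumed): the load-job path that
    # the old plugin selected with `method load`.
    class BigQueryLoadOutput < BigQueryBaseOutput
      Fluent::Plugin.register_output('bigquery_load', self) # assumed plugin type
    end
  end
end
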
lib/fluent/plugin/out_bigquery.rb DELETED
@@ -1,500 +0,0 @@
- # -*- coding: utf-8 -*-
-
- require 'fluent/plugin/output'
-
- require 'fluent/plugin/bigquery/version'
-
- require 'fluent/plugin/bigquery/helper'
- require 'fluent/plugin/bigquery/errors'
- require 'fluent/plugin/bigquery/schema'
- require 'fluent/plugin/bigquery/writer'
-
- module Fluent
-   module Plugin
-     class BigQueryOutput < Output
-       Fluent::Plugin.register_output('bigquery', self)
-
-       helpers :inject, :formatter
-
-       # https://developers.google.com/bigquery/browser-tool-quickstart
-       # https://developers.google.com/bigquery/bigquery-api-quickstart
-
-       ### default for insert
-       def configure_for_insert(conf)
-         raise ConfigError unless conf["method"].nil? || conf["method"] == "insert"
-
-         formatter_config = conf.elements("format")[0]
-         if formatter_config && formatter_config['@type'] != "json"
-           log.warn "`insert` mode supports only json formatter."
-           formatter_config['@type'] = nil
-         end
-         @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
-
-         buffer_config = conf.elements("buffer")[0]
-         if buffer_config
-           buffer_config["@type"] = "memory" unless buffer_config["@type"]
-           buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
-           buffer_config["flush_interval"] = 0.25 unless buffer_config["flush_interval"]
-           buffer_config["flush_thread_interval"] = 0.05 unless buffer_config["flush_thread_interval"]
-           buffer_config["flush_thread_burst_interval"] = 0.05 unless buffer_config["flush_thread_burst_interval"]
-           buffer_config["chunk_limit_size"] = 1 * 1024 ** 2 unless buffer_config["chunk_limit_size"] # 1MB
-           buffer_config["total_limit_size"] = 1 * 1024 ** 3 unless buffer_config["total_limit_size"] # 1GB
-           buffer_config["chunk_records_limit"] = 500 unless buffer_config["chunk_records_limit"]
-         end
-       end
-
-       ### default for loads
-       def configure_for_load(conf)
-         raise ConfigError unless conf["method"] == "load"
-
-         formatter_config = conf.elements("format")[0]
-         @formatter = formatter_create(usage: 'out_bigquery_for_load', conf: formatter_config, default_type: 'json')
-
-         buffer_config = conf.elements("buffer")[0]
-         return unless buffer_config
-         buffer_config["@type"] = "file" unless buffer_config["@type"]
-         buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
-         buffer_config["chunk_limit_size"] = 1 * 1024 ** 3 unless buffer_config["chunk_limit_size"] # 1GB
-         buffer_config["total_limit_size"] = 32 * 1024 ** 3 unless buffer_config["total_limit_size"] # 32GB
-       end
-
-       # Available methods are:
-       # * private_key -- Use service account credential from pkcs12 private key file
-       # * compute_engine -- Use access token available in instances of ComputeEngine
-       # * json_key -- Use service account credential from JSON key
-       # * application_default -- Use application default credential
-       config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
-
-       ### Service Account credential
-       config_param :email, :string, default: nil
-       config_param :private_key_path, :string, default: nil
-       config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
-       config_param :json_key, default: nil, secret: true
-
-       # see as simple reference
-       # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
-       config_param :project, :string
-
-       # dataset_name
-       # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
-       # but it cannot start with a number or underscore, or have spaces.
-       config_param :dataset, :string
-
-       # table_id
-       # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
-       config_param :table, :string, default: nil
-       config_param :tables, :array, value_type: :string, default: nil
-
-       # template_suffix (only insert)
-       # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
-       config_param :template_suffix, :string, default: nil
-
-       config_param :auto_create_table, :bool, default: false
-
-       config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
-
-       # skip_invalid_rows (only insert)
-       # Insert all valid rows of a request, even if invalid rows exist.
-       # The default value is false, which causes the entire request to fail if any invalid rows exist.
-       config_param :skip_invalid_rows, :bool, default: false
-       # max_bad_records (only load)
-       # The maximum number of bad records that BigQuery can ignore when running the job.
-       # If the number of bad records exceeds this value, an invalid error is returned in the job result.
-       # The default value is 0, which requires that all records are valid.
-       config_param :max_bad_records, :integer, default: 0
-       # ignore_unknown_values
-       # Accept rows that contain values that do not match the schema. The unknown values are ignored.
-       # Default is false, which treats unknown values as errors.
-       config_param :ignore_unknown_values, :bool, default: false
-
-       config_param :schema, :array, default: nil
-       config_param :schema_path, :string, default: nil
-       config_param :fetch_schema, :bool, default: false
-       config_param :fetch_schema_table, :string, default: nil
-       config_param :schema_cache_expire, :time, default: 600
-
-       REGEXP_MAX_NUM = 10
-       config_param :replace_record_key, :bool, default: false
-       (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
-
-       # insert_id_field (only insert)
-       config_param :insert_id_field, :string, default: nil
-       # prevent_duplicate_load (only load)
-       config_param :prevent_duplicate_load, :bool, default: false
-
-       # add_insert_timestamp (only insert)
-       # adds a timestamp just before sending the rows to bigquery, so that
-       # buffering time is not taken into account. Gives a field in bigquery
-       # which represents the insert time of the row.
-       config_param :add_insert_timestamp, :string, default: nil
-
-       config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
-
-       # allow_retry_insert_errors (only insert)
-       # If insert_id_field is not specified, true means to allow duplicate rows
-       config_param :allow_retry_insert_errors, :bool, default: false
-
-       # TODO
-       # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
-       # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
-       # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
-       ### method: ''Streaming data inserts support
-       # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
-       # Maximum row size: 100 KB
-       # Maximum data size of all rows, per insert: 1 MB
-       # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
-       # If you exceed 100 rows per second for an extended period of time, throttling might occur.
-       ### Toooooooooooooo short/small per inserts and row!
-
-       ## Timeout
-       # request_timeout_sec
-       # Bigquery API response timeout
-       # request_open_timeout_sec
-       # Bigquery API connection, and request timeout
-       config_param :request_timeout_sec, :time, default: nil
-       config_param :request_open_timeout_sec, :time, default: 60
-
-       ## Partitioning
-       config_param :time_partitioning_type, :enum, list: [:day], default: nil
-       config_param :time_partitioning_expiration, :time, default: nil
-
-       ## Formatter
-       config_section :format do
-         config_set_default :@type, 'json'
-       end
-
-       ### Table types
-       # https://developers.google.com/bigquery/docs/tables
-       #
-       # type - The following data types are supported; see Data Formats for details on each data type:
-       # STRING
-       # INTEGER
-       # FLOAT
-       # BOOLEAN
-       # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
-       #
-       # mode - Whether a field can be null. The following values are supported:
-       # NULLABLE - The cell can be null.
-       # REQUIRED - The cell cannot be null.
-       # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
-
-       def initialize
-         super
-         require 'multi_json'
-         require 'google/apis/bigquery_v2'
-         require 'googleauth'
-
-         # MEMO: signet-0.6.1 depend on Farady.default_connection
-         Faraday.default_connection.options.timeout = 60
-       end
-
-       def configure(conf)
-         if conf["method"] == "load"
-           configure_for_load(conf)
-         else
-           configure_for_insert(conf)
-         end
-         super
-
-         case @method
-         when :insert
-           extend(InsertImplementation)
-         when :load
-           raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
-           extend(LoadImplementation)
-         end
-
-         case @auth_method
-         when :private_key
-           unless @email && @private_key_path
-             raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
-           end
-         when :compute_engine
-           # Do nothing
-         when :json_key
-           unless @json_key
-             raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
-           end
-         when :application_default
-           # Do nothing
-         else
-           raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
-         end
-
-         @writers = {}
-
-         unless @table.nil? ^ @tables.nil?
-           raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
-         end
-
-         @tablelist = @tables ? @tables : [@table]
-
-         @table_schema = Fluent::BigQuery::RecordSchema.new('record')
-         if @schema
-           @table_schema.load_schema(@schema)
-         end
-         if @schema_path
-           @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
-         end
-
-         warn "[DEPRECATION] `replace_record_key` param is deprecated. Please use filter_record_transformer or fluent-plugin-record-reformer" if @replace_record_key
-
-         @regexps = {}
-         (1..REGEXP_MAX_NUM).each do |i|
-           next unless conf["replace_record_key_regexp#{i}"]
-           regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
-           raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
-           raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
-           @regexps[regexp] = replacement
-         end
-
-         if @insert_id_field
-           insert_id_keys = @insert_id_field.split('.')
-           @get_insert_id = ->(record) {
-             insert_id_keys.inject(record) {|h, k| h[k] }
-           }
-         else
-           @get_insert_id = nil
-         end
-
-         placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
-         placeholder_validate!(:bigquery, placeholder_params)
-       end
-
-       def start
-         super
-
-         @tables_queue = @tablelist.shuffle
-         @tables_mutex = Mutex.new
-         @fetched_schemas = {}
-         @last_fetch_schema_time = Hash.new(0)
-       end
-
-       def multi_workers_ready?
-         true
-       end
-
-       def writer
-         @writers["thread-#{Thread.current.object_id}"] ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
-           private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
-           email: @email,
-           json_key: @json_key,
-           source_format: @source_format,
-           skip_invalid_rows: @skip_invalid_rows,
-           ignore_unknown_values: @ignore_unknown_values,
-           max_bad_records: @max_bad_records,
-           allow_retry_insert_errors: @allow_retry_insert_errors,
-           prevent_duplicate_load: @prevent_duplicate_load,
-           auto_create_table: @auto_create_table,
-           time_partitioning_type: @time_partitioning_type,
-           time_partitioning_expiration: @time_partitioning_expiration,
-           timeout_sec: @request_timeout_sec,
-           open_timeout_sec: @request_open_timeout_sec,
-         })
-       end
-
-       def replace_record_key(record)
-         new_record = {}
-         record.each do |key, _|
-           new_key = key
-           @regexps.each do |regexp, replacement|
-             new_key = new_key.gsub(/#{regexp}/, replacement)
-           end
-           new_key = new_key.gsub(/\W/, '')
-           new_record.store(new_key, record[key])
-         end
-         new_record
-       end
-
-       def format(tag, time, record)
-         if @replace_record_key
-           record = replace_record_key(record)
-         end
-
-         record = inject_values_to_record(tag, time, record)
-
-         meta = metadata(tag, time, record)
-         schema =
-           if @fetch_schema
-             fetch_schema(meta)
-           else
-             @table_schema
-           end
-
-         begin
-           row = schema.format(record)
-           return if row.empty?
-           @formatter.format(tag, time, row)
-         rescue
-           log.error("format error", record: record, schema: schema)
-           raise
-         end
-       end
-
-       def write(chunk)
-         table_id_format = @tables_mutex.synchronize do
-           t = @tables_queue.shift
-           @tables_queue.push t
-           t
-         end
-         _write(chunk, table_id_format)
-       end
-
-       def fetch_schema(metadata)
-         table_id = nil
-         project = extract_placeholders(@project, metadata)
-         dataset = extract_placeholders(@dataset, metadata)
-         table_id = fetch_schema_target_table(metadata)
-
-         if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
-           schema = writer.fetch_schema(project, dataset, table_id)
-
-           if schema
-             table_schema = Fluent::BigQuery::RecordSchema.new("record")
-             table_schema.load_schema(schema)
-             @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
-           else
-             if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
-               raise "failed to fetch schema from bigquery"
-             else
-               log.warn "#{table_id} uses previous schema"
-             end
-           end
-
-           @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
-         end
-
-         @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
-       end
-
-       def fetch_schema_target_table(metadata)
-         extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
-       end
-
-       def get_schema(project, dataset, metadata)
-         if @fetch_schema
-           @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
-         else
-           @table_schema
-         end
-       end
-
-       module InsertImplementation
-         def _write(chunk, table_format)
-           now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
-           rows = chunk.open do |io|
-             io.map do |line|
-               record = MultiJson.load(line)
-               record[@add_insert_timestamp] = now if @add_insert_timestamp
-               row = {"json" => record}
-               row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
-               Fluent::BigQuery::Helper.deep_symbolize_keys(row)
-             end
-           end
-
-           project = extract_placeholders(@project, chunk.metadata)
-           dataset = extract_placeholders(@dataset, chunk.metadata)
-           table_id = extract_placeholders(table_format, chunk.metadata)
-           template_suffix = @template_suffix ? extract_placeholders(@template_suffix, chunk.metadata) : nil
-
-           schema = get_schema(project, dataset, chunk.metadata)
-
-           insert(project, dataset, table_id, rows, schema, template_suffix)
-         end
-
-         def insert(project, dataset, table_id, rows, schema, template_suffix)
-           writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
-         rescue Fluent::BigQuery::Error => e
-           if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
-             # Table Not Found: Auto Create Table
-             writer.create_table(project, dataset, table_id, schema)
-             raise "table created. send rows next time."
-           end
-
-           raise if e.retryable?
-
-           if @secondary
-             # TODO: find better way
-             @retry = retry_state_create(
-               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-               forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
-               max_interval: @buffer_config.retry_max_interval,
-               secondary: true, secondary_threshold: Float::EPSILON,
-               randomize: @buffer_config.retry_randomize
-             )
-           else
-             @retry = retry_state_create(
-               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-               forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
-               max_interval: @buffer_config.retry_max_interval,
-               randomize: @buffer_config.retry_randomize
-             )
-           end
-
-           raise
-         end
-       end
-
-       module LoadImplementation
-         def _write(chunk, table_id_format)
-           project = extract_placeholders(@project, chunk.metadata)
-           dataset = extract_placeholders(@dataset, chunk.metadata)
-           table_id = extract_placeholders(table_id_format, chunk.metadata)
-
-           schema = get_schema(project, dataset, chunk.metadata)
-
-           load(chunk, project, dataset, table_id, schema)
-         end
-
-         def load(chunk, project, dataset, table_id, schema)
-           res = nil
-
-           create_upload_source(chunk) do |upload_source|
-             res = writer.create_load_job(chunk.unique_id, project, dataset, table_id, upload_source, schema)
-           end
-         rescue Fluent::BigQuery::Error => e
-           raise if e.retryable?
-
-           if @secondary
-             # TODO: find better way
-             @retry = retry_state_create(
-               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-               forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
-               max_interval: @buffer_config.retry_max_interval,
-               secondary: true, secondary_threshold: Float::EPSILON,
-               randomize: @buffer_config.retry_randomize
-             )
-           else
-             @retry = retry_state_create(
-               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-               forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
-               max_interval: @buffer_config.retry_max_interval,
-               randomize: @buffer_config.retry_randomize
-             )
-           end
-
-           raise
-         end
-
-         private
-
-         def create_upload_source(chunk)
-           chunk_is_file = @buffer_config["@type"] == 'file'
-           if chunk_is_file
-             File.open(chunk.path) do |file|
-               yield file
-             end
-           else
-             Tempfile.open("chunk-tmp") do |file|
-               file.binmode
-               chunk.write_to(file)
-               file.sync
-               file.rewind
-               yield file
-             end
-           end
-         end
-       end
-     end
-   end
- end
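
Seen against the removed file above, the split replaces run-time branching with separate plugins. The configure_for_insert / configure_for_load defaults and the extend(InsertImplementation) / extend(LoadImplementation) dispatch on the method parameter presumably move into the dedicated insert and load outputs sketched earlier, so each write path keeps only its own buffer defaults (memory buffering with small, frequent chunks for insert; file buffering with large chunks for load) and its own parameters (template_suffix, insert_id_field and skip_invalid_rows for insert; prevent_duplicate_load and max_bad_records for load).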