fluent-plugin-bigquery 1.2.0 → 2.0.0.beta
- checksums.yaml +4 -4
- data/.travis.yml +2 -9
- data/README.md +68 -65
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +45 -39
- data/lib/fluent/plugin/out_bigquery_base.rb +211 -0
- data/lib/fluent/plugin/out_bigquery_insert.rb +131 -0
- data/lib/fluent/plugin/out_bigquery_load.rb +220 -0
- data/test/helper.rb +3 -1
- data/test/plugin/test_out_bigquery_base.rb +579 -0
- data/test/plugin/test_out_bigquery_insert.rb +420 -0
- data/test/plugin/test_out_bigquery_load.rb +310 -0
- metadata +13 -7
- data/lib/fluent/plugin/out_bigquery.rb +0 -500
- data/test/plugin/test_out_bigquery.rb +0 -1276
metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: fluent-plugin-bigquery
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 2.0.0.beta
 platform: ruby
 authors:
 - Naoya Ito
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2018-03-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -152,9 +152,13 @@ files:
 - lib/fluent/plugin/bigquery/schema.rb
 - lib/fluent/plugin/bigquery/version.rb
 - lib/fluent/plugin/bigquery/writer.rb
-- lib/fluent/plugin/out_bigquery.rb
+- lib/fluent/plugin/out_bigquery_base.rb
+- lib/fluent/plugin/out_bigquery_insert.rb
+- lib/fluent/plugin/out_bigquery_load.rb
 - test/helper.rb
-- test/plugin/test_out_bigquery.rb
+- test/plugin/test_out_bigquery_base.rb
+- test/plugin/test_out_bigquery_insert.rb
+- test/plugin/test_out_bigquery_load.rb
 - test/plugin/test_record_schema.rb
 - test/plugin/testdata/apache.schema
 - test/plugin/testdata/json_key.json
@@ -175,9 +179,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - ">"
   - !ruby/object:Gem::Version
-    version: '0'
+    version: 1.3.1
 requirements: []
 rubyforge_project:
 rubygems_version: 2.6.12
@@ -186,7 +190,9 @@ specification_version: 4
 summary: Fluentd plugin to store data on Google BigQuery
 test_files:
 - test/helper.rb
-- test/plugin/test_out_bigquery.rb
+- test/plugin/test_out_bigquery_base.rb
+- test/plugin/test_out_bigquery_insert.rb
+- test/plugin/test_out_bigquery_load.rb
 - test/plugin/test_record_schema.rb
 - test/plugin/testdata/apache.schema
 - test/plugin/testdata/json_key.json
data/lib/fluent/plugin/out_bigquery.rb DELETED

@@ -1,500 +0,0 @@
-# -*- coding: utf-8 -*-
-
-require 'fluent/plugin/output'
-
-require 'fluent/plugin/bigquery/version'
-
-require 'fluent/plugin/bigquery/helper'
-require 'fluent/plugin/bigquery/errors'
-require 'fluent/plugin/bigquery/schema'
-require 'fluent/plugin/bigquery/writer'
-
-module Fluent
-  module Plugin
-    class BigQueryOutput < Output
-      Fluent::Plugin.register_output('bigquery', self)
-
-      helpers :inject, :formatter
-
-      # https://developers.google.com/bigquery/browser-tool-quickstart
-      # https://developers.google.com/bigquery/bigquery-api-quickstart
-
-      ### default for insert
-      def configure_for_insert(conf)
-        raise ConfigError unless conf["method"].nil? || conf["method"] == "insert"
-
-        formatter_config = conf.elements("format")[0]
-        if formatter_config && formatter_config['@type'] != "json"
-          log.warn "`insert` mode supports only json formatter."
-          formatter_config['@type'] = nil
-        end
-        @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
-
-        buffer_config = conf.elements("buffer")[0]
-        if buffer_config
-          buffer_config["@type"] = "memory" unless buffer_config["@type"]
-          buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
-          buffer_config["flush_interval"] = 0.25 unless buffer_config["flush_interval"]
-          buffer_config["flush_thread_interval"] = 0.05 unless buffer_config["flush_thread_interval"]
-          buffer_config["flush_thread_burst_interval"] = 0.05 unless buffer_config["flush_thread_burst_interval"]
-          buffer_config["chunk_limit_size"] = 1 * 1024 ** 2 unless buffer_config["chunk_limit_size"] # 1MB
-          buffer_config["total_limit_size"] = 1 * 1024 ** 3 unless buffer_config["total_limit_size"] # 1GB
-          buffer_config["chunk_records_limit"] = 500 unless buffer_config["chunk_records_limit"]
-        end
-      end
-
-      ### default for loads
-      def configure_for_load(conf)
-        raise ConfigError unless conf["method"] == "load"
-
-        formatter_config = conf.elements("format")[0]
-        @formatter = formatter_create(usage: 'out_bigquery_for_load', conf: formatter_config, default_type: 'json')
-
-        buffer_config = conf.elements("buffer")[0]
-        return unless buffer_config
-        buffer_config["@type"] = "file" unless buffer_config["@type"]
-        buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
-        buffer_config["chunk_limit_size"] = 1 * 1024 ** 3 unless buffer_config["chunk_limit_size"] # 1GB
-        buffer_config["total_limit_size"] = 32 * 1024 ** 3 unless buffer_config["total_limit_size"] # 32GB
-      end
-
-      # Available methods are:
-      # * private_key -- Use service account credential from pkcs12 private key file
-      # * compute_engine -- Use access token available in instances of ComputeEngine
-      # * json_key -- Use service account credential from JSON key
-      # * application_default -- Use application default credential
-      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
-
-      ### Service Account credential
-      config_param :email, :string, default: nil
-      config_param :private_key_path, :string, default: nil
-      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
-      config_param :json_key, default: nil, secret: true
-
-      # see as simple reference
-      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
-      config_param :project, :string
-
-      # dataset_name
-      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
-      # but it cannot start with a number or underscore, or have spaces.
-      config_param :dataset, :string
-
-      # table_id
-      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
-      config_param :table, :string, default: nil
-      config_param :tables, :array, value_type: :string, default: nil
-
-      # template_suffix (only insert)
-      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
-      config_param :template_suffix, :string, default: nil
-
-      config_param :auto_create_table, :bool, default: false
-
-      config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
-
-      # skip_invalid_rows (only insert)
-      # Insert all valid rows of a request, even if invalid rows exist.
-      # The default value is false, which causes the entire request to fail if any invalid rows exist.
-      config_param :skip_invalid_rows, :bool, default: false
-      # max_bad_records (only load)
-      # The maximum number of bad records that BigQuery can ignore when running the job.
-      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
-      # The default value is 0, which requires that all records are valid.
-      config_param :max_bad_records, :integer, default: 0
-      # ignore_unknown_values
-      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
-      # Default is false, which treats unknown values as errors.
-      config_param :ignore_unknown_values, :bool, default: false
-
-      config_param :schema, :array, default: nil
-      config_param :schema_path, :string, default: nil
-      config_param :fetch_schema, :bool, default: false
-      config_param :fetch_schema_table, :string, default: nil
-      config_param :schema_cache_expire, :time, default: 600
-
-      REGEXP_MAX_NUM = 10
-      config_param :replace_record_key, :bool, default: false
-      (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
-
-      # insert_id_field (only insert)
-      config_param :insert_id_field, :string, default: nil
-      # prevent_duplicate_load (only load)
-      config_param :prevent_duplicate_load, :bool, default: false
-
-      # add_insert_timestamp (only insert)
-      # adds a timestamp just before sending the rows to bigquery, so that
-      # buffering time is not taken into account. Gives a field in bigquery
-      # which represents the insert time of the row.
-      config_param :add_insert_timestamp, :string, default: nil
-
-      config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
-
-      # allow_retry_insert_errors (only insert)
-      # If insert_id_field is not specified, true means to allow duplicate rows
-      config_param :allow_retry_insert_errors, :bool, default: false
-
-      # TODO
-      # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
-      # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
-      # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
-      ### method: ''Streaming data inserts support
-      # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
-      # Maximum row size: 100 KB
-      # Maximum data size of all rows, per insert: 1 MB
-      # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
-      # If you exceed 100 rows per second for an extended period of time, throttling might occur.
-      ### Toooooooooooooo short/small per inserts and row!
-
-      ## Timeout
-      # request_timeout_sec
-      # Bigquery API response timeout
-      # request_open_timeout_sec
-      # Bigquery API connection, and request timeout
-      config_param :request_timeout_sec, :time, default: nil
-      config_param :request_open_timeout_sec, :time, default: 60
-
-      ## Partitioning
-      config_param :time_partitioning_type, :enum, list: [:day], default: nil
-      config_param :time_partitioning_expiration, :time, default: nil
-
-      ## Formatter
-      config_section :format do
-        config_set_default :@type, 'json'
-      end
-
-      ### Table types
-      # https://developers.google.com/bigquery/docs/tables
-      #
-      # type - The following data types are supported; see Data Formats for details on each data type:
-      # STRING
-      # INTEGER
-      # FLOAT
-      # BOOLEAN
-      # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
-      #
-      # mode - Whether a field can be null. The following values are supported:
-      # NULLABLE - The cell can be null.
-      # REQUIRED - The cell cannot be null.
-      # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
-
-      def initialize
-        super
-        require 'multi_json'
-        require 'google/apis/bigquery_v2'
-        require 'googleauth'
-
-        # MEMO: signet-0.6.1 depend on Farady.default_connection
-        Faraday.default_connection.options.timeout = 60
-      end
-
-      def configure(conf)
-        if conf["method"] == "load"
-          configure_for_load(conf)
-        else
-          configure_for_insert(conf)
-        end
-        super
-
-        case @method
-        when :insert
-          extend(InsertImplementation)
-        when :load
-          raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
-          extend(LoadImplementation)
-        end
-
-        case @auth_method
-        when :private_key
-          unless @email && @private_key_path
-            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
-          end
-        when :compute_engine
-          # Do nothing
-        when :json_key
-          unless @json_key
-            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
-          end
-        when :application_default
-          # Do nothing
-        else
-          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
-        end
-
-        @writers = {}
-
-        unless @table.nil? ^ @tables.nil?
-          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
-        end
-
-        @tablelist = @tables ? @tables : [@table]
-
-        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
-        if @schema
-          @table_schema.load_schema(@schema)
-        end
-        if @schema_path
-          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
-        end
-
-        warn "[DEPRECATION] `replace_record_key` param is deprecated. Please use filter_record_transformer or fluent-plugin-record-reformer" if @replace_record_key
-
-        @regexps = {}
-        (1..REGEXP_MAX_NUM).each do |i|
-          next unless conf["replace_record_key_regexp#{i}"]
-          regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
-          raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
-          raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
-          @regexps[regexp] = replacement
-        end
-
-        if @insert_id_field
-          insert_id_keys = @insert_id_field.split('.')
-          @get_insert_id = ->(record) {
-            insert_id_keys.inject(record) {|h, k| h[k] }
-          }
-        else
-          @get_insert_id = nil
-        end
-
-        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
-        placeholder_validate!(:bigquery, placeholder_params)
-      end
-
-      def start
-        super
-
-        @tables_queue = @tablelist.shuffle
-        @tables_mutex = Mutex.new
-        @fetched_schemas = {}
-        @last_fetch_schema_time = Hash.new(0)
-      end
-
-      def multi_workers_ready?
-        true
-      end
-
-      def writer
-        @writers["thread-#{Thread.current.object_id}"] ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
-          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
-          email: @email,
-          json_key: @json_key,
-          source_format: @source_format,
-          skip_invalid_rows: @skip_invalid_rows,
-          ignore_unknown_values: @ignore_unknown_values,
-          max_bad_records: @max_bad_records,
-          allow_retry_insert_errors: @allow_retry_insert_errors,
-          prevent_duplicate_load: @prevent_duplicate_load,
-          auto_create_table: @auto_create_table,
-          time_partitioning_type: @time_partitioning_type,
-          time_partitioning_expiration: @time_partitioning_expiration,
-          timeout_sec: @request_timeout_sec,
-          open_timeout_sec: @request_open_timeout_sec,
-        })
-      end
-
-      def replace_record_key(record)
-        new_record = {}
-        record.each do |key, _|
-          new_key = key
-          @regexps.each do |regexp, replacement|
-            new_key = new_key.gsub(/#{regexp}/, replacement)
-          end
-          new_key = new_key.gsub(/\W/, '')
-          new_record.store(new_key, record[key])
-        end
-        new_record
-      end
-
-      def format(tag, time, record)
-        if @replace_record_key
-          record = replace_record_key(record)
-        end
-
-        record = inject_values_to_record(tag, time, record)
-
-        meta = metadata(tag, time, record)
-        schema =
-          if @fetch_schema
-            fetch_schema(meta)
-          else
-            @table_schema
-          end
-
-        begin
-          row = schema.format(record)
-          return if row.empty?
-          @formatter.format(tag, time, row)
-        rescue
-          log.error("format error", record: record, schema: schema)
-          raise
-        end
-      end
-
-      def write(chunk)
-        table_id_format = @tables_mutex.synchronize do
-          t = @tables_queue.shift
-          @tables_queue.push t
-          t
-        end
-        _write(chunk, table_id_format)
-      end
-
-      def fetch_schema(metadata)
-        table_id = nil
-        project = extract_placeholders(@project, metadata)
-        dataset = extract_placeholders(@dataset, metadata)
-        table_id = fetch_schema_target_table(metadata)
-
-        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
-          schema = writer.fetch_schema(project, dataset, table_id)
-
-          if schema
-            table_schema = Fluent::BigQuery::RecordSchema.new("record")
-            table_schema.load_schema(schema)
-            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
-          else
-            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
-              raise "failed to fetch schema from bigquery"
-            else
-              log.warn "#{table_id} uses previous schema"
-            end
-          end
-
-          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
-        end
-
-        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
-      end
-
-      def fetch_schema_target_table(metadata)
-        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
-      end
-
-      def get_schema(project, dataset, metadata)
-        if @fetch_schema
-          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
-        else
-          @table_schema
-        end
-      end
-
-      module InsertImplementation
-        def _write(chunk, table_format)
-          now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
-          rows = chunk.open do |io|
-            io.map do |line|
-              record = MultiJson.load(line)
-              record[@add_insert_timestamp] = now if @add_insert_timestamp
-              row = {"json" => record}
-              row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
-              Fluent::BigQuery::Helper.deep_symbolize_keys(row)
-            end
-          end
-
-          project = extract_placeholders(@project, chunk.metadata)
-          dataset = extract_placeholders(@dataset, chunk.metadata)
-          table_id = extract_placeholders(table_format, chunk.metadata)
-          template_suffix = @template_suffix ? extract_placeholders(@template_suffix, chunk.metadata) : nil
-
-          schema = get_schema(project, dataset, chunk.metadata)
-
-          insert(project, dataset, table_id, rows, schema, template_suffix)
-        end
-
-        def insert(project, dataset, table_id, rows, schema, template_suffix)
-          writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
-        rescue Fluent::BigQuery::Error => e
-          if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
-            # Table Not Found: Auto Create Table
-            writer.create_table(project, dataset, table_id, schema)
-            raise "table created. send rows next time."
-          end
-
-          raise if e.retryable?
-
-          if @secondary
-            # TODO: find better way
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              secondary: true, secondary_threshold: Float::EPSILON,
-              randomize: @buffer_config.retry_randomize
-            )
-          else
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              randomize: @buffer_config.retry_randomize
-            )
-          end
-
-          raise
-        end
-      end
-
-      module LoadImplementation
-        def _write(chunk, table_id_format)
-          project = extract_placeholders(@project, chunk.metadata)
-          dataset = extract_placeholders(@dataset, chunk.metadata)
-          table_id = extract_placeholders(table_id_format, chunk.metadata)
-
-          schema = get_schema(project, dataset, chunk.metadata)
-
-          load(chunk, project, dataset, table_id, schema)
-        end
-
-        def load(chunk, project, dataset, table_id, schema)
-          res = nil
-
-          create_upload_source(chunk) do |upload_source|
-            res = writer.create_load_job(chunk.unique_id, project, dataset, table_id, upload_source, schema)
-          end
-        rescue Fluent::BigQuery::Error => e
-          raise if e.retryable?
-
-          if @secondary
-            # TODO: find better way
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              secondary: true, secondary_threshold: Float::EPSILON,
-              randomize: @buffer_config.retry_randomize
-            )
-          else
-            @retry = retry_state_create(
-              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
-              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
-              max_interval: @buffer_config.retry_max_interval,
-              randomize: @buffer_config.retry_randomize
-            )
-          end
-
-          raise
-        end
-
-        private
-
-        def create_upload_source(chunk)
-          chunk_is_file = @buffer_config["@type"] == 'file'
-          if chunk_is_file
-            File.open(chunk.path) do |file|
-              yield file
-            end
-          else
-            Tempfile.open("chunk-tmp") do |file|
-              file.binmode
-              chunk.write_to(file)
-              file.sync
-              file.rewind
-              yield file
-            end
-          end
-        end
-      end
-    end
-  end
-end