fluent-plugin-bigquery-test 2.2.0
- checksums.yaml +7 -0
- data/.github/ISSUE_TEMPLATE.md +16 -0
- data/.gitignore +21 -0
- data/.travis.yml +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +13 -0
- data/README.md +602 -0
- data/Rakefile +12 -0
- data/fluent-plugin-bigquery.gemspec +29 -0
- data/gemfiles/activesupport-4.gemfile +6 -0
- data/lib/fluent/plugin/bigquery/errors.rb +84 -0
- data/lib/fluent/plugin/bigquery/helper.rb +33 -0
- data/lib/fluent/plugin/bigquery/schema.rb +281 -0
- data/lib/fluent/plugin/bigquery/version.rb +5 -0
- data/lib/fluent/plugin/bigquery/writer.rb +356 -0
- data/lib/fluent/plugin/out_bigquery_base.rb +221 -0
- data/lib/fluent/plugin/out_bigquery_insert.rb +125 -0
- data/lib/fluent/plugin/out_bigquery_load.rb +221 -0
- data/test/helper.rb +20 -0
- data/test/plugin/test_out_bigquery_base.rb +579 -0
- data/test/plugin/test_out_bigquery_insert.rb +544 -0
- data/test/plugin/test_out_bigquery_load.rb +348 -0
- data/test/plugin/test_record_schema.rb +186 -0
- data/test/plugin/testdata/apache.schema +98 -0
- data/test/plugin/testdata/json_key.json +7 -0
- data/test/plugin/testdata/sudo.schema +27 -0
- data/test/run_test.rb +9 -0
- metadata +197 -0
data/lib/fluent/plugin/bigquery/writer.rb
@@ -0,0 +1,356 @@
module Fluent
  module BigQuery
    class Writer
      def initialize(log, auth_method, options = {})
        @auth_method = auth_method
        @scope = "https://www.googleapis.com/auth/bigquery"
        @options = options
        @log = log
        @num_errors_per_chunk = {}
      end

      def client
        @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
          cl.authorization = get_auth
          cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec]
          cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
          cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
        end
      end

      def create_table(project, dataset, table_id, record_schema)
        create_table_retry_limit = 3
        create_table_retry_wait = 1
        create_table_retry_count = 0
        table_id = safe_table_id(table_id)

        begin
          definition = {
            table_reference: {
              table_id: table_id,
            },
            schema: {
              fields: record_schema.to_a,
            }
          }

          definition.merge!(time_partitioning: time_partitioning) if time_partitioning
          definition.merge!(clustering: clustering) if clustering
          client.insert_table(project, dataset, definition, {})
          log.debug "create table", project_id: project, dataset: dataset, table: table_id
        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
          message = e.message
          if e.status_code == 409 && /Already Exists:/ =~ message
            log.debug "already created table", project_id: project, dataset: dataset, table: table_id
            # ignore 'Already Exists' error
            return
          end

          log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message

          if create_table_retry_count < create_table_retry_limit
            sleep create_table_retry_wait
            create_table_retry_wait *= 2
            create_table_retry_count += 1
            retry
          else
            raise Fluent::BigQuery::UnRetryableError.new("failed to create table in bigquery", e)
          end
        end
      end

      def fetch_schema(project, dataset, table_id)
        res = client.get_table(project, dataset, table_id)
        schema = Fluent::BigQuery::Helper.deep_stringify_keys(res.schema.to_h[:fields])
        log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}"

        schema
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        message = e.message
        log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
        nil
      end

      def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil)
        body = {
          rows: rows,
          skip_invalid_rows: @options[:skip_invalid_rows],
          ignore_unknown_values: @options[:ignore_unknown_values],
        }
        body.merge!(template_suffix: template_suffix) if template_suffix

        if @options[:auto_create_table]
          res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
        else
          res = client.insert_all_table_data(project, dataset, table_id, body, {})
        end
        log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size

        if res.insert_errors && !res.insert_errors.empty?
          log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s
          if @options[:allow_retry_insert_errors]
            is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error|
              insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) }
            end
            if is_included_any_retryable_insert_error
              raise Fluent::BigQuery::RetryableError.new("failed to insert into bigquery(insert errors), retry")
            else
              raise Fluent::BigQuery::UnRetryableError.new("failed to insert into bigquery(insert errors), and cannot retry")
            end
          end
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message }
        wrapped = Fluent::BigQuery::Error.wrap(e)
        if wrapped.retryable?
          log.warn "tabledata.insertAll API", error_data
        else
          log.error "tabledata.insertAll API", error_data
        end

        raise wrapped
      end

      JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id) do
        def as_hash(*keys)
          if keys.empty?
            to_h
          else
            to_h.select { |k, _| keys.include?(k) }
          end
        end
      end

      def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields)
        configuration = {
          configuration: {
            load: {
              destination_table: {
                project_id: project,
                dataset_id: dataset,
                table_id: table_id,
              },
              write_disposition: "WRITE_APPEND",
              source_format: source_format,
              ignore_unknown_values: @options[:ignore_unknown_values],
              max_bad_records: @options[:max_bad_records],
            }
          }
        }

        job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
        configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id

        begin
          # Check table existence
          client.get_table(project, dataset, table_id)
        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
          if e.status_code == 404 && /Not Found: Table/i =~ e.message
            raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table]
            raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
            configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
            configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
            configuration[:configuration][:load].merge!(clustering: clustering) if clustering
          end
        end

        res = client.insert_job(
          project,
          configuration,
          {
            upload_source: upload_source,
            content_type: "application/octet-stream",
          }
        )
        JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message

        if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
          return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
        end

        raise Fluent::BigQuery::Error.wrap(e)
      end

      def fetch_load_job(job_reference)
        project = job_reference.project_id
        job_id = job_reference.job_id
        location = @options[:location]

        res = client.get_job(project, job_id, location: location)
        log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)

        if res.status.state == "DONE"
          res
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        e = Fluent::BigQuery::Error.wrap(e)
        raise e unless e.retryable?
      end

      def commit_load_job(chunk_id_hex, response)
        job_id = response.id
        project = response.configuration.load.destination_table.project_id
        dataset = response.configuration.load.destination_table.dataset_id
        table_id = response.configuration.load.destination_table.table_id

        errors = response.status.errors
        if errors
          errors.each do |e|
            log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
          end
        end

        error_result = response.status.error_result
        if error_result
          log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
          if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason)
            @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1
            raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry")
          else
            @num_errors_per_chunk.delete(chunk_id_hex)
            raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry")
          end
        end

        # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error."
        stats = response.statistics.load
        duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
        log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
        @num_errors_per_chunk.delete(chunk_id_hex)
      end

      private

      def log
        @log
      end

      def get_auth
        case @auth_method
        when :private_key
          get_auth_from_private_key
        when :compute_engine
          get_auth_from_compute_engine
        when :json_key
          get_auth_from_json_key
        when :application_default
          get_auth_from_application_default
        else
          raise ConfigError, "Unknown auth method: #{@auth_method}"
        end
      end

      def get_auth_from_private_key
        require 'google/api_client/auth/key_utils'
        private_key_path = @options[:private_key_path]
        private_key_passphrase = @options[:private_key_passphrase]
        email = @options[:email]

        key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
        Signet::OAuth2::Client.new(
          token_credential_uri: "https://accounts.google.com/o/oauth2/token",
          audience: "https://accounts.google.com/o/oauth2/token",
          scope: @scope,
          issuer: email,
          signing_key: key
        )
      end

      def get_auth_from_compute_engine
        Google::Auth::GCECredentials.new
      end

      def get_auth_from_json_key
        json_key = @options[:json_key]

        begin
          JSON.parse(json_key)
          key = StringIO.new(json_key)
          Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope)
        rescue JSON::ParserError
          key = json_key
          File.open(json_key) do |f|
            Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope)
          end
        end
      end

      def get_auth_from_application_default
        Google::Auth.get_application_default([@scope])
      end

      def safe_table_id(table_id)
        table_id.gsub(/\$\d+$/, "")
      end

      def create_job_id(chunk_id_hex, dataset, table, schema)
        job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}"
        @log.debug "job_id_key: #{job_id_key}"
        "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
      end

      def source_format
        case @options[:source_format]
        when :json
          "NEWLINE_DELIMITED_JSON"
        when :avro
          "AVRO"
        when :csv
          "CSV"
        else
          "NEWLINE_DELIMITED_JSON"
        end
      end

      def time_partitioning
        return @time_partitioning if instance_variable_defined?(:@time_partitioning)

        if @options[:time_partitioning_type]
          @time_partitioning = {
            type: @options[:time_partitioning_type].to_s.upcase,
            field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
            expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
          }.reject { |_, v| v.nil? }
        else
          @time_partitioning
        end
      end

      def clustering
        return @clustering if instance_variable_defined?(:@clustering)

        if @options[:clustering_fields]
          @clustering = {
            fields: @options[:clustering_fields]
          }
        else
          @clustering
        end
      end

      def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
        try_count ||= 1
        res = client.insert_all_table_data(project, dataset, table_id, body, {})
      rescue Google::Apis::ClientError => e
        if e.status_code == 404 && /Not Found: Table/i =~ e.message
          if try_count == 1
            # Table Not Found: Auto Create Table
            create_table(project, dataset, table_id, schema)
          elsif try_count > 10
            raise "A new table was created but it is not found."
          end

          # Retry to insert several times because the created table is not visible from Streaming insert for a little while
          # cf. https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
          try_count += 1
          sleep 5
          log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id
          retry
        end
        raise
      end
    end
  end
end
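One design point worth noting in the hunk above: prevent_duplicate_load relies on create_job_id deriving the load job id deterministically from the chunk, destination, schema, and per-chunk error count, so a re-submitted chunk collides with the original job (the 409 "duplicate load job" branch) instead of loading the data twice. The standalone sketch below mirrors a simplified version of that recipe (options such as max_bad_records are omitted); the chunk id, dataset, table, and schema values are made-up placeholders.

require 'digest/sha1'

# Hypothetical inputs standing in for a buffer chunk and its destination.
chunk_id_hex = "5a1b2c3d"
dataset      = "my_dataset"
table        = "access_log"
schema       = [{"name" => "status", "type" => "INTEGER"}]
num_errors   = 0

# Concatenate everything that identifies this load attempt, then hash it into
# a stable job id, as Writer#create_job_id does.
job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema}#{num_errors}"
job_id = "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)

# Running this twice with identical inputs prints the same id, which is why a
# retried chunk is rejected as a duplicate job rather than loaded again.
puts job_id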
data/lib/fluent/plugin/out_bigquery_base.rb
@@ -0,0 +1,221 @@
require 'fluent/plugin/output'

require 'fluent/plugin/bigquery/version'

require 'fluent/plugin/bigquery/helper'
require 'fluent/plugin/bigquery/errors'
require 'fluent/plugin/bigquery/schema'
require 'fluent/plugin/bigquery/writer'

require 'multi_json'
require 'google/apis/bigquery_v2'
require 'googleauth'

module Fluent
  module Plugin
    # This class is an abstract class
    class BigQueryBaseOutput < Output
      helpers :inject, :formatter

      # Available methods are:
      # * private_key -- Use service account credential from pkcs12 private key file
      # * compute_engine -- Use access token available in instances of ComputeEngine
      # * json_key -- Use service account credential from JSON key
      # * application_default -- Use application default credential
      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key

      ### Service Account credential
      config_param :email, :string, default: nil
      config_param :private_key_path, :string, default: nil
      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
      config_param :json_key, default: nil, secret: true
      # The geographic location of the job. Required except for US and EU.
      # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
      config_param :location, :string, default: nil

      # see as simple reference
      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
      config_param :project, :string

      # dataset_name
      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
      # but it cannot start with a number or underscore, or have spaces.
      config_param :dataset, :string

      # table_id
      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
      config_param :table, :string, default: nil
      config_param :tables, :array, value_type: :string, default: nil

      config_param :auto_create_table, :bool, default: false

      # ignore_unknown_values
      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
      # Default is false, which treats unknown values as errors.
      config_param :ignore_unknown_values, :bool, default: false

      config_param :schema, :array, default: nil
      config_param :schema_path, :string, default: nil
      config_param :fetch_schema, :bool, default: false
      config_param :fetch_schema_table, :string, default: nil
      config_param :schema_cache_expire, :time, default: 600

      ## Timeout
      # request_timeout_sec
      #   Bigquery API response timeout
      # request_open_timeout_sec
      #   Bigquery API connection, and request timeout
      config_param :request_timeout_sec, :time, default: nil
      config_param :request_open_timeout_sec, :time, default: 60

      ## Partitioning
      config_param :time_partitioning_type, :enum, list: [:day], default: nil
      config_param :time_partitioning_field, :string, default: nil
      config_param :time_partitioning_expiration, :time, default: nil

      ## Clustering
      config_param :clustering_fields, :array, default: nil

      ## Formatter
      config_section :format do
        config_set_default :@type, 'json'
      end

      def configure(conf)
        super

        case @auth_method
        when :private_key
          unless @email && @private_key_path
            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
          end
        when :compute_engine
          # Do nothing
        when :json_key
          unless @json_key
            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
          end
        when :application_default
          # Do nothing
        else
          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
        end

        unless @table.nil? ^ @tables.nil?
          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
        end

        @tablelist = @tables ? @tables : [@table]

        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
        if @schema
          @table_schema.load_schema(@schema)
        end
        if @schema_path
          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
        end

        formatter_config = conf.elements("format")[0]
        @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
      end

      def start
        super

        @tables_queue = @tablelist.shuffle
        @tables_mutex = Mutex.new
        @fetched_schemas = {}
        @last_fetch_schema_time = Hash.new(0)
      end

      def multi_workers_ready?
        true
      end

      def writer
        @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
          email: @email,
          json_key: @json_key,
          location: @location,
          source_format: @source_format,
          skip_invalid_rows: @skip_invalid_rows,
          ignore_unknown_values: @ignore_unknown_values,
          max_bad_records: @max_bad_records,
          allow_retry_insert_errors: @allow_retry_insert_errors,
          prevent_duplicate_load: @prevent_duplicate_load,
          auto_create_table: @auto_create_table,
          time_partitioning_type: @time_partitioning_type,
          time_partitioning_field: @time_partitioning_field,
          time_partitioning_expiration: @time_partitioning_expiration,
          clustering_fields: @clustering_fields,
          timeout_sec: @request_timeout_sec,
          open_timeout_sec: @request_open_timeout_sec,
        })
      end

      def format(tag, time, record)
        record = inject_values_to_record(tag, time, record)

        meta = metadata(tag, time, record)
        schema =
          if @fetch_schema
            fetch_schema(meta)
          else
            @table_schema
          end

        begin
          row = schema.format(record)
          return if row.empty?
          @formatter.format(tag, time, row)
        rescue
          log.error("format error", record: record, schema: schema)
          raise
        end
      end

      def write(chunk)
      end

      def fetch_schema(metadata)
        table_id = nil
        project = extract_placeholders(@project, metadata)
        dataset = extract_placeholders(@dataset, metadata)
        table_id = fetch_schema_target_table(metadata)

        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
          schema = writer.fetch_schema(project, dataset, table_id)

          if schema
            table_schema = Fluent::BigQuery::RecordSchema.new("record")
            table_schema.load_schema(schema)
            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
          else
            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil?
              raise "failed to fetch schema from bigquery"
            else
              log.warn "#{table_id} uses previous schema"
            end
          end

          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
        end

        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
      end

      def fetch_schema_target_table(metadata)
        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
      end

      def get_schema(project, dataset, metadata)
        if @fetch_schema
          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
        else
          @table_schema
        end
      end
    end
  end
end
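For context on how the config_params above surface to users, here is a hedged sketch in the style of the gem's own test/plugin files (not shown in this excerpt) that configures the streaming-insert plugin through fluentd's test driver. The class name Fluent::Plugin::BigQueryInsertOutput is assumed to be the one registered in data/lib/fluent/plugin/out_bigquery_insert.rb, and the project, dataset, and key path are placeholders.

require 'fluent/test'
require 'fluent/test/driver/output'
require 'fluent/plugin/out_bigquery_insert'

Fluent::Test.setup

# Placeholder values; json_key may be a path to a service account JSON file
# or the JSON content itself (see Writer#get_auth_from_json_key above).
conf = %[
  auth_method json_key
  json_key /path/to/service_account.json
  project my-project
  dataset my_dataset
  table access_log
  auto_create_table true
  schema [
    {"name": "time", "type": "TIMESTAMP"},
    {"name": "status", "type": "INTEGER"},
    {"name": "path", "type": "STRING"}
  ]
]

# configure() runs the auth, table, and schema validation defined in
# BigQueryBaseOutput#configure; no BigQuery API call is made at this point.
driver = Fluent::Test::Driver::Output.new(Fluent::Plugin::BigQueryInsertOutput).configure(conf)
puts driver.instance.table # => "access_log"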