fluent-plugin-bigquery-test 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
+ module Fluent
+   module BigQueryPlugin
+     VERSION = "2.2.0".freeze
+   end
+ end
@@ -0,0 +1,356 @@
+ module Fluent
+   module BigQuery
+     class Writer
+       def initialize(log, auth_method, options = {})
+         @auth_method = auth_method
+         @scope = "https://www.googleapis.com/auth/bigquery"
+         @options = options
+         @log = log
+         @num_errors_per_chunk = {}
+       end
+
+       def client
+         @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
+           cl.authorization = get_auth
+           cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec]
+           cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
+           cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
+         end
+       end
+
+       def create_table(project, dataset, table_id, record_schema)
+         create_table_retry_limit = 3
+         create_table_retry_wait = 1
+         create_table_retry_count = 0
+         table_id = safe_table_id(table_id)
+
+         begin
+           definition = {
+             table_reference: {
+               table_id: table_id,
+             },
+             schema: {
+               fields: record_schema.to_a,
+             }
+           }
+
+           definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+           definition.merge!(clustering: clustering) if clustering
+           client.insert_table(project, dataset, definition, {})
+           log.debug "create table", project_id: project, dataset: dataset, table: table_id
+         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+           message = e.message
+           if e.status_code == 409 && /Already Exists:/ =~ message
+             log.debug "already created table", project_id: project, dataset: dataset, table: table_id
+             # ignore 'Already Exists' error
+             return
+           end
+
+           log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
+
+           if create_table_retry_count < create_table_retry_limit
+             sleep create_table_retry_wait
+             create_table_retry_wait *= 2
+             create_table_retry_count += 1
+             retry
+           else
+             raise Fluent::BigQuery::UnRetryableError.new("failed to create table in bigquery", e)
+           end
+         end
+       end
+
+       def fetch_schema(project, dataset, table_id)
+         res = client.get_table(project, dataset, table_id)
+         schema = Fluent::BigQuery::Helper.deep_stringify_keys(res.schema.to_h[:fields])
+         log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}"
+
+         schema
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         message = e.message
+         log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
+         nil
+       end
+
+       def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil)
+         body = {
+           rows: rows,
+           skip_invalid_rows: @options[:skip_invalid_rows],
+           ignore_unknown_values: @options[:ignore_unknown_values],
+         }
+         body.merge!(template_suffix: template_suffix) if template_suffix
+
+         if @options[:auto_create_table]
+           res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+         else
+           res = client.insert_all_table_data(project, dataset, table_id, body, {})
+         end
+         log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
+
+         if res.insert_errors && !res.insert_errors.empty?
+           log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s
+           if @options[:allow_retry_insert_errors]
+             is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error|
+               insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) }
+             end
+             if is_included_any_retryable_insert_error
+               raise Fluent::BigQuery::RetryableError.new("failed to insert into bigquery(insert errors), retry")
+             else
+               raise Fluent::BigQuery::UnRetryableError.new("failed to insert into bigquery(insert errors), and cannot retry")
+             end
+           end
+         end
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message }
+         wrapped = Fluent::BigQuery::Error.wrap(e)
+         if wrapped.retryable?
+           log.warn "tabledata.insertAll API", error_data
+         else
+           log.error "tabledata.insertAll API", error_data
+         end
+
+         raise wrapped
+       end
+
+       JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id) do
+         def as_hash(*keys)
+           if keys.empty?
+             to_h
+           else
+             to_h.select { |k, _| keys.include?(k) }
+           end
+         end
+       end
+
+       def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields)
+         configuration = {
+           configuration: {
+             load: {
+               destination_table: {
+                 project_id: project,
+                 dataset_id: dataset,
+                 table_id: table_id,
+               },
+               write_disposition: "WRITE_APPEND",
+               source_format: source_format,
+               ignore_unknown_values: @options[:ignore_unknown_values],
+               max_bad_records: @options[:max_bad_records],
+             }
+           }
+         }
+
+         job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
+         configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
+
+         begin
+           # Check table existence
+           client.get_table(project, dataset, table_id)
+         rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+           if e.status_code == 404 && /Not Found: Table/i =~ e.message
+             raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table]
+             raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
+             configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
+             configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
+             configuration[:configuration][:load].merge!(clustering: clustering) if clustering
+           end
+         end
+
+         res = client.insert_job(
+           project,
+           configuration,
+           {
+             upload_source: upload_source,
+             content_type: "application/octet-stream",
+           }
+         )
+         JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message
+
+         if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+           return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
+         end
+
+         raise Fluent::BigQuery::Error.wrap(e)
+       end
+
+       def fetch_load_job(job_reference)
+         project = job_reference.project_id
+         job_id = job_reference.job_id
+         location = @options[:location]
+
+         res = client.get_job(project, job_id, location: location)
+         log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
+
+         if res.status.state == "DONE"
+           res
+         end
+       rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+         e = Fluent::BigQuery::Error.wrap(e)
+         raise e unless e.retryable?
+       end
+
+       def commit_load_job(chunk_id_hex, response)
+         job_id = response.id
+         project = response.configuration.load.destination_table.project_id
+         dataset = response.configuration.load.destination_table.dataset_id
+         table_id = response.configuration.load.destination_table.table_id
+
+         errors = response.status.errors
+         if errors
+           errors.each do |e|
+             log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
+           end
+         end
+
+         error_result = response.status.error_result
+         if error_result
+           log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
+           if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason)
+             @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1
+             raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry")
+           else
+             @num_errors_per_chunk.delete(chunk_id_hex)
+             raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry")
+           end
+         end
+
+         # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error."
+         stats = response.statistics.load
+         duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
+         log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
+         @num_errors_per_chunk.delete(chunk_id_hex)
+       end
+
+       private
+
+       def log
+         @log
+       end
+
+       def get_auth
+         case @auth_method
+         when :private_key
+           get_auth_from_private_key
+         when :compute_engine
+           get_auth_from_compute_engine
+         when :json_key
+           get_auth_from_json_key
+         when :application_default
+           get_auth_from_application_default
+         else
+           raise ConfigError, "Unknown auth method: #{@auth_method}"
+         end
+       end
+
+       def get_auth_from_private_key
+         require 'google/api_client/auth/key_utils'
+         private_key_path = @options[:private_key_path]
+         private_key_passphrase = @options[:private_key_passphrase]
+         email = @options[:email]
+
+         key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
+         Signet::OAuth2::Client.new(
+           token_credential_uri: "https://accounts.google.com/o/oauth2/token",
+           audience: "https://accounts.google.com/o/oauth2/token",
+           scope: @scope,
+           issuer: email,
+           signing_key: key
+         )
+       end
+
+       def get_auth_from_compute_engine
+         Google::Auth::GCECredentials.new
+       end
+
+       def get_auth_from_json_key
+         json_key = @options[:json_key]
+
+         begin
+           JSON.parse(json_key)
+           key = StringIO.new(json_key)
+           Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope)
+         rescue JSON::ParserError
+           key = json_key
+           File.open(json_key) do |f|
+             Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope)
+           end
+         end
+       end
+
+       def get_auth_from_application_default
+         Google::Auth.get_application_default([@scope])
+       end
+
+       def safe_table_id(table_id)
+         table_id.gsub(/\$\d+$/, "")
+       end
+
+       def create_job_id(chunk_id_hex, dataset, table, schema)
+         job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}"
+         @log.debug "job_id_key: #{job_id_key}"
+         "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
+       end
+
+       def source_format
+         case @options[:source_format]
+         when :json
+           "NEWLINE_DELIMITED_JSON"
+         when :avro
+           "AVRO"
+         when :csv
+           "CSV"
+         else
+           "NEWLINE_DELIMITED_JSON"
+         end
+       end
+
+       def time_partitioning
+         return @time_partitioning if instance_variable_defined?(:@time_partitioning)
+
+         if @options[:time_partitioning_type]
+           @time_partitioning = {
+             type: @options[:time_partitioning_type].to_s.upcase,
+             field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
+             expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
+           }.reject { |_, v| v.nil? }
+         else
+           @time_partitioning
+         end
+       end
+
+       def clustering
+         return @clustering if instance_variable_defined?(:@clustering)
+
+         if @options[:clustering_fields]
+           @clustering = {
+             fields: @options[:clustering_fields]
+           }
+         else
+           @clustering
+         end
+       end
+
+       def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+         try_count ||= 1
+         res = client.insert_all_table_data(project, dataset, table_id, body, {})
+       rescue Google::Apis::ClientError => e
+         if e.status_code == 404 && /Not Found: Table/i =~ e.message
+           if try_count == 1
+             # Table Not Found: Auto Create Table
+             create_table(project, dataset, table_id, schema)
+           elsif try_count > 10
+             raise "A new table was created but it is not found."
+           end
+
+           # Retry to insert several times because the created table is not visible from Streaming insert for a little while
+           # cf. https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
+           try_count += 1
+           sleep 5
+           log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id
+           retry
+         end
+         raise
+       end
+     end
+   end
+ end
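
The Writer above is the low-level client shared by the concrete output plugins: insert_rows wraps the streaming tabledata.insertAll call (with optional table auto-creation and retry classification via Fluent::BigQuery::Error), while create_load_job, fetch_load_job, and commit_load_job drive the batch load-job path. The sketch below shows, roughly, how it would be driven directly for a streaming insert; it is illustrative only, and both the row shape and the logger are assumptions not defined in this diff (rows follow the tabledata.insertAll {json: {...}} form, and the logger must accept Fluentd's structured "message, key: value" calls, which the plugins satisfy by passing their own @log).

# Illustrative sketch only -- not part of the package.
# $log is Fluentd's global logger (available inside a Fluentd process);
# a plain stdlib Logger would not accept the structured log calls used by Writer.
writer = Fluent::BigQuery::Writer.new($log, :json_key, {
  json_key: File.read("/path/to/service_account.json"),
  skip_invalid_rows: false,
  ignore_unknown_values: false,
  auto_create_table: false,
  timeout_sec: 60,
  open_timeout_sec: 60,
})

# The schema argument is only consulted when auto_create_table is enabled,
# so nil is acceptable here; failures surface as RetryableError/UnRetryableError.
rows = [{ json: { time: Time.now.to_i, message: "hello" } }]
writer.insert_rows("my-project", "my_dataset", "access_log", rows, nil)
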
@@ -0,0 +1,221 @@
+ require 'fluent/plugin/output'
+
+ require 'fluent/plugin/bigquery/version'
+
+ require 'fluent/plugin/bigquery/helper'
+ require 'fluent/plugin/bigquery/errors'
+ require 'fluent/plugin/bigquery/schema'
+ require 'fluent/plugin/bigquery/writer'
+
+ require 'multi_json'
+ require 'google/apis/bigquery_v2'
+ require 'googleauth'
+
+ module Fluent
+   module Plugin
+     # This class is an abstract class
+     class BigQueryBaseOutput < Output
+       helpers :inject, :formatter
+
+       # Available methods are:
+       # * private_key -- Use service account credential from pkcs12 private key file
+       # * compute_engine -- Use access token available in instances of ComputeEngine
+       # * json_key -- Use service account credential from JSON key
+       # * application_default -- Use application default credential
+       config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+       ### Service Account credential
+       config_param :email, :string, default: nil
+       config_param :private_key_path, :string, default: nil
+       config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+       config_param :json_key, default: nil, secret: true
+       # The geographic location of the job. Required except for US and EU.
+       # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
+       config_param :location, :string, default: nil
+
+       # see, as a simple reference:
+       # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+       config_param :project, :string
+
+       # dataset_name
+       #   The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+       #   but it cannot start with a number or underscore, or have spaces.
+       config_param :dataset, :string
+
+       # table_id
+       #   In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+       config_param :table, :string, default: nil
+       config_param :tables, :array, value_type: :string, default: nil
+
+       config_param :auto_create_table, :bool, default: false
+
+       # ignore_unknown_values
+       #   Accept rows that contain values that do not match the schema. The unknown values are ignored.
+       #   Default is false, which treats unknown values as errors.
+       config_param :ignore_unknown_values, :bool, default: false
+
+       config_param :schema, :array, default: nil
+       config_param :schema_path, :string, default: nil
+       config_param :fetch_schema, :bool, default: false
+       config_param :fetch_schema_table, :string, default: nil
+       config_param :schema_cache_expire, :time, default: 600
+
+       ## Timeout
+       # request_timeout_sec
+       #   Bigquery API response timeout
+       # request_open_timeout_sec
+       #   Bigquery API connection, and request timeout
+       config_param :request_timeout_sec, :time, default: nil
+       config_param :request_open_timeout_sec, :time, default: 60
+
+       ## Partitioning
+       config_param :time_partitioning_type, :enum, list: [:day], default: nil
+       config_param :time_partitioning_field, :string, default: nil
+       config_param :time_partitioning_expiration, :time, default: nil
+
+       ## Clustering
+       config_param :clustering_fields, :array, default: nil
+
+       ## Formatter
+       config_section :format do
+         config_set_default :@type, 'json'
+       end
+
+       def configure(conf)
+         super
+
+         case @auth_method
+         when :private_key
+           unless @email && @private_key_path
+             raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+           end
+         when :compute_engine
+           # Do nothing
+         when :json_key
+           unless @json_key
+             raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+           end
+         when :application_default
+           # Do nothing
+         else
+           raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+         end
+
+         unless @table.nil? ^ @tables.nil?
+           raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+         end
+
+         @tablelist = @tables ? @tables : [@table]
+
+         @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+         if @schema
+           @table_schema.load_schema(@schema)
+         end
+         if @schema_path
+           @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
+         end
+
+         formatter_config = conf.elements("format")[0]
+         @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
+       end
+
+       def start
+         super
+
+         @tables_queue = @tablelist.shuffle
+         @tables_mutex = Mutex.new
+         @fetched_schemas = {}
+         @last_fetch_schema_time = Hash.new(0)
+       end
+
+       def multi_workers_ready?
+         true
+       end
+
+       def writer
+         @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+           private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+           email: @email,
+           json_key: @json_key,
+           location: @location,
+           source_format: @source_format,
+           skip_invalid_rows: @skip_invalid_rows,
+           ignore_unknown_values: @ignore_unknown_values,
+           max_bad_records: @max_bad_records,
+           allow_retry_insert_errors: @allow_retry_insert_errors,
+           prevent_duplicate_load: @prevent_duplicate_load,
+           auto_create_table: @auto_create_table,
+           time_partitioning_type: @time_partitioning_type,
+           time_partitioning_field: @time_partitioning_field,
+           time_partitioning_expiration: @time_partitioning_expiration,
+           clustering_fields: @clustering_fields,
+           timeout_sec: @request_timeout_sec,
+           open_timeout_sec: @request_open_timeout_sec,
+         })
+       end
+
+       def format(tag, time, record)
+         record = inject_values_to_record(tag, time, record)
+
+         meta = metadata(tag, time, record)
+         schema =
+           if @fetch_schema
+             fetch_schema(meta)
+           else
+             @table_schema
+           end
+
+         begin
+           row = schema.format(record)
+           return if row.empty?
+           @formatter.format(tag, time, row)
+         rescue
+           log.error("format error", record: record, schema: schema)
+           raise
+         end
+       end
+
+       def write(chunk)
+       end
+
+       def fetch_schema(metadata)
+         table_id = nil
+         project = extract_placeholders(@project, metadata)
+         dataset = extract_placeholders(@dataset, metadata)
+         table_id = fetch_schema_target_table(metadata)
+
+         if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+           schema = writer.fetch_schema(project, dataset, table_id)
+
+           if schema
+             table_schema = Fluent::BigQuery::RecordSchema.new("record")
+             table_schema.load_schema(schema)
+             @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
+           else
+             if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil?
+               raise "failed to fetch schema from bigquery"
+             else
+               log.warn "#{table_id} uses previous schema"
+             end
+           end
+
+           @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
+         end
+
+         @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
+       end
+
+       def fetch_schema_target_table(metadata)
+         extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+       end
+
+       def get_schema(project, dataset, metadata)
+         if @fetch_schema
+           @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+         else
+           @table_schema
+         end
+       end
+     end
+   end
+ end
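
BigQueryBaseOutput is the abstract base: write(chunk) is intentionally empty, and the concrete streaming-insert and load-job outputs that subclass it are not part of this diff. As a hedged illustration of how the parameters declared above map onto a Fluentd configuration (the plugin type name bigquery_insert, the partitioning field, and the clustering fields are assumptions, not defined in this diff):

<match events.**>
  # plugin type assumed; the concrete output class is not registered in this diff
  @type bigquery_insert

  auth_method json_key
  # either the JSON string itself or a path to the key file (see Writer#get_auth_from_json_key)
  json_key /etc/fluentd/service_account.json

  project my-project
  dataset my_dataset
  table access_log

  auto_create_table true
  schema_path /etc/fluentd/access_log_schema.json

  # mapped to the writer's time_partitioning / clustering hashes
  time_partitioning_type day
  time_partitioning_field time
  clustering_fields ["user_id", "path"]

  <format>
    @type json
  </format>
</match>
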