fluent-plugin-bigquery-test 2.2.0

@@ -0,0 +1,5 @@
+module Fluent
+  module BigQueryPlugin
+    VERSION = "2.2.0".freeze
+  end
+end
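The version constant above is what the gem's own metadata would reference. As a rough, hypothetical illustration (the gemspec is not part of this diff), it is typically consumed along these lines:

# Hypothetical gemspec excerpt -- not included in this diff; shown only to
# illustrate how Fluent::BigQueryPlugin::VERSION is usually wired up.
require_relative 'lib/fluent/plugin/bigquery/version'

Gem::Specification.new do |spec|
  spec.name    = "fluent-plugin-bigquery-test"
  spec.version = Fluent::BigQueryPlugin::VERSION
end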
@@ -0,0 +1,356 @@
+module Fluent
+  module BigQuery
+    class Writer
+      def initialize(log, auth_method, options = {})
+        @auth_method = auth_method
+        @scope = "https://www.googleapis.com/auth/bigquery"
+        @options = options
+        @log = log
+        @num_errors_per_chunk = {}
+      end
+
+      def client
+        @client ||= Google::Apis::BigqueryV2::BigqueryService.new.tap do |cl|
+          cl.authorization = get_auth
+          cl.client_options.open_timeout_sec = @options[:open_timeout_sec] if @options[:open_timeout_sec]
+          cl.client_options.read_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
+          cl.client_options.send_timeout_sec = @options[:timeout_sec] if @options[:timeout_sec]
+        end
+      end
+
+      def create_table(project, dataset, table_id, record_schema)
+        create_table_retry_limit = 3
+        create_table_retry_wait = 1
+        create_table_retry_count = 0
+        table_id = safe_table_id(table_id)
+
+        begin
+          definition = {
+            table_reference: {
+              table_id: table_id,
+            },
+            schema: {
+              fields: record_schema.to_a,
+            }
+          }
+
+          definition.merge!(time_partitioning: time_partitioning) if time_partitioning
+          definition.merge!(clustering: clustering) if clustering
+          client.insert_table(project, dataset, definition, {})
+          log.debug "create table", project_id: project, dataset: dataset, table: table_id
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+          message = e.message
+          if e.status_code == 409 && /Already Exists:/ =~ message
+            log.debug "already created table", project_id: project, dataset: dataset, table: table_id
+            # ignore 'Already Exists' error
+            return
+          end
+
+          log.error "tables.insert API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
+
+          if create_table_retry_count < create_table_retry_limit
+            sleep create_table_retry_wait
+            create_table_retry_wait *= 2
+            create_table_retry_count += 1
+            retry
+          else
+            raise Fluent::BigQuery::UnRetryableError.new("failed to create table in bigquery", e)
+          end
+        end
+      end
+
+      def fetch_schema(project, dataset, table_id)
+        res = client.get_table(project, dataset, table_id)
+        schema = Fluent::BigQuery::Helper.deep_stringify_keys(res.schema.to_h[:fields])
+        log.debug "Load schema from BigQuery: #{project}:#{dataset}.#{table_id} #{schema}"
+
+        schema
+      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+        message = e.message
+        log.error "tables.get API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: message
+        nil
+      end
+
+      def insert_rows(project, dataset, table_id, rows, schema, template_suffix: nil)
+        body = {
+          rows: rows,
+          skip_invalid_rows: @options[:skip_invalid_rows],
+          ignore_unknown_values: @options[:ignore_unknown_values],
+        }
+        body.merge!(template_suffix: template_suffix) if template_suffix
+
+        if @options[:auto_create_table]
+          res = insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+        else
+          res = client.insert_all_table_data(project, dataset, table_id, body, {})
+        end
+        log.debug "insert rows", project_id: project, dataset: dataset, table: table_id, count: rows.size
+
+        if res.insert_errors && !res.insert_errors.empty?
+          log.warn "insert errors", project_id: project, dataset: dataset, table: table_id, insert_errors: res.insert_errors.to_s
+          if @options[:allow_retry_insert_errors]
+            is_included_any_retryable_insert_error = res.insert_errors.any? do |insert_error|
+              insert_error.errors.any? { |error| Fluent::BigQuery::Error.retryable_insert_errors_reason?(error.reason) }
+            end
+            if is_included_any_retryable_insert_error
+              raise Fluent::BigQuery::RetryableError.new("failed to insert into bigquery(insert errors), retry")
+            else
+              raise Fluent::BigQuery::UnRetryableError.new("failed to insert into bigquery(insert errors), and cannot retry")
+            end
+          end
+        end
+      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+        error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message }
+        wrapped = Fluent::BigQuery::Error.wrap(e)
+        if wrapped.retryable?
+          log.warn "tabledata.insertAll API", error_data
+        else
+          log.error "tabledata.insertAll API", error_data
+        end
+
+        raise wrapped
+      end
+
+      JobReference = Struct.new(:chunk_id, :chunk_id_hex, :project_id, :dataset_id, :table_id, :job_id) do
+        def as_hash(*keys)
+          if keys.empty?
+            to_h
+          else
+            to_h.select { |k, _| keys.include?(k) }
+          end
+        end
+      end
+
+      def create_load_job(chunk_id, chunk_id_hex, project, dataset, table_id, upload_source, fields)
+        configuration = {
+          configuration: {
+            load: {
+              destination_table: {
+                project_id: project,
+                dataset_id: dataset,
+                table_id: table_id,
+              },
+              write_disposition: "WRITE_APPEND",
+              source_format: source_format,
+              ignore_unknown_values: @options[:ignore_unknown_values],
+              max_bad_records: @options[:max_bad_records],
+            }
+          }
+        }
+
+        job_id = create_job_id(chunk_id_hex, dataset, table_id, fields.to_a) if @options[:prevent_duplicate_load]
+        configuration.merge!({job_reference: {project_id: project, job_id: job_id}}) if job_id
+
+        begin
+          # Check table existence
+          client.get_table(project, dataset, table_id)
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+          if e.status_code == 404 && /Not Found: Table/i =~ e.message
+            raise Fluent::BigQuery::UnRetryableError.new("Table is not found") unless @options[:auto_create_table]
+            raise Fluent::BigQuery::UnRetryableError.new("Schema is empty") if fields.empty?
+            configuration[:configuration][:load].merge!(schema: {fields: fields.to_a})
+            configuration[:configuration][:load].merge!(time_partitioning: time_partitioning) if time_partitioning
+            configuration[:configuration][:load].merge!(clustering: clustering) if clustering
+          end
+        end
+
+        res = client.insert_job(
+          project,
+          configuration,
+          {
+            upload_source: upload_source,
+            content_type: "application/octet-stream",
+          }
+        )
+        JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, res.job_reference.job_id)
+      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+        log.error "job.load API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message
+
+        if job_id && e.status_code == 409 && e.message =~ /Job/ # duplicate load job
+          return JobReference.new(chunk_id, chunk_id_hex, project, dataset, table_id, job_id)
+        end
+
+        raise Fluent::BigQuery::Error.wrap(e)
+      end
+
+      def fetch_load_job(job_reference)
+        project = job_reference.project_id
+        job_id = job_reference.job_id
+        location = @options[:location]
+
+        res = client.get_job(project, job_id, location: location)
+        log.debug "load job fetched", id: job_id, state: res.status.state, **job_reference.as_hash(:project_id, :dataset_id, :table_id)
+
+        if res.status.state == "DONE"
+          res
+        end
+      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+        e = Fluent::BigQuery::Error.wrap(e)
+        raise e unless e.retryable?
+      end
+
+      def commit_load_job(chunk_id_hex, response)
+        job_id = response.id
+        project = response.configuration.load.destination_table.project_id
+        dataset = response.configuration.load.destination_table.dataset_id
+        table_id = response.configuration.load.destination_table.table_id
+
+        errors = response.status.errors
+        if errors
+          errors.each do |e|
+            log.error "job.load API (rows)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: e.message, reason: e.reason
+          end
+        end
+
+        error_result = response.status.error_result
+        if error_result
+          log.error "job.load API (result)", job_id: job_id, project_id: project, dataset: dataset, table: table_id, message: error_result.message, reason: error_result.reason
+          if Fluent::BigQuery::Error.retryable_error_reason?(error_result.reason)
+            @num_errors_per_chunk[chunk_id_hex] = @num_errors_per_chunk[chunk_id_hex].to_i + 1
+            raise Fluent::BigQuery::RetryableError.new("failed to load into bigquery, retry")
+          else
+            @num_errors_per_chunk.delete(chunk_id_hex)
+            raise Fluent::BigQuery::UnRetryableError.new("failed to load into bigquery, and cannot retry")
+          end
+        end
+
+        # `stats` can be nil if we receive a warning like "Warning: Load job succeeded with data imported, however statistics may be lost due to internal error."
+        stats = response.statistics.load
+        duration = (response.statistics.end_time - response.statistics.creation_time) / 1000.0
+        log.debug "load job finished", id: job_id, state: response.status.state, input_file_bytes: stats&.input_file_bytes, input_files: stats&.input_files, output_bytes: stats&.output_bytes, output_rows: stats&.output_rows, bad_records: stats&.bad_records, duration: duration.round(2), project_id: project, dataset: dataset, table: table_id
+        @num_errors_per_chunk.delete(chunk_id_hex)
+      end
+
+      private
+
+      def log
+        @log
+      end
+
+      def get_auth
+        case @auth_method
+        when :private_key
+          get_auth_from_private_key
+        when :compute_engine
+          get_auth_from_compute_engine
+        when :json_key
+          get_auth_from_json_key
+        when :application_default
+          get_auth_from_application_default
+        else
+          raise ConfigError, "Unknown auth method: #{@auth_method}"
+        end
+      end
+
+      def get_auth_from_private_key
+        require 'google/api_client/auth/key_utils'
+        private_key_path = @options[:private_key_path]
+        private_key_passphrase = @options[:private_key_passphrase]
+        email = @options[:email]
+
+        key = Google::APIClient::KeyUtils.load_from_pkcs12(private_key_path, private_key_passphrase)
+        Signet::OAuth2::Client.new(
+          token_credential_uri: "https://accounts.google.com/o/oauth2/token",
+          audience: "https://accounts.google.com/o/oauth2/token",
+          scope: @scope,
+          issuer: email,
+          signing_key: key
+        )
+      end
+
+      def get_auth_from_compute_engine
+        Google::Auth::GCECredentials.new
+      end
+
+      def get_auth_from_json_key
+        json_key = @options[:json_key]
+
+        begin
+          JSON.parse(json_key)
+          key = StringIO.new(json_key)
+          Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: @scope)
+        rescue JSON::ParserError
+          key = json_key
+          File.open(json_key) do |f|
+            Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: @scope)
+          end
+        end
+      end
+
+      def get_auth_from_application_default
+        Google::Auth.get_application_default([@scope])
+      end
+
+      def safe_table_id(table_id)
+        table_id.gsub(/\$\d+$/, "")
+      end
+
+      def create_job_id(chunk_id_hex, dataset, table, schema)
+        job_id_key = "#{chunk_id_hex}#{dataset}#{table}#{schema.to_s}#{@options[:max_bad_records]}#{@options[:ignore_unknown_values]}#{@num_errors_per_chunk[chunk_id_hex]}"
+        @log.debug "job_id_key: #{job_id_key}"
+        "fluentd_job_" + Digest::SHA1.hexdigest(job_id_key)
+      end
+
+      def source_format
+        case @options[:source_format]
+        when :json
+          "NEWLINE_DELIMITED_JSON"
+        when :avro
+          "AVRO"
+        when :csv
+          "CSV"
+        else
+          "NEWLINE_DELIMITED_JSON"
+        end
+      end
+
+      def time_partitioning
+        return @time_partitioning if instance_variable_defined?(:@time_partitioning)
+
+        if @options[:time_partitioning_type]
+          @time_partitioning = {
+            type: @options[:time_partitioning_type].to_s.upcase,
+            field: @options[:time_partitioning_field] ? @options[:time_partitioning_field].to_s : nil,
+            expiration_ms: @options[:time_partitioning_expiration] ? @options[:time_partitioning_expiration] * 1000 : nil,
+          }.reject { |_, v| v.nil? }
+        else
+          @time_partitioning
+        end
+      end
+
+      def clustering
+        return @clustering if instance_variable_defined?(:@clustering)
+
+        if @options[:clustering_fields]
+          @clustering = {
+            fields: @options[:clustering_fields]
+          }
+        else
+          @clustering
+        end
+      end
+
+      def insert_all_table_data_with_create_table(project, dataset, table_id, body, schema)
+        try_count ||= 1
+        res = client.insert_all_table_data(project, dataset, table_id, body, {})
+      rescue Google::Apis::ClientError => e
+        if e.status_code == 404 && /Not Found: Table/i =~ e.message
+          if try_count == 1
+            # Table Not Found: Auto Create Table
+            create_table(project, dataset, table_id, schema)
+          elsif try_count > 10
+            raise "A new table was created but it is not found."
+          end
+
+          # Retry the insert several times because the created table is not visible to Streaming insert for a little while
+          # cf. https://cloud.google.com/bigquery/troubleshooting-errors#metadata-errors-for-streaming-inserts
+          try_count += 1
+          sleep 5
+          log.debug "Retry to insert rows", project_id: project, dataset: dataset, table: table_id
+          retry
+        end
+        raise
+      end
+    end
+  end
+end
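The Writer above is normally constructed and driven by the output plugin in the next file, but a minimal standalone sketch of the streaming-insert path looks roughly like the following. The project, dataset, table and row values are placeholders, and `log` is assumed to be a Fluentd-compatible logger; none of this is part of the diff itself.

# Rough usage sketch (assumptions: `log` is a Fluentd-style logger, the JSON key
# file exists, and the destination table already has a matching schema).
writer = Fluent::BigQuery::Writer.new(log, :json_key, {
  json_key: "/path/to/service_account.json",
  skip_invalid_rows: false,
  ignore_unknown_values: false,
  auto_create_table: false,
})

# Rows are passed straight through to tabledata.insertAll, typically wrapped as
# {json: {...}} hashes; the schema argument is only consulted when
# auto_create_table is enabled, so nil is passed here.
rows = [{ json: { "message" => "hello", "time" => Time.now.to_i } }]
writer.insert_rows("my-project", "my_dataset", "my_table", rows, nil)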
@@ -0,0 +1,221 @@
+require 'fluent/plugin/output'
+
+require 'fluent/plugin/bigquery/version'
+
+require 'fluent/plugin/bigquery/helper'
+require 'fluent/plugin/bigquery/errors'
+require 'fluent/plugin/bigquery/schema'
+require 'fluent/plugin/bigquery/writer'
+
+require 'multi_json'
+require 'google/apis/bigquery_v2'
+require 'googleauth'
+
+module Fluent
+  module Plugin
+    # This class is an abstract class
+    class BigQueryBaseOutput < Output
+      helpers :inject, :formatter
+
+      # Available methods are:
+      # * private_key -- Use service account credential from pkcs12 private key file
+      # * compute_engine -- Use access token available in instances of ComputeEngine
+      # * json_key -- Use service account credential from JSON key
+      # * application_default -- Use application default credential
+      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+      ### Service Account credential
+      config_param :email, :string, default: nil
+      config_param :private_key_path, :string, default: nil
+      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+      config_param :json_key, default: nil, secret: true
+      # The geographic location of the job. Required except for US and EU.
+      # https://github.com/googleapis/google-api-ruby-client/blob/master/generated/google/apis/bigquery_v2/service.rb#L350
+      config_param :location, :string, default: nil
+
+      # see as simple reference
+      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+      config_param :project, :string
+
+      # dataset_name
+      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+      # but it cannot start with a number or underscore, or have spaces.
+      config_param :dataset, :string
+
+      # table_id
+      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+      config_param :table, :string, default: nil
+      config_param :tables, :array, value_type: :string, default: nil
+
+      config_param :auto_create_table, :bool, default: false
+
+      # ignore_unknown_values
+      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+      # Default is false, which treats unknown values as errors.
+      config_param :ignore_unknown_values, :bool, default: false
+
+      config_param :schema, :array, default: nil
+      config_param :schema_path, :string, default: nil
+      config_param :fetch_schema, :bool, default: false
+      config_param :fetch_schema_table, :string, default: nil
+      config_param :schema_cache_expire, :time, default: 600
+
+      ## Timeout
+      # request_timeout_sec
+      #   Bigquery API response timeout
+      # request_open_timeout_sec
+      #   Bigquery API connection, and request timeout
+      config_param :request_timeout_sec, :time, default: nil
+      config_param :request_open_timeout_sec, :time, default: 60
+
+      ## Partitioning
+      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+      config_param :time_partitioning_field, :string, default: nil
+      config_param :time_partitioning_expiration, :time, default: nil
+
+      ## Clustering
+      config_param :clustering_fields, :array, default: nil
+
+      ## Formatter
+      config_section :format do
+        config_set_default :@type, 'json'
+      end
+
+      def configure(conf)
+        super
+
+        case @auth_method
+        when :private_key
+          unless @email && @private_key_path
+            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+          end
+        when :compute_engine
+          # Do nothing
+        when :json_key
+          unless @json_key
+            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+          end
+        when :application_default
+          # Do nothing
+        else
+          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+        end
+
+        unless @table.nil? ^ @tables.nil?
+          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+        end
+
+        @tablelist = @tables ? @tables : [@table]
+
+        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+        if @schema
+          @table_schema.load_schema(@schema)
+        end
+        if @schema_path
+          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
+        end
+
+        formatter_config = conf.elements("format")[0]
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
+      end
+
+      def start
+        super
+
+        @tables_queue = @tablelist.shuffle
+        @tables_mutex = Mutex.new
+        @fetched_schemas = {}
+        @last_fetch_schema_time = Hash.new(0)
+      end
+
+      def multi_workers_ready?
+        true
+      end
+
+      def writer
+        @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+          email: @email,
+          json_key: @json_key,
+          location: @location,
+          source_format: @source_format,
+          skip_invalid_rows: @skip_invalid_rows,
+          ignore_unknown_values: @ignore_unknown_values,
+          max_bad_records: @max_bad_records,
+          allow_retry_insert_errors: @allow_retry_insert_errors,
+          prevent_duplicate_load: @prevent_duplicate_load,
+          auto_create_table: @auto_create_table,
+          time_partitioning_type: @time_partitioning_type,
+          time_partitioning_field: @time_partitioning_field,
+          time_partitioning_expiration: @time_partitioning_expiration,
+          clustering_fields: @clustering_fields,
+          timeout_sec: @request_timeout_sec,
+          open_timeout_sec: @request_open_timeout_sec,
+        })
+      end
+
+      def format(tag, time, record)
+        record = inject_values_to_record(tag, time, record)
+
+        meta = metadata(tag, time, record)
+        schema =
+          if @fetch_schema
+            fetch_schema(meta)
+          else
+            @table_schema
+          end
+
+        begin
+          row = schema.format(record)
+          return if row.empty?
+          @formatter.format(tag, time, row)
+        rescue
+          log.error("format error", record: record, schema: schema)
+          raise
+        end
+      end
+
+      def write(chunk)
+      end
+
+      def fetch_schema(metadata)
+        table_id = nil
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = fetch_schema_target_table(metadata)
+
+        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+          schema = writer.fetch_schema(project, dataset, table_id)
+
+          if schema
+            table_schema = Fluent::BigQuery::RecordSchema.new("record")
+            table_schema.load_schema(schema)
+            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
+          else
+            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].nil?
+              raise "failed to fetch schema from bigquery"
+            else
+              log.warn "#{table_id} uses previous schema"
+            end
+          end
+
+          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
+        end
+
+        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
+      end
+
+      def fetch_schema_target_table(metadata)
+        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+      end
+
+      def get_schema(project, dataset, metadata)
+        if @fetch_schema
+          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+        else
+          @table_schema
+        end
+      end
+    end
+  end
+end
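BigQueryBaseOutput leaves write(chunk) empty; the concrete insert/load outputs that subclass it are not included in this section. As a purely hypothetical sketch (the class name, plugin name and chunk handling are assumptions, not the gem's actual subclasses), a streaming-insert subclass would tie the pieces together roughly like this:

# Hypothetical subclass sketch -- illustrates how the base class helpers
# (@tables_queue/@tables_mutex from #start, #writer, #get_schema and
# extract_placeholders) are meant to be combined; not code from this gem.
module Fluent
  module Plugin
    class MyBigQueryInsertOutput < BigQueryBaseOutput
      Fluent::Plugin.register_output('my_bigquery_insert', self)

      def write(chunk)
        # Rotate through the configured tables.
        table_format = @tables_mutex.synchronize do
          t = @tables_queue.shift
          @tables_queue.push(t)
          t
        end

        metadata = chunk.metadata
        project  = extract_placeholders(@project, metadata)
        dataset  = extract_placeholders(@dataset, metadata)
        table_id = extract_placeholders(table_format, metadata)
        schema   = get_schema(project, dataset, metadata)

        # Each line in the chunk was produced by #format (JSON by default).
        rows = chunk.read.each_line.map { |line| { json: MultiJson.load(line) } }
        writer.insert_rows(project, dataset, table_id, rows, schema)
      end
    end
  end
end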