fluent-plugin-bigquery 1.2.0 → 2.0.0.beta

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -0,0 +1,211 @@
+ require 'fluent/plugin/output'
+
+ require 'fluent/plugin/bigquery/version'
+
+ require 'fluent/plugin/bigquery/helper'
+ require 'fluent/plugin/bigquery/errors'
+ require 'fluent/plugin/bigquery/schema'
+ require 'fluent/plugin/bigquery/writer'
+
+ require 'multi_json'
+ require 'google/apis/bigquery_v2'
+ require 'googleauth'
+
+ module Fluent
+   module Plugin
+     # This class is abstract class
+     class BigQueryBaseOutput < Output
+       helpers :inject, :formatter
+
+       # Available methods are:
+       # * private_key -- Use service account credential from pkcs12 private key file
+       # * compute_engine -- Use access token available in instances of ComputeEngine
+       # * json_key -- Use service account credential from JSON key
+       # * application_default -- Use application default credential
+       config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+       ### Service Account credential
+       config_param :email, :string, default: nil
+       config_param :private_key_path, :string, default: nil
+       config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+       config_param :json_key, default: nil, secret: true
+
+       # see as simple reference
+       # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+       config_param :project, :string
+
+       # dataset_name
+       # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+       # but it cannot start with a number or underscore, or have spaces.
+       config_param :dataset, :string
+
+       # table_id
+       # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+       config_param :table, :string, default: nil
+       config_param :tables, :array, value_type: :string, default: nil
+
+       config_param :auto_create_table, :bool, default: false
+
+       # ignore_unknown_values
+       # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+       # Default is false, which treats unknown values as errors.
+       config_param :ignore_unknown_values, :bool, default: false
+
+       config_param :schema, :array, default: nil
+       config_param :schema_path, :string, default: nil
+       config_param :fetch_schema, :bool, default: false
+       config_param :fetch_schema_table, :string, default: nil
+       config_param :schema_cache_expire, :time, default: 600
+
+       ## Timeout
+       # request_timeout_sec
+       # Bigquery API response timeout
+       # request_open_timeout_sec
+       # Bigquery API connection, and request timeout
+       config_param :request_timeout_sec, :time, default: nil
+       config_param :request_open_timeout_sec, :time, default: 60
+
+       ## Partitioning
+       config_param :time_partitioning_type, :enum, list: [:day], default: nil
+       config_param :time_partitioning_expiration, :time, default: nil
+
+       ## Formatter
+       config_section :format do
+         config_set_default :@type, 'json'
+       end
+
+       def configure(conf)
+         super
+
+         case @auth_method
+         when :private_key
+           unless @email && @private_key_path
+             raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+           end
+         when :compute_engine
+           # Do nothing
+         when :json_key
+           unless @json_key
+             raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+           end
+         when :application_default
+           # Do nothing
+         else
+           raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+         end
+
+         unless @table.nil? ^ @tables.nil?
+           raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+         end
+
+         @tablelist = @tables ? @tables : [@table]
+
+         @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+         if @schema
+           @table_schema.load_schema(@schema)
+         end
+         if @schema_path
+           @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
+         end
+
+         formatter_config = conf.elements("format")[0]
+         @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
+       end
+
+       def start
+         super
+
+         @tables_queue = @tablelist.shuffle
+         @tables_mutex = Mutex.new
+         @fetched_schemas = {}
+         @last_fetch_schema_time = Hash.new(0)
+       end
+
+       def multi_workers_ready?
+         true
+       end
+
+       def writer
+         @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+           private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+           email: @email,
+           json_key: @json_key,
+           source_format: @source_format,
+           skip_invalid_rows: @skip_invalid_rows,
+           ignore_unknown_values: @ignore_unknown_values,
+           max_bad_records: @max_bad_records,
+           allow_retry_insert_errors: @allow_retry_insert_errors,
+           prevent_duplicate_load: @prevent_duplicate_load,
+           auto_create_table: @auto_create_table,
+           time_partitioning_type: @time_partitioning_type,
+           time_partitioning_expiration: @time_partitioning_expiration,
+           timeout_sec: @request_timeout_sec,
+           open_timeout_sec: @request_open_timeout_sec,
+         })
+       end
+
+       def format(tag, time, record)
+         record = inject_values_to_record(tag, time, record)
+
+         meta = metadata(tag, time, record)
+         schema =
+           if @fetch_schema
+             fetch_schema(meta)
+           else
+             @table_schema
+           end
+
+         begin
+           row = schema.format(record)
+           return if row.empty?
+           @formatter.format(tag, time, row)
+         rescue
+           log.error("format error", record: record, schema: schema)
+           raise
+         end
+       end
+
+       def write(chunk)
+       end
+
+       def fetch_schema(metadata)
+         table_id = nil
+         project = extract_placeholders(@project, metadata)
+         dataset = extract_placeholders(@dataset, metadata)
+         table_id = fetch_schema_target_table(metadata)
+
+         if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+           schema = writer.fetch_schema(project, dataset, table_id)
+
+           if schema
+             table_schema = Fluent::BigQuery::RecordSchema.new("record")
+             table_schema.load_schema(schema)
+             @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
+           else
+             if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
+               raise "failed to fetch schema from bigquery"
+             else
+               log.warn "#{table_id} uses previous schema"
+             end
+           end
+
+           @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
+         end
+
+         @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
+       end
+
+       def fetch_schema_target_table(metadata)
+         extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+       end
+
+       def get_schema(project, dataset, metadata)
+         if @fetch_schema
+           @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+         else
+           @table_schema
+         end
+       end
+     end
+   end
+ end
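
Aside (not part of the diff): a minimal Ruby sketch of the field-definition layout that `schema`/`schema_path` feed into `Fluent::BigQuery::RecordSchema#load_schema` above. The field names are hypothetical, and the string-keyed "name"/"type"/"mode"/"fields" layout is assumed to mirror the BigQuery table-schema JSON a `schema_path` file would contain.

    # Illustrative only: field names are hypothetical; the layout is assumed to
    # mirror the BigQuery table-schema JSON that schema_path would point at.
    require 'fluent/plugin/out_bigquery_base'

    fields = [
      { "name" => "request_id", "type" => "STRING" },
      { "name" => "status",     "type" => "INTEGER" },
      { "name" => "labels",     "type" => "RECORD", "mode" => "REPEATED",
        "fields" => [
          { "name" => "key",   "type" => "STRING" },
          { "name" => "value", "type" => "STRING" }
        ] }
    ]

    table_schema = Fluent::BigQuery::RecordSchema.new('record')
    table_schema.load_schema(fields)

    # BigQueryBaseOutput#format above calls schema.format(record) to turn a
    # fluentd record into the row hash handed to the JSON formatter.
    p table_schema.format("request_id" => "req-1", "status" => 200)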
@@ -0,0 +1,131 @@
+ require 'fluent/plugin/out_bigquery_base'
+
+ module Fluent
+   module Plugin
+     class BigQueryInsertOutput < BigQueryBaseOutput
+       Fluent::Plugin.register_output('bigquery_insert', self)
+
+       helpers :record_accessor
+
+       # template_suffix (only insert)
+       # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+       config_param :template_suffix, :string, default: nil
+
+       # skip_invalid_rows (only insert)
+       # Insert all valid rows of a request, even if invalid rows exist.
+       # The default value is false, which causes the entire request to fail if any invalid rows exist.
+       config_param :skip_invalid_rows, :bool, default: false
+
+       # insert_id_field (only insert)
+       config_param :insert_id_field, :string, default: nil
+
+       # add_insert_timestamp (only insert)
+       # adds a timestamp just before sending the rows to bigquery, so that
+       # buffering time is not taken into account. Gives a field in bigquery
+       # which represents the insert time of the row.
+       config_param :add_insert_timestamp, :string, default: nil
+
+       # allow_retry_insert_errors (only insert)
+       # If insert_id_field is not specified, true means to allow duplicate rows
+       config_param :allow_retry_insert_errors, :bool, default: false
+
+       ## Buffer
+       config_section :buffer do
+         config_set_default :@type, "memory"
+         config_set_default :flush_mode, :interval
+         config_set_default :flush_interval, 1
+         config_set_default :flush_thread_interval, 0.05
+         config_set_default :flush_thread_burst_interval, 0.05
+         config_set_default :chunk_limit_size, 1 * 1024 ** 2 # 1MB
+         config_set_default :total_limit_size, 1 * 1024 ** 3 # 1GB
+         config_set_default :chunk_limit_records, 500
+       end
+
+       def configure(conf)
+         super
+
+         if @insert_id_field
+           if @insert_id_field !~ /^\$[\[\.]/ && @insert_id_field =~ /\./
+             warn "[BREAKING CHANGE] insert_id_field format is changed. Use fluentd record_accessor helper. (https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)"
+           end
+           @get_insert_id = record_accessor_create(@insert_id_field)
+         end
+
+         formatter_config = conf.elements("format")[0]
+         if formatter_config && formatter_config['@type'] != "json"
+           raise ConfigError, "`bigquery_insert` supports only json formatter."
+         end
+         @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
+
+         placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
+         placeholder_validate!(:bigquery_insert, placeholder_params)
+       end
+
+       # for Fluent::Plugin::Output#implement? method
+       def format(tag, time, record)
+         super
+       end
+
+       def write(chunk)
+         table_format = @tables_mutex.synchronize do
+           t = @tables_queue.shift
+           @tables_queue.push t
+           t
+         end
+
+         now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
+
+         rows = chunk.open do |io|
+           io.map do |line|
+             record = MultiJson.load(line)
+             record[@add_insert_timestamp] = now if @add_insert_timestamp
+             row = {"json" => record}
+             row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+             Fluent::BigQuery::Helper.deep_symbolize_keys(row)
+           end
+         end
+
+         metadata = chunk.metadata
+         project = extract_placeholders(@project, metadata)
+         dataset = extract_placeholders(@dataset, metadata)
+         table_id = extract_placeholders(table_format, metadata)
+         template_suffix = @template_suffix ? extract_placeholders(@template_suffix, metadata) : nil
+         schema = get_schema(project, dataset, metadata)
+
+         insert(project, dataset, table_id, rows, schema, template_suffix)
+       end
+
+       def insert(project, dataset, table_id, rows, schema, template_suffix)
+         writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+       rescue Fluent::BigQuery::Error => e
+         if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+           # Table Not Found: Auto Create Table
+           writer.create_table(project, dataset, table_id, schema)
+           raise "table created. send rows next time."
+         end
+
+         raise if e.retryable?
+
+         if @secondary
+           # TODO: find better way
+           @retry = retry_state_create(
+             :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+             forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+             max_interval: @buffer_config.retry_max_interval,
+             secondary: true, secondary_threshold: Float::EPSILON,
+             randomize: @buffer_config.retry_randomize
+           )
+         else
+           @retry = retry_state_create(
+             :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+             forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+             max_interval: @buffer_config.retry_max_interval,
+             randomize: @buffer_config.retry_randomize
+           )
+         end
+
+         raise
+       end
+     end
+   end
+ end
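
Aside (not part of the diff): a minimal configuration sketch for the new `bigquery_insert` output, driven through fluentd's bundled test driver. All identifiers (project, dataset, table, key path, `$.request_id`) are placeholders; only `configure` is exercised, which is where the record_accessor-based `insert_id_field` and the placeholder validation shown above take effect.

    # Hypothetical values throughout: my-project, my_dataset, access_log and the
    # key path are placeholders, not defaults of the plugin.
    require 'fluent/test'
    require 'fluent/test/driver/output'
    require 'fluent/plugin/out_bigquery_insert'

    Fluent::Test.setup

    conf = <<~CONF
      auth_method json_key
      json_key /path/to/service_account.json
      project my-project
      dataset my_dataset
      table access_log
      insert_id_field $.request_id
    CONF

    driver = Fluent::Test::Driver::Output.new(Fluent::Plugin::BigQueryInsertOutput).configure(conf)
    plugin = driver.instance
    # Prints the raw config value; configure built a record_accessor from it
    # (stored in @get_insert_id) for per-row insert IDs.
    p plugin.insert_id_field   # => "$.request_id"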
@@ -0,0 +1,220 @@
+ require 'fluent/plugin/out_bigquery_base'
+
+ module Fluent
+   module Plugin
+     class BigQueryLoadOutput < BigQueryBaseOutput
+       Fluent::Plugin.register_output('bigquery_load', self)
+
+       helpers :timer
+
+       config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
+
+       # max_bad_records (only load)
+       # The maximum number of bad records that BigQuery can ignore when running the job.
+       # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+       # The default value is 0, which requires that all records are valid.
+       config_param :max_bad_records, :integer, default: 0
+
+       # prevent_duplicate_load (only load)
+       config_param :prevent_duplicate_load, :bool, default: false
+
+       config_param :use_delayed_commit, :bool, default: true
+       config_param :wait_job_interval, :time, default: 3
+
+       ## Buffer
+       config_section :buffer do
+         config_set_default :@type, "file"
+         config_set_default :flush_mode, :interval
+         config_set_default :flush_interval, 3600 # 1h
+         config_set_default :flush_thread_interval, 5
+         config_set_default :flush_thread_burst_interval, 5
+         config_set_default :chunk_limit_size, 1 * 1024 ** 3 # 1GB
+         config_set_default :total_limit_size, 32 * 1024 ** 3 # 32GB
+
+         config_set_default :delayed_commit_timeout, 1800 # 30m
+       end
+
+       def configure(conf)
+         super
+
+         placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}"
+         placeholder_validate!(:bigquery_load, placeholder_params)
+       end
+
+       def start
+         super
+
+         if prefer_delayed_commit
+           @polling_targets = []
+           @polling_mutex = Mutex.new
+           log.debug("start load job polling")
+           timer_execute(:polling_bigquery_load_job, @wait_job_interval, &method(:poll))
+         end
+       end
+
+       def prefer_delayed_commit
+         @use_delayed_commit
+       end
+
+       # for Fluent::Plugin::Output#implement? method
+       def format(tag, time, record)
+         super
+       end
+
+       def write(chunk)
+         job_reference = do_write(chunk)
+
+         until response = writer.fetch_load_job(job_reference)
+           sleep @wait_job_interval
+         end
+
+         writer.commit_load_job(job_reference.chunk_id_hex, response)
+       rescue Fluent::BigQuery::Error => e
+         raise if e.retryable?
+
+         @retry_mutex.synchronize do
+           if @secondary
+             # TODO: find better way
+             @retry = retry_state_create(
+               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+               forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+               max_interval: @buffer_config.retry_max_interval,
+               secondary: true, secondary_threshold: Float::EPSILON,
+               randomize: @buffer_config.retry_randomize
+             )
+           else
+             @retry = retry_state_create(
+               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+               forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+               max_interval: @buffer_config.retry_max_interval,
+               randomize: @buffer_config.retry_randomize
+             )
+           end
+         end
+
+         raise
+       end
+
+       def try_write(chunk)
+         job_reference = do_write(chunk)
+         @polling_mutex.synchronize do
+           @polling_targets << job_reference
+         end
+       rescue Fluent::BigQuery::Error => e
+         raise if e.retryable?
+
+         @retry_mutex.synchronize do
+           if @secondary
+             # TODO: find better way
+             @retry = retry_state_create(
+               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+               forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+               max_interval: @buffer_config.retry_max_interval,
+               secondary: true, secondary_threshold: Float::EPSILON,
+               randomize: @buffer_config.retry_randomize
+             )
+           else
+             @retry = retry_state_create(
+               :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+               forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+               max_interval: @buffer_config.retry_max_interval,
+               randomize: @buffer_config.retry_randomize
+             )
+           end
+         end
+
+         raise
+       end
+
+       private
+
+       def do_write(chunk)
+         table_format = @tables_mutex.synchronize do
+           t = @tables_queue.shift
+           @tables_queue.push t
+           t
+         end
+
+         metadata = chunk.metadata
+         project = extract_placeholders(@project, metadata)
+         dataset = extract_placeholders(@dataset, metadata)
+         table_id = extract_placeholders(table_format, metadata)
+         schema = get_schema(project, dataset, metadata)
+
+         create_upload_source(chunk) do |upload_source|
+           writer.create_load_job(chunk.unique_id, dump_unique_id_hex(chunk.unique_id), project, dataset, table_id, upload_source, schema)
+         end
+       end
+
+       def poll
+         job_reference = @polling_mutex.synchronize do
+           @polling_targets.shift
+         end
+         return unless job_reference
+
+         begin
+           response = writer.fetch_load_job(job_reference)
+           if response
+             writer.commit_load_job(job_reference.chunk_id_hex, response)
+             commit_write(job_reference.chunk_id)
+             log.debug("commit chunk", chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+           else
+             @polling_mutex.synchronize do
+               @polling_targets << job_reference
+             end
+           end
+         rescue Fluent::BigQuery::Error => e
+           # RetryableError comes from only `commit_load_job`
+           # if error is retryable, takeback chunk and do next `try_flush`
+           # if error is not retryable, create custom retry_state and takeback chunk do next `try_flush`
+           if e.retryable?
+             log.warn("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+           else
+             log.error("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+             @retry_mutex.synchronize do
+               if @secondary
+                 # TODO: find better way
+                 @retry = retry_state_create(
+                   :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                   forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                   max_interval: @buffer_config.retry_max_interval,
+                   secondary: true, secondary_threshold: Float::EPSILON,
+                   randomize: @buffer_config.retry_randomize
+                 )
+               else
+                 @retry = retry_state_create(
+                   :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                   forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                   max_interval: @buffer_config.retry_max_interval,
+                   randomize: @buffer_config.retry_randomize
+                 )
+               end
+             end
+           end
+
+           rollback_write(job_reference.chunk_id)
+         rescue => e
+           log.error("unexpected error while polling", error: e)
+           log.error_backtrace
+         end
+       end
+
+       def create_upload_source(chunk)
+         chunk_is_file = @buffer_config["@type"] == 'file'
+         if chunk_is_file
+           File.open(chunk.path) do |file|
+             yield file
+           end
+         else
+           Tempfile.open("chunk-tmp") do |file|
+             file.binmode
+             chunk.write_to(file)
+             file.sync
+             file.rewind
+             yield file
+           end
+         end
+       end
+     end
+   end
+ end
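
Aside (not part of the diff): the matching sketch for the new `bigquery_load` output; the project, dataset, table, key path, and buffer path are placeholders. With `use_delayed_commit` left at its default of true, `try_write` enqueues the load job and the timer-driven `poll` above commits or rolls back the chunk.

    # Hypothetical values: project/dataset/table names, the key path and the
    # buffer path are placeholders chosen for this sketch.
    require 'fluent/test'
    require 'fluent/test/driver/output'
    require 'fluent/plugin/out_bigquery_load'

    Fluent::Test.setup

    conf = <<~CONF
      auth_method json_key
      json_key /path/to/service_account.json
      project my-project
      dataset my_dataset
      table access_log
      source_format json
      <buffer>
        @type file
        path /var/log/fluent/bigquery_load_buffer
      </buffer>
    CONF

    driver = Fluent::Test::Driver::Output.new(Fluent::Plugin::BigQueryLoadOutput).configure(conf)
    # true by default, so flushing goes through try_write + the poll timer
    p driver.instance.prefer_delayed_commit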