fluent-plugin-bigquery 1.2.0 → 2.0.0.beta

@@ -0,0 +1,211 @@
+require 'fluent/plugin/output'
+
+require 'fluent/plugin/bigquery/version'
+
+require 'fluent/plugin/bigquery/helper'
+require 'fluent/plugin/bigquery/errors'
+require 'fluent/plugin/bigquery/schema'
+require 'fluent/plugin/bigquery/writer'
+
+require 'multi_json'
+require 'google/apis/bigquery_v2'
+require 'googleauth'
+
+module Fluent
+  module Plugin
+    # This class is an abstract class
+    class BigQueryBaseOutput < Output
+      helpers :inject, :formatter
+
+      # Available methods are:
+      # * private_key -- Use service account credential from pkcs12 private key file
+      # * compute_engine -- Use access token available in instances of ComputeEngine
+      # * json_key -- Use service account credential from JSON key
+      # * application_default -- Use application default credential
+      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+      ### Service Account credential
+      config_param :email, :string, default: nil
+      config_param :private_key_path, :string, default: nil
+      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+      config_param :json_key, default: nil, secret: true
+
+      # see the following as a simple reference
+      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+      config_param :project, :string
+
+      # dataset_name
+      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+      # but it cannot start with a number or underscore, or have spaces.
+      config_param :dataset, :string
+
+      # table_id
+      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+      config_param :table, :string, default: nil
+      config_param :tables, :array, value_type: :string, default: nil
+
+      config_param :auto_create_table, :bool, default: false
+
+      # ignore_unknown_values
+      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+      # Default is false, which treats unknown values as errors.
+      config_param :ignore_unknown_values, :bool, default: false
+
+      config_param :schema, :array, default: nil
+      config_param :schema_path, :string, default: nil
+      config_param :fetch_schema, :bool, default: false
+      config_param :fetch_schema_table, :string, default: nil
+      config_param :schema_cache_expire, :time, default: 600
+
+      ## Timeout
+      # request_timeout_sec
+      # BigQuery API response timeout
+      # request_open_timeout_sec
+      # BigQuery API connection and request timeout
+      config_param :request_timeout_sec, :time, default: nil
+      config_param :request_open_timeout_sec, :time, default: 60
+
+      ## Partitioning
+      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+      config_param :time_partitioning_expiration, :time, default: nil
+
+      ## Formatter
+      config_section :format do
+        config_set_default :@type, 'json'
+      end
+
+      def configure(conf)
+        super
+
+        case @auth_method
+        when :private_key
+          unless @email && @private_key_path
+            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+          end
+        when :compute_engine
+          # Do nothing
+        when :json_key
+          unless @json_key
+            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+          end
+        when :application_default
+          # Do nothing
+        else
+          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+        end
+
+        unless @table.nil? ^ @tables.nil?
+          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+        end
+
+        @tablelist = @tables ? @tables : [@table]
+
+        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+        if @schema
+          @table_schema.load_schema(@schema)
+        end
+        if @schema_path
+          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
+        end
+
+        formatter_config = conf.elements("format")[0]
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
+      end
+
+      def start
+        super
+
+        @tables_queue = @tablelist.shuffle
+        @tables_mutex = Mutex.new
+        @fetched_schemas = {}
+        @last_fetch_schema_time = Hash.new(0)
+      end
+
+      def multi_workers_ready?
+        true
+      end
+
+      def writer
+        @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+          email: @email,
+          json_key: @json_key,
+          source_format: @source_format,
+          skip_invalid_rows: @skip_invalid_rows,
+          ignore_unknown_values: @ignore_unknown_values,
+          max_bad_records: @max_bad_records,
+          allow_retry_insert_errors: @allow_retry_insert_errors,
+          prevent_duplicate_load: @prevent_duplicate_load,
+          auto_create_table: @auto_create_table,
+          time_partitioning_type: @time_partitioning_type,
+          time_partitioning_expiration: @time_partitioning_expiration,
+          timeout_sec: @request_timeout_sec,
+          open_timeout_sec: @request_open_timeout_sec,
+        })
+      end
+
+      def format(tag, time, record)
+        record = inject_values_to_record(tag, time, record)
+
+        meta = metadata(tag, time, record)
+        schema =
+          if @fetch_schema
+            fetch_schema(meta)
+          else
+            @table_schema
+          end
+
+        begin
+          row = schema.format(record)
+          return if row.empty?
+          @formatter.format(tag, time, row)
+        rescue
+          log.error("format error", record: record, schema: schema)
+          raise
+        end
+      end
+
+      def write(chunk)
+      end
+
+      def fetch_schema(metadata)
+        table_id = nil
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = fetch_schema_target_table(metadata)
+
+        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+          schema = writer.fetch_schema(project, dataset, table_id)
+
+          if schema
+            table_schema = Fluent::BigQuery::RecordSchema.new("record")
+            table_schema.load_schema(schema)
+            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
+          else
+            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
+              raise "failed to fetch schema from bigquery"
+            else
+              log.warn "#{table_id} uses previous schema"
+            end
+          end
+
+          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
+        end
+
+        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
+      end
+
+      def fetch_schema_target_table(metadata)
+        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+      end
+
+      def get_schema(project, dataset, metadata)
+        if @fetch_schema
+          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+        else
+          @table_schema
+        end
+      end
+    end
+  end
+end
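
As a point of reference for the `schema`/`schema_path` options above: the file given by `schema_path` is read with `MultiJson.load` and handed to `RecordSchema#load_schema`, so it is expected to contain a BigQuery-style field definition list. A minimal sketch of such a file is shown below; the field names are purely illustrative and not part of the plugin.

  [
    {"name": "time",   "type": "TIMESTAMP", "mode": "REQUIRED"},
    {"name": "status", "type": "INTEGER",   "mode": "NULLABLE"},
    {"name": "path",   "type": "STRING",    "mode": "NULLABLE"},
    {
      "name": "user",
      "type": "RECORD",
      "fields": [
        {"name": "id",    "type": "STRING"},
        {"name": "agent", "type": "STRING"}
      ]
    }
  ]
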
@@ -0,0 +1,131 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryInsertOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_insert', self)
+
+      helpers :record_accessor
+
+      # template_suffix (only insert)
+      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+      config_param :template_suffix, :string, default: nil
+
+      # skip_invalid_rows (only insert)
+      # Insert all valid rows of a request, even if invalid rows exist.
+      # The default value is false, which causes the entire request to fail if any invalid rows exist.
+      config_param :skip_invalid_rows, :bool, default: false
+
+      # insert_id_field (only insert)
+      config_param :insert_id_field, :string, default: nil
+
+      # add_insert_timestamp (only insert)
+      # adds a timestamp just before sending the rows to bigquery, so that
+      # buffering time is not taken into account. Gives a field in bigquery
+      # which represents the insert time of the row.
+      config_param :add_insert_timestamp, :string, default: nil
+
+      # allow_retry_insert_errors (only insert)
+      # If insert_id_field is not specified, setting this to true means retried inserts may produce duplicate rows
+      config_param :allow_retry_insert_errors, :bool, default: false
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "memory"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 1
+        config_set_default :flush_thread_interval, 0.05
+        config_set_default :flush_thread_burst_interval, 0.05
+        config_set_default :chunk_limit_size, 1 * 1024 ** 2 # 1MB
+        config_set_default :total_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :chunk_limit_records, 500
+      end
+
+      def configure(conf)
+        super
+
+        if @insert_id_field
+          if @insert_id_field !~ /^\$[\[\.]/ && @insert_id_field =~ /\./
+            warn "[BREAKING CHANGE] insert_id_field format is changed. Use fluentd record_accessor helper. (https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)"
+          end
+          @get_insert_id = record_accessor_create(@insert_id_field)
+        end
+
+        formatter_config = conf.elements("format")[0]
+        if formatter_config && formatter_config['@type'] != "json"
+          raise ConfigError, "`bigquery_insert` supports only json formatter."
+        end
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
+        placeholder_validate!(:bigquery_insert, placeholder_params)
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
+
+        rows = chunk.open do |io|
+          io.map do |line|
+            record = MultiJson.load(line)
+            record[@add_insert_timestamp] = now if @add_insert_timestamp
+            row = {"json" => record}
+            row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+            Fluent::BigQuery::Helper.deep_symbolize_keys(row)
+          end
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        template_suffix = @template_suffix ? extract_placeholders(@template_suffix, metadata) : nil
+        schema = get_schema(project, dataset, metadata)
+
+        insert(project, dataset, table_id, rows, schema, template_suffix)
+      end
+
+      def insert(project, dataset, table_id, rows, schema, template_suffix)
+        writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+      rescue Fluent::BigQuery::Error => e
+        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+          # Table Not Found: Auto Create Table
+          writer.create_table(project, dataset, table_id, schema)
+          raise "table created. send rows next time."
+        end
+
+        raise if e.retryable?
+
+        if @secondary
+          # TODO: find better way
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            secondary: true, secondary_threshold: Float::EPSILON,
+            randomize: @buffer_config.retry_randomize
+          )
+        else
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            randomize: @buffer_config.retry_randomize
+          )
+        end
+
+        raise
+      end
+    end
+  end
+end
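
Since `bigquery_insert` is a regular Fluentd output, it is driven entirely by the parameters declared above. The following is a minimal configuration sketch assuming service-account JSON key auth; the match tag, file paths, project/dataset/table names, and the `$.uuid` insert-id field are placeholders, not values taken from the plugin.

  <match dummy>
    @type bigquery_insert

    auth_method json_key
    # placeholder path to a service account key
    json_key /path/to/your_credential.json

    # placeholder project/dataset/table names
    project yourproject_id
    dataset yourdataset_id
    table   accesslog

    # record_accessor syntax expected since 2.0 (see the configure check above)
    insert_id_field $.uuid

    # BigQuery schema JSON, as sketched after the base class
    schema_path /path/to/schema.json
  </match>
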
@@ -0,0 +1,220 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryLoadOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_load', self)
+
+      helpers :timer
+
+      config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
+
+      # max_bad_records (only load)
+      # The maximum number of bad records that BigQuery can ignore when running the job.
+      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+      # The default value is 0, which requires that all records are valid.
+      config_param :max_bad_records, :integer, default: 0
+
+      # prevent_duplicate_load (only load)
+      config_param :prevent_duplicate_load, :bool, default: false
+
+      config_param :use_delayed_commit, :bool, default: true
+      config_param :wait_job_interval, :time, default: 3
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "file"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 3600 # 1h
+        config_set_default :flush_thread_interval, 5
+        config_set_default :flush_thread_burst_interval, 5
+        config_set_default :chunk_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :total_limit_size, 32 * 1024 ** 3 # 32GB
+
+        config_set_default :delayed_commit_timeout, 1800 # 30m
+      end
+
+      def configure(conf)
+        super
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}"
+        placeholder_validate!(:bigquery_load, placeholder_params)
+      end
+
+      def start
+        super
+
+        if prefer_delayed_commit
+          @polling_targets = []
+          @polling_mutex = Mutex.new
+          log.debug("start load job polling")
+          timer_execute(:polling_bigquery_load_job, @wait_job_interval, &method(:poll))
+        end
+      end
+
+      def prefer_delayed_commit
+        @use_delayed_commit
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        job_reference = do_write(chunk)
+
+        until response = writer.fetch_load_job(job_reference)
+          sleep @wait_job_interval
+        end
+
+        writer.commit_load_job(job_reference.chunk_id_hex, response)
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      def try_write(chunk)
+        job_reference = do_write(chunk)
+        @polling_mutex.synchronize do
+          @polling_targets << job_reference
+        end
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      private
+
+      def do_write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        schema = get_schema(project, dataset, metadata)
+
+        create_upload_source(chunk) do |upload_source|
+          writer.create_load_job(chunk.unique_id, dump_unique_id_hex(chunk.unique_id), project, dataset, table_id, upload_source, schema)
+        end
+      end
+
+      def poll
+        job_reference = @polling_mutex.synchronize do
+          @polling_targets.shift
+        end
+        return unless job_reference
+
+        begin
+          response = writer.fetch_load_job(job_reference)
+          if response
+            writer.commit_load_job(job_reference.chunk_id_hex, response)
+            commit_write(job_reference.chunk_id)
+            log.debug("commit chunk", chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            @polling_mutex.synchronize do
+              @polling_targets << job_reference
+            end
+          end
+        rescue Fluent::BigQuery::Error => e
+          # RetryableError comes only from `commit_load_job`
+          # if the error is retryable, take back the chunk and handle it in the next `try_flush`
+          # if the error is not retryable, create a custom retry_state, then take back the chunk for the next `try_flush`
+          if e.retryable?
+            log.warn("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            log.error("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+            @retry_mutex.synchronize do
+              if @secondary
+                # TODO: find better way
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  secondary: true, secondary_threshold: Float::EPSILON,
+                  randomize: @buffer_config.retry_randomize
+                )
+              else
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  randomize: @buffer_config.retry_randomize
+                )
+              end
+            end
+          end
+
+          rollback_write(job_reference.chunk_id)
+        rescue => e
+          log.error("unexpected error while polling", error: e)
+          log.error_backtrace
+        end
+      end
+
+      def create_upload_source(chunk)
+        chunk_is_file = @buffer_config["@type"] == 'file'
+        if chunk_is_file
+          File.open(chunk.path) do |file|
+            yield file
+          end
+        else
+          Tempfile.open("chunk-tmp") do |file|
+            file.binmode
+            chunk.write_to(file)
+            file.sync
+            file.rewind
+            yield file
+          end
+        end
+      end
+    end
+  end
+end
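
For completeness, here is a minimal configuration sketch for the `bigquery_load` output, again assuming JSON-key auth; the match tag, key path, buffer path, and project/dataset/table names are placeholders. The buffer block mirrors the file-buffer defaults set in the code above (a file buffer flushed hourly), so `create_upload_source` can stream the chunk file directly to the load job.

  <match dummy>
    @type bigquery_load

    auth_method json_key
    # placeholder path to a service account key
    json_key /path/to/your_credential.json

    # placeholder project/dataset/table names
    project yourproject_id
    dataset yourdataset_id
    table   accesslog

    schema_path /path/to/schema.json

    <buffer>
      @type file
      # placeholder buffer path
      path /var/log/fluentd/bigquery_load.*.buffer
      flush_interval 3600
    </buffer>
  </match>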