fluent-plugin-bigquery 1.2.0 → 2.0.0.beta
- checksums.yaml +4 -4
- data/.travis.yml +2 -9
- data/README.md +68 -65
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +45 -39
- data/lib/fluent/plugin/out_bigquery_base.rb +211 -0
- data/lib/fluent/plugin/out_bigquery_insert.rb +131 -0
- data/lib/fluent/plugin/out_bigquery_load.rb +220 -0
- data/test/helper.rb +3 -1
- data/test/plugin/test_out_bigquery_base.rb +579 -0
- data/test/plugin/test_out_bigquery_insert.rb +420 -0
- data/test/plugin/test_out_bigquery_load.rb +310 -0
- metadata +13 -7
- data/lib/fluent/plugin/out_bigquery.rb +0 -500
- data/test/plugin/test_out_bigquery.rb +0 -1276
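
Note: this release removes the single `out_bigquery.rb` output and splits it into an abstract `bigquery_base` class plus two concrete outputs, `bigquery_insert` (streaming inserts) and `bigquery_load` (load jobs), added in the hunks below. As rough orientation only, a minimal streaming-insert configuration might look like the following sketch; the project, dataset, table, tag and key path are placeholders, and the parameter names are taken from the new plugin code in this diff:

  <match your.tag.**>
    @type bigquery_insert

    auth_method json_key
    json_key /path/to/service_account.json

    project your-project-id
    dataset your_dataset
    table your_table

    # take the schema from the existing table instead of declaring it inline
    fetch_schema true
  </match>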
@@ -0,0 +1,211 @@
+require 'fluent/plugin/output'
+
+require 'fluent/plugin/bigquery/version'
+
+require 'fluent/plugin/bigquery/helper'
+require 'fluent/plugin/bigquery/errors'
+require 'fluent/plugin/bigquery/schema'
+require 'fluent/plugin/bigquery/writer'
+
+require 'multi_json'
+require 'google/apis/bigquery_v2'
+require 'googleauth'
+
+module Fluent
+  module Plugin
+    # This class is abstract class
+    class BigQueryBaseOutput < Output
+      helpers :inject, :formatter
+
+      # Available methods are:
+      # * private_key -- Use service account credential from pkcs12 private key file
+      # * compute_engine -- Use access token available in instances of ComputeEngine
+      # * json_key -- Use service account credential from JSON key
+      # * application_default -- Use application default credential
+      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+      ### Service Account credential
+      config_param :email, :string, default: nil
+      config_param :private_key_path, :string, default: nil
+      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+      config_param :json_key, default: nil, secret: true
+
+      # see as simple reference
+      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+      config_param :project, :string
+
+      # dataset_name
+      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+      # but it cannot start with a number or underscore, or have spaces.
+      config_param :dataset, :string
+
+      # table_id
+      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+      config_param :table, :string, default: nil
+      config_param :tables, :array, value_type: :string, default: nil
+
+      config_param :auto_create_table, :bool, default: false
+
+      # ignore_unknown_values
+      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+      # Default is false, which treats unknown values as errors.
+      config_param :ignore_unknown_values, :bool, default: false
+
+      config_param :schema, :array, default: nil
+      config_param :schema_path, :string, default: nil
+      config_param :fetch_schema, :bool, default: false
+      config_param :fetch_schema_table, :string, default: nil
+      config_param :schema_cache_expire, :time, default: 600
+
+      ## Timeout
+      # request_timeout_sec
+      # Bigquery API response timeout
+      # request_open_timeout_sec
+      # Bigquery API connection, and request timeout
+      config_param :request_timeout_sec, :time, default: nil
+      config_param :request_open_timeout_sec, :time, default: 60
+
+      ## Partitioning
+      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+      config_param :time_partitioning_expiration, :time, default: nil
+
+      ## Formatter
+      config_section :format do
+        config_set_default :@type, 'json'
+      end
+
+      def configure(conf)
+        super
+
+        case @auth_method
+        when :private_key
+          unless @email && @private_key_path
+            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+          end
+        when :compute_engine
+          # Do nothing
+        when :json_key
+          unless @json_key
+            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+          end
+        when :application_default
+          # Do nothing
+        else
+          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+        end
+
+        unless @table.nil? ^ @tables.nil?
+          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+        end
+
+        @tablelist = @tables ? @tables : [@table]
+
+        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+        if @schema
+          @table_schema.load_schema(@schema)
+        end
+        if @schema_path
+          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
+        end
+
+        formatter_config = conf.elements("format")[0]
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', default_type: 'json', conf: formatter_config)
+      end
+
+      def start
+        super
+
+        @tables_queue = @tablelist.shuffle
+        @tables_mutex = Mutex.new
+        @fetched_schemas = {}
+        @last_fetch_schema_time = Hash.new(0)
+      end
+
+      def multi_workers_ready?
+        true
+      end
+
+      def writer
+        @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+          email: @email,
+          json_key: @json_key,
+          source_format: @source_format,
+          skip_invalid_rows: @skip_invalid_rows,
+          ignore_unknown_values: @ignore_unknown_values,
+          max_bad_records: @max_bad_records,
+          allow_retry_insert_errors: @allow_retry_insert_errors,
+          prevent_duplicate_load: @prevent_duplicate_load,
+          auto_create_table: @auto_create_table,
+          time_partitioning_type: @time_partitioning_type,
+          time_partitioning_expiration: @time_partitioning_expiration,
+          timeout_sec: @request_timeout_sec,
+          open_timeout_sec: @request_open_timeout_sec,
+        })
+      end
+
+      def format(tag, time, record)
+        record = inject_values_to_record(tag, time, record)
+
+        meta = metadata(tag, time, record)
+        schema =
+          if @fetch_schema
+            fetch_schema(meta)
+          else
+            @table_schema
+          end
+
+        begin
+          row = schema.format(record)
+          return if row.empty?
+          @formatter.format(tag, time, row)
+        rescue
+          log.error("format error", record: record, schema: schema)
+          raise
+        end
+      end
+
+      def write(chunk)
+      end
+
+      def fetch_schema(metadata)
+        table_id = nil
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = fetch_schema_target_table(metadata)
+
+        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+          schema = writer.fetch_schema(project, dataset, table_id)
+
+          if schema
+            table_schema = Fluent::BigQuery::RecordSchema.new("record")
+            table_schema.load_schema(schema)
+            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
+          else
+            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
+              raise "failed to fetch schema from bigquery"
+            else
+              log.warn "#{table_id} uses previous schema"
+            end
+          end
+
+          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
+        end
+
+        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
+      end
+
+      def fetch_schema_target_table(metadata)
+        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+      end
+
+      def get_schema(project, dataset, metadata)
+        if @fetch_schema
+          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+        else
+          @table_schema
+        end
+      end
+    end
+  end
+end
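
The base class above accepts the table schema in three ways: an inline `schema` array, a `schema_path` JSON file, or `fetch_schema` against an existing table (cached for `schema_cache_expire` seconds, 600 by default). A hedged sketch of the first two options, with made-up field names; the exact conf rendering of the inline array should be checked against the README changes in this release:

  # inline schema, loaded by RecordSchema#load_schema
  schema [
    {"name": "time", "type": "TIMESTAMP"},
    {"name": "status", "type": "INTEGER"},
    {"name": "path", "type": "STRING"}
  ]

  # or the same JSON array stored in a file
  schema_path /path/to/schema.json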
@@ -0,0 +1,131 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryInsertOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_insert', self)
+
+      helpers :record_accessor
+
+      # template_suffix (only insert)
+      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+      config_param :template_suffix, :string, default: nil
+
+      # skip_invalid_rows (only insert)
+      # Insert all valid rows of a request, even if invalid rows exist.
+      # The default value is false, which causes the entire request to fail if any invalid rows exist.
+      config_param :skip_invalid_rows, :bool, default: false
+
+      # insert_id_field (only insert)
+      config_param :insert_id_field, :string, default: nil
+
+      # add_insert_timestamp (only insert)
+      # adds a timestamp just before sending the rows to bigquery, so that
+      # buffering time is not taken into account. Gives a field in bigquery
+      # which represents the insert time of the row.
+      config_param :add_insert_timestamp, :string, default: nil
+
+      # allow_retry_insert_errors (only insert)
+      # If insert_id_field is not specified, true means to allow duplicate rows
+      config_param :allow_retry_insert_errors, :bool, default: false
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "memory"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 1
+        config_set_default :flush_thread_interval, 0.05
+        config_set_default :flush_thread_burst_interval, 0.05
+        config_set_default :chunk_limit_size, 1 * 1024 ** 2 # 1MB
+        config_set_default :total_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :chunk_limit_records, 500
+      end
+
+      def configure(conf)
+        super
+
+        if @insert_id_field
+          if @insert_id_field !~ /^\$[\[\.]/ && @insert_id_field =~ /\./
+            warn "[BREAKING CHANGE] insert_id_field format is changed. Use fluentd record_accessor helper. (https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)"
+          end
+          @get_insert_id = record_accessor_create(@insert_id_field)
+        end
+
+        formatter_config = conf.elements("format")[0]
+        if formatter_config && formatter_config['@type'] != "json"
+          raise ConfigError, "`bigquery_insert` supports only json formatter."
+        end
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
+        placeholder_validate!(:bigquery_insert, placeholder_params)
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
+
+        rows = chunk.open do |io|
+          io.map do |line|
+            record = MultiJson.load(line)
+            record[@add_insert_timestamp] = now if @add_insert_timestamp
+            row = {"json" => record}
+            row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+            Fluent::BigQuery::Helper.deep_symbolize_keys(row)
+          end
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        template_suffix = @template_suffix ? extract_placeholders(@template_suffix, metadata) : nil
+        schema = get_schema(project, dataset, metadata)
+
+        insert(project, dataset, table_id, rows, schema, template_suffix)
+      end
+
+      def insert(project, dataset, table_id, rows, schema, template_suffix)
+        writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+      rescue Fluent::BigQuery::Error => e
+        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+          # Table Not Found: Auto Create Table
+          writer.create_table(project, dataset, table_id, schema)
+          raise "table created. send rows next time."
+        end
+
+        raise if e.retryable?
+
+        if @secondary
+          # TODO: find better way
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            secondary: true, secondary_threshold: Float::EPSILON,
+            randomize: @buffer_config.retry_randomize
+          )
+        else
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            randomize: @buffer_config.retry_randomize
+          )
+        end
+
+        raise
+      end
+    end
+  end
+end
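
The `insert_id_field` check above enforces the new record_accessor syntax: a 1.x style dotted path such as `unique.key` now triggers the breaking-change warning, and nested fields should be written with the `$.` form (top-level field names keep working unchanged). A hedged sketch of the insert-only options, using placeholder field names inside the same `<match>` block as before:

  insert_id_field $.unique.key       # was "unique.key" in 1.x
  add_insert_timestamp inserted_at   # stamped just before the insert request is sent
  skip_invalid_rows true
  allow_retry_insert_errors false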
@@ -0,0 +1,220 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryLoadOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_load', self)
+
+      helpers :timer
+
+      config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
+
+      # max_bad_records (only load)
+      # The maximum number of bad records that BigQuery can ignore when running the job.
+      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+      # The default value is 0, which requires that all records are valid.
+      config_param :max_bad_records, :integer, default: 0
+
+      # prevent_duplicate_load (only load)
+      config_param :prevent_duplicate_load, :bool, default: false
+
+      config_param :use_delayed_commit, :bool, default: true
+      config_param :wait_job_interval, :time, default: 3
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "file"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 3600 # 1h
+        config_set_default :flush_thread_interval, 5
+        config_set_default :flush_thread_burst_interval, 5
+        config_set_default :chunk_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :total_limit_size, 32 * 1024 ** 3 # 32GB
+
+        config_set_default :delayed_commit_timeout, 1800 # 30m
+      end
+
+      def configure(conf)
+        super
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}"
+        placeholder_validate!(:bigquery_load, placeholder_params)
+      end
+
+      def start
+        super
+
+        if prefer_delayed_commit
+          @polling_targets = []
+          @polling_mutex = Mutex.new
+          log.debug("start load job polling")
+          timer_execute(:polling_bigquery_load_job, @wait_job_interval, &method(:poll))
+        end
+      end
+
+      def prefer_delayed_commit
+        @use_delayed_commit
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        job_reference = do_write(chunk)
+
+        until response = writer.fetch_load_job(job_reference)
+          sleep @wait_job_interval
+        end
+
+        writer.commit_load_job(job_reference.chunk_id_hex, response)
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      def try_write(chunk)
+        job_reference = do_write(chunk)
+        @polling_mutex.synchronize do
+          @polling_targets << job_reference
+        end
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      private
+
+      def do_write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        schema = get_schema(project, dataset, metadata)
+
+        create_upload_source(chunk) do |upload_source|
+          writer.create_load_job(chunk.unique_id, dump_unique_id_hex(chunk.unique_id), project, dataset, table_id, upload_source, schema)
+        end
+      end
+
+      def poll
+        job_reference = @polling_mutex.synchronize do
+          @polling_targets.shift
+        end
+        return unless job_reference
+
+        begin
+          response = writer.fetch_load_job(job_reference)
+          if response
+            writer.commit_load_job(job_reference.chunk_id_hex, response)
+            commit_write(job_reference.chunk_id)
+            log.debug("commit chunk", chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            @polling_mutex.synchronize do
+              @polling_targets << job_reference
+            end
+          end
+        rescue Fluent::BigQuery::Error => e
+          # RetryableError comes from only `commit_load_job`
+          # if error is retryable, takeback chunk and do next `try_flush`
+          # if error is not retryable, create custom retry_state and takeback chunk do next `try_flush`
+          if e.retryable?
+            log.warn("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            log.error("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+            @retry_mutex.synchronize do
+              if @secondary
+                # TODO: find better way
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  secondary: true, secondary_threshold: Float::EPSILON,
+                  randomize: @buffer_config.retry_randomize
+                )
+              else
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  randomize: @buffer_config.retry_randomize
+                )
+              end
+            end
+          end
+
+          rollback_write(job_reference.chunk_id)
+        rescue => e
+          log.error("unexpected error while polling", error: e)
+          log.error_backtrace
+        end
+      end
+
+      def create_upload_source(chunk)
+        chunk_is_file = @buffer_config["@type"] == 'file'
+        if chunk_is_file
+          File.open(chunk.path) do |file|
+            yield file
+          end
+        else
+          Tempfile.open("chunk-tmp") do |file|
+            file.binmode
+            chunk.write_to(file)
+            file.sync
+            file.rewind
+            yield file
+          end
+        end
+      end
+    end
+  end
+end
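
`bigquery_load` defaults to a file buffer and, with the default `use_delayed_commit true`, hands finished load jobs to the polling timer instead of blocking the flush thread until the job completes. A minimal sketch along the same lines as the insert example above; the buffer path and flush interval are illustrative placeholders:

  <match your.tag.**>
    @type bigquery_load

    auth_method json_key
    json_key /path/to/service_account.json

    project your-project-id
    dataset your_dataset
    table your_table
    fetch_schema true

    <buffer>
      @type file
      path /var/log/fluentd/bigquery_load.*.buffer
      flush_interval 3600
    </buffer>
  </match>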