fluent-plugin-bigquery 0.4.4 → 0.5.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +0 -1
- data/README.md +123 -127
- data/fluent-plugin-bigquery.gemspec +1 -4
- data/lib/fluent/plugin/bigquery/schema.rb +2 -4
- data/lib/fluent/plugin/bigquery/version.rb +1 -1
- data/lib/fluent/plugin/bigquery/writer.rb +2 -8
- data/lib/fluent/plugin/out_bigquery.rb +431 -440
- data/test/helper.rb +5 -1
- data/test/plugin/test_out_bigquery.rb +479 -708
- data/test/plugin/test_record_schema.rb +8 -24
- data/test/run_test.rb +9 -0
- metadata +8 -48
data/lib/fluent/plugin/bigquery/writer.rb

@@ -113,15 +113,9 @@ module Fluent
         @client = nil

         reason = e.respond_to?(:reason) ? e.reason : nil
-
-        wrapped = Fluent::BigQuery::Error.wrap(e)
-        if wrapped.retryable?
-          log.warn "tabledata.insertAll API", error_data
-        else
-          log.error "tabledata.insertAll API", error_data
-        end
+        log.error "tabledata.insertAll API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

-        raise
+        raise Fluent::BigQuery::Error.wrap(e)
       end

       def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields)
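The streaming-insert error path in the writer is simplified: instead of choosing between `log.warn` and `log.error` based on `retryable?`, it now always logs the failed `tabledata.insertAll` call with its project, dataset, table, status code and reason, and re-raises the exception wrapped in `Fluent::BigQuery::Error`, leaving the retry decision to the caller. A minimal sketch of how a caller can react to the wrapped error (illustrative only, not code from the gem; `insert_rows`, `Fluent::BigQuery::Error` and `retryable?` are the names visible in this diff):

    begin
      writer.insert_rows(project, dataset, table_id, rows, template_suffix: nil)
    rescue Fluent::BigQuery::Error => e
      if e.retryable?
        raise e  # hand the chunk back to Fluentd's retry machinery
      else
        log.error "giving up on non-retryable insertAll error", message: e.message
      end
    end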
data/lib/fluent/plugin/out_bigquery.rb

@@ -2,520 +2,511 @@

 require 'fluent/plugin/bigquery/version'

-require 'fluent/mixin/config_placeholders'
-require 'fluent/mixin/plaintextformatter'
-
 require 'fluent/plugin/bigquery/errors'
 require 'fluent/plugin/bigquery/schema'
 require 'fluent/plugin/bigquery/writer'

-## TODO: load implementation
-# require 'fluent/plugin/bigquery/load_request_body_wrapper'
-
 module Fluent
-  [old lines 16-39 not shown in this diff view]
-        conf["buffer_queue_limit"] = 32 unless conf["buffer_queue_limit"]
-      end
+  module Plugin
+    class BigQueryOutput < Output
+      Fluent::Plugin.register_output('bigquery', self)
+
+      helpers :inject
+
+      # https://developers.google.com/bigquery/browser-tool-quickstart
+      # https://developers.google.com/bigquery/bigquery-api-quickstart
+
+      ### default for insert
+      def configure_for_insert(conf)
+        raise ConfigError unless conf["method"].nil? || conf["method"] == "insert"
+
+        buffer_config = conf.elements("buffer")[0]
+        return unless buffer_config
+        buffer_config["@type"] = "memory" unless buffer_config["@type"]
+        buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
+        buffer_config["flush_interval"] = 0.25 unless buffer_config["flush_interval"]
+        buffer_config["flush_thread_interval"] = 0.05 unless buffer_config["flush_thread_interval"]
+        buffer_config["flush_thread_burst_interval"] = 0.05 unless buffer_config["flush_thread_burst_interval"]
+        buffer_config["chunk_limit_size"] = 1 * 1024 ** 2 unless buffer_config["chunk_limit_size"] # 1MB
+        buffer_config["total_limit_size"] = 1 * 1024 ** 3 unless buffer_config["total_limit_size"] # 1GB
+        buffer_config["chunk_records_limit"] = 500 unless buffer_config["chunk_records_limit"]
+      end

-  [old lines 43-45 not shown in this diff view]
-      # * json_key -- Use service account credential from JSON key
-      # * application_default -- Use application default credential
-      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
-
-      ### Service Account credential
-      config_param :email, :string, default: nil
-      config_param :private_key_path, :string, default: nil
-      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
-      config_param :json_key, default: nil, secret: true
-
-      # see as simple reference
-      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
-      config_param :project, :string
-
-      # dataset_name
-      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
-      # but it cannot start with a number or underscore, or have spaces.
-      config_param :dataset, :string
-
-      # table_id
-      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
-      config_param :table, :string, default: nil
-      config_param :tables, :string, default: nil # TODO: use :array with value_type: :string
-
-      # template_suffix (only insert)
-      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
-      config_param :template_suffix, :string, default: nil
-
-      config_param :auto_create_table, :bool, default: false
-
-      # skip_invalid_rows (only insert)
-      # Insert all valid rows of a request, even if invalid rows exist.
-      # The default value is false, which causes the entire request to fail if any invalid rows exist.
-      config_param :skip_invalid_rows, :bool, default: false
-      # max_bad_records (only load)
-      # The maximum number of bad records that BigQuery can ignore when running the job.
-      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
-      # The default value is 0, which requires that all records are valid.
-      config_param :max_bad_records, :integer, default: 0
-      # ignore_unknown_values
-      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
-      # Default is false, which treats unknown values as errors.
-      config_param :ignore_unknown_values, :bool, default: false
-
-      config_param :schema, :array, default: nil
-      config_param :schema_path, :string, default: nil
-      config_param :fetch_schema, :bool, default: false
-      config_param :fetch_schema_table, :string, default: nil
-      config_param :schema_cache_expire, :time, default: 600
-      config_param :field_string, :string, default: nil
-      config_param :field_integer, :string, default: nil
-      config_param :field_float, :string, default: nil
-      config_param :field_boolean, :string, default: nil
-      config_param :field_timestamp, :string, default: nil
-      ### TODO: record field stream inserts doesn't works well?
-      ### At table creation, table type json + field type record -> field type validation fails
-      ### At streaming inserts, schema cannot be specified
-      # config_param :field_record, :string, defualt: nil
-      # config_param :optional_data_field, :string, default: nil
-
-      REGEXP_MAX_NUM = 10
-      config_param :replace_record_key, :bool, default: false
-      (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
-
-      config_param :convert_hash_to_json, :bool, default: false
-
-      config_param :time_format, :string, default: nil
-      config_param :localtime, :bool, default: nil
-      config_param :utc, :bool, default: nil
-      config_param :time_field, :string, default: nil
-
-      # insert_id_field (only insert)
-      config_param :insert_id_field, :string, default: nil
-      # prevent_duplicate_load (only load)
-      config_param :prevent_duplicate_load, :bool, default: false
-
-      # add_insert_timestamp (only insert)
-      # adds a timestamp just before sending the rows to bigquery, so that
-      # buffering time is not taken into account. Gives a field in bigquery
-      # which represents the insert time of the row.
-      config_param :add_insert_timestamp, :string, default: nil
-
-      config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
-
-      # allow_retry_insert_errors (only insert)
-      # If insert_id_field is not specified, true means to allow duplicate rows
-      config_param :allow_retry_insert_errors, :bool, default: false
-
-      # TODO
-      # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
-      # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
-      # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
-      ### method: ''Streaming data inserts support
-      # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
-      # Maximum row size: 100 KB
-      # Maximum data size of all rows, per insert: 1 MB
-      # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
-      # If you exceed 100 rows per second for an extended period of time, throttling might occur.
-      ### Toooooooooooooo short/small per inserts and row!
-
-      ## Timeout
-      # request_timeout_sec
-      # Bigquery API response timeout
-      # request_open_timeout_sec
-      # Bigquery API connection, and request timeout
-      config_param :request_timeout_sec, :time, default: nil
-      config_param :request_open_timeout_sec, :time, default: 60
-
-      ## Partitioning
-      config_param :time_partitioning_type, :enum, list: [:day], default: nil
-      config_param :time_partitioning_expiration, :time, default: nil
-
-      ### Table types
-      # https://developers.google.com/bigquery/docs/tables
-      #
-      # type - The following data types are supported; see Data Formats for details on each data type:
-      # STRING
-      # INTEGER
-      # FLOAT
-      # BOOLEAN
-      # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
-      #
-      # mode - Whether a field can be null. The following values are supported:
-      # NULLABLE - The cell can be null.
-      # REQUIRED - The cell cannot be null.
-      # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
-
-      def initialize
-        super
-        require 'multi_json'
-        require 'google/apis/bigquery_v2'
-        require 'googleauth'
-        require 'active_support/json'
-        require 'active_support/core_ext/hash'
-        require 'active_support/core_ext/object/json'
-
-        # MEMO: signet-0.6.1 depend on Farady.default_connection
-        Faraday.default_connection.options.timeout = 60
-      end
+      ### default for loads
+      def configure_for_load(conf)
+        raise ConfigError unless conf["method"] == "load"

-  [old lines 186-190 not shown in this diff view]
+        buffer_config = conf.elements("buffer")[0]
+        return unless buffer_config
+        buffer_config["@type"] = "file" unless buffer_config["@type"]
+        buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
+        buffer_config["chunk_limit_size"] = 1 * 1024 ** 3 unless buffer_config["chunk_limit_size"] # 1GB
+        buffer_config["total_limit_size"] = 32 * 1024 ** 3 unless buffer_config["total_limit_size"] # 32GB
       end
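With the move to the Fluentd v0.14 output API, the plugin no longer rewrites top-level buffer parameters (note the removed `conf["buffer_queue_limit"]` handling above); instead it fills in defaults on the `<buffer>` section, and the defaults differ by `method`. For quick reference, the values injected above are summarized below as illustrative Ruby constants (not part of the gem); any value set explicitly in the user's `<buffer>` section wins because of the `unless buffer_config[...]` guards.

    # Defaults injected when method is "insert" (streaming inserts).
    INSERT_BUFFER_DEFAULTS = {
      "@type"                       => "memory",
      "flush_mode"                  => :interval,
      "flush_interval"              => 0.25,
      "flush_thread_interval"       => 0.05,
      "flush_thread_burst_interval" => 0.05,
      "chunk_limit_size"            => 1 * 1024 ** 2,  # 1MB
      "total_limit_size"            => 1 * 1024 ** 3,  # 1GB
      "chunk_records_limit"         => 500,
    }

    # Defaults injected when method is "load" (load jobs).
    LOAD_BUFFER_DEFAULTS = {
      "@type"            => "file",
      "flush_mode"       => :interval,
      "chunk_limit_size" => 1 * 1024 ** 3,   # 1GB
      "total_limit_size" => 32 * 1024 ** 3,  # 32GB
    }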
-  [old lines 192-201 not shown in this diff view]
+
+      # Available methods are:
+      # * private_key -- Use service account credential from pkcs12 private key file
+      # * compute_engine -- Use access token available in instances of ComputeEngine
+      # * json_key -- Use service account credential from JSON key
+      # * application_default -- Use application default credential
+      config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+      ### Service Account credential
+      config_param :email, :string, default: nil
+      config_param :private_key_path, :string, default: nil
+      config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+      config_param :json_key, default: nil, secret: true
+
+      # see as simple reference
+      # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+      config_param :project, :string
+
+      # dataset_name
+      # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+      # but it cannot start with a number or underscore, or have spaces.
+      config_param :dataset, :string
+
+      # table_id
+      # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+      config_param :table, :string, default: nil
+      config_param :tables, :array, value_type: :string, default: nil
+
+      # template_suffix (only insert)
+      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+      config_param :template_suffix, :string, default: nil
+
+      config_param :auto_create_table, :bool, default: false
+
+      # skip_invalid_rows (only insert)
+      # Insert all valid rows of a request, even if invalid rows exist.
+      # The default value is false, which causes the entire request to fail if any invalid rows exist.
+      config_param :skip_invalid_rows, :bool, default: false
+      # max_bad_records (only load)
+      # The maximum number of bad records that BigQuery can ignore when running the job.
+      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+      # The default value is 0, which requires that all records are valid.
+      config_param :max_bad_records, :integer, default: 0
+      # ignore_unknown_values
+      # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+      # Default is false, which treats unknown values as errors.
+      config_param :ignore_unknown_values, :bool, default: false
+
+      config_param :schema, :array, default: nil
+      config_param :schema_path, :string, default: nil
+      config_param :fetch_schema, :bool, default: false
+      config_param :fetch_schema_table, :string, default: nil
+      config_param :schema_cache_expire, :time, default: 600
+      config_param :field_string, :array, value_type: :string, default: nil
+      config_param :field_integer, :array, value_type: :string, default: nil
+      config_param :field_float, :array, value_type: :string, default: nil
+      config_param :field_boolean, :array, value_type: :string, default: nil
+      config_param :field_timestamp, :array, value_type: :string, default: nil
+      ### TODO: record field stream inserts doesn't works well?
+      ### At table creation, table type json + field type record -> field type validation fails
+      ### At streaming inserts, schema cannot be specified
+      # config_param :field_record, :string, defualt: nil
+      # config_param :optional_data_field, :string, default: nil
+
+      REGEXP_MAX_NUM = 10
+      config_param :replace_record_key, :bool, default: false
+      (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
+
+      config_param :convert_hash_to_json, :bool, default: false
+
+      # insert_id_field (only insert)
+      config_param :insert_id_field, :string, default: nil
+      # prevent_duplicate_load (only load)
+      config_param :prevent_duplicate_load, :bool, default: false
+
+      config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
+
+      # allow_retry_insert_errors (only insert)
+      # If insert_id_field is not specified, true means to allow duplicate rows
+      config_param :allow_retry_insert_errors, :bool, default: false
+
+      # TODO
+      # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
+      # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
+      # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
+      ### method: ''Streaming data inserts support
+      # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
+      # Maximum row size: 100 KB
+      # Maximum data size of all rows, per insert: 1 MB
+      # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
+      # If you exceed 100 rows per second for an extended period of time, throttling might occur.
+      ### Toooooooooooooo short/small per inserts and row!
+
+      ## Timeout
+      # request_timeout_sec
+      # Bigquery API response timeout
+      # request_open_timeout_sec
+      # Bigquery API connection, and request timeout
+      config_param :request_timeout_sec, :time, default: nil
+      config_param :request_open_timeout_sec, :time, default: 60
+
+      ## Partitioning
+      config_param :time_partitioning_type, :enum, list: [:day], default: nil
+      config_param :time_partitioning_expiration, :time, default: nil
+
+      ### Table types
+      # https://developers.google.com/bigquery/docs/tables
+      #
+      # type - The following data types are supported; see Data Formats for details on each data type:
+      # STRING
+      # INTEGER
+      # FLOAT
+      # BOOLEAN
+      # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
+      #
+      # mode - Whether a field can be null. The following values are supported:
+      # NULLABLE - The cell can be null.
+      # REQUIRED - The cell cannot be null.
+      # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
+
+      def initialize
+        super
+        require 'multi_json'
+        require 'google/apis/bigquery_v2'
+        require 'googleauth'
+        require 'active_support/json'
+        require 'active_support/core_ext/hash'
+        require 'active_support/core_ext/object/json'
+
+        # MEMO: signet-0.6.1 depend on Farady.default_connection
+        Faraday.default_connection.options.timeout = 60
       end

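Two parameter changes stand out in the block above: `tables` and the `field_*` params change from a comma-separated `:string` to `:array` params with `value_type: :string`, and the `field_*` style itself is deprecated in favor of the `schema` array (see `legacy_schema_config_deprecation` further down). The old `time_format`, `localtime`, `utc`, `time_field` and `add_insert_timestamp` params no longer appear in this file. A self-contained sketch of the recommended `schema` value, using Ruby's standard JSON library and made-up column names (illustrative only; the exact accepted format is defined by `RecordSchema#load_schema`, which is not shown in this diff):

    require 'json'

    # Old style (now deprecated):
    #   field_string  user,status
    #   field_integer time
    #
    # New style: `schema` takes an array of JSON column definitions,
    # the same shape a BigQuery table schema file uses.
    schema = JSON.parse(<<~JSON)
      [
        {"name": "user",   "type": "STRING"},
        {"name": "status", "type": "STRING"},
        {"name": "time",   "type": "INTEGER"}
      ]
    JSON

    puts schema.map { |field| "#{field['name']}:#{field['type']}" }.join(", ")
    # => user:STRING, status:STRING, time:INTEGER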
-  [old lines 204-207 not shown in this diff view]
+      def configure(conf)
+        if conf["method"] == "load"
+          configure_for_load(conf)
+        else
+          configure_for_insert(conf)
         end
-  [old lines 209-213 not shown in this diff view]
+        super
+
+        case @method
+        when :insert
+          extend(InsertImplementation)
+        when :load
+          raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
+          extend(LoadImplementation)
         end
-        when :application_default
-          # Do nothing
-        else
-          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
-        end

-  [old lines 221-223 not shown in this diff view]
+        case @auth_method
+        when :private_key
+          unless @email && @private_key_path
+            raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+          end
+        when :compute_engine
+          # Do nothing
+        when :json_key
+          unless @json_key
+            raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+          end
+        when :application_default
+          # Do nothing
+        else
+          raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+        end

-
+        unless @table.nil? ^ @tables.nil?
+          raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+        end

-
-        @fields = Fluent::BigQuery::RecordSchema.new('record')
-        if @schema
-          @fields.load_schema(@schema)
-        end
-        if @schema_path
-          @fields.load_schema(MultiJson.load(File.read(@schema_path)))
-        end
+        @tablelist = @tables ? @tables : [@table]

-  [old lines 236-241 not shown in this diff view]
+        legacy_schema_config_deprecation
+        @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+        if @schema
+          @table_schema.load_schema(@schema)
+        end
+        if @schema_path
+          @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
         end
-        end

-  [old lines 245-252 not shown in this diff view]
+        types = %i(string integer float boolean timestamp)
+        types.each do |type|
+          fields = instance_variable_get("@field_#{type}")
+          next unless fields
+          fields.each do |field|
+            @table_schema.register_field field, type
+          end
+        end

-
+        @regexps = {}
+        (1..REGEXP_MAX_NUM).each do |i|
+          next unless conf["replace_record_key_regexp#{i}"]
+          regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
+          raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
+          raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
+          @regexps[regexp] = replacement
+        end

-
+        if @insert_id_field
+          insert_id_keys = @insert_id_field.split('.')
+          @get_insert_id = ->(record) {
+            insert_id_keys.inject(record) {|h, k| h[k] }
+          }
+        else
+          @get_insert_id = nil
+        end

-  [old lines 258-261 not shown in this diff view]
-          keys.inject(record) { |h, k| h[k] ||= {} }[last_key] = @timef.format(time)
-          record
-        }
-        else
-          @add_time_field = ->(record, time) { record }
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
+        placeholder_validate!(:bigquery, placeholder_params)
+
+        warn "[DEPRECATION] `convert_hash_to_json` param is deprecated. If Hash value is inserted string field, plugin convert it to json automatically." if @convert_hash_to_json
       end

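The rewritten `configure` above also prepares two things used later in the write path: Fluentd v0.14 placeholder validation for `project`, `dataset`, `table` and `template_suffix` (the `placeholder_validate!` call), and the `@get_insert_id` lambda that digs the deduplication key out of each record when `insert_id_field` is set. The lambda is small enough to show standalone; this sketch is illustrative, with a made-up field name and record:

    # Same construction as in configure: split insert_id_field on dots and
    # walk the record hash key by key.
    insert_id_field = "payload.uuid"
    insert_id_keys  = insert_id_field.split('.')
    get_insert_id   = ->(record) { insert_id_keys.inject(record) { |h, k| h[k] } }

    record = { "payload" => { "uuid" => "0f3dcd7e", "body" => "..." }, "level" => "info" }
    puts get_insert_id.call(record)  # => 0f3dcd7e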
-  [old lines 269-275 not shown in this diff view]
+      def start
+        super
+
+        @tables_queue = @tablelist.shuffle
+        @tables_mutex = Mutex.new
+        @fetched_schemas = {}
+        @last_fetch_schema_time = Hash.new(0)
       end

-  [old lines 278-279 not shown in this diff view]
+      def writer
+        @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+          private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+          email: @email,
+          json_key: @json_key,
+          skip_invalid_rows: @skip_invalid_rows,
+          ignore_unknown_values: @ignore_unknown_values,
+          max_bad_records: @max_bad_records,
+          allow_retry_insert_errors: @allow_retry_insert_errors,
+          prevent_duplicate_load: @prevent_duplicate_load,
+          auto_create_table: @auto_create_table,
+          time_partitioning_type: @time_partitioning_type,
+          time_partitioning_expiration: @time_partitioning_expiration,
+          timeout_sec: @request_timeout_sec,
+          open_timeout_sec: @request_open_timeout_sec,
+        })
+      end

-  [old lines 281-282 not shown in this diff view]
+      def replace_record_key(record)
+        new_record = {}
+        record.each do |key, _|
+          new_key = key
+          @regexps.each do |regexp, replacement|
+            new_key = new_key.gsub(/#{regexp}/, replacement)
+          end
+          new_key = new_key.gsub(/\W/, '')
+          new_record.store(new_key, record[key])
+        end
+        new_record
+      end

-  [old lines 284-286 not shown in this diff view]
+      def convert_hash_to_json(record)
+        record.each do |key, value|
+          if value.class == Hash
+            record[key] = MultiJson.dump(value)
+          end
+        end
+        record
+      end

-  [old lines 288-290 not shown in this diff view]
+      def format(tag, time, record)
+        if @replace_record_key
+          record = replace_record_key(record)
+        end

-  [old lines 292-296 not shown in this diff view]
-          skip_invalid_rows: @skip_invalid_rows,
-          ignore_unknown_values: @ignore_unknown_values,
-          max_bad_records: @max_bad_records,
-          allow_retry_insert_errors: @allow_retry_insert_errors,
-          prevent_duplicate_load: @prevent_duplicate_load,
-          auto_create_table: @auto_create_table,
-          time_partitioning_type: @time_partitioning_type,
-          time_partitioning_expiration: @time_partitioning_expiration,
-          timeout_sec: @request_timeout_sec,
-          open_timeout_sec: @request_open_timeout_sec,
-        })
-      end
+        if @convert_hash_to_json
+          record = convert_hash_to_json(record)
+        end
+
+        record = inject_values_to_record(tag, time, record)

-  [old lines 310-319 not shown in this diff view]
-        format.gsub!(/\$\{\s*(\w+)\s*\}/) do |m|
-          row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
+        begin
+          meta = metadata(tag, time, record)
+          schema =
+            if @fetch_schema
+              fetch_schema(meta)
+            else
+              @table_schema
+            end
+        ensure
+          @buffer.metadata_list.delete(meta)
         end
-        end
-        table_id = time.strftime(format)
-
-        if chunk
-          table_id.gsub(%r(%{time_slice})) { |expr|
-            chunk.key
-          }
-        else
-          table_id.gsub(%r(%{time_slice})) { |expr|
-            current_time.strftime(@time_slice_format)
-          }
-        end
-      end

-  [old lines 337-342 not shown in this diff view]
+        begin
+          buf = String.new
+          row = schema.format(record)
+          unless row.empty?
+            buf << MultiJson.dump(row) + "\n"
+          end
+          buf
+        rescue
+          log.error("format error", record: record, schema: schema)
+          raise
         end
-          new_key = new_key.gsub(/\W/, '')
-          new_record.store(new_key, record[key])
       end
-        new_record
-      end

-  [old lines 350-353 not shown in this diff view]
+      def write(chunk)
+        table_id_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
         end
+        _write(chunk, table_id_format)
       end
-        record
-      end

-  [old lines 359-361 not shown in this diff view]
+      def legacy_schema_config_deprecation
+        if [@field_string, @field_integer, @field_float, @field_boolean, @field_timestamp].any?
+          warn "[DEPRECATION] `field_*` style schema config is deprecated. Instead of it, use `schema` config params that is array of json style."
+        end
       end
-      end

-  [old lines 365-369 not shown in this diff view]
-      end
-        template_suffix_format = @template_suffix
-        _write(chunk, table_id_format, template_suffix_format)
-      end
+      def fetch_schema(metadata)
+        table_id = nil
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = fetch_schema_target_table(metadata)

-  [old lines 375-376 not shown in this diff view]
-        @fetch_schema_mutex.synchronize do
-          if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
-            table_id_format = @fetch_schema_table || @tablelist[0]
-            table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
-            schema = writer.fetch_schema(@project, @dataset, table_id)
+        if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+          schema = writer.fetch_schema(project, dataset, table_id)

           if schema
-  [old lines 384-386 not shown in this diff view]
-            @fields = fields
-          else
-            @fields.load_schema(schema, allow_overwrite)
-          end
+            table_schema = Fluent::BigQuery::RecordSchema.new("record")
+            table_schema.load_schema(schema)
+            @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
           else
-            if @
+            if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
               raise "failed to fetch schema from bigquery"
             else
               log.warn "#{table_id} uses previous schema"
             end
           end

-          @last_fetch_schema_time = Fluent::Engine.now
+          @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
         end
+
+        @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
       end
-      end

-  [old lines 404-406 not shown in this diff view]
+      def fetch_schema_target_table(metadata)
+        extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+      end

-  [old lines 408-409 not shown in this diff view]
+      def get_schema(project, dataset, metadata)
+        if @fetch_schema
+          @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+        else
+          @table_schema
         end
+      end

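Schema fetching is now cached per table: `@fetched_schemas` and `@last_fetch_schema_time` are keyed by `"#{project}.#{dataset}.#{table_id}"`, and `@last_fetch_schema_time` defaults to 0 so the first access always triggers a fetch. A self-contained sketch of that bookkeeping (illustrative; the key, times and schema below are made up, and the real code calls `writer.fetch_schema`):

    schema_cache_expire    = 600          # seconds, mirrors the config_param default
    last_fetch_schema_time = Hash.new(0)  # "project.dataset.table" => epoch seconds
    fetched_schemas        = {}

    key = "my-project.my_dataset.access_log"
    now = Time.now.to_i

    if now - last_fetch_schema_time[key] > schema_cache_expire
      # In the plugin this is writer.fetch_schema(project, dataset, table_id).
      fetched_schemas[key] = [{ "name" => "time", "type" => "INTEGER" }]
      last_fetch_schema_time[key] = now
    end

    p fetched_schemas[key]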
-  [old lines 412-414 not shown in this diff view]
+      module InsertImplementation
+        def _write(chunk, table_format)
+          rows = chunk.open do |io|
+            io.map do |line|
+              record = MultiJson.load(line)
+              row = {"json" => record}
+              row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+              row.deep_symbolize_keys
+            end
+          end

-  [old lines 416-419 not shown in this diff view]
-          row['insert_id'] = @get_insert_id.call(record) if @get_insert_id
-          buf << row.to_msgpack
-        end
-        buf
-      end
+          project = extract_placeholders(@project, chunk.metadata)
+          dataset = extract_placeholders(@dataset, chunk.metadata)
+          table_id = extract_placeholders(table_format, chunk.metadata)
+          template_suffix = @template_suffix ? extract_placeholders(@template_suffix, chunk.metadata) : nil

-
-        now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
-        rows = []
-        chunk.msgpack_each do |row_object|
-          # TODO: row size limit
-          row_object["json"][@add_insert_timestamp] = now if @add_insert_timestamp
-          rows << row_object.deep_symbolize_keys
-        end
+          schema = get_schema(project, dataset, chunk.metadata)

-
-        group = rows.group_by do |row|
-          [
-            generate_table_id(table_format, now, row, chunk),
-            template_suffix_format ? generate_table_id(template_suffix_format, now, row, chunk) : nil,
-          ]
-        end
-        group.each do |(table_id, template_suffix), group_rows|
-          insert(table_id, group_rows, template_suffix)
+          insert(project, dataset, table_id, rows, schema, template_suffix)
         end
-        end

-  [old lines 447-454 not shown in this diff view]
+        def insert(project, dataset, table_id, rows, schema, template_suffix)
+          writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+        rescue Fluent::BigQuery::Error => e
+          if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+            # Table Not Found: Auto Create Table
+            writer.create_table(project, dataset, table_id, schema)
+            raise "table created. send rows next time."
+          end

-  [old lines 456-459 not shown in this diff view]
+          raise if e.retryable?
+
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+
+          raise
         end
       end
-      end

-  [old lines 464-466 not shown in this diff view]
+      module LoadImplementation
+        def _write(chunk, table_id_format)
+          project = extract_placeholders(@project, chunk.metadata)
+          dataset = extract_placeholders(@dataset, chunk.metadata)
+          table_id = extract_placeholders(table_id_format, chunk.metadata)

-
-          record = replace_record_key(record)
-        end
-
-        if @convert_hash_to_json
-          record = convert_hash_to_json(record)
-        end
+          schema = get_schema(project, dataset, chunk.metadata)

-
-        row = @fields.format(@add_time_field.call(record, time))
-        unless row.empty?
-          buf << MultiJson.dump(row) + "\n"
+          load(chunk, project, dataset, table_id, schema)
         end
-        buf
-      end

-  [old lines 484-485 not shown in this diff view]
-        table_id = generate_table_id(table_id_format, now, nil, chunk)
-        load(chunk, table_id)
-      end
+        def load(chunk, project, dataset, table_id, schema)
+          res = nil

-  [old lines 490-491 not shown in this diff view]
+          create_upload_source(chunk) do |upload_source|
+            res = writer.create_load_job(chunk.unique_id, project, dataset, table_id, upload_source, schema)
+          end
+        rescue Fluent::BigQuery::Error => e
+          raise if e.retryable?
+
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end

-
-          res = writer.create_load_job(chunk.unique_id, @project, @dataset, table_id, upload_source, @fields)
-        end
-      rescue Fluent::BigQuery::Error => e
-        if e.retryable?
-          raise e
-        elsif @secondary
-          flush_secondary(@secondary)
+          raise
         end
-      end

-
+        private

-  [old lines 506-518 not shown in this diff view]
+        def create_upload_source(chunk)
+          chunk_is_file = @buffer_config["@type"] == 'file'
+          if chunk_is_file
+            File.open(chunk.path) do |file|
+              yield file
+            end
+          else
+            Tempfile.open("chunk-tmp") do |file|
+              file.binmode
+              chunk.write_to(file)
+              file.sync
+              file.rewind
+              yield file
+            end
           end
         end
       end