fluent-plugin-bigquery-custom 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.travis.yml +10 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +13 -0
- data/README.md +424 -0
- data/Rakefile +11 -0
- data/fluent-plugin-bigquery-custom.gemspec +34 -0
- data/lib/fluent/plugin/bigquery/version.rb +6 -0
- data/lib/fluent/plugin/out_bigquery.rb +727 -0
- data/test/helper.rb +34 -0
- data/test/plugin/test_out_bigquery.rb +1015 -0
- data/test/plugin/testdata/apache.schema +98 -0
- data/test/plugin/testdata/json_key.json +7 -0
- data/test/plugin/testdata/sudo.schema +27 -0
- metadata +218 -0
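
For orientation, the config_param declarations in data/lib/fluent/plugin/out_bigquery.rb below imply a <match> section roughly like the sketch that follows. This is an illustrative example only, not taken from the bundled README.md (whose contents are not shown in this diff); it assumes the default 'insert' method with 'private_key' auth, and the field names are placeholders.

  <match your.tag.**>
    type bigquery

    method insert              # default; the alternative is 'load'

    auth_method private_key    # default; also: compute_engine, json_key, application_default
    email       xxx@developer.gserviceaccount.com
    private_key_path /path/to/your-privatekey.p12

    project yourproject_id
    dataset yourdataset_id
    table   accesslog_%Y_%m    # strftime patterns and %{time_slice} are expanded by generate_table_id

    time_format %s
    time_field  time

    field_integer seq,bytes
    field_string  vhost,path,agent
    field_float   response_time
    field_boolean bot_access
  </match>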
data/fluent-plugin-bigquery-custom.gemspec
ADDED
@@ -0,0 +1,34 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'fluent/plugin/bigquery/version'

Gem::Specification.new do |spec|
  spec.name          = "fluent-plugin-bigquery-custom"
  spec.version       = Fluent::BigQueryPlugin::VERSION
  spec.authors       = ["Tomohiro Hashidate"]
  spec.email         = ["kakyoin.hierophant@gmail.com"]
  spec.description   = %q{Fluentd plugin to store data on Google BigQuery, by load, or by stream inserts}
  spec.summary       = %q{Fluentd plugin to store data on Google BigQuery}
  spec.homepage      = "https://github.com/joker1007/fluent-plugin-bigquery-custom"
  spec.license       = "APLv2"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "rake"
  spec.add_development_dependency "rr"
  spec.add_development_dependency "test-unit", "~> 3.0.2"
  spec.add_development_dependency "test-unit-rr", "~> 1.0.3"

  spec.add_runtime_dependency "google-api-client", "~> 0.9.pre5"
  spec.add_runtime_dependency "googleauth"
  spec.add_runtime_dependency "fluentd"
  spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
  spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
  spec.add_runtime_dependency "fluent-plugin-buffer-lightening", ">= 0.0.2"

  spec.add_development_dependency "fluent-plugin-dummydata-producer"
end
data/lib/fluent/plugin/out_bigquery.rb
ADDED
@@ -0,0 +1,727 @@
# -*- coding: utf-8 -*-

require 'fluent/plugin/bigquery/version'

require 'fluent/mixin/config_placeholders'
require 'fluent/mixin/plaintextformatter'

## TODO: load implementation
# require 'fluent/plugin/bigquery/load_request_body_wrapper'

module Fluent
  ### TODO: error classes for each api error responses
  # class BigQueryAPIError < StandardError
  # end

  class BigQueryOutput < TimeSlicedOutput
    Fluent::Plugin.register_output('bigquery', self)

    # https://developers.google.com/bigquery/browser-tool-quickstart
    # https://developers.google.com/bigquery/bigquery-api-quickstart

    config_set_default :buffer_type, 'lightening'

    config_set_default :flush_interval, 0.25
    config_set_default :try_flush_interval, 0.05

    config_set_default :buffer_chunk_records_limit, 500
    config_set_default :buffer_chunk_limit, 1000000
    config_set_default :buffer_queue_limit, 1024

    ### for loads
    ### TODO: different default values for buffering between 'load' and insert
    # config_set_default :flush_interval, 1800 # 30min => 48 imports/day
    # config_set_default :buffer_chunk_limit, 1000**4 # 1.0*10^12 < 1TB (1024^4)

    ### OAuth credential
    # config_param :client_id, :string
    # config_param :client_secret, :string

    # Available methods are:
    # * private_key -- Use service account credential from pkcs12 private key file
    # * compute_engine -- Use access token available in instances of ComputeEngine
    # * private_json_key -- Use service account credential from JSON key
    # * application_default -- Use application default credential
    config_param :auth_method, :string, default: 'private_key'

    ### Service Account credential
    config_param :email, :string, default: nil
    config_param :private_key_path, :string, default: nil
    config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
    config_param :json_key, default: nil

    # see as simple reference
    # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
    config_param :project, :string

    # dataset_name
    # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
    # but it cannot start with a number or underscore, or have spaces.
    config_param :dataset, :string

    # table_id
    # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
    config_param :table, :string, default: nil
    config_param :tables, :string, default: nil

    config_param :auto_create_table, :bool, default: false

    # skip_invalid_rows (only insert)
    # Insert all valid rows of a request, even if invalid rows exist.
    # The default value is false, which causes the entire request to fail if any invalid rows exist.
    config_param :skip_invalid_rows, :bool, default: false
    # max_bad_records (only load)
    # The maximum number of bad records that BigQuery can ignore when running the job.
    # If the number of bad records exceeds this value, an invalid error is returned in the job result.
    # The default value is 0, which requires that all records are valid.
    config_param :max_bad_records, :integer, default: 0
    # ignore_unknown_values
    # Accept rows that contain values that do not match the schema. The unknown values are ignored.
    # Default is false, which treats unknown values as errors.
    config_param :ignore_unknown_values, :bool, default: false

    config_param :schema_path, :string, default: nil
    config_param :fetch_schema, :bool, default: false
    config_param :field_string, :string, default: nil
    config_param :field_integer, :string, default: nil
    config_param :field_float, :string, default: nil
    config_param :field_boolean, :string, default: nil
    config_param :field_timestamp, :string, default: nil
    ### TODO: record field stream inserts don't work well?
    ### At table creation, table type json + field type record -> field type validation fails
    ### At streaming inserts, schema cannot be specified
    # config_param :field_record, :string, default: nil
    # config_param :optional_data_field, :string, default: nil

    REGEXP_MAX_NUM = 10
    config_param :replace_record_key, :bool, default: false
    (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }

    config_param :time_format, :string, default: nil
    config_param :localtime, :bool, default: nil
    config_param :utc, :bool, default: nil
    config_param :time_field, :string, default: nil

    config_param :insert_id_field, :string, default: nil

    config_param :method, :string, default: 'insert' # or 'load'

    config_param :load_size_limit, :integer, default: 1000**4 # < 1TB (1024^4) # TODO: not implemented now
    ### method: 'load'
    # https://developers.google.com/bigquery/loading-data-into-bigquery
    # Maximum File Sizes:
    # File Type   Compressed   Uncompressed
    # CSV         1 GB         With new-lines in strings: 4 GB
    #                          Without new-lines in strings: 1 TB
    # JSON        1 GB         1 TB

    config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
    # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
    # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
    ### method: 'insert' -- streaming data inserts support
    # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
    # Maximum row size: 100 KB
    # Maximum data size of all rows, per insert: 1 MB
    # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
    # If you exceed 100 rows per second for an extended period of time, throttling might occur.
    ### Too short/small per insert and row!

    ### Table types
    # https://developers.google.com/bigquery/docs/tables
    #
    # type - The following data types are supported; see Data Formats for details on each data type:
    # STRING
    # INTEGER
    # FLOAT
    # BOOLEAN
    # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
    #
    # mode - Whether a field can be null. The following values are supported:
    # NULLABLE - The cell can be null.
    # REQUIRED - The cell cannot be null.
    # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.

    def initialize
      super
      require 'json'
      require 'google/apis/bigquery_v2'
      require 'googleauth'
      require 'active_support/core_ext/hash'
      require 'active_support/core_ext/object/json'

      # MEMO: signet-0.6.1 depends on Faraday.default_connection
      Faraday.default_connection.options.timeout = 60
    end

    # Define `log` method for v0.10.42 or earlier
    unless method_defined?(:log)
      define_method("log") { $log }
    end

    def configure(conf)
      super

      if @method == "insert"
        extend(InsertImplementation)
      elsif @method == "load"
        extend(LoadImplementation)
      else
        raise Fluent::ConfigError, "'method' must be 'insert' or 'load'"
      end

      case @auth_method
      when 'private_key'
        unless @email && @private_key_path
          raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
        end
      when 'compute_engine'
        # Do nothing
      when 'json_key'
        unless @json_key
          raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
        end
      when 'application_default'
        # Do nothing
      else
        raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
      end

      unless @table.nil? ^ @tables.nil?
        raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
      end

      @tablelist = @tables ? @tables.split(',') : [@table]

      @fields = RecordSchema.new('record')
      if @schema_path
        @fields.load_schema(JSON.parse(File.read(@schema_path)))
      end

      types = %w(string integer float boolean timestamp)
      types.each do |type|
        raw_fields = instance_variable_get("@field_#{type}")
        next unless raw_fields
        raw_fields.split(',').each do |field|
          @fields.register_field field.strip, type.to_sym
        end
      end

      @regexps = {}
      (1..REGEXP_MAX_NUM).each do |i|
        next unless conf["replace_record_key_regexp#{i}"]
        regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
        raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
        raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
        @regexps[regexp] = replacement
      end

      @localtime = false if @localtime.nil? && @utc

      @timef = TimeFormatter.new(@time_format, @localtime)

      if @time_field
        keys = @time_field.split('.')
        last_key = keys.pop
        @add_time_field = ->(record, time) {
          keys.inject(record) { |h, k| h[k] ||= {} }[last_key] = @timef.format(time)
          record
        }
      else
        @add_time_field = ->(record, time) { record }
      end

      if @insert_id_field
        insert_id_keys = @insert_id_field.split('.')
        @get_insert_id = ->(record) {
          insert_id_keys.inject(record) {|h, k| h[k] }
        }
      else
        @get_insert_id = nil
      end
    end

    def start
      super

      @cached_client = nil
      @cached_client_expiration = nil

      @tables_queue = @tablelist.dup.shuffle
      @tables_mutex = Mutex.new

      fetch_schema() if @fetch_schema
    end

    def client
      return @cached_client if @cached_client && @cached_client_expiration > Time.now

      client = Google::Apis::BigqueryV2::BigqueryService.new

      scope = "https://www.googleapis.com/auth/bigquery"

      case @auth_method
      when 'private_key'
        require 'google/api_client/auth/key_utils'
        key = Google::APIClient::KeyUtils.load_from_pkcs12(@private_key_path, @private_key_passphrase)
        auth = Signet::OAuth2::Client.new(
                token_credential_uri: "https://accounts.google.com/o/oauth2/token",
                audience: "https://accounts.google.com/o/oauth2/token",
                scope: scope,
                issuer: @email,
                signing_key: key)

      when 'compute_engine'
        auth = Google::Auth::GCECredentials.new

      when 'json_key'
        if File.exist?(@json_key)
          auth = File.open(@json_key) do |f|
            Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: f, scope: scope)
          end
        else
          key = StringIO.new(@json_key)
          auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
        end

      when 'application_default'
        auth = Google::Auth.get_application_default([scope])

      else
        raise ConfigError, "Unknown auth method: #{@auth_method}"
      end

      client.authorization = auth

      @cached_client_expiration = Time.now + 1800
      @cached_client = client
    end

    def generate_table_id(table_id_format, current_time, chunk = nil)
      table_id = current_time.strftime(table_id_format)
      if chunk
        table_id.gsub(%r(%{time_slice})) { |expr|
          chunk.key
        }
      else
        table_id.gsub(%r(%{time_slice})) { |expr|
          current_time.strftime(@time_slice_format)
        }
      end
    end

    def create_table(table_id)
      client.insert_table(@project, @dataset, {
        table_reference: {
          table_id: table_id,
        },
        schema: {
          fields: @fields.to_a,
        }
      }, {})
    rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
      # api_error? -> client cache clear
      @cached_client = nil

      message = e.message
      if e.status_code == 409 && /Already Exists:/ =~ message
        # ignore 'Already Exists' error
        return
      end
      log.error "tables.insert API", :project_id => @project, :dataset => @dataset, :table => table_id, :code => e.status_code, :message => message
      raise "failed to create table in bigquery" # TODO: error class
    end

    def replace_record_key(record)
      new_record = {}
      record.each do |key, _|
        new_key = key
        @regexps.each do |regexp, replacement|
          new_key = new_key.gsub(/#{regexp}/, replacement)
        end
        new_key = new_key.gsub(/\W/, '')
        new_record.store(new_key, record[key])
      end
      new_record
    end

    def write(chunk)
      table_id_format = @tables_mutex.synchronize do
        t = @tables_queue.shift
        @tables_queue.push t
        t
      end
      table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), chunk)
      _write(chunk, table_id)
    end

    def fetch_schema
      table_id_format = @tablelist[0]
      table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
      res = client.get_table(@project, @dataset, table_id)

      schema = res.schema.fields.as_json
      log.debug "Load schema from BigQuery: #{@project}:#{@dataset}.#{table_id} #{schema}"
      @fields.load_schema(schema, false)
    rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
      # api_error? -> client cache clear
      @cached_client = nil
      message = e.message
      log.error "tables.get API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
      raise "failed to fetch schema from bigquery" # TODO: error class
    end

    module InsertImplementation
      def format(tag, time, record)
        buf = ''

        if @replace_record_key
          record = replace_record_key(record)
        end
        row = @fields.format(@add_time_field.call(record, time))
        unless row.empty?
          row = {"json" => row}
          row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
          buf << row.to_msgpack
        end
        buf
      end

      def _write(chunk, table_id)
        rows = []
        chunk.msgpack_each do |row_object|
          # TODO: row size limit
          rows << row_object.deep_symbolize_keys
        end

        res = client.insert_all_table_data(@project, @dataset, table_id, {
          rows: rows,
          skip_invalid_rows: @skip_invalid_rows,
          ignore_unknown_values: @ignore_unknown_values,
        }, {})

        if res.insert_errors
          reasons = []
          res.insert_errors.each do |i|
            i.errors.each do |e|
              reasons << e.reason
              log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, message: e.message, reason: e.reason
            end
          end

          raise "failed to insert into bigquery, retry" if reasons.find { |r| r == "backendError" }
          return if reasons.all? { |r| r == "invalid" } && @skip_invalid_rows
          flush_secondary(@secondary) if @secondary
        end
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        # api_error? -> client cache clear
        @cached_client = nil

        if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
          # Table Not Found: Auto Create Table
          create_table(table_id)
          raise "table created. send rows next time."
        end

        log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
        if e.reason == "backendError"
          raise "failed to insert into bigquery, retry" # TODO: error class
        elsif @secondary
          flush_secondary(@secondary)
        end
      end
    end

    module LoadImplementation
      def format(tag, time, record)
        buf = ''

        if @replace_record_key
          record = replace_record_key(record)
        end
        row = @fields.format(@add_time_field.call(record, time))
        unless row.empty?
          buf << MultiJson.dump(row) + "\n"
        end
        buf
      end

      def _write(chunk, table_id)
        res = nil
        create_upload_source(chunk) do |upload_source|
          res = client.insert_job(@project, {
            configuration: {
              load: {
                destination_table: {
                  project_id: @project,
                  dataset_id: @dataset,
                  table_id: table_id,
                },
                schema: {
                  fields: @fields.to_a,
                },
                write_disposition: "WRITE_APPEND",
                source_format: "NEWLINE_DELIMITED_JSON",
                ignore_unknown_values: @ignore_unknown_values,
                max_bad_records: @max_bad_records,
              }
            }
          }, {upload_source: upload_source, content_type: "application/octet-stream"})
        end
        wait_load(res, table_id)
      rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
        # api_error? -> client cache clear
        @cached_client = nil

        log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: e.message, reason: e.reason
        if e.reason == "backendError"
          raise "failed to insert into bigquery, retry" # TODO: error class
        elsif @secondary
          flush_secondary(@secondary)
        end
      end

      private

      def wait_load(res, table_id)
        wait_interval = 10
        _response = res
        until _response.status.state == "DONE"
          log.debug "wait for load job finish", state: _response.status.state
          sleep wait_interval
          _response = client.get_job(@project, _response.job_reference.job_id)
        end

        errors = _response.status.errors
        if errors
          errors.each do |e|
            log.error "job.insert API (rows)", project_id: @project, dataset: @dataset, table: table_id, message: e.message, reason: e.reason
          end
        end

        error_result = _response.status.error_result
        if error_result
          log.error "job.insert API (result)", project_id: @project, dataset: @dataset, table: table_id, message: error_result.message, reason: error_result.reason
          if _response.status.error_result.reason == "backendError"
            raise "failed to load into bigquery"
          elsif @secondary
            flush_secondary(@secondary)
          end
        end

        log.debug "finish load job", state: _response.status.state
      end

      def create_upload_source(chunk)
        chunk_is_file = @buffer_type == 'file'
        if chunk_is_file
          File.open(chunk.path) do |file|
            yield file
          end
        else
          Tempfile.open("chunk-tmp") do |file|
            file.binmode
            chunk.write_to(file)
            file.sync
            file.rewind
            yield file
          end
        end
      end
    end

    class FieldSchema
      def initialize(name, mode = :nullable)
        unless [:nullable, :required, :repeated].include?(mode)
          raise ConfigError, "Unrecognized mode for #{name}: #{mode}"
        end
        ### https://developers.google.com/bigquery/docs/tables
        # Each field has the following properties:
        #
        # name - The name must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
        #        and must start with a letter or underscore. The maximum length is 128 characters.
        #        https://cloud.google.com/bigquery/docs/reference/v2/tables#schema.fields.name
        unless name =~ /^[_A-Za-z][_A-Za-z0-9]{,127}$/
          raise Fluent::ConfigError, "invalid bigquery field name: '#{name}'"
        end

        @name = name
        @mode = mode
      end

      attr_reader :name, :mode

      def format(value)
        case @mode
        when :nullable
          format_one(value) unless value.nil?
        when :required
          raise "Required field #{name} cannot be null" if value.nil?
          format_one(value)
        when :repeated
          value.nil? ? [] : value.map {|v| format_one(v) }
        end
      end

      def format_one(value)
        raise NotImplementedError, "Must implement in a subclass"
      end

      def to_h
        {
          :name => name,
          :type => type.to_s.upcase,
          :mode => mode.to_s.upcase,
        }
      end
    end

    class StringFieldSchema < FieldSchema
      def type
        :string
      end

      def format_one(value)
        value.to_s
      end
    end

    class IntegerFieldSchema < FieldSchema
      def type
        :integer
      end

      def format_one(value)
        value.to_i
      end
    end

    class FloatFieldSchema < FieldSchema
      def type
        :float
      end

      def format_one(value)
        value.to_f
      end
    end

    class BooleanFieldSchema < FieldSchema
      def type
        :boolean
      end

      def format_one(value)
        !!value
      end
    end

    class TimestampFieldSchema < FieldSchema
      def type
        :timestamp
      end

      def format_one(value)
        value
      end
    end

    class RecordSchema < FieldSchema
      FIELD_TYPES = {
        string: StringFieldSchema,
        integer: IntegerFieldSchema,
        float: FloatFieldSchema,
        boolean: BooleanFieldSchema,
        timestamp: TimestampFieldSchema,
        record: RecordSchema
      }.freeze

      def initialize(name, mode = :nullable)
        super(name, mode)
        @fields = {}
      end

      def type
        :record
      end

      def [](name)
        @fields[name]
      end

      def to_a
        @fields.map do |_, field_schema|
          field_schema.to_h
        end
      end

      def to_h
        {
          :name => name,
          :type => type.to_s.upcase,
          :mode => mode.to_s.upcase,
          :fields => self.to_a,
        }
      end

      def load_schema(schema, allow_overwrite=true)
        schema.each do |field|
          raise ConfigError, 'field must have type' unless field.key?('type')

          name = field['name']
          mode = (field['mode'] || 'nullable').downcase.to_sym

          type = field['type'].downcase.to_sym
          field_schema_class = FIELD_TYPES[type]
          raise ConfigError, "Invalid field type: #{field['type']}" unless field_schema_class

          next if @fields.key?(name) and !allow_overwrite

          field_schema = field_schema_class.new(name, mode)
          @fields[name] = field_schema
          if type == :record
            raise ConfigError, "record field must have fields" unless field.key?('fields')
            field_schema.load_schema(field['fields'], allow_overwrite)
          end
        end
      end

      def register_field(name, type)
        if @fields.key?(name) and @fields[name].type != :timestamp
          raise ConfigError, "field #{name} is registered twice"
        end
        if name[/\./]
          recordname = $`
          fieldname = $'
          register_record_field(recordname)
          @fields[recordname].register_field(fieldname, type)
        else
          schema = FIELD_TYPES[type]
          raise ConfigError, "[Bug] Invalid field type #{type}" unless schema
          @fields[name] = schema.new(name)
        end
      end

      def format_one(record)
        out = {}
        @fields.each do |key, schema|
          value = record[key]
          formatted = schema.format(value)
          next if formatted.nil? # field does not exist, or null value
          out[key] = formatted
        end
        out
      end

      private
      def register_record_field(name)
        if !@fields.key?(name)
          @fields[name] = RecordSchema.new(name)
        else
          unless @fields[name].kind_of?(RecordSchema)
            raise ConfigError, "field #{name} is required to be a record but already registered as #{@fields[name]}"
          end
        end
      end
    end
  end
end
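
As a reference for the schema_path option, RecordSchema#load_schema above expects a JSON array of field objects carrying 'name', 'type', an optional 'mode' (NULLABLE, REQUIRED, REPEATED), and nested 'fields' for RECORD types (compare the bundled test/plugin/testdata/*.schema files). A minimal hand-written example, with hypothetical field names, could look like:

  [
    {"name": "time",   "type": "TIMESTAMP", "mode": "REQUIRED"},
    {"name": "status", "type": "INTEGER"},
    {"name": "path",   "type": "STRING"},
    {"name": "remote", "type": "RECORD", "mode": "NULLABLE", "fields": [
      {"name": "host", "type": "STRING"},
      {"name": "user", "type": "STRING"}
    ]}
  ]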