fluent-plugin-bigquery 0.4.4 → 0.5.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
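The headline change in this release is the move from Fluentd v0.12's TimeSlicedOutput to the v0.14 Output API: buffering is now driven by a <buffer> section, `helpers :inject` replaces the old time/tag mixins, and project, dataset and table names go through placeholder extraction. As orientation before the raw diff, here is a minimal, hypothetical streaming-insert configuration in the style the new configure_for_insert defaults expect; the project, dataset, key path and schema path are illustrative assumptions, and the buffer keys shown are exactly the ones the diff fills in when they are not set explicitly.

<match dummy>
  @type bigquery
  method insert                      # streaming inserts; "load" switches to load jobs

  auth_method json_key               # assumption: any supported auth_method works here
  json_key /path/to/keyfile.json     # illustrative path

  project yourproject_id             # illustrative identifiers
  dataset yourdataset_id
  table   accesslog

  schema_path /path/to/schema.json   # illustrative path

  # v0.14-style buffer section; when present, the new configure_for_insert
  # fills in these keys unless they are set explicitly (values are its defaults)
  <buffer>
    @type memory
    flush_interval 0.25
    chunk_limit_size 1m              # 1 * 1024 ** 2 in the code
    total_limit_size 1g              # 1 * 1024 ** 3 in the code
    chunk_records_limit 500
  </buffer>
</match>

The raw diff follows.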
@@ -113,15 +113,9 @@ module Fluent
  @client = nil

  reason = e.respond_to?(:reason) ? e.reason : nil
- error_data = { project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason }
- wrapped = Fluent::BigQuery::Error.wrap(e)
- if wrapped.retryable?
- log.warn "tabledata.insertAll API", error_data
- else
- log.error "tabledata.insertAll API", error_data
- end
+ log.error "tabledata.insertAll API", project_id: project, dataset: dataset, table: table_id, code: e.status_code, message: e.message, reason: reason

- raise wrapped
+ raise Fluent::BigQuery::Error.wrap(e)
  end

  def create_load_job(chunk_id, project, dataset, table_id, upload_source, fields)
@@ -2,520 +2,511 @@

  require 'fluent/plugin/bigquery/version'

- require 'fluent/mixin/config_placeholders'
- require 'fluent/mixin/plaintextformatter'
-
  require 'fluent/plugin/bigquery/errors'
  require 'fluent/plugin/bigquery/schema'
  require 'fluent/plugin/bigquery/writer'

- ## TODO: load implementation
- # require 'fluent/plugin/bigquery/load_request_body_wrapper'
-
  module Fluent
- class BigQueryOutput < TimeSlicedOutput
- Fluent::Plugin.register_output('bigquery', self)
-
- # https://developers.google.com/bigquery/browser-tool-quickstart
- # https://developers.google.com/bigquery/bigquery-api-quickstart
-
- ### default for insert
- def configure_for_insert(conf)
- raise ConfigError unless conf["method"] != "load"
-
- conf["buffer_type"] = "lightening" unless conf["buffer_type"]
- conf["flush_interval"] = 0.25 unless conf["flush_interval"]
- conf["try_flush_interval"] = 0.05 unless conf["try_flush_interval"]
- conf["buffer_chunk_limit"] = 1 * 1024 ** 2 unless conf["buffer_chunk_limit"] # 1MB
- conf["buffer_queue_limit"] = 1024 unless conf["buffer_queue_limit"]
- conf["buffer_chunk_records_limit"] = 500 unless conf["buffer_chunk_records_limit"]
- end
-
- ### default for loads
- def configure_for_load(conf)
- raise ConfigError unless conf["method"] == "load"
-
- # buffer_type, flush_interval, try_flush_interval is TimeSlicedOutput default
- conf["buffer_chunk_limit"] = 1 * 1024 ** 3 unless conf["buffer_chunk_limit"] # 1GB
- conf["buffer_queue_limit"] = 32 unless conf["buffer_queue_limit"]
- end
+ module Plugin
+ class BigQueryOutput < Output
+ Fluent::Plugin.register_output('bigquery', self)
+
+ helpers :inject
+
+ # https://developers.google.com/bigquery/browser-tool-quickstart
+ # https://developers.google.com/bigquery/bigquery-api-quickstart
+
+ ### default for insert
+ def configure_for_insert(conf)
+ raise ConfigError unless conf["method"].nil? || conf["method"] == "insert"
+
+ buffer_config = conf.elements("buffer")[0]
+ return unless buffer_config
+ buffer_config["@type"] = "memory" unless buffer_config["@type"]
+ buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
+ buffer_config["flush_interval"] = 0.25 unless buffer_config["flush_interval"]
+ buffer_config["flush_thread_interval"] = 0.05 unless buffer_config["flush_thread_interval"]
+ buffer_config["flush_thread_burst_interval"] = 0.05 unless buffer_config["flush_thread_burst_interval"]
+ buffer_config["chunk_limit_size"] = 1 * 1024 ** 2 unless buffer_config["chunk_limit_size"] # 1MB
+ buffer_config["total_limit_size"] = 1 * 1024 ** 3 unless buffer_config["total_limit_size"] # 1GB
+ buffer_config["chunk_records_limit"] = 500 unless buffer_config["chunk_records_limit"]
+ end

- # Available methods are:
- # * private_key -- Use service account credential from pkcs12 private key file
- # * compute_engine -- Use access token available in instances of ComputeEngine
- # * json_key -- Use service account credential from JSON key
- # * application_default -- Use application default credential
- config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
-
- ### Service Account credential
- config_param :email, :string, default: nil
- config_param :private_key_path, :string, default: nil
- config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
- config_param :json_key, default: nil, secret: true
-
- # see as simple reference
- # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
- config_param :project, :string
-
- # dataset_name
- # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
- # but it cannot start with a number or underscore, or have spaces.
- config_param :dataset, :string
-
- # table_id
- # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
- config_param :table, :string, default: nil
- config_param :tables, :string, default: nil # TODO: use :array with value_type: :string
-
- # template_suffix (only insert)
- # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
- config_param :template_suffix, :string, default: nil
-
- config_param :auto_create_table, :bool, default: false
-
- # skip_invalid_rows (only insert)
- # Insert all valid rows of a request, even if invalid rows exist.
- # The default value is false, which causes the entire request to fail if any invalid rows exist.
- config_param :skip_invalid_rows, :bool, default: false
- # max_bad_records (only load)
- # The maximum number of bad records that BigQuery can ignore when running the job.
- # If the number of bad records exceeds this value, an invalid error is returned in the job result.
- # The default value is 0, which requires that all records are valid.
- config_param :max_bad_records, :integer, default: 0
- # ignore_unknown_values
- # Accept rows that contain values that do not match the schema. The unknown values are ignored.
- # Default is false, which treats unknown values as errors.
- config_param :ignore_unknown_values, :bool, default: false
-
- config_param :schema, :array, default: nil
- config_param :schema_path, :string, default: nil
- config_param :fetch_schema, :bool, default: false
- config_param :fetch_schema_table, :string, default: nil
- config_param :schema_cache_expire, :time, default: 600
- config_param :field_string, :string, default: nil
- config_param :field_integer, :string, default: nil
- config_param :field_float, :string, default: nil
- config_param :field_boolean, :string, default: nil
- config_param :field_timestamp, :string, default: nil
- ### TODO: record field stream inserts doesn't works well?
- ### At table creation, table type json + field type record -> field type validation fails
- ### At streaming inserts, schema cannot be specified
- # config_param :field_record, :string, defualt: nil
- # config_param :optional_data_field, :string, default: nil
-
- REGEXP_MAX_NUM = 10
- config_param :replace_record_key, :bool, default: false
- (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
-
- config_param :convert_hash_to_json, :bool, default: false
-
- config_param :time_format, :string, default: nil
- config_param :localtime, :bool, default: nil
- config_param :utc, :bool, default: nil
- config_param :time_field, :string, default: nil
-
- # insert_id_field (only insert)
- config_param :insert_id_field, :string, default: nil
- # prevent_duplicate_load (only load)
- config_param :prevent_duplicate_load, :bool, default: false
-
- # add_insert_timestamp (only insert)
- # adds a timestamp just before sending the rows to bigquery, so that
- # buffering time is not taken into account. Gives a field in bigquery
- # which represents the insert time of the row.
- config_param :add_insert_timestamp, :string, default: nil
-
- config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
-
- # allow_retry_insert_errors (only insert)
- # If insert_id_field is not specified, true means to allow duplicate rows
- config_param :allow_retry_insert_errors, :bool, default: false
-
- # TODO
- # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
- # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
- # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
- ### method: ''Streaming data inserts support
- # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
- # Maximum row size: 100 KB
- # Maximum data size of all rows, per insert: 1 MB
- # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
- # If you exceed 100 rows per second for an extended period of time, throttling might occur.
- ### Toooooooooooooo short/small per inserts and row!
-
- ## Timeout
- # request_timeout_sec
- # Bigquery API response timeout
- # request_open_timeout_sec
- # Bigquery API connection, and request timeout
- config_param :request_timeout_sec, :time, default: nil
- config_param :request_open_timeout_sec, :time, default: 60
-
- ## Partitioning
- config_param :time_partitioning_type, :enum, list: [:day], default: nil
- config_param :time_partitioning_expiration, :time, default: nil
-
- ### Table types
- # https://developers.google.com/bigquery/docs/tables
- #
- # type - The following data types are supported; see Data Formats for details on each data type:
- # STRING
- # INTEGER
- # FLOAT
- # BOOLEAN
- # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
- #
- # mode - Whether a field can be null. The following values are supported:
- # NULLABLE - The cell can be null.
- # REQUIRED - The cell cannot be null.
- # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
-
- def initialize
- super
- require 'multi_json'
- require 'google/apis/bigquery_v2'
- require 'googleauth'
- require 'active_support/json'
- require 'active_support/core_ext/hash'
- require 'active_support/core_ext/object/json'
-
- # MEMO: signet-0.6.1 depend on Farady.default_connection
- Faraday.default_connection.options.timeout = 60
- end
+ ### default for loads
+ def configure_for_load(conf)
+ raise ConfigError unless conf["method"] == "load"
 
- def configure(conf)
- if conf["method"] == "load"
- configure_for_load(conf)
- else
- configure_for_insert(conf)
+ buffer_config = conf.elements("buffer")[0]
+ return unless buffer_config
+ buffer_config["@type"] = "file" unless buffer_config["@type"]
+ buffer_config["flush_mode"] = :interval unless buffer_config["flush_mode"]
+ buffer_config["chunk_limit_size"] = 1 * 1024 ** 3 unless buffer_config["chunk_limit_size"] # 1GB
+ buffer_config["total_limit_size"] = 32 * 1024 ** 3 unless buffer_config["total_limit_size"] # 32GB
  end
- super
-
- case @method
- when :insert
- extend(InsertImplementation)
- when :load
- raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
- extend(LoadImplementation)
- else
- raise Fluent::ConfigError "'method' must be 'insert' or 'load'"
+
+ # Available methods are:
+ # * private_key -- Use service account credential from pkcs12 private key file
+ # * compute_engine -- Use access token available in instances of ComputeEngine
+ # * json_key -- Use service account credential from JSON key
+ # * application_default -- Use application default credential
+ config_param :auth_method, :enum, list: [:private_key, :compute_engine, :json_key, :application_default], default: :private_key
+
+ ### Service Account credential
+ config_param :email, :string, default: nil
+ config_param :private_key_path, :string, default: nil
+ config_param :private_key_passphrase, :string, default: 'notasecret', secret: true
+ config_param :json_key, default: nil, secret: true
+
+ # see as simple reference
+ # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
+ config_param :project, :string
+
+ # dataset_name
+ # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
+ # but it cannot start with a number or underscore, or have spaces.
+ config_param :dataset, :string
+
+ # table_id
+ # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
+ config_param :table, :string, default: nil
+ config_param :tables, :array, value_type: :string, default: nil
+
+ # template_suffix (only insert)
+ # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+ config_param :template_suffix, :string, default: nil
+
+ config_param :auto_create_table, :bool, default: false
+
+ # skip_invalid_rows (only insert)
+ # Insert all valid rows of a request, even if invalid rows exist.
+ # The default value is false, which causes the entire request to fail if any invalid rows exist.
+ config_param :skip_invalid_rows, :bool, default: false
+ # max_bad_records (only load)
+ # The maximum number of bad records that BigQuery can ignore when running the job.
+ # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+ # The default value is 0, which requires that all records are valid.
+ config_param :max_bad_records, :integer, default: 0
+ # ignore_unknown_values
+ # Accept rows that contain values that do not match the schema. The unknown values are ignored.
+ # Default is false, which treats unknown values as errors.
+ config_param :ignore_unknown_values, :bool, default: false
+
+ config_param :schema, :array, default: nil
+ config_param :schema_path, :string, default: nil
+ config_param :fetch_schema, :bool, default: false
+ config_param :fetch_schema_table, :string, default: nil
+ config_param :schema_cache_expire, :time, default: 600
+ config_param :field_string, :array, value_type: :string, default: nil
+ config_param :field_integer, :array, value_type: :string, default: nil
+ config_param :field_float, :array, value_type: :string, default: nil
+ config_param :field_boolean, :array, value_type: :string, default: nil
+ config_param :field_timestamp, :array, value_type: :string, default: nil
+ ### TODO: record field stream inserts doesn't works well?
+ ### At table creation, table type json + field type record -> field type validation fails
+ ### At streaming inserts, schema cannot be specified
+ # config_param :field_record, :string, defualt: nil
+ # config_param :optional_data_field, :string, default: nil
+
+ REGEXP_MAX_NUM = 10
+ config_param :replace_record_key, :bool, default: false
+ (1..REGEXP_MAX_NUM).each {|i| config_param :"replace_record_key_regexp#{i}", :string, default: nil }
+
+ config_param :convert_hash_to_json, :bool, default: false
+
+ # insert_id_field (only insert)
+ config_param :insert_id_field, :string, default: nil
+ # prevent_duplicate_load (only load)
+ config_param :prevent_duplicate_load, :bool, default: false
+
+ config_param :method, :enum, list: [:insert, :load], default: :insert, skip_accessor: true
+
+ # allow_retry_insert_errors (only insert)
+ # If insert_id_field is not specified, true means to allow duplicate rows
+ config_param :allow_retry_insert_errors, :bool, default: false
+
+ # TODO
+ # config_param :row_size_limit, :integer, default: 100*1000 # < 100KB # configurable in google ?
+ # config_param :insert_size_limit, :integer, default: 1000**2 # < 1MB
+ # config_param :rows_per_second_limit, :integer, default: 1000 # spike limit
+ ### method: ''Streaming data inserts support
+ # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
+ # Maximum row size: 100 KB
+ # Maximum data size of all rows, per insert: 1 MB
+ # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
+ # If you exceed 100 rows per second for an extended period of time, throttling might occur.
+ ### Toooooooooooooo short/small per inserts and row!
+
+ ## Timeout
+ # request_timeout_sec
+ # Bigquery API response timeout
+ # request_open_timeout_sec
+ # Bigquery API connection, and request timeout
+ config_param :request_timeout_sec, :time, default: nil
+ config_param :request_open_timeout_sec, :time, default: 60
+
+ ## Partitioning
+ config_param :time_partitioning_type, :enum, list: [:day], default: nil
+ config_param :time_partitioning_expiration, :time, default: nil
+
+ ### Table types
+ # https://developers.google.com/bigquery/docs/tables
+ #
+ # type - The following data types are supported; see Data Formats for details on each data type:
+ # STRING
+ # INTEGER
+ # FLOAT
+ # BOOLEAN
+ # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
+ #
+ # mode - Whether a field can be null. The following values are supported:
+ # NULLABLE - The cell can be null.
+ # REQUIRED - The cell cannot be null.
+ # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
+
+ def initialize
+ super
+ require 'multi_json'
+ require 'google/apis/bigquery_v2'
+ require 'googleauth'
+ require 'active_support/json'
+ require 'active_support/core_ext/hash'
+ require 'active_support/core_ext/object/json'
+
+ # MEMO: signet-0.6.1 depend on Farady.default_connection
+ Faraday.default_connection.options.timeout = 60
  end
 
- case @auth_method
- when :private_key
- unless @email && @private_key_path
- raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+ def configure(conf)
+ if conf["method"] == "load"
+ configure_for_load(conf)
+ else
+ configure_for_insert(conf)
  end
- when :compute_engine
- # Do nothing
- when :json_key
- unless @json_key
- raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+ super
+
+ case @method
+ when :insert
+ extend(InsertImplementation)
+ when :load
+ raise Fluent::ConfigError, "'template_suffix' is for only `insert` mode, instead use 'fetch_schema_table' and formatted table name" if @template_suffix
+ extend(LoadImplementation)
  end
- when :application_default
- # Do nothing
- else
- raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
- end

- unless @table.nil? ^ @tables.nil?
- raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
- end
+ case @auth_method
+ when :private_key
+ unless @email && @private_key_path
+ raise Fluent::ConfigError, "'email' and 'private_key_path' must be specified if auth_method == 'private_key'"
+ end
+ when :compute_engine
+ # Do nothing
+ when :json_key
+ unless @json_key
+ raise Fluent::ConfigError, "'json_key' must be specified if auth_method == 'json_key'"
+ end
+ when :application_default
+ # Do nothing
+ else
+ raise Fluent::ConfigError, "unrecognized 'auth_method': #{@auth_method}"
+ end

- @tablelist = @tables ? @tables.split(',') : [@table]
+ unless @table.nil? ^ @tables.nil?
+ raise Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid"
+ end

- legacy_schema_config_deprecation
- @fields = Fluent::BigQuery::RecordSchema.new('record')
- if @schema
- @fields.load_schema(@schema)
- end
- if @schema_path
- @fields.load_schema(MultiJson.load(File.read(@schema_path)))
- end
+ @tablelist = @tables ? @tables : [@table]

- types = %w(string integer float boolean timestamp)
- types.each do |type|
- raw_fields = instance_variable_get("@field_#{type}")
- next unless raw_fields
- raw_fields.split(',').each do |field|
- @fields.register_field field.strip, type.to_sym
+ legacy_schema_config_deprecation
+ @table_schema = Fluent::BigQuery::RecordSchema.new('record')
+ if @schema
+ @table_schema.load_schema(@schema)
+ end
+ if @schema_path
+ @table_schema.load_schema(MultiJson.load(File.read(@schema_path)))
  end
- end

- @regexps = {}
- (1..REGEXP_MAX_NUM).each do |i|
- next unless conf["replace_record_key_regexp#{i}"]
- regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
- raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
- raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
- @regexps[regexp] = replacement
- end
+ types = %i(string integer float boolean timestamp)
+ types.each do |type|
+ fields = instance_variable_get("@field_#{type}")
+ next unless fields
+ fields.each do |field|
+ @table_schema.register_field field, type
+ end
+ end
 
- @localtime = false if @localtime.nil? && @utc
+ @regexps = {}
+ (1..REGEXP_MAX_NUM).each do |i|
+ next unless conf["replace_record_key_regexp#{i}"]
+ regexp, replacement = conf["replace_record_key_regexp#{i}"].split(/ /, 2)
+ raise ConfigError, "replace_record_key_regexp#{i} does not contain 2 parameters" unless replacement
+ raise ConfigError, "replace_record_key_regexp#{i} contains a duplicated key, #{regexp}" if @regexps[regexp]
+ @regexps[regexp] = replacement
+ end

- @timef = TimeFormatter.new(@time_format, @localtime)
+ if @insert_id_field
+ insert_id_keys = @insert_id_field.split('.')
+ @get_insert_id = ->(record) {
+ insert_id_keys.inject(record) {|h, k| h[k] }
+ }
+ else
+ @get_insert_id = nil
+ end

- if @time_field
- keys = @time_field.split('.')
- last_key = keys.pop
- @add_time_field = ->(record, time) {
- keys.inject(record) { |h, k| h[k] ||= {} }[last_key] = @timef.format(time)
- record
- }
- else
- @add_time_field = ->(record, time) { record }
+ placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
+ placeholder_validate!(:bigquery, placeholder_params)
+
+ warn "[DEPRECATION] `convert_hash_to_json` param is deprecated. If Hash value is inserted string field, plugin convert it to json automatically." if @convert_hash_to_json
  end

- if @insert_id_field
- insert_id_keys = @insert_id_field.split('.')
- @get_insert_id = ->(record) {
- insert_id_keys.inject(record) {|h, k| h[k] }
- }
- else
- @get_insert_id = nil
+ def start
+ super
+
+ @tables_queue = @tablelist.shuffle
+ @tables_mutex = Mutex.new
+ @fetched_schemas = {}
+ @last_fetch_schema_time = Hash.new(0)
  end

- warn "[DEPRECATION] `convert_hash_to_json` param is deprecated. If Hash value is inserted string field, plugin convert it to json automatically." if @convert_hash_to_json
- end
+ def writer
+ @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
+ private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
+ email: @email,
+ json_key: @json_key,
+ skip_invalid_rows: @skip_invalid_rows,
+ ignore_unknown_values: @ignore_unknown_values,
+ max_bad_records: @max_bad_records,
+ allow_retry_insert_errors: @allow_retry_insert_errors,
+ prevent_duplicate_load: @prevent_duplicate_load,
+ auto_create_table: @auto_create_table,
+ time_partitioning_type: @time_partitioning_type,
+ time_partitioning_expiration: @time_partitioning_expiration,
+ timeout_sec: @request_timeout_sec,
+ open_timeout_sec: @request_open_timeout_sec,
+ })
+ end

- def start
- super
+ def replace_record_key(record)
+ new_record = {}
+ record.each do |key, _|
+ new_key = key
+ @regexps.each do |regexp, replacement|
+ new_key = new_key.gsub(/#{regexp}/, replacement)
+ end
+ new_key = new_key.gsub(/\W/, '')
+ new_record.store(new_key, record[key])
+ end
+ new_record
+ end

- @tables_queue = @tablelist.dup.shuffle
- @tables_mutex = Mutex.new
- @fetch_schema_mutex = Mutex.new
+ def convert_hash_to_json(record)
+ record.each do |key, value|
+ if value.class == Hash
+ record[key] = MultiJson.dump(value)
+ end
+ end
+ record
+ end
 
- @last_fetch_schema_time = 0
- fetch_schema(false) if @fetch_schema
- end
+ def format(tag, time, record)
+ if @replace_record_key
+ record = replace_record_key(record)
+ end

- def writer
- @writer ||= Fluent::BigQuery::Writer.new(@log, @auth_method, {
- private_key_path: @private_key_path, private_key_passphrase: @private_key_passphrase,
- email: @email,
- json_key: @json_key,
- skip_invalid_rows: @skip_invalid_rows,
- ignore_unknown_values: @ignore_unknown_values,
- max_bad_records: @max_bad_records,
- allow_retry_insert_errors: @allow_retry_insert_errors,
- prevent_duplicate_load: @prevent_duplicate_load,
- auto_create_table: @auto_create_table,
- time_partitioning_type: @time_partitioning_type,
- time_partitioning_expiration: @time_partitioning_expiration,
- timeout_sec: @request_timeout_sec,
- open_timeout_sec: @request_open_timeout_sec,
- })
- end
+ if @convert_hash_to_json
+ record = convert_hash_to_json(record)
+ end
+
+ record = inject_values_to_record(tag, time, record)

- def generate_table_id(table_id_format, current_time, row = nil, chunk = nil)
- format, col = table_id_format.split(/@/)
- time = if col && row
- keys = col.split('.')
- t = keys.inject(row[:json]) {|obj, attr| obj[attr.to_sym] }
- Time.at(t)
- else
- current_time
- end
- if row && format =~ /\$\{/
- format.gsub!(/\$\{\s*(\w+)\s*\}/) do |m|
- row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
+ begin
+ meta = metadata(tag, time, record)
+ schema =
+ if @fetch_schema
+ fetch_schema(meta)
+ else
+ @table_schema
+ end
+ ensure
+ @buffer.metadata_list.delete(meta)
  end
- end
- table_id = time.strftime(format)
-
- if chunk
- table_id.gsub(%r(%{time_slice})) { |expr|
- chunk.key
- }
- else
- table_id.gsub(%r(%{time_slice})) { |expr|
- current_time.strftime(@time_slice_format)
- }
- end
- end

- def replace_record_key(record)
- new_record = {}
- record.each do |key, _|
- new_key = key
- @regexps.each do |regexp, replacement|
- new_key = new_key.gsub(/#{regexp}/, replacement)
+ begin
+ buf = String.new
+ row = schema.format(record)
+ unless row.empty?
+ buf << MultiJson.dump(row) + "\n"
+ end
+ buf
+ rescue
+ log.error("format error", record: record, schema: schema)
+ raise
  end
- new_key = new_key.gsub(/\W/, '')
- new_record.store(new_key, record[key])
  end
- new_record
- end
 
- def convert_hash_to_json(record)
- record.each do |key, value|
- if value.class == Hash
- record[key] = MultiJson.dump(value)
+ def write(chunk)
+ table_id_format = @tables_mutex.synchronize do
+ t = @tables_queue.shift
+ @tables_queue.push t
+ t
  end
+ _write(chunk, table_id_format)
  end
- record
- end

- def legacy_schema_config_deprecation
- if [@field_string, @field_integer, @field_float, @field_boolean, @field_timestamp].any?
- warn "[DEPRECATION] `field_*` style schema config is deprecated. Instead of it, use `schema` config params that is array of json style."
+ def legacy_schema_config_deprecation
+ if [@field_string, @field_integer, @field_float, @field_boolean, @field_timestamp].any?
+ warn "[DEPRECATION] `field_*` style schema config is deprecated. Instead of it, use `schema` config params that is array of json style."
+ end
  end
- end

- def write(chunk)
- table_id_format = @tables_mutex.synchronize do
- t = @tables_queue.shift
- @tables_queue.push t
- t
- end
- template_suffix_format = @template_suffix
- _write(chunk, table_id_format, template_suffix_format)
- end
+ def fetch_schema(metadata)
+ table_id = nil
+ project = extract_placeholders(@project, metadata)
+ dataset = extract_placeholders(@dataset, metadata)
+ table_id = fetch_schema_target_table(metadata)

- def fetch_schema(allow_overwrite = true)
- table_id = nil
- @fetch_schema_mutex.synchronize do
- if Fluent::Engine.now - @last_fetch_schema_time > @schema_cache_expire
- table_id_format = @fetch_schema_table || @tablelist[0]
- table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now))
- schema = writer.fetch_schema(@project, @dataset, table_id)
+ if Fluent::Engine.now - @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] > @schema_cache_expire
+ schema = writer.fetch_schema(project, dataset, table_id)

  if schema
- if allow_overwrite
- fields = Fluent::BigQuery::RecordSchema.new("record")
- fields.load_schema(schema, allow_overwrite)
- @fields = fields
- else
- @fields.load_schema(schema, allow_overwrite)
- end
+ table_schema = Fluent::BigQuery::RecordSchema.new("record")
+ table_schema.load_schema(schema)
+ @fetched_schemas["#{project}.#{dataset}.#{table_id}"] = table_schema
  else
- if @fields.empty?
+ if @fetched_schemas["#{project}.#{dataset}.#{table_id}"].empty?
  raise "failed to fetch schema from bigquery"
  else
  log.warn "#{table_id} uses previous schema"
  end
  end

- @last_fetch_schema_time = Fluent::Engine.now
+ @last_fetch_schema_time["#{project}.#{dataset}.#{table_id}"] = Fluent::Engine.now
  end
+
+ @fetched_schemas["#{project}.#{dataset}.#{table_id}"]
  end
- end

- module InsertImplementation
- def format(tag, time, record)
- fetch_schema if @template_suffix
+ def fetch_schema_target_table(metadata)
+ extract_placeholders(@fetch_schema_table || @tablelist[0], metadata)
+ end
 
- if @replace_record_key
- record = replace_record_key(record)
+ def get_schema(project, dataset, metadata)
+ if @fetch_schema
+ @fetched_schemas["#{project}.#{dataset}.#{fetch_schema_target_table(metadata)}"] || fetch_schema(metadata)
+ else
+ @table_schema
  end
+ end

- if @convert_hash_to_json
- record = convert_hash_to_json(record)
- end
+ module InsertImplementation
+ def _write(chunk, table_format)
+ rows = chunk.open do |io|
+ io.map do |line|
+ record = MultiJson.load(line)
+ row = {"json" => record}
+ row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+ row.deep_symbolize_keys
+ end
+ end

- buf = String.new
- row = @fields.format(@add_time_field.call(record, time))
- unless row.empty?
- row = {"json" => row}
- row['insert_id'] = @get_insert_id.call(record) if @get_insert_id
- buf << row.to_msgpack
- end
- buf
- end
+ project = extract_placeholders(@project, chunk.metadata)
+ dataset = extract_placeholders(@dataset, chunk.metadata)
+ table_id = extract_placeholders(table_format, chunk.metadata)
+ template_suffix = @template_suffix ? extract_placeholders(@template_suffix, chunk.metadata) : nil

- def _write(chunk, table_format, template_suffix_format)
- now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
- rows = []
- chunk.msgpack_each do |row_object|
- # TODO: row size limit
- row_object["json"][@add_insert_timestamp] = now if @add_insert_timestamp
- rows << row_object.deep_symbolize_keys
- end
+ schema = get_schema(project, dataset, chunk.metadata)

- now = Time.at(Fluent::Engine.now)
- group = rows.group_by do |row|
- [
- generate_table_id(table_format, now, row, chunk),
- template_suffix_format ? generate_table_id(template_suffix_format, now, row, chunk) : nil,
- ]
- end
- group.each do |(table_id, template_suffix), group_rows|
- insert(table_id, group_rows, template_suffix)
+ insert(project, dataset, table_id, rows, schema, template_suffix)
  end
- end

- def insert(table_id, rows, template_suffix)
- writer.insert_rows(@project, @dataset, table_id, rows, template_suffix: template_suffix)
- rescue Fluent::BigQuery::Error => e
- if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
- # Table Not Found: Auto Create Table
- writer.create_table(@project, @dataset, table_id, @fields)
- raise "table created. send rows next time."
- end
+ def insert(project, dataset, table_id, rows, schema, template_suffix)
+ writer.insert_rows(project, dataset, table_id, rows, template_suffix: template_suffix)
+ rescue Fluent::BigQuery::Error => e
+ if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ e.message
+ # Table Not Found: Auto Create Table
+ writer.create_table(project, dataset, table_id, schema)
+ raise "table created. send rows next time."
+ end
 
- if e.retryable?
- raise e # TODO: error class
- elsif @secondary
- flush_secondary(@secondary)
+ raise if e.retryable?
+
+ if @secondary
+ # TODO: find better way
+ @retry = retry_state_create(
+ :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+ forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+ max_interval: @buffer_config.retry_max_interval,
+ secondary: true, secondary_threshold: Float::EPSILON,
+ randomize: @buffer_config.retry_randomize
+ )
+ else
+ @retry = retry_state_create(
+ :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+ forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+ max_interval: @buffer_config.retry_max_interval,
+ randomize: @buffer_config.retry_randomize
+ )
+ end
+
+ raise
  end
  end
- end

- module LoadImplementation
- def format(tag, time, record)
- fetch_schema if @fetch_schema_table
+ module LoadImplementation
+ def _write(chunk, table_id_format)
+ project = extract_placeholders(@project, chunk.metadata)
+ dataset = extract_placeholders(@dataset, chunk.metadata)
+ table_id = extract_placeholders(table_id_format, chunk.metadata)

- if @replace_record_key
- record = replace_record_key(record)
- end
-
- if @convert_hash_to_json
- record = convert_hash_to_json(record)
- end
+ schema = get_schema(project, dataset, chunk.metadata)

- buf = String.new
- row = @fields.format(@add_time_field.call(record, time))
- unless row.empty?
- buf << MultiJson.dump(row) + "\n"
+ load(chunk, project, dataset, table_id, schema)
  end
- buf
- end

- def _write(chunk, table_id_format, _)
- now = Time.at(Fluent::Engine.now)
- table_id = generate_table_id(table_id_format, now, nil, chunk)
- load(chunk, table_id)
- end
+ def load(chunk, project, dataset, table_id, schema)
+ res = nil

- def load(chunk, table_id)
- res = nil
+ create_upload_source(chunk) do |upload_source|
+ res = writer.create_load_job(chunk.unique_id, project, dataset, table_id, upload_source, schema)
+ end
+ rescue Fluent::BigQuery::Error => e
+ raise if e.retryable?
+
+ if @secondary
+ # TODO: find better way
+ @retry = retry_state_create(
+ :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+ forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+ max_interval: @buffer_config.retry_max_interval,
+ secondary: true, secondary_threshold: Float::EPSILON,
+ randomize: @buffer_config.retry_randomize
+ )
+ else
+ @retry = retry_state_create(
+ :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+ forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+ max_interval: @buffer_config.retry_max_interval,
+ randomize: @buffer_config.retry_randomize
+ )
+ end
 
- create_upload_source(chunk) do |upload_source|
- res = writer.create_load_job(chunk.unique_id, @project, @dataset, table_id, upload_source, @fields)
- end
- rescue Fluent::BigQuery::Error => e
- if e.retryable?
- raise e
- elsif @secondary
- flush_secondary(@secondary)
+ raise
  end
- end

- private
+ private

- def create_upload_source(chunk)
- chunk_is_file = @buffer_type == 'file'
- if chunk_is_file
- File.open(chunk.path) do |file|
- yield file
- end
- else
- Tempfile.open("chunk-tmp") do |file|
- file.binmode
- chunk.write_to(file)
- file.sync
- file.rewind
- yield file
+ def create_upload_source(chunk)
+ chunk_is_file = @buffer_config["@type"] == 'file'
+ if chunk_is_file
+ File.open(chunk.path) do |file|
+ yield file
+ end
+ else
+ Tempfile.open("chunk-tmp") do |file|
+ file.binmode
+ chunk.write_to(file)
+ file.sync
+ file.rewind
+ yield file
+ end
  end
  end
  end
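
For the load path the same pattern applies with a file buffer: when a <buffer> section is present, the new configure_for_load defaults @type to file, chunk_limit_size to 1 GB and total_limit_size to 32 GB, and create_upload_source hands the buffer chunk file straight to the load job. A correspondingly hypothetical sketch, with paths, identifiers and the flush interval chosen only for illustration:

<match dummy>
  @type bigquery
  method load

  auth_method json_key               # assumption: any supported auth_method works here
  json_key /path/to/keyfile.json     # illustrative path

  project yourproject_id             # illustrative identifiers
  dataset yourdataset_id
  table   accesslog

  schema_path /path/to/schema.json   # illustrative path

  <buffer>
    @type file
    path /var/log/fluentd/bigquery_load_buffer   # illustrative path
    flush_interval 1800                          # illustrative value
    chunk_limit_size 1g                          # 1 * 1024 ** 3 in the code
    total_limit_size 32g                         # 32 * 1024 ** 3 in the code
  </buffer>
</match>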