fluent-plugin-bigquery-test 2.2.0

@@ -0,0 +1,125 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryInsertOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_insert', self)
+
+      helpers :record_accessor
+
+      # template_suffix (only insert)
+      # https://cloud.google.com/bigquery/streaming-data-into-bigquery#template_table_details
+      config_param :template_suffix, :string, default: nil
+
+      # skip_invalid_rows (only insert)
+      # Insert all valid rows of a request, even if invalid rows exist.
+      # The default value is false, which causes the entire request to fail if any invalid rows exist.
+      config_param :skip_invalid_rows, :bool, default: false
+
+      # insert_id_field (only insert)
+      config_param :insert_id_field, :string, default: nil
+
+      # add_insert_timestamp (only insert)
+      # adds a timestamp just before sending the rows to bigquery, so that
+      # buffering time is not taken into account. Gives a field in bigquery
+      # which represents the insert time of the row.
+      config_param :add_insert_timestamp, :string, default: nil
+
+      # allow_retry_insert_errors (only insert)
+      # If insert_id_field is not specified, true means to allow duplicate rows
+      config_param :allow_retry_insert_errors, :bool, default: false
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "memory"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 1
+        config_set_default :flush_thread_interval, 0.05
+        config_set_default :flush_thread_burst_interval, 0.05
+        config_set_default :chunk_limit_size, 1 * 1024 ** 2 # 1MB
+        config_set_default :total_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :chunk_limit_records, 500
+      end
+
+      def configure(conf)
+        super
+
+        if @insert_id_field
+          if @insert_id_field !~ /^\$[\[\.]/ && @insert_id_field =~ /\./
+            warn "[BREAKING CHANGE] insert_id_field format is changed. Use fluentd record_accessor helper. (https://docs.fluentd.org/v1.0/articles/api-plugin-helper-record_accessor)"
+          end
+          @get_insert_id = record_accessor_create(@insert_id_field)
+        end
+
+        formatter_config = conf.elements("format")[0]
+        if formatter_config && formatter_config['@type'] != "json"
+          raise ConfigError, "`bigquery_insert` supports only json formatter."
+        end
+        @formatter = formatter_create(usage: 'out_bigquery_for_insert', type: 'json', conf: formatter_config)
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}/template_suffix=#{@template_suffix}"
+        placeholder_validate!(:bigquery_insert, placeholder_params)
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        now = Time.now.utc.strftime("%Y-%m-%d %H:%M:%S.%6N") if @add_insert_timestamp
+
+        rows = chunk.open do |io|
+          io.map do |line|
+            record = MultiJson.load(line)
+            record[@add_insert_timestamp] = now if @add_insert_timestamp
+            row = {"json" => record}
+            row["insert_id"] = @get_insert_id.call(record) if @get_insert_id
+            Fluent::BigQuery::Helper.deep_symbolize_keys(row)
+          end
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        template_suffix = @template_suffix ? extract_placeholders(@template_suffix, metadata) : nil
+        schema = get_schema(project, dataset, metadata)
+
+        insert(project, dataset, table_id, rows, schema, template_suffix)
+      end
+
+      def insert(project, dataset, table_id, rows, schema, template_suffix)
+        writer.insert_rows(project, dataset, table_id, rows, schema, template_suffix: template_suffix)
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        if @secondary
+          # TODO: find better way
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            secondary: true, secondary_threshold: Float::EPSILON,
+            randomize: @buffer_config.retry_randomize
+          )
+        else
+          @retry = retry_state_create(
+            :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+            forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+            max_interval: @buffer_config.retry_max_interval,
+            randomize: @buffer_config.retry_randomize
+          )
+        end
+
+        raise
+      end
+    end
+  end
+end
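
For reference, a minimal sketch of a streaming-insert configuration that exercises the parameters above (the match pattern, key path, and field names are illustrative; auth_method, json_key, project, dataset, table, and fetch_schema come from the shared base output, not this file):

<match dashboard.**>
  @type bigquery_insert
  auth_method json_key
  json_key /path/to/key.json
  project yourproject_id
  dataset yourdataset_id
  table access_log
  fetch_schema true
  skip_invalid_rows true
  insert_id_field $.request_id
  add_insert_timestamp insert_time
</match>

With insert_id_field, the record_accessor path ($.request_id here) is resolved per record in #write and sent as the row's insert_id so BigQuery can deduplicate retried inserts; add_insert_timestamp stamps each row immediately before the API call, so buffering latency is excluded from the recorded insert time.
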
@@ -0,0 +1,221 @@
+require 'fluent/plugin/out_bigquery_base'
+
+module Fluent
+  module Plugin
+    class BigQueryLoadOutput < BigQueryBaseOutput
+      Fluent::Plugin.register_output('bigquery_load', self)
+
+      helpers :timer
+
+      config_param :source_format, :enum, list: [:json, :avro, :csv], default: :json
+
+      # max_bad_records (only load)
+      # The maximum number of bad records that BigQuery can ignore when running the job.
+      # If the number of bad records exceeds this value, an invalid error is returned in the job result.
+      # The default value is 0, which requires that all records are valid.
+      config_param :max_bad_records, :integer, default: 0
+
+      # prevent_duplicate_load (only load)
+      config_param :prevent_duplicate_load, :bool, default: false
+
+      config_param :use_delayed_commit, :bool, default: true
+      config_param :wait_job_interval, :time, default: 3
+
+      ## Buffer
+      config_section :buffer do
+        config_set_default :@type, "file"
+        config_set_default :flush_mode, :interval
+        config_set_default :flush_interval, 3600 # 1h
+        config_set_default :flush_thread_interval, 5
+        config_set_default :flush_thread_burst_interval, 5
+        config_set_default :chunk_limit_size, 1 * 1024 ** 3 # 1GB
+        config_set_default :total_limit_size, 32 * 1024 ** 3 # 32GB
+
+        config_set_default :delayed_commit_timeout, 1800 # 30m
+      end
+
+      def configure(conf)
+        super
+
+        placeholder_params = "project=#{@project}/dataset=#{@dataset}/table=#{@tablelist.join(",")}/fetch_schema_table=#{@fetch_schema_table}"
+        placeholder_validate!(:bigquery_load, placeholder_params)
+      end
+
+      def start
+        super
+
+        if prefer_delayed_commit
+          @polling_targets = []
+          @polling_mutex = Mutex.new
+          log.debug("start load job polling")
+          timer_execute(:polling_bigquery_load_job, @wait_job_interval, &method(:poll))
+        end
+      end
+
+      def prefer_delayed_commit
+        @use_delayed_commit
+      end
+
+      # for Fluent::Plugin::Output#implement? method
+      def format(tag, time, record)
+        super
+      end
+
+      def write(chunk)
+        job_reference = do_write(chunk)
+
+        until response = writer.fetch_load_job(job_reference)
+          sleep @wait_job_interval
+        end
+
+        writer.commit_load_job(job_reference.chunk_id_hex, response)
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      def try_write(chunk)
+        job_reference = do_write(chunk)
+        @polling_mutex.synchronize do
+          @polling_targets << job_reference
+        end
+      rescue Fluent::BigQuery::Error => e
+        raise if e.retryable?
+
+        @retry_mutex.synchronize do
+          if @secondary
+            # TODO: find better way
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              secondary: true, secondary_threshold: Float::EPSILON,
+              randomize: @buffer_config.retry_randomize
+            )
+          else
+            @retry = retry_state_create(
+              :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+              forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+              max_interval: @buffer_config.retry_max_interval,
+              randomize: @buffer_config.retry_randomize
+            )
+          end
+        end
+
+        raise
+      end
+
+      private
+
+      def do_write(chunk)
+        table_format = @tables_mutex.synchronize do
+          t = @tables_queue.shift
+          @tables_queue.push t
+          t
+        end
+
+        metadata = chunk.metadata
+        project = extract_placeholders(@project, metadata)
+        dataset = extract_placeholders(@dataset, metadata)
+        table_id = extract_placeholders(table_format, metadata)
+        schema = get_schema(project, dataset, metadata)
+
+        create_upload_source(chunk) do |upload_source|
+          writer.create_load_job(chunk.unique_id, dump_unique_id_hex(chunk.unique_id), project, dataset, table_id, upload_source, schema)
+        end
+      end
+
+      def poll
+        job_reference = @polling_mutex.synchronize do
+          @polling_targets.shift
+        end
+        return unless job_reference
+
+        begin
+          response = writer.fetch_load_job(job_reference)
+          if response
+            writer.commit_load_job(job_reference.chunk_id_hex, response)
+            commit_write(job_reference.chunk_id)
+            log.debug("commit chunk", chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            @polling_mutex.synchronize do
+              @polling_targets << job_reference
+            end
+          end
+        rescue Fluent::BigQuery::Error => e
+          # RetryableError comes only from `commit_load_job`.
+          # If the error is retryable, take the chunk back and let the next `try_flush` retry it.
+          # If the error is not retryable, create a custom retry_state, then take the chunk back for the next `try_flush`.
+          if e.retryable?
+            log.warn("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+          else
+            log.error("failed to poll load job", error: e, chunk: job_reference.chunk_id_hex, **job_reference.as_hash(:job_id, :project_id, :dataset_id, :table_id))
+            @retry_mutex.synchronize do
+              if @secondary
+                # TODO: find better way
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: @buffer_config.retry_max_times, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  secondary: true, secondary_threshold: Float::EPSILON,
+                  randomize: @buffer_config.retry_randomize
+                )
+              else
+                @retry = retry_state_create(
+                  :output_retries, @buffer_config.retry_type, @buffer_config.retry_wait, @buffer_config.retry_timeout,
+                  forever: false, max_steps: 0, backoff_base: @buffer_config.retry_exponential_backoff_base,
+                  max_interval: @buffer_config.retry_max_interval,
+                  randomize: @buffer_config.retry_randomize
+                )
+              end
+            end
+          end
+
+          rollback_write(job_reference.chunk_id)
+        rescue => e
+          log.error("unexpected error while polling", error: e)
+          log.error_backtrace
+          rollback_write(job_reference.chunk_id)
+        end
+      end
+
+      def create_upload_source(chunk)
+        chunk_is_file = @buffer_config["@type"] == 'file'
+        if chunk_is_file
+          File.open(chunk.path) do |file|
+            yield file
+          end
+        else
+          Tempfile.open("chunk-tmp") do |file|
+            file.binmode
+            chunk.write_to(file)
+            file.sync
+            file.rewind
+            yield file
+          end
+        end
+      end
+    end
+  end
+end
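
Similarly, a minimal sketch of a load-job configuration with a file buffer (the match pattern and paths are illustrative):

<match archive.**>
  @type bigquery_load
  auth_method json_key
  json_key /path/to/key.json
  project yourproject_id
  dataset yourdataset_id
  table access_log
  fetch_schema true
  source_format json
  <buffer>
    @type file
    path /var/log/fluentd/bigquery_load.*.buffer
    flush_interval 3600
  </buffer>
</match>

Because use_delayed_commit defaults to true, #try_write only submits the load job and the timer-driven #poll later commits or rolls back the chunk once fetch_load_job returns a result; setting use_delayed_commit to false falls back to the synchronous #write path, which sleeps wait_job_interval between polls.
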
@@ -0,0 +1,20 @@
+require 'bundler/setup'
+require 'test/unit'
+
+$LOAD_PATH.unshift(File.join(__dir__, '..', 'lib'))
+$LOAD_PATH.unshift(__dir__)
+require 'fluent/test'
+
+require 'fluent/plugin/buffer'
+require 'fluent/plugin/buf_memory'
+require 'fluent/plugin/buf_file'
+require 'fluent/test/driver/output'
+
+require 'fluent/plugin/out_bigquery_base'
+require 'fluent/plugin/out_bigquery_insert'
+require 'fluent/plugin/out_bigquery_load'
+require 'google/apis/bigquery_v2'
+require 'google/api_client/auth/key_utils'
+require 'googleauth'
+
+require 'test/unit/rr'
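
Assuming the gem's usual test layout, a single suite can then be run with something like `bundle exec ruby -Itest test/plugin/test_out_bigquery_base.rb`: this helper puts lib/ and the test directory on the load path, loads the three output plugins, and pulls in test/unit/rr for the stub/mock helpers used below.
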
@@ -0,0 +1,579 @@
+require 'helper'
+
+class BigQueryBaseOutputTest < Test::Unit::TestCase
+  def setup
+    Fluent::Test.setup
+  end
+
+  CONFIG = %[
+    table foo
+    email foo@bar.example
+    private_key_path /path/to/key
+    project yourproject_id
+    dataset yourdataset_id
+
+    <inject>
+    time_format %s
+    time_key time
+    </inject>
+
+    schema [
+      {"name": "time", "type": "INTEGER"},
+      {"name": "status", "type": "INTEGER"},
+      {"name": "bytes", "type": "INTEGER"},
+      {"name": "vhost", "type": "STRING"},
+      {"name": "path", "type": "STRING"},
+      {"name": "method", "type": "STRING"},
+      {"name": "protocol", "type": "STRING"},
+      {"name": "agent", "type": "STRING"},
+      {"name": "referer", "type": "STRING"},
+      {"name": "remote", "type": "RECORD", "fields": [
+        {"name": "host", "type": "STRING"},
+        {"name": "ip", "type": "STRING"},
+        {"name": "user", "type": "STRING"}
+      ]},
+      {"name": "requesttime", "type": "FLOAT"},
+      {"name": "bot_access", "type": "BOOLEAN"},
+      {"name": "loginsession", "type": "BOOLEAN"}
+    ]
+  ]
+
+  API_SCOPE = "https://www.googleapis.com/auth/bigquery"
+
+  def create_driver(conf = CONFIG)
+    Fluent::Test::Driver::Output.new(Fluent::Plugin::BigQueryBaseOutput).configure(conf)
+  end
+
+  def stub_writer(stub_auth: true)
+    stub.proxy(Fluent::BigQuery::Writer).new.with_any_args do |writer|
+      stub(writer).get_auth { nil } if stub_auth
+      yield writer
+      writer
+    end
+  end
+
+  private def sudo_schema_response
+    {
+      "schema" => {
+        "fields" => [
+          {
+            "name" => "time",
+            "type" => "TIMESTAMP",
+            "mode" => "REQUIRED"
+          },
+          {
+            "name" => "tty",
+            "type" => "STRING",
+            "mode" => "NULLABLE"
+          },
+          {
+            "name" => "pwd",
+            "type" => "STRING",
+            "mode" => "REQUIRED"
+          },
+          {
+            "name" => "user",
+            "type" => "STRING",
+            "mode" => "REQUIRED"
+          },
+          {
+            "name" => "argv",
+            "type" => "STRING",
+            "mode" => "REPEATED"
+          }
+        ]
+      }
+    }
+  end
+
+  def test_configure_table
+    driver = create_driver
+    assert_equal driver.instance.table, 'foo'
+    assert_nil driver.instance.tables
+
+    driver = create_driver(CONFIG.sub(/\btable\s+.*$/, 'tables foo,bar'))
+    assert_nil driver.instance.table
+    assert_equal driver.instance.tables, ['foo', 'bar']
+
+    assert_raise(Fluent::ConfigError, "'table' or 'tables' must be specified, and both are invalid") {
+      create_driver(CONFIG + "tables foo,bar")
+    }
+  end
+
+  def test_configure_auth_private_key
+    driver = create_driver
+    stub_writer(stub_auth: false) do |writer|
+      mock(writer).get_auth_from_private_key { stub! }
+    end
+    assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
+  end
+
+  def test_configure_auth_compute_engine
+    driver = create_driver(%[
+      table foo
+      auth_method compute_engine
+      project yourproject_id
+      dataset yourdataset_id
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
+    ])
+
+    stub_writer(stub_auth: false) do |writer|
+      mock(writer).get_auth_from_compute_engine { stub! }
+    end
+    assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
+  end
+
+  def test_configure_auth_json_key_as_file
+    driver = create_driver(%[
+      table foo
+      auth_method json_key
+      json_key jsonkey.josn
+      project yourproject_id
+      dataset yourdataset_id
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
+    ])
+
+    stub_writer(stub_auth: false) do |writer|
+      mock(writer).get_auth_from_json_key { stub! }
+    end
+    assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
+  end
+
+  def test_configure_auth_json_key_as_file_raise_permission_error
+    json_key_path = 'test/plugin/testdata/json_key.json'
+    json_key_path_dir = File.dirname(json_key_path)
+
+    begin
+      File.chmod(0000, json_key_path_dir)
+
+      driver = create_driver(%[
+        table foo
+        auth_method json_key
+        json_key #{json_key_path}
+        project yourproject_id
+        dataset yourdataset_id
+        schema [
+          {"name": "time", "type": "INTEGER"},
+          {"name": "status", "type": "INTEGER"},
+          {"name": "bytes", "type": "INTEGER"}
+        ]
+      ])
+      assert_raises(Errno::EACCES) do
+        driver.instance.writer.client
+      end
+    ensure
+      File.chmod(0755, json_key_path_dir)
+    end
+  end
+
+  def test_configure_auth_json_key_as_string
+    json_key = '{"private_key": "X", "client_email": "' + 'x' * 255 + '@developer.gserviceaccount.com"}'
+    json_key_io = StringIO.new(json_key)
+    authorization = Object.new
+    stub(Google::Auth::ServiceAccountCredentials).make_creds(json_key_io: satisfy {|arg| JSON.parse(arg.read) == JSON.parse(json_key_io.read) }, scope: API_SCOPE) { authorization }
+
+    driver = create_driver(%[
+      table foo
+      auth_method json_key
+      json_key #{json_key}
+      project yourproject_id
+      dataset yourdataset_id
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
+    ])
+    stub_writer(stub_auth: false) do |writer|
+      mock.proxy(writer).get_auth_from_json_key { stub! }
+    end
+    assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
+  end
+
+  def test_configure_auth_application_default
+    driver = create_driver(%[
+      table foo
+      auth_method application_default
+      project yourproject_id
+      dataset yourdataset_id
+      schema [
+        {"name": "time", "type": "INTEGER"},
+        {"name": "status", "type": "INTEGER"},
+        {"name": "bytes", "type": "INTEGER"}
+      ]
+    ])
+
+    stub_writer(stub_auth: false) do |writer|
+      mock.proxy(writer).get_auth_from_application_default { stub! }
+    end
+    assert driver.instance.writer.client.is_a?(Google::Apis::BigqueryV2::BigqueryService)
+  end
+
+  def test_format
+    now = Fluent::EventTime.new(Time.now.to_i)
+    input = {
+      "status" => "1",
+      "bytes" => 3.0,
+      "vhost" => :bar,
+      "path" => "/path/to/baz",
+      "method" => "GET",
+      "protocol" => "HTTP/0.9",
+      "agent" => "libwww",
+      "referer" => "http://referer.example",
+      "requesttime" => (now - 1).to_f.to_s,
+      "bot_access" => true,
+      "loginsession" => false,
+      "something-else" => "would be ignored",
+      "yet-another" => {
+        "foo" => "bar",
+        "baz" => 1,
+      },
+      "remote" => {
+        "host" => "remote.example",
+        "ip" => "192.0.2.1",
+        "port" => 12345,
+        "user" => "tagomoris",
+      }
+    }
+    expected = {
+      "time" => now.to_i,
+      "status" => 1,
+      "bytes" => 3,
+      "vhost" => "bar",
+      "path" => "/path/to/baz",
+      "method" => "GET",
+      "protocol" => "HTTP/0.9",
+      "agent" => "libwww",
+      "referer" => "http://referer.example",
+      "requesttime" => (now - 1).to_f.to_s.to_f,
+      "bot_access" => true,
+      "loginsession" => false,
+      "something-else" => "would be ignored",
+      "yet-another" => {
+        "foo" => "bar",
+        "baz" => 1,
+      },
+      "remote" => {
+        "host" => "remote.example",
+        "ip" => "192.0.2.1",
+        "port" => 12345,
+        "user" => "tagomoris",
+      }
+    }
+
+    driver = create_driver(CONFIG)
+    buf = nil
+    driver.run { buf = driver.instance.format("my.tag", now, input) }
+
+    assert_equal expected, MultiJson.load(buf)
+  end
+
+  [
+    # <time_format>, <time field type>, <time expectation generator>, <assertion>
+    [
+      "%s.%6N",
+      lambda{|t| t.strftime("%s.%6N").to_f },
+      lambda{|recv, expected, actual|
+        recv.assert_in_delta(expected, actual, Float::EPSILON / 10**3)
+      }
+    ],
+    [
+      "%Y-%m-%dT%H:%M:%S%:z",
+      lambda{|t| t.iso8601 },
+      :assert_equal.to_proc
+    ],
+  ].each do |format, expect_time, assert|
+    define_method("test_time_formats_#{format}") do
+      now = Fluent::Engine.now
+      input = {}
+      expected = { "time" => expect_time[Time.at(now.to_r)] }
+
+      driver = create_driver(<<-CONFIG)
+        table foo
+        email foo@bar.example
+        private_key_path /path/to/key
+        project yourproject_id
+        dataset yourdataset_id
+
+        <inject>
+        time_format #{format}
+        time_type string
+        time_key time
+        </inject>
+
+        schema [
+          {"name": "metadata", "type": "RECORD", "fields": [
+            {"name": "time", "type": "INTEGER"},
+            {"name": "node", "type": "STRING"}
+          ]},
+          {"name": "log", "type": "STRING"}
+        ]
+      CONFIG
+
+      buf = nil
+      driver.run { buf = driver.instance.format("my.tag", now, input) }
+
+      assert[self, expected["time"], MultiJson.load(buf)["time"]]
+    end
+  end
+
+  def test_format_with_schema
+    now = Fluent::EventTime.new(Time.now.to_i)
+    input = {
+      "request" => {
+        "vhost" => :bar,
+        "path" => "/path/to/baz",
+        "method" => "GET",
+        "protocol" => "HTTP/0.9",
+        "agent" => "libwww",
+        "referer" => "http://referer.example",
+        "time" => (now - 1).to_f,
+        "bot_access" => true,
+        "loginsession" => false,
+      },
+      "response" => {
+        "status" => "1",
+        "bytes" => 3.0,
+      },
+      "remote" => {
+        "host" => "remote.example",
+        "ip" => "192.0.2.1",
+        "port" => 12345,
+        "user" => "tagomoris",
+      },
+      "something-else" => "would be ignored",
+      "yet-another" => {
+        "foo" => "bar",
+        "baz" => 1,
+      },
+    }
+    expected = {
+      "time" => now.to_f,
+      "request" => {
+        "vhost" => "bar",
+        "path" => "/path/to/baz",
+        "method" => "GET",
+        "protocol" => "HTTP/0.9",
+        "agent" => "libwww",
+        "referer" => "http://referer.example",
+        "time" => (now - 1).to_f,
+        "bot_access" => true,
+        "loginsession" => false,
+      },
+      "remote" => {
+        "host" => "remote.example",
+        "ip" => "192.0.2.1",
+        "port" => 12345,
+        "user" => "tagomoris",
+      },
+      "response" => {
+        "status" => 1,
+        "bytes" => 3,
+      },
+      "something-else" => "would be ignored",
+      "yet-another" => {
+        "foo" => "bar",
+        "baz" => 1,
+      },
+    }
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      <inject>
+      time_format %s
+      time_key time
+      </inject>
+
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "apache.schema")}
+      schema [{"name": "time", "type": "INTEGER"}]
+    CONFIG
+
+    buf = nil
+    driver.run { buf = driver.instance.format("my.tag", now, input) }
+
+    assert_equal expected, MultiJson.load(buf)
+  end
+
+  def test_format_repeated_field_with_schema
+    now = Fluent::EventTime.new(Time.now.to_i)
+    input = {
+      "tty" => nil,
+      "pwd" => "/home/yugui",
+      "user" => "fluentd",
+      "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
+    }
+    expected = {
+      "time" => now.to_f,
+      "pwd" => "/home/yugui",
+      "user" => "fluentd",
+      "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
+    }
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      <inject>
+      time_format %s
+      time_key time
+      </inject>
+
+      schema_path #{File.join(File.dirname(__FILE__), "testdata", "sudo.schema")}
+      schema [{"name": "time", "type": "INTEGER"}]
+    CONFIG
+
+    buf = nil
+    driver.run { buf = driver.instance.format("my.tag", now, input) }
+
+    assert_equal expected, MultiJson.load(buf)
+  end
+
+  def test_format_fetch_from_bigquery_api
+    now = Fluent::EventTime.new(Time.now.to_i)
+    input = {
+      "tty" => nil,
+      "pwd" => "/home/yugui",
+      "user" => "fluentd",
+      "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
+    }
+    expected = {
+      "time" => now.to_i,
+      "pwd" => "/home/yugui",
+      "user" => "fluentd",
+      "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
+    }
+
+    driver = create_driver(<<-CONFIG)
+      table foo
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      <inject>
+      time_format %s
+      time_key time
+      </inject>
+
+      fetch_schema true
+      schema [{"name": "time", "type": "INTEGER"}]
+    CONFIG
+
+    stub_writer do |writer|
+      mock(writer).fetch_schema('yourproject_id', 'yourdataset_id', 'foo') do
+        sudo_schema_response["schema"]["fields"]
+      end
+    end
+
+    buf = nil
+    driver.run { buf = driver.instance.format("my.tag", now, input) }
+
+    assert_equal expected, MultiJson.load(buf)
+
+    table_schema = driver.instance.instance_eval{ @fetched_schemas['yourproject_id.yourdataset_id.foo'] }
+    assert table_schema["time"]
+    assert_equal :timestamp, table_schema["time"].type
+    assert_equal :required, table_schema["time"].mode
+
+    assert table_schema["tty"]
+    assert_equal :string, table_schema["tty"].type
+    assert_equal :nullable, table_schema["tty"].mode
+
+    assert table_schema["pwd"]
+    assert_equal :string, table_schema["pwd"].type
+    assert_equal :required, table_schema["pwd"].mode
+
+    assert table_schema["user"]
+    assert_equal :string, table_schema["user"].type
+    assert_equal :required, table_schema["user"].mode
+
+    assert table_schema["argv"]
+    assert_equal :string, table_schema["argv"].type
+    assert_equal :repeated, table_schema["argv"].mode
+  end
+
+  def test_format_fetch_from_bigquery_api_with_fetch_schema_table
+    now = Fluent::EventTime.new(Time.now.to_i)
+    input = {
+      "tty" => nil,
+      "pwd" => "/home/yugui",
+      "user" => "fluentd",
+      "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
+    }
+    expected = {
+      "time" => now.to_i,
+      "pwd" => "/home/yugui",
+      "user" => "fluentd",
+      "argv" => %w[ tail -f /var/log/fluentd/fluentd.log ]
+    }
+
+    driver = create_driver(<<-CONFIG)
+      table foo_%Y_%m_%d
+      email foo@bar.example
+      private_key_path /path/to/key
+      project yourproject_id
+      dataset yourdataset_id
+
+      <inject>
+      time_format %s
+      time_key time
+      </inject>
+
+      fetch_schema true
+      fetch_schema_table foo
+      schema [{"name": "time", "type": "INTEGER"}]
+
+      <buffer time>
+      timekey 1d
+      </buffer>
+    CONFIG
+
+    stub_writer do |writer|
+      mock(writer).fetch_schema('yourproject_id', 'yourdataset_id', 'foo') do
+        sudo_schema_response["schema"]["fields"]
+      end
+    end
+
+    buf = nil
+    driver.run { buf = driver.instance.format("my.tag", now, input) }
+
+    assert_equal expected, MultiJson.load(buf)
+
+    table_schema = driver.instance.instance_eval{ @fetched_schemas['yourproject_id.yourdataset_id.foo'] }
+    assert table_schema["time"]
+    assert_equal :timestamp, table_schema["time"].type
+    assert_equal :required, table_schema["time"].mode
+
+    assert table_schema["tty"]
+    assert_equal :string, table_schema["tty"].type
+    assert_equal :nullable, table_schema["tty"].mode
+
+    assert table_schema["pwd"]
+    assert_equal :string, table_schema["pwd"].type
+    assert_equal :required, table_schema["pwd"].mode
+
+    assert table_schema["user"]
+    assert_equal :string, table_schema["user"].type
+    assert_equal :required, table_schema["user"].mode
+
+    assert table_schema["argv"]
+    assert_equal :string, table_schema["argv"].type
+    assert_equal :repeated, table_schema["argv"].mode
+  end
+end