fluent-plugin-kusto 0.0.1.beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,423 @@
1
+ # frozen_string_literal: true
2
+
3
+ # KustoOutput is a Fluentd output plugin for ingesting logs into Azure Data Explorer (Kusto).
4
+ # Supports managed identity, AAD authentication, multi-worker, buffer, and delayed commit.
5
+ require 'fluent/plugin/output'
6
+ require_relative 'ingester'
7
+ require 'time'
8
+ require_relative 'kusto_error_handler'
9
+ require_relative 'kusto_query'
10
+ require_relative 'conffile'
11
+ require 'logger'
12
+ require 'json'
13
+ require 'stringio'
14
+ require 'zlib'
15
+ require 'set'
16
+
17
+ module Fluent
18
+ module Plugin
19
+ class KustoOutput < Output
20
+ # Register plugin and define configuration parameters
21
+ Fluent::Plugin.register_output('kusto', self)
22
+ helpers :compat_parameters, :inject
23
+
24
+ config_param :endpoint, :string, default: nil, secret: true
25
+ config_param :database_name, :string, default: nil
26
+ config_param :table_name, :string, default: nil
27
+ config_param :tenant_id, :string, default: nil
28
+ config_param :client_id, :string, default: nil
29
+ config_param :client_secret, :string, default: nil, secret: true
30
+ config_param :buffered, :bool, default: true
31
+ config_param :delayed, :bool, default: false
32
+ config_param :managed_identity_client_id, :string, default: nil, secret: true
33
+ config_param :azure_cloud, :string, default: 'AzureCloud'
34
+ config_param :compression_enabled, :bool, default: true
35
+ config_param :logger_path, :string, default: nil
36
+ config_param :auth_type, :string, default: 'aad',
37
+ desc: 'Authentication type to use for Kusto. Options: "aad", "user_managed_identity", "system_managed_identity", "workload_identity".'
38
+ config_param :workload_identity_client_id, :string, default: nil, secret: true,
39
+ desc: 'Client ID for workload identity authentication.'
40
+ config_param :workload_identity_tenant_id, :string, default: nil, secret: true,
41
+ desc: 'Tenant ID for workload identity authentication.'
42
+ config_param :workload_identity_token_file_path, :string, default: nil, secret: true,
43
+ desc: 'File path for workload identity token.'
44
+ config_param :deferred_commit_timeout, :integer, default: 30,
45
+ desc: 'Maximum time in seconds to wait for deferred commit verification before force committing.'
46
+
47
+ config_section :buffer do
48
+ config_set_default :chunk_keys, ['time']
49
+ config_set_default :timekey, (60 * 60 * 24)
50
+ end
51
+
52
# Fluentd hook: returning true tells the engine this output is safe to run
# under multiple worker processes simultaneously.
def multi_workers_ready?
  true
end
56
+
57
# Fluentd lifecycle hook: normalize legacy v0.12-style buffer parameters,
# run the base class configuration, then validate this plugin's settings.
#
# @param conf [Fluent::Config::Element] the plugin configuration element
# @raise [Fluent::ConfigError] when buffer/delayed/required-param checks fail
def configure(conf)
  compat_parameters_convert(conf, :buffer)
  super
  validate_buffer_config(conf)
  validate_delayed_config
  validate_required_params
end
65
+
66
# Fluentd lifecycle hook: build the Kusto output configuration, create the
# ingester and logger, and initialize per-run bookkeeping state.
def start
  super
  setup_outconfiguration
  setup_ingester_and_logger
  # Cache table/database from the configuration object for write/try_write.
  @table_name = @outconfiguration&.table_name
  @database_name = @outconfiguration&.database_name
  @shutdown_called = false   # flipped in #shutdown; deferred threads poll it
  @deferred_threads = []     # verification threads started by #try_write
  @plugin_start_time = Time.now
  @total_bytes_ingested = 0
end
78
+
79
# Serialize one event as a newline-terminated JSON line ready for ingestion.
def format(tag, time, record)
  json_line = format_record_json(
    extract_tag(record, tag),
    extract_timestamp(record, time),
    sanitize_record_for_json(record)
  )
  "#{json_line}\n"
end
86
+
87
# Pick the most specific tag available, in priority order: an explicit
# 'tag' field in the record, the Fluentd tag, 'host', 'user', the first
# IPv4-looking token in 'message', then a fixed fallback.
#
# Fix: the former `!record.is_a?(Hash) || record.nil?` guard carried a dead
# nil check — nil is never a Hash, so `record.nil?` could not be reached.
def extract_tag(record, tag)
  return tag unless record.is_a?(Hash)
  return record['tag'] if record['tag']
  return tag if tag
  return record['host'] if record['host']
  return record['user'] if record['user']
  return ::Regexp.last_match(1) if record['message'] && record['message'] =~ /(\d{1,3}(?:\.\d{1,3}){3})/

  'default_tag'
end
98
+
99
# Resolve the event timestamp: prefer a value stored under a time/date-like
# key, then the Fluentd event time (as UTC ISO-8601), then an Apache-style
# datetime found in a string value; '' when nothing matches.
def extract_timestamp(record, time)
  keyed = find_time_or_date_key(record)
  return keyed if keyed && !keyed.to_s.empty?
  return Time.at(time).utc.iso8601 if time

  find_timestamp_by_regex(record) || ''
end

# Return the value of the first key whose name contains 'time' or 'date'
# (case-insensitive); nil for non-hash input or when no key matches.
def find_time_or_date_key(record)
  return nil unless record.is_a?(Hash)

  hit = record.find { |key, _| key.to_s.downcase =~ /time|date/ }
  hit && hit[1]
end

# Scan string values for an Apache-log-style "[dd/Mon/yyyy:HH:MM:SS +zzzz]"
# datetime and return it (with trailing text); nil when absent.
def find_timestamp_by_regex(record)
  record.each_value do |value|
    next unless value.is_a?(String)

    m = value.match(%r{(\[\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} [+-]\d{4}\].*)})
    return m[1] if m
  end
  nil
end
130
+
131
# Build the JSON envelope for one event. Hash records have duplicated
# 'tag'/'time' keys stripped; non-hash records are passed through, with
# nil/false replaced by an empty hash.
def format_record_json(tag_val, timestamp, safe_record)
  payload =
    if safe_record.is_a?(Hash)
      safe_record.reject { |key, _| key == 'tag' || key == 'time' }
    else
      safe_record || {}
    end
  JSON.generate('tag' => tag_val, 'timestamp' => timestamp, 'record' => payload)
end
142
+
143
# Hex-encode a binary chunk unique_id for use in blob names and logs.
# nil maps to 'noid'; objects without unpack1 fall back to to_s.
def dump_unique_id_hex(unique_id)
  if unique_id.nil?
    'noid'
  elsif unique_id.respond_to?(:unpack1)
    unique_id.unpack1('H*')
  else
    unique_id.to_s
  end
end
150
+
151
# Gzip-compress a payload before upload.
#
# Uses the stdlib one-shot Zlib.gzip (Ruby 2.4+) instead of the manual
# StringIO + GzipWriter sequence — same gzip container, fewer moving parts
# and no writer left open on an exception between new and close.
#
# @param data [String] raw bytes to compress
# @return [String] gzip-compressed bytes
def compress_data(data)
  Zlib.gzip(data)
end
159
+
160
# Non-buffered path: upload each event individually, skipping (and logging)
# events that fail instead of aborting the whole stream.
#
# Fix: previously the payload was uploaded UNCOMPRESSED and named '.json'
# even when compression_enabled was true, while the true flag was still
# forwarded to the ingester — inconsistent with #write/#try_write, which
# gzip the payload and use '.json.gz'. Now the payload, extension, and flag
# agree.
def process(tag, es)
  es.each do |time, record|
    formatted = format(tag, time, record).encode('UTF-8', invalid: :replace, undef: :replace, replace: '_')
    safe_tag = tag.to_s.encode('UTF-8', invalid: :replace, undef: :replace, replace: '_').gsub(/[^0-9A-Za-z.-]/,
                                                                                              '_')
    ext = compression_enabled ? '.json.gz' : '.json'
    blob_name = "fluentd_event_#{safe_tag}#{ext}"
    payload = compression_enabled ? compress_data(formatted) : formatted
    @ingester.upload_data_to_blob_and_queue(payload, blob_name, @database_name, @table_name,
                                            compression_enabled)
  rescue StandardError => e
    @logger&.error("Failed to ingest event to Kusto: #{e}\nEvent skipped: #{record.inspect}\n#{e.backtrace.join("\n")}")
    next
  end
end
173
+
174
# Buffered path: ship one whole chunk to Kusto as a single blob, gzipped
# when compression is enabled. Upload failures are routed through the
# shared Kusto error handler.
def write(chunk)
  worker_id = Fluent::Engine.worker_id
  payload = chunk.read
  tag = extract_tag_from_metadata(chunk.metadata)
  safe_tag = tag.to_s
               .encode('UTF-8', invalid: :replace, undef: :replace, replace: '_')
               .gsub(/[^0-9A-Za-z.-]/, '_')
  unique_id = chunk.unique_id
  suffix = compression_enabled ? '.json.gz' : '.json'
  blob_name = "fluentd_event_worker#{worker_id}_#{safe_tag}_#{dump_unique_id_hex(unique_id)}#{suffix}"
  body = compression_enabled ? compress_data(payload) : payload
  begin
    @ingester.upload_data_to_blob_and_queue(body, blob_name, @database_name, @table_name,
                                            compression_enabled)
  rescue StandardError => e
    handle_kusto_error(e, unique_id)
  end
end
192
+
193
# Read the tag from chunk metadata; 'default_tag' when metadata is nil,
# has no tag accessor, or the tag itself is nil.
def extract_tag_from_metadata(metadata)
  if metadata.nil? || !metadata.respond_to?(:tag)
    'default_tag'
  else
    metadata.tag || 'default_tag'
  end
end
200
+
201
# Delegate error logging/classification to KustoErrorHandler, tagging the
# message with the failed chunk's hex id.
def handle_kusto_error(e, unique_id)
  KustoErrorHandler.handle_kusto_error(@logger, e, dump_unique_id_hex(unique_id))
end
205
+
206
# Delayed-commit path: tag every JSON line in the chunk with its chunk_id
# (so server-side ingestion can be verified), upload the blob, then either
# commit immediately (shutdown in progress, or delayed=false) or hand off
# to a background verification thread that commits once the rows appear in
# Kusto.
def try_write(chunk)
  @deferred_threads ||= []
  tag = extract_tag_from_metadata(chunk.metadata)
  safe_tag = tag.to_s.encode('UTF-8', invalid: :replace, undef: :replace, replace: '_').gsub(/[^0-9A-Za-z.-]/,
                                                                                            '_')
  chunk_id = dump_unique_id_hex(chunk.unique_id)
  ext = compression_enabled ? '.json.gz' : '.json'
  blob_name = "fluentd_event_#{safe_tag}_#{chunk_id}#{ext}"
  raw_data = chunk.read || ''
  # Inject chunk_id into each record so check_data_on_server can count the
  # rows belonging to this chunk. Unparseable lines are kept verbatim.
  records = raw_data.split("\n").map do |line|
    rec = JSON.parse(line)
    rec['record']['chunk_id'] = chunk_id if rec.is_a?(Hash) && rec['record'].is_a?(Hash)
    rec.to_json
  rescue StandardError
    line
  end
  updated_raw_data = records.join("\n")
  # NOTE(review): row_count includes unparseable lines too — verification
  # compares against this total.
  row_count = records.size
  data_to_upload = compression_enabled ? compress_data(updated_raw_data) : updated_raw_data
  begin
    @ingester.upload_data_to_blob_and_queue(data_to_upload, blob_name, @database_name, @table_name,
                                            compression_enabled)
    if @shutdown_called || !@delayed
      # No verification thread: commit right away so the buffer can drain.
      commit_write(chunk.unique_id)
      if @shutdown_called
        @logger&.info("Immediate commit for chunk_id=#{chunk_id} due to shutdown")
      else
        @logger&.info("Immediate commit for chunk_id=#{chunk_id} (delayed=false)")
      end
    else
      # Deferred commit: the thread commits once ingestion is confirmed.
      thread = start_deferred_commit_thread(chunk_id, chunk, row_count)
      @deferred_threads << thread if thread
    end
  rescue StandardError => e
    KustoErrorHandler.handle_try_write_error(@logger, e, chunk_id)
  end
end
243
+
244
# Spawn a background thread that polls Kusto until the chunk's rows are
# visible server-side, then commits the chunk. To avoid wedging the buffer,
# the chunk is force-committed after @deferred_commit_timeout seconds or on
# any error in the verification loop. Returns the Thread, or nil when
# shutdown has already begun.
def start_deferred_commit_thread(chunk_id, chunk, row_count)
  return nil if @shutdown_called

  Thread.new do
    max_wait_time = @deferred_commit_timeout # Maximum wait time in seconds
    check_interval = 1 # Check every 1 second
    attempts = 0
    max_attempts = max_wait_time / check_interval

    loop do
      # Shutdown wins: #shutdown handles/kills remaining threads itself.
      break if @shutdown_called

      attempts += 1

      if check_data_on_server(chunk_id, row_count)
        commit_write(chunk.unique_id)
        @logger&.info("Successfully committed chunk_id=#{chunk_id} after #{attempts} attempts")
        break
      end

      # If we've exceeded max attempts, commit anyway to avoid hanging
      if attempts >= max_attempts
        commit_write(chunk.unique_id)
        @logger&.warn("Force committing chunk_id=#{chunk_id} after #{max_wait_time}s timeout (#{attempts} verification attempts)")
        break
      end

      sleep check_interval
    end
  rescue StandardError => e
    @logger&.error("Error in deferred commit thread for chunk_id=#{chunk_id}: #{e}")
    # Ensure chunk is committed even on error to avoid hanging
    begin
      commit_write(chunk.unique_id)
      @logger&.warn("Force committed chunk_id=#{chunk_id} due to error in verification thread")
    rescue StandardError => commit_error
      @logger&.error("Failed to commit chunk_id=#{chunk_id} after thread error: #{commit_error}")
    end
  end
end
285
+
286
# Ask Kusto how many rows carrying this chunk_id have landed in the table
# and report whether the count equals the expected row_count. Any query
# failure or unexpected result shape is logged and treated as "not yet
# ingested" (false).
#
# Fix: the original had a duplicated `@logger.respond_to?(:error)` guard —
# an `elsif` on the condition immediately re-checked by an identical inner
# `if`. Flattened to a single guard; behavior is unchanged (a nil @logger
# still logs nothing). Also uses a method-level rescue instead of an
# explicit begin/end wrapping the whole body.
def check_data_on_server(chunk_id, row_count)
  # Sanitize identifiers interpolated into KQL to prevent query injection.
  safe_table_name = @table_name.to_s.gsub(/[^a-zA-Z0-9_]/, '')
  safe_chunk_id = chunk_id.to_s.gsub(/[^a-zA-Z0-9_-]/, '')
  query = "#{safe_table_name} | extend record_dynamic = parse_json(record) | where record_dynamic.chunk_id == '#{safe_chunk_id}' | count"
  result = run_kusto_api_query(query, @outconfiguration.kusto_endpoint, @ingester.token_provider,
                               use_ingest_endpoint: false, database_name: @database_name)
  if result.is_a?(Array) && result[0].is_a?(Array)
    return result[0][0].to_i == row_count
  end

  @logger.error("Kusto query failed or returned unexpected result: #{result.inspect}") if @logger.respond_to?(:error)
  false
rescue StandardError => e
  @logger.error("Failed to get chunk_id count: #{e}") if @logger.respond_to?(:error)
  false
end
308
+
309
# Fluentd lifecycle hook: stop the plugin. Deferred verification threads
# are given up to 10 seconds to finish (they observe @shutdown_called and
# exit their loops); stragglers are killed so shutdown cannot hang. Finally
# the ingester is shut down and the base class teardown runs.
def shutdown
  # Signal all deferred commit threads to stop polling.
  @shutdown_called = true

  # Give deferred threads a chance to finish gracefully
  if @deferred_threads&.any?
    @logger&.info("Shutting down with #{@deferred_threads.size} active deferred commit threads")

    # Wait up to 10 seconds for threads to complete naturally
    deadline = Time.now + 10

    while Time.now < deadline && @deferred_threads.any?(&:alive?)
      alive_count = @deferred_threads.count(&:alive?)
      @logger&.debug("Waiting for #{alive_count} deferred threads to complete...")
      sleep 0.5
    end

    # Force kill any remaining threads
    @deferred_threads.each do |t|
      if t.alive?
        t.kill
        @logger&.info('delayed commit for buffer chunks was cancelled in shutdown chunk_id=unknown')
      end
    end

    @deferred_threads.clear
  end

  @ingester.shutdown if @ingester.respond_to?(:shutdown)
  super
end
340
+
341
+ private
342
+
343
# A <buffer> section only makes sense when buffered mode is enabled;
# reject the configuration otherwise.
def validate_buffer_config(conf)
  return if @buffered
  return if conf.elements('buffer').empty?

  raise Fluent::ConfigError, 'Buffer section present but buffered is false'
end
349
+
350
# Delayed commit requires buffered mode; reject delayed=true otherwise.
def validate_delayed_config
  return if @buffered
  return unless @delayed

  raise Fluent::ConfigError,
        'Delayed commit is only supported in buffered mode (buffered must be true if delayed is true)'
end
357
+
358
# Fail fast when any mandatory connection parameter is absent or blank.
def validate_required_params
  missing = %w[endpoint database_name table_name].select do |name|
    value = send(name)
    value.nil? || value.strip.empty?
  end
  raise Fluent::ConfigError, "Missing required parameters: #{missing.join(', ')}" unless missing.empty?
end
369
+
370
# Fluentd hook: choose buffered vs. non-buffered emit based on the
# `buffered` config parameter.
def prefer_buffered_processing
  @buffered
end
373
+
374
# Fluentd hook: choose try_write (delayed commit) vs. write based on the
# `delayed` config parameter.
def prefer_delayed_commit
  @delayed
end
377
+
378
# Build the OutputConfiguration object from the plugin's config_params.
# It carries endpoint/auth/logging settings for the Ingester and (per
# #setup_ingester_and_logger) exposes the shared logger.
def setup_outconfiguration
  @outconfiguration = OutputConfiguration.new(
    client_app_id: client_id,
    client_app_secret: client_secret,
    tenant_id: tenant_id,
    kusto_endpoint: endpoint,
    database_name: database_name,
    table_name: table_name,
    azure_cloud: azure_cloud,
    managed_identity_client_id: managed_identity_client_id,
    logger_path: logger_path,
    auth_type: auth_type,
    workload_identity_client_id: workload_identity_client_id,
    workload_identity_tenant_id: workload_identity_tenant_id,
    workload_identity_token_file_path: workload_identity_token_file_path
  )
end
396
+
397
# Create the Ingester from the output configuration and adopt the
# configuration's logger as the plugin-wide logger.
def setup_ingester_and_logger
  @ingester = Ingester.new(@outconfiguration)
  @logger = @outconfiguration.logger
end
402
+
403
# Recursively convert a record into JSON-safe structures: hash keys are
# stringified, arrays are walked element-wise, scalars pass through.
# Raises on genuinely circular structures.
#
# Fix: object ids were only ever ADDED to `seen`, so a record containing
# the same hash/array twice as siblings (a shared reference — a DAG, not a
# cycle) was wrongly rejected as "Circular reference". An id is now removed
# once its subtree is fully processed, so only true cycles (an object
# nested inside itself) raise.
#
# @param obj  the record (Hash/Array/scalar)
# @param seen [Set] object_ids on the current descent path
# @raise [RuntimeError] when a circular reference is detected
def sanitize_record_for_json(obj, seen = Set.new)
  return obj unless obj.is_a?(Hash) || obj.is_a?(Array)
  raise 'Circular reference detected in record' if seen.include?(obj.object_id)

  seen.add(obj.object_id)
  begin
    obj.is_a?(Hash) ? sanitize_hash(obj, seen) : sanitize_array(obj, seen)
  ensure
    # Leaving this subtree: allow the same object to appear again elsewhere.
    seen.delete(obj.object_id)
  end
end

# Sanitize each hash entry, stringifying its key.
def sanitize_hash(obj, seen)
  obj.each_with_object({}) do |(k, v), h|
    h[k.to_s] = sanitize_record_for_json(v, seen)
  end
end

# Sanitize each array element in order.
def sanitize_array(obj, seen)
  obj.map { |v| sanitize_record_for_json(v, seen) }
end
421
+ end
422
+ end
423
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'test-unit'
4
+ require 'fluent/test'
5
+ require 'fluent/test/driver/output'
6
+ require 'fluent/test/helpers'
7
+
8
+ Test::Unit::TestCase.include(Fluent::Test::Helpers)
9
+ Test::Unit::TestCase.extend(Fluent::Test::Helpers)
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'test/unit'
4
+ require_relative '../../lib/fluent/plugin/auth/azcli_tokenprovider'
5
+
6
require 'logger'

# Minimal stand-in for the plugin's OutputConfiguration: exposes only the
# kusto_endpoint and a logger, which is all AzCliTokenProvider reads.
class DummyConfig
  attr_reader :kusto_endpoint

  def initialize(resource)
    @kusto_endpoint = resource
  end

  # Fix: previously a brand-new Logger (and a require) ran on EVERY call.
  # Memoize so repeated calls share one stdout logger.
  def logger
    @logger ||= Logger.new($stdout)
  end
end
18
+
19
# Integration test for AzCliTokenProvider: acquires a real token via a
# locally installed Azure CLI. The test is skipped (omitted) when the CLI
# is not available, so it is safe in environments without `az`.
class AzCliTokenProviderIntegrationTest < Test::Unit::TestCase
  def setup
    # Resource (cluster URL) the token is requested for.
    @resource = 'https://kusto.kusto.windows.net'
    @provider = AzCliTokenProvider.new(DummyConfig.new(@resource))
  end

  def test_get_token_integration
    begin
      # Private probe: raises RuntimeError when the az binary is missing.
      @provider.send(:locate_azure_cli)
    rescue RuntimeError
      omit('Azure CLI not installed, skipping integration test.')
    end

    token = @provider.get_token
    assert_not_nil(token, 'Token should not be nil')
    assert_kind_of(String, token)
    assert(token.length.positive?, 'Token should not be empty')
  end
end