fluent-plugin-kusto 0.0.2.beta → 0.0.3.beta
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/fluent/plugin/auth/aad_tokenprovider.rb +2 -3
- data/lib/fluent/plugin/auth/mi_tokenprovider.rb +1 -1
- data/lib/fluent/plugin/auth/tokenprovider_base.rb +259 -10
- data/lib/fluent/plugin/auth/wif_tokenprovider.rb +18 -3
- data/lib/fluent/plugin/client.rb +82 -1
- data/lib/fluent/plugin/ingester.rb +22 -8
- data/lib/fluent/plugin/kusto_constants.rb +57 -0
- data/lib/fluent/plugin/kusto_query.rb +8 -1
- data/lib/fluent/plugin/kusto_version.rb +9 -0
- data/test/plugin/test_e2e_kusto.rb +289 -202
- data/test/plugin/test_mi_tokenprovider.rb +10 -0
- data/test/plugin/test_wif_tokenprovider.rb +9 -0
- metadata +5 -5
- data/test/plugin/e2e_kusto.rb +0 -862
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2f46e06c2f84df5ddae86b8bffc55bbd0e7a92ad636d4c02512b7c9fa909e563
|
4
|
+
data.tar.gz: ef350fc500e82cbf80a0b91c1247463b8a4a00487324bbbc39f04a4017436d90
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 00634a1a008a0dcb07929181b4946ad42b5d0d2d8b4aa1099498f13f72fd4b5479cb64f347fb065937d930870ca933dd6320cac8c096d0e689415dcd79564b5b
|
7
|
+
data.tar.gz: db35058e072ed1bdabce57439db87c161a026a4654bc61afd40f9a87f305c75a146017fbbdf91f3c71e7017a24b80ffed48eb061167c44361e3792615baa6b84
|
@@ -52,7 +52,7 @@ class AadTokenProvider < AbstractTokenProvider
|
|
52
52
|
|
53
53
|
def post_token_request
|
54
54
|
headers = header
|
55
|
-
max_retries = 10
|
55
|
+
max_retries = 3 # Reduced from 10 to prevent rate limiting cascade
|
56
56
|
retries = 0
|
57
57
|
uri = URI.parse(@token_request_uri)
|
58
58
|
form_data = URI.encode_www_form(
|
@@ -63,8 +63,7 @@ class AadTokenProvider < AbstractTokenProvider
|
|
63
63
|
)
|
64
64
|
while retries < max_retries
|
65
65
|
begin
|
66
|
-
http =
|
67
|
-
http.use_ssl = (uri.scheme == 'https')
|
66
|
+
http = create_http_client(uri)
|
68
67
|
request = Net::HTTP::Post.new(uri.request_uri, headers)
|
69
68
|
request.body = form_data
|
70
69
|
|
@@ -73,7 +73,7 @@ class ManagedIdentityTokenProvider < AbstractTokenProvider
|
|
73
73
|
uri = URI.parse(@token_acquire_url)
|
74
74
|
while retries < max_retries
|
75
75
|
begin
|
76
|
-
http =
|
76
|
+
http = create_http_client(uri)
|
77
77
|
request = Net::HTTP::Get.new(uri.request_uri, headers)
|
78
78
|
response = http.request(request)
|
79
79
|
return JSON.parse(response.body) if response.code.to_i == 200
|
@@ -1,13 +1,46 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
require 'logger'
|
4
|
+
require 'fluent/plugin/kusto_constants'
|
4
5
|
|
5
6
|
# AbstractTokenProvider defines the interface and shared logic for all token providers.
|
7
|
+
# Enhanced with retry logic and better token expiry management to prevent timeout issues.
|
6
8
|
class AbstractTokenProvider
|
7
9
|
def initialize(outconfiguration)
|
8
10
|
@logger = setup_logger(outconfiguration)
|
9
11
|
setup_config(outconfiguration)
|
10
|
-
@token_state = {
|
12
|
+
@token_state = {
|
13
|
+
access_token: nil,
|
14
|
+
expiry_time: nil,
|
15
|
+
token_details_mutex: Mutex.new,
|
16
|
+
refresh_in_progress: false,
|
17
|
+
consecutive_failures: 0,
|
18
|
+
last_failure_time: nil,
|
19
|
+
creation_time: Time.now,
|
20
|
+
refresh_count: 0,
|
21
|
+
last_successful_refresh: nil
|
22
|
+
}
|
23
|
+
|
24
|
+
# Simplified retry configuration using constants
|
25
|
+
@retry_config = {
|
26
|
+
max_retries: KustoConstants::Authentication::DEFAULT_MAX_RETRIES,
|
27
|
+
base_delay: KustoConstants::Authentication::DEFAULT_BASE_DELAY,
|
28
|
+
backoff_multiplier: KustoConstants::Authentication::DEFAULT_BACKOFF_MULTIPLIER,
|
29
|
+
max_delay: KustoConstants::Authentication::DEFAULT_MAX_DELAY
|
30
|
+
}
|
31
|
+
|
32
|
+
# Minimal health configuration for 12-hour reset
|
33
|
+
@health_config = {
|
34
|
+
max_token_age: KustoConstants::HealthCheck::MAX_COMPONENT_AGE_SECONDS,
|
35
|
+
max_refresh_cycles: KustoConstants::HealthCheck::MAX_REFRESH_CYCLES
|
36
|
+
}
|
37
|
+
|
38
|
+
# HTTP timeout configuration - consistent across all token providers
|
39
|
+
@http_config = {
|
40
|
+
open_timeout: KustoConstants::Authentication::HTTP_OPEN_TIMEOUT,
|
41
|
+
read_timeout: KustoConstants::Authentication::HTTP_READ_TIMEOUT,
|
42
|
+
write_timeout: KustoConstants::Authentication::HTTP_WRITE_TIMEOUT
|
43
|
+
}
|
11
44
|
end
|
12
45
|
|
13
46
|
# Abstract method: must be implemented by subclasses to fetch a new token.
|
@@ -15,18 +48,65 @@ class AbstractTokenProvider
|
|
15
48
|
raise NotImplementedError, 'Subclasses must implement fetch_token'
|
16
49
|
end
|
17
50
|
|
18
|
-
# Public method to get a valid token, refreshing if needed.
|
51
|
+
# Public method to get a valid token, refreshing if needed with enhanced retry logic.
|
19
52
|
def get_token
|
20
53
|
@token_state[:token_details_mutex].synchronize do
|
21
54
|
if saved_token_need_refresh?
|
55
|
+
if @token_state[:refresh_in_progress]
|
56
|
+
@logger.debug("Token refresh already in progress, waiting...")
|
57
|
+
return wait_for_refresh_completion
|
58
|
+
end
|
59
|
+
|
22
60
|
@logger.info("Refreshing token. Previous expiry: #{@token_state[:expiry_time]}")
|
23
|
-
|
61
|
+
refresh_saved_token_with_retry
|
24
62
|
@logger.info("New token expiry: #{@token_state[:expiry_time]}")
|
63
|
+
else
|
64
|
+
@logger.debug("Reusing existing token (expires at #{@token_state[:expiry_time]})")
|
25
65
|
end
|
26
66
|
@token_state[:access_token]
|
27
67
|
end
|
28
68
|
end
|
29
69
|
|
70
|
+
# Health check method - returns health status as hash
|
71
|
+
# Note: This method should be called from within a synchronized context
|
72
|
+
def health_status
|
73
|
+
{
|
74
|
+
token_valid: !saved_token_need_refresh?,
|
75
|
+
token_expires_at: @token_state[:expiry_time],
|
76
|
+
consecutive_failures: @token_state[:consecutive_failures],
|
77
|
+
last_failure_time: @token_state[:last_failure_time],
|
78
|
+
refresh_in_progress: @token_state[:refresh_in_progress],
|
79
|
+
refresh_count: @token_state[:refresh_count],
|
80
|
+
last_successful_refresh: @token_state[:last_successful_refresh],
|
81
|
+
token_age_hours: @token_state[:creation_time] ? (Time.now - @token_state[:creation_time]) / 3600 : 0
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
# Thread-safe wrapper for health_status when called externally
|
86
|
+
def get_health_status
|
87
|
+
@token_state[:token_details_mutex].synchronize do
|
88
|
+
health_status
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
# Log health status for operational visibility
|
93
|
+
def log_health_status(context = "")
|
94
|
+
status = health_status
|
95
|
+
context_prefix = context.empty? ? "" : "#{context}: "
|
96
|
+
|
97
|
+
@logger.info("#{context_prefix}Token provider health - " \
|
98
|
+
"valid: #{status[:token_valid]}, " \
|
99
|
+
"expires_at: #{status[:token_expires_at]}, " \
|
100
|
+
"failures: #{status[:consecutive_failures]}, " \
|
101
|
+
"refresh_count: #{status[:refresh_count]}, " \
|
102
|
+
"age_hours: #{status[:token_age_hours].round(1)}")
|
103
|
+
|
104
|
+
if status[:consecutive_failures] > 0
|
105
|
+
@logger.warn("#{context_prefix}Token provider has #{status[:consecutive_failures]} consecutive failures, " \
|
106
|
+
"last failure: #{status[:last_failure_time]}")
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
30
110
|
private
|
31
111
|
|
32
112
|
def setup_logger(outconfiguration)
|
@@ -38,20 +118,189 @@ class AbstractTokenProvider
|
|
38
118
|
end
|
39
119
|
|
40
120
|
def saved_token_need_refresh?
|
41
|
-
@token_state[:access_token].nil? || @token_state[:expiry_time].nil?
|
121
|
+
return true if @token_state[:access_token].nil? || @token_state[:expiry_time].nil?
|
122
|
+
|
123
|
+
# Check for long-running pod health issues
|
124
|
+
if long_running_pod_health_check_needed?
|
125
|
+
@logger.warn("Long-running pod health issue detected, forcing token refresh")
|
126
|
+
return true
|
127
|
+
end
|
128
|
+
|
129
|
+
# Use token expiry buffer from constants to prevent race conditions
|
130
|
+
@token_state[:expiry_time] <= (Time.now + KustoConstants::Authentication::TOKEN_EXPIRY_BUFFER_SECONDS)
|
131
|
+
end
|
132
|
+
|
133
|
+
def long_running_pod_health_check_needed?
|
134
|
+
current_time = Time.now
|
135
|
+
|
136
|
+
# Check if token is too old (12+ hours) - force refresh to prevent staleness
|
137
|
+
if @token_state[:creation_time] &&
|
138
|
+
(current_time - @token_state[:creation_time]) > @health_config[:max_token_age]
|
139
|
+
@logger.warn("Token provider is #{(current_time - @token_state[:creation_time]) / 3600} hours old, forcing refresh")
|
140
|
+
reset_token_state_for_long_running_pod
|
141
|
+
return true
|
142
|
+
end
|
143
|
+
|
144
|
+
# Check if too many refresh cycles (potential state corruption)
|
145
|
+
if @token_state[:refresh_count] > @health_config[:max_refresh_cycles]
|
146
|
+
@logger.warn("Token provider has #{@token_state[:refresh_count]} refresh cycles, resetting state")
|
147
|
+
reset_token_state_for_long_running_pod
|
148
|
+
return true
|
149
|
+
end
|
150
|
+
|
151
|
+
# Check if last successful refresh was too long ago
|
152
|
+
if @token_state[:last_successful_refresh] &&
|
153
|
+
(current_time - @token_state[:last_successful_refresh]) > (@health_config[:max_token_age] / 2)
|
154
|
+
@logger.warn("No successful refresh for #{(current_time - @token_state[:last_successful_refresh]) / 3600} hours")
|
155
|
+
return true
|
156
|
+
end
|
157
|
+
|
158
|
+
false
|
42
159
|
end
|
43
160
|
|
44
|
-
def
|
45
|
-
|
46
|
-
@
|
47
|
-
|
161
|
+
def reset_token_state_for_long_running_pod
|
162
|
+
log_health_status("Before reset")
|
163
|
+
@logger.info("Resetting token state for long-running pod health")
|
164
|
+
|
165
|
+
@token_state[:access_token] = nil
|
166
|
+
@token_state[:expiry_time] = nil
|
167
|
+
@token_state[:consecutive_failures] = 0
|
168
|
+
@token_state[:last_failure_time] = nil
|
169
|
+
@token_state[:creation_time] = Time.now
|
170
|
+
@token_state[:refresh_count] = 0
|
171
|
+
@token_state[:last_successful_refresh] = nil
|
172
|
+
|
173
|
+
log_health_status("After reset")
|
174
|
+
end
|
175
|
+
|
176
|
+
def wait_for_refresh_completion
|
177
|
+
# Wait for ongoing refresh to complete (max 30 seconds)
|
178
|
+
max_wait = 30
|
179
|
+
start_time = Time.now
|
180
|
+
|
181
|
+
while @token_state[:refresh_in_progress] && (Time.now - start_time) < max_wait
|
182
|
+
sleep(0.5)
|
183
|
+
end
|
184
|
+
|
185
|
+
# Return token if refresh completed successfully
|
186
|
+
return @token_state[:access_token] if @token_state[:access_token] && !saved_token_need_refresh?
|
187
|
+
|
188
|
+
# If still no valid token, attempt refresh ourselves
|
189
|
+
@token_state[:refresh_in_progress] = false
|
190
|
+
refresh_saved_token_with_retry
|
191
|
+
@token_state[:access_token]
|
192
|
+
end
|
193
|
+
|
194
|
+
def refresh_saved_token_with_retry
|
195
|
+
@token_state[:refresh_in_progress] = true
|
196
|
+
|
197
|
+
begin
|
198
|
+
token_response = fetch_token_with_retry
|
199
|
+
@token_state[:access_token] = token_response[:access_token]
|
200
|
+
@token_state[:expiry_time] = get_token_expiry_time(token_response[:expires_in])
|
201
|
+
@token_state[:consecutive_failures] = 0
|
202
|
+
@token_state[:last_failure_time] = nil
|
203
|
+
@token_state[:refresh_count] += 1
|
204
|
+
@token_state[:last_successful_refresh] = Time.now
|
205
|
+
|
206
|
+
@logger.info("Token refresh successful (cycle #{@token_state[:refresh_count]})")
|
207
|
+
|
208
|
+
# Log health status after successful refresh for operational visibility
|
209
|
+
log_health_status("After successful refresh")
|
210
|
+
ensure
|
211
|
+
@token_state[:refresh_in_progress] = false
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
def fetch_token_with_retry
|
216
|
+
attempt = 0
|
217
|
+
last_exception = nil
|
218
|
+
|
219
|
+
while attempt < @retry_config[:max_retries]
|
220
|
+
attempt += 1
|
221
|
+
|
222
|
+
begin
|
223
|
+
@logger.info("Attempting token fetch (attempt #{attempt}/#{@retry_config[:max_retries]})")
|
224
|
+
return fetch_token
|
225
|
+
|
226
|
+
rescue StandardError => e
|
227
|
+
last_exception = e
|
228
|
+
@logger.warn("Token fetch attempt #{attempt} failed: #{e.message}")
|
229
|
+
|
230
|
+
# Don't retry on permanent errors
|
231
|
+
if permanent_error?(e)
|
232
|
+
@logger.error("Permanent error detected, not retrying: #{e.message}")
|
233
|
+
record_failure(e)
|
234
|
+
raise e
|
235
|
+
end
|
236
|
+
|
237
|
+
# Calculate delay with exponential backoff
|
238
|
+
if attempt < @retry_config[:max_retries]
|
239
|
+
delay = calculate_retry_delay(attempt)
|
240
|
+
@logger.info("Retrying in #{delay} seconds...")
|
241
|
+
sleep(delay)
|
242
|
+
end
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
246
|
+
# All retries exhausted
|
247
|
+
record_failure(last_exception)
|
248
|
+
raise last_exception || StandardError.new("Token fetch failed after #{@retry_config[:max_retries]} attempts")
|
249
|
+
end
|
250
|
+
|
251
|
+
def calculate_retry_delay(attempt)
|
252
|
+
# Exponential backoff: base_delay * backoff_multiplier^(attempt-1)
|
253
|
+
# Example: 1s, 2s, 4s for base_delay=1, backoff_multiplier=2
|
254
|
+
delay = @retry_config[:base_delay] * (@retry_config[:backoff_multiplier] ** (attempt - 1))
|
255
|
+
delay = [@retry_config[:max_delay], delay].min
|
256
|
+
|
257
|
+
# Add jitter to prevent thundering herd
|
258
|
+
# When many concurrent refreshes are happening, this will space them out better
|
259
|
+
jitter = delay * 0.1
|
260
|
+
delay += rand(-jitter..jitter)
|
261
|
+
|
262
|
+
[delay, KustoConstants::Authentication::MIN_RETRY_DELAY].max # Minimum retry delay from constants
|
263
|
+
end
|
264
|
+
|
265
|
+
def permanent_error?(exception)
|
266
|
+
return false unless exception.respond_to?(:message)
|
267
|
+
|
268
|
+
message = exception.message.to_s.downcase
|
269
|
+
permanent_patterns = [
|
270
|
+
'unauthorized',
|
271
|
+
'forbidden',
|
272
|
+
'invalid_client',
|
273
|
+
'invalid_grant',
|
274
|
+
'access_denied'
|
275
|
+
]
|
276
|
+
|
277
|
+
permanent_patterns.any? { |pattern| message.include?(pattern) }
|
278
|
+
end
|
279
|
+
|
280
|
+
def record_failure(exception)
|
281
|
+
@token_state[:consecutive_failures] += 1
|
282
|
+
@token_state[:last_failure_time] = Time.now
|
283
|
+
@logger.error("Token fetch failed: #{exception&.message || 'Unknown error'}")
|
48
284
|
end
|
49
285
|
|
50
286
|
def get_token_expiry_time(expires_in_seconds)
|
51
287
|
if expires_in_seconds.nil? || expires_in_seconds.to_i <= 0
|
52
|
-
|
288
|
+
# Default to 55 minutes if expires_in is not provided or invalid
|
289
|
+
Time.now + KustoConstants::Authentication::DEFAULT_TOKEN_EXPIRY_SECONDS
|
53
290
|
else
|
54
|
-
|
291
|
+
# Use buffer from constants for better safety margin
|
292
|
+
Time.now + expires_in_seconds.to_i - KustoConstants::Authentication::TOKEN_EXPIRY_BUFFER_SECONDS
|
55
293
|
end
|
56
294
|
end
|
295
|
+
|
296
|
+
# Helper method to create HTTP client with consistent timeout configuration
|
297
|
+
# This prevents hanging connections and ensures consistent behavior across all token providers
|
298
|
+
def create_http_client(uri)
|
299
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
300
|
+
http.use_ssl = (uri.scheme == 'https')
|
301
|
+
http.open_timeout = @http_config[:open_timeout]
|
302
|
+
http.read_timeout = @http_config[:read_timeout]
|
303
|
+
http.write_timeout = @http_config[:write_timeout]
|
304
|
+
http
|
305
|
+
end
|
57
306
|
end
|
@@ -34,7 +34,7 @@ class WorkloadIdentity < AbstractTokenProvider
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def acquire_workload_identity_token
|
37
|
-
oidc_token =
|
37
|
+
oidc_token = read_token_file_safely
|
38
38
|
uri = URI.parse(format(AZURE_OAUTH2_TOKEN_ENDPOINT, tenant_id: @tenant_id))
|
39
39
|
req = Net::HTTP::Post.new(uri)
|
40
40
|
req.set_form_data(
|
@@ -44,11 +44,26 @@ class WorkloadIdentity < AbstractTokenProvider
|
|
44
44
|
'client_assertion_type' => 'urn:ietf:params:oauth:client-assertion-type:jwt-bearer',
|
45
45
|
'client_assertion' => oidc_token
|
46
46
|
)
|
47
|
-
http =
|
48
|
-
http.use_ssl = true
|
47
|
+
http = create_http_client(uri)
|
49
48
|
res = http.request(req)
|
50
49
|
raise "Failed to get access token: #{res.code} #{res.body}" unless res.is_a?(Net::HTTPSuccess)
|
51
50
|
|
52
51
|
JSON.parse(res.body)
|
53
52
|
end
|
53
|
+
|
54
|
+
def read_token_file_safely
|
55
|
+
max_attempts = 3
|
56
|
+
max_attempts.times do |attempt|
|
57
|
+
begin
|
58
|
+
# Safe file reading with corruption detection
|
59
|
+
token = File.read(@token_file).strip
|
60
|
+
raise "Empty or invalid token file" if token.empty? || token.length < 10
|
61
|
+
return token
|
62
|
+
rescue => e
|
63
|
+
@logger.warn("Token file read attempt #{attempt + 1}/#{max_attempts} failed: #{e.message}")
|
64
|
+
raise e if attempt == max_attempts - 1
|
65
|
+
sleep(0.1 * (2 ** attempt)) # Exponential backoff: 0.1s, 0.2s, 0.4s
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
54
69
|
end
|
data/lib/fluent/plugin/client.rb
CHANGED
@@ -26,6 +26,19 @@ class Client
|
|
26
26
|
@resources_expiry_time = nil
|
27
27
|
@outconfiguration = outconfiguration
|
28
28
|
@token_provider = create_token_provider(outconfiguration)
|
29
|
+
|
30
|
+
# Minimal state tracking for 12-hour reset
|
31
|
+
@client_state = {
|
32
|
+
creation_time: Time.now,
|
33
|
+
resource_fetch_count: 0,
|
34
|
+
last_successful_fetch: nil
|
35
|
+
}
|
36
|
+
|
37
|
+
# Simplified health configuration
|
38
|
+
@health_config = {
|
39
|
+
max_client_age: 43_200, # 12 hours - force reset after this time
|
40
|
+
max_fetch_cycles: 200 # Force reset after too many fetch cycles
|
41
|
+
}
|
29
42
|
end
|
30
43
|
|
31
44
|
def resources
|
@@ -36,6 +49,17 @@ class Client
|
|
36
49
|
@cached_resources
|
37
50
|
end
|
38
51
|
|
52
|
+
# Minimal health status for operational visibility
|
53
|
+
def health_status
|
54
|
+
{
|
55
|
+
resources_cached: !@cached_resources.nil?,
|
56
|
+
cache_expires_at: @resources_expiry_time,
|
57
|
+
fetch_cycles: @client_state[:resource_fetch_count],
|
58
|
+
pod_age_hours: (Time.now - @client_state[:creation_time]) / 3600,
|
59
|
+
last_successful_fetch: @client_state[:last_successful_fetch]
|
60
|
+
}
|
61
|
+
end
|
62
|
+
|
39
63
|
attr_reader :blob_sas_uri, :queue_sas_uri, :identity_token, :logger, :blob_rows, :data_endpoint, :token_provider
|
40
64
|
|
41
65
|
private
|
@@ -48,10 +72,56 @@ class Client
|
|
48
72
|
end
|
49
73
|
|
50
74
|
def resources_cached?
|
75
|
+
# Check for long-running pod health issues first
|
76
|
+
if long_running_pod_health_check_needed?
|
77
|
+
@logger.warn("Long-running pod health issue detected, forcing resource refresh")
|
78
|
+
return false
|
79
|
+
end
|
80
|
+
|
51
81
|
# Check if resources are cached and not expired
|
52
82
|
@cached_resources && @resources_expiry_time && @resources_expiry_time > Time.now
|
53
83
|
end
|
54
84
|
|
85
|
+
def long_running_pod_health_check_needed?
|
86
|
+
current_time = Time.now
|
87
|
+
|
88
|
+
# Check if client is too old (12+ hours) - force reset to prevent staleness
|
89
|
+
if @client_state[:creation_time] &&
|
90
|
+
(current_time - @client_state[:creation_time]) > @health_config[:max_client_age]
|
91
|
+
@logger.warn("Client is #{(current_time - @client_state[:creation_time]) / 3600} hours old, forcing reset")
|
92
|
+
reset_client_state_for_long_running_pod
|
93
|
+
return true
|
94
|
+
end
|
95
|
+
|
96
|
+
# Check if too many fetch cycles (potential state corruption)
|
97
|
+
if @client_state[:resource_fetch_count] > @health_config[:max_fetch_cycles]
|
98
|
+
@logger.warn("Client has #{@client_state[:resource_fetch_count]} fetch cycles, resetting state")
|
99
|
+
reset_client_state_for_long_running_pod
|
100
|
+
return true
|
101
|
+
end
|
102
|
+
|
103
|
+
# Check if no successful fetch for too long (6 hours)
|
104
|
+
if @client_state[:last_successful_fetch] &&
|
105
|
+
(current_time - @client_state[:last_successful_fetch]) > 21_600
|
106
|
+
@logger.warn("No successful resource fetch for #{(current_time - @client_state[:last_successful_fetch]) / 3600} hours")
|
107
|
+
return true
|
108
|
+
end
|
109
|
+
|
110
|
+
false
|
111
|
+
end
|
112
|
+
|
113
|
+
def reset_client_state_for_long_running_pod
|
114
|
+
@logger.info("Resetting client state for long-running pod health")
|
115
|
+
|
116
|
+
@cached_resources = nil
|
117
|
+
@resources_expiry_time = nil
|
118
|
+
@client_state[:creation_time] = Time.now
|
119
|
+
@client_state[:resource_fetch_count] = 0
|
120
|
+
@client_state[:last_successful_fetch] = nil
|
121
|
+
@client_state[:consecutive_failures] = 0
|
122
|
+
|
123
|
+
end
|
124
|
+
|
55
125
|
def fetch_and_cache_resources
|
56
126
|
# Fetch resources from Kusto and cache them
|
57
127
|
@logger.info('Fetching resources from Kusto...')
|
@@ -137,7 +207,18 @@ class Client
|
|
137
207
|
queue_sas_uri: queue_sas_uri,
|
138
208
|
identity_token: identity_token
|
139
209
|
}
|
140
|
-
|
210
|
+
|
211
|
+
# Add jitter (±30 minutes) to prevent thundering herd
|
212
|
+
base_ttl = 21_600 # 6 hours
|
213
|
+
jitter = rand(-1800..1800) # ±30 minutes
|
214
|
+
@resources_expiry_time = Time.now + base_ttl + jitter
|
215
|
+
|
216
|
+
# Update client state tracking
|
217
|
+
@client_state[:resource_fetch_count] += 1
|
218
|
+
@client_state[:last_successful_fetch] = Time.now
|
219
|
+
@client_state[:consecutive_failures] = 0
|
220
|
+
|
221
|
+
@logger.info("Resources cached with jitter: #{jitter / 60} minutes (expires at #{@resources_expiry_time}) - fetch cycle #{@client_state[:resource_fetch_count]}")
|
141
222
|
end
|
142
223
|
|
143
224
|
def validate_kusto_resource_rows(blob_rows, aad_token_rows)
|
@@ -25,7 +25,6 @@ class Ingester
|
|
25
25
|
def initialize(outconfiguration)
|
26
26
|
# Initialize Ingester with configuration and resources
|
27
27
|
@client = self.class.client(outconfiguration)
|
28
|
-
@resources = @client.resources
|
29
28
|
@logger = begin
|
30
29
|
outconfiguration.logger
|
31
30
|
rescue StandardError
|
@@ -34,8 +33,19 @@ class Ingester
|
|
34
33
|
end
|
35
34
|
|
36
35
|
def self.client(outconfiguration)
|
37
|
-
#
|
38
|
-
self.client_cache
|
36
|
+
# Thread-safe singleton client cache with basic validation
|
37
|
+
return self.client_cache if self.client_cache
|
38
|
+
|
39
|
+
# Double-checked locking pattern for thread safety
|
40
|
+
@client_mutex ||= Mutex.new
|
41
|
+
@client_mutex.synchronize do
|
42
|
+
self.client_cache ||= Client.new(outconfiguration)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
# CRITICAL FIX: Dynamic resource access instead of stale cached reference
|
47
|
+
def resources
|
48
|
+
@client.resources
|
39
49
|
end
|
40
50
|
|
41
51
|
def build_uri(container_sas_uri, name)
|
@@ -56,7 +66,8 @@ class Ingester
|
|
56
66
|
request['x-ms-blob-type'] = 'BlockBlob'
|
57
67
|
request['Content-Length'] = blob_size.to_s
|
58
68
|
|
59
|
-
response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https'
|
69
|
+
response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https',
|
70
|
+
open_timeout: 10, read_timeout: 30, write_timeout: 10) do |http|
|
60
71
|
http.request(request)
|
61
72
|
end
|
62
73
|
|
@@ -111,7 +122,8 @@ class Ingester
|
|
111
122
|
request = Net::HTTP::Post.new(post_uri)
|
112
123
|
request['Content-Type'] = 'application/xml'
|
113
124
|
request.body = "<QueueMessage><MessageText>#{encoded_message}</MessageText></QueueMessage>"
|
114
|
-
response = Net::HTTP.start(post_uri.hostname, post_uri.port, use_ssl: post_uri.scheme == 'https'
|
125
|
+
response = Net::HTTP.start(post_uri.hostname, post_uri.port, use_ssl: post_uri.scheme == 'https',
|
126
|
+
open_timeout: 10, read_timeout: 30, write_timeout: 10) do |http|
|
115
127
|
http.request(request)
|
116
128
|
end
|
117
129
|
{
|
@@ -124,10 +136,12 @@ class Ingester
|
|
124
136
|
|
125
137
|
def upload_data_to_blob_and_queue(raw_data, blob_name, db, table_name, compression_enabled = true, mapping_reference = nil)
|
126
138
|
# Upload data to blob and send ingestion message to queue
|
127
|
-
|
128
|
-
|
139
|
+
# Use dynamic resources method instead of stale cached reference
|
140
|
+
current_resources = resources
|
141
|
+
blob_uri, blob_size_bytes = upload_to_blob(current_resources[:blob_sas_uri], raw_data, blob_name)
|
142
|
+
message = prepare_ingestion_message2(db, table_name, blob_uri, blob_size_bytes, current_resources[:identity_token],
|
129
143
|
compression_enabled, mapping_reference)
|
130
|
-
post_message_to_queue_http(
|
144
|
+
post_message_to_queue_http(current_resources[:queue_sas_uri], message)
|
131
145
|
{ blob_uri: blob_uri, blob_size_bytes: blob_size_bytes }
|
132
146
|
end
|
133
147
|
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# KustoConstants contains shared configuration constants used across the Kusto plugin
|
4
|
+
# to avoid magic numbers and ensure consistency.
|
5
|
+
module KustoConstants
|
6
|
+
# Authentication and token management constants
|
7
|
+
module Authentication
|
8
|
+
# Token expiry buffer time in seconds (5 minutes)
|
9
|
+
# Used to refresh tokens before they actually expire to prevent race conditions
|
10
|
+
TOKEN_EXPIRY_BUFFER_SECONDS = 300
|
11
|
+
|
12
|
+
# Default token expiry time in seconds (55 minutes)
|
13
|
+
# Used when expires_in is not provided or invalid
|
14
|
+
DEFAULT_TOKEN_EXPIRY_SECONDS = 3300
|
15
|
+
|
16
|
+
# Maximum retry attempts for token fetching
|
17
|
+
DEFAULT_MAX_RETRIES = 3
|
18
|
+
|
19
|
+
# Base delay for exponential backoff in seconds
|
20
|
+
DEFAULT_BASE_DELAY = 1
|
21
|
+
|
22
|
+
# Backoff multiplier for exponential backoff
|
23
|
+
DEFAULT_BACKOFF_MULTIPLIER = 2
|
24
|
+
|
25
|
+
# Maximum delay between retries in seconds
|
26
|
+
DEFAULT_MAX_DELAY = 30
|
27
|
+
|
28
|
+
# Minimum retry delay in seconds (prevents too-rapid retries)
|
29
|
+
MIN_RETRY_DELAY = 0.1
|
30
|
+
|
31
|
+
# HTTP client timeout settings
|
32
|
+
HTTP_OPEN_TIMEOUT = 10
|
33
|
+
HTTP_READ_TIMEOUT = 30
|
34
|
+
HTTP_WRITE_TIMEOUT = 10
|
35
|
+
end
|
36
|
+
|
37
|
+
# Resource caching and client management constants
|
38
|
+
module ResourceCache
|
39
|
+
# Base TTL for resource cache in seconds (6 hours)
|
40
|
+
BASE_CACHE_TTL_SECONDS = 21_600
|
41
|
+
|
42
|
+
# Maximum jitter for cache TTL in seconds (±30 minutes)
|
43
|
+
CACHE_TTL_JITTER_SECONDS = 1800
|
44
|
+
end
|
45
|
+
|
46
|
+
# Long-running pod health check constants
|
47
|
+
module HealthCheck
|
48
|
+
# Maximum age before forcing reset in seconds (12 hours)
|
49
|
+
MAX_COMPONENT_AGE_SECONDS = 43_200
|
50
|
+
|
51
|
+
# Maximum refresh cycles before forcing reset
|
52
|
+
MAX_REFRESH_CYCLES = 100
|
53
|
+
|
54
|
+
# Maximum resource fetch cycles before forcing reset
|
55
|
+
MAX_FETCH_CYCLES = 200
|
56
|
+
end
|
57
|
+
end
|
@@ -8,6 +8,7 @@ require 'uri'
|
|
8
8
|
require 'json'
|
9
9
|
require 'securerandom'
|
10
10
|
require 'base64'
|
11
|
+
require_relative 'kusto_version'
|
11
12
|
|
12
13
|
def to_ingest_endpoint(data_endpoint)
|
13
14
|
# Convert a Kusto data endpoint to its corresponding ingest endpoint
|
@@ -24,12 +25,18 @@ def run_kusto_api_query(query, data_endpoint, token_provider, use_ingest_endpoin
|
|
24
25
|
|
25
26
|
http = Net::HTTP.new(uri.host, uri.port)
|
26
27
|
http.use_ssl = true
|
28
|
+
# Add timeouts to prevent hanging connections
|
29
|
+
http.open_timeout = 10
|
30
|
+
http.read_timeout = 30
|
31
|
+
http.write_timeout = 10
|
27
32
|
|
28
33
|
headers = {
|
29
34
|
'Authorization' => "Bearer #{access_token}",
|
30
35
|
'Content-Type' => 'application/json',
|
31
36
|
'Accept' => 'application/json',
|
32
|
-
'x-ms-client-version' =>
|
37
|
+
'x-ms-client-version' => "Kusto.FluentD:#{Fluent::Plugin::Kusto::VERSION}",
|
38
|
+
'x-ms-app' => 'Kusto.FluentD',
|
39
|
+
'x-ms-user' => 'Kusto.FluentD'
|
33
40
|
}
|
34
41
|
|
35
42
|
body_hash = { csl: query }
|