aiqa-client 0.4.3-py3-none-any.whl → 0.5.2-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- aiqa/__init__.py +1 -1
- aiqa/client.py +108 -23
- aiqa/constants.py +3 -1
- aiqa/experiment_runner.py +12 -29
- aiqa/http_utils.py +143 -0
- aiqa/object_serialiser.py +136 -115
- aiqa/tracing.py +155 -267
- aiqa/tracing_llm_utils.py +191 -0
- {aiqa_client-0.4.3.dist-info → aiqa_client-0.5.2.dist-info}/METADATA +1 -1
- aiqa_client-0.5.2.dist-info/RECORD +14 -0
- aiqa/aiqa_exporter.py +0 -679
- aiqa/test_experiment_runner.py +0 -176
- aiqa/test_startup_reliability.py +0 -249
- aiqa/test_tracing.py +0 -230
- aiqa_client-0.4.3.dist-info/RECORD +0 -16
- {aiqa_client-0.4.3.dist-info → aiqa_client-0.5.2.dist-info}/WHEEL +0 -0
- {aiqa_client-0.4.3.dist-info → aiqa_client-0.5.2.dist-info}/licenses/LICENSE.txt +0 -0
- {aiqa_client-0.4.3.dist-info → aiqa_client-0.5.2.dist-info}/top_level.txt +0 -0
aiqa/aiqa_exporter.py
DELETED
@@ -1,679 +0,0 @@
"""
OpenTelemetry span exporter that sends spans to the AIQA server API.
Buffers spans and flushes them periodically or on shutdown. Thread-safe.
"""

import os
import json
import logging
import threading
import time
import io
import asyncio
from typing import List, Dict, Any, Optional
from opentelemetry.sdk.trace import ReadableSpan
from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult

from .constants import AIQA_TRACER_NAME, VERSION

logger = logging.getLogger("AIQA")


class AIQASpanExporter(SpanExporter):
    """
    Exports spans to AIQA server. Buffers spans and auto-flushes every flush_interval_seconds.
    Call shutdown() before process exit to flush remaining spans.
    """

    def __init__(
        self,
        server_url: Optional[str] = None,
        api_key: Optional[str] = None,
        flush_interval_seconds: float = 5.0,
        max_batch_size_bytes: int = 5 * 1024 * 1024,  # 5MB default
        max_buffer_spans: int = 10000,  # Maximum spans to buffer (prevents unbounded growth)
        startup_delay_seconds: Optional[float] = None,
    ):
        """
        Initialize the AIQA span exporter.

        Args:
            server_url: URL of the AIQA server (defaults to AIQA_SERVER_URL env var)
            api_key: API key for authentication (defaults to AIQA_API_KEY env var)
            flush_interval_seconds: How often to flush spans to the server
            max_batch_size_bytes: Maximum size of a single batch in bytes (default: 5MB)
            max_buffer_spans: Maximum spans to buffer (prevents unbounded growth)
            startup_delay_seconds: Delay before starting auto-flush (default: 10s, or AIQA_STARTUP_DELAY_SECONDS env var)
        """
        self._server_url = server_url
        self._api_key = api_key
        self.flush_interval_ms = flush_interval_seconds * 1000
        self.max_batch_size_bytes = max_batch_size_bytes
        self.max_buffer_spans = max_buffer_spans

        # Get startup delay from parameter or environment variable (default: 10s)
        if startup_delay_seconds is None:
            env_delay = os.getenv("AIQA_STARTUP_DELAY_SECONDS")
            if env_delay:
                try:
                    startup_delay_seconds = float(env_delay)
                except ValueError:
                    logger.warning(f"Invalid AIQA_STARTUP_DELAY_SECONDS value '{env_delay}', using default 10.0")
                    startup_delay_seconds = 10.0
            else:
                startup_delay_seconds = 10.0
        self.startup_delay_seconds = startup_delay_seconds

        self.buffer: List[Dict[str, Any]] = []
        self.buffer_span_keys: set = set()  # Track (traceId, spanId) tuples to prevent duplicates (Python 3.8 compatible)
        self.buffer_lock = threading.Lock()
        self.flush_lock = threading.Lock()
        # shutdown_requested is only set once (in shutdown()) and read many times
        # No lock needed: worst case is reading stale False, which is acceptable
        self.shutdown_requested = False
        self.flush_timer: Optional[threading.Thread] = None
        self._auto_flush_started = False
        self._auto_flush_lock = threading.Lock()  # Lock for lazy thread creation

        logger.info(
            f"Initializing AIQASpanExporter: server_url={self.server_url or 'not set'}, "
            f"flush_interval={flush_interval_seconds}s, startup_delay={startup_delay_seconds}s"
        )
        # Don't start thread immediately - start lazily on first export to avoid startup issues

    @property
    def server_url(self) -> str:
        return self._server_url or os.getenv("AIQA_SERVER_URL", "").rstrip("/")

    @property
    def api_key(self) -> str:
        return self._api_key or os.getenv("AIQA_API_KEY", "")

    def export(self, spans: List[ReadableSpan]) -> SpanExportResult:
        """
        Export spans to the AIQA server. Adds spans to buffer for async flushing.
        Deduplicates spans based on (traceId, spanId) to prevent repeated exports.
        """
        if not spans:
            logger.debug("export() called with empty spans list")
            return SpanExportResult.SUCCESS

        # Check if AIQA tracing is enabled
        try:
            from .client import get_aiqa_client
            client = get_aiqa_client()
            if not client.enabled:
                logger.debug(f"AIQA export() skipped: tracing is disabled")
                return SpanExportResult.SUCCESS
        except Exception:
            # If we can't check enabled status, proceed (fail open)
            pass

        logger.debug(f"AIQA export() called with {len(spans)} spans")

        # Lazy initialization: start auto-flush thread on first export
        # This avoids thread creation during initialization, which can cause issues in ECS deployments
        self._ensure_auto_flush_started()

        # Serialize and add to buffer, deduplicating by (traceId, spanId)
        with self.buffer_lock:
            serialized_spans = []
            duplicates_count = 0
            dropped_count = 0
            for span in spans:
                # Check if buffer is full (prevent unbounded growth)
                if len(self.buffer) >= self.max_buffer_spans:
                    dropped_count += 1
                    continue

                serialized = self._serialize_span(span)
                span_key = (serialized["traceId"], serialized["spanId"])
                if span_key not in self.buffer_span_keys:
                    serialized_spans.append(serialized)
                    self.buffer_span_keys.add(span_key)
                else:
                    duplicates_count += 1
                    logger.debug(f"export() skipping duplicate span: traceId={serialized['traceId']}, spanId={serialized['spanId']}")

            self.buffer.extend(serialized_spans)
            buffer_size = len(self.buffer)

        if dropped_count > 0:
            logger.warning(
                f"WARNING: Buffer full ({buffer_size} spans), dropped {dropped_count} span(s). "
                f"Consider increasing max_buffer_spans or fixing server connectivity."
            )

        if duplicates_count > 0:
            logger.debug(
                f"export() added {len(serialized_spans)} span(s) to buffer, skipped {duplicates_count} duplicate(s). "
                f"Total buffered: {buffer_size}"
            )
        else:
            logger.debug(
                f"export() added {len(spans)} span(s) to buffer. "
                f"Total buffered: {buffer_size}"
            )

        return SpanExportResult.SUCCESS

    def _serialize_span(self, span: ReadableSpan) -> Dict[str, Any]:
        """Convert ReadableSpan to a serializable format."""
        span_context = span.get_span_context()

        # Get parent span ID
        parent_span_id = None
        if hasattr(span, "parent") and span.parent:
            parent_span_id = format(span.parent.span_id, "016x")
        elif hasattr(span, "parent_span_id") and span.parent_span_id:
            parent_span_id = format(span.parent_span_id, "016x")

        # Get span kind (handle both enum and int)
        span_kind = span.kind
        if hasattr(span_kind, "value"):
            span_kind = span_kind.value

        # Get status code (handle both enum and int)
        status_code = span.status.status_code
        if hasattr(status_code, "value"):
            status_code = status_code.value

        return {
            "name": span.name,
            "kind": span_kind,
            "parentSpanId": parent_span_id,
            "startTime": self._time_to_tuple(span.start_time),
            "endTime": self._time_to_tuple(span.end_time) if span.end_time else None,
            "status": {
                "code": status_code,
                "message": getattr(span.status, "description", None),
            },
            "attributes": dict(span.attributes) if span.attributes else {},
            "links": [
                {
                    "context": {
                        "traceId": format(link.context.trace_id, "032x"),
                        "spanId": format(link.context.span_id, "016x"),
                    },
                    "attributes": dict(link.attributes) if link.attributes else {},
                }
                for link in (span.links or [])
            ],
            "events": [
                {
                    "name": event.name,
                    "time": self._time_to_tuple(event.timestamp),
                    "attributes": dict(event.attributes) if event.attributes else {},
                }
                for event in (span.events or [])
            ],
            "resource": {
                "attributes": dict(span.resource.attributes) if span.resource.attributes else {},
            },
            "traceId": format(span_context.trace_id, "032x"),
            "spanId": format(span_context.span_id, "016x"),
            "traceFlags": span_context.trace_flags,
            "duration": self._time_to_tuple(span.end_time - span.start_time) if span.end_time else None,
            "ended": span.end_time is not None,
            "instrumentationLibrary": self._get_instrumentation_library(span),
        }

    def _get_instrumentation_library(self, span: ReadableSpan) -> Dict[str, Any]:
        """
        Get instrumentation library information from the span: just use the package version.
        """
        return {
            "name": AIQA_TRACER_NAME,
            "version": VERSION,
        }

    def _time_to_tuple(self, nanoseconds: int) -> tuple:
        """Convert nanoseconds to (seconds, nanoseconds) tuple."""
        seconds = int(nanoseconds // 1_000_000_000)
        nanos = int(nanoseconds % 1_000_000_000)
        return (seconds, nanos)

    def _build_request_headers(self) -> Dict[str, str]:
        """Build HTTP headers for span requests."""
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"ApiKey {self.api_key}"
        return headers

    def _get_span_url(self) -> str:
        """Get the URL for sending spans."""
        if not self.server_url:
            raise ValueError("AIQA_SERVER_URL is not set. Cannot send spans to server.")
        return f"{self.server_url}/span"

    def _is_interpreter_shutdown_error(self, error: Exception) -> bool:
        """Check if error is due to interpreter shutdown."""
        error_str = str(error)
        return "cannot schedule new futures after" in error_str or "interpreter shutdown" in error_str

    def _extract_spans_from_buffer(self) -> List[Dict[str, Any]]:
        """Extract spans from buffer (thread-safe). Returns copy of buffer."""
        with self.buffer_lock:
            return self.buffer[:]

    def _extract_and_remove_spans_from_buffer(self) -> List[Dict[str, Any]]:
        """
        Atomically extract and remove all spans from buffer (thread-safe).
        Returns the extracted spans. This prevents race conditions where spans
        are added between extraction and clearing.
        Note: Does NOT clear buffer_span_keys - that should be done after successful send
        to avoid unnecessary clearing/rebuilding on failures.
        """
        with self.buffer_lock:
            spans = self.buffer[:]
            self.buffer.clear()
            return spans

    def _remove_span_keys_from_tracking(self, spans: List[Dict[str, Any]]) -> None:
        """
        Remove span keys from tracking set (thread-safe). Called after successful send.
        """
        with self.buffer_lock:
            for span in spans:
                span_key = (span["traceId"], span["spanId"])
                self.buffer_span_keys.discard(span_key)

    def _prepend_spans_to_buffer(self, spans: List[Dict[str, Any]]) -> None:
        """
        Prepend spans back to buffer (thread-safe). Used to restore spans
        if sending fails. Rebuilds the span keys tracking set.
        """
        with self.buffer_lock:
            self.buffer[:0] = spans
            # Rebuild span keys set from current buffer contents
            self.buffer_span_keys = {(span["traceId"], span["spanId"]) for span in self.buffer}

    def _clear_buffer(self) -> None:
        """Clear the buffer (thread-safe)."""
        with self.buffer_lock:
            self.buffer.clear()
            self.buffer_span_keys.clear()

    def _split_into_batches(self, spans: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
        """
        Split spans into batches based on max_batch_size_bytes.
        Each batch will be as large as possible without exceeding the limit.
        If a single span exceeds the limit, it will be sent in its own batch with a warning.
        """
        if not spans:
            return []

        batches = []
        current_batch = []
        current_batch_size = 0

        for span in spans:
            # Estimate size of this span when serialized
            span_json = json.dumps(span)
            span_size = len(span_json.encode('utf-8'))

            # Check if this single span exceeds the limit
            if span_size > self.max_batch_size_bytes:
                # If we have a current batch, save it first
                if current_batch:
                    batches.append(current_batch)
                    current_batch = []
                    current_batch_size = 0

                # Log warning about oversized span
                span_name = span.get('name', 'unknown')
                span_trace_id = span.get('traceId', 'unknown')
                logger.warning(
                    f"Span '{span_name}' (traceId={span_trace_id}) exceeds max_batch_size_bytes "
                    f"({span_size} bytes > {self.max_batch_size_bytes} bytes). "
                    f"Will attempt to send it anyway - may fail if server/nginx limit is exceeded."
                )
                # Still create a batch with just this span - we'll try to send it
                batches.append([span])
                continue

            # If adding this span would exceed the limit, start a new batch
            if current_batch and current_batch_size + span_size > self.max_batch_size_bytes:
                batches.append(current_batch)
                current_batch = []
                current_batch_size = 0

            current_batch.append(span)
            current_batch_size += span_size

        # Add the last batch if it has any spans
        if current_batch:
            batches.append(current_batch)

        return batches

    async def flush(self) -> None:
        """
        Flush buffered spans to the server. Thread-safe: ensures only one flush operation runs at a time.
        Atomically extracts spans to prevent race conditions with concurrent export() calls.

        Lock ordering: flush_lock -> buffer_lock (must be consistent to avoid deadlocks)
        """
        logger.debug("flush() called - attempting to acquire flush lock")
        with self.flush_lock:
            logger.debug("flush() acquired flush lock")
            # Atomically extract and remove spans to prevent race conditions
            # where export() adds spans between extraction and clearing
            spans_to_flush = self._extract_and_remove_spans_from_buffer()
            logger.debug(f"flush() extracted {len(spans_to_flush)} span(s) from buffer")

            if not spans_to_flush:
                logger.debug("flush() completed: no spans to flush")
                return

            # Skip sending if server URL is not configured
            if not self.server_url:
                logger.warning(
                    f"Skipping flush: AIQA_SERVER_URL is not set. {len(spans_to_flush)} span(s) will not be sent."
                )
                # Spans already removed from buffer, clear their keys to free memory
                self._remove_span_keys_from_tracking(spans_to_flush)
                return

        # Release flush_lock before I/O to avoid blocking other flush attempts
        # Spans are already extracted, so concurrent exports won't interfere
        logger.info(f"flush() sending {len(spans_to_flush)} span(s) to server")
        try:
            await self._send_spans(spans_to_flush)
            logger.info(f"flush() successfully sent {len(spans_to_flush)} span(s) to server")
            # Spans already removed from buffer during extraction
            # Now clear their keys from tracking set to free memory
            self._remove_span_keys_from_tracking(spans_to_flush)
        except RuntimeError as error:
            if self._is_interpreter_shutdown_error(error):
                if self.shutdown_requested:
                    logger.debug(f"flush() skipped due to interpreter shutdown: {error}")
                else:
                    logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
                # Put spans back for retry with sync send during shutdown
                self._prepend_spans_to_buffer(spans_to_flush)
                raise
            logger.error(f"Error flushing spans to server: {error}")
            # Put spans back for retry
            self._prepend_spans_to_buffer(spans_to_flush)
            raise
        except Exception as error:
            logger.error(f"Error flushing spans to server: {error}")
            # Put spans back for retry
            self._prepend_spans_to_buffer(spans_to_flush)
            if self.shutdown_requested:
                raise

    def _ensure_auto_flush_started(self) -> None:
        """Ensure auto-flush thread is started (lazy initialization). Thread-safe."""
        # Fast path: check without lock first
        if self._auto_flush_started or self.shutdown_requested:
            return

        # Slow path: acquire lock and double-check
        with self._auto_flush_lock:
            if self._auto_flush_started or self.shutdown_requested:
                return

            try:
                self._start_auto_flush()
                self._auto_flush_started = True
            except Exception as e:
                logger.error(f"Failed to start auto-flush thread: {e}", exc_info=True)
                # Don't raise - allow spans to be buffered even if auto-flush fails
                # They can still be flushed manually or on shutdown

    def _flush_worker(self) -> None:
        """Worker function for auto-flush thread. Runs in a separate thread with its own event loop."""
        import asyncio
        logger.debug("Auto-flush worker thread started")

        # Wait for startup delay before beginning flush operations
        # This gives the container/application time to stabilize, which helps avoid startup issues (seen with AWS ECS, Dec 2025).
        if self.startup_delay_seconds > 0:
            logger.info(f"Auto-flush waiting {self.startup_delay_seconds}s before first flush (startup delay)")
            # Sleep in small increments to allow for early shutdown
            sleep_interval = 0.5
            remaining_delay = self.startup_delay_seconds
            while remaining_delay > 0 and not self.shutdown_requested:
                sleep_time = min(sleep_interval, remaining_delay)
                time.sleep(sleep_time)
                remaining_delay -= sleep_time

            if self.shutdown_requested:
                logger.debug("Auto-flush startup delay interrupted by shutdown")
                return

            logger.info("Auto-flush startup delay complete, beginning flush operations")

        # Create event loop in this thread (isolated from main thread's event loop)
        # This prevents interference with the main application's event loop
        try:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        except Exception as e:
            logger.error(f"Failed to create event loop for auto-flush thread: {e}", exc_info=True)
            return

        # Ensure event loop is always closed, even if an exception occurs
        try:
            cycle_count = 0
            while not self.shutdown_requested:
                cycle_count += 1
                logger.debug(f"Auto-flush cycle #{cycle_count} starting")
                try:
                    loop.run_until_complete(self.flush())
                    logger.debug(f"Auto-flush cycle #{cycle_count} completed, sleeping {self.flush_interval_ms / 1000.0}s")
                except Exception as e:
                    logger.error(f"Error in auto-flush cycle #{cycle_count}: {e}")
                    logger.debug(f"Auto-flush cycle #{cycle_count} error handled, sleeping {self.flush_interval_ms / 1000.0}s")

                # Sleep after each cycle (including errors) to avoid tight loops
                if not self.shutdown_requested:
                    time.sleep(self.flush_interval_ms / 1000.0)

            logger.info(f"Auto-flush worker thread stopping (shutdown requested). Completed {cycle_count} cycles.")
            # Don't do final flush here - shutdown() will handle it with synchronous send
            # This avoids event loop shutdown issues
            logger.debug("Auto-flush thread skipping final flush (will be handled by shutdown() with sync send)")
        finally:
            # Always close the event loop, even if an exception occurs
            try:
                if not loop.is_closed():
                    loop.close()
                    logger.debug("Auto-flush worker thread event loop closed")
            except Exception:
                pass  # Ignore errors during cleanup

    def _start_auto_flush(self) -> None:
        """Start the auto-flush timer with startup delay."""
        if self.shutdown_requested:
            logger.warning("_start_auto_flush() called but shutdown already requested")
            return

        logger.info(
            f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s, "
            f"startup delay {self.startup_delay_seconds}s"
        )

        flush_thread = threading.Thread(target=self._flush_worker, daemon=True, name="AIQA-AutoFlush")
        flush_thread.start()
        self.flush_timer = flush_thread
        logger.info(f"Auto-flush thread started: {flush_thread.name} (daemon={flush_thread.daemon})")

    async def _send_spans(self, spans: List[Dict[str, Any]]) -> None:
        """Send spans to the server API (async). Batches large payloads automatically."""
        import aiohttp

        # Split into batches if needed
        batches = self._split_into_batches(spans)
        if len(batches) > 1:
            logger.info(f"_send_spans() splitting {len(spans)} spans into {len(batches)} batches")

        url = self._get_span_url()
        headers = self._build_request_headers()

        if self.api_key:
            logger.debug("_send_spans() using API key authentication")
        else:
            logger.debug("_send_spans() no API key provided")

        # Use timeout to prevent hanging on unreachable servers
        timeout = aiohttp.ClientTimeout(total=30.0, connect=10.0)
        errors = []
        async with aiohttp.ClientSession(timeout=timeout) as session:
            for batch_idx, batch in enumerate(batches):
                try:
                    logger.debug(f"_send_spans() sending batch {batch_idx + 1}/{len(batches)} with {len(batch)} spans to {url}")
                    # Pre-serialize JSON to bytes and wrap in BytesIO to avoid blocking event loop
                    json_bytes = json.dumps(batch).encode('utf-8')
                    data = io.BytesIO(json_bytes)

                    async with session.post(url, data=data, headers=headers) as response:
                        logger.debug(f"_send_spans() batch {batch_idx + 1} received response: status={response.status}")
                        if not response.ok:
                            error_text = await response.text()
                            error_msg = f"Failed to send batch {batch_idx + 1}/{len(batches)}: {response.status} {response.reason} - {error_text[:200]}"
                            logger.error(f"_send_spans() {error_msg}")
                            errors.append((batch_idx + 1, error_msg))
                            # Continue with other batches even if one fails
                            continue
                        logger.debug(f"_send_spans() batch {batch_idx + 1} successfully sent {len(batch)} spans")
                except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                    # Network errors and timeouts - log but don't fail completely
                    error_msg = f"Network error in batch {batch_idx + 1}: {type(e).__name__}: {e}"
                    logger.warning(f"_send_spans() {error_msg} - will retry on next flush")
                    errors.append((batch_idx + 1, error_msg))
                    # Continue with other batches
                except RuntimeError as e:
                    if self._is_interpreter_shutdown_error(e):
                        if self.shutdown_requested:
                            logger.debug(f"_send_spans() skipped due to interpreter shutdown: {e}")
                        else:
                            logger.warning(f"_send_spans() interrupted by interpreter shutdown: {e}")
                        raise
                    error_msg = f"RuntimeError in batch {batch_idx + 1}: {type(e).__name__}: {e}"
                    logger.error(f"_send_spans() {error_msg}")
                    errors.append((batch_idx + 1, error_msg))
                    # Continue with other batches
                except Exception as e:
                    error_msg = f"Exception in batch {batch_idx + 1}: {type(e).__name__}: {e}"
                    logger.error(f"_send_spans() {error_msg}")
                    errors.append((batch_idx + 1, error_msg))
                    # Continue with other batches

        # If any batches failed, raise an exception with details
        # Spans will be restored to buffer for retry on next flush
        if errors:
            error_summary = "; ".join([f"batch {idx}: {msg}" for idx, msg in errors])
            raise Exception(f"Failed to send some spans: {error_summary}")

        logger.debug(f"_send_spans() successfully sent all {len(spans)} spans in {len(batches)} batch(es)")

    def _send_spans_sync(self, spans: List[Dict[str, Any]]) -> None:
        """Send spans to the server API (synchronous, for shutdown scenarios). Batches large payloads automatically."""
        import requests

        # Split into batches if needed
        batches = self._split_into_batches(spans)
        if len(batches) > 1:
            logger.info(f"_send_spans_sync() splitting {len(spans)} spans into {len(batches)} batches")

        url = self._get_span_url()
        headers = self._build_request_headers()

        if self.api_key:
            logger.debug("_send_spans_sync() using API key authentication")
        else:
            logger.debug("_send_spans_sync() no API key provided")

        errors = []
        for batch_idx, batch in enumerate(batches):
            try:
                logger.debug(f"_send_spans_sync() sending batch {batch_idx + 1}/{len(batches)} with {len(batch)} spans to {url}")
                response = requests.post(url, json=batch, headers=headers, timeout=10.0)
                logger.debug(f"_send_spans_sync() batch {batch_idx + 1} received response: status={response.status_code}")
                if not response.ok:
                    error_text = response.text[:200] if response.text else ""
                    error_msg = f"Failed to send batch {batch_idx + 1}/{len(batches)}: {response.status_code} {response.reason} - {error_text}"
                    logger.error(f"_send_spans_sync() {error_msg}")
                    errors.append((batch_idx + 1, error_msg))
                    # Continue with other batches even if one fails
                    continue
                logger.debug(f"_send_spans_sync() batch {batch_idx + 1} successfully sent {len(batch)} spans")
            except Exception as e:
                error_msg = f"Exception in batch {batch_idx + 1}: {type(e).__name__}: {e}"
                logger.error(f"_send_spans_sync() {error_msg}")
                errors.append((batch_idx + 1, error_msg))
                # Continue with other batches

        # If any batches failed, raise an exception with details
        if errors:
            error_summary = "; ".join([f"batch {idx}: {msg}" for idx, msg in errors])
            raise Exception(f"Failed to send some spans: {error_summary}")

        logger.debug(f"_send_spans_sync() successfully sent all {len(spans)} spans in {len(batches)} batch(es)")

    def shutdown(self) -> None:
        """Shutdown the exporter, flushing any remaining spans. Call before process exit."""
        logger.info("shutdown() called - initiating exporter shutdown")
        self.shutdown_requested = True

        # Check buffer state before shutdown
        with self.buffer_lock:
            buffer_size = len(self.buffer)
            logger.info(f"shutdown() buffer contains {buffer_size} span(s) before shutdown")

        # Wait for flush thread to finish (it will do final flush)
        # Only wait if thread was actually started
        if self._auto_flush_started and self.flush_timer and self.flush_timer.is_alive():
            logger.info("shutdown() waiting for auto-flush thread to complete (timeout=10s)")
            self.flush_timer.join(timeout=10.0)
            if self.flush_timer.is_alive():
                logger.warning("shutdown() auto-flush thread did not complete within timeout")
            else:
                logger.info("shutdown() auto-flush thread completed")
        else:
            logger.debug("shutdown() no active auto-flush thread to wait for")

        # Final flush attempt (use synchronous send to avoid event loop issues)
        with self.flush_lock:
            logger.debug("shutdown() performing final flush with synchronous send")
            # Atomically extract and remove spans to prevent race conditions
            spans_to_flush = self._extract_and_remove_spans_from_buffer()
            logger.debug(f"shutdown() extracted {len(spans_to_flush)} span(s) from buffer for final flush")

            if spans_to_flush:
                if not self.server_url:
                    logger.warning(
                        f"shutdown() skipping final flush: AIQA_SERVER_URL is not set. "
                        f"{len(spans_to_flush)} span(s) will not be sent."
                    )
                    # Spans already removed from buffer, clear their keys to free memory
                    self._remove_span_keys_from_tracking(spans_to_flush)
                else:
                    logger.info(f"shutdown() sending {len(spans_to_flush)} span(s) to server (synchronous)")
                    try:
                        self._send_spans_sync(spans_to_flush)
                        logger.info(f"shutdown() successfully sent {len(spans_to_flush)} span(s) to server")
                        # Spans already removed from buffer during extraction
                        # Clear their keys from tracking set to free memory
                        self._remove_span_keys_from_tracking(spans_to_flush)
                    except Exception as e:
                        logger.error(f"shutdown() failed to send spans: {e}")
                        # Spans already removed, but process is exiting anyway
                        logger.warning(f"shutdown() {len(spans_to_flush)} span(s) were not sent due to error")
                        # Keys will remain in tracking set, but process is exiting so memory will be freed
            else:
                logger.debug("shutdown() no spans to flush")

        # Check buffer state after shutdown
        with self.buffer_lock:
            buffer_size = len(self.buffer)
            if buffer_size > 0:
                logger.warning(f"shutdown() buffer still contains {buffer_size} span(s) after shutdown")
            else:
                logger.info("shutdown() buffer is empty after shutdown")

        logger.info("shutdown() completed")
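For readers assessing what this removal takes away: the deleted module implemented a standard OpenTelemetry SpanExporter, so on 0.4.3 it could be wired into a TracerProvider like any other exporter. The snippet below is a minimal sketch of that wiring, not code from the package; the constructor arguments, the AIQA_SERVER_URL / AIQA_API_KEY environment variables and the {server_url}/span endpoint come from the deleted file above, the provider setup uses the public opentelemetry-sdk API, and the URL and key values are placeholders.

import atexit

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

from aiqa.aiqa_exporter import AIQASpanExporter  # module exists only up to 0.4.3

# Falls back to the AIQA_SERVER_URL / AIQA_API_KEY env vars if arguments are omitted.
exporter = AIQASpanExporter(
    server_url="https://aiqa.example.com",  # placeholder URL
    api_key="example-key",                  # placeholder key
    flush_interval_seconds=5.0,
)

provider = TracerProvider()
# SimpleSpanProcessor hands each ended span to export(); the exporter buffers it
# and a background thread flushes the buffer to {server_url}/span in batches.
provider.add_span_processor(SimpleSpanProcessor(exporter))
trace.set_tracer_provider(provider)

# shutdown() performs a final synchronous flush, so register it for process exit.
atexit.register(exporter.shutdown)

tracer = trace.get_tracer("example-app")
with tracer.start_as_current_span("demo-span"):
    pass  # span is buffered when it ends and sent on the next auto-flush cycle

In 0.5.2 this module is gone; judging by the file list above, its span-export responsibilities appear to have moved into the reworked aiqa/tracing.py and the new aiqa/http_utils.py, though this diff alone does not show the replacement API.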