aiqa-client 0.4.7__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
aiqa/aiqa_exporter.py DELETED
@@ -1,772 +0,0 @@
1
- """
2
- OpenTelemetry span exporter that sends spans to the AIQA server API.
3
- Buffers spans and flushes them periodically or on shutdown. Thread-safe.
4
- """
5
-
6
- import os
7
- import json
8
- import logging
9
- import threading
10
- import time
11
- import io
12
- import asyncio
13
- from typing import List, Dict, Any, Optional
14
- from opentelemetry.sdk.trace import ReadableSpan
15
- from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
16
-
17
- from .constants import AIQA_TRACER_NAME, VERSION, LOG_TAG
18
- from .http_utils import get_server_url, get_api_key, build_headers
19
- from .object_serialiser import toNumber
20
-
21
- logger = logging.getLogger(LOG_TAG)
22
-
23
-
24
- class AIQASpanExporter(SpanExporter):
25
- """
26
- Exports spans to AIQA server. Buffers spans and auto-flushes every flush_interval_seconds.
27
- Call shutdown() before process exit to flush remaining spans.
28
- """
29
-
30
def __init__(
    self,
    server_url: Optional[str] = None,
    api_key: Optional[str] = None,
    flush_interval_seconds: float = 5.0,
    max_batch_size_bytes: int = 5 * 1024 * 1024,  # 5MB default
    max_buffer_spans: Optional[int] = None,  # cap on buffered span count
    max_buffer_size_bytes: Optional[int] = None,  # cap on buffered bytes
    startup_delay_seconds: Optional[float] = None,
):
    """
    Set up configuration and empty buffer state. No thread is started here.

    Args:
        server_url: AIQA server URL; resolved through get_server_url().
        api_key: API key; resolved through get_api_key().
        flush_interval_seconds: Pause between auto-flush cycles.
        max_batch_size_bytes: Target upper bound for one HTTP batch (default 5MB).
        max_buffer_spans: Span-count limit for the buffer. A falsy value (None or 0)
            falls back to AIQA_MAX_BUFFER_SPANS, then to 10000.
        max_buffer_size_bytes: Byte limit for the buffer. A falsy value falls back to
            AIQA_MAX_BUFFER_SIZE_BYTES, then to toNumber("100m").
        startup_delay_seconds: Wait before the first auto-flush. None falls back to
            AIQA_STARTUP_DELAY_SECONDS, then to 10.0 (also used when the env value
            is not a valid float).
    """
    self._server_url = get_server_url(server_url)
    self._api_key = get_api_key(api_key)
    self.flush_interval_ms = flush_interval_seconds * 1000
    self.max_batch_size_bytes = max_batch_size_bytes

    # Explicit argument -> environment variable -> built-in default.
    self.max_buffer_spans = (
        max_buffer_spans
        or toNumber(os.getenv("AIQA_MAX_BUFFER_SPANS"))
        or 10000
    )
    self.max_buffer_size_bytes = (
        max_buffer_size_bytes
        or toNumber(os.getenv("AIQA_MAX_BUFFER_SIZE_BYTES"))
        or toNumber("100m")
    )

    # Start from the default and override only with a parseable env value.
    if startup_delay_seconds is None:
        startup_delay_seconds = 10.0
        raw_delay = os.getenv("AIQA_STARTUP_DELAY_SECONDS")
        if raw_delay:
            try:
                startup_delay_seconds = float(raw_delay)
            except ValueError:
                logger.warning(f"Invalid AIQA_STARTUP_DELAY_SECONDS value '{raw_delay}', using default 10.0")
    self.startup_delay_seconds = startup_delay_seconds

    # Buffer state; all of it is mutated under buffer_lock.
    self.buffer: List[Dict[str, Any]] = []
    # (traceId, spanId) pairs currently buffered or in flight, used for de-duplication.
    self.buffer_span_keys: set = set()
    # Running total of the serialized sizes of buffered spans.
    self.buffer_size_bytes: int = 0
    # span key -> serialized size, capped at twice the span limit.
    self._span_size_cache: Dict[tuple, int] = {}
    self._max_cache_size = self.max_buffer_spans * 2

    self.buffer_lock = threading.Lock()
    self.flush_lock = threading.Lock()
    self._auto_flush_lock = threading.Lock()  # guards lazy creation of the worker thread

    # Written by shutdown() and read without a lock elsewhere; a stale False is tolerated.
    self.shutdown_requested = False
    self.flush_timer: Optional[threading.Thread] = None
    self._auto_flush_started = False

    logger.info(f"Initializing AIQASpanExporter: server_url={self._server_url or 'not set'}, "
        f"flush_interval={flush_interval_seconds}s, startup_delay={startup_delay_seconds}s"
    )
    # The auto-flush thread is created on the first export() call, not here.
102
-
103
def export(self, spans: List[ReadableSpan]) -> SpanExportResult:
    """
    Serialize spans into the in-memory buffer; the actual send happens later.

    Spans whose (traceId, spanId) is already tracked are skipped. While a flush
    is in progress, spans that would push the buffer past max_buffer_spans or
    max_buffer_size_bytes are dropped; when no flush is running they are
    accepted and a background flush is triggered instead.

    Args:
        spans: Finished spans handed over by the OpenTelemetry SDK.

    Returns:
        SpanExportResult.SUCCESS in every case handled here (including when
        tracing is disabled or spans were dropped).
    """
    if not spans:
        logger.debug(f"export: called with empty spans list")
        return SpanExportResult.SUCCESS

    # Check if AIQA tracing is enabled
    try:
        from .client import get_aiqa_client
        client = get_aiqa_client()
        if not client.enabled:
            logger.debug(f"AIQA export: skipped: tracing is disabled")
            return SpanExportResult.SUCCESS
    except Exception:
        # Deliberate best-effort: if the enabled flag cannot be read, keep exporting.
        pass

    logger.debug(f"AIQA export() to buffer called with {len(spans)} spans")

    # The auto-flush thread is created lazily on the first export.
    self._ensure_auto_flush_started()

    with self.buffer_lock:
        accepted = []        # serialized spans accepted by this call
        accepted_bytes = 0   # their combined serialized size
        duplicates_count = 0
        dropped_count = 0
        dropped_memory_count = 0
        flush_in_progress = self.flush_lock.locked()

        for span in spans:
            # Count limit. Spans accepted earlier in this call are included so the
            # limit also holds within a single large batch.
            if flush_in_progress and len(self.buffer) + len(accepted) >= self.max_buffer_spans:
                dropped_count += 1
                continue

            serialized = self._serialize_span(span)
            span_key = (serialized["traceId"], serialized["spanId"])
            if span_key in self.buffer_span_keys:
                duplicates_count += 1
                logger.debug(f"export: skipping duplicate span: traceId={serialized['traceId']}, spanId={serialized['spanId']}")
                continue

            span_size = self._get_span_size(span_key, serialized)

            # Byte limit, again counting what this call has already accepted.
            if (
                flush_in_progress
                and self.max_buffer_size_bytes is not None
                and self.buffer_size_bytes + accepted_bytes + span_size > self.max_buffer_size_bytes
            ):
                # _get_span_size() may have cached the size; evict it so dropped
                # spans do not accumulate in the size cache.
                self._span_size_cache.pop(span_key, None)
                dropped_memory_count += 1
                continue

            accepted.append(serialized)
            accepted_bytes += span_size
            self.buffer_span_keys.add(span_key)

        self.buffer.extend(accepted)
        self.buffer_size_bytes += accepted_bytes
        buffer_size = len(self.buffer)

        # Evaluated after adding, so an over-full buffer triggers a flush below.
        threshold_reached = self._check_thresholds_reached()

    if dropped_count > 0:
        logger.warning(f"WARNING: Buffer full ({buffer_size} spans), dropped {dropped_count} span(s) (flush in progress). "
            f"Consider increasing max_buffer_spans or fixing server connectivity."
        )
    if dropped_memory_count > 0:
        logger.warning(f"WARNING: Buffer memory limit reached ({self.buffer_size_bytes} bytes / {self.max_buffer_size_bytes} bytes), "
            f"dropped {dropped_memory_count} span(s) (flush in progress). "
            f"Consider increasing AIQA_MAX_BUFFER_SIZE_BYTES or fixing server connectivity."
        )

    # Trigger immediate flush if threshold reached and flush not in progress
    if threshold_reached and not flush_in_progress:
        logger.info(f"Buffer threshold reached ({buffer_size} spans, {self.buffer_size_bytes} bytes), triggering immediate flush")
        self._trigger_immediate_flush()

    if duplicates_count > 0:
        logger.debug(f"export() added {len(accepted)} span(s) to buffer, skipped {duplicates_count} duplicate(s). "
            f"Total buffered: {buffer_size}"
        )
    else:
        # Report what was actually added, which is less than len(spans) when spans were dropped.
        logger.debug(f"export() added {len(accepted)} span(s) to buffer. "
            f"Total buffered: {buffer_size}"
        )

    return SpanExportResult.SUCCESS
205
-
206
- def _serialize_span(self, span: ReadableSpan) -> Dict[str, Any]:
207
- """Convert ReadableSpan to a serializable format."""
208
- span_context = span.get_span_context()
209
-
210
- # Get parent span ID
211
- parent_span_id = None
212
- if hasattr(span, "parent") and span.parent:
213
- parent_span_id = format(span.parent.span_id, "016x")
214
- elif hasattr(span, "parent_span_id") and span.parent_span_id:
215
- parent_span_id = format(span.parent_span_id, "016x")
216
-
217
- # Get span kind (handle both enum and int)
218
- span_kind = span.kind
219
- if hasattr(span_kind, "value"):
220
- span_kind = span_kind.value
221
-
222
- # Get status code (handle both enum and int)
223
- status_code = span.status.status_code
224
- if hasattr(status_code, "value"):
225
- status_code = status_code.value
226
-
227
- return {
228
- "name": span.name,
229
- "kind": span_kind,
230
- "parentSpanId": parent_span_id,
231
- "startTime": self._time_to_tuple(span.start_time),
232
- "endTime": self._time_to_tuple(span.end_time) if span.end_time else None,
233
- "status": {
234
- "code": status_code,
235
- "message": getattr(span.status, "description", None),
236
- },
237
- "attributes": dict(span.attributes) if span.attributes else {},
238
- "links": [
239
- {
240
- "context": {
241
- "traceId": format(link.context.trace_id, "032x"),
242
- "spanId": format(link.context.span_id, "016x"),
243
- },
244
- "attributes": dict(link.attributes) if link.attributes else {},
245
- }
246
- for link in (span.links or [])
247
- ],
248
- "events": [
249
- {
250
- "name": event.name,
251
- "time": self._time_to_tuple(event.timestamp),
252
- "attributes": dict(event.attributes) if event.attributes else {},
253
- }
254
- for event in (span.events or [])
255
- ],
256
- "resource": {
257
- "attributes": dict(span.resource.attributes) if span.resource.attributes else {},
258
- },
259
- "traceId": format(span_context.trace_id, "032x"),
260
- "spanId": format(span_context.span_id, "016x"),
261
- "traceFlags": span_context.trace_flags,
262
- "duration": self._time_to_tuple(span.end_time - span.start_time) if span.end_time else None,
263
- "ended": span.end_time is not None,
264
- "instrumentationLibrary": self._get_instrumentation_library(span),
265
- }
266
-
267
def _get_instrumentation_library(self, span: ReadableSpan) -> Dict[str, Any]:
    """
    Describe the instrumentation library attached to every exported span.

    The span argument is not inspected; the tracer name and package version
    constants are reported for all spans.
    """
    library_info: Dict[str, Any] = dict(name=AIQA_TRACER_NAME, version=VERSION)
    return library_info
275
-
276
- def _time_to_tuple(self, nanoseconds: int) -> tuple:
277
- """Convert nanoseconds to (seconds, nanoseconds) tuple."""
278
- seconds = int(nanoseconds // 1_000_000_000)
279
- nanos = int(nanoseconds % 1_000_000_000)
280
- return (seconds, nanos)
281
-
282
- def _get_span_size(self, span_key: tuple, serialized: Dict[str, Any]) -> int:
283
- """
284
- Get span size from cache or calculate and cache it.
285
- Thread-safe when called within buffer_lock.
286
- Limits cache size to prevent unbounded memory growth.
287
- """
288
- if span_key in self._span_size_cache:
289
- return self._span_size_cache[span_key]
290
- span_json = json.dumps(serialized)
291
- span_size = len(span_json.encode('utf-8'))
292
- # Only cache if we have valid keys and cache isn't too large
293
- if span_key[0] and span_key[1] and len(self._span_size_cache) < self._max_cache_size:
294
- self._span_size_cache[span_key] = span_size
295
- return span_size
296
-
297
- def _check_thresholds_reached(self) -> bool:
298
- """Check if buffer thresholds are reached. Must be called within buffer_lock."""
299
- if len(self.buffer) >= self.max_buffer_spans:
300
- return True
301
- if self.max_buffer_size_bytes is not None and self.buffer_size_bytes >= self.max_buffer_size_bytes:
302
- return True
303
- return False
304
-
305
def _build_request_headers(self) -> Dict[str, str]:
    """Return the HTTP headers for span uploads, as produced by build_headers() for the stored API key."""
    request_headers = build_headers(self._api_key)
    return request_headers
308
-
309
- def _get_span_url(self) -> str:
310
- return f"{self._server_url}/span"
311
-
312
- def _is_interpreter_shutdown_error(self, error: Exception) -> bool:
313
- """Check if error is due to interpreter shutdown."""
314
- error_str = str(error)
315
- return "cannot schedule new futures after" in error_str or "interpreter shutdown" in error_str
316
-
317
- def _extract_spans_from_buffer(self) -> List[Dict[str, Any]]:
318
- """Extract spans from buffer (thread-safe). Returns copy of buffer."""
319
- with self.buffer_lock:
320
- return self.buffer[:]
321
-
322
- def _extract_and_remove_spans_from_buffer(self) -> List[Dict[str, Any]]:
323
- """
324
- Atomically extract and remove all spans from buffer (thread-safe).
325
- Returns the extracted spans. This prevents race conditions where spans
326
- are added between extraction and clearing.
327
- Note: Does NOT clear buffer_span_keys - that should be done after successful send
328
- to avoid unnecessary clearing/rebuilding on failures.
329
- Also resets buffer_size_bytes to 0.
330
- """
331
- with self.buffer_lock:
332
- spans = self.buffer[:]
333
- self.buffer.clear()
334
- self.buffer_size_bytes = 0
335
- return spans
336
-
337
- def _remove_span_keys_from_tracking(self, spans: List[Dict[str, Any]]) -> None:
338
- """
339
- Remove span keys from tracking set and size cache (thread-safe). Called after successful send.
340
- """
341
- with self.buffer_lock:
342
- for span in spans:
343
- span_key = (span["traceId"], span["spanId"])
344
- self.buffer_span_keys.discard(span_key)
345
- # Also remove from size cache to free memory
346
- self._span_size_cache.pop(span_key, None)
347
-
348
- def _prepend_spans_to_buffer(self, spans: List[Dict[str, Any]]) -> None:
349
- """
350
- Prepend spans back to buffer (thread-safe). Used to restore spans
351
- if sending fails. Rebuilds the span keys tracking set and buffer size.
352
- Uses cached sizes when available to avoid re-serialization.
353
- """
354
- with self.buffer_lock:
355
- self.buffer[:0] = spans
356
- # Rebuild span keys set from current buffer contents
357
- self.buffer_span_keys = {(span["traceId"], span["spanId"]) for span in self.buffer}
358
- # Recalculate buffer size using cache when available
359
- total_size = 0
360
- for span in self.buffer:
361
- span_key = (span.get("traceId"), span.get("spanId"))
362
- total_size += self._get_span_size(span_key, span)
363
- self.buffer_size_bytes = total_size
364
-
365
- def _clear_buffer(self) -> None:
366
- """Clear the buffer (thread-safe)."""
367
- with self.buffer_lock:
368
- self.buffer.clear()
369
- self.buffer_span_keys.clear()
370
- self.buffer_size_bytes = 0
371
- self._span_size_cache.clear()
372
-
373
- def _split_into_batches(self, spans: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
374
- """
375
- Split spans into batches based on max_batch_size_bytes.
376
- Each batch will be as large as possible without exceeding the limit.
377
- If a single span exceeds the limit, it will be sent in its own batch with a warning.
378
- """
379
- if not spans:
380
- return []
381
-
382
- batches = []
383
- current_batch = []
384
- current_batch_size = 0
385
-
386
- for span in spans:
387
- # Get size from cache if available, otherwise calculate it
388
- span_key = (span.get("traceId"), span.get("spanId"))
389
- span_size = self._get_span_size(span_key, span)
390
-
391
- # Check if this single span exceeds the limit
392
- if span_size > self.max_batch_size_bytes:
393
- # If we have a current batch, save it first
394
- if current_batch:
395
- batches.append(current_batch)
396
- current_batch = []
397
- current_batch_size = 0
398
-
399
- # Log warning about oversized span
400
- span_name = span.get('name', 'unknown')
401
- span_trace_id = span.get('traceId', 'unknown')
402
- logger.warning(f"Span \'{span_name}' (traceId={span_trace_id}) exceeds max_batch_size_bytes "
403
- f"({span_size} bytes > {self.max_batch_size_bytes} bytes). "
404
- f"Will attempt to send it anyway - may fail if server/nginx limit is exceeded."
405
- )
406
- # Still create a batch with just this span - we'll try to send it
407
- batches.append([span])
408
- continue
409
-
410
- # If adding this span would exceed the limit, start a new batch
411
- if current_batch and current_batch_size + span_size > self.max_batch_size_bytes:
412
- batches.append(current_batch)
413
- current_batch = []
414
- current_batch_size = 0
415
-
416
- current_batch.append(span)
417
- current_batch_size += span_size
418
-
419
- # Add the last batch if it has any spans
420
- if current_batch:
421
- batches.append(current_batch)
422
-
423
- return batches
424
-
425
async def flush(self) -> None:
    """
    Send everything currently buffered to the server.

    flush_lock is held for the whole call, including the network send, so at
    most one flush runs at a time. Spans are drained from the buffer in one
    step; on failure they are put back at the front of the buffer.

    Lock ordering: flush_lock -> buffer_lock.

    Raises:
        RuntimeError: always re-raised after the spans are restored.
        Exception: other failures are re-raised only when shutdown has been requested.
    """
    logger.debug(f"flush: called - attempting to acquire flush lock")
    with self.flush_lock:
        logger.debug(f"flush() acquired flush lock")
        # Drain in one step so a concurrent export() cannot slip spans in
        # between the copy and the clear.
        pending = self._extract_and_remove_spans_from_buffer()
        logger.debug(f"flush: extracted {len(pending)} span(s) from buffer")

        if not pending:
            logger.debug(f"flush() completed: no spans to flush")
            return

        if not self._server_url:
            logger.warning(f"Skipping flush: AIQA_SERVER_URL is not set. {len(pending)} span(s) will not be sent."
            )
            # The spans are discarded; release their tracking entries too.
            self._remove_span_keys_from_tracking(pending)
            return

        logger.info(f"flush: sending {len(pending)} span(s) to server")
        try:
            await self._send_spans(pending)
            logger.info(f"flush() successfully sent {len(pending)} span(s) to server")
            # Sent spans are already out of the buffer; drop their tracking entries.
            self._remove_span_keys_from_tracking(pending)
        except RuntimeError as error:
            # Every RuntimeError restores the spans and propagates; only the log line differs.
            if not self._is_interpreter_shutdown_error(error):
                logger.error(f"Error flushing spans to server: {error}")
            elif self.shutdown_requested:
                logger.debug(f"flush: skipped due to interpreter shutdown: {error}")
            else:
                logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
            self._prepend_spans_to_buffer(pending)
            raise
        except Exception as error:
            logger.error(f"Error flushing spans to server: {error}")
            # Restore for a later retry; swallow the error unless shutting down.
            self._prepend_spans_to_buffer(pending)
            if self.shutdown_requested:
                raise
480
-
481
- def _ensure_auto_flush_started(self) -> None:
482
- """Ensure auto-flush thread is started (lazy initialization). Thread-safe."""
483
- # Fast path: check without lock first
484
- if self._auto_flush_started or self.shutdown_requested:
485
- return
486
-
487
- # Slow path: acquire lock and double-check
488
- with self._auto_flush_lock:
489
- if self._auto_flush_started or self.shutdown_requested:
490
- return
491
-
492
- try:
493
- self._start_auto_flush()
494
- self._auto_flush_started = True
495
- except Exception as e:
496
- logger.error(f"Failed to start auto-flush thread: {e}", exc_info=True)
497
- # Don't raise - allow spans to be buffered even if auto-flush fails
498
- # They can still be flushed manually or on shutdown
499
-
500
def _trigger_immediate_flush(self) -> None:
    """
    Run one flush() on a new daemon thread and return without waiting for it.

    The thread creates its own event loop, drives flush() to completion on it
    and closes the loop afterwards. Any exception is logged, not raised.
    """
    def run_flush_once() -> None:
        try:
            private_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(private_loop)
            try:
                private_loop.run_until_complete(self.flush())
            finally:
                if not private_loop.is_closed():
                    private_loop.close()
        except Exception as e:
            logger.error(f"Error in immediate flush thread: {e}", exc_info=True)

    # Daemon thread, so it never keeps the process alive at exit.
    threading.Thread(
        target=run_flush_once,
        daemon=True,
        name="AIQA-ImmediateFlush",
    ).start()
521
-
522
def _flush_worker(self) -> None:
    """
    Body of the auto-flush thread.

    Waits startup_delay_seconds, then repeatedly runs flush() on an event loop
    owned by this thread, pausing flush_interval_ms between cycles. Both waits
    are sliced into short sleeps so a shutdown request is noticed within about
    half a second instead of after a full interval. No final flush is done
    here; shutdown() performs it with a synchronous send.
    """
    def sleep_unless_shutdown(total_seconds: float, step: float = 0.5) -> None:
        """Sleep up to total_seconds in slices of `step`, stopping early once shutdown is requested."""
        remaining = total_seconds
        while remaining > 0 and not self.shutdown_requested:
            chunk = min(step, remaining)
            time.sleep(chunk)
            remaining -= chunk

    logger.debug(f"Auto-flush worker thread started")

    # Startup delay: give the host application time to settle before the first flush.
    if self.startup_delay_seconds > 0:
        logger.info(f"Auto-flush waiting {self.startup_delay_seconds}s before first flush (startup delay)")
        sleep_unless_shutdown(self.startup_delay_seconds)

        if self.shutdown_requested:
            logger.debug(f"Auto-flush startup delay interrupted by shutdown")
            return

        logger.info(f"Auto-flush startup delay complete, beginning flush operations")

    # This thread gets its own event loop, separate from any loop the application runs.
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    except Exception as e:
        logger.error(f"Failed to create event loop for auto-flush thread: {e}", exc_info=True)
        return

    try:
        cycle_count = 0
        while not self.shutdown_requested:
            cycle_count += 1
            logger.debug(f"Auto-flush cycle #{cycle_count} starting")
            try:
                loop.run_until_complete(self.flush())
                logger.debug(f"Auto-flush cycle #{cycle_count} completed, sleeping {self.flush_interval_ms / 1000.0}s")
            except Exception as e:
                logger.error(f"Error in auto-flush cycle #{cycle_count}: {e}")
                logger.debug(f"Auto-flush cycle #{cycle_count} error handled, sleeping {self.flush_interval_ms / 1000.0}s")

            # Pause after every cycle (errors included) to avoid a tight loop.
            # Sliced so shutdown() is not left waiting on a full interval.
            sleep_unless_shutdown(self.flush_interval_ms / 1000.0)

        logger.info(f"Auto-flush worker thread stopping (shutdown requested). Completed {cycle_count} cycles.")
        # The final flush is left to shutdown(), which sends synchronously.
        logger.debug(f"Auto-flush thread skipping final flush (will be handled by shutdown() with sync send)")
    finally:
        # Close the loop on every exit path; cleanup errors are ignored.
        try:
            if not loop.is_closed():
                loop.close()
            logger.debug(f"Auto-flush worker thread event loop closed")
        except Exception:
            pass
583
-
584
def _start_auto_flush(self) -> None:
    """
    Create and start the daemon thread that runs _flush_worker, and keep a
    reference to it in flush_timer. Does nothing (beyond a warning) once
    shutdown has been requested.
    """
    if self.shutdown_requested:
        logger.warning(f"_start_auto_flush() called but shutdown already requested")
        return

    interval_seconds = self.flush_interval_ms / 1000.0
    logger.info(f"Starting auto-flush thread with interval {interval_seconds}s, "
        f"startup delay {self.startup_delay_seconds}s"
    )

    worker = threading.Thread(
        target=self._flush_worker,
        daemon=True,
        name="AIQA-AutoFlush",
    )
    worker.start()
    self.flush_timer = worker
    logger.info(f"Auto-flush thread started: {worker.name} (daemon={worker.daemon})")
598
-
599
async def _send_spans(self, spans: List[Dict[str, Any]]) -> None:
    """
    POST the spans to the server with aiohttp, one request per batch.

    Batches come from _split_into_batches(). A failing batch is recorded and
    the remaining batches are still attempted. If no API key is set the call
    logs an error and returns without sending.

    Raises:
        RuntimeError: re-raised immediately when it looks like interpreter shutdown.
        Exception: raised after all batches were attempted if any of them failed.
    """
    import aiohttp

    batches = self._split_into_batches(spans)
    total_batches = len(batches)
    if total_batches > 1:
        logger.info(f"_send_spans: splitting {len(spans)} spans into {total_batches} batches")

    url = self._get_span_url()
    headers = self._build_request_headers()

    if not self._api_key:  # This should not happen
        logger.error(f"_send_spans: fail - no API key provided. {len(spans)} spans lost.")
        # The caller has already taken these spans out of the buffer, so they are dropped here.
        return

    failures = []  # (batch number, message) for every batch that did not go through

    # Bounded timeouts so an unreachable server cannot hang the flush.
    timeout = aiohttp.ClientTimeout(total=30.0, connect=10.0)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        for batch_no, batch in enumerate(batches, start=1):
            try:
                logger.debug(f"_send_spans: sending batch {batch_no}/{total_batches} with {len(batch)} spans to {url}")
                # Serialize up front and hand aiohttp a bytes stream.
                body = io.BytesIO(json.dumps(batch).encode('utf-8'))

                async with session.post(url, data=body, headers=headers) as response:
                    logger.debug(f"_send_spans: batch {batch_no} received response: status={response.status}")
                    if response.ok:
                        logger.debug(f"_send_spans: batch {batch_no} successfully sent {len(batch)} spans")
                    else:
                        error_text = await response.text()
                        error_msg = f"Failed to send batch {batch_no}/{total_batches}: {response.status} {response.reason} - {error_text[:200]}"
                        logger.error(f"_send_spans: {error_msg}")
                        failures.append((batch_no, error_msg))
            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                # Network trouble: note it and move on to the next batch.
                error_msg = f"Network error in batch {batch_no}: {type(e).__name__}: {e}"
                logger.warning(f"_send_spans: {error_msg} - will retry on next flush")
                failures.append((batch_no, error_msg))
            except RuntimeError as e:
                if self._is_interpreter_shutdown_error(e):
                    if self.shutdown_requested:
                        logger.debug(f"_send_spans: skipped due to interpreter shutdown: {e}")
                    else:
                        logger.warning(f"_send_spans: interrupted by interpreter shutdown: {e}")
                    raise
                error_msg = f"RuntimeError in batch {batch_no}: {type(e).__name__}: {e}"
                logger.error(f"_send_spans: {error_msg}")
                failures.append((batch_no, error_msg))
            except Exception as e:
                error_msg = f"Exception in batch {batch_no}: {type(e).__name__}: {e}"
                logger.error(f"_send_spans: {error_msg}")
                failures.append((batch_no, error_msg))

    # Any failure is surfaced as one exception summarising all failed batches.
    if failures:
        error_summary = "; ".join(f"batch {idx}: {msg}" for idx, msg in failures)
        raise Exception(f"Failed to send some spans: {error_summary}")

    logger.debug(f"_send_spans: successfully sent all {len(spans)} spans in {total_batches} batch(es)")
667
-
668
def _send_spans_sync(self, spans: List[Dict[str, Any]]) -> None:
    """
    POST the spans to the server with `requests`, one blocking request per batch.

    Used by shutdown(). A failing batch is recorded and the remaining batches
    are still attempted. If no API key is set the call logs an error and
    returns without sending.

    Raises:
        Exception: raised after all batches were attempted if any of them failed.
    """
    import requests

    batches = self._split_into_batches(spans)
    total_batches = len(batches)
    if total_batches > 1:
        logger.info(f"_send_spans_sync() splitting {len(spans)} spans into {total_batches} batches")

    url = self._get_span_url()
    headers = self._build_request_headers()

    if not self._api_key:
        logger.error(f"_send_spans_sync() fail - no API key provided")
        return

    failures = []  # (batch number, message) for every batch that did not go through
    for batch_no, batch in enumerate(batches, start=1):
        try:
            logger.debug(f"_send_spans_sync() sending batch {batch_no}/{total_batches} with {len(batch)} spans to {url}")
            response = requests.post(url, json=batch, headers=headers, timeout=10.0)
            logger.debug(f"_send_spans_sync() batch {batch_no} received response: status={response.status_code}")
            if response.ok:
                logger.debug(f"_send_spans_sync() batch {batch_no} successfully sent {len(batch)} spans")
            else:
                error_text = response.text[:200] if response.text else ""
                error_msg = f"Failed to send batch {batch_no}/{total_batches}: {response.status_code} {response.reason} - {error_text}"
                logger.error(f"_send_spans_sync() {error_msg}")
                failures.append((batch_no, error_msg))
        except Exception as e:
            # Record and carry on with the next batch.
            error_msg = f"Exception in batch {batch_no}: {type(e).__name__}: {e}"
            logger.error(f"_send_spans_sync() {error_msg}")
            failures.append((batch_no, error_msg))

    if failures:
        error_summary = "; ".join(f"batch {idx}: {msg}" for idx, msg in failures)
        raise Exception(f"Failed to send some spans: {error_summary}")

    logger.debug(f"_send_spans_sync() successfully sent all {len(spans)} spans in {total_batches} batch(es)")
710
-
711
def shutdown(self) -> None:
    """
    Stop the exporter and make one last attempt to deliver buffered spans.

    Sets shutdown_requested, waits up to 10 seconds for the auto-flush thread
    to exit, then drains the buffer and sends it with the synchronous sender
    while holding flush_lock. A failed final send is logged, not raised.
    Call before process exit.
    """
    logger.info(f"shutdown: called - initiating exporter shutdown")
    self.shutdown_requested = True

    with self.buffer_lock:
        buffer_size = len(self.buffer)
        logger.info(f"shutdown: buffer contains {buffer_size} span(s) before shutdown")

    # Let the worker thread wind down; it exits its loop without a final flush.
    worker_running = bool(
        self._auto_flush_started and self.flush_timer and self.flush_timer.is_alive()
    )
    if not worker_running:
        logger.debug(f"shutdown: no active auto-flush thread to wait for")
    else:
        logger.info(f"shutdown: waiting for auto-flush thread to complete (timeout=10s)")
        self.flush_timer.join(timeout=10.0)
        if self.flush_timer.is_alive():
            logger.warning(f"shutdown: auto-flush thread did not complete within timeout")
        else:
            logger.info(f"shutdown: auto-flush thread completed")

    # Final flush, done synchronously so no event loop is needed at exit.
    with self.flush_lock:
        logger.debug(f"shutdown: performing final flush with synchronous send")
        remaining = self._extract_and_remove_spans_from_buffer()
        logger.debug(f"shutdown: extracted {len(remaining)} span(s) from buffer for final flush")

        if not remaining:
            logger.debug(f"shutdown: no spans to flush")
        elif not self._server_url:
            logger.warning(f"shutdown: skipping final flush: AIQA_SERVER_URL is not set. "
                f"{len(remaining)} span(s) will not be sent."
            )
            # Nothing will be sent; release the tracking entries.
            self._remove_span_keys_from_tracking(remaining)
        else:
            logger.info(f"shutdown: sending {len(remaining)} span(s) to server (synchronous)")
            try:
                self._send_spans_sync(remaining)
                logger.info(f"shutdown: successfully sent {len(remaining)} span(s) to server")
                self._remove_span_keys_from_tracking(remaining)
            except Exception as e:
                # The spans are not restored to the buffer; their keys stay tracked.
                logger.error(f"shutdown: failed to send spans: {e}")
                logger.warning(f"shutdown: {len(remaining)} span(s) were not sent due to error")

    with self.buffer_lock:
        buffer_size = len(self.buffer)
        if buffer_size > 0:
            logger.warning(f"shutdown: buffer still contains {buffer_size} span(s) after shutdown")
        else:
            logger.info(f"shutdown: buffer is empty after shutdown")

    logger.info(f"shutdown: completed")
772
-