aiqa-client 0.3.7__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
aiqa/__init__.py CHANGED
@@ -1,22 +1,29 @@
1
1
  """
2
2
  Python client for AIQA server - OpenTelemetry tracing decorators.
3
3
 
4
- IMPORTANT: Before using any AIQA functionality, you must call get_aiqa_client() to initialize
5
- the client and load environment variables (AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG, etc.).
4
+ Initialization is automatic - you don't need to call get_aiqa_client() explicitly.
5
+ The client initializes automatically when WithTracing is first used.
6
+
7
+ Set environment variables:
8
+ AIQA_SERVER_URL: URL of the AIQA server
9
+ AIQA_API_KEY: API key for authentication
10
+ AIQA_COMPONENT_TAG: Optional component identifier
11
+ AIQA_STARTUP_DELAY_SECONDS: Optional delay before first flush (default: 10s)
6
12
 
7
13
  Example:
8
14
  from dotenv import load_dotenv
9
- from aiqa import get_aiqa_client, WithTracing
15
+ from aiqa import WithTracing
10
16
 
11
17
  # Load environment variables from .env file (if using one)
12
18
  load_dotenv()
13
19
 
14
- # Initialize client (must be called before using WithTracing or other functions)
15
- get_aiqa_client()
16
-
20
+ # No explicit initialization needed - it happens automatically when used
17
21
  @WithTracing
18
22
  def my_function():
19
23
  return "Hello, AIQA!"
24
+
25
+ # Call the function - initialization happens on first use
26
+ result = my_function()
20
27
  """
21
28
 
22
29
  from .tracing import (
@@ -36,8 +43,7 @@ from .tracing import (
36
43
  )
37
44
  from .client import get_aiqa_client
38
45
  from .experiment_runner import ExperimentRunner
39
-
40
- __version__ = "0.3.7"
46
+ from .constants import VERSION
41
47
 
42
48
  __all__ = [
43
49
  "WithTracing",
@@ -55,6 +61,6 @@ __all__ = [
55
61
  "set_conversation_id",
56
62
  "set_component_tag",
57
63
  "get_span",
58
- "__version__",
64
+ "VERSION",
59
65
  ]
60
66
 
aiqa/aiqa_exporter.py CHANGED
@@ -9,12 +9,12 @@ import logging
9
9
  import threading
10
10
  import time
11
11
  import io
12
+ import asyncio
12
13
  from typing import List, Dict, Any, Optional
13
14
  from opentelemetry.sdk.trace import ReadableSpan
14
15
  from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
15
16
 
16
- from .constants import AIQA_TRACER_NAME
17
- from . import __version__
17
+ from .constants import AIQA_TRACER_NAME, VERSION
18
18
 
19
19
  logger = logging.getLogger("AIQA")
20
20
 
@@ -32,6 +32,7 @@ class AIQASpanExporter(SpanExporter):
32
32
  flush_interval_seconds: float = 5.0,
33
33
  max_batch_size_bytes: int = 5 * 1024 * 1024, # 5MB default
34
34
  max_buffer_spans: int = 10000, # Maximum spans to buffer (prevents unbounded growth)
35
+ startup_delay_seconds: Optional[float] = None,
35
36
  ):
36
37
  """
37
38
  Initialize the AIQA span exporter.
@@ -41,24 +42,44 @@ class AIQASpanExporter(SpanExporter):
41
42
  api_key: API key for authentication (defaults to AIQA_API_KEY env var)
42
43
  flush_interval_seconds: How often to flush spans to the server
43
44
  max_batch_size_bytes: Maximum size of a single batch in bytes (default: 5mb)
45
+ max_buffer_spans: Maximum spans to buffer (prevents unbounded growth)
46
+ startup_delay_seconds: Delay before starting auto-flush (default: 10s, or AIQA_STARTUP_DELAY_SECONDS env var)
44
47
  """
45
48
  self._server_url = server_url
46
49
  self._api_key = api_key
47
50
  self.flush_interval_ms = flush_interval_seconds * 1000
48
51
  self.max_batch_size_bytes = max_batch_size_bytes
49
52
  self.max_buffer_spans = max_buffer_spans
53
+
54
+ # Get startup delay from parameter or environment variable (default: 10s)
55
+ if startup_delay_seconds is None:
56
+ env_delay = os.getenv("AIQA_STARTUP_DELAY_SECONDS")
57
+ if env_delay:
58
+ try:
59
+ startup_delay_seconds = float(env_delay)
60
+ except ValueError:
61
+ logger.warning(f"Invalid AIQA_STARTUP_DELAY_SECONDS value '{env_delay}', using default 10.0")
62
+ startup_delay_seconds = 10.0
63
+ else:
64
+ startup_delay_seconds = 10.0
65
+ self.startup_delay_seconds = startup_delay_seconds
66
+
50
67
  self.buffer: List[Dict[str, Any]] = []
51
68
  self.buffer_span_keys: set = set() # Track (traceId, spanId) tuples to prevent duplicates (Python 3.8 compatible)
52
69
  self.buffer_lock = threading.Lock()
53
70
  self.flush_lock = threading.Lock()
71
+ # shutdown_requested is only set once (in shutdown()) and read many times
72
+ # No lock needed: worst case is reading stale False, which is acceptable
54
73
  self.shutdown_requested = False
55
74
  self.flush_timer: Optional[threading.Thread] = None
75
+ self._auto_flush_started = False
76
+ self._auto_flush_lock = threading.Lock() # Lock for lazy thread creation
56
77
 
57
78
  logger.info(
58
79
  f"Initializing AIQASpanExporter: server_url={self.server_url or 'not set'}, "
59
- f"flush_interval={flush_interval_seconds}s"
80
+ f"flush_interval={flush_interval_seconds}s, startup_delay={startup_delay_seconds}s"
60
81
  )
61
- self._start_auto_flush()
82
+ # Don't start thread immediately - start lazily on first export to avoid startup issues
62
83
 
63
84
  @property
64
85
  def server_url(self) -> str:
@@ -89,6 +110,11 @@ class AIQASpanExporter(SpanExporter):
89
110
  pass
90
111
 
91
112
  logger.debug(f"AIQA export() called with {len(spans)} spans")
113
+
114
+ # Lazy initialization: start auto-flush thread on first export
115
+ # This avoids thread creation during initialization, which can cause issues in ECS deployments
116
+ self._ensure_auto_flush_started()
117
+
92
118
  # Serialize and add to buffer, deduplicating by (traceId, spanId)
93
119
  with self.buffer_lock:
94
120
  serialized_spans = []
@@ -198,7 +224,7 @@ class AIQASpanExporter(SpanExporter):
198
224
  """
199
225
  return {
200
226
  "name": AIQA_TRACER_NAME,
201
- "version": __version__,
227
+ "version": VERSION,
202
228
  }
203
229
 
204
230
  def _time_to_tuple(self, nanoseconds: int) -> tuple:
@@ -325,6 +351,8 @@ class AIQASpanExporter(SpanExporter):
325
351
  """
326
352
  Flush buffered spans to the server. Thread-safe: ensures only one flush operation runs at a time.
327
353
  Atomically extracts spans to prevent race conditions with concurrent export() calls.
354
+
355
+ Lock ordering: flush_lock -> buffer_lock (must be consistent to avoid deadlocks)
328
356
  """
329
357
  logger.debug("flush() called - attempting to acquire flush lock")
330
358
  with self.flush_lock:
@@ -347,49 +375,88 @@ class AIQASpanExporter(SpanExporter):
347
375
  self._remove_span_keys_from_tracking(spans_to_flush)
348
376
  return
349
377
 
350
- logger.info(f"flush() sending {len(spans_to_flush)} span(s) to server")
351
- try:
352
- await self._send_spans(spans_to_flush)
353
- logger.info(f"flush() successfully sent {len(spans_to_flush)} span(s) to server")
354
- # Spans already removed from buffer during extraction
355
- # Now clear their keys from tracking set to free memory
356
- self._remove_span_keys_from_tracking(spans_to_flush)
357
- except RuntimeError as error:
358
- if self._is_interpreter_shutdown_error(error):
359
- if self.shutdown_requested:
360
- logger.debug(f"flush() skipped due to interpreter shutdown: {error}")
361
- # Put spans back for retry with sync send during shutdown
362
- self._prepend_spans_to_buffer(spans_to_flush)
363
- else:
364
- logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
365
- # Put spans back for retry
366
- self._prepend_spans_to_buffer(spans_to_flush)
367
- raise
368
- logger.error(f"Error flushing spans to server: {error}")
369
- # Put spans back for retry
378
+ # Release flush_lock before I/O to avoid blocking other flush attempts
379
+ # Spans are already extracted, so concurrent exports won't interfere
380
+ logger.info(f"flush() sending {len(spans_to_flush)} span(s) to server")
381
+ try:
382
+ await self._send_spans(spans_to_flush)
383
+ logger.info(f"flush() successfully sent {len(spans_to_flush)} span(s) to server")
384
+ # Spans already removed from buffer during extraction
385
+ # Now clear their keys from tracking set to free memory
386
+ self._remove_span_keys_from_tracking(spans_to_flush)
387
+ except RuntimeError as error:
388
+ if self._is_interpreter_shutdown_error(error):
389
+ if self.shutdown_requested:
390
+ logger.debug(f"flush() skipped due to interpreter shutdown: {error}")
391
+ else:
392
+ logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
393
+ # Put spans back for retry with sync send during shutdown
370
394
  self._prepend_spans_to_buffer(spans_to_flush)
371
395
  raise
372
- except Exception as error:
373
- logger.error(f"Error flushing spans to server: {error}")
374
- # Put spans back for retry
375
- self._prepend_spans_to_buffer(spans_to_flush)
376
- if self.shutdown_requested:
377
- raise
396
+ logger.error(f"Error flushing spans to server: {error}")
397
+ # Put spans back for retry
398
+ self._prepend_spans_to_buffer(spans_to_flush)
399
+ raise
400
+ except Exception as error:
401
+ logger.error(f"Error flushing spans to server: {error}")
402
+ # Put spans back for retry
403
+ self._prepend_spans_to_buffer(spans_to_flush)
404
+ if self.shutdown_requested:
405
+ raise
378
406
 
379
- def _start_auto_flush(self) -> None:
380
- """Start the auto-flush timer."""
381
- if self.shutdown_requested:
382
- logger.warning("_start_auto_flush() called but shutdown already requested")
407
+ def _ensure_auto_flush_started(self) -> None:
408
+ """Ensure auto-flush thread is started (lazy initialization). Thread-safe."""
409
+ # Fast path: check without lock first
410
+ if self._auto_flush_started or self.shutdown_requested:
383
411
  return
384
-
385
- logger.info(f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s")
386
-
387
- def flush_worker():
388
- import asyncio
389
- logger.debug("Auto-flush worker thread started")
412
+
413
+ # Slow path: acquire lock and double-check
414
+ with self._auto_flush_lock:
415
+ if self._auto_flush_started or self.shutdown_requested:
416
+ return
417
+
418
+ try:
419
+ self._start_auto_flush()
420
+ self._auto_flush_started = True
421
+ except Exception as e:
422
+ logger.error(f"Failed to start auto-flush thread: {e}", exc_info=True)
423
+ # Don't raise - allow spans to be buffered even if auto-flush fails
424
+ # They can still be flushed manually or on shutdown
425
+
426
+ def _flush_worker(self) -> None:
427
+ """Worker function for auto-flush thread. Runs in a separate thread with its own event loop."""
428
+ import asyncio
429
+ logger.debug("Auto-flush worker thread started")
430
+
431
+ # Wait for startup delay before beginning flush operations
432
+ # This gives the container/application time to stabilize, which helps avoid startup issues (seen with AWS ECS, Dec 2025).
433
+ if self.startup_delay_seconds > 0:
434
+ logger.info(f"Auto-flush waiting {self.startup_delay_seconds}s before first flush (startup delay)")
435
+ # Sleep in small increments to allow for early shutdown
436
+ sleep_interval = 0.5
437
+ remaining_delay = self.startup_delay_seconds
438
+ while remaining_delay > 0 and not self.shutdown_requested:
439
+ sleep_time = min(sleep_interval, remaining_delay)
440
+ time.sleep(sleep_time)
441
+ remaining_delay -= sleep_time
442
+
443
+ if self.shutdown_requested:
444
+ logger.debug("Auto-flush startup delay interrupted by shutdown")
445
+ return
446
+
447
+ logger.info("Auto-flush startup delay complete, beginning flush operations")
448
+
449
+ # Create event loop in this thread (isolated from main thread's event loop)
450
+ # This prevents interference with the main application's event loop
451
+ try:
390
452
  loop = asyncio.new_event_loop()
391
453
  asyncio.set_event_loop(loop)
392
-
454
+ except Exception as e:
455
+ logger.error(f"Failed to create event loop for auto-flush thread: {e}", exc_info=True)
456
+ return
457
+
458
+ # Ensure event loop is always closed, even if an exception occurs
459
+ try:
393
460
  cycle_count = 0
394
461
  while not self.shutdown_requested:
395
462
  cycle_count += 1
@@ -397,27 +464,39 @@ class AIQASpanExporter(SpanExporter):
397
464
  try:
398
465
  loop.run_until_complete(self.flush())
399
466
  logger.debug(f"Auto-flush cycle #{cycle_count} completed, sleeping {self.flush_interval_ms / 1000.0}s")
400
- time.sleep(self.flush_interval_ms / 1000.0)
401
467
  except Exception as e:
402
468
  logger.error(f"Error in auto-flush cycle #{cycle_count}: {e}")
403
469
  logger.debug(f"Auto-flush cycle #{cycle_count} error handled, sleeping {self.flush_interval_ms / 1000.0}s")
470
+
471
+ # Sleep after each cycle (including errors) to avoid tight loops
472
+ if not self.shutdown_requested:
404
473
  time.sleep(self.flush_interval_ms / 1000.0)
405
474
 
406
475
  logger.info(f"Auto-flush worker thread stopping (shutdown requested). Completed {cycle_count} cycles.")
407
-
408
476
  # Don't do final flush here - shutdown() will handle it with synchronous send
409
477
  # This avoids event loop shutdown issues
410
478
  logger.debug("Auto-flush thread skipping final flush (will be handled by shutdown() with sync send)")
411
-
412
- # Close the event loop
479
+ finally:
480
+ # Always close the event loop, even if an exception occurs
413
481
  try:
414
482
  if not loop.is_closed():
415
483
  loop.close()
416
484
  logger.debug("Auto-flush worker thread event loop closed")
417
485
  except Exception:
418
486
  pass # Ignore errors during cleanup
487
+
488
+ def _start_auto_flush(self) -> None:
489
+ """Start the auto-flush timer with startup delay."""
490
+ if self.shutdown_requested:
491
+ logger.warning("_start_auto_flush() called but shutdown already requested")
492
+ return
493
+
494
+ logger.info(
495
+ f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s, "
496
+ f"startup delay {self.startup_delay_seconds}s"
497
+ )
419
498
 
420
- flush_thread = threading.Thread(target=flush_worker, daemon=True, name="AIQA-AutoFlush")
499
+ flush_thread = threading.Thread(target=self._flush_worker, daemon=True, name="AIQA-AutoFlush")
421
500
  flush_thread.start()
422
501
  self.flush_timer = flush_thread
423
502
  logger.info(f"Auto-flush thread started: {flush_thread.name} (daemon={flush_thread.daemon})")
@@ -439,8 +518,10 @@ class AIQASpanExporter(SpanExporter):
439
518
  else:
440
519
  logger.debug("_send_spans() no API key provided")
441
520
 
521
+ # Use timeout to prevent hanging on unreachable servers
522
+ timeout = aiohttp.ClientTimeout(total=30.0, connect=10.0)
442
523
  errors = []
443
- async with aiohttp.ClientSession() as session:
524
+ async with aiohttp.ClientSession(timeout=timeout) as session:
444
525
  for batch_idx, batch in enumerate(batches):
445
526
  try:
446
527
  logger.debug(f"_send_spans() sending batch {batch_idx + 1}/{len(batches)} with {len(batch)} spans to {url}")
@@ -458,6 +539,12 @@ class AIQASpanExporter(SpanExporter):
458
539
  # Continue with other batches even if one fails
459
540
  continue
460
541
  logger.debug(f"_send_spans() batch {batch_idx + 1} successfully sent {len(batch)} spans")
542
+ except (aiohttp.ClientError, asyncio.TimeoutError) as e:
543
+ # Network errors and timeouts - log but don't fail completely
544
+ error_msg = f"Network error in batch {batch_idx + 1}: {type(e).__name__}: {e}"
545
+ logger.warning(f"_send_spans() {error_msg} - will retry on next flush")
546
+ errors.append((batch_idx + 1, error_msg))
547
+ # Continue with other batches
461
548
  except RuntimeError as e:
462
549
  if self._is_interpreter_shutdown_error(e):
463
550
  if self.shutdown_requested:
@@ -476,6 +563,7 @@ class AIQASpanExporter(SpanExporter):
476
563
  # Continue with other batches
477
564
 
478
565
  # If any batches failed, raise an exception with details
566
+ # Spans will be restored to buffer for retry on next flush
479
567
  if errors:
480
568
  error_summary = "; ".join([f"batch {idx}: {msg}" for idx, msg in errors])
481
569
  raise Exception(f"Failed to send some spans: {error_summary}")
@@ -537,7 +625,8 @@ class AIQASpanExporter(SpanExporter):
537
625
  logger.info(f"shutdown() buffer contains {buffer_size} span(s) before shutdown")
538
626
 
539
627
  # Wait for flush thread to finish (it will do final flush)
540
- if self.flush_timer and self.flush_timer.is_alive():
628
+ # Only wait if thread was actually started
629
+ if self._auto_flush_started and self.flush_timer and self.flush_timer.is_alive():
541
630
  logger.info("shutdown() waiting for auto-flush thread to complete (timeout=10s)")
542
631
  self.flush_timer.join(timeout=10.0)
543
632
  if self.flush_timer.is_alive():
aiqa/client.py CHANGED
@@ -7,9 +7,6 @@ from opentelemetry import trace
7
7
  from opentelemetry.sdk.trace import TracerProvider
8
8
  from opentelemetry.sdk.trace.export import BatchSpanProcessor
9
9
 
10
- if TYPE_CHECKING:
11
- from .aiqa_exporter import AIQASpanExporter
12
-
13
10
  logger = logging.getLogger("AIQA")
14
11
 
15
12
  # Compatibility import for TraceIdRatioBased sampler
@@ -46,7 +43,7 @@ class AIQAClient:
46
43
  if cls._instance is None:
47
44
  cls._instance = super().__new__(cls)
48
45
  cls._instance._provider: Optional[TracerProvider] = None
49
- cls._instance._exporter: Optional[AIQASpanExporter] = None
46
+ cls._instance._exporter = None # reduce circular import issues by not importing for typecheck here
50
47
  cls._instance._enabled: bool = True
51
48
  cls._instance._initialized: bool = False
52
49
  return cls._instance
@@ -62,12 +59,12 @@ class AIQAClient:
62
59
  self._provider = value
63
60
 
64
61
  @property
65
- def exporter(self) -> Optional[AIQASpanExporter]:
62
+ def exporter(self) -> Optional[Any]:
66
63
  """Get the span exporter."""
67
64
  return self._exporter
68
65
 
69
66
  @exporter.setter
70
- def exporter(self, value: Optional[AIQASpanExporter]) -> None:
67
+ def exporter(self, value: Optional[Any]) -> None:
71
68
  """Set the span exporter."""
72
69
  self._exporter = value
73
70
 
@@ -132,9 +129,14 @@ def get_aiqa_client() -> AIQAClient:
132
129
  """
133
130
  Initialize and return the AIQA client singleton.
134
131
 
135
- This function must be called before using any AIQA tracing functionality to ensure
136
- that environment variables (such as AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG)
137
- are properly loaded and the tracing system is initialized.
132
+ This function is called automatically when WithTracing is first used, so you typically
133
+ don't need to call it explicitly. However, you can call it manually if you want to:
134
+ - Check if tracing is enabled (client.enabled)
135
+ - Initialize before the first @WithTracing usage
136
+ - Access the client object for advanced usage
137
+
138
+ The function loads environment variables (AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG)
139
+ and initializes the tracing system.
138
140
 
139
141
  The client object manages the tracing system state. Tracing is done by the WithTracing
140
142
  decorator. Experiments are run by the ExperimentRunner class.
@@ -145,12 +147,14 @@ def get_aiqa_client() -> AIQAClient:
145
147
  Example:
146
148
  from aiqa import get_aiqa_client, WithTracing
147
149
 
148
- # Initialize client (loads env vars)
150
+ # Optional: Initialize explicitly (usually not needed)
149
151
  client = get_aiqa_client()
152
+ if client.enabled:
153
+ print("Tracing is enabled")
150
154
 
151
155
  @WithTracing
152
156
  def my_function():
153
- pass
157
+ pass # Initialization happens automatically here if not done above
154
158
  """
155
159
  global client
156
160
  try:
@@ -252,10 +256,10 @@ def get_aiqa_tracer() -> trace.Tracer:
252
256
  """
253
257
  try:
254
258
  # Import here to avoid circular import
255
- from . import __version__
259
+ from . import VERSION
256
260
  # Compatibility: version parameter may not be supported in older OpenTelemetry versions
257
261
  # Try with version parameter (newer OpenTelemetry versions)
258
- return trace.get_tracer(AIQA_TRACER_NAME, version=__version__)
262
+ return trace.get_tracer(AIQA_TRACER_NAME, version=VERSION)
259
263
  except Exception as e:
260
264
  # Log issue but still return a tracer
261
265
  logger.info(f"Issue getting AIQA tracer with version: {e}, using fallback")
aiqa/constants.py CHANGED
@@ -3,3 +3,4 @@ Constants used across the AIQA client package.
3
3
  """
4
4
 
5
5
  AIQA_TRACER_NAME = "aiqa-tracer"
6
+ VERSION = "0.4.1" # automatically updated by set-version-json.sh
@@ -0,0 +1,249 @@
1
+ """
2
+ Test startup reliability - simulates ECS deployment scenarios where rapid initialization
3
+ and network issues could cause deployment failures.
4
+
5
+ These tests verify that:
6
+ 1. Exporter initialization doesn't block or create threads immediately
7
+ 2. Thread creation is lazy (only on first export)
8
+ 3. Network failures during startup don't cause hangs
9
+ 4. Multiple rapid initializations don't cause issues
10
+ """
11
+
12
+ import os
13
+ import time
14
+ import threading
15
+ import pytest
16
+ from unittest.mock import patch, MagicMock
17
+ from opentelemetry.sdk.trace import TracerProvider
18
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
19
+
20
+ from aiqa.client import get_aiqa_client, AIQAClient
21
+ from aiqa.aiqa_exporter import AIQASpanExporter
22
+
23
+
24
+ class TestStartupReliability:
25
+ """Tests for startup reliability in ECS-like scenarios."""
26
+
27
+ def test_exporter_initialization_does_not_create_thread_immediately(self):
28
+ """Verify that creating an exporter doesn't immediately start a thread."""
29
+ with patch.dict(
30
+ os.environ,
31
+ {
32
+ "AIQA_SERVER_URL": "http://localhost:3000",
33
+ "AIQA_API_KEY": "test-api-key",
34
+ },
35
+ ):
36
+ exporter = AIQASpanExporter(startup_delay_seconds=0.1)
37
+
38
+ # Thread should not be created immediately
39
+ assert exporter.flush_timer is None
40
+ assert not exporter._auto_flush_started
41
+
42
+ # Cleanup
43
+ exporter.shutdown()
44
+
45
+ def test_thread_created_lazily_on_first_export(self):
46
+ """Verify thread is only created when first span is exported."""
47
+ with patch.dict(
48
+ os.environ,
49
+ {
50
+ "AIQA_SERVER_URL": "http://localhost:3000",
51
+ "AIQA_API_KEY": "test-api-key",
52
+ },
53
+ ):
54
+ exporter = AIQASpanExporter(startup_delay_seconds=0.1)
55
+
56
+ # Thread should not exist yet
57
+ assert exporter.flush_timer is None
58
+
59
+ # Create a mock span and export it
60
+ from opentelemetry.sdk.trace import ReadableSpan
61
+ from opentelemetry.trace import SpanContext, TraceFlags
62
+
63
+ mock_span = MagicMock(spec=ReadableSpan)
64
+ mock_span.get_span_context.return_value = SpanContext(
65
+ trace_id=1, span_id=1, is_remote=False, trace_flags=TraceFlags(0x01)
66
+ )
67
+ mock_span.name = "test_span"
68
+ mock_span.kind = 1
69
+ mock_span.start_time = 1000000000
70
+ mock_span.end_time = 2000000000
71
+ mock_span.status.status_code = 1
72
+ mock_span.attributes = {}
73
+ mock_span.links = []
74
+ mock_span.events = []
75
+ mock_span.resource.attributes = {}
76
+ mock_span.parent = None
77
+
78
+ # Export should trigger thread creation
79
+ result = exporter.export([mock_span])
80
+
81
+ # Give thread a moment to start
82
+ time.sleep(0.2)
83
+
84
+ # Now thread should exist
85
+ assert exporter._auto_flush_started
86
+ assert exporter.flush_timer is not None
87
+ assert exporter.flush_timer.is_alive()
88
+
89
+ # Cleanup
90
+ exporter.shutdown()
91
+ if exporter.flush_timer:
92
+ exporter.flush_timer.join(timeout=2.0)
93
+
94
+ def test_rapid_multiple_initializations(self):
95
+ """Test that multiple rapid initializations don't cause issues (simulates health checks)."""
96
+ with patch.dict(
97
+ os.environ,
98
+ {
99
+ "AIQA_SERVER_URL": "http://localhost:3000",
100
+ "AIQA_API_KEY": "test-api-key",
101
+ },
102
+ ):
103
+ # Simulate rapid health check calls
104
+ clients = []
105
+ for _ in range(10):
106
+ client = get_aiqa_client()
107
+ clients.append(client)
108
+ time.sleep(0.01) # Very short delay
109
+
110
+ # All should be the same singleton
111
+ assert all(c is clients[0] for c in clients)
112
+
113
+ # Should not have created multiple threads
114
+ if clients[0].exporter:
115
+ assert clients[0].exporter._auto_flush_started or clients[0].exporter.flush_timer is None
116
+
117
+ def test_initialization_with_unreachable_server(self):
118
+ """Test that initialization doesn't hang when server is unreachable."""
119
+ with patch.dict(
120
+ os.environ,
121
+ {
122
+ "AIQA_SERVER_URL": "http://unreachable-server:3000",
123
+ "AIQA_API_KEY": "test-api-key",
124
+ },
125
+ ):
126
+ # Should not block or raise
127
+ client = get_aiqa_client()
128
+ assert client is not None
129
+ assert client._initialized
130
+
131
+ # Exporter should exist but thread shouldn't be started yet
132
+ if client.exporter:
133
+ # Thread creation is lazy, so it might not exist
134
+ assert client.exporter.flush_timer is None or not client.exporter._auto_flush_started
135
+
136
+ def test_startup_delay_respected(self):
137
+ """Verify that startup delay prevents immediate flush attempts."""
138
+ with patch.dict(
139
+ os.environ,
140
+ {
141
+ "AIQA_SERVER_URL": "http://localhost:3000",
142
+ "AIQA_API_KEY": "test-api-key",
143
+ },
144
+ ):
145
+ exporter = AIQASpanExporter(startup_delay_seconds=0.5)
146
+
147
+ # Create and export a span to trigger thread creation
148
+ from opentelemetry.sdk.trace import ReadableSpan
149
+ from opentelemetry.trace import SpanContext, TraceFlags
150
+
151
+ mock_span = MagicMock(spec=ReadableSpan)
152
+ mock_span.get_span_context.return_value = SpanContext(
153
+ trace_id=1, span_id=1, is_remote=False, trace_flags=TraceFlags(0x01)
154
+ )
155
+ mock_span.name = "test_span"
156
+ mock_span.kind = 1
157
+ mock_span.start_time = 1000000000
158
+ mock_span.end_time = 2000000000
159
+ mock_span.status.status_code = 1
160
+ mock_span.attributes = {}
161
+ mock_span.links = []
162
+ mock_span.events = []
163
+ mock_span.resource.attributes = {}
164
+ mock_span.parent = None
165
+
166
+ exporter.export([mock_span])
167
+
168
+ # Thread should be created
169
+ time.sleep(0.1)
170
+ assert exporter._auto_flush_started
171
+
172
+ # But flush should not have happened yet (within delay period)
173
+ # We can't easily test this without mocking time, but we verify thread exists
174
+ assert exporter.flush_timer is not None
175
+
176
+ # Cleanup
177
+ exporter.shutdown()
178
+ if exporter.flush_timer:
179
+ exporter.flush_timer.join(timeout=2.0)
180
+
181
+ def test_concurrent_initialization(self):
182
+ """Test concurrent initialization from multiple threads (simulates ECS health checks)."""
183
+ with patch.dict(
184
+ os.environ,
185
+ {
186
+ "AIQA_SERVER_URL": "http://localhost:3000",
187
+ "AIQA_API_KEY": "test-api-key",
188
+ },
189
+ ):
190
+ clients = []
191
+ errors = []
192
+
193
+ def init_client():
194
+ try:
195
+ client = get_aiqa_client()
196
+ clients.append(client)
197
+ except Exception as e:
198
+ errors.append(e)
199
+
200
+ # Start multiple threads initializing simultaneously
201
+ threads = [threading.Thread(target=init_client) for _ in range(5)]
202
+ for t in threads:
203
+ t.start()
204
+ for t in threads:
205
+ t.join(timeout=5.0)
206
+
207
+ # Should have no errors
208
+ assert len(errors) == 0
209
+
210
+ # All should be the same singleton
211
+ assert len(set(id(c) for c in clients)) == 1
212
+
213
+ def test_shutdown_before_thread_starts(self):
214
+ """Test that shutdown works even if thread was never started."""
215
+ with patch.dict(
216
+ os.environ,
217
+ {
218
+ "AIQA_SERVER_URL": "http://localhost:3000",
219
+ "AIQA_API_KEY": "test-api-key",
220
+ },
221
+ ):
222
+ exporter = AIQASpanExporter(startup_delay_seconds=1.0)
223
+
224
+ # Thread should not exist
225
+ assert exporter.flush_timer is None
226
+
227
+ # Shutdown should work without errors
228
+ exporter.shutdown()
229
+
230
+ # Should still be able to call shutdown again
231
+ exporter.shutdown()
232
+
233
+ def test_initialization_timeout(self):
234
+ """Test that initialization completes quickly even with network issues."""
235
+ with patch.dict(
236
+ os.environ,
237
+ {
238
+ "AIQA_SERVER_URL": "http://localhost:3000",
239
+ "AIQA_API_KEY": "test-api-key",
240
+ },
241
+ ):
242
+ start_time = time.time()
243
+ client = get_aiqa_client()
244
+ elapsed = time.time() - start_time
245
+
246
+ # Initialization should be fast (< 1 second)
247
+ assert elapsed < 1.0
248
+ assert client is not None
249
+
aiqa/tracing.py CHANGED
@@ -590,7 +590,8 @@ def WithTracing(
590
590
  is_generator = inspect.isgeneratorfunction(fn)
591
591
  is_async_generator = inspect.isasyncgenfunction(fn) if hasattr(inspect, 'isasyncgenfunction') else False
592
592
 
593
- tracer = get_aiqa_tracer()
593
+ # Don't get tracer here - get it lazily when function is called
594
+ # This ensures initialization only happens when tracing is actually used
594
595
 
595
596
  def _setup_span(span: trace.Span, input_data: Any) -> bool:
596
597
  """Setup span with input data. Returns True if span is recording."""
@@ -627,10 +628,13 @@ def WithTracing(
627
628
  def _execute_with_span_sync(executor: Callable[[], Any], input_data: Any) -> Any:
628
629
  """Execute sync function within span context, handling input/output and exceptions."""
629
630
  # Ensure tracer provider is initialized before creating spans
631
+ # This is called lazily when the function runs, not at decorator definition time
630
632
  client = get_aiqa_client()
631
633
  if not client.enabled:
632
634
  return executor()
633
635
 
636
+ # Get tracer after initialization (lazy)
637
+ tracer = get_aiqa_tracer()
634
638
  with tracer.start_as_current_span(fn_name) as span:
635
639
  if not _setup_span(span, input_data):
636
640
  return executor()
@@ -646,10 +650,13 @@ def WithTracing(
646
650
  async def _execute_with_span_async(executor: Callable[[], Any], input_data: Any) -> Any:
647
651
  """Execute async function within span context, handling input/output and exceptions."""
648
652
  # Ensure tracer provider is initialized before creating spans
653
+ # This is called lazily when the function runs, not at decorator definition time
649
654
  client = get_aiqa_client()
650
655
  if not client.enabled:
651
656
  return await executor()
652
657
 
658
+ # Get tracer after initialization (lazy)
659
+ tracer = get_aiqa_tracer()
653
660
  with tracer.start_as_current_span(fn_name) as span:
654
661
  if not _setup_span(span, input_data):
655
662
  return await executor()
@@ -668,10 +675,13 @@ def WithTracing(
668
675
  def _execute_generator_sync(executor: Callable[[], Any], input_data: Any) -> Any:
669
676
  """Execute sync generator function, returning a traced generator."""
670
677
  # Ensure tracer provider is initialized before creating spans
678
+ # This is called lazily when the function runs, not at decorator definition time
671
679
  client = get_aiqa_client()
672
680
  if not client.enabled:
673
681
  return executor()
674
682
 
683
+ # Get tracer after initialization (lazy)
684
+ tracer = get_aiqa_tracer()
675
685
  # Create span but don't use 'with' - span will be closed by TracedGenerator
676
686
  span = tracer.start_span(fn_name)
677
687
  token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
@@ -694,10 +704,13 @@ def WithTracing(
694
704
  async def _execute_generator_async(executor: Callable[[], Any], input_data: Any) -> Any:
695
705
  """Execute async generator function, returning a traced async generator."""
696
706
  # Ensure tracer provider is initialized before creating spans
707
+ # This is called lazily when the function runs, not at decorator definition time
697
708
  client = get_aiqa_client()
698
709
  if not client.enabled:
699
710
  return await executor()
700
711
 
712
+ # Get tracer after initialization (lazy)
713
+ tracer = get_aiqa_tracer()
701
714
  # Create span but don't use 'with' - span will be closed by TracedAsyncGenerator
702
715
  span = tracer.start_span(fn_name)
703
716
  token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
@@ -935,7 +948,8 @@ def set_component_tag(tag: str) -> None:
935
948
  This can also be set via the AIQA_COMPONENT_TAG environment variable.
936
949
  The component tag allows you to identify which component/system generated the spans.
937
950
 
938
- Note: If using environment variables, ensure you call get_aiqa_client() first to initialize
951
+ Note: Initialization is automatic when WithTracing is first used. You can also call
952
+ get_aiqa_client() explicitly, if needed, to initialize
939
953
  the client and load environment variables.
940
954
 
941
955
  Args:
@@ -1045,6 +1059,8 @@ def create_span_from_trace_id(
1045
1059
  from opentelemetry.trace import set_span_in_context
1046
1060
  parent_context = set_span_in_context(trace.NonRecordingSpan(parent_span_context))
1047
1061
 
1062
+ # Ensure initialization before creating span
1063
+ get_aiqa_client()
1048
1064
  # Start a new span in this context (it will be a child of the parent span)
1049
1065
  tracer = get_aiqa_tracer()
1050
1066
  span = tracer.start_span(span_name, context=parent_context)
@@ -1057,6 +1073,8 @@ def create_span_from_trace_id(
1057
1073
  return span
1058
1074
  except (ValueError, AttributeError) as e:
1059
1075
  logger.error(f"Error creating span from trace_id: {e}")
1076
+ # Ensure initialization before creating span
1077
+ get_aiqa_client()
1060
1078
  # Fallback: create a new span
1061
1079
  tracer = get_aiqa_tracer()
1062
1080
  span = tracer.start_span(span_name)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aiqa-client
3
- Version: 0.3.7
3
+ Version: 0.4.1
4
4
  Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
5
5
  Author-email: AIQA <info@aiqa.dev>
6
6
  License: MIT
@@ -56,6 +56,20 @@ pip install -r requirements.txt
56
56
  pip install -e .
57
57
  ```
58
58
 
59
+ ### Development Setup
60
+
61
+ For development, install with dev dependencies to run tests:
62
+
63
+ ```bash
64
+ pip install -e ".[dev]"
65
+ ```
66
+
67
+ Then run the unit tests:
68
+
69
+ ```bash
70
+ pytest
71
+ ```
72
+
59
73
  See [TESTING.md](TESTING.md) for detailed testing instructions.
60
74
 
61
75
  ## Setup
@@ -0,0 +1,16 @@
1
+ aiqa/__init__.py,sha256=8MQBrnisjeYNrwrbTheUafEWS09GtIF7ff0fBZ1Jb24,1710
2
+ aiqa/aiqa_exporter.py,sha256=ge8DOebzewWA5AW2BH4cQ4eVARtZn7jPqpgZZBDIJR4,32565
3
+ aiqa/client.py,sha256=Vm6CA4q0vNbkLXwGCjx1Khfp6tyXxEFtIwZ31PMdrYU,9920
4
+ aiqa/constants.py,sha256=-FmvbNT2blwHn_dmoWiseSseFZP7ZCNJbkjvmZkdr4k,153
5
+ aiqa/experiment_runner.py,sha256=ZEDwECstAv4lWXpcdB9WSxfDQj43iqkGzB_YzoY933M,12053
6
+ aiqa/object_serialiser.py,sha256=pgcBVw5sZH8f7N6n3-qOvEcbNhuPS5yq7qdhaNT6Sks,15236
7
+ aiqa/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ aiqa/test_experiment_runner.py,sha256=LM8BuCrzBZL0Wyu_ierK0tNLsOUxxMTAHbAGW2G0qp0,5562
9
+ aiqa/test_startup_reliability.py,sha256=bt3fc-W3BPWoVK8RIYhnbwS-saBUwtPx90W57D7nOEM,9216
10
+ aiqa/test_tracing.py,sha256=mSVrhRQ6Dz5djlSUkCt097sIr84562w6E0BnuQDpMrI,8347
11
+ aiqa/tracing.py,sha256=gdC1aHH-GUIQDqNgAZsXNH8-sGBzlB1ij4R-D02uYXk,50758
12
+ aiqa_client-0.4.1.dist-info/licenses/LICENSE,sha256=kIzkzLuzG0HHaWYm4F4W5FeJ1Yxut3Ec6bhLWyw798A,1062
13
+ aiqa_client-0.4.1.dist-info/METADATA,sha256=dRozyP6cybntCZwT29Z-4Du7wufive_AiuKDFy40IKY,7673
14
+ aiqa_client-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ aiqa_client-0.4.1.dist-info/top_level.txt,sha256=nwcsuVVSuWu27iLxZd4n1evVzv1W6FVTrSnCXCc-NQs,5
16
+ aiqa_client-0.4.1.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- aiqa/__init__.py,sha256=dnCT31jRL0nUeSvHksUris3_7lxlxChfzshxY7_gHlk,1455
2
- aiqa/aiqa_exporter.py,sha256=MbA7SkJoNm03dvrcGteU57Y0YNpVw8fzL_W-RI2lI0Q,27698
3
- aiqa/client.py,sha256=TolaBb7ZnnD5SawclI9KMBsdGKaAosTxvLVeGlKseAA,9599
4
- aiqa/constants.py,sha256=3QLmyhyVayKebM5N50P1oYbI0LtQmqxTp17UZnUeixc,89
5
- aiqa/experiment_runner.py,sha256=ZEDwECstAv4lWXpcdB9WSxfDQj43iqkGzB_YzoY933M,12053
6
- aiqa/object_serialiser.py,sha256=pgcBVw5sZH8f7N6n3-qOvEcbNhuPS5yq7qdhaNT6Sks,15236
7
- aiqa/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
- aiqa/test_experiment_runner.py,sha256=LM8BuCrzBZL0Wyu_ierK0tNLsOUxxMTAHbAGW2G0qp0,5562
9
- aiqa/test_tracing.py,sha256=mSVrhRQ6Dz5djlSUkCt097sIr84562w6E0BnuQDpMrI,8347
10
- aiqa/tracing.py,sha256=SsuK6WNgk3LbWt1aQwPPIDhitBmtyU6GOsMRvouXpDw,49706
11
- aiqa_client-0.3.7.dist-info/licenses/LICENSE,sha256=kIzkzLuzG0HHaWYm4F4W5FeJ1Yxut3Ec6bhLWyw798A,1062
12
- aiqa_client-0.3.7.dist-info/METADATA,sha256=LSshQdYneT3PfZIy19TwBt6Ow8L7IlJYHmx3EPH32pg,7505
13
- aiqa_client-0.3.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
14
- aiqa_client-0.3.7.dist-info/top_level.txt,sha256=nwcsuVVSuWu27iLxZd4n1evVzv1W6FVTrSnCXCc-NQs,5
15
- aiqa_client-0.3.7.dist-info/RECORD,,