PyPI - aiqa-client - Versions diffs - 0.3.7__tar.gz → 0.4.1__tar.gz - Mend

aiqa-client 0.3.7 (tar.gz) → 0.4.1 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

{aiqa_client-0.3.7/aiqa_client.egg-info → aiqa_client-0.4.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aiqa-client
-Version: 0.3.7
+Version: 0.4.1
 Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
 Author-email: AIQA <info@aiqa.dev>
 License: MIT
@@ -56,6 +56,20 @@ pip install -r requirements.txt
 pip install -e .
 ```
+### Development Setup
+For development, install with dev dependencies to run tests:
+```bash
+pip install -e ".[dev]"
+```
+Then run the unit tests:
+```bash
+pytest
+```
 See [TESTING.md](TESTING.md) for detailed testing instructions.
 ## Setup

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/README.md RENAMED Viewed

@@ -19,6 +19,20 @@ pip install -r requirements.txt
 pip install -e .
 ```
+### Development Setup
+For development, install with dev dependencies to run tests:
+```bash
+pip install -e ".[dev]"
+```
+Then run the unit tests:
+```bash
+pytest
+```
 See [TESTING.md](TESTING.md) for detailed testing instructions.
 ## Setup

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/aiqa/__init__.py RENAMED Viewed

@@ -1,22 +1,29 @@
 """
 Python client for AIQA server - OpenTelemetry tracing decorators.
-IMPORTANT: Before using any AIQA functionality, you must call get_aiqa_client() to initialize
-the client and load environment variables (AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG, etc.).
+Initialization is automatic - you don't need to call get_aiqa_client() explicitly.
+The client initializes automatically when WithTracing is first used.
+Set environment variables:
+    AIQA_SERVER_URL: URL of the AIQA server
+    AIQA_API_KEY: API key for authentication
+    AIQA_COMPONENT_TAG: Optional component identifier
+    AIQA_STARTUP_DELAY_SECONDS: Optional delay before first flush (default: 10s)
 Example:
     from dotenv import load_dotenv
-    from aiqa import get_aiqa_client, WithTracing
+    from aiqa import WithTracing
     # Load environment variables from .env file (if using one)
     load_dotenv()
-    # Initialize client (must be called before using WithTracing or other functions)
-    get_aiqa_client()
+    # No explicit initialization needed - it happens automatically when used
     @WithTracing
     def my_function():
         return "Hello, AIQA!"
+    # Call the function - initialization happens on first use
+    result = my_function()
 """
 from .tracing import (
@@ -36,8 +43,7 @@ from .tracing import (
 )
 from .client import get_aiqa_client
 from .experiment_runner import ExperimentRunner
-__version__ = "0.3.7"
+from .constants import VERSION
 __all__ = [
     "WithTracing",
@@ -55,6 +61,6 @@ __all__ = [
     "set_conversation_id",
     "set_component_tag",
     "get_span",
-    "__version__",
+    "VERSION",
 ]

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/aiqa/aiqa_exporter.py RENAMED Viewed

@@ -9,12 +9,12 @@ import logging
 import threading
 import time
 import io
+import asyncio
 from typing import List, Dict, Any, Optional
 from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
-from .constants import AIQA_TRACER_NAME
-from . import __version__
+from .constants import AIQA_TRACER_NAME, VERSION
 logger = logging.getLogger("AIQA")
@@ -32,6 +32,7 @@ class AIQASpanExporter(SpanExporter):
         flush_interval_seconds: float = 5.0,
         max_batch_size_bytes: int = 5 * 1024 * 1024,  # 5MB default
         max_buffer_spans: int = 10000,  # Maximum spans to buffer (prevents unbounded growth)
+        startup_delay_seconds: Optional[float] = None,
     ):
         """
         Initialize the AIQA span exporter.
@@ -41,24 +42,44 @@ class AIQASpanExporter(SpanExporter):
             api_key: API key for authentication (defaults to AIQA_API_KEY env var)
             flush_interval_seconds: How often to flush spans to the server
             max_batch_size_bytes: Maximum size of a single batch in bytes (default: 5mb)
+            max_buffer_spans: Maximum spans to buffer (prevents unbounded growth)
+            startup_delay_seconds: Delay before starting auto-flush (default: 10s, or AIQA_STARTUP_DELAY_SECONDS env var)
         """
         self._server_url = server_url
         self._api_key = api_key
         self.flush_interval_ms = flush_interval_seconds * 1000
         self.max_batch_size_bytes = max_batch_size_bytes
         self.max_buffer_spans = max_buffer_spans
+        # Get startup delay from parameter or environment variable (default: 10s)
+        if startup_delay_seconds is None:
+            env_delay = os.getenv("AIQA_STARTUP_DELAY_SECONDS")
+            if env_delay:
+                try:
+                    startup_delay_seconds = float(env_delay)
+                except ValueError:
+                    logger.warning(f"Invalid AIQA_STARTUP_DELAY_SECONDS value '{env_delay}', using default 10.0")
+                    startup_delay_seconds = 10.0
+            else:
+                startup_delay_seconds = 10.0
+        self.startup_delay_seconds = startup_delay_seconds
         self.buffer: List[Dict[str, Any]] = []
         self.buffer_span_keys: set = set()  # Track (traceId, spanId) tuples to prevent duplicates (Python 3.8 compatible)
         self.buffer_lock = threading.Lock()
         self.flush_lock = threading.Lock()
+        # shutdown_requested is only set once (in shutdown()) and read many times
+        # No lock needed: worst case is reading stale False, which is acceptable
         self.shutdown_requested = False
         self.flush_timer: Optional[threading.Thread] = None
+        self._auto_flush_started = False
+        self._auto_flush_lock = threading.Lock()  # Lock for lazy thread creation
         logger.info(
             f"Initializing AIQASpanExporter: server_url={self.server_url or 'not set'}, "
-            f"flush_interval={flush_interval_seconds}s"
+            f"flush_interval={flush_interval_seconds}s, startup_delay={startup_delay_seconds}s"
         )
-        self._start_auto_flush()
+        # Don't start thread immediately - start lazily on first export to avoid startup issues
     @property
     def server_url(self) -> str:
@@ -89,6 +110,11 @@ class AIQASpanExporter(SpanExporter):
             pass
         logger.debug(f"AIQA export() called with {len(spans)} spans")
+        # Lazy initialization: start auto-flush thread on first export
+        # This avoids thread creation during initialization, which can cause issues in ECS deployments
+        self._ensure_auto_flush_started()
         # Serialize and add to buffer, deduplicating by (traceId, spanId)
         with self.buffer_lock:
             serialized_spans = []
@@ -198,7 +224,7 @@ class AIQASpanExporter(SpanExporter):
         """
         return {
             "name": AIQA_TRACER_NAME,
-            "version": __version__,
+            "version": VERSION,
         }
     def _time_to_tuple(self, nanoseconds: int) -> tuple:
@@ -325,6 +351,8 @@ class AIQASpanExporter(SpanExporter):
         """
         Flush buffered spans to the server. Thread-safe: ensures only one flush operation runs at a time.
         Atomically extracts spans to prevent race conditions with concurrent export() calls.
+        Lock ordering: flush_lock -> buffer_lock (must be consistent to avoid deadlocks)
         """
         logger.debug("flush() called - attempting to acquire flush lock")
         with self.flush_lock:
@@ -347,49 +375,88 @@ class AIQASpanExporter(SpanExporter):
                 self._remove_span_keys_from_tracking(spans_to_flush)
                 return
-            logger.info(f"flush() sending {len(spans_to_flush)} span(s) to server")
-            try:
-                await self._send_spans(spans_to_flush)
-                logger.info(f"flush() successfully sent {len(spans_to_flush)} span(s) to server")
-                # Spans already removed from buffer during extraction
-                # Now clear their keys from tracking set to free memory
-                self._remove_span_keys_from_tracking(spans_to_flush)
-            except RuntimeError as error:
-                if self._is_interpreter_shutdown_error(error):
-                    if self.shutdown_requested:
-                        logger.debug(f"flush() skipped due to interpreter shutdown: {error}")
-                        # Put spans back for retry with sync send during shutdown
-                        self._prepend_spans_to_buffer(spans_to_flush)
-                    else:
-                        logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
-                        # Put spans back for retry
-                        self._prepend_spans_to_buffer(spans_to_flush)
-                    raise
-                logger.error(f"Error flushing spans to server: {error}")
-                # Put spans back for retry
+        # Release flush_lock before I/O to avoid blocking other flush attempts
+        # Spans are already extracted, so concurrent exports won't interfere
+        logger.info(f"flush() sending {len(spans_to_flush)} span(s) to server")
+        try:
+            await self._send_spans(spans_to_flush)
+            logger.info(f"flush() successfully sent {len(spans_to_flush)} span(s) to server")
+            # Spans already removed from buffer during extraction
+            # Now clear their keys from tracking set to free memory
+            self._remove_span_keys_from_tracking(spans_to_flush)
+        except RuntimeError as error:
+            if self._is_interpreter_shutdown_error(error):
+                if self.shutdown_requested:
+                    logger.debug(f"flush() skipped due to interpreter shutdown: {error}")
+                else:
+                    logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
+                # Put spans back for retry with sync send during shutdown
                 self._prepend_spans_to_buffer(spans_to_flush)
                 raise
-            except Exception as error:
-                logger.error(f"Error flushing spans to server: {error}")
-                # Put spans back for retry
-                self._prepend_spans_to_buffer(spans_to_flush)
-                if self.shutdown_requested:
-                    raise
+            logger.error(f"Error flushing spans to server: {error}")
+            # Put spans back for retry
+            self._prepend_spans_to_buffer(spans_to_flush)
+            raise
+        except Exception as error:
+            logger.error(f"Error flushing spans to server: {error}")
+            # Put spans back for retry
+            self._prepend_spans_to_buffer(spans_to_flush)
+            if self.shutdown_requested:
+                raise
-    def _start_auto_flush(self) -> None:
-        """Start the auto-flush timer."""
-        if self.shutdown_requested:
-            logger.warning("_start_auto_flush() called but shutdown already requested")
+    def _ensure_auto_flush_started(self) -> None:
+        """Ensure auto-flush thread is started (lazy initialization). Thread-safe."""
+        # Fast path: check without lock first
+        if self._auto_flush_started or self.shutdown_requested:
             return
-        logger.info(f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s")
-        def flush_worker():
-            import asyncio
-            logger.debug("Auto-flush worker thread started")
+        # Slow path: acquire lock and double-check
+        with self._auto_flush_lock:
+            if self._auto_flush_started or self.shutdown_requested:
+                return
+            try:
+                self._start_auto_flush()
+                self._auto_flush_started = True
+            except Exception as e:
+                logger.error(f"Failed to start auto-flush thread: {e}", exc_info=True)
+                # Don't raise - allow spans to be buffered even if auto-flush fails
+                # They can still be flushed manually or on shutdown
+    def _flush_worker(self) -> None:
+        """Worker function for auto-flush thread. Runs in a separate thread with its own event loop."""
+        import asyncio
+        logger.debug("Auto-flush worker thread started")
+        # Wait for startup delay before beginning flush operations
+        # This gives the container/application time to stabilize, which helps avoid startup issues (seen with AWS ECS, Dec 2025).
+        if self.startup_delay_seconds > 0:
+            logger.info(f"Auto-flush waiting {self.startup_delay_seconds}s before first flush (startup delay)")
+            # Sleep in small increments to allow for early shutdown
+            sleep_interval = 0.5
+            remaining_delay = self.startup_delay_seconds
+            while remaining_delay > 0 and not self.shutdown_requested:
+                sleep_time = min(sleep_interval, remaining_delay)
+                time.sleep(sleep_time)
+                remaining_delay -= sleep_time
+            if self.shutdown_requested:
+                logger.debug("Auto-flush startup delay interrupted by shutdown")
+                return
+            logger.info("Auto-flush startup delay complete, beginning flush operations")
+        # Create event loop in this thread (isolated from main thread's event loop)
+        # This prevents interference with the main application's event loop
+        try:
             loop = asyncio.new_event_loop()
             asyncio.set_event_loop(loop)
+        except Exception as e:
+            logger.error(f"Failed to create event loop for auto-flush thread: {e}", exc_info=True)
+            return
+        # Ensure event loop is always closed, even if an exception occurs
+        try:
             cycle_count = 0
             while not self.shutdown_requested:
                 cycle_count += 1
@@ -397,27 +464,39 @@ class AIQASpanExporter(SpanExporter):
                 try:
                     loop.run_until_complete(self.flush())
                     logger.debug(f"Auto-flush cycle #{cycle_count} completed, sleeping {self.flush_interval_ms / 1000.0}s")
-                    time.sleep(self.flush_interval_ms / 1000.0)
                 except Exception as e:
                     logger.error(f"Error in auto-flush cycle #{cycle_count}: {e}")
                     logger.debug(f"Auto-flush cycle #{cycle_count} error handled, sleeping {self.flush_interval_ms / 1000.0}s")
+                # Sleep after each cycle (including errors) to avoid tight loops
+                if not self.shutdown_requested:
                     time.sleep(self.flush_interval_ms / 1000.0)
             logger.info(f"Auto-flush worker thread stopping (shutdown requested). Completed {cycle_count} cycles.")
             # Don't do final flush here - shutdown() will handle it with synchronous send
             # This avoids event loop shutdown issues
             logger.debug("Auto-flush thread skipping final flush (will be handled by shutdown() with sync send)")
-            # Close the event loop
+        finally:
+            # Always close the event loop, even if an exception occurs
             try:
                 if not loop.is_closed():
                     loop.close()
                 logger.debug("Auto-flush worker thread event loop closed")
             except Exception:
                 pass  # Ignore errors during cleanup
+    def _start_auto_flush(self) -> None:
+        """Start the auto-flush timer with startup delay."""
+        if self.shutdown_requested:
+            logger.warning("_start_auto_flush() called but shutdown already requested")
+            return
+        logger.info(
+            f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s, "
+            f"startup delay {self.startup_delay_seconds}s"
+        )
-        flush_thread = threading.Thread(target=flush_worker, daemon=True, name="AIQA-AutoFlush")
+        flush_thread = threading.Thread(target=self._flush_worker, daemon=True, name="AIQA-AutoFlush")
         flush_thread.start()
         self.flush_timer = flush_thread
         logger.info(f"Auto-flush thread started: {flush_thread.name} (daemon={flush_thread.daemon})")
@@ -439,8 +518,10 @@ class AIQASpanExporter(SpanExporter):
         else:
             logger.debug("_send_spans() no API key provided")
+        # Use timeout to prevent hanging on unreachable servers
+        timeout = aiohttp.ClientTimeout(total=30.0, connect=10.0)
         errors = []
-        async with aiohttp.ClientSession() as session:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
             for batch_idx, batch in enumerate(batches):
                 try:
                     logger.debug(f"_send_spans() sending batch {batch_idx + 1}/{len(batches)} with {len(batch)} spans to {url}")
@@ -458,6 +539,12 @@ class AIQASpanExporter(SpanExporter):
                             # Continue with other batches even if one fails
                             continue
                         logger.debug(f"_send_spans() batch {batch_idx + 1} successfully sent {len(batch)} spans")
+                except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                    # Network errors and timeouts - log but don't fail completely
+                    error_msg = f"Network error in batch {batch_idx + 1}: {type(e).__name__}: {e}"
+                    logger.warning(f"_send_spans() {error_msg} - will retry on next flush")
+                    errors.append((batch_idx + 1, error_msg))
+                    # Continue with other batches
                 except RuntimeError as e:
                     if self._is_interpreter_shutdown_error(e):
                         if self.shutdown_requested:
@@ -476,6 +563,7 @@ class AIQASpanExporter(SpanExporter):
                     # Continue with other batches
         # If any batches failed, raise an exception with details
+        # Spans will be restored to buffer for retry on next flush
         if errors:
             error_summary = "; ".join([f"batch {idx}: {msg}" for idx, msg in errors])
             raise Exception(f"Failed to send some spans: {error_summary}")
@@ -537,7 +625,8 @@ class AIQASpanExporter(SpanExporter):
             logger.info(f"shutdown() buffer contains {buffer_size} span(s) before shutdown")
         # Wait for flush thread to finish (it will do final flush)
-        if self.flush_timer and self.flush_timer.is_alive():
+        # Only wait if thread was actually started
+        if self._auto_flush_started and self.flush_timer and self.flush_timer.is_alive():
             logger.info("shutdown() waiting for auto-flush thread to complete (timeout=10s)")
             self.flush_timer.join(timeout=10.0)
             if self.flush_timer.is_alive():

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/aiqa/client.py RENAMED Viewed

@@ -7,9 +7,6 @@ from opentelemetry import trace
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import BatchSpanProcessor
-if TYPE_CHECKING:
-    from .aiqa_exporter import AIQASpanExporter
 logger = logging.getLogger("AIQA")
 # Compatibility import for TraceIdRatioBased sampler
@@ -46,7 +43,7 @@ class AIQAClient:
         if cls._instance is None:
             cls._instance = super().__new__(cls)
             cls._instance._provider: Optional[TracerProvider] = None
-            cls._instance._exporter: Optional[AIQASpanExporter] = None
+            cls._instance._exporter = None # reduce circular import issues by not importing for typecheck here
             cls._instance._enabled: bool = True
             cls._instance._initialized: bool = False
         return cls._instance
@@ -62,12 +59,12 @@ class AIQAClient:
         self._provider = value
     @property
-    def exporter(self) -> Optional[AIQASpanExporter]:
+    def exporter(self) -> Optional[Any]:
         """Get the span exporter."""
         return self._exporter
     @exporter.setter
-    def exporter(self, value: Optional[AIQASpanExporter]) -> None:
+    def exporter(self, value: Optional[Any]) -> None:
         """Set the span exporter."""
         self._exporter = value
@@ -132,9 +129,14 @@ def get_aiqa_client() -> AIQAClient:
     """
     Initialize and return the AIQA client singleton.
-    This function must be called before using any AIQA tracing functionality to ensure
-    that environment variables (such as AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG)
-    are properly loaded and the tracing system is initialized.
+    This function is called automatically when WithTracing is first used, so you typically
+    don't need to call it explicitly. However, you can call it manually if you want to:
+    - Check if tracing is enabled (client.enabled)
+    - Initialize before the first @WithTracing usage
+    - Access the client object for advanced usage
+    The function loads environment variables (AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG)
+    and initializes the tracing system.
     The client object manages the tracing system state. Tracing is done by the WithTracing
     decorator. Experiments are run by the ExperimentRunner class.
@@ -145,12 +147,14 @@ def get_aiqa_client() -> AIQAClient:
     Example:
         from aiqa import get_aiqa_client, WithTracing
-        # Initialize client (loads env vars)
+        # Optional: Initialize explicitly (usually not needed)
         client = get_aiqa_client()
+        if client.enabled:
+            print("Tracing is enabled")
         @WithTracing
         def my_function():
-            pass
+            pass  # Initialization happens automatically here if not done above
     """
     global client
     try:
@@ -252,10 +256,10 @@ def get_aiqa_tracer() -> trace.Tracer:
     """
     try:
         # Import here to avoid circular import
-        from . import __version__
+        from . import VERSION
         # Compatibility: version parameter may not be supported in older OpenTelemetry versions
         # Try with version parameter (newer OpenTelemetry versions)
-        return trace.get_tracer(AIQA_TRACER_NAME, version=__version__)
+        return trace.get_tracer(AIQA_TRACER_NAME, version=VERSION)
     except Exception as e:
         # Log issue but still return a tracer
         logger.info(f"Issue getting AIQA tracer with version: {e}, using fallback")

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/aiqa/constants.py RENAMED Viewed

@@ -3,3 +3,4 @@ Constants used across the AIQA client package.
 """
 AIQA_TRACER_NAME = "aiqa-tracer"
+VERSION = "0.4.1" # automatically updated by set-version-json.sh

aiqa_client-0.4.1/aiqa/test_startup_reliability.py ADDED Viewed

@@ -0,0 +1,249 @@
+"""
+Test startup reliability - simulates ECS deployment scenarios where rapid initialization
+and network issues could cause deployment failures.
+These tests verify that:
+1. Exporter initialization doesn't block or create threads immediately
+2. Thread creation is lazy (only on first export)
+3. Network failures during startup don't cause hangs
+4. Multiple rapid initializations don't cause issues
+"""
+import os
+import time
+import threading
+import pytest
+from unittest.mock import patch, MagicMock
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+from aiqa.client import get_aiqa_client, AIQAClient
+from aiqa.aiqa_exporter import AIQASpanExporter
+class TestStartupReliability:
+    """Tests for startup reliability in ECS-like scenarios."""
+    def test_exporter_initialization_does_not_create_thread_immediately(self):
+        """Verify that creating an exporter doesn't immediately start a thread."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://localhost:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            exporter = AIQASpanExporter(startup_delay_seconds=0.1)
+            # Thread should not be created immediately
+            assert exporter.flush_timer is None
+            assert not exporter._auto_flush_started
+            # Cleanup
+            exporter.shutdown()
+    def test_thread_created_lazily_on_first_export(self):
+        """Verify thread is only created when first span is exported."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://localhost:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            exporter = AIQASpanExporter(startup_delay_seconds=0.1)
+            # Thread should not exist yet
+            assert exporter.flush_timer is None
+            # Create a mock span and export it
+            from opentelemetry.sdk.trace import ReadableSpan
+            from opentelemetry.trace import SpanContext, TraceFlags
+            mock_span = MagicMock(spec=ReadableSpan)
+            mock_span.get_span_context.return_value = SpanContext(
+                trace_id=1, span_id=1, is_remote=False, trace_flags=TraceFlags(0x01)
+            )
+            mock_span.name = "test_span"
+            mock_span.kind = 1
+            mock_span.start_time = 1000000000
+            mock_span.end_time = 2000000000
+            mock_span.status.status_code = 1
+            mock_span.attributes = {}
+            mock_span.links = []
+            mock_span.events = []
+            mock_span.resource.attributes = {}
+            mock_span.parent = None
+            # Export should trigger thread creation
+            result = exporter.export([mock_span])
+            # Give thread a moment to start
+            time.sleep(0.2)
+            # Now thread should exist
+            assert exporter._auto_flush_started
+            assert exporter.flush_timer is not None
+            assert exporter.flush_timer.is_alive()
+            # Cleanup
+            exporter.shutdown()
+            if exporter.flush_timer:
+                exporter.flush_timer.join(timeout=2.0)
+    def test_rapid_multiple_initializations(self):
+        """Test that multiple rapid initializations don't cause issues (simulates health checks)."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://localhost:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            # Simulate rapid health check calls
+            clients = []
+            for _ in range(10):
+                client = get_aiqa_client()
+                clients.append(client)
+                time.sleep(0.01)  # Very short delay
+            # All should be the same singleton
+            assert all(c is clients[0] for c in clients)
+            # Should not have created multiple threads
+            if clients[0].exporter:
+                assert clients[0].exporter._auto_flush_started or clients[0].exporter.flush_timer is None
+    def test_initialization_with_unreachable_server(self):
+        """Test that initialization doesn't hang when server is unreachable."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://unreachable-server:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            # Should not block or raise
+            client = get_aiqa_client()
+            assert client is not None
+            assert client._initialized
+            # Exporter should exist but thread shouldn't be started yet
+            if client.exporter:
+                # Thread creation is lazy, so it might not exist
+                assert client.exporter.flush_timer is None or not client.exporter._auto_flush_started
+    def test_startup_delay_respected(self):
+        """Verify that startup delay prevents immediate flush attempts."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://localhost:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            exporter = AIQASpanExporter(startup_delay_seconds=0.5)
+            # Create and export a span to trigger thread creation
+            from opentelemetry.sdk.trace import ReadableSpan
+            from opentelemetry.trace import SpanContext, TraceFlags
+            mock_span = MagicMock(spec=ReadableSpan)
+            mock_span.get_span_context.return_value = SpanContext(
+                trace_id=1, span_id=1, is_remote=False, trace_flags=TraceFlags(0x01)
+            )
+            mock_span.name = "test_span"
+            mock_span.kind = 1
+            mock_span.start_time = 1000000000
+            mock_span.end_time = 2000000000
+            mock_span.status.status_code = 1
+            mock_span.attributes = {}
+            mock_span.links = []
+            mock_span.events = []
+            mock_span.resource.attributes = {}
+            mock_span.parent = None
+            exporter.export([mock_span])
+            # Thread should be created
+            time.sleep(0.1)
+            assert exporter._auto_flush_started
+            # But flush should not have happened yet (within delay period)
+            # We can't easily test this without mocking time, but we verify thread exists
+            assert exporter.flush_timer is not None
+            # Cleanup
+            exporter.shutdown()
+            if exporter.flush_timer:
+                exporter.flush_timer.join(timeout=2.0)
+    def test_concurrent_initialization(self):
+        """Test concurrent initialization from multiple threads (simulates ECS health checks)."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://localhost:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            clients = []
+            errors = []
+            def init_client():
+                try:
+                    client = get_aiqa_client()
+                    clients.append(client)
+                except Exception as e:
+                    errors.append(e)
+            # Start multiple threads initializing simultaneously
+            threads = [threading.Thread(target=init_client) for _ in range(5)]
+            for t in threads:
+                t.start()
+            for t in threads:
+                t.join(timeout=5.0)
+            # Should have no errors
+            assert len(errors) == 0
+            # All should be the same singleton
+            assert len(set(id(c) for c in clients)) == 1
+    def test_shutdown_before_thread_starts(self):
+        """Test that shutdown works even if thread was never started."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://localhost:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            exporter = AIQASpanExporter(startup_delay_seconds=1.0)
+            # Thread should not exist
+            assert exporter.flush_timer is None
+            # Shutdown should work without errors
+            exporter.shutdown()
+            # Should still be able to call shutdown again
+            exporter.shutdown()
+    def test_initialization_timeout(self):
+        """Test that initialization completes quickly even with network issues."""
+        with patch.dict(
+            os.environ,
+            {
+                "AIQA_SERVER_URL": "http://localhost:3000",
+                "AIQA_API_KEY": "test-api-key",
+            },
+        ):
+            start_time = time.time()
+            client = get_aiqa_client()
+            elapsed = time.time() - start_time
+            # Initialization should be fast (< 1 second)
+            assert elapsed < 1.0
+            assert client is not None

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/aiqa/tracing.py RENAMED Viewed

@@ -590,7 +590,8 @@ def WithTracing(
         is_generator = inspect.isgeneratorfunction(fn)
         is_async_generator = inspect.isasyncgenfunction(fn) if hasattr(inspect, 'isasyncgenfunction') else False
-        tracer = get_aiqa_tracer()
+        # Don't get tracer here - get it lazily when function is called
+        # This ensures initialization only happens when tracing is actually used
         def _setup_span(span: trace.Span, input_data: Any) -> bool:
             """Setup span with input data. Returns True if span is recording."""
@@ -627,10 +628,13 @@ def WithTracing(
         def _execute_with_span_sync(executor: Callable[[], Any], input_data: Any) -> Any:
             """Execute sync function within span context, handling input/output and exceptions."""
             # Ensure tracer provider is initialized before creating spans
+            # This is called lazily when the function runs, not at decorator definition time
             client = get_aiqa_client()
             if not client.enabled:
                 return executor()
+            # Get tracer after initialization (lazy)
+            tracer = get_aiqa_tracer()
             with tracer.start_as_current_span(fn_name) as span:
                 if not _setup_span(span, input_data):
                     return executor()
@@ -646,10 +650,13 @@ def WithTracing(
         async def _execute_with_span_async(executor: Callable[[], Any], input_data: Any) -> Any:
             """Execute async function within span context, handling input/output and exceptions."""
             # Ensure tracer provider is initialized before creating spans
+            # This is called lazily when the function runs, not at decorator definition time
             client = get_aiqa_client()
             if not client.enabled:
                 return await executor()
+            # Get tracer after initialization (lazy)
+            tracer = get_aiqa_tracer()
             with tracer.start_as_current_span(fn_name) as span:
                 if not _setup_span(span, input_data):
                     return await executor()
@@ -668,10 +675,13 @@ def WithTracing(
         def _execute_generator_sync(executor: Callable[[], Any], input_data: Any) -> Any:
             """Execute sync generator function, returning a traced generator."""
             # Ensure tracer provider is initialized before creating spans
+            # This is called lazily when the function runs, not at decorator definition time
             client = get_aiqa_client()
             if not client.enabled:
                 return executor()
+            # Get tracer after initialization (lazy)
+            tracer = get_aiqa_tracer()
             # Create span but don't use 'with' - span will be closed by TracedGenerator
             span = tracer.start_span(fn_name)
             token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
@@ -694,10 +704,13 @@ def WithTracing(
         async def _execute_generator_async(executor: Callable[[], Any], input_data: Any) -> Any:
             """Execute async generator function, returning a traced async generator."""
             # Ensure tracer provider is initialized before creating spans
+            # This is called lazily when the function runs, not at decorator definition time
             client = get_aiqa_client()
             if not client.enabled:
                 return await executor()
+            # Get tracer after initialization (lazy)
+            tracer = get_aiqa_tracer()
             # Create span but don't use 'with' - span will be closed by TracedAsyncGenerator
             span = tracer.start_span(fn_name)
             token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
@@ -935,7 +948,8 @@ def set_component_tag(tag: str) -> None:
     This can also be set via the AIQA_COMPONENT_TAG environment variable.
     The component tag allows you to identify which component/system generated the spans.
-    Note: If using environment variables, ensure you call get_aiqa_client() first to initialize
+    Note: Initialization is automatic when WithTracing is first used. You can also call
+    get_aiqa_client() explicitly if you need to initialize
     the client and load environment variables.
     Args:
@@ -1045,6 +1059,8 @@ def create_span_from_trace_id(
         from opentelemetry.trace import set_span_in_context
         parent_context = set_span_in_context(trace.NonRecordingSpan(parent_span_context))
+        # Ensure initialization before creating span
+        get_aiqa_client()
         # Start a new span in this context (it will be a child of the parent span)
         tracer = get_aiqa_tracer()
         span = tracer.start_span(span_name, context=parent_context)
@@ -1057,6 +1073,8 @@ def create_span_from_trace_id(
         return span
     except (ValueError, AttributeError) as e:
         logger.error(f"Error creating span from trace_id: {e}")
+        # Ensure initialization before creating span
+        get_aiqa_client()
         # Fallback: create a new span
         tracer = get_aiqa_tracer()
         span = tracer.start_span(span_name)

{aiqa_client-0.3.7 → aiqa_client-0.4.1/aiqa_client.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aiqa-client
-Version: 0.3.7
+Version: 0.4.1
 Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
 Author-email: AIQA <info@aiqa.dev>
 License: MIT
@@ -56,6 +56,20 @@ pip install -r requirements.txt
 pip install -e .
 ```
+### Development Setup
+For development, install with dev dependencies to run tests:
+```bash
+pip install -e ".[dev]"
+```
+Then run the unit tests:
+```bash
+pytest
+```
 See [TESTING.md](TESTING.md) for detailed testing instructions.
 ## Setup

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/aiqa_client.egg-info/SOURCES.txt RENAMED Viewed

@@ -11,6 +11,7 @@ aiqa/experiment_runner.py
 aiqa/object_serialiser.py
 aiqa/py.typed
 aiqa/test_experiment_runner.py
+aiqa/test_startup_reliability.py
 aiqa/test_tracing.py
 aiqa/tracing.py
 aiqa_client.egg-info/PKG-INFO

{aiqa_client-0.3.7 → aiqa_client-0.4.1}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "aiqa-client"
-version = "0.3.7"
+version = "0.4.1"
 description = "OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server"
 readme = "README.md"
 requires-python = ">=3.8"