aiqa-client 0.4.0__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aiqa_client-0.4.0/aiqa_client.egg-info → aiqa_client-0.4.3}/PKG-INFO +20 -6
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/README.md +14 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/__init__.py +13 -6
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/aiqa_exporter.py +105 -64
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/client.py +13 -6
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/constants.py +1 -1
- aiqa_client-0.4.3/aiqa/test_startup_reliability.py +249 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/tracing.py +20 -2
- {aiqa_client-0.4.0 → aiqa_client-0.4.3/aiqa_client.egg-info}/PKG-INFO +20 -6
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa_client.egg-info/SOURCES.txt +2 -2
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/pyproject.toml +5 -5
- aiqa_client-0.4.0/setup.py +0 -9
- /aiqa_client-0.4.0/LICENSE → /aiqa_client-0.4.3/LICENSE.txt +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/MANIFEST.in +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/experiment_runner.py +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/object_serialiser.py +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/py.typed +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/test_experiment_runner.py +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa/test_tracing.py +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa_client.egg-info/dependency_links.txt +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa_client.egg-info/requires.txt +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/aiqa_client.egg-info/top_level.txt +0 -0
- {aiqa_client-0.4.0 → aiqa_client-0.4.3}/setup.cfg +0 -0
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aiqa-client
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
|
|
5
5
|
Author-email: AIQA <info@aiqa.dev>
|
|
6
6
|
License: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/
|
|
8
|
-
Project-URL: Documentation, https://github.com/
|
|
9
|
-
Project-URL: Repository, https://github.com/
|
|
10
|
-
Project-URL: Issues, https://github.com/
|
|
7
|
+
Project-URL: Homepage, https://github.com/winterwell/aiqa-client-python
|
|
8
|
+
Project-URL: Documentation, https://github.com/winterwell/aiqa-client-python
|
|
9
|
+
Project-URL: Repository, https://github.com/winterwell/aiqa-client-python
|
|
10
|
+
Project-URL: Issues, https://github.com/winterwell/aiqa-client-python/issues
|
|
11
11
|
Keywords: opentelemetry,tracing,observability,aiqa,monitoring
|
|
12
12
|
Classifier: Development Status :: 4 - Beta
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
@@ -22,7 +22,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
22
22
|
Classifier: Topic :: System :: Monitoring
|
|
23
23
|
Requires-Python: >=3.8
|
|
24
24
|
Description-Content-Type: text/markdown
|
|
25
|
-
License-File: LICENSE
|
|
25
|
+
License-File: LICENSE.txt
|
|
26
26
|
Requires-Dist: opentelemetry-api>=1.24.0
|
|
27
27
|
Requires-Dist: opentelemetry-sdk>=1.24.0
|
|
28
28
|
Requires-Dist: opentelemetry-semantic-conventions>=0.40b0
|
|
@@ -56,6 +56,20 @@ pip install -r requirements.txt
|
|
|
56
56
|
pip install -e .
|
|
57
57
|
```
|
|
58
58
|
|
|
59
|
+
### Development Setup
|
|
60
|
+
|
|
61
|
+
For development, install with dev dependencies to run tests:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install -e ".[dev]"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Then run the unit tests:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pytest
|
|
71
|
+
```
|
|
72
|
+
|
|
59
73
|
See [TESTING.md](TESTING.md) for detailed testing instructions.
|
|
60
74
|
|
|
61
75
|
## Setup
|
|
@@ -19,6 +19,20 @@ pip install -r requirements.txt
|
|
|
19
19
|
pip install -e .
|
|
20
20
|
```
|
|
21
21
|
|
|
22
|
+
### Development Setup
|
|
23
|
+
|
|
24
|
+
For development, install with dev dependencies to run tests:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install -e ".[dev]"
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Then run the unit tests:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pytest
|
|
34
|
+
```
|
|
35
|
+
|
|
22
36
|
See [TESTING.md](TESTING.md) for detailed testing instructions.
|
|
23
37
|
|
|
24
38
|
## Setup
|
|
@@ -1,22 +1,29 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Python client for AIQA server - OpenTelemetry tracing decorators.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
Initialization is automatic - you don't need to call get_aiqa_client() explicitly.
|
|
5
|
+
The client initializes automatically when WithTracing is first used.
|
|
6
|
+
|
|
7
|
+
Set environment variables:
|
|
8
|
+
AIQA_SERVER_URL: URL of the AIQA server
|
|
9
|
+
AIQA_API_KEY: API key for authentication
|
|
10
|
+
AIQA_COMPONENT_TAG: Optional component identifier
|
|
11
|
+
AIQA_STARTUP_DELAY_SECONDS: Optional delay before first flush (default: 10s)
|
|
6
12
|
|
|
7
13
|
Example:
|
|
8
14
|
from dotenv import load_dotenv
|
|
9
|
-
from aiqa import
|
|
15
|
+
from aiqa import WithTracing
|
|
10
16
|
|
|
11
17
|
# Load environment variables from .env file (if using one)
|
|
12
18
|
load_dotenv()
|
|
13
19
|
|
|
14
|
-
#
|
|
15
|
-
get_aiqa_client()
|
|
16
|
-
|
|
20
|
+
# No explicit initialization needed - it happens automatically when used
|
|
17
21
|
@WithTracing
|
|
18
22
|
def my_function():
|
|
19
23
|
return "Hello, AIQA!"
|
|
24
|
+
|
|
25
|
+
# Call the function - initialization happens on first use
|
|
26
|
+
result = my_function()
|
|
20
27
|
"""
|
|
21
28
|
|
|
22
29
|
from .tracing import (
|
|
@@ -68,14 +68,18 @@ class AIQASpanExporter(SpanExporter):
|
|
|
68
68
|
self.buffer_span_keys: set = set() # Track (traceId, spanId) tuples to prevent duplicates (Python 3.8 compatible)
|
|
69
69
|
self.buffer_lock = threading.Lock()
|
|
70
70
|
self.flush_lock = threading.Lock()
|
|
71
|
+
# shutdown_requested is only set once (in shutdown()) and read many times
|
|
72
|
+
# No lock needed: worst case is reading stale False, which is acceptable
|
|
71
73
|
self.shutdown_requested = False
|
|
72
74
|
self.flush_timer: Optional[threading.Thread] = None
|
|
75
|
+
self._auto_flush_started = False
|
|
76
|
+
self._auto_flush_lock = threading.Lock() # Lock for lazy thread creation
|
|
73
77
|
|
|
74
78
|
logger.info(
|
|
75
79
|
f"Initializing AIQASpanExporter: server_url={self.server_url or 'not set'}, "
|
|
76
80
|
f"flush_interval={flush_interval_seconds}s, startup_delay={startup_delay_seconds}s"
|
|
77
81
|
)
|
|
78
|
-
|
|
82
|
+
# Don't start thread immediately - start lazily on first export to avoid startup issues
|
|
79
83
|
|
|
80
84
|
@property
|
|
81
85
|
def server_url(self) -> str:
|
|
@@ -106,6 +110,11 @@ class AIQASpanExporter(SpanExporter):
|
|
|
106
110
|
pass
|
|
107
111
|
|
|
108
112
|
logger.debug(f"AIQA export() called with {len(spans)} spans")
|
|
113
|
+
|
|
114
|
+
# Lazy initialization: start auto-flush thread on first export
|
|
115
|
+
# This avoids thread creation during initialization, which can cause issues in ECS deployments
|
|
116
|
+
self._ensure_auto_flush_started()
|
|
117
|
+
|
|
109
118
|
# Serialize and add to buffer, deduplicating by (traceId, spanId)
|
|
110
119
|
with self.buffer_lock:
|
|
111
120
|
serialized_spans = []
|
|
@@ -342,6 +351,8 @@ class AIQASpanExporter(SpanExporter):
|
|
|
342
351
|
"""
|
|
343
352
|
Flush buffered spans to the server. Thread-safe: ensures only one flush operation runs at a time.
|
|
344
353
|
Atomically extracts spans to prevent race conditions with concurrent export() calls.
|
|
354
|
+
|
|
355
|
+
Lock ordering: flush_lock -> buffer_lock (must be consistent to avoid deadlocks)
|
|
345
356
|
"""
|
|
346
357
|
logger.debug("flush() called - attempting to acquire flush lock")
|
|
347
358
|
with self.flush_lock:
|
|
@@ -364,71 +375,88 @@ class AIQASpanExporter(SpanExporter):
|
|
|
364
375
|
self._remove_span_keys_from_tracking(spans_to_flush)
|
|
365
376
|
return
|
|
366
377
|
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
self._prepend_spans_to_buffer(spans_to_flush)
|
|
384
|
-
raise
|
|
385
|
-
logger.error(f"Error flushing spans to server: {error}")
|
|
386
|
-
# Put spans back for retry
|
|
378
|
+
# Release flush_lock before I/O to avoid blocking other flush attempts
|
|
379
|
+
# Spans are already extracted, so concurrent exports won't interfere
|
|
380
|
+
logger.info(f"flush() sending {len(spans_to_flush)} span(s) to server")
|
|
381
|
+
try:
|
|
382
|
+
await self._send_spans(spans_to_flush)
|
|
383
|
+
logger.info(f"flush() successfully sent {len(spans_to_flush)} span(s) to server")
|
|
384
|
+
# Spans already removed from buffer during extraction
|
|
385
|
+
# Now clear their keys from tracking set to free memory
|
|
386
|
+
self._remove_span_keys_from_tracking(spans_to_flush)
|
|
387
|
+
except RuntimeError as error:
|
|
388
|
+
if self._is_interpreter_shutdown_error(error):
|
|
389
|
+
if self.shutdown_requested:
|
|
390
|
+
logger.debug(f"flush() skipped due to interpreter shutdown: {error}")
|
|
391
|
+
else:
|
|
392
|
+
logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
|
|
393
|
+
# Put spans back for retry with sync send during shutdown
|
|
387
394
|
self._prepend_spans_to_buffer(spans_to_flush)
|
|
388
395
|
raise
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
396
|
+
logger.error(f"Error flushing spans to server: {error}")
|
|
397
|
+
# Put spans back for retry
|
|
398
|
+
self._prepend_spans_to_buffer(spans_to_flush)
|
|
399
|
+
raise
|
|
400
|
+
except Exception as error:
|
|
401
|
+
logger.error(f"Error flushing spans to server: {error}")
|
|
402
|
+
# Put spans back for retry
|
|
403
|
+
self._prepend_spans_to_buffer(spans_to_flush)
|
|
404
|
+
if self.shutdown_requested:
|
|
405
|
+
raise
|
|
395
406
|
|
|
396
|
-
def
|
|
397
|
-
"""
|
|
398
|
-
|
|
399
|
-
|
|
407
|
+
def _ensure_auto_flush_started(self) -> None:
|
|
408
|
+
"""Ensure auto-flush thread is started (lazy initialization). Thread-safe."""
|
|
409
|
+
# Fast path: check without lock first
|
|
410
|
+
if self._auto_flush_started or self.shutdown_requested:
|
|
400
411
|
return
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
def flush_worker():
|
|
408
|
-
import asyncio
|
|
409
|
-
logger.debug("Auto-flush worker thread started")
|
|
412
|
+
|
|
413
|
+
# Slow path: acquire lock and double-check
|
|
414
|
+
with self._auto_flush_lock:
|
|
415
|
+
if self._auto_flush_started or self.shutdown_requested:
|
|
416
|
+
return
|
|
410
417
|
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
418
|
+
try:
|
|
419
|
+
self._start_auto_flush()
|
|
420
|
+
self._auto_flush_started = True
|
|
421
|
+
except Exception as e:
|
|
422
|
+
logger.error(f"Failed to start auto-flush thread: {e}", exc_info=True)
|
|
423
|
+
# Don't raise - allow spans to be buffered even if auto-flush fails
|
|
424
|
+
# They can still be flushed manually or on shutdown
|
|
425
|
+
|
|
426
|
+
def _flush_worker(self) -> None:
|
|
427
|
+
"""Worker function for auto-flush thread. Runs in a separate thread with its own event loop."""
|
|
428
|
+
import asyncio
|
|
429
|
+
logger.debug("Auto-flush worker thread started")
|
|
430
|
+
|
|
431
|
+
# Wait for startup delay before beginning flush operations
|
|
432
|
+
# This gives the container/application time to stabilize, which helps avoid startup issues (seen with AWS ECS, Dec 2025).
|
|
433
|
+
if self.startup_delay_seconds > 0:
|
|
434
|
+
logger.info(f"Auto-flush waiting {self.startup_delay_seconds}s before first flush (startup delay)")
|
|
435
|
+
# Sleep in small increments to allow for early shutdown
|
|
436
|
+
sleep_interval = 0.5
|
|
437
|
+
remaining_delay = self.startup_delay_seconds
|
|
438
|
+
while remaining_delay > 0 and not self.shutdown_requested:
|
|
439
|
+
sleep_time = min(sleep_interval, remaining_delay)
|
|
440
|
+
time.sleep(sleep_time)
|
|
441
|
+
remaining_delay -= sleep_time
|
|
428
442
|
|
|
443
|
+
if self.shutdown_requested:
|
|
444
|
+
logger.debug("Auto-flush startup delay interrupted by shutdown")
|
|
445
|
+
return
|
|
446
|
+
|
|
447
|
+
logger.info("Auto-flush startup delay complete, beginning flush operations")
|
|
448
|
+
|
|
449
|
+
# Create event loop in this thread (isolated from main thread's event loop)
|
|
450
|
+
# This prevents interference with the main application's event loop
|
|
451
|
+
try:
|
|
429
452
|
loop = asyncio.new_event_loop()
|
|
430
453
|
asyncio.set_event_loop(loop)
|
|
431
|
-
|
|
454
|
+
except Exception as e:
|
|
455
|
+
logger.error(f"Failed to create event loop for auto-flush thread: {e}", exc_info=True)
|
|
456
|
+
return
|
|
457
|
+
|
|
458
|
+
# Ensure event loop is always closed, even if an exception occurs
|
|
459
|
+
try:
|
|
432
460
|
cycle_count = 0
|
|
433
461
|
while not self.shutdown_requested:
|
|
434
462
|
cycle_count += 1
|
|
@@ -436,27 +464,39 @@ class AIQASpanExporter(SpanExporter):
|
|
|
436
464
|
try:
|
|
437
465
|
loop.run_until_complete(self.flush())
|
|
438
466
|
logger.debug(f"Auto-flush cycle #{cycle_count} completed, sleeping {self.flush_interval_ms / 1000.0}s")
|
|
439
|
-
time.sleep(self.flush_interval_ms / 1000.0)
|
|
440
467
|
except Exception as e:
|
|
441
468
|
logger.error(f"Error in auto-flush cycle #{cycle_count}: {e}")
|
|
442
469
|
logger.debug(f"Auto-flush cycle #{cycle_count} error handled, sleeping {self.flush_interval_ms / 1000.0}s")
|
|
470
|
+
|
|
471
|
+
# Sleep after each cycle (including errors) to avoid tight loops
|
|
472
|
+
if not self.shutdown_requested:
|
|
443
473
|
time.sleep(self.flush_interval_ms / 1000.0)
|
|
444
474
|
|
|
445
475
|
logger.info(f"Auto-flush worker thread stopping (shutdown requested). Completed {cycle_count} cycles.")
|
|
446
|
-
|
|
447
476
|
# Don't do final flush here - shutdown() will handle it with synchronous send
|
|
448
477
|
# This avoids event loop shutdown issues
|
|
449
478
|
logger.debug("Auto-flush thread skipping final flush (will be handled by shutdown() with sync send)")
|
|
450
|
-
|
|
451
|
-
#
|
|
479
|
+
finally:
|
|
480
|
+
# Always close the event loop, even if an exception occurs
|
|
452
481
|
try:
|
|
453
482
|
if not loop.is_closed():
|
|
454
483
|
loop.close()
|
|
455
484
|
logger.debug("Auto-flush worker thread event loop closed")
|
|
456
485
|
except Exception:
|
|
457
486
|
pass # Ignore errors during cleanup
|
|
487
|
+
|
|
488
|
+
def _start_auto_flush(self) -> None:
|
|
489
|
+
"""Start the auto-flush timer with startup delay."""
|
|
490
|
+
if self.shutdown_requested:
|
|
491
|
+
logger.warning("_start_auto_flush() called but shutdown already requested")
|
|
492
|
+
return
|
|
493
|
+
|
|
494
|
+
logger.info(
|
|
495
|
+
f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s, "
|
|
496
|
+
f"startup delay {self.startup_delay_seconds}s"
|
|
497
|
+
)
|
|
458
498
|
|
|
459
|
-
flush_thread = threading.Thread(target=
|
|
499
|
+
flush_thread = threading.Thread(target=self._flush_worker, daemon=True, name="AIQA-AutoFlush")
|
|
460
500
|
flush_thread.start()
|
|
461
501
|
self.flush_timer = flush_thread
|
|
462
502
|
logger.info(f"Auto-flush thread started: {flush_thread.name} (daemon={flush_thread.daemon})")
|
|
@@ -585,7 +625,8 @@ class AIQASpanExporter(SpanExporter):
|
|
|
585
625
|
logger.info(f"shutdown() buffer contains {buffer_size} span(s) before shutdown")
|
|
586
626
|
|
|
587
627
|
# Wait for flush thread to finish (it will do final flush)
|
|
588
|
-
if
|
|
628
|
+
# Only wait if thread was actually started
|
|
629
|
+
if self._auto_flush_started and self.flush_timer and self.flush_timer.is_alive():
|
|
589
630
|
logger.info("shutdown() waiting for auto-flush thread to complete (timeout=10s)")
|
|
590
631
|
self.flush_timer.join(timeout=10.0)
|
|
591
632
|
if self.flush_timer.is_alive():
|
|
@@ -118,7 +118,7 @@ def get_component_tag() -> str:
|
|
|
118
118
|
return _component_tag
|
|
119
119
|
|
|
120
120
|
|
|
121
|
-
def set_component_tag(tag: str
|
|
121
|
+
def set_component_tag(tag: Optional[str]) -> None:
|
|
122
122
|
"""Set the component tag programmatically (overrides environment variable)."""
|
|
123
123
|
global _component_tag
|
|
124
124
|
_component_tag = tag or ""
|
|
@@ -129,9 +129,14 @@ def get_aiqa_client() -> AIQAClient:
|
|
|
129
129
|
"""
|
|
130
130
|
Initialize and return the AIQA client singleton.
|
|
131
131
|
|
|
132
|
-
This function
|
|
133
|
-
|
|
134
|
-
|
|
132
|
+
This function is called automatically when WithTracing is first used, so you typically
|
|
133
|
+
don't need to call it explicitly. However, you can call it manually if you want to:
|
|
134
|
+
- Check if tracing is enabled (client.enabled)
|
|
135
|
+
- Initialize before the first @WithTracing usage
|
|
136
|
+
- Access the client object for advanced usage
|
|
137
|
+
|
|
138
|
+
The function loads environment variables (AIQA_SERVER_URL, AIQA_API_KEY, AIQA_COMPONENT_TAG)
|
|
139
|
+
and initializes the tracing system.
|
|
135
140
|
|
|
136
141
|
The client object manages the tracing system state. Tracing is done by the WithTracing
|
|
137
142
|
decorator. Experiments are run by the ExperimentRunner class.
|
|
@@ -142,12 +147,14 @@ def get_aiqa_client() -> AIQAClient:
|
|
|
142
147
|
Example:
|
|
143
148
|
from aiqa import get_aiqa_client, WithTracing
|
|
144
149
|
|
|
145
|
-
# Initialize
|
|
150
|
+
# Optional: Initialize explicitly (usually not needed)
|
|
146
151
|
client = get_aiqa_client()
|
|
152
|
+
if client.enabled:
|
|
153
|
+
print("Tracing is enabled")
|
|
147
154
|
|
|
148
155
|
@WithTracing
|
|
149
156
|
def my_function():
|
|
150
|
-
pass
|
|
157
|
+
pass # Initialization happens automatically here if not done above
|
|
151
158
|
"""
|
|
152
159
|
global client
|
|
153
160
|
try:
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Test startup reliability - simulates ECS deployment scenarios where rapid initialization
|
|
3
|
+
and network issues could cause deployment failures.
|
|
4
|
+
|
|
5
|
+
These tests verify that:
|
|
6
|
+
1. Exporter initialization doesn't block or create threads immediately
|
|
7
|
+
2. Thread creation is lazy (only on first export)
|
|
8
|
+
3. Network failures during startup don't cause hangs
|
|
9
|
+
4. Multiple rapid initializations don't cause issues
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import time
|
|
14
|
+
import threading
|
|
15
|
+
import pytest
|
|
16
|
+
from unittest.mock import patch, MagicMock
|
|
17
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
18
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
19
|
+
|
|
20
|
+
from aiqa.client import get_aiqa_client, AIQAClient
|
|
21
|
+
from aiqa.aiqa_exporter import AIQASpanExporter
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class TestStartupReliability:
|
|
25
|
+
"""Tests for startup reliability in ECS-like scenarios."""
|
|
26
|
+
|
|
27
|
+
def test_exporter_initialization_does_not_create_thread_immediately(self):
|
|
28
|
+
"""Verify that creating an exporter doesn't immediately start a thread."""
|
|
29
|
+
with patch.dict(
|
|
30
|
+
os.environ,
|
|
31
|
+
{
|
|
32
|
+
"AIQA_SERVER_URL": "http://localhost:3000",
|
|
33
|
+
"AIQA_API_KEY": "test-api-key",
|
|
34
|
+
},
|
|
35
|
+
):
|
|
36
|
+
exporter = AIQASpanExporter(startup_delay_seconds=0.1)
|
|
37
|
+
|
|
38
|
+
# Thread should not be created immediately
|
|
39
|
+
assert exporter.flush_timer is None
|
|
40
|
+
assert not exporter._auto_flush_started
|
|
41
|
+
|
|
42
|
+
# Cleanup
|
|
43
|
+
exporter.shutdown()
|
|
44
|
+
|
|
45
|
+
def test_thread_created_lazily_on_first_export(self):
|
|
46
|
+
"""Verify thread is only created when first span is exported."""
|
|
47
|
+
with patch.dict(
|
|
48
|
+
os.environ,
|
|
49
|
+
{
|
|
50
|
+
"AIQA_SERVER_URL": "http://localhost:3000",
|
|
51
|
+
"AIQA_API_KEY": "test-api-key",
|
|
52
|
+
},
|
|
53
|
+
):
|
|
54
|
+
exporter = AIQASpanExporter(startup_delay_seconds=0.1)
|
|
55
|
+
|
|
56
|
+
# Thread should not exist yet
|
|
57
|
+
assert exporter.flush_timer is None
|
|
58
|
+
|
|
59
|
+
# Create a mock span and export it
|
|
60
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
61
|
+
from opentelemetry.trace import SpanContext, TraceFlags
|
|
62
|
+
|
|
63
|
+
mock_span = MagicMock(spec=ReadableSpan)
|
|
64
|
+
mock_span.get_span_context.return_value = SpanContext(
|
|
65
|
+
trace_id=1, span_id=1, is_remote=False, trace_flags=TraceFlags(0x01)
|
|
66
|
+
)
|
|
67
|
+
mock_span.name = "test_span"
|
|
68
|
+
mock_span.kind = 1
|
|
69
|
+
mock_span.start_time = 1000000000
|
|
70
|
+
mock_span.end_time = 2000000000
|
|
71
|
+
mock_span.status.status_code = 1
|
|
72
|
+
mock_span.attributes = {}
|
|
73
|
+
mock_span.links = []
|
|
74
|
+
mock_span.events = []
|
|
75
|
+
mock_span.resource.attributes = {}
|
|
76
|
+
mock_span.parent = None
|
|
77
|
+
|
|
78
|
+
# Export should trigger thread creation
|
|
79
|
+
result = exporter.export([mock_span])
|
|
80
|
+
|
|
81
|
+
# Give thread a moment to start
|
|
82
|
+
time.sleep(0.2)
|
|
83
|
+
|
|
84
|
+
# Now thread should exist
|
|
85
|
+
assert exporter._auto_flush_started
|
|
86
|
+
assert exporter.flush_timer is not None
|
|
87
|
+
assert exporter.flush_timer.is_alive()
|
|
88
|
+
|
|
89
|
+
# Cleanup
|
|
90
|
+
exporter.shutdown()
|
|
91
|
+
if exporter.flush_timer:
|
|
92
|
+
exporter.flush_timer.join(timeout=2.0)
|
|
93
|
+
|
|
94
|
+
def test_rapid_multiple_initializations(self):
|
|
95
|
+
"""Test that multiple rapid initializations don't cause issues (simulates health checks)."""
|
|
96
|
+
with patch.dict(
|
|
97
|
+
os.environ,
|
|
98
|
+
{
|
|
99
|
+
"AIQA_SERVER_URL": "http://localhost:3000",
|
|
100
|
+
"AIQA_API_KEY": "test-api-key",
|
|
101
|
+
},
|
|
102
|
+
):
|
|
103
|
+
# Simulate rapid health check calls
|
|
104
|
+
clients = []
|
|
105
|
+
for _ in range(10):
|
|
106
|
+
client = get_aiqa_client()
|
|
107
|
+
clients.append(client)
|
|
108
|
+
time.sleep(0.01) # Very short delay
|
|
109
|
+
|
|
110
|
+
# All should be the same singleton
|
|
111
|
+
assert all(c is clients[0] for c in clients)
|
|
112
|
+
|
|
113
|
+
# Should not have created multiple threads
|
|
114
|
+
if clients[0].exporter:
|
|
115
|
+
assert clients[0].exporter._auto_flush_started or clients[0].exporter.flush_timer is None
|
|
116
|
+
|
|
117
|
+
def test_initialization_with_unreachable_server(self):
|
|
118
|
+
"""Test that initialization doesn't hang when server is unreachable."""
|
|
119
|
+
with patch.dict(
|
|
120
|
+
os.environ,
|
|
121
|
+
{
|
|
122
|
+
"AIQA_SERVER_URL": "http://unreachable-server:3000",
|
|
123
|
+
"AIQA_API_KEY": "test-api-key",
|
|
124
|
+
},
|
|
125
|
+
):
|
|
126
|
+
# Should not block or raise
|
|
127
|
+
client = get_aiqa_client()
|
|
128
|
+
assert client is not None
|
|
129
|
+
assert client._initialized
|
|
130
|
+
|
|
131
|
+
# Exporter should exist but thread shouldn't be started yet
|
|
132
|
+
if client.exporter:
|
|
133
|
+
# Thread creation is lazy, so it might not exist
|
|
134
|
+
assert client.exporter.flush_timer is None or not client.exporter._auto_flush_started
|
|
135
|
+
|
|
136
|
+
def test_startup_delay_respected(self):
|
|
137
|
+
"""Verify that startup delay prevents immediate flush attempts."""
|
|
138
|
+
with patch.dict(
|
|
139
|
+
os.environ,
|
|
140
|
+
{
|
|
141
|
+
"AIQA_SERVER_URL": "http://localhost:3000",
|
|
142
|
+
"AIQA_API_KEY": "test-api-key",
|
|
143
|
+
},
|
|
144
|
+
):
|
|
145
|
+
exporter = AIQASpanExporter(startup_delay_seconds=0.5)
|
|
146
|
+
|
|
147
|
+
# Create and export a span to trigger thread creation
|
|
148
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
149
|
+
from opentelemetry.trace import SpanContext, TraceFlags
|
|
150
|
+
|
|
151
|
+
mock_span = MagicMock(spec=ReadableSpan)
|
|
152
|
+
mock_span.get_span_context.return_value = SpanContext(
|
|
153
|
+
trace_id=1, span_id=1, is_remote=False, trace_flags=TraceFlags(0x01)
|
|
154
|
+
)
|
|
155
|
+
mock_span.name = "test_span"
|
|
156
|
+
mock_span.kind = 1
|
|
157
|
+
mock_span.start_time = 1000000000
|
|
158
|
+
mock_span.end_time = 2000000000
|
|
159
|
+
mock_span.status.status_code = 1
|
|
160
|
+
mock_span.attributes = {}
|
|
161
|
+
mock_span.links = []
|
|
162
|
+
mock_span.events = []
|
|
163
|
+
mock_span.resource.attributes = {}
|
|
164
|
+
mock_span.parent = None
|
|
165
|
+
|
|
166
|
+
exporter.export([mock_span])
|
|
167
|
+
|
|
168
|
+
# Thread should be created
|
|
169
|
+
time.sleep(0.1)
|
|
170
|
+
assert exporter._auto_flush_started
|
|
171
|
+
|
|
172
|
+
# But flush should not have happened yet (within delay period)
|
|
173
|
+
# We can't easily test this without mocking time, but we verify thread exists
|
|
174
|
+
assert exporter.flush_timer is not None
|
|
175
|
+
|
|
176
|
+
# Cleanup
|
|
177
|
+
exporter.shutdown()
|
|
178
|
+
if exporter.flush_timer:
|
|
179
|
+
exporter.flush_timer.join(timeout=2.0)
|
|
180
|
+
|
|
181
|
+
def test_concurrent_initialization(self):
|
|
182
|
+
"""Test concurrent initialization from multiple threads (simulates ECS health checks)."""
|
|
183
|
+
with patch.dict(
|
|
184
|
+
os.environ,
|
|
185
|
+
{
|
|
186
|
+
"AIQA_SERVER_URL": "http://localhost:3000",
|
|
187
|
+
"AIQA_API_KEY": "test-api-key",
|
|
188
|
+
},
|
|
189
|
+
):
|
|
190
|
+
clients = []
|
|
191
|
+
errors = []
|
|
192
|
+
|
|
193
|
+
def init_client():
|
|
194
|
+
try:
|
|
195
|
+
client = get_aiqa_client()
|
|
196
|
+
clients.append(client)
|
|
197
|
+
except Exception as e:
|
|
198
|
+
errors.append(e)
|
|
199
|
+
|
|
200
|
+
# Start multiple threads initializing simultaneously
|
|
201
|
+
threads = [threading.Thread(target=init_client) for _ in range(5)]
|
|
202
|
+
for t in threads:
|
|
203
|
+
t.start()
|
|
204
|
+
for t in threads:
|
|
205
|
+
t.join(timeout=5.0)
|
|
206
|
+
|
|
207
|
+
# Should have no errors
|
|
208
|
+
assert len(errors) == 0
|
|
209
|
+
|
|
210
|
+
# All should be the same singleton
|
|
211
|
+
assert len(set(id(c) for c in clients)) == 1
|
|
212
|
+
|
|
213
|
+
def test_shutdown_before_thread_starts(self):
|
|
214
|
+
"""Test that shutdown works even if thread was never started."""
|
|
215
|
+
with patch.dict(
|
|
216
|
+
os.environ,
|
|
217
|
+
{
|
|
218
|
+
"AIQA_SERVER_URL": "http://localhost:3000",
|
|
219
|
+
"AIQA_API_KEY": "test-api-key",
|
|
220
|
+
},
|
|
221
|
+
):
|
|
222
|
+
exporter = AIQASpanExporter(startup_delay_seconds=1.0)
|
|
223
|
+
|
|
224
|
+
# Thread should not exist
|
|
225
|
+
assert exporter.flush_timer is None
|
|
226
|
+
|
|
227
|
+
# Shutdown should work without errors
|
|
228
|
+
exporter.shutdown()
|
|
229
|
+
|
|
230
|
+
# Should still be able to call shutdown again
|
|
231
|
+
exporter.shutdown()
|
|
232
|
+
|
|
233
|
+
def test_initialization_timeout(self):
|
|
234
|
+
"""Test that initialization completes quickly even with network issues."""
|
|
235
|
+
with patch.dict(
|
|
236
|
+
os.environ,
|
|
237
|
+
{
|
|
238
|
+
"AIQA_SERVER_URL": "http://localhost:3000",
|
|
239
|
+
"AIQA_API_KEY": "test-api-key",
|
|
240
|
+
},
|
|
241
|
+
):
|
|
242
|
+
start_time = time.time()
|
|
243
|
+
client = get_aiqa_client()
|
|
244
|
+
elapsed = time.time() - start_time
|
|
245
|
+
|
|
246
|
+
# Initialization should be fast (< 1 second)
|
|
247
|
+
assert elapsed < 1.0
|
|
248
|
+
assert client is not None
|
|
249
|
+
|
|
@@ -590,7 +590,8 @@ def WithTracing(
|
|
|
590
590
|
is_generator = inspect.isgeneratorfunction(fn)
|
|
591
591
|
is_async_generator = inspect.isasyncgenfunction(fn) if hasattr(inspect, 'isasyncgenfunction') else False
|
|
592
592
|
|
|
593
|
-
tracer
|
|
593
|
+
# Don't get tracer here - get it lazily when function is called
|
|
594
|
+
# This ensures initialization only happens when tracing is actually used
|
|
594
595
|
|
|
595
596
|
def _setup_span(span: trace.Span, input_data: Any) -> bool:
|
|
596
597
|
"""Setup span with input data. Returns True if span is recording."""
|
|
@@ -627,10 +628,13 @@ def WithTracing(
|
|
|
627
628
|
def _execute_with_span_sync(executor: Callable[[], Any], input_data: Any) -> Any:
|
|
628
629
|
"""Execute sync function within span context, handling input/output and exceptions."""
|
|
629
630
|
# Ensure tracer provider is initialized before creating spans
|
|
631
|
+
# This is called lazily when the function runs, not at decorator definition time
|
|
630
632
|
client = get_aiqa_client()
|
|
631
633
|
if not client.enabled:
|
|
632
634
|
return executor()
|
|
633
635
|
|
|
636
|
+
# Get tracer after initialization (lazy)
|
|
637
|
+
tracer = get_aiqa_tracer()
|
|
634
638
|
with tracer.start_as_current_span(fn_name) as span:
|
|
635
639
|
if not _setup_span(span, input_data):
|
|
636
640
|
return executor()
|
|
@@ -646,10 +650,13 @@ def WithTracing(
|
|
|
646
650
|
async def _execute_with_span_async(executor: Callable[[], Any], input_data: Any) -> Any:
|
|
647
651
|
"""Execute async function within span context, handling input/output and exceptions."""
|
|
648
652
|
# Ensure tracer provider is initialized before creating spans
|
|
653
|
+
# This is called lazily when the function runs, not at decorator definition time
|
|
649
654
|
client = get_aiqa_client()
|
|
650
655
|
if not client.enabled:
|
|
651
656
|
return await executor()
|
|
652
657
|
|
|
658
|
+
# Get tracer after initialization (lazy)
|
|
659
|
+
tracer = get_aiqa_tracer()
|
|
653
660
|
with tracer.start_as_current_span(fn_name) as span:
|
|
654
661
|
if not _setup_span(span, input_data):
|
|
655
662
|
return await executor()
|
|
@@ -668,10 +675,13 @@ def WithTracing(
|
|
|
668
675
|
def _execute_generator_sync(executor: Callable[[], Any], input_data: Any) -> Any:
|
|
669
676
|
"""Execute sync generator function, returning a traced generator."""
|
|
670
677
|
# Ensure tracer provider is initialized before creating spans
|
|
678
|
+
# This is called lazily when the function runs, not at decorator definition time
|
|
671
679
|
client = get_aiqa_client()
|
|
672
680
|
if not client.enabled:
|
|
673
681
|
return executor()
|
|
674
682
|
|
|
683
|
+
# Get tracer after initialization (lazy)
|
|
684
|
+
tracer = get_aiqa_tracer()
|
|
675
685
|
# Create span but don't use 'with' - span will be closed by TracedGenerator
|
|
676
686
|
span = tracer.start_span(fn_name)
|
|
677
687
|
token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
|
|
@@ -694,10 +704,13 @@ def WithTracing(
|
|
|
694
704
|
async def _execute_generator_async(executor: Callable[[], Any], input_data: Any) -> Any:
|
|
695
705
|
"""Execute async generator function, returning a traced async generator."""
|
|
696
706
|
# Ensure tracer provider is initialized before creating spans
|
|
707
|
+
# This is called lazily when the function runs, not at decorator definition time
|
|
697
708
|
client = get_aiqa_client()
|
|
698
709
|
if not client.enabled:
|
|
699
710
|
return await executor()
|
|
700
711
|
|
|
712
|
+
# Get tracer after initialization (lazy)
|
|
713
|
+
tracer = get_aiqa_tracer()
|
|
701
714
|
# Create span but don't use 'with' - span will be closed by TracedAsyncGenerator
|
|
702
715
|
span = tracer.start_span(fn_name)
|
|
703
716
|
token = trace.context_api.attach(trace.context_api.set_span_in_context(span))
|
|
@@ -935,7 +948,8 @@ def set_component_tag(tag: str) -> None:
|
|
|
935
948
|
This can also be set via the AIQA_COMPONENT_TAG environment variable.
|
|
936
949
|
The component tag allows you to identify which component/system generated the spans.
|
|
937
950
|
|
|
938
|
-
Note:
|
|
951
|
+
Note: Initialization is automatic when WithTracing is first used. You can also call
|
|
952
|
+
get_aiqa_client() explicitly if needed.
|
|
939
953
|
the client and load environment variables.
|
|
940
954
|
|
|
941
955
|
Args:
|
|
@@ -1045,6 +1059,8 @@ def create_span_from_trace_id(
|
|
|
1045
1059
|
from opentelemetry.trace import set_span_in_context
|
|
1046
1060
|
parent_context = set_span_in_context(trace.NonRecordingSpan(parent_span_context))
|
|
1047
1061
|
|
|
1062
|
+
# Ensure initialization before creating span
|
|
1063
|
+
get_aiqa_client()
|
|
1048
1064
|
# Start a new span in this context (it will be a child of the parent span)
|
|
1049
1065
|
tracer = get_aiqa_tracer()
|
|
1050
1066
|
span = tracer.start_span(span_name, context=parent_context)
|
|
@@ -1057,6 +1073,8 @@ def create_span_from_trace_id(
|
|
|
1057
1073
|
return span
|
|
1058
1074
|
except (ValueError, AttributeError) as e:
|
|
1059
1075
|
logger.error(f"Error creating span from trace_id: {e}")
|
|
1076
|
+
# Ensure initialization before creating span
|
|
1077
|
+
get_aiqa_client()
|
|
1060
1078
|
# Fallback: create a new span
|
|
1061
1079
|
tracer = get_aiqa_tracer()
|
|
1062
1080
|
span = tracer.start_span(span_name)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aiqa-client
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
|
|
5
5
|
Author-email: AIQA <info@aiqa.dev>
|
|
6
6
|
License: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/
|
|
8
|
-
Project-URL: Documentation, https://github.com/
|
|
9
|
-
Project-URL: Repository, https://github.com/
|
|
10
|
-
Project-URL: Issues, https://github.com/
|
|
7
|
+
Project-URL: Homepage, https://github.com/winterwell/aiqa-client-python
|
|
8
|
+
Project-URL: Documentation, https://github.com/winterwell/aiqa-client-python
|
|
9
|
+
Project-URL: Repository, https://github.com/winterwell/aiqa-client-python
|
|
10
|
+
Project-URL: Issues, https://github.com/winterwell/aiqa-client-python/issues
|
|
11
11
|
Keywords: opentelemetry,tracing,observability,aiqa,monitoring
|
|
12
12
|
Classifier: Development Status :: 4 - Beta
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
@@ -22,7 +22,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
|
22
22
|
Classifier: Topic :: System :: Monitoring
|
|
23
23
|
Requires-Python: >=3.8
|
|
24
24
|
Description-Content-Type: text/markdown
|
|
25
|
-
License-File: LICENSE
|
|
25
|
+
License-File: LICENSE.txt
|
|
26
26
|
Requires-Dist: opentelemetry-api>=1.24.0
|
|
27
27
|
Requires-Dist: opentelemetry-sdk>=1.24.0
|
|
28
28
|
Requires-Dist: opentelemetry-semantic-conventions>=0.40b0
|
|
@@ -56,6 +56,20 @@ pip install -r requirements.txt
|
|
|
56
56
|
pip install -e .
|
|
57
57
|
```
|
|
58
58
|
|
|
59
|
+
### Development Setup
|
|
60
|
+
|
|
61
|
+
For development, install with dev dependencies to run tests:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
pip install -e ".[dev]"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Then run the unit tests:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pytest
|
|
71
|
+
```
|
|
72
|
+
|
|
59
73
|
See [TESTING.md](TESTING.md) for detailed testing instructions.
|
|
60
74
|
|
|
61
75
|
## Setup
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
LICENSE
|
|
1
|
+
LICENSE.txt
|
|
2
2
|
MANIFEST.in
|
|
3
3
|
README.md
|
|
4
4
|
pyproject.toml
|
|
5
|
-
setup.py
|
|
6
5
|
aiqa/__init__.py
|
|
7
6
|
aiqa/aiqa_exporter.py
|
|
8
7
|
aiqa/client.py
|
|
@@ -11,6 +10,7 @@ aiqa/experiment_runner.py
|
|
|
11
10
|
aiqa/object_serialiser.py
|
|
12
11
|
aiqa/py.typed
|
|
13
12
|
aiqa/test_experiment_runner.py
|
|
13
|
+
aiqa/test_startup_reliability.py
|
|
14
14
|
aiqa/test_tracing.py
|
|
15
15
|
aiqa/tracing.py
|
|
16
16
|
aiqa_client.egg-info/PKG-INFO
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "aiqa-client"
|
|
7
|
-
version = "0.4.0"
|
|
7
|
+
version = "0.4.3"
|
|
8
8
|
description = "OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.8"
|
|
@@ -44,10 +44,10 @@ dev = [
|
|
|
44
44
|
]
|
|
45
45
|
|
|
46
46
|
[project.urls]
|
|
47
|
-
Homepage = "https://github.com/
|
|
48
|
-
Documentation = "https://github.com/
|
|
49
|
-
Repository = "https://github.com/
|
|
50
|
-
Issues = "https://github.com/
|
|
47
|
+
Homepage = "https://github.com/winterwell/aiqa-client-python"
|
|
48
|
+
Documentation = "https://github.com/winterwell/aiqa-client-python"
|
|
49
|
+
Repository = "https://github.com/winterwell/aiqa-client-python"
|
|
50
|
+
Issues = "https://github.com/winterwell/aiqa-client-python/issues"
|
|
51
51
|
|
|
52
52
|
[tool.setuptools]
|
|
53
53
|
packages = ["aiqa"]
|
aiqa_client-0.4.0/setup.py
DELETED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|