aiqa-client 0.4.1__tar.gz → 0.4.7__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to the public registry. It is provided for informational purposes only.
- {aiqa_client-0.4.1/aiqa_client.egg-info → aiqa_client-0.4.7}/PKG-INFO +6 -6
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa/aiqa_exporter.py +192 -99
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa/client.py +14 -11
- aiqa_client-0.4.7/aiqa/constants.py +8 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa/experiment_runner.py +12 -29
- aiqa_client-0.4.7/aiqa/http_utils.py +69 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa/object_serialiser.py +136 -115
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa/tracing.py +113 -253
- aiqa_client-0.4.7/aiqa/tracing_llm_utils.py +191 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7/aiqa_client.egg-info}/PKG-INFO +6 -6
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa_client.egg-info/SOURCES.txt +7 -6
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/pyproject.toml +5 -5
- aiqa_client-0.4.7/tests/test_object_serialiser.py +415 -0
- aiqa_client-0.4.1/aiqa/constants.py +0 -6
- aiqa_client-0.4.1/aiqa/test_experiment_runner.py +0 -176
- aiqa_client-0.4.1/setup.py +0 -9
- /aiqa_client-0.4.1/LICENSE → /aiqa_client-0.4.7/LICENSE.txt +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/MANIFEST.in +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/README.md +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa/__init__.py +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa/py.typed +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa_client.egg-info/dependency_links.txt +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa_client.egg-info/requires.txt +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/aiqa_client.egg-info/top_level.txt +0 -0
- {aiqa_client-0.4.1 → aiqa_client-0.4.7}/setup.cfg +0 -0
- {aiqa_client-0.4.1/aiqa → aiqa_client-0.4.7/tests}/test_startup_reliability.py +0 -0
- {aiqa_client-0.4.1/aiqa → aiqa_client-0.4.7/tests}/test_tracing.py +0 -0
PKG-INFO:

@@ -1,13 +1,13 @@
 Metadata-Version: 2.4
 Name: aiqa-client
-Version: 0.4.1
+Version: 0.4.7
 Summary: OpenTelemetry-based Python client for tracing functions and sending traces to the AIQA server
 Author-email: AIQA <info@aiqa.dev>
 License: MIT
-Project-URL: Homepage, https://github.com/
-Project-URL: Documentation, https://github.com/
-Project-URL: Repository, https://github.com/
-Project-URL: Issues, https://github.com/
+Project-URL: Homepage, https://github.com/winterwell/aiqa-client-python
+Project-URL: Documentation, https://github.com/winterwell/aiqa-client-python
+Project-URL: Repository, https://github.com/winterwell/aiqa-client-python
+Project-URL: Issues, https://github.com/winterwell/aiqa-client-python/issues
 Keywords: opentelemetry,tracing,observability,aiqa,monitoring
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
@@ -22,7 +22,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Topic :: System :: Monitoring
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-License-File: LICENSE
+License-File: LICENSE.txt
 Requires-Dist: opentelemetry-api>=1.24.0
 Requires-Dist: opentelemetry-sdk>=1.24.0
 Requires-Dist: opentelemetry-semantic-conventions>=0.40b0
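For orientation, the package ships an OpenTelemetry `SpanExporter`. Below is a minimal usage sketch, assuming only what this diff shows (the `AIQASpanExporter` class in `aiqa/aiqa_exporter.py` and the `AIQA_SERVER_URL` / `AIQA_API_KEY` environment variables); the rest is standard `opentelemetry-sdk` wiring, not the package's own `get_aiqa_client()` path:

```python
# Sketch: wiring AIQASpanExporter into the OpenTelemetry SDK.
# AIQASpanExporter and the env var names come from the diff below;
# everything else is standard opentelemetry-sdk usage.
import os

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

from aiqa.aiqa_exporter import AIQASpanExporter

os.environ.setdefault("AIQA_SERVER_URL", "https://my-aiqa-server.example.com")
os.environ.setdefault("AIQA_API_KEY", "my-api-key")

provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(AIQASpanExporter()))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("example")
with tracer.start_as_current_span("hello"):
    pass  # spans are buffered by the exporter and flushed to {AIQA_SERVER_URL}/span
```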
aiqa/aiqa_exporter.py:

@@ -14,9 +14,11 @@ from typing import List, Dict, Any, Optional
 from opentelemetry.sdk.trace import ReadableSpan
 from opentelemetry.sdk.trace.export import SpanExporter, SpanExportResult
 
-from .constants import AIQA_TRACER_NAME, VERSION
+from .constants import AIQA_TRACER_NAME, VERSION, LOG_TAG
+from .http_utils import get_server_url, get_api_key, build_headers
+from .object_serialiser import toNumber
 
-logger = logging.getLogger(
+logger = logging.getLogger(LOG_TAG)
 
 
 class AIQASpanExporter(SpanExporter):
@@ -31,7 +33,8 @@ class AIQASpanExporter(SpanExporter):
         api_key: Optional[str] = None,
         flush_interval_seconds: float = 5.0,
         max_batch_size_bytes: int = 5 * 1024 * 1024,  # 5MB default
-        max_buffer_spans: int =
+        max_buffer_spans: Optional[int] = None,  # Maximum spans to buffer (prevents unbounded growth)
+        max_buffer_size_bytes: Optional[int] = None,  # Maximum buffer size in bytes (prevents unbounded memory growth)
         startup_delay_seconds: Optional[float] = None,
     ):
         """
@@ -42,15 +45,27 @@ class AIQASpanExporter(SpanExporter):
             api_key: API key for authentication (defaults to AIQA_API_KEY env var)
             flush_interval_seconds: How often to flush spans to the server
             max_batch_size_bytes: Maximum size of a single batch in bytes (default: 5mb)
-            max_buffer_spans: Maximum spans to buffer (prevents unbounded growth)
+            max_buffer_spans: Maximum spans to buffer (prevents unbounded growth).
+                Defaults to 10000, or AIQA_MAX_BUFFER_SPANS env var if set.
+            max_buffer_size_bytes: Maximum total buffer size in bytes (prevents unbounded memory growth).
+                Defaults to None (no limit), or AIQA_MAX_BUFFER_SIZE_BYTES env var if set.
             startup_delay_seconds: Delay before starting auto-flush (default: 10s, or AIQA_STARTUP_DELAY_SECONDS env var)
         """
-        self._server_url = server_url
-        self._api_key = api_key
+        self._server_url = get_server_url(server_url)
+        self._api_key = get_api_key(api_key)
         self.flush_interval_ms = flush_interval_seconds * 1000
         self.max_batch_size_bytes = max_batch_size_bytes
+
+        # Get max_buffer_spans from parameter, environment variable, or default
+        if not max_buffer_spans:
+            max_buffer_spans = toNumber(os.getenv("AIQA_MAX_BUFFER_SPANS")) or 10000
         self.max_buffer_spans = max_buffer_spans
 
+        # Get max_buffer_size_bytes from parameter, environment variable, or default
+        if not max_buffer_size_bytes:
+            max_buffer_size_bytes = toNumber(os.getenv("AIQA_MAX_BUFFER_SIZE_BYTES")) or toNumber("100m")
+        self.max_buffer_size_bytes = max_buffer_size_bytes
+
         # Get startup delay from parameter or environment variable (default: 10s)
         if startup_delay_seconds is None:
             env_delay = os.getenv("AIQA_STARTUP_DELAY_SECONDS")
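The new buffer limits resolve in order: explicit constructor argument, then environment variable, then default (10000 spans; ~100 MB via `toNumber("100m")`). A sketch of both styles — note that `toNumber` accepting a suffixed size like `"100m"` is inferred from the default above, and the exact accepted forms live in `object_serialiser.py`, not shown in this diff:

```python
# Sketch: two equivalent ways to bound the exporter's span buffer.
import os

from aiqa.aiqa_exporter import AIQASpanExporter

# 1. Explicit constructor arguments (take precedence over env vars):
exporter = AIQASpanExporter(
    max_buffer_spans=20_000,                 # cap by span count
    max_buffer_size_bytes=50 * 1024 * 1024,  # cap by memory (~50 MB)
)

# 2. Environment variables, read when the arguments are left as None:
os.environ["AIQA_MAX_BUFFER_SPANS"] = "20000"
os.environ["AIQA_MAX_BUFFER_SIZE_BYTES"] = "52428800"
exporter = AIQASpanExporter()
```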
@@ -66,6 +81,11 @@ class AIQASpanExporter(SpanExporter):
 
         self.buffer: List[Dict[str, Any]] = []
         self.buffer_span_keys: set = set()  # Track (traceId, spanId) tuples to prevent duplicates (Python 3.8 compatible)
+        self.buffer_size_bytes: int = 0  # Track total size of buffered spans in bytes
+        # Cache span sizes to avoid recalculation (maps span_key -> size_bytes)
+        # Limited to max_buffer_spans * 2 to prevent unbounded growth
+        self._span_size_cache: Dict[tuple, int] = {}
+        self._max_cache_size = self.max_buffer_spans * 2  # Allow cache to be 2x buffer size
         self.buffer_lock = threading.Lock()
         self.flush_lock = threading.Lock()
         # shutdown_requested is only set once (in shutdown()) and read many times
@@ -75,27 +95,19 @@ class AIQASpanExporter(SpanExporter):
         self._auto_flush_started = False
         self._auto_flush_lock = threading.Lock()  # Lock for lazy thread creation
 
-        logger.info(
-            f"Initializing AIQASpanExporter: server_url={self.server_url or 'not set'}, "
+        logger.info(f"Initializing AIQASpanExporter: server_url={self._server_url or 'not set'}, "
             f"flush_interval={flush_interval_seconds}s, startup_delay={startup_delay_seconds}s"
         )
         # Don't start thread immediately - start lazily on first export to avoid startup issues
 
-    @property
-    def server_url(self) -> str:
-        return self._server_url or os.getenv("AIQA_SERVER_URL", "").rstrip("/")
-
-    @property
-    def api_key(self) -> str:
-        return self._api_key or os.getenv("AIQA_API_KEY", "")
-
     def export(self, spans: List[ReadableSpan]) -> SpanExportResult:
         """
         Export spans to the AIQA server. Adds spans to buffer for async flushing.
         Deduplicates spans based on (traceId, spanId) to prevent repeated exports.
+        Actual send is done by flush -> _send_spans, or shutdown -> _send_spans_sync
         """
         if not spans:
-            logger.debug("export
+            logger.debug(f"export: called with empty spans list")
             return SpanExportResult.SUCCESS
 
         # Check if AIQA tracing is enabled
@@ -103,13 +115,13 @@ class AIQASpanExporter(SpanExporter):
             from .client import get_aiqa_client
             client = get_aiqa_client()
             if not client.enabled:
-                logger.debug(f"AIQA export
+                logger.debug(f"AIQA export: skipped: tracing is disabled")
                 return SpanExportResult.SUCCESS
         except Exception:
             # If we can't check enabled status, proceed (fail open)
             pass
 
-        logger.debug(f"AIQA export() called with {len(spans)} spans")
+        logger.debug(f"AIQA export() to buffer called with {len(spans)} spans")
 
         # Lazy initialization: start auto-flush thread on first export
         # This avoids thread creation during initialization, which can cause issues in ECS deployments
@@ -118,40 +130,74 @@ class AIQASpanExporter(SpanExporter):
         # Serialize and add to buffer, deduplicating by (traceId, spanId)
         with self.buffer_lock:
             serialized_spans = []
+            serialized_sizes = []  # Track sizes of serialized spans
             duplicates_count = 0
             dropped_count = 0
+            dropped_memory_count = 0
+            flush_in_progress = self.flush_lock.locked()
+
             for span in spans:
-                # Check if buffer is full (prevent unbounded growth)
+                # Check if buffer is full by span count (prevent unbounded growth)
                 if len(self.buffer) >= self.max_buffer_spans:
-
-
+                    if flush_in_progress:
+                        # Flush in progress, drop this span
+                        dropped_count += 1
+                        continue
+                    # Flush not in progress, will trigger flush after adding spans
+                    # Continue processing remaining spans to add them before flush
 
                 serialized = self._serialize_span(span)
                 span_key = (serialized["traceId"], serialized["spanId"])
                 if span_key not in self.buffer_span_keys:
+                    # Estimate size of this span when serialized (cache for later use)
+                    span_size = self._get_span_size(span_key, serialized)
+
+                    # Check if buffer is full by memory size (prevent unbounded memory growth)
+                    if self.max_buffer_size_bytes is not None and self.buffer_size_bytes + span_size > self.max_buffer_size_bytes:
+                        if flush_in_progress:
+                            # Flush in progress, drop this span
+                            # Don't cache size for dropped spans to prevent memory leak
+                            dropped_memory_count += 1
+                            continue
+                        # Flush not in progress, will trigger flush after adding spans
+                        # Continue processing remaining spans to add them before flush
+
                     serialized_spans.append(serialized)
+                    serialized_sizes.append(span_size)
                     self.buffer_span_keys.add(span_key)
                 else:
                     duplicates_count += 1
-                    logger.debug(f"export
+                    logger.debug(f"export: skipping duplicate span: traceId={serialized['traceId']}, spanId={serialized['spanId']}")
 
+            # Add spans and update buffer size
             self.buffer.extend(serialized_spans)
+            self.buffer_size_bytes += sum(serialized_sizes)
             buffer_size = len(self.buffer)
 
+            # Check if thresholds are reached after adding spans
+            threshold_reached = self._check_thresholds_reached()
+
             if dropped_count > 0:
-                logger.warning(
-                    f"WARNING: Buffer full ({buffer_size} spans), dropped {dropped_count} span(s). "
+                logger.warning(f"WARNING: Buffer full ({buffer_size} spans), dropped {dropped_count} span(s) (flush in progress). "
                     f"Consider increasing max_buffer_spans or fixing server connectivity."
                 )
+            if dropped_memory_count > 0:
+                logger.warning(f"WARNING: Buffer memory limit reached ({self.buffer_size_bytes} bytes / {self.max_buffer_size_bytes} bytes), "
+                    f"dropped {dropped_memory_count} span(s) (flush in progress). "
+                    f"Consider increasing AIQA_MAX_BUFFER_SIZE_BYTES or fixing server connectivity."
+                )
+
+            # Trigger immediate flush if threshold reached and flush not in progress
+            if threshold_reached and not flush_in_progress:
+                logger.info(f"Buffer threshold reached ({buffer_size} spans, {self.buffer_size_bytes} bytes), triggering immediate flush")
+                self._trigger_immediate_flush()
 
             if duplicates_count > 0:
-                logger.debug(
-                    f"export() added {len(serialized_spans)} span(s) to buffer, skipped {duplicates_count} duplicate(s). "
+                logger.debug(f"export() added {len(serialized_spans)} span(s) to buffer, skipped {duplicates_count} duplicate(s). "
                     f"Total buffered: {buffer_size}"
                 )
             else:
-                logger.debug(
-                    f"export() added {len(spans)} span(s) to buffer. "
+                logger.debug(f"export() added {len(spans)} span(s) to buffer. "
                     f"Total buffered: {buffer_size}"
                 )
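Condensed, the admission policy this hunk implements is: when a limit is hit while a flush is already draining the buffer, drop the span; otherwise accept it and trigger an immediate flush afterwards. A hypothetical restatement (names mirror the diff; this is not the shipped code):

```python
# Sketch: per-span admission decision, as implemented by export() above.
def admit(buffer_len, buffer_bytes, span_bytes, max_spans, max_bytes, flush_in_progress):
    """Return 'drop', 'add', or 'add_and_flush' for one incoming span."""
    over_limit = buffer_len >= max_spans or (
        max_bytes is not None and buffer_bytes + span_bytes > max_bytes
    )
    if over_limit and flush_in_progress:
        return "drop"           # a flush is already draining the buffer
    if over_limit:
        return "add_and_flush"  # accept the span, then trigger an immediate flush
    return "add"
```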
@@ -233,18 +279,35 @@ class AIQASpanExporter(SpanExporter):
         nanos = int(nanoseconds % 1_000_000_000)
         return (seconds, nanos)
 
+    def _get_span_size(self, span_key: tuple, serialized: Dict[str, Any]) -> int:
+        """
+        Get span size from cache or calculate and cache it.
+        Thread-safe when called within buffer_lock.
+        Limits cache size to prevent unbounded memory growth.
+        """
+        if span_key in self._span_size_cache:
+            return self._span_size_cache[span_key]
+        span_json = json.dumps(serialized)
+        span_size = len(span_json.encode('utf-8'))
+        # Only cache if we have valid keys and cache isn't too large
+        if span_key[0] and span_key[1] and len(self._span_size_cache) < self._max_cache_size:
+            self._span_size_cache[span_key] = span_size
+        return span_size
+
+    def _check_thresholds_reached(self) -> bool:
+        """Check if buffer thresholds are reached. Must be called within buffer_lock."""
+        if len(self.buffer) >= self.max_buffer_spans:
+            return True
+        if self.max_buffer_size_bytes is not None and self.buffer_size_bytes >= self.max_buffer_size_bytes:
+            return True
+        return False
+
     def _build_request_headers(self) -> Dict[str, str]:
         """Build HTTP headers for span requests."""
-
-        if self.api_key:
-            headers["Authorization"] = f"ApiKey {self.api_key}"
-        return headers
+        return build_headers(self._api_key)
 
     def _get_span_url(self) -> str:
-        "
-        if not self.server_url:
-            raise ValueError("AIQA_SERVER_URL is not set. Cannot send spans to server.")
-        return f"{self.server_url}/span"
+        return f"{self._server_url}/span"
 
     def _is_interpreter_shutdown_error(self, error: Exception) -> bool:
         """Check if error is due to interpreter shutdown."""
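The size estimate is the UTF-8 byte length of the span's JSON, which matches how `_send_spans` actually serializes batches (`json.dumps(...).encode('utf-8')`), so the buffer accounting agrees with the real payload. For example (with `json.dumps`'s default `ensure_ascii=True` the output is pure ASCII, so byte length equals character length):

```python
import json

# A toy span dict; real spans carry more attributes.
span = {"traceId": "abc123", "spanId": "def456", "name": "checkout"}
span_size = len(json.dumps(span).encode("utf-8"))  # bytes this span adds to a batch
print(span_size)
```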
@@ -263,36 +326,49 @@ class AIQASpanExporter(SpanExporter):
         are added between extraction and clearing.
         Note: Does NOT clear buffer_span_keys - that should be done after successful send
         to avoid unnecessary clearing/rebuilding on failures.
+        Also resets buffer_size_bytes to 0.
         """
         with self.buffer_lock:
             spans = self.buffer[:]
             self.buffer.clear()
+            self.buffer_size_bytes = 0
             return spans
 
     def _remove_span_keys_from_tracking(self, spans: List[Dict[str, Any]]) -> None:
         """
-        Remove span keys from tracking set (thread-safe). Called after successful send.
+        Remove span keys from tracking set and size cache (thread-safe). Called after successful send.
         """
         with self.buffer_lock:
             for span in spans:
                 span_key = (span["traceId"], span["spanId"])
                 self.buffer_span_keys.discard(span_key)
+                # Also remove from size cache to free memory
+                self._span_size_cache.pop(span_key, None)
 
     def _prepend_spans_to_buffer(self, spans: List[Dict[str, Any]]) -> None:
         """
         Prepend spans back to buffer (thread-safe). Used to restore spans
-        if sending fails. Rebuilds the span keys tracking set.
+        if sending fails. Rebuilds the span keys tracking set and buffer size.
+        Uses cached sizes when available to avoid re-serialization.
         """
         with self.buffer_lock:
             self.buffer[:0] = spans
             # Rebuild span keys set from current buffer contents
             self.buffer_span_keys = {(span["traceId"], span["spanId"]) for span in self.buffer}
+            # Recalculate buffer size using cache when available
+            total_size = 0
+            for span in self.buffer:
+                span_key = (span.get("traceId"), span.get("spanId"))
+                total_size += self._get_span_size(span_key, span)
+            self.buffer_size_bytes = total_size
 
     def _clear_buffer(self) -> None:
         """Clear the buffer (thread-safe)."""
         with self.buffer_lock:
             self.buffer.clear()
             self.buffer_span_keys.clear()
+            self.buffer_size_bytes = 0
+            self._span_size_cache.clear()
 
     def _split_into_batches(self, spans: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
         """
@@ -308,9 +384,9 @@ class AIQASpanExporter(SpanExporter):
         current_batch_size = 0
 
         for span in spans:
-            #
-
-            span_size =
+            # Get size from cache if available, otherwise calculate it
+            span_key = (span.get("traceId"), span.get("spanId"))
+            span_size = self._get_span_size(span_key, span)
 
             # Check if this single span exceeds the limit
             if span_size > self.max_batch_size_bytes:
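Only the size lookup changed in this hunk; the surrounding method greedily packs spans into batches under `max_batch_size_bytes`. For context, a standalone sketch of that general approach (hypothetical helper, not the method body from the package):

```python
from typing import Any, Callable, Dict, List


def split_into_batches(
    spans: List[Dict[str, Any]],
    span_size: Callable[[Dict[str, Any]], int],
    max_batch_size_bytes: int,
) -> List[List[Dict[str, Any]]]:
    """Greedy batching: start a new batch when the next span would exceed the byte limit."""
    batches: List[List[Dict[str, Any]]] = []
    current: List[Dict[str, Any]] = []
    current_size = 0
    for span in spans:
        size = span_size(span)
        if current and current_size + size > max_batch_size_bytes:
            batches.append(current)
            current, current_size = [], 0
        current.append(span)  # an oversized single span still gets its own batch
        current_size += size
    if current:
        batches.append(current)
    return batches
```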
@@ -323,8 +399,7 @@ class AIQASpanExporter(SpanExporter):
                 # Log warning about oversized span
                 span_name = span.get('name', 'unknown')
                 span_trace_id = span.get('traceId', 'unknown')
-                logger.warning(
-                    f"Span '{span_name}' (traceId={span_trace_id}) exceeds max_batch_size_bytes "
+                logger.warning(f"Span \'{span_name}' (traceId={span_trace_id}) exceeds max_batch_size_bytes "
                     f"({span_size} bytes > {self.max_batch_size_bytes} bytes). "
                     f"Will attempt to send it anyway - may fail if server/nginx limit is exceeded."
                 )
@@ -354,22 +429,21 @@ class AIQASpanExporter(SpanExporter):
 
         Lock ordering: flush_lock -> buffer_lock (must be consistent to avoid deadlocks)
         """
-        logger.debug("flush
+        logger.debug(f"flush: called - attempting to acquire flush lock")
         with self.flush_lock:
-            logger.debug("flush() acquired flush lock")
+            logger.debug(f"flush() acquired flush lock")
             # Atomically extract and remove spans to prevent race conditions
             # where export() adds spans between extraction and clearing
             spans_to_flush = self._extract_and_remove_spans_from_buffer()
-            logger.debug(f"flush
+            logger.debug(f"flush: extracted {len(spans_to_flush)} span(s) from buffer")
 
             if not spans_to_flush:
-                logger.debug("flush() completed: no spans to flush")
+                logger.debug(f"flush() completed: no spans to flush")
                 return
 
             # Skip sending if server URL is not configured
-            if not self.
-                logger.warning(
-                    f"Skipping flush: AIQA_SERVER_URL is not set. {len(spans_to_flush)} span(s) will not be sent."
+            if not self._server_url:
+                logger.warning(f"Skipping flush: AIQA_SERVER_URL is not set. {len(spans_to_flush)} span(s) will not be sent."
                 )
                 # Spans already removed from buffer, clear their keys to free memory
                 self._remove_span_keys_from_tracking(spans_to_flush)
@@ -377,7 +451,7 @@ class AIQASpanExporter(SpanExporter):
 
         # Release flush_lock before I/O to avoid blocking other flush attempts
         # Spans are already extracted, so concurrent exports won't interfere
-        logger.info(f"flush
+        logger.info(f"flush: sending {len(spans_to_flush)} span(s) to server")
         try:
             await self._send_spans(spans_to_flush)
             logger.info(f"flush() successfully sent {len(spans_to_flush)} span(s) to server")
@@ -387,7 +461,7 @@ class AIQASpanExporter(SpanExporter):
         except RuntimeError as error:
             if self._is_interpreter_shutdown_error(error):
                 if self.shutdown_requested:
-                    logger.debug(f"flush
+                    logger.debug(f"flush: skipped due to interpreter shutdown: {error}")
                 else:
                     logger.warning(f"flush() interrupted by interpreter shutdown: {error}")
                 # Put spans back for retry with sync send during shutdown
@@ -423,10 +497,32 @@ class AIQASpanExporter(SpanExporter):
         # Don't raise - allow spans to be buffered even if auto-flush fails
         # They can still be flushed manually or on shutdown
 
+    def _trigger_immediate_flush(self) -> None:
+        """
+        Trigger an immediate flush in a background thread.
+        This is called when buffer thresholds are reached and no flush is in progress.
+        """
+        def flush_in_thread():
+            """Run flush in a new thread with its own event loop."""
+            try:
+                loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(loop)
+                try:
+                    loop.run_until_complete(self.flush())
+                finally:
+                    if not loop.is_closed():
+                        loop.close()
+            except Exception as e:
+                logger.error(f"Error in immediate flush thread: {e}", exc_info=True)
+
+        # Start flush in background thread (daemon so it doesn't block shutdown)
+        flush_thread = threading.Thread(target=flush_in_thread, daemon=True, name="AIQA-ImmediateFlush")
+        flush_thread.start()
+
     def _flush_worker(self) -> None:
         """Worker function for auto-flush thread. Runs in a separate thread with its own event loop."""
         import asyncio
-        logger.debug("Auto-flush worker thread started")
+        logger.debug(f"Auto-flush worker thread started")
 
         # Wait for startup delay before beginning flush operations
         # This gives the container/application time to stabilize, which helps avoid startup issues (seen with AWS ECS, Dec 2025).
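The pattern in `_trigger_immediate_flush` — handing an `async` coroutine to a daemon thread with its own event loop — avoids `asyncio.run()`, which raises if called from a thread that already has a running loop. A minimal standalone sketch of the same pattern:

```python
# Sketch: run an async coroutine from synchronous code via a daemon thread
# with its own event loop, isolated from any loop the caller may be running.
import asyncio
import threading


async def do_flush() -> None:
    await asyncio.sleep(0)  # stand-in for the real async send


def trigger_in_background() -> None:
    def worker() -> None:
        loop = asyncio.new_event_loop()  # fresh loop, private to this thread
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(do_flush())
        finally:
            loop.close()

    threading.Thread(target=worker, daemon=True, name="flush-worker").start()


trigger_in_background()
```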
@@ -441,10 +537,10 @@ class AIQASpanExporter(SpanExporter):
                 remaining_delay -= sleep_time
 
             if self.shutdown_requested:
-                logger.debug("Auto-flush startup delay interrupted by shutdown")
+                logger.debug(f"Auto-flush startup delay interrupted by shutdown")
                 return
 
-        logger.info("Auto-flush startup delay complete, beginning flush operations")
+        logger.info(f"Auto-flush startup delay complete, beginning flush operations")
 
         # Create event loop in this thread (isolated from main thread's event loop)
         # This prevents interference with the main application's event loop
@@ -475,24 +571,23 @@ class AIQASpanExporter(SpanExporter):
             logger.info(f"Auto-flush worker thread stopping (shutdown requested). Completed {cycle_count} cycles.")
             # Don't do final flush here - shutdown() will handle it with synchronous send
             # This avoids event loop shutdown issues
-            logger.debug("Auto-flush thread skipping final flush (will be handled by shutdown() with sync send)")
+            logger.debug(f"Auto-flush thread skipping final flush (will be handled by shutdown() with sync send)")
         finally:
             # Always close the event loop, even if an exception occurs
             try:
                 if not loop.is_closed():
                     loop.close()
-                    logger.debug("Auto-flush worker thread event loop closed")
+                    logger.debug(f"Auto-flush worker thread event loop closed")
             except Exception:
                 pass  # Ignore errors during cleanup
 
     def _start_auto_flush(self) -> None:
         """Start the auto-flush timer with startup delay."""
         if self.shutdown_requested:
-            logger.warning("_start_auto_flush() called but shutdown already requested")
+            logger.warning(f"_start_auto_flush() called but shutdown already requested")
             return
 
-        logger.info(
-            f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s, "
+        logger.info(f"Starting auto-flush thread with interval {self.flush_interval_ms / 1000.0}s, "
             f"startup delay {self.startup_delay_seconds}s"
         )
@@ -508,15 +603,15 @@ class AIQASpanExporter(SpanExporter):
         # Split into batches if needed
         batches = self._split_into_batches(spans)
         if len(batches) > 1:
-            logger.info(f"_send_spans
+            logger.info(f"_send_spans: splitting {len(spans)} spans into {len(batches)} batches")
 
         url = self._get_span_url()
         headers = self._build_request_headers()
 
-        if self.
-        logger.
-
-
+        if not self._api_key:  # This should not happen
+            logger.error(f"_send_spans: fail - no API key provided. {len(spans)} spans lost.")
+            # Spans were already removed from buffer before calling this method. They will now get forgotten
+            return
 
         # Use timeout to prevent hanging on unreachable servers
         timeout = aiohttp.ClientTimeout(total=30.0, connect=10.0)
@@ -524,41 +619,41 @@ class AIQASpanExporter(SpanExporter):
         async with aiohttp.ClientSession(timeout=timeout) as session:
             for batch_idx, batch in enumerate(batches):
                 try:
-                    logger.debug(f"_send_spans
+                    logger.debug(f"_send_spans: sending batch {batch_idx + 1}/{len(batches)} with {len(batch)} spans to {url}")
                     # Pre-serialize JSON to bytes and wrap in BytesIO to avoid blocking event loop
                     json_bytes = json.dumps(batch).encode('utf-8')
                     data = io.BytesIO(json_bytes)
 
                     async with session.post(url, data=data, headers=headers) as response:
-                        logger.debug(f"_send_spans
+                        logger.debug(f"_send_spans: batch {batch_idx + 1} received response: status={response.status}")
                         if not response.ok:
                             error_text = await response.text()
                             error_msg = f"Failed to send batch {batch_idx + 1}/{len(batches)}: {response.status} {response.reason} - {error_text[:200]}"
-                            logger.error(f"_send_spans
+                            logger.error(f"_send_spans: {error_msg}")
                             errors.append((batch_idx + 1, error_msg))
                             # Continue with other batches even if one fails
                             continue
-                    logger.debug(f"_send_spans
+                    logger.debug(f"_send_spans: batch {batch_idx + 1} successfully sent {len(batch)} spans")
                 except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                     # Network errors and timeouts - log but don't fail completely
                     error_msg = f"Network error in batch {batch_idx + 1}: {type(e).__name__}: {e}"
-                    logger.warning(f"_send_spans
+                    logger.warning(f"_send_spans: {error_msg} - will retry on next flush")
                     errors.append((batch_idx + 1, error_msg))
                     # Continue with other batches
                 except RuntimeError as e:
                     if self._is_interpreter_shutdown_error(e):
                         if self.shutdown_requested:
-                            logger.debug(f"_send_spans
+                            logger.debug(f"_send_spans: skipped due to interpreter shutdown: {e}")
                         else:
-                            logger.warning(f"_send_spans
+                            logger.warning(f"_send_spans: interrupted by interpreter shutdown: {e}")
                         raise
                     error_msg = f"RuntimeError in batch {batch_idx + 1}: {type(e).__name__}: {e}"
-                    logger.error(f"_send_spans
+                    logger.error(f"_send_spans: {error_msg}")
                     errors.append((batch_idx + 1, error_msg))
                     # Continue with other batches
                 except Exception as e:
                     error_msg = f"Exception in batch {batch_idx + 1}: {type(e).__name__}: {e}"
-                    logger.error(f"_send_spans
+                    logger.error(f"_send_spans: {error_msg}")
                     errors.append((batch_idx + 1, error_msg))
                     # Continue with other batches
@@ -568,7 +663,7 @@ class AIQASpanExporter(SpanExporter):
             error_summary = "; ".join([f"batch {idx}: {msg}" for idx, msg in errors])
             raise Exception(f"Failed to send some spans: {error_summary}")
 
-        logger.debug(f"_send_spans
+        logger.debug(f"_send_spans: successfully sent all {len(spans)} spans in {len(batches)} batch(es)")
 
     def _send_spans_sync(self, spans: List[Dict[str, Any]]) -> None:
         """Send spans to the server API (synchronous, for shutdown scenarios). Batches large payloads automatically."""
@@ -582,10 +677,9 @@ class AIQASpanExporter(SpanExporter):
         url = self._get_span_url()
         headers = self._build_request_headers()
 
-        if self.
-        logger.
-
-        logger.debug("_send_spans_sync() no API key provided")
+        if not self._api_key:
+            logger.error(f"_send_spans_sync() fail - no API key provided")
+            return
 
         errors = []
         for batch_idx, batch in enumerate(batches):
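The synchronous transport body is not shown in this diff. For illustration only, a shutdown-time send of one batch could look like this stdlib-only sketch (hypothetical `post_batch_sync`; the package's actual implementation may differ):

```python
# Sketch of a synchronous JSON POST for the shutdown path, stdlib only.
import json
import urllib.request


def post_batch_sync(url: str, headers: dict, batch: list, timeout: float = 30.0) -> None:
    """Synchronous JSON POST; urlopen raises HTTPError/URLError on failure."""
    body = json.dumps(batch).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=body,
        method="POST",
        headers={**headers, "Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout):
        pass  # success if no exception was raised
```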
@@ -616,64 +710,63 @@ class AIQASpanExporter(SpanExporter):
 
     def shutdown(self) -> None:
         """Shutdown the exporter, flushing any remaining spans. Call before process exit."""
-        logger.info("shutdown
+        logger.info(f"shutdown: called - initiating exporter shutdown")
         self.shutdown_requested = True
 
         # Check buffer state before shutdown
         with self.buffer_lock:
             buffer_size = len(self.buffer)
-        logger.info(f"shutdown
+        logger.info(f"shutdown: buffer contains {buffer_size} span(s) before shutdown")
 
         # Wait for flush thread to finish (it will do final flush)
         # Only wait if thread was actually started
         if self._auto_flush_started and self.flush_timer and self.flush_timer.is_alive():
-            logger.info("shutdown
+            logger.info(f"shutdown: waiting for auto-flush thread to complete (timeout=10s)")
             self.flush_timer.join(timeout=10.0)
             if self.flush_timer.is_alive():
-                logger.warning("shutdown
+                logger.warning(f"shutdown: auto-flush thread did not complete within timeout")
             else:
-                logger.info("shutdown
+                logger.info(f"shutdown: auto-flush thread completed")
         else:
-            logger.debug("shutdown
+            logger.debug(f"shutdown: no active auto-flush thread to wait for")
 
         # Final flush attempt (use synchronous send to avoid event loop issues)
         with self.flush_lock:
-            logger.debug("shutdown
+            logger.debug(f"shutdown: performing final flush with synchronous send")
             # Atomically extract and remove spans to prevent race conditions
             spans_to_flush = self._extract_and_remove_spans_from_buffer()
-            logger.debug(f"shutdown
+            logger.debug(f"shutdown: extracted {len(spans_to_flush)} span(s) from buffer for final flush")
 
             if spans_to_flush:
-                if not self.
-                    logger.warning(
-                        f"shutdown() skipping final flush: AIQA_SERVER_URL is not set. "
+                if not self._server_url:
+                    logger.warning(f"shutdown: skipping final flush: AIQA_SERVER_URL is not set. "
                         f"{len(spans_to_flush)} span(s) will not be sent."
                     )
                     # Spans already removed from buffer, clear their keys to free memory
                     self._remove_span_keys_from_tracking(spans_to_flush)
                 else:
-                    logger.info(f"shutdown
+                    logger.info(f"shutdown: sending {len(spans_to_flush)} span(s) to server (synchronous)")
                     try:
                         self._send_spans_sync(spans_to_flush)
-                        logger.info(f"shutdown
+                        logger.info(f"shutdown: successfully sent {len(spans_to_flush)} span(s) to server")
                         # Spans already removed from buffer during extraction
                         # Clear their keys from tracking set to free memory
                         self._remove_span_keys_from_tracking(spans_to_flush)
                     except Exception as e:
-                        logger.error(f"shutdown
+                        logger.error(f"shutdown: failed to send spans: {e}")
                         # Spans already removed, but process is exiting anyway
-                        logger.warning(f"shutdown
+                        logger.warning(f"shutdown: {len(spans_to_flush)} span(s) were not sent due to error")
                        # Keys will remain in tracking set, but process is exiting so memory will be freed
             else:
-                logger.debug("shutdown
+                logger.debug(f"shutdown: no spans to flush")
 
         # Check buffer state after shutdown
         with self.buffer_lock:
             buffer_size = len(self.buffer)
             if buffer_size > 0:
-                logger.warning(f"shutdown
+                logger.warning(f"shutdown: buffer still contains {buffer_size} span(s) after shutdown")
             else:
-                logger.info("shutdown
+                logger.info(f"shutdown: buffer is empty after shutdown")
 
-        logger.info("shutdown
+        logger.info(f"shutdown: completed")
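Since `shutdown()` performs the final flush with the synchronous sender (no event loop required), callers constructing the exporter directly should make sure it runs before the process exits — for example via `atexit` (illustrative; the package's client may already manage this lifecycle):

```python
# Sketch: guarantee a final flush on interpreter exit.
import atexit

from aiqa.aiqa_exporter import AIQASpanExporter

exporter = AIQASpanExporter()
atexit.register(exporter.shutdown)  # flushes remaining spans via _send_spans_sync on exit
```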