struct-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
struct_sdk/core.py ADDED
@@ -0,0 +1,755 @@
1
+ """Core SDK — wraps OpenTelemetry with Struct-specific defaults.
2
+
3
+ Users call struct.init() once at startup. The SDK auto-instruments any
4
+ supported libraries that are installed (anthropic, claude_agent_sdk).
5
+
6
+ OTel GenAI Semantic Conventions v1.37+ compliant.
7
+ """
8
+
9
+ import asyncio
10
+ import atexit
11
+ import contextvars
12
+ import functools
13
+ import json
14
+ import logging
15
+ import threading
16
+ import uuid
17
+ from enum import Enum
18
+ from typing import Any, Callable, Optional
19
+
20
+ from opentelemetry import trace
21
+ from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter
22
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
23
+ from opentelemetry.sdk._logs import LoggerProvider
24
+ from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
25
+ from opentelemetry.sdk.resources import Resource
26
+ from opentelemetry.sdk.trace import TracerProvider
27
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
28
+ from opentelemetry.trace import StatusCode
29
+
30
# Module-level logger used by every component in this file. Applications can
# tune SDK verbosity through the standard ``logging`` configuration for the
# "struct_sdk" logger name.
logger = logging.getLogger("struct_sdk")

# Ingest base URL used when init() is not given an explicit ``endpoint``.
DEFAULT_ENDPOINT = "https://ingest.struct.ai"
33
+
34
+
35
class ContentCaptureMode(str, Enum):
    """Where (if anywhere) LLM message content is recorded in telemetry.

    Follows the OTel GenAI spec content capture modes. Members are also
    ``str`` instances, so each one compares equal to its string value.
    """

    # Record no message content at all.
    NONE = "none"
    # Record content in log events only (the SDK default).
    EVENT_ONLY = "event_only"
    # Record content in span attributes only (legacy).
    SPAN_ONLY = "span_only"
    # Record content in both log events and span attributes.
    SPAN_AND_EVENT = "span_and_event"
44
+
45
# Session context shared with child spans. An agent scope sets these when it
# starts and restores the previous values when it ends.
_current_session_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
    "_current_session_id",
    default=None,
)
_current_conversation_id: contextvars.ContextVar[Optional[str]] = contextvars.ContextVar(
    "_current_conversation_id",
    default=None,
)
_current_agent_span: contextvars.ContextVar[Optional[trace.Span]] = contextvars.ContextVar(
    "_current_agent_span",
    default=None,
)

# Pending tool_use ids, keyed by tool name, consumed first-in-first-out per
# name.
#   * Filled by the Anthropic monkey-patch when a chat response contains
#     tool_use blocks.
#   * Drained by @struct.tool() / struct.tool(...) when the decorated function
#     or context manager starts its span.
# This auto-links execute_tool spans to the originating tool_use.id so callers
# do not have to pass tool_call_id= themselves. An explicit tool_call_id=
# still wins, so existing callers keep working unchanged.
_pending_tool_calls: contextvars.ContextVar[Optional[dict[str, list[str]]]] = contextvars.ContextVar(
    "_pending_tool_calls",
    default=None,
)

# Names of integrations that have already been patched; guards against
# patching the same library twice.
_patched_integrations: set[str] = set()

# Call sites that have already emitted their first-failure WARN. Later
# failures at the same site are logged at DEBUG instead.
_first_failure_logged: set[str] = set()
64
+
65
+
66
def _safe(fn: Callable[[], None], *, site: str) -> None:
    """Invoke ``fn`` and never let an ``Exception`` escape.

    Args:
        fn: Zero-argument callable to run.
        site: Label identifying the call site. The first failure recorded for
            a site is logged at WARNING with its traceback; every later
            failure at the same site is logged at DEBUG.
    """
    try:
        fn()
    except Exception:
        already_reported = site in _first_failure_logged
        if not already_reported:
            # Mark the site before logging, so it counts as reported from
            # here on.
            _first_failure_logged.add(site)
        emit = logger.debug if already_reported else logger.warning
        emit("Struct SDK suppressed exception at %s", site, exc_info=True)
76
+
77
+
78
class StructSDK:
    """Global SDK instance. Call init() once at startup.

    Owns an isolated OpenTelemetry TracerProvider and LoggerProvider (neither
    is installed as the process-wide global) and exposes the ``agent()`` and
    ``tool()`` decorators / context managers built on top of them.

    Note:
        ``agent()`` and ``tool()`` look at the initialization state at the
        moment they are called. A decorator applied before ``init()`` has
        succeeded hands back the undecorated function, so that function stays
        uninstrumented even if ``init()`` runs later. Call ``init()`` before
        importing modules that apply these decorators at import time.
    """

    def __init__(self):
        self._initialized = False
        self._tracer_provider: Optional[TracerProvider] = None
        self._logger_provider: Optional[LoggerProvider] = None
        self._ingest_key: str = ""
        self._endpoint: str = DEFAULT_ENDPOINT
        self._content_capture: ContentCaptureMode = ContentCaptureMode.EVENT_ONLY
        # Atexit/_shutdown reads this even when init() was never called or
        # failed before reaching the assignment in init() — keep it set here.
        self._shutdown_timeout_seconds: float = 5.0
        # Serializes init() so concurrent callers cannot both pass the
        # early-return check and double-run the init body. A second concurrent
        # init blocks on this lock instead of racing.
        self._init_lock: threading.Lock = threading.Lock()

    @staticmethod
    def _resolve_content_capture(
        capture_content: bool,
        content_capture: Any,
    ) -> ContentCaptureMode:
        """Turn the two user-facing capture settings into one enum member.

        ``content_capture`` wins over the deprecated ``capture_content`` bool.
        It may be a ``ContentCaptureMode`` or its string value (for example
        ``"span_only"``). An unrecognized value disables content capture
        rather than guessing, and is reported once at WARNING.
        """
        if content_capture is not None:
            try:
                # Enum lookup by value: returns the member unchanged when one
                # is passed, and maps a plain string to its member.
                return ContentCaptureMode(content_capture)
            except (ValueError, TypeError):
                logger.warning(
                    "Struct SDK: unknown content_capture=%r; content capture disabled",
                    content_capture,
                )
                return ContentCaptureMode.NONE
        if not capture_content:
            return ContentCaptureMode.NONE
        return ContentCaptureMode.EVENT_ONLY

    def init(
        self,
        ingest_key: str,
        *,
        service_name: str = "default-agent",
        service_version: str = "0.0.0",
        environment: str = "development",
        endpoint: str = DEFAULT_ENDPOINT,
        shutdown_timeout_seconds: float = 5.0,
        capture_content: bool = True,
        content_capture: Optional[ContentCaptureMode] = None,
    ):
        """Initialize the Struct SDK and auto-instrument installed libraries.

        After calling this, any installed supported library is automatically
        instrumented — no additional setup required. A second call is ignored
        with a warning. Failures while building the telemetry pipeline are
        logged and leave the SDK disabled; they are not raised to the caller.

        Auto-detected integrations:
          - ``anthropic`` — patches messages.create() and messages.stream()
          - ``claude_agent_sdk`` — patches ClaudeAgentOptions to inject OTel env vars
          - ``langchain_core`` — patched through ``struct_sdk.langchain``

        Args:
            ingest_key: Your Struct ingest key (pk-...). Write-only, safe to expose.
            service_name: Name of your agent/service (e.g., "checkout-agent").
            service_version: Version string for your agent.
            environment: Deployment environment (development, staging, production).
            endpoint: Struct ingestion endpoint. Override for self-hosted or local dev.
            shutdown_timeout_seconds: Maximum time the SDK's atexit shutdown is
                allowed to spend flushing telemetry providers. If the ingest
                endpoint is dead or slow, shutdown returns within this budget
                rather than hanging the user's process exit. Defaults to 5.0.
            capture_content: Whether to capture LLM prompts/completions. Disable for privacy.
                Deprecated — use ``content_capture`` instead.
            content_capture: Controls how LLM message content is captured.
                Takes precedence over ``capture_content``. Accepts a
                ``ContentCaptureMode`` or its string value. Defaults to EVENT_ONLY.
        """
        # Serialize init() so concurrent callers cannot both pass the
        # early-return check and double-run the body (which would orphan the
        # first provider without ever shutting it down). The single-threaded
        # healthy path is unaffected: the lock is uncontended.
        with self._init_lock:
            if self._initialized:
                logger.warning("struct.init() called multiple times — ignoring")
                return

            self._ingest_key = ingest_key
            self._endpoint = endpoint
            self._content_capture = self._resolve_content_capture(
                capture_content, content_capture
            )

            # Tracked outside the try so the failure path can stop whatever
            # was already built.
            span_processor = None
            log_processor = None
            try:
                # Stored before atexit registers _shutdown so the hook always sees
                # the user-configured timeout (or the default).
                self._shutdown_timeout_seconds = shutdown_timeout_seconds

                headers = {"x-struct-ingest-key": ingest_key}
                base_url = endpoint.rstrip("/")

                # --- TracerProvider ---
                span_exporter = OTLPSpanExporter(
                    endpoint=f"{base_url}/v1/traces",
                    headers=headers,
                )
                span_processor = BatchSpanProcessor(
                    span_exporter,
                    max_queue_size=10000,
                    max_export_batch_size=100,
                    schedule_delay_millis=1000,
                )

                # Isolated TracerProvider — NOT set as the global.
                resource = Resource.create(
                    {
                        "service.name": service_name,
                        "service.version": service_version,
                        "deployment.environment": environment,
                    }
                )
                # shutdown_on_exit=False: the SDK's own atexit hook
                # (_shutdown) is the single, time-bounded shutdown path. A
                # provider-registered atexit hook would run an additional,
                # unbounded shutdown after ours gave up.
                self._tracer_provider = TracerProvider(
                    resource=resource,
                    shutdown_on_exit=False,
                )
                self._tracer_provider.add_span_processor(span_processor)

                # --- LoggerProvider (for gen_ai message events) ---
                log_exporter = OTLPLogExporter(
                    endpoint=f"{base_url}/v1/logs",
                    headers=headers,
                )
                log_processor = BatchLogRecordProcessor(
                    log_exporter,
                    max_queue_size=10000,
                    max_export_batch_size=100,
                    schedule_delay_millis=1000,
                )
                self._logger_provider = LoggerProvider(
                    resource=resource,
                    shutdown_on_exit=False,
                )
                self._logger_provider.add_log_record_processor(log_processor)

                self._initialized = True

                self._auto_instrument()
            except Exception:
                logger.warning(
                    "Struct SDK init failed; SDK disabled: service=%s endpoint=%s",
                    service_name, endpoint, exc_info=True,
                )
                # Stop any processor that was already constructed so it does
                # not outlive the references we are about to drop.
                for processor in (log_processor, span_processor):
                    if processor is None:
                        continue
                    try:
                        processor.shutdown()
                    except Exception:
                        logger.debug(
                            "Struct SDK init rollback: processor shutdown failed",
                            exc_info=True,
                        )
                self._initialized = False
                self._tracer_provider = None
                self._logger_provider = None
                return

            atexit.register(self._shutdown)

            logger.info(
                "Struct SDK initialized: service=%s endpoint=%s content_capture=%s",
                service_name, endpoint, self._content_capture.value,
            )

    def _auto_instrument(self):
        """Detect and patch installed libraries.

        Each integration module is imported and its ``patch(sdk)`` is called.
        An ImportError means the integration is unavailable and is skipped
        silently; any other failure is logged at DEBUG. Successfully patched
        libraries are recorded so they are never patched twice.
        """
        import importlib

        integrations = [
            ("anthropic", "struct_sdk.anthropic"),
            ("claude_agent_sdk", "struct_sdk.claude_agent"),
            ("langchain_core", "struct_sdk.langchain"),
        ]

        for lib_name, module_path in integrations:
            if lib_name in _patched_integrations:
                continue
            try:
                mod = importlib.import_module(module_path)
                mod.patch(self)  # type: ignore[attr-defined]
                _patched_integrations.add(lib_name)
                logger.info("Auto-instrumented: %s", lib_name)
            except ImportError:
                pass
            except Exception:
                logger.debug("Failed to instrument %s", lib_name, exc_info=True)

    def get_tracer(self, name: str = "struct-sdk") -> trace.Tracer:
        """Get an OTel tracer from our isolated provider.

        Raises:
            RuntimeError: If the SDK has no tracer provider (init() was not
                called or did not succeed).
        """
        if self._tracer_provider is None:
            raise RuntimeError("Call struct.init() before using the SDK")
        return self._tracer_provider.get_tracer(name)

    def get_logger(self, name: str = "struct-sdk") -> Any:
        """Get an OTel logger from our isolated provider (for gen_ai log events).

        Raises:
            RuntimeError: If the SDK has no logger provider (init() was not
                called or did not succeed).
        """
        if self._logger_provider is None:
            raise RuntimeError("Call struct.init() before using the SDK")
        return self._logger_provider.get_logger(name)

    @property
    def capture_content(self) -> bool:
        """Backward-compatible property. True if any content capture is enabled."""
        return self._content_capture != ContentCaptureMode.NONE

    @property
    def content_capture(self) -> ContentCaptureMode:
        """The active content capture mode."""
        return self._content_capture

    @property
    def emit_events(self) -> bool:
        """True if content should be emitted as log events."""
        return self._content_capture in (ContentCaptureMode.EVENT_ONLY, ContentCaptureMode.SPAN_AND_EVENT)

    @property
    def emit_span_content(self) -> bool:
        """True if content should be set as span attributes."""
        return self._content_capture in (ContentCaptureMode.SPAN_ONLY, ContentCaptureMode.SPAN_AND_EVENT)

    # ── Decorators / Context Managers ──

    def agent(
        self,
        fn: Any = None,
        *,
        name: Optional[str] = None,
        session_id: Optional[str] = None,
        agent_id: Optional[str] = None,
        version: Optional[str] = None,
        metadata: Optional[dict[str, str]] = None,
    ) -> Any:
        """Mark a function or block as an agent session.

        Creates an ``invoke_agent`` span per the OTel GenAI spec that groups
        all LLM and tool calls within the scope.

        When the SDK is not initialized this is a pass-through: a function
        argument is returned unchanged, and the no-argument form returns a
        no-op context manager / decorator.

        Usage::

            @struct.agent()
            async def my_agent():
                response = await client.messages.create(...)

            async with struct.agent(session_id=agent_id, name="checkout"):
                ...

        Raises:
            TypeError: If ``fn`` is given but is not callable.
        """
        if not self._initialized:
            return fn if fn is not None else _NoOpContext()

        ctx = _AgentContext(self, name=name, session_id=session_id, agent_id=agent_id, version=version, metadata=metadata)
        if fn is None:
            return ctx
        if callable(fn):
            return ctx(fn)
        raise TypeError("agent() argument must be callable or used as context manager")

    def tool(
        self,
        fn: Any = None,
        *,
        name: Optional[str] = None,
        tool_call_id: Optional[str] = None,
    ) -> Any:
        """Mark a function or block as a tool execution.

        Creates an ``execute_tool`` span per the OTel GenAI spec.

        ``gen_ai.tool.call.id`` is populated automatically when this is invoked
        after a patched Anthropic call flow — the SDK stashes each ``tool_use.id``
        from the assistant response and pops the first matching tool name when
        this span starts. Pass ``tool_call_id=`` explicitly to override.

        When the SDK is not initialized this is a pass-through: a function
        argument is returned unchanged, and the no-argument form returns a
        no-op context manager / decorator.

        Usage::

            @struct.tool()
            async def search_investigations(query: str, limit: int):
                return await repo.search(query, limit)

            async with struct.tool(name="search"):
                result = await do_search()

        Raises:
            TypeError: If ``fn`` is given but is not callable.
        """
        if not self._initialized:
            return fn if fn is not None else _NoOpContext()

        ctx = _ToolContext(self, name=name, tool_call_id=tool_call_id)
        if fn is None:
            return ctx
        if callable(fn):
            return ctx(fn)
        raise TypeError("tool() argument must be callable or used as context manager")

    def _shutdown(self) -> None:
        """Best-effort shutdown bounded by ``shutdown_timeout_seconds``.

        Runs ``provider.shutdown()`` in a daemon worker thread and waits up
        to the configured timeout for it to finish. On timeout, the daemon
        is abandoned (it dies with the process). If the worker thread cannot
        be started, the providers are shut down inline instead so buffered
        telemetry still gets a chance to flush. On any other failure we
        swallow the exception: this runs in atexit context where propagating
        to the interpreter would mean the user's process exits with a
        confusing traceback for telemetry the SDK was supposed to hide.
        """
        try:
            worker = threading.Thread(
                target=self._do_shutdown,
                name="struct-sdk-shutdown",
                daemon=True,
            )
            try:
                worker.start()
            except RuntimeError:
                # Newer CPython releases can refuse to start threads once
                # interpreter shutdown has begun. Fall back to a synchronous
                # shutdown; NOTE(review): in this path the wait is governed
                # by the exporters' own timeouts, not by
                # shutdown_timeout_seconds.
                logger.debug(
                    "Struct SDK could not start shutdown thread; shutting down inline",
                    exc_info=True,
                )
                self._do_shutdown()
                return
            worker.join(timeout=self._shutdown_timeout_seconds)
            if worker.is_alive():
                # Thread keeps running; daemon=True so it dies with the
                # process. We log at DEBUG: stderr WARNs at atexit time
                # clutter user output for a fault we already handled.
                logger.debug(
                    "Struct SDK shutdown timed out after %.1fs (ingest may be unreachable)",
                    self._shutdown_timeout_seconds,
                )
        except Exception:
            logger.debug("Struct SDK shutdown failed", exc_info=True)

    def _do_shutdown(self) -> None:
        """Worker that calls each provider's shutdown(); each call is isolated.

        Order matches the previous behavior (logger first, tracer second) so
        any consumer that depended on relative ordering keeps working.
        """
        if self._logger_provider is not None:
            try:
                self._logger_provider.shutdown()
            except Exception:
                logger.debug("LoggerProvider shutdown failed", exc_info=True)
        if self._tracer_provider is not None:
            try:
                self._tracer_provider.shutdown()
            except Exception:
                logger.debug("TracerProvider shutdown failed", exc_info=True)
391
+
392
+
393
+ # ---------------------------------------------------------------------------
394
+ # No-op context (when SDK not initialized)
395
+ # ---------------------------------------------------------------------------
396
+
397
class _NoOpContext:
    """Inert stand-in returned while the SDK is not initialized.

    Works in every position the real contexts do (decorator, ``with`` and
    ``async with``) and does nothing in any of them.
    """

    def __call__(self, fn: Any) -> Any:
        # Decorator form: hand the function back untouched.
        return fn

    def __enter__(self) -> "_NoOpContext":
        return self

    def __exit__(self, *_exc_info: Any) -> None:
        # Returning None (falsy) lets any in-flight exception propagate.
        return None

    async def __aenter__(self) -> "_NoOpContext":
        return self.__enter__()

    async def __aexit__(self, *_exc_info: Any) -> None:
        return self.__exit__(*_exc_info)
409
+
410
+
411
+ # ---------------------------------------------------------------------------
412
+ # Agent context — invoke_agent span (GenAI spec)
413
+ # ---------------------------------------------------------------------------
414
+
415
class _AgentContext:
    """Creates an ``invoke_agent {name}`` span per the OTel GenAI spec.

    Usable as a sync or async context manager, or as a decorator. While the
    scope is active it publishes the session id, conversation id, agent span
    and a fresh pending-tool-calls dict through module-level context
    variables, and restores the previous values on exit. All telemetry work
    goes through ``_safe`` so a telemetry fault never reaches the user's code.
    """

    def __init__(
        self,
        sdk: StructSDK,
        *,
        name: Optional[str] = None,
        session_id: Optional[str] = None,
        agent_id: Optional[str] = None,
        version: Optional[str] = None,
        metadata: Optional[dict[str, str]] = None,
    ):
        self._sdk = sdk
        self._name = name
        # Keep what the caller actually passed, so the decorator form can tell
        # a caller-pinned session apart from an SDK-generated one.
        self._explicit_session_id = session_id
        self._session_id = session_id or str(uuid.uuid4())
        self._agent_id = agent_id
        self._version = version
        self._metadata = metadata
        self._span: Optional[trace.Span] = None
        self._ctx_manager: Optional[Any] = None
        self._session_token: Optional[contextvars.Token[Optional[str]]] = None
        self._conversation_token: Optional[contextvars.Token[Optional[str]]] = None
        self._agent_span_token: Optional[contextvars.Token[Optional[trace.Span]]] = None
        self._pending_tool_token: Optional[contextvars.Token[Optional[dict[str, list[str]]]]] = None

    def __call__(self, fn: Any) -> Any:
        """Use as decorator.

        Every call of the wrapped function runs inside its own agent scope.
        The span name defaults to the function's ``__name__``. A session id
        passed by the caller is reused for every call; otherwise each call
        gets a freshly generated id, so separate invocations are not merged
        into a single conversation.
        """
        span_name = self._name or fn.__name__
        sdk = self._sdk
        # Forward only a caller-supplied id. Forwarding the id generated in
        # __init__ would stamp the same conversation id on every invocation.
        session_id = self._explicit_session_id
        agent_id = self._agent_id
        version = self._version
        metadata = self._metadata

        if asyncio.iscoroutinefunction(fn):
            @functools.wraps(fn)
            async def wrapper(*args: Any, **kwargs: Any) -> Any:
                async with _AgentContext(sdk, name=span_name, session_id=session_id, agent_id=agent_id, version=version, metadata=metadata):
                    return await fn(*args, **kwargs)
            return wrapper
        else:
            @functools.wraps(fn)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                with _AgentContext(sdk, name=span_name, session_id=session_id, agent_id=agent_id, version=version, metadata=metadata):
                    return fn(*args, **kwargs)
            return wrapper

    def _start_span(self) -> None:
        """Start the span and publish session context; roll back on failure."""
        started = False
        entered = False

        def body() -> None:
            nonlocal started, entered
            agent_name = self._name or "agent"
            tracer = self._sdk.get_tracer("struct-sdk")

            # Capture the outer session id BEFORE overwriting the contextvar so we
            # can link nested agents (subagents) back to the agent that spawned them.
            # Subagent pattern: an outer @struct.agent() wraps a function; that function
            # calls a tool that itself enters another @struct.agent() scope. The inner
            # scope's struct.agent.parent_session_id points to the outer session_id.
            parent_session_id = _current_session_id.get(None)

            self._span = tracer.start_span(
                f"invoke_agent {agent_name}",
                kind=trace.SpanKind.INTERNAL,
            )
            # Required
            self._span.set_attribute("gen_ai.operation.name", "invoke_agent")
            self._span.set_attribute("gen_ai.provider.name", "struct")
            # Conditionally required
            self._span.set_attribute("gen_ai.agent.name", agent_name)
            # gen_ai.agent.id is the stable agent-definition identifier per the
            # OTel GenAI spec — not a per-invocation value. Only set it when the
            # caller supplies one.
            if self._agent_id:
                self._span.set_attribute("gen_ai.agent.id", self._agent_id)
            if self._version:
                self._span.set_attribute("gen_ai.agent.version", self._version)
            # gen_ai.conversation.id is the spec-blessed name; we drop the
            # redundant session.id.
            self._span.set_attribute("gen_ai.conversation.id", self._session_id)
            # Link to the outer agent's session, if we're nested under one.
            if parent_session_id and parent_session_id != self._session_id:
                self._span.set_attribute("struct.agent.parent_session_id", parent_session_id)
            # Custom metadata
            if self._metadata:
                for key, value in self._metadata.items():
                    self._span.set_attribute(f"struct.metadata.{key}", value)

            self._ctx_manager = trace.use_span(self._span, end_on_exit=False)
            self._ctx_manager.__enter__()
            # Tracks whether the OTel context stack was actually pushed; only
            # then is it correct to call __exit__ on rollback. If body raised
            # between assigning self._ctx_manager and __enter__ returning,
            # nothing was pushed and __exit__ would corrupt the stack.
            entered = True
            # Set context vars so child spans inherit session context
            self._session_token = _current_session_id.set(self._session_id)
            self._conversation_token = _current_conversation_id.set(self._session_id)
            self._agent_span_token = _current_agent_span.set(self._span)
            # Fresh pending-tool-calls dict scoped to this agent run, so tool_use
            # ids from an outer agent cannot leak in or out.
            self._pending_tool_token = _pending_tool_calls.set({})
            started = True

        _safe(body, site="agent.start_span")
        if started:
            return

        # body() raised partway. Roll back any partial state so __exit__ /
        # _end_span see a clean "no telemetry" view: tokens are reset
        # best-effort, the OTel context stack is popped if it was pushed,
        # the span is ended if it was started, and references are dropped.
        rollback_plan = (
            ("_pending_tool_token", _pending_tool_calls, "agent.start_span.reset_pending_tool"),
            ("_agent_span_token", _current_agent_span, "agent.start_span.reset_agent_span"),
            ("_conversation_token", _current_conversation_id, "agent.start_span.reset_conversation"),
            ("_session_token", _current_session_id, "agent.start_span.reset_session"),
        )
        for attr, var, site in rollback_plan:
            token = getattr(self, attr)
            if token is not None:
                _safe(functools.partial(var.reset, token), site=site)
            setattr(self, attr, None)
        # Pop the OTel context stack first — use_span's __exit__ depends on
        # the span still being current. Only call it if __enter__ ran.
        ctx = self._ctx_manager
        if entered and ctx is not None:
            _safe(lambda: ctx.__exit__(None, None, None),
                  site="agent.start_span.rollback_ctx_exit")
        self._ctx_manager = None
        # Then end the span so it isn't leaked unended.
        span = self._span
        if span is not None:
            _safe(span.end, site="agent.start_span.rollback_span_end")
        self._span = None

    def _end_span(self, exc_val: Any = None) -> None:
        """Restore context variables, record the outcome and end the span.

        Args:
            exc_val: Exception that left the scope, or None on a clean exit.
        """
        # Contextvar resets must always run — they're cheap, can't fault on the
        # span, and leaving them set leaks session context into the caller.
        resets = (
            (self._pending_tool_token, _pending_tool_calls, "agent.end_span.reset_pending_tool"),
            (self._agent_span_token, _current_agent_span, "agent.end_span.reset_agent_span"),
            (self._session_token, _current_session_id, "agent.end_span.reset_session"),
            (self._conversation_token, _current_conversation_id, "agent.end_span.reset_conversation"),
        )
        for token, var, site in resets:
            if token is not None:
                _safe(functools.partial(var.reset, token), site=site)

        span = self._span
        if span is not None:
            # Identity check, not truthiness: an exception object must count
            # as a failure regardless of how it evaluates as a boolean.
            if exc_val is not None:
                _safe(lambda: span.set_attribute("error.type", type(exc_val).__name__),
                      site="agent.end_span.error_type")
                _safe(lambda: span.set_status(StatusCode.ERROR, str(exc_val)),
                      site="agent.end_span.error_status")
                _safe(lambda: span.record_exception(exc_val),
                      site="agent.end_span.record_exception")
            else:
                _safe(lambda: span.set_status(StatusCode.OK),
                      site="agent.end_span.set_ok")
            _safe(span.end, site="agent.end_span.end")
        ctx = self._ctx_manager
        if ctx is not None:
            _safe(lambda: ctx.__exit__(None, None, None), site="agent.end_span.ctx_exit")

    def __enter__(self) -> "_AgentContext":
        self._start_span()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self._end_span(exc_val)

    async def __aenter__(self) -> "_AgentContext":
        self._start_span()
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self._end_span(exc_val)

    @property
    def session_id(self) -> str:
        """Session id of this scope (caller-supplied or generated)."""
        return self._session_id
605
+
606
+
607
+ # ---------------------------------------------------------------------------
608
+ # Tool context — execute_tool span (GenAI spec)
609
+ # ---------------------------------------------------------------------------
610
+
611
class _ToolContext:
    """Creates an ``execute_tool {name}`` span per the OTel GenAI spec.

    Usable as a sync or async context manager, or as a decorator. All
    telemetry work goes through ``_safe`` so a telemetry fault never reaches
    the user's code.
    """

    def __init__(self, sdk: StructSDK, *, name: Optional[str] = None, tool_call_id: Optional[str] = None):
        self._sdk = sdk
        self._name = name
        self._tool_call_id = tool_call_id
        self._span: Optional[trace.Span] = None
        self._ctx_manager: Optional[Any] = None
        self._result: Any = None

    def __call__(self, fn: Any) -> Any:
        """Use as decorator. Captures function args and return value.

        Capture only happens while content capture is enabled on the SDK.
        Only keyword arguments are recorded; positional arguments are not.
        A ``None`` return value is not recorded.
        """
        tool_name = self._name or fn.__name__
        sdk = self._sdk
        tool_call_id = self._tool_call_id

        if asyncio.iscoroutinefunction(fn):
            @functools.wraps(fn)
            async def wrapper(*args: Any, **kwargs: Any) -> Any:
                async with _ToolContext(sdk, name=tool_name, tool_call_id=tool_call_id) as ctx:
                    ctx._capture_arguments(kwargs)
                    result = await fn(*args, **kwargs)
                    ctx._capture_result(result)
                    return result
            return wrapper
        else:
            @functools.wraps(fn)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                with _ToolContext(sdk, name=tool_name, tool_call_id=tool_call_id) as ctx:
                    ctx._capture_arguments(kwargs)
                    result = fn(*args, **kwargs)
                    ctx._capture_result(result)
                    return result
            return wrapper

    def _capture_arguments(self, kwargs: dict) -> None:
        """Record keyword arguments when content capture is on (opt-in)."""
        if self._sdk.capture_content and kwargs:
            self._set_arguments(kwargs)

    def _capture_result(self, result: Any) -> None:
        """Record a non-None return value when content capture is on (opt-in)."""
        if self._sdk.capture_content and result is not None:
            self._set_result(result)

    def _start_span(self) -> None:
        """Start the span and make it current; roll back on failure."""
        started = False
        entered = False

        def body() -> None:
            nonlocal started, entered
            tool_name = self._name or "tool"
            tracer = self._sdk.get_tracer("struct-sdk")
            self._span = tracer.start_span(
                f"execute_tool {tool_name}",
                kind=trace.SpanKind.INTERNAL,
            )
            # Required
            self._span.set_attribute("gen_ai.operation.name", "execute_tool")
            self._span.set_attribute("gen_ai.provider.name", "struct")
            # Recommended
            self._span.set_attribute("gen_ai.tool.name", tool_name)
            # Auto-link to the originating tool_use.id from the preceding chat
            # response if the caller didn't pass one explicitly. Explicit
            # tool_call_id= always wins — pop from the pending queue only when
            # the caller left it unset.
            if self._tool_call_id is None:
                pending = _pending_tool_calls.get()
                if pending:
                    ids = pending.get(tool_name)
                    if ids:
                        # FIFO: the oldest pending id for this tool name.
                        self._tool_call_id = ids.pop(0)
            if self._tool_call_id:
                self._span.set_attribute("gen_ai.tool.call.id", self._tool_call_id)
            # Propagate session context
            session_id = _current_session_id.get(None)
            if session_id:
                self._span.set_attribute("gen_ai.conversation.id", session_id)

            self._ctx_manager = trace.use_span(self._span, end_on_exit=False)
            self._ctx_manager.__enter__()
            # Tracks whether the OTel context stack was actually pushed; only
            # then is it correct to call __exit__ on rollback.
            entered = True
            started = True

        _safe(body, site="tool.start_span")
        if started:
            return

        # Body raised partway. Pop the OTel context stack if it was pushed,
        # end the span if it was started, then drop references so
        # _end_span sees a clean "no telemetry" view.
        ctx = self._ctx_manager
        if entered and ctx is not None:
            _safe(lambda: ctx.__exit__(None, None, None),
                  site="tool.start_span.rollback_ctx_exit")
        self._ctx_manager = None
        span = self._span
        if span is not None:
            _safe(span.end, site="tool.start_span.rollback_span_end")
        self._span = None

    def _end_span(self, exc_val: Any = None) -> None:
        """Record the outcome, end the span and pop it from the OTel context.

        Args:
            exc_val: Exception that left the scope, or None on a clean exit.
        """
        span = self._span
        if span is not None:
            # Identity check, not truthiness: an exception object must count
            # as a failure regardless of how it evaluates as a boolean.
            if exc_val is not None:
                _safe(lambda: span.set_attribute("error.type", type(exc_val).__name__),
                      site="tool.end_span.error_type")
                _safe(lambda: span.set_status(StatusCode.ERROR, str(exc_val)),
                      site="tool.end_span.error_status")
                _safe(lambda: span.record_exception(exc_val),
                      site="tool.end_span.record_exception")
            else:
                _safe(lambda: span.set_status(StatusCode.OK),
                      site="tool.end_span.set_ok")
            _safe(span.end, site="tool.end_span.end")
        ctx = self._ctx_manager
        if ctx is not None:
            _safe(lambda: ctx.__exit__(None, None, None), site="tool.end_span.ctx_exit")

    def _set_arguments(self, kwargs: dict) -> None:
        """Set tool call arguments (opt-in).

        The arguments are JSON-encoded (non-serializable values fall back to
        ``str``) and truncated to 8192 characters. Failures are suppressed
        and logged through ``_safe`` rather than silently discarded.
        """
        span = self._span
        if span is None:
            return
        _safe(
            lambda: span.set_attribute(
                "gen_ai.tool.call.arguments",
                json.dumps(kwargs, default=str)[:8192],
            ),
            site="tool.set_arguments",
        )

    def _set_result(self, result: Any) -> None:
        """Set tool call result (opt-in).

        The result is JSON-encoded (non-serializable values fall back to
        ``str``) and truncated to 8192 characters. Failures are suppressed
        and logged through ``_safe`` rather than silently discarded.
        """
        span = self._span
        if span is None:
            return
        _safe(
            lambda: span.set_attribute(
                "gen_ai.tool.call.result",
                json.dumps(result, default=str)[:8192],
            ),
            site="tool.set_result",
        )

    def __enter__(self) -> "_ToolContext":
        self._start_span()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self._end_span(exc_val)

    async def __aenter__(self) -> "_ToolContext":
        self._start_span()
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self._end_span(exc_val)