chuk-tool-processor 0.8__py3-none-any.whl → 0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chuk-tool-processor might be problematic. Click here for more details.

@@ -0,0 +1,343 @@
1
+ """
2
+ OpenTelemetry tracing integration for chuk-tool-processor.
3
+
4
+ Provides drop-in distributed tracing with standardized span names and attributes.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from contextlib import contextmanager
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ from chuk_tool_processor.logging import get_logger
13
+
14
+ if TYPE_CHECKING:
15
+ from opentelemetry.trace import Span, Tracer
16
+
17
+ logger = get_logger("chuk_tool_processor.observability.tracing")
18
+
19
+ # Global tracer instance
20
+ _tracer: Tracer | None = None
21
+ _tracing_enabled = False
22
+
23
+
24
+ def init_tracer(service_name: str = "chuk-tool-processor") -> Tracer | NoOpTracer:
25
+ """
26
+ Initialize OpenTelemetry tracer with best-practice configuration.
27
+
28
+ Args:
29
+ service_name: Service name for tracing
30
+
31
+ Returns:
32
+ Configured OpenTelemetry tracer or NoOpTracer if initialization fails
33
+ """
34
+ global _tracer, _tracing_enabled
35
+
36
+ try:
37
+ from opentelemetry import trace
38
+ from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
39
+ from opentelemetry.sdk.resources import Resource
40
+ from opentelemetry.sdk.trace import TracerProvider
41
+ from opentelemetry.sdk.trace.export import BatchSpanProcessor
42
+
43
+ # Create resource with service name
44
+ resource = Resource.create({"service.name": service_name})
45
+
46
+ # Create tracer provider
47
+ provider = TracerProvider(resource=resource)
48
+
49
+ # Add OTLP exporter (exports to OTEL collector)
50
+ otlp_exporter = OTLPSpanExporter()
51
+ provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
52
+
53
+ # Set as global tracer provider
54
+ trace.set_tracer_provider(provider)
55
+
56
+ _tracer = trace.get_tracer(__name__)
57
+ _tracing_enabled = True
58
+
59
+ logger.info(f"OpenTelemetry tracing initialized for service: {service_name}")
60
+ return _tracer
61
+
62
+ except ImportError as e:
63
+ logger.warning(f"OpenTelemetry packages not installed: {e}. Tracing disabled.")
64
+ _tracing_enabled = False
65
+ return NoOpTracer()
66
+
67
+
68
+ def get_tracer() -> Tracer | NoOpTracer:
69
+ """
70
+ Get the current tracer instance.
71
+
72
+ Returns:
73
+ OpenTelemetry tracer or no-op tracer if not initialized
74
+ """
75
+ if _tracer is None:
76
+ return NoOpTracer()
77
+ return _tracer
78
+
79
+
80
+ def is_tracing_enabled() -> bool:
81
+ """Check if tracing is enabled."""
82
+ return _tracing_enabled
83
+
84
+
85
+ @contextmanager
86
+ def trace_tool_execution(
87
+ tool: str,
88
+ namespace: str | None = None,
89
+ attributes: dict[str, Any] | None = None,
90
+ ):
91
+ """
92
+ Context manager for tracing tool execution.
93
+
94
+ Creates a span with name "tool.execute" and standard attributes.
95
+
96
+ Args:
97
+ tool: Tool name
98
+ namespace: Optional tool namespace
99
+ attributes: Additional span attributes
100
+
101
+ Example:
102
+ with trace_tool_execution("calculator", attributes={"operation": "add"}):
103
+ result = await tool.execute(a=5, b=3)
104
+ """
105
+ if not _tracing_enabled or _tracer is None:
106
+ yield None
107
+ return
108
+
109
+ span_name = "tool.execute"
110
+ span_attributes: dict[str, str | int | float | bool] = {
111
+ "tool.name": tool,
112
+ }
113
+
114
+ if namespace:
115
+ span_attributes["tool.namespace"] = namespace
116
+
117
+ if attributes:
118
+ # Flatten attributes with "tool." prefix
119
+ for key, value in attributes.items():
120
+ # Convert value to string for OTEL compatibility
121
+ if isinstance(value, (str, int, float, bool)):
122
+ span_attributes[f"tool.{key}"] = value
123
+ else:
124
+ span_attributes[f"tool.{key}"] = str(value)
125
+
126
+ with _tracer.start_as_current_span(span_name, attributes=span_attributes) as span:
127
+ yield span
128
+
129
+
130
+ @contextmanager
131
+ def trace_cache_operation(
132
+ operation: str,
133
+ tool: str,
134
+ hit: bool | None = None,
135
+ attributes: dict[str, Any] | None = None,
136
+ ):
137
+ """
138
+ Context manager for tracing cache operations.
139
+
140
+ Args:
141
+ operation: Cache operation (lookup, set, invalidate)
142
+ tool: Tool name
143
+ hit: Whether cache hit (for lookup operations)
144
+ attributes: Additional span attributes
145
+
146
+ Example:
147
+ with trace_cache_operation("lookup", "calculator", hit=True):
148
+ result = await cache.get(tool, key)
149
+ """
150
+ if not _tracing_enabled or _tracer is None:
151
+ yield None
152
+ return
153
+
154
+ span_name = f"tool.cache.{operation}"
155
+ span_attributes: dict[str, str | int | float | bool] = {
156
+ "tool.name": tool,
157
+ "cache.operation": operation,
158
+ }
159
+
160
+ if hit is not None:
161
+ span_attributes["cache.hit"] = hit
162
+
163
+ if attributes:
164
+ for key, value in attributes.items():
165
+ if isinstance(value, (str, int, float, bool)):
166
+ span_attributes[f"cache.{key}"] = value
167
+ else:
168
+ span_attributes[f"cache.{key}"] = str(value)
169
+
170
+ with _tracer.start_as_current_span(span_name, attributes=span_attributes) as span:
171
+ yield span
172
+
173
+
174
+ @contextmanager
175
+ def trace_retry_attempt(
176
+ tool: str,
177
+ attempt: int,
178
+ max_retries: int,
179
+ attributes: dict[str, Any] | None = None,
180
+ ):
181
+ """
182
+ Context manager for tracing retry attempts.
183
+
184
+ Args:
185
+ tool: Tool name
186
+ attempt: Current attempt number (0-indexed)
187
+ max_retries: Maximum retry attempts
188
+ attributes: Additional span attributes
189
+
190
+ Example:
191
+ with trace_retry_attempt("api_tool", attempt=1, max_retries=3):
192
+ result = await executor.execute([call])
193
+ """
194
+ if not _tracing_enabled or _tracer is None:
195
+ yield None
196
+ return
197
+
198
+ span_name = "tool.retry.attempt"
199
+ span_attributes: dict[str, str | int | float | bool] = {
200
+ "tool.name": tool,
201
+ "retry.attempt": attempt,
202
+ "retry.max_attempts": max_retries,
203
+ }
204
+
205
+ if attributes:
206
+ for key, value in attributes.items():
207
+ if isinstance(value, (str, int, float, bool)):
208
+ span_attributes[f"retry.{key}"] = value
209
+ else:
210
+ span_attributes[f"retry.{key}"] = str(value)
211
+
212
+ with _tracer.start_as_current_span(span_name, attributes=span_attributes) as span:
213
+ yield span
214
+
215
+
216
+ @contextmanager
217
+ def trace_circuit_breaker(
218
+ tool: str,
219
+ state: str,
220
+ attributes: dict[str, Any] | None = None,
221
+ ):
222
+ """
223
+ Context manager for tracing circuit breaker operations.
224
+
225
+ Args:
226
+ tool: Tool name
227
+ state: Circuit breaker state (CLOSED, OPEN, HALF_OPEN)
228
+ attributes: Additional span attributes
229
+
230
+ Example:
231
+ with trace_circuit_breaker("api_tool", state="OPEN"):
232
+ can_execute = await breaker.can_execute()
233
+ """
234
+ if not _tracing_enabled or _tracer is None:
235
+ yield None
236
+ return
237
+
238
+ span_name = "tool.circuit_breaker.check"
239
+ span_attributes: dict[str, str | int | float | bool] = {
240
+ "tool.name": tool,
241
+ "circuit.state": state,
242
+ }
243
+
244
+ if attributes:
245
+ for key, value in attributes.items():
246
+ if isinstance(value, (str, int, float, bool)):
247
+ span_attributes[f"circuit.{key}"] = value
248
+ else:
249
+ span_attributes[f"circuit.{key}"] = str(value)
250
+
251
+ with _tracer.start_as_current_span(span_name, attributes=span_attributes) as span:
252
+ yield span
253
+
254
+
255
+ @contextmanager
256
+ def trace_rate_limit(
257
+ tool: str,
258
+ allowed: bool,
259
+ attributes: dict[str, Any] | None = None,
260
+ ):
261
+ """
262
+ Context manager for tracing rate limiting.
263
+
264
+ Args:
265
+ tool: Tool name
266
+ allowed: Whether request was allowed
267
+ attributes: Additional span attributes
268
+
269
+ Example:
270
+ with trace_rate_limit("api_tool", allowed=True):
271
+ await rate_limiter.acquire()
272
+ """
273
+ if not _tracing_enabled or _tracer is None:
274
+ yield None
275
+ return
276
+
277
+ span_name = "tool.rate_limit.check"
278
+ span_attributes: dict[str, str | int | float | bool] = {
279
+ "tool.name": tool,
280
+ "rate_limit.allowed": allowed,
281
+ }
282
+
283
+ if attributes:
284
+ for key, value in attributes.items():
285
+ if isinstance(value, (str, int, float, bool)):
286
+ span_attributes[f"rate_limit.{key}"] = value
287
+ else:
288
+ span_attributes[f"rate_limit.{key}"] = str(value)
289
+
290
+ with _tracer.start_as_current_span(span_name, attributes=span_attributes) as span:
291
+ yield span
292
+
293
+
294
+ def add_span_event(span: Span | None, name: str, attributes: dict[str, Any] | None = None) -> None:
295
+ """
296
+ Add an event to the current span.
297
+
298
+ Args:
299
+ span: Span to add event to (can be None)
300
+ name: Event name
301
+ attributes: Event attributes
302
+ """
303
+ if span is None or not _tracing_enabled:
304
+ return
305
+
306
+ try:
307
+ span.add_event(name, attributes=attributes or {})
308
+ except Exception as e:
309
+ logger.debug(f"Error adding span event: {e}")
310
+
311
+
312
+ def set_span_error(span: Span | None, error: Exception | str) -> None:
313
+ """
314
+ Mark span as error and record exception details.
315
+
316
+ Args:
317
+ span: Span to mark as error (can be None)
318
+ error: Error to record
319
+ """
320
+ if span is None or not _tracing_enabled:
321
+ return
322
+
323
+ try:
324
+ from opentelemetry.trace import Status, StatusCode
325
+
326
+ span.set_status(Status(StatusCode.ERROR, str(error)))
327
+
328
+ if isinstance(error, Exception):
329
+ span.record_exception(error)
330
+ else:
331
+ span.add_event("error", {"error.message": str(error)})
332
+
333
+ except Exception as e:
334
+ logger.debug(f"Error setting span error: {e}")
335
+
336
+
337
+ class NoOpTracer:
338
+ """No-op tracer when OpenTelemetry is not available."""
339
+
340
+ @contextmanager
341
+ def start_as_current_span(self, _name: str, **_kwargs):
342
+ """No-op span context manager."""
343
+ yield None
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chuk-tool-processor
3
- Version: 0.8
3
+ Version: 0.9
4
4
  Summary: Async-native framework for registering, discovering, and executing tools referenced in LLM responses
5
5
  Author-email: CHUK Team <chrishayuk@somejunkmailbox.com>
6
6
  Maintainer-email: CHUK Team <chrishayuk@somejunkmailbox.com>
@@ -184,7 +184,7 @@ asyncio.run(main())
184
184
  | 🔌 **Connect to external tools** | MCP integration (HTTP/STDIO/SSE) | [MCP Integration](#5-mcp-integration-external-tools) |
185
185
  | 🛡️ **Production deployment** | Timeouts, retries, rate limits, caching | [Production Configuration](#using-the-processor) |
186
186
  | 🔒 **Run untrusted code safely** | Subprocess isolation strategy | [Subprocess Strategy](#using-subprocess-strategy) |
187
- | 📊 **Monitor and observe** | Structured logging and metrics | [Observability](#observability) |
187
+ | 📊 **Monitor and observe** | OpenTelemetry + Prometheus | [Observability](#opentelemetry--prometheus-drop-in-observability) |
188
188
  | 🌊 **Stream incremental results** | StreamingTool pattern | [StreamingTool](#streamingtool-real-time-results) |
189
189
 
190
190
  ### Real-World Quick Start
@@ -1098,6 +1098,294 @@ async def main():
1098
1098
  asyncio.run(main())
1099
1099
  ```
1100
1100
 
1101
+ #### OpenTelemetry & Prometheus (Drop-in Observability)
1102
+
1103
+ **Why Telemetry Matters**: In production, you need to know *what* your tools are doing, *how long* they take, *when* they fail, and *why*. CHUK Tool Processor provides **enterprise-grade telemetry** that operations teams expect—with zero manual instrumentation.
1104
+
1105
+ **One function call. Full observability.**
1106
+
1107
+ ```python
1108
+ from chuk_tool_processor.observability import setup_observability
1109
+
1110
+ # Enable everything
1111
+ setup_observability(
1112
+ service_name="my-tool-service",
1113
+ enable_tracing=True, # OpenTelemetry distributed tracing
1114
+ enable_metrics=True, # Prometheus metrics endpoint
1115
+ metrics_port=9090 # HTTP endpoint at :9090/metrics
1116
+ )
1117
+
1118
+ # Every tool execution is now automatically traced and metered!
1119
+ ```
1120
+
1121
+ **What You Get (Automatically)**
1122
+
1123
+ ✅ **Distributed Traces** - Understand exactly what happened in each tool call
1124
+ - See the complete execution timeline for every tool
1125
+ - Track retries, cache hits, circuit breaker state changes
1126
+ - Correlate failures across your system
1127
+ - Export to Jaeger, Zipkin, or any OTLP-compatible backend
1128
+
1129
+ ✅ **Production Metrics** - Monitor health and performance in real-time
1130
+ - Track error rates, latency percentiles (P50/P95/P99)
1131
+ - Monitor cache hit rates and retry attempts
1132
+ - Alert on circuit breaker opens and rate limit hits
1133
+ - Export to Prometheus, Grafana, or any metrics backend
1134
+
1135
+ ✅ **Zero Configuration** - Works out of the box
1136
+ - No manual instrumentation needed
1137
+ - No code changes to existing tools
1138
+ - Gracefully degrades if packages not installed
1139
+ - Standard OTEL and Prometheus formats
1140
+
1141
+ **Installation**
1142
+
1143
+ ```bash
1144
+ # Install observability dependencies
1145
+ pip install chuk-tool-processor[observability]
1146
+
1147
+ # Or manually
1148
+ pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp prometheus-client
1149
+
1150
+ # Or with uv (recommended)
1151
+ uv pip install "chuk-tool-processor[observability]"
1152
+ ```
1153
+
1154
+ **Quick Start: See Your Tools in Action**
1155
+
1156
+ ```python
1157
+ import asyncio
1158
+ from chuk_tool_processor.observability import setup_observability
1159
+ from chuk_tool_processor.core.processor import ToolProcessor
1160
+ from chuk_tool_processor.registry import initialize, register_tool
1161
+
1162
+ @register_tool(name="weather_api")
1163
+ class WeatherTool:
1164
+ async def execute(self, location: str) -> dict:
1165
+ # Simulating API call
1166
+ return {"temperature": 72, "conditions": "sunny", "location": location}
1167
+
1168
+ async def main():
1169
+ # 1. Enable observability (one line!)
1170
+ setup_observability(
1171
+ service_name="weather-service",
1172
+ enable_tracing=True,
1173
+ enable_metrics=True,
1174
+ metrics_port=9090
1175
+ )
1176
+
1177
+ # 2. Create processor with production features
1178
+ await initialize()
1179
+ processor = ToolProcessor(
1180
+ enable_caching=True, # Cache expensive API calls
1181
+ enable_retries=True, # Auto-retry on failures
1182
+ enable_circuit_breaker=True, # Prevent cascading failures
1183
+ enable_rate_limiting=True, # Prevent API abuse
1184
+ )
1185
+
1186
+ # 3. Execute tools - automatically traced and metered
1187
+ results = await processor.process(
1188
+ '<tool name="weather_api" args=\'{"location": "San Francisco"}\'/>'
1189
+ )
1190
+
1191
+ print(f"Result: {results[0].result}")
1192
+ print(f"Duration: {results[0].duration}s")
1193
+ print(f"Cached: {results[0].cached}")
1194
+
1195
+ asyncio.run(main())
1196
+ ```
1197
+
1198
+ **View Your Data**
1199
+
1200
+ ```bash
1201
+ # Start Jaeger for trace visualization
1202
+ docker run -d -p 4317:4317 -p 16686:16686 jaegertracing/all-in-one:latest
1203
+
1204
+ # Start your application
1205
+ python your_app.py
1206
+
1207
+ # View distributed traces
1208
+ open http://localhost:16686
1209
+
1210
+ # View Prometheus metrics
1211
+ curl http://localhost:9090/metrics | grep tool_
1212
+ ```
1213
+
1214
+ **What Gets Traced (Automatic Spans)**
1215
+
1216
+ Every execution layer creates standardized OpenTelemetry spans:
1217
+
1218
+ | Span Name | When Created | Key Attributes |
1219
+ |-----------|--------------|----------------|
1220
+ | `tool.execute` | Every tool execution | `tool.name`, `tool.namespace`, `tool.duration_ms`, `tool.cached`, `tool.error`, `tool.success` |
1221
+ | `tool.cache.lookup` | Cache lookup | `cache.hit` (true/false), `cache.operation=lookup` |
1222
+ | `tool.cache.set` | Cache write | `cache.ttl`, `cache.operation=set` |
1223
+ | `tool.retry.attempt` | Each retry | `retry.attempt`, `retry.max_attempts`, `retry.success` |
1224
+ | `tool.circuit_breaker.check` | Circuit state check | `circuit.state` (CLOSED/OPEN/HALF_OPEN) |
1225
+ | `tool.rate_limit.check` | Rate limit check | `rate_limit.allowed` (true/false) |
1226
+
1227
+ **Example trace hierarchy:**
1228
+ ```
1229
+ tool.execute (weather_api)
1230
+ ├── tool.cache.lookup (miss)
1231
+ ├── tool.retry.attempt (0)
1232
+ │ └── tool.execute (actual API call)
1233
+ ├── tool.retry.attempt (1) [if first failed]
1234
+ └── tool.cache.set (store result)
1235
+ ```
1236
+
1237
+ **What Gets Metered (Automatic Metrics)**
1238
+
1239
+ Standard Prometheus metrics exposed at `/metrics`:
1240
+
1241
+ | Metric | Type | Labels | Use For |
1242
+ |--------|------|--------|---------|
1243
+ | `tool_executions_total` | Counter | `tool`, `namespace`, `status` | Error rate, request volume |
1244
+ | `tool_execution_duration_seconds` | Histogram | `tool`, `namespace` | P50/P95/P99 latency |
1245
+ | `tool_cache_operations_total` | Counter | `tool`, `operation`, `result` | Cache hit rate |
1246
+ | `tool_retry_attempts_total` | Counter | `tool`, `attempt`, `success` | Retry frequency |
1247
+ | `tool_circuit_breaker_state` | Gauge | `tool` | Circuit health (0=CLOSED, 1=OPEN, 2=HALF_OPEN) |
1248
+ | `tool_circuit_breaker_failures_total` | Counter | `tool` | Failure count |
1249
+ | `tool_rate_limit_checks_total` | Counter | `tool`, `allowed` | Rate limit hits |
1250
+
1251
+ **Useful PromQL Queries**
1252
+
1253
+ ```promql
1254
+ # Error rate per tool (last 5 minutes)
1255
+ rate(tool_executions_total{status="error"}[5m])
1256
+ / rate(tool_executions_total[5m])
1257
+
1258
+ # P95 latency
1259
+ histogram_quantile(0.95, rate(tool_execution_duration_seconds_bucket[5m]))
1260
+
1261
+ # Cache hit rate
1262
+ rate(tool_cache_operations_total{result="hit"}[5m])
1263
+ / rate(tool_cache_operations_total{operation="lookup"}[5m])
1264
+
1265
+ # Tools currently circuit broken
1266
+ tool_circuit_breaker_state == 1
1267
+
1268
+ # Retry rate (how often tools need retries)
1269
+ rate(tool_retry_attempts_total{attempt!="0"}[5m])
1270
+ / rate(tool_executions_total[5m])
1271
+ ```
1272
+
1273
+ **Configuration**
1274
+
1275
+ Configure via environment variables:
1276
+
1277
+ ```bash
1278
+ # OTLP endpoint (where traces are sent)
1279
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
1280
+
1281
+ # Service name (shown in traces)
1282
+ export OTEL_SERVICE_NAME=production-api
1283
+
1284
+ # Sampling (reduce overhead in high-traffic scenarios)
1285
+ export OTEL_TRACES_SAMPLER=traceidratio
1286
+ export OTEL_TRACES_SAMPLER_ARG=0.1 # Sample 10% of traces
1287
+ ```
1288
+
1289
+ Or in code:
1290
+
1291
+ ```python
1292
+ status = setup_observability(
1293
+ service_name="my-service",
1294
+ enable_tracing=True,
1295
+ enable_metrics=True,
1296
+ metrics_port=9090,
1297
+ metrics_host="0.0.0.0" # Allow external Prometheus scraping
1298
+ )
1299
+
1300
+ # Check status
1301
+ if status["tracing_enabled"]:
1302
+ print("Traces exporting to OTLP endpoint")
1303
+ if status["metrics_server_started"]:
1304
+ print("Metrics available at http://localhost:9090/metrics")
1305
+ ```
1306
+
1307
+ **Production Integration**
1308
+
1309
+ **With Grafana + Prometheus:**
1310
+ ```yaml
1311
+ # prometheus.yml
1312
+ scrape_configs:
1313
+ - job_name: 'chuk-tool-processor'
1314
+ scrape_interval: 15s
1315
+ static_configs:
1316
+ - targets: ['app:9090']
1317
+ ```
1318
+
1319
+ **With OpenTelemetry Collector:**
1320
+ ```yaml
1321
+ # otel-collector-config.yaml
1322
+ receivers:
1323
+ otlp:
1324
+ protocols:
1325
+ grpc:
1326
+ endpoint: 0.0.0.0:4317
1327
+
1328
+ exporters:
1329
+ jaeger:
1330
+ endpoint: jaeger:14250
1331
+ prometheus:
1332
+ endpoint: 0.0.0.0:8889
1333
+
1334
+ service:
1335
+ pipelines:
1336
+ traces:
1337
+ receivers: [otlp]
1338
+ exporters: [jaeger]
1339
+ ```
1340
+
1341
+ **With Cloud Providers:**
1342
+ ```bash
1343
+ # AWS X-Ray
1344
+ export OTEL_TRACES_SAMPLER=xray
1345
+
1346
+ # Google Cloud Trace
1347
+ export OTEL_EXPORTER_OTLP_ENDPOINT=https://cloudtrace.googleapis.com/v1/projects/PROJECT_ID/traces
1348
+
1349
+ # Datadog
1350
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://datadog-agent:4317
1351
+ ```
1352
+
1353
+ **Why This Matters**
1354
+
1355
+ ❌ **Without telemetry:**
1356
+ - "Why is this tool slow?" → No idea
1357
+ - "Is caching helping?" → Guessing
1358
+ - "Did that retry work?" → Check logs manually
1359
+ - "Is the circuit breaker working?" → Hope so
1360
+ - "Which tool is failing?" → Debug blindly
1361
+
1362
+ ✅ **With telemetry:**
1363
+ - See exact execution timeline in Jaeger
1364
+ - Monitor cache hit rate in Grafana
1365
+ - Alert when retry rate spikes
1366
+ - Dashboard shows circuit breaker states
1367
+ - Metrics pinpoint the failing tool immediately
1368
+
1369
+ **Learn More**
1370
+
1371
+ 📖 **Complete Guide**: See [`OBSERVABILITY.md`](OBSERVABILITY.md) for:
1372
+ - Complete span and metric specifications
1373
+ - Architecture and implementation details
1374
+ - Integration guides (Jaeger, Grafana, OTEL Collector)
1375
+ - Testing observability features
1376
+ - Environment variable configuration
1377
+
1378
+ 🎯 **Working Example**: See `examples/observability_demo.py` for a complete demonstration with retries, caching, and circuit breakers
1379
+
1380
+ **Benefits**
1381
+
1382
+ ✅ **Drop-in** - One function call, zero code changes
1383
+ ✅ **Automatic** - All execution layers instrumented
1384
+ ✅ **Standard** - OTEL + Prometheus (works with existing tools)
1385
+ ✅ **Production-ready** - Ops teams get exactly what they expect
1386
+ ✅ **Optional** - Gracefully degrades if packages not installed
1387
+ ✅ **Zero-overhead** - No performance impact when disabled
1388
+
1101
1389
  ### Error Handling
1102
1390
 
1103
1391
  ```python
@@ -1328,6 +1616,7 @@ Check out the [`examples/`](examples/) directory for complete working examples:
1328
1616
  - **Execution strategies**: `examples/execution_strategies_demo.py` - InProcess vs Subprocess
1329
1617
  - **Production wrappers**: `examples/wrappers_demo.py` - Caching, retries, rate limiting
1330
1618
  - **Streaming tools**: `examples/streaming_demo.py` - Real-time incremental results
1619
+ - **Observability**: `examples/observability_demo.py` - OpenTelemetry + Prometheus integration
1331
1620
 
1332
1621
  ### MCP Integration (Real-World)
1333
1622
  - **Notion + OAuth**: `examples/notion_oauth.py` - Complete OAuth 2.1 flow with HTTP Streamable