chuk-tool-processor 0.8__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/PKG-INFO +291 -2
  2. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/README.md +290 -1
  3. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/pyproject.toml +8 -1
  4. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/wrappers/caching.py +38 -9
  5. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/wrappers/circuit_breaker.py +29 -2
  6. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/wrappers/rate_limiting.py +31 -1
  7. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/wrappers/retry.py +81 -53
  8. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/setup_mcp_http_streamable.py +8 -1
  9. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/setup_mcp_sse.py +8 -1
  10. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/transport/http_streamable_transport.py +16 -3
  11. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/transport/sse_transport.py +16 -3
  12. chuk_tool_processor-0.9.1/src/chuk_tool_processor/observability/__init__.py +30 -0
  13. chuk_tool_processor-0.9.1/src/chuk_tool_processor/observability/metrics.py +312 -0
  14. chuk_tool_processor-0.9.1/src/chuk_tool_processor/observability/setup.py +105 -0
  15. chuk_tool_processor-0.9.1/src/chuk_tool_processor/observability/tracing.py +345 -0
  16. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor.egg-info/PKG-INFO +291 -2
  17. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor.egg-info/SOURCES.txt +4 -0
  18. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/setup.cfg +0 -0
  19. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/__init__.py +0 -0
  20. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/core/__init__.py +0 -0
  21. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/core/exceptions.py +0 -0
  22. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/core/processor.py +0 -0
  23. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/__init__.py +0 -0
  24. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/strategies/__init__.py +0 -0
  25. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/strategies/inprocess_strategy.py +0 -0
  26. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/strategies/subprocess_strategy.py +0 -0
  27. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/tool_executor.py +0 -0
  28. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/execution/wrappers/__init__.py +0 -0
  29. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/logging/__init__.py +0 -0
  30. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/logging/context.py +0 -0
  31. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/logging/formatter.py +0 -0
  32. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/logging/helpers.py +0 -0
  33. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/logging/metrics.py +0 -0
  34. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/__init__.py +0 -0
  35. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/mcp_tool.py +0 -0
  36. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/register_mcp_tools.py +0 -0
  37. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/setup_mcp_stdio.py +0 -0
  38. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/stream_manager.py +0 -0
  39. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/transport/__init__.py +0 -0
  40. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/transport/base_transport.py +0 -0
  41. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/transport/models.py +0 -0
  42. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/mcp/transport/stdio_transport.py +0 -0
  43. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/__init__.py +0 -0
  44. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/execution_strategy.py +0 -0
  45. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/streaming_tool.py +0 -0
  46. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/tool_call.py +0 -0
  47. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/tool_export_mixin.py +0 -0
  48. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/tool_result.py +0 -0
  49. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/tool_spec.py +0 -0
  50. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/models/validated_tool.py +0 -0
  51. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/__init__.py +0 -0
  52. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/discovery.py +0 -0
  53. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/parsers/__init__.py +0 -0
  54. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/parsers/base.py +0 -0
  55. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/parsers/function_call_tool.py +0 -0
  56. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/parsers/json_tool.py +0 -0
  57. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/parsers/openai_tool.py +0 -0
  58. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/plugins/parsers/xml_tool.py +0 -0
  59. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/__init__.py +0 -0
  60. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/auto_register.py +0 -0
  61. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/decorators.py +0 -0
  62. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/interface.py +0 -0
  63. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/metadata.py +0 -0
  64. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/provider.py +0 -0
  65. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/providers/__init__.py +0 -0
  66. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/providers/memory.py +0 -0
  67. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/registry/tool_export.py +0 -0
  68. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/utils/__init__.py +0 -0
  69. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor/utils/validation.py +0 -0
  70. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor.egg-info/dependency_links.txt +0 -0
  71. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor.egg-info/requires.txt +0 -0
  72. {chuk_tool_processor-0.8 → chuk_tool_processor-0.9.1}/src/chuk_tool_processor.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: chuk-tool-processor
3
- Version: 0.8
3
+ Version: 0.9.1
4
4
  Summary: Async-native framework for registering, discovering, and executing tools referenced in LLM responses
5
5
  Author-email: CHUK Team <chrishayuk@somejunkmailbox.com>
6
6
  Maintainer-email: CHUK Team <chrishayuk@somejunkmailbox.com>
@@ -184,7 +184,7 @@ asyncio.run(main())
184
184
  | 🔌 **Connect to external tools** | MCP integration (HTTP/STDIO/SSE) | [MCP Integration](#5-mcp-integration-external-tools) |
185
185
  | 🛡️ **Production deployment** | Timeouts, retries, rate limits, caching | [Production Configuration](#using-the-processor) |
186
186
  | 🔒 **Run untrusted code safely** | Subprocess isolation strategy | [Subprocess Strategy](#using-subprocess-strategy) |
187
- | 📊 **Monitor and observe** | Structured logging and metrics | [Observability](#observability) |
187
+ | 📊 **Monitor and observe** | OpenTelemetry + Prometheus | [Observability](#opentelemetry--prometheus-drop-in-observability) |
188
188
  | 🌊 **Stream incremental results** | StreamingTool pattern | [StreamingTool](#streamingtool-real-time-results) |
189
189
 
190
190
  ### Real-World Quick Start
@@ -1098,6 +1098,294 @@ async def main():
1098
1098
  asyncio.run(main())
1099
1099
  ```
1100
1100
 
1101
+ #### OpenTelemetry & Prometheus (Drop-in Observability)
1102
+
1103
+ **Why Telemetry Matters**: In production, you need to know *what* your tools are doing, *how long* they take, *when* they fail, and *why*. CHUK Tool Processor provides **enterprise-grade telemetry** that operations teams expect—with zero manual instrumentation.
1104
+
1105
+ **One function call. Full observability.**
1106
+
1107
+ ```python
1108
+ from chuk_tool_processor.observability import setup_observability
1109
+
1110
+ # Enable everything
1111
+ setup_observability(
1112
+ service_name="my-tool-service",
1113
+ enable_tracing=True, # OpenTelemetry distributed tracing
1114
+ enable_metrics=True, # Prometheus metrics endpoint
1115
+ metrics_port=9090 # HTTP endpoint at :9090/metrics
1116
+ )
1117
+
1118
+ # Every tool execution is now automatically traced and metered!
1119
+ ```
1120
+
1121
+ **What You Get (Automatically)**
1122
+
1123
+ ✅ **Distributed Traces** - Understand exactly what happened in each tool call
1124
+ - See the complete execution timeline for every tool
1125
+ - Track retries, cache hits, circuit breaker state changes
1126
+ - Correlate failures across your system
1127
+ - Export to Jaeger, Zipkin, or any OTLP-compatible backend
1128
+
1129
+ ✅ **Production Metrics** - Monitor health and performance in real-time
1130
+ - Track error rates, latency percentiles (P50/P95/P99)
1131
+ - Monitor cache hit rates and retry attempts
1132
+ - Alert on circuit breaker opens and rate limit hits
1133
+ - Export to Prometheus, Grafana, or any metrics backend
1134
+
1135
+ ✅ **Zero Configuration** - Works out of the box
1136
+ - No manual instrumentation needed
1137
+ - No code changes to existing tools
1138
+ - Gracefully degrades if packages not installed
1139
+ - Standard OTEL and Prometheus formats
1140
+
1141
+ **Installation**
1142
+
1143
+ ```bash
1144
+ # Install observability dependencies
1145
+ pip install "chuk-tool-processor[observability]"
1146
+
1147
+ # Or manually
1148
+ pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp prometheus-client
1149
+
1150
+ # Or with uv (recommended)
1151
+ uv pip install "chuk-tool-processor[observability]"
1152
+ ```
1153
+
1154
+ **Quick Start: See Your Tools in Action**
1155
+
1156
+ ```python
1157
+ import asyncio
1158
+ from chuk_tool_processor.observability import setup_observability
1159
+ from chuk_tool_processor.core.processor import ToolProcessor
1160
+ from chuk_tool_processor.registry import initialize, register_tool
1161
+
1162
+ @register_tool(name="weather_api")
1163
+ class WeatherTool:
1164
+ async def execute(self, location: str) -> dict:
1165
+ # Simulating API call
1166
+ return {"temperature": 72, "conditions": "sunny", "location": location}
1167
+
1168
+ async def main():
1169
+ # 1. Enable observability (one line!)
1170
+ setup_observability(
1171
+ service_name="weather-service",
1172
+ enable_tracing=True,
1173
+ enable_metrics=True,
1174
+ metrics_port=9090
1175
+ )
1176
+
1177
+ # 2. Create processor with production features
1178
+ await initialize()
1179
+ processor = ToolProcessor(
1180
+ enable_caching=True, # Cache expensive API calls
1181
+ enable_retries=True, # Auto-retry on failures
1182
+ enable_circuit_breaker=True, # Prevent cascading failures
1183
+ enable_rate_limiting=True, # Prevent API abuse
1184
+ )
1185
+
1186
+ # 3. Execute tools - automatically traced and metered
1187
+ results = await processor.process(
1188
+ '<tool name="weather_api" args=\'{"location": "San Francisco"}\'/>'
1189
+ )
1190
+
1191
+ print(f"Result: {results[0].result}")
1192
+ print(f"Duration: {results[0].duration}s")
1193
+ print(f"Cached: {results[0].cached}")
1194
+
1195
+ asyncio.run(main())
1196
+ ```
1197
+
1198
+ **View Your Data**
1199
+
1200
+ ```bash
1201
+ # Start Jaeger for trace visualization
1202
+ docker run -d -p 4317:4317 -p 16686:16686 jaegertracing/all-in-one:latest
1203
+
1204
+ # Start your application
1205
+ python your_app.py
1206
+
1207
+ # View distributed traces
1208
+ open http://localhost:16686
1209
+
1210
+ # View Prometheus metrics
1211
+ curl http://localhost:9090/metrics | grep tool_
1212
+ ```
1213
+
1214
+ **What Gets Traced (Automatic Spans)**
1215
+
1216
+ Every execution layer creates standardized OpenTelemetry spans:
1217
+
1218
+ | Span Name | When Created | Key Attributes |
1219
+ |-----------|--------------|----------------|
1220
+ | `tool.execute` | Every tool execution | `tool.name`, `tool.namespace`, `tool.duration_ms`, `tool.cached`, `tool.error`, `tool.success` |
1221
+ | `tool.cache.lookup` | Cache lookup | `cache.hit` (true/false), `cache.operation=lookup` |
1222
+ | `tool.cache.set` | Cache write | `cache.ttl`, `cache.operation=set` |
1223
+ | `tool.retry.attempt` | Each retry | `retry.attempt`, `retry.max_attempts`, `retry.success` |
1224
+ | `tool.circuit_breaker.check` | Circuit state check | `circuit.state` (CLOSED/OPEN/HALF_OPEN) |
1225
+ | `tool.rate_limit.check` | Rate limit check | `rate_limit.allowed` (true/false) |
1226
+
1227
+ **Example trace hierarchy:**
1228
+ ```
1229
+ tool.execute (weather_api)
1230
+ ├── tool.cache.lookup (miss)
1231
+ ├── tool.retry.attempt (0)
1232
+ │ └── tool.execute (actual API call)
1233
+ ├── tool.retry.attempt (1) [if first failed]
1234
+ └── tool.cache.set (store result)
1235
+ ```
1236
+
1237
+ **What Gets Metered (Automatic Metrics)**
1238
+
1239
+ Standard Prometheus metrics exposed at `/metrics`:
1240
+
1241
+ | Metric | Type | Labels | Use For |
1242
+ |--------|------|--------|---------|
1243
+ | `tool_executions_total` | Counter | `tool`, `namespace`, `status` | Error rate, request volume |
1244
+ | `tool_execution_duration_seconds` | Histogram | `tool`, `namespace` | P50/P95/P99 latency |
1245
+ | `tool_cache_operations_total` | Counter | `tool`, `operation`, `result` | Cache hit rate |
1246
+ | `tool_retry_attempts_total` | Counter | `tool`, `attempt`, `success` | Retry frequency |
1247
+ | `tool_circuit_breaker_state` | Gauge | `tool` | Circuit health (0=CLOSED, 1=OPEN, 2=HALF_OPEN) |
1248
+ | `tool_circuit_breaker_failures_total` | Counter | `tool` | Failure count |
1249
+ | `tool_rate_limit_checks_total` | Counter | `tool`, `allowed` | Rate limit hits |
1250
+
1251
+ **Useful PromQL Queries**
1252
+
1253
+ ```promql
1254
+ # Error rate per tool (last 5 minutes)
1255
+ rate(tool_executions_total{status="error"}[5m])
1256
+ / rate(tool_executions_total[5m])
1257
+
1258
+ # P95 latency
1259
+ histogram_quantile(0.95, rate(tool_execution_duration_seconds_bucket[5m]))
1260
+
1261
+ # Cache hit rate
1262
+ rate(tool_cache_operations_total{result="hit"}[5m])
1263
+ / rate(tool_cache_operations_total{operation="lookup"}[5m])
1264
+
1265
+ # Tools currently circuit broken
1266
+ tool_circuit_breaker_state == 1
1267
+
1268
+ # Retry rate (how often tools need retries)
1269
+ rate(tool_retry_attempts_total{attempt!="0"}[5m])
1270
+ / rate(tool_executions_total[5m])
1271
+ ```
1272
+
1273
+ **Configuration**
1274
+
1275
+ Configure via environment variables:
1276
+
1277
+ ```bash
1278
+ # OTLP endpoint (where traces are sent)
1279
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
1280
+
1281
+ # Service name (shown in traces)
1282
+ export OTEL_SERVICE_NAME=production-api
1283
+
1284
+ # Sampling (reduce overhead in high-traffic scenarios)
1285
+ export OTEL_TRACES_SAMPLER=traceidratio
1286
+ export OTEL_TRACES_SAMPLER_ARG=0.1 # Sample 10% of traces
1287
+ ```
1288
+
1289
+ Or in code:
1290
+
1291
+ ```python
1292
+ status = setup_observability(
1293
+ service_name="my-service",
1294
+ enable_tracing=True,
1295
+ enable_metrics=True,
1296
+ metrics_port=9090,
1297
+ metrics_host="0.0.0.0" # Allow external Prometheus scraping
1298
+ )
1299
+
1300
+ # Check status
1301
+ if status["tracing_enabled"]:
1302
+ print("Traces exporting to OTLP endpoint")
1303
+ if status["metrics_server_started"]:
1304
+ print("Metrics available at http://localhost:9090/metrics")
1305
+ ```
1306
+
1307
+ **Production Integration**
1308
+
1309
+ **With Grafana + Prometheus:**
1310
+ ```yaml
1311
+ # prometheus.yml
1312
+ scrape_configs:
1313
+ - job_name: 'chuk-tool-processor'
1314
+ scrape_interval: 15s
1315
+ static_configs:
1316
+ - targets: ['app:9090']
1317
+ ```
1318
+
1319
+ **With OpenTelemetry Collector:**
1320
+ ```yaml
1321
+ # otel-collector-config.yaml
1322
+ receivers:
1323
+ otlp:
1324
+ protocols:
1325
+ grpc:
1326
+ endpoint: 0.0.0.0:4317
1327
+
1328
+ exporters:
1329
+ jaeger:
1330
+ endpoint: jaeger:14250
1331
+ prometheus:
1332
+ endpoint: 0.0.0.0:8889
1333
+
1334
+ service:
1335
+ pipelines:
1336
+ traces:
1337
+ receivers: [otlp]
1338
+ exporters: [jaeger]
1339
+ ```
1340
+
1341
+ **With Cloud Providers:**
1342
+ ```bash
1343
+ # AWS X-Ray
1344
+ export OTEL_TRACES_SAMPLER=xray
1345
+
1346
+ # Google Cloud Trace
1347
+ export OTEL_EXPORTER_OTLP_ENDPOINT=https://cloudtrace.googleapis.com/v1/projects/PROJECT_ID/traces
1348
+
1349
+ # Datadog
1350
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://datadog-agent:4317
1351
+ ```
1352
+
1353
+ **Why This Matters**
1354
+
1355
+ ❌ **Without telemetry:**
1356
+ - "Why is this tool slow?" → No idea
1357
+ - "Is caching helping?" → Guessing
1358
+ - "Did that retry work?" → Check logs manually
1359
+ - "Is the circuit breaker working?" → Hope so
1360
+ - "Which tool is failing?" → Debug blindly
1361
+
1362
+ ✅ **With telemetry:**
1363
+ - See exact execution timeline in Jaeger
1364
+ - Monitor cache hit rate in Grafana
1365
+ - Alert when retry rate spikes
1366
+ - Dashboard shows circuit breaker states
1367
+ - Metrics pinpoint the failing tool immediately
1368
+
1369
+ **Learn More**
1370
+
1371
+ 📖 **Complete Guide**: See [`OBSERVABILITY.md`](OBSERVABILITY.md) for:
1372
+ - Complete span and metric specifications
1373
+ - Architecture and implementation details
1374
+ - Integration guides (Jaeger, Grafana, OTEL Collector)
1375
+ - Testing observability features
1376
+ - Environment variable configuration
1377
+
1378
+ 🎯 **Working Example**: See `examples/observability_demo.py` for a complete demonstration with retries, caching, and circuit breakers
1379
+
1380
+ **Benefits**
1381
+
1382
+ ✅ **Drop-in** - One function call, zero code changes
1383
+ ✅ **Automatic** - All execution layers instrumented
1384
+ ✅ **Standard** - OTEL + Prometheus (works with existing tools)
1385
+ ✅ **Production-ready** - Ops teams get exactly what they expect
1386
+ ✅ **Optional** - Gracefully degrades if packages not installed
1387
+ ✅ **Zero-overhead** - No performance impact when disabled
1388
+
1101
1389
  ### Error Handling
1102
1390
 
1103
1391
  ```python
@@ -1328,6 +1616,7 @@ Check out the [`examples/`](examples/) directory for complete working examples:
1328
1616
  - **Execution strategies**: `examples/execution_strategies_demo.py` - InProcess vs Subprocess
1329
1617
  - **Production wrappers**: `examples/wrappers_demo.py` - Caching, retries, rate limiting
1330
1618
  - **Streaming tools**: `examples/streaming_demo.py` - Real-time incremental results
1619
+ - **Observability**: `examples/observability_demo.py` - OpenTelemetry + Prometheus integration
1331
1620
 
1332
1621
  ### MCP Integration (Real-World)
1333
1622
  - **Notion + OAuth**: `examples/notion_oauth.py` - Complete OAuth 2.1 flow with HTTP Streamable
@@ -156,7 +156,7 @@ asyncio.run(main())
156
156
  | 🔌 **Connect to external tools** | MCP integration (HTTP/STDIO/SSE) | [MCP Integration](#5-mcp-integration-external-tools) |
157
157
  | 🛡️ **Production deployment** | Timeouts, retries, rate limits, caching | [Production Configuration](#using-the-processor) |
158
158
  | 🔒 **Run untrusted code safely** | Subprocess isolation strategy | [Subprocess Strategy](#using-subprocess-strategy) |
159
- | 📊 **Monitor and observe** | Structured logging and metrics | [Observability](#observability) |
159
+ | 📊 **Monitor and observe** | OpenTelemetry + Prometheus | [Observability](#opentelemetry--prometheus-drop-in-observability) |
160
160
  | 🌊 **Stream incremental results** | StreamingTool pattern | [StreamingTool](#streamingtool-real-time-results) |
161
161
 
162
162
  ### Real-World Quick Start
@@ -1070,6 +1070,294 @@ async def main():
1070
1070
  asyncio.run(main())
1071
1071
  ```
1072
1072
 
1073
+ #### OpenTelemetry & Prometheus (Drop-in Observability)
1074
+
1075
+ **Why Telemetry Matters**: In production, you need to know *what* your tools are doing, *how long* they take, *when* they fail, and *why*. CHUK Tool Processor provides **enterprise-grade telemetry** that operations teams expect—with zero manual instrumentation.
1076
+
1077
+ **One function call. Full observability.**
1078
+
1079
+ ```python
1080
+ from chuk_tool_processor.observability import setup_observability
1081
+
1082
+ # Enable everything
1083
+ setup_observability(
1084
+ service_name="my-tool-service",
1085
+ enable_tracing=True, # OpenTelemetry distributed tracing
1086
+ enable_metrics=True, # Prometheus metrics endpoint
1087
+ metrics_port=9090 # HTTP endpoint at :9090/metrics
1088
+ )
1089
+
1090
+ # Every tool execution is now automatically traced and metered!
1091
+ ```
1092
+
1093
+ **What You Get (Automatically)**
1094
+
1095
+ ✅ **Distributed Traces** - Understand exactly what happened in each tool call
1096
+ - See the complete execution timeline for every tool
1097
+ - Track retries, cache hits, circuit breaker state changes
1098
+ - Correlate failures across your system
1099
+ - Export to Jaeger, Zipkin, or any OTLP-compatible backend
1100
+
1101
+ ✅ **Production Metrics** - Monitor health and performance in real-time
1102
+ - Track error rates, latency percentiles (P50/P95/P99)
1103
+ - Monitor cache hit rates and retry attempts
1104
+ - Alert on circuit breaker opens and rate limit hits
1105
+ - Export to Prometheus, Grafana, or any metrics backend
1106
+
1107
+ ✅ **Zero Configuration** - Works out of the box
1108
+ - No manual instrumentation needed
1109
+ - No code changes to existing tools
1110
+ - Gracefully degrades if packages not installed
1111
+ - Standard OTEL and Prometheus formats
1112
+
1113
+ **Installation**
1114
+
1115
+ ```bash
1116
+ # Install observability dependencies
1117
+ pip install "chuk-tool-processor[observability]"
1118
+
1119
+ # Or manually
1120
+ pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp prometheus-client
1121
+
1122
+ # Or with uv (recommended)
1123
+ uv pip install "chuk-tool-processor[observability]"
1124
+ ```
1125
+
1126
+ **Quick Start: See Your Tools in Action**
1127
+
1128
+ ```python
1129
+ import asyncio
1130
+ from chuk_tool_processor.observability import setup_observability
1131
+ from chuk_tool_processor.core.processor import ToolProcessor
1132
+ from chuk_tool_processor.registry import initialize, register_tool
1133
+
1134
+ @register_tool(name="weather_api")
1135
+ class WeatherTool:
1136
+ async def execute(self, location: str) -> dict:
1137
+ # Simulating API call
1138
+ return {"temperature": 72, "conditions": "sunny", "location": location}
1139
+
1140
+ async def main():
1141
+ # 1. Enable observability (one line!)
1142
+ setup_observability(
1143
+ service_name="weather-service",
1144
+ enable_tracing=True,
1145
+ enable_metrics=True,
1146
+ metrics_port=9090
1147
+ )
1148
+
1149
+ # 2. Create processor with production features
1150
+ await initialize()
1151
+ processor = ToolProcessor(
1152
+ enable_caching=True, # Cache expensive API calls
1153
+ enable_retries=True, # Auto-retry on failures
1154
+ enable_circuit_breaker=True, # Prevent cascading failures
1155
+ enable_rate_limiting=True, # Prevent API abuse
1156
+ )
1157
+
1158
+ # 3. Execute tools - automatically traced and metered
1159
+ results = await processor.process(
1160
+ '<tool name="weather_api" args=\'{"location": "San Francisco"}\'/>'
1161
+ )
1162
+
1163
+ print(f"Result: {results[0].result}")
1164
+ print(f"Duration: {results[0].duration}s")
1165
+ print(f"Cached: {results[0].cached}")
1166
+
1167
+ asyncio.run(main())
1168
+ ```
1169
+
1170
+ **View Your Data**
1171
+
1172
+ ```bash
1173
+ # Start Jaeger for trace visualization
1174
+ docker run -d -p 4317:4317 -p 16686:16686 jaegertracing/all-in-one:latest
1175
+
1176
+ # Start your application
1177
+ python your_app.py
1178
+
1179
+ # View distributed traces
1180
+ open http://localhost:16686
1181
+
1182
+ # View Prometheus metrics
1183
+ curl http://localhost:9090/metrics | grep tool_
1184
+ ```
1185
+
1186
+ **What Gets Traced (Automatic Spans)**
1187
+
1188
+ Every execution layer creates standardized OpenTelemetry spans:
1189
+
1190
+ | Span Name | When Created | Key Attributes |
1191
+ |-----------|--------------|----------------|
1192
+ | `tool.execute` | Every tool execution | `tool.name`, `tool.namespace`, `tool.duration_ms`, `tool.cached`, `tool.error`, `tool.success` |
1193
+ | `tool.cache.lookup` | Cache lookup | `cache.hit` (true/false), `cache.operation=lookup` |
1194
+ | `tool.cache.set` | Cache write | `cache.ttl`, `cache.operation=set` |
1195
+ | `tool.retry.attempt` | Each retry | `retry.attempt`, `retry.max_attempts`, `retry.success` |
1196
+ | `tool.circuit_breaker.check` | Circuit state check | `circuit.state` (CLOSED/OPEN/HALF_OPEN) |
1197
+ | `tool.rate_limit.check` | Rate limit check | `rate_limit.allowed` (true/false) |
1198
+
1199
+ **Example trace hierarchy:**
1200
+ ```
1201
+ tool.execute (weather_api)
1202
+ ├── tool.cache.lookup (miss)
1203
+ ├── tool.retry.attempt (0)
1204
+ │ └── tool.execute (actual API call)
1205
+ ├── tool.retry.attempt (1) [if first failed]
1206
+ └── tool.cache.set (store result)
1207
+ ```
1208
+
1209
+ **What Gets Metered (Automatic Metrics)**
1210
+
1211
+ Standard Prometheus metrics exposed at `/metrics`:
1212
+
1213
+ | Metric | Type | Labels | Use For |
1214
+ |--------|------|--------|---------|
1215
+ | `tool_executions_total` | Counter | `tool`, `namespace`, `status` | Error rate, request volume |
1216
+ | `tool_execution_duration_seconds` | Histogram | `tool`, `namespace` | P50/P95/P99 latency |
1217
+ | `tool_cache_operations_total` | Counter | `tool`, `operation`, `result` | Cache hit rate |
1218
+ | `tool_retry_attempts_total` | Counter | `tool`, `attempt`, `success` | Retry frequency |
1219
+ | `tool_circuit_breaker_state` | Gauge | `tool` | Circuit health (0=CLOSED, 1=OPEN, 2=HALF_OPEN) |
1220
+ | `tool_circuit_breaker_failures_total` | Counter | `tool` | Failure count |
1221
+ | `tool_rate_limit_checks_total` | Counter | `tool`, `allowed` | Rate limit hits |
1222
+
1223
+ **Useful PromQL Queries**
1224
+
1225
+ ```promql
1226
+ # Error rate per tool (last 5 minutes)
1227
+ rate(tool_executions_total{status="error"}[5m])
1228
+ / rate(tool_executions_total[5m])
1229
+
1230
+ # P95 latency
1231
+ histogram_quantile(0.95, rate(tool_execution_duration_seconds_bucket[5m]))
1232
+
1233
+ # Cache hit rate
1234
+ rate(tool_cache_operations_total{result="hit"}[5m])
1235
+ / rate(tool_cache_operations_total{operation="lookup"}[5m])
1236
+
1237
+ # Tools currently circuit broken
1238
+ tool_circuit_breaker_state == 1
1239
+
1240
+ # Retry rate (how often tools need retries)
1241
+ rate(tool_retry_attempts_total{attempt!="0"}[5m])
1242
+ / rate(tool_executions_total[5m])
1243
+ ```
1244
+
1245
+ **Configuration**
1246
+
1247
+ Configure via environment variables:
1248
+
1249
+ ```bash
1250
+ # OTLP endpoint (where traces are sent)
1251
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
1252
+
1253
+ # Service name (shown in traces)
1254
+ export OTEL_SERVICE_NAME=production-api
1255
+
1256
+ # Sampling (reduce overhead in high-traffic scenarios)
1257
+ export OTEL_TRACES_SAMPLER=traceidratio
1258
+ export OTEL_TRACES_SAMPLER_ARG=0.1 # Sample 10% of traces
1259
+ ```
1260
+
1261
+ Or in code:
1262
+
1263
+ ```python
1264
+ status = setup_observability(
1265
+ service_name="my-service",
1266
+ enable_tracing=True,
1267
+ enable_metrics=True,
1268
+ metrics_port=9090,
1269
+ metrics_host="0.0.0.0" # Allow external Prometheus scraping
1270
+ )
1271
+
1272
+ # Check status
1273
+ if status["tracing_enabled"]:
1274
+ print("Traces exporting to OTLP endpoint")
1275
+ if status["metrics_server_started"]:
1276
+ print("Metrics available at http://localhost:9090/metrics")
1277
+ ```
1278
+
1279
+ **Production Integration**
1280
+
1281
+ **With Grafana + Prometheus:**
1282
+ ```yaml
1283
+ # prometheus.yml
1284
+ scrape_configs:
1285
+ - job_name: 'chuk-tool-processor'
1286
+ scrape_interval: 15s
1287
+ static_configs:
1288
+ - targets: ['app:9090']
1289
+ ```
1290
+
1291
+ **With OpenTelemetry Collector:**
1292
+ ```yaml
1293
+ # otel-collector-config.yaml
1294
+ receivers:
1295
+ otlp:
1296
+ protocols:
1297
+ grpc:
1298
+ endpoint: 0.0.0.0:4317
1299
+
1300
+ exporters:
1301
+ jaeger:
1302
+ endpoint: jaeger:14250
1303
+ prometheus:
1304
+ endpoint: 0.0.0.0:8889
1305
+
1306
+ service:
1307
+ pipelines:
1308
+ traces:
1309
+ receivers: [otlp]
1310
+ exporters: [jaeger]
1311
+ ```
1312
+
1313
+ **With Cloud Providers:**
1314
+ ```bash
1315
+ # AWS X-Ray
1316
+ export OTEL_TRACES_SAMPLER=xray
1317
+
1318
+ # Google Cloud Trace
1319
+ export OTEL_EXPORTER_OTLP_ENDPOINT=https://cloudtrace.googleapis.com/v1/projects/PROJECT_ID/traces
1320
+
1321
+ # Datadog
1322
+ export OTEL_EXPORTER_OTLP_ENDPOINT=http://datadog-agent:4317
1323
+ ```
1324
+
1325
+ **Why This Matters**
1326
+
1327
+ ❌ **Without telemetry:**
1328
+ - "Why is this tool slow?" → No idea
1329
+ - "Is caching helping?" → Guessing
1330
+ - "Did that retry work?" → Check logs manually
1331
+ - "Is the circuit breaker working?" → Hope so
1332
+ - "Which tool is failing?" → Debug blindly
1333
+
1334
+ ✅ **With telemetry:**
1335
+ - See exact execution timeline in Jaeger
1336
+ - Monitor cache hit rate in Grafana
1337
+ - Alert when retry rate spikes
1338
+ - Dashboard shows circuit breaker states
1339
+ - Metrics pinpoint the failing tool immediately
1340
+
1341
+ **Learn More**
1342
+
1343
+ 📖 **Complete Guide**: See [`OBSERVABILITY.md`](OBSERVABILITY.md) for:
1344
+ - Complete span and metric specifications
1345
+ - Architecture and implementation details
1346
+ - Integration guides (Jaeger, Grafana, OTEL Collector)
1347
+ - Testing observability features
1348
+ - Environment variable configuration
1349
+
1350
+ 🎯 **Working Example**: See `examples/observability_demo.py` for a complete demonstration with retries, caching, and circuit breakers
1351
+
1352
+ **Benefits**
1353
+
1354
+ ✅ **Drop-in** - One function call, zero code changes
1355
+ ✅ **Automatic** - All execution layers instrumented
1356
+ ✅ **Standard** - OTEL + Prometheus (works with existing tools)
1357
+ ✅ **Production-ready** - Ops teams get exactly what they expect
1358
+ ✅ **Optional** - Gracefully degrades if packages not installed
1359
+ ✅ **Zero-overhead** - No performance impact when disabled
1360
+
1073
1361
  ### Error Handling
1074
1362
 
1075
1363
  ```python
@@ -1300,6 +1588,7 @@ Check out the [`examples/`](examples/) directory for complete working examples:
1300
1588
  - **Execution strategies**: `examples/execution_strategies_demo.py` - InProcess vs Subprocess
1301
1589
  - **Production wrappers**: `examples/wrappers_demo.py` - Caching, retries, rate limiting
1302
1590
  - **Streaming tools**: `examples/streaming_demo.py` - Real-time incremental results
1591
+ - **Observability**: `examples/observability_demo.py` - OpenTelemetry + Prometheus integration
1303
1592
 
1304
1593
  ### MCP Integration (Real-World)
1305
1594
  - **Notion + OAuth**: `examples/notion_oauth.py` - Complete OAuth 2.1 flow with HTTP Streamable
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "chuk-tool-processor"
7
- version = "0.8"
7
+ version = "0.9.1"
8
8
  description = "Async-native framework for registering, discovering, and executing tools referenced in LLM responses"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -78,6 +78,13 @@ dev = [
78
78
  "coverage[toml]>=7.6.0",
79
79
  ]
80
80
 
81
+ observability = [
82
+ "opentelemetry-api>=1.28.0",
83
+ "opentelemetry-sdk>=1.28.0",
84
+ "opentelemetry-exporter-otlp>=1.28.0",
85
+ "prometheus-client>=0.21.0",
86
+ ]
87
+
81
88
  [tool.coverage.run]
82
89
  source = ["src"]
83
90
  omit = [