chuk-tool-processor 0.8__py3-none-any.whl → 0.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chuk-tool-processor might be problematic. Click here for more details.
- chuk_tool_processor/execution/wrappers/caching.py +38 -9
- chuk_tool_processor/execution/wrappers/circuit_breaker.py +29 -2
- chuk_tool_processor/execution/wrappers/rate_limiting.py +31 -1
- chuk_tool_processor/execution/wrappers/retry.py +81 -53
- chuk_tool_processor/mcp/setup_mcp_http_streamable.py +8 -1
- chuk_tool_processor/mcp/setup_mcp_sse.py +8 -1
- chuk_tool_processor/mcp/transport/http_streamable_transport.py +16 -3
- chuk_tool_processor/mcp/transport/sse_transport.py +16 -3
- chuk_tool_processor/observability/__init__.py +30 -0
- chuk_tool_processor/observability/metrics.py +312 -0
- chuk_tool_processor/observability/setup.py +105 -0
- chuk_tool_processor/observability/tracing.py +343 -0
- {chuk_tool_processor-0.8.dist-info → chuk_tool_processor-0.9.dist-info}/METADATA +291 -2
- {chuk_tool_processor-0.8.dist-info → chuk_tool_processor-0.9.dist-info}/RECORD +16 -12
- {chuk_tool_processor-0.8.dist-info → chuk_tool_processor-0.9.dist-info}/WHEEL +0 -0
- {chuk_tool_processor-0.8.dist-info → chuk_tool_processor-0.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OpenTelemetry tracing integration for chuk-tool-processor.
|
|
3
|
+
|
|
4
|
+
Provides drop-in distributed tracing with standardized span names and attributes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from contextlib import contextmanager
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
from chuk_tool_processor.logging import get_logger
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from opentelemetry.trace import Span, Tracer
|
|
16
|
+
|
|
17
|
+
logger = get_logger("chuk_tool_processor.observability.tracing")
|
|
18
|
+
|
|
19
|
+
# Global tracer instance
|
|
20
|
+
_tracer: Tracer | None = None
|
|
21
|
+
_tracing_enabled = False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def init_tracer(service_name: str = "chuk-tool-processor") -> Tracer | NoOpTracer:
    """
    Initialize the OpenTelemetry tracer and register it globally.

    Builds a ``TracerProvider`` whose resource carries ``service.name``,
    attaches a ``BatchSpanProcessor`` wrapping a default-constructed gRPC
    ``OTLPSpanExporter``, installs the provider as the global tracer provider
    and stores the resulting tracer in this module's ``_tracer`` global.

    Args:
        service_name: Service name for tracing

    Returns:
        Configured OpenTelemetry tracer, or a NoOpTracer if the OpenTelemetry
        packages are not installed or initialization fails for any other
        reason. Tracing is marked as disabled in both failure cases.
    """
    global _tracer, _tracing_enabled

    try:
        # Imported lazily so the package keeps working when the optional
        # OpenTelemetry dependencies are not installed.
        from opentelemetry import trace
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        # Create resource with service name
        resource = Resource.create({"service.name": service_name})

        # Create tracer provider
        provider = TracerProvider(resource=resource)

        # Add OTLP exporter (exports to OTEL collector)
        otlp_exporter = OTLPSpanExporter()
        provider.add_span_processor(BatchSpanProcessor(otlp_exporter))

        # Set as global tracer provider
        trace.set_tracer_provider(provider)

        _tracer = trace.get_tracer(__name__)
        _tracing_enabled = True

        logger.info(f"OpenTelemetry tracing initialized for service: {service_name}")
        return _tracer

    except ImportError as e:
        logger.warning(f"OpenTelemetry packages not installed: {e}. Tracing disabled.")
        _tracing_enabled = False
        return NoOpTracer()

    except Exception as e:
        # Observability is optional: a broken exporter/provider setup must not
        # take the host application down, so degrade to the no-op tracer as
        # the documented contract promises.
        logger.warning(f"OpenTelemetry tracer initialization failed: {e}. Tracing disabled.")
        _tracing_enabled = False
        return NoOpTracer()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def get_tracer() -> Tracer | NoOpTracer:
    """
    Return the module-level tracer.

    Returns:
        The tracer stored in ``_tracer``, or a fresh NoOpTracer when no tracer
        has been stored yet.
    """
    return NoOpTracer() if _tracer is None else _tracer
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def is_tracing_enabled() -> bool:
    """Report the current value of the module-level tracing flag."""
    return _tracing_enabled
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@contextmanager
def trace_tool_execution(
    tool: str,
    namespace: str | None = None,
    attributes: dict[str, Any] | None = None,
):
    """
    Trace a tool execution inside a "tool.execute" span.

    Yields the active span, or None when tracing is disabled or no tracer
    has been initialized.

    Args:
        tool: Tool name (recorded as "tool.name")
        namespace: Optional tool namespace (recorded as "tool.namespace")
        attributes: Extra span attributes; each key is prefixed with "tool."

    Example:
        with trace_tool_execution("calculator", attributes={"operation": "add"}):
            result = await tool.execute(a=5, b=3)
    """
    if _tracing_enabled and _tracer is not None:
        attrs: dict[str, str | int | float | bool] = {"tool.name": tool}

        if namespace:
            attrs["tool.namespace"] = namespace

        for key, value in (attributes or {}).items():
            # Primitive values are passed through; anything else is stringified.
            primitive = isinstance(value, (str, int, float, bool))
            attrs[f"tool.{key}"] = value if primitive else str(value)

        with _tracer.start_as_current_span("tool.execute", attributes=attrs) as span:
            yield span
    else:
        yield None
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@contextmanager
def trace_cache_operation(
    operation: str,
    tool: str,
    hit: bool | None = None,
    attributes: dict[str, Any] | None = None,
):
    """
    Trace a cache operation inside a "tool.cache.<operation>" span.

    Yields the active span, or None when tracing is disabled or no tracer
    has been initialized.

    Args:
        operation: Cache operation (lookup, set, invalidate)
        tool: Tool name
        hit: Whether the lookup hit the cache; omitted from the span when None
        attributes: Extra span attributes; each key is prefixed with "cache."

    Example:
        with trace_cache_operation("lookup", "calculator", hit=True):
            result = await cache.get(tool, key)
    """
    if _tracing_enabled and _tracer is not None:
        attrs: dict[str, str | int | float | bool] = {
            "tool.name": tool,
            "cache.operation": operation,
        }

        if hit is not None:
            attrs["cache.hit"] = hit

        for key, value in (attributes or {}).items():
            # Primitive values are passed through; anything else is stringified.
            primitive = isinstance(value, (str, int, float, bool))
            attrs[f"cache.{key}"] = value if primitive else str(value)

        with _tracer.start_as_current_span(f"tool.cache.{operation}", attributes=attrs) as span:
            yield span
    else:
        yield None
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
@contextmanager
def trace_retry_attempt(
    tool: str,
    attempt: int,
    max_retries: int,
    attributes: dict[str, Any] | None = None,
):
    """
    Trace a single retry attempt inside a "tool.retry.attempt" span.

    Yields the active span, or None when tracing is disabled or no tracer
    has been initialized.

    Args:
        tool: Tool name
        attempt: Current attempt number (0-indexed)
        max_retries: Maximum retry attempts (recorded as "retry.max_attempts")
        attributes: Extra span attributes; each key is prefixed with "retry."

    Example:
        with trace_retry_attempt("api_tool", attempt=1, max_retries=3):
            result = await executor.execute([call])
    """
    if _tracing_enabled and _tracer is not None:
        attrs: dict[str, str | int | float | bool] = {
            "tool.name": tool,
            "retry.attempt": attempt,
            "retry.max_attempts": max_retries,
        }

        for key, value in (attributes or {}).items():
            # Primitive values are passed through; anything else is stringified.
            primitive = isinstance(value, (str, int, float, bool))
            attrs[f"retry.{key}"] = value if primitive else str(value)

        with _tracer.start_as_current_span("tool.retry.attempt", attributes=attrs) as span:
            yield span
    else:
        yield None
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
@contextmanager
def trace_circuit_breaker(
    tool: str,
    state: str,
    attributes: dict[str, Any] | None = None,
):
    """
    Trace a circuit breaker check inside a "tool.circuit_breaker.check" span.

    Yields the active span, or None when tracing is disabled or no tracer
    has been initialized.

    Args:
        tool: Tool name
        state: Circuit breaker state (CLOSED, OPEN, HALF_OPEN)
        attributes: Extra span attributes; each key is prefixed with "circuit."

    Example:
        with trace_circuit_breaker("api_tool", state="OPEN"):
            can_execute = await breaker.can_execute()
    """
    if _tracing_enabled and _tracer is not None:
        attrs: dict[str, str | int | float | bool] = {
            "tool.name": tool,
            "circuit.state": state,
        }

        for key, value in (attributes or {}).items():
            # Primitive values are passed through; anything else is stringified.
            primitive = isinstance(value, (str, int, float, bool))
            attrs[f"circuit.{key}"] = value if primitive else str(value)

        with _tracer.start_as_current_span("tool.circuit_breaker.check", attributes=attrs) as span:
            yield span
    else:
        yield None
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
@contextmanager
def trace_rate_limit(
    tool: str,
    allowed: bool,
    attributes: dict[str, Any] | None = None,
):
    """
    Trace a rate limit check inside a "tool.rate_limit.check" span.

    Yields the active span, or None when tracing is disabled or no tracer
    has been initialized.

    Args:
        tool: Tool name
        allowed: Whether request was allowed
        attributes: Extra span attributes; each key is prefixed with "rate_limit."

    Example:
        with trace_rate_limit("api_tool", allowed=True):
            await rate_limiter.acquire()
    """
    if _tracing_enabled and _tracer is not None:
        attrs: dict[str, str | int | float | bool] = {
            "tool.name": tool,
            "rate_limit.allowed": allowed,
        }

        for key, value in (attributes or {}).items():
            # Primitive values are passed through; anything else is stringified.
            primitive = isinstance(value, (str, int, float, bool))
            attrs[f"rate_limit.{key}"] = value if primitive else str(value)

        with _tracer.start_as_current_span("tool.rate_limit.check", attributes=attrs) as span:
            yield span
    else:
        yield None
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def add_span_event(span: Span | None, name: str, attributes: dict[str, Any] | None = None) -> None:
    """
    Attach a named event to the given span, best-effort.

    Does nothing when ``span`` is None or tracing is disabled. Any exception
    raised while adding the event is logged at debug level and suppressed.

    Args:
        span: Span to add event to (can be None)
        name: Event name
        attributes: Event attributes; an empty dict is used when omitted
    """
    if span is not None and _tracing_enabled:
        event_attributes = attributes or {}
        try:
            span.add_event(name, attributes=event_attributes)
        except Exception as e:
            logger.debug(f"Error adding span event: {e}")
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def set_span_error(span: Span | None, error: Exception | str) -> None:
    """
    Flag the span as failed and attach the error details, best-effort.

    Does nothing when ``span`` is None or tracing is disabled. The span status
    is set to ERROR with ``str(error)`` as description; an Exception is then
    recorded via ``record_exception``, while a plain string is attached as an
    "error" event. Any exception raised along the way is logged at debug
    level and suppressed.

    Args:
        span: Span to mark as error (can be None)
        error: Error to record
    """
    if span is None or not _tracing_enabled:
        return

    try:
        from opentelemetry.trace import Status, StatusCode

        span.set_status(Status(StatusCode.ERROR, str(error)))

        if not isinstance(error, Exception):
            span.add_event("error", {"error.message": str(error)})
        else:
            span.record_exception(error)

    except Exception as e:
        logger.debug(f"Error setting span error: {e}")
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
class NoOpTracer:
    """No-op tracer when OpenTelemetry is not available."""

    @contextmanager
    def start_as_current_span(self, *_args: Any, **_kwargs: Any):
        """
        No-op span context manager.

        Accepts and ignores any positional or keyword arguments, so call
        sites written against a real tracer (for example passing ``name=``
        as a keyword, or extra positional arguments) do not raise
        ``TypeError`` when tracing is unavailable.

        Yields:
            None, always; callers must handle a missing span.
        """
        yield None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: chuk-tool-processor
|
|
3
|
-
Version: 0.8
|
|
3
|
+
Version: 0.9
|
|
4
4
|
Summary: Async-native framework for registering, discovering, and executing tools referenced in LLM responses
|
|
5
5
|
Author-email: CHUK Team <chrishayuk@somejunkmailbox.com>
|
|
6
6
|
Maintainer-email: CHUK Team <chrishayuk@somejunkmailbox.com>
|
|
@@ -184,7 +184,7 @@ asyncio.run(main())
|
|
|
184
184
|
| 🔌 **Connect to external tools** | MCP integration (HTTP/STDIO/SSE) | [MCP Integration](#5-mcp-integration-external-tools) |
|
|
185
185
|
| 🛡️ **Production deployment** | Timeouts, retries, rate limits, caching | [Production Configuration](#using-the-processor) |
|
|
186
186
|
| 🔒 **Run untrusted code safely** | Subprocess isolation strategy | [Subprocess Strategy](#using-subprocess-strategy) |
|
|
187
|
-
| 📊 **Monitor and observe** |
|
|
187
|
+
| 📊 **Monitor and observe** | OpenTelemetry + Prometheus | [Observability](#opentelemetry--prometheus-drop-in-observability) |
|
|
188
188
|
| 🌊 **Stream incremental results** | StreamingTool pattern | [StreamingTool](#streamingtool-real-time-results) |
|
|
189
189
|
|
|
190
190
|
### Real-World Quick Start
|
|
@@ -1098,6 +1098,294 @@ async def main():
|
|
|
1098
1098
|
asyncio.run(main())
|
|
1099
1099
|
```
|
|
1100
1100
|
|
|
1101
|
+
#### OpenTelemetry & Prometheus (Drop-in Observability)
|
|
1102
|
+
|
|
1103
|
+
**Why Telemetry Matters**: In production, you need to know *what* your tools are doing, *how long* they take, *when* they fail, and *why*. CHUK Tool Processor provides **enterprise-grade telemetry** that operations teams expect—with zero manual instrumentation.
|
|
1104
|
+
|
|
1105
|
+
**One function call. Full observability.**
|
|
1106
|
+
|
|
1107
|
+
```python
|
|
1108
|
+
from chuk_tool_processor.observability import setup_observability
|
|
1109
|
+
|
|
1110
|
+
# Enable everything
|
|
1111
|
+
setup_observability(
|
|
1112
|
+
service_name="my-tool-service",
|
|
1113
|
+
enable_tracing=True, # OpenTelemetry distributed tracing
|
|
1114
|
+
enable_metrics=True, # Prometheus metrics endpoint
|
|
1115
|
+
metrics_port=9090 # HTTP endpoint at :9090/metrics
|
|
1116
|
+
)
|
|
1117
|
+
|
|
1118
|
+
# Every tool execution is now automatically traced and metered!
|
|
1119
|
+
```
|
|
1120
|
+
|
|
1121
|
+
**What You Get (Automatically)**
|
|
1122
|
+
|
|
1123
|
+
✅ **Distributed Traces** - Understand exactly what happened in each tool call
|
|
1124
|
+
- See the complete execution timeline for every tool
|
|
1125
|
+
- Track retries, cache hits, circuit breaker state changes
|
|
1126
|
+
- Correlate failures across your system
|
|
1127
|
+
- Export to Jaeger, Zipkin, or any OTLP-compatible backend
|
|
1128
|
+
|
|
1129
|
+
✅ **Production Metrics** - Monitor health and performance in real-time
|
|
1130
|
+
- Track error rates, latency percentiles (P50/P95/P99)
|
|
1131
|
+
- Monitor cache hit rates and retry attempts
|
|
1132
|
+
- Alert on circuit breaker opens and rate limit hits
|
|
1133
|
+
- Export to Prometheus, Grafana, or any metrics backend
|
|
1134
|
+
|
|
1135
|
+
✅ **Zero Configuration** - Works out of the box
|
|
1136
|
+
- No manual instrumentation needed
|
|
1137
|
+
- No code changes to existing tools
|
|
1138
|
+
- Gracefully degrades if packages not installed
|
|
1139
|
+
- Standard OTEL and Prometheus formats
|
|
1140
|
+
|
|
1141
|
+
**Installation**
|
|
1142
|
+
|
|
1143
|
+
```bash
|
|
1144
|
+
# Install observability dependencies
|
|
1145
|
+
pip install chuk-tool-processor[observability]
|
|
1146
|
+
|
|
1147
|
+
# Or manually
|
|
1148
|
+
pip install opentelemetry-api opentelemetry-sdk opentelemetry-exporter-otlp prometheus-client
|
|
1149
|
+
|
|
1150
|
+
# Or with uv (recommended)
|
|
1151
|
+
uv pip install chuk-tool-processor --group observability
|
|
1152
|
+
```
|
|
1153
|
+
|
|
1154
|
+
**Quick Start: See Your Tools in Action**
|
|
1155
|
+
|
|
1156
|
+
```python
|
|
1157
|
+
import asyncio
|
|
1158
|
+
from chuk_tool_processor.observability import setup_observability
|
|
1159
|
+
from chuk_tool_processor.core.processor import ToolProcessor
|
|
1160
|
+
from chuk_tool_processor.registry import initialize, register_tool
|
|
1161
|
+
|
|
1162
|
+
@register_tool(name="weather_api")
|
|
1163
|
+
class WeatherTool:
|
|
1164
|
+
async def execute(self, location: str) -> dict:
|
|
1165
|
+
# Simulating API call
|
|
1166
|
+
return {"temperature": 72, "conditions": "sunny", "location": location}
|
|
1167
|
+
|
|
1168
|
+
async def main():
|
|
1169
|
+
# 1. Enable observability (one line!)
|
|
1170
|
+
setup_observability(
|
|
1171
|
+
service_name="weather-service",
|
|
1172
|
+
enable_tracing=True,
|
|
1173
|
+
enable_metrics=True,
|
|
1174
|
+
metrics_port=9090
|
|
1175
|
+
)
|
|
1176
|
+
|
|
1177
|
+
# 2. Create processor with production features
|
|
1178
|
+
await initialize()
|
|
1179
|
+
processor = ToolProcessor(
|
|
1180
|
+
enable_caching=True, # Cache expensive API calls
|
|
1181
|
+
enable_retries=True, # Auto-retry on failures
|
|
1182
|
+
enable_circuit_breaker=True, # Prevent cascading failures
|
|
1183
|
+
enable_rate_limiting=True, # Prevent API abuse
|
|
1184
|
+
)
|
|
1185
|
+
|
|
1186
|
+
# 3. Execute tools - automatically traced and metered
|
|
1187
|
+
results = await processor.process(
|
|
1188
|
+
'<tool name="weather_api" args=\'{"location": "San Francisco"}\'/>'
|
|
1189
|
+
)
|
|
1190
|
+
|
|
1191
|
+
print(f"Result: {results[0].result}")
|
|
1192
|
+
print(f"Duration: {results[0].duration}s")
|
|
1193
|
+
print(f"Cached: {results[0].cached}")
|
|
1194
|
+
|
|
1195
|
+
asyncio.run(main())
|
|
1196
|
+
```
|
|
1197
|
+
|
|
1198
|
+
**View Your Data**
|
|
1199
|
+
|
|
1200
|
+
```bash
|
|
1201
|
+
# Start Jaeger for trace visualization
|
|
1202
|
+
docker run -d -p 4317:4317 -p 16686:16686 jaegertracing/all-in-one:latest
|
|
1203
|
+
|
|
1204
|
+
# Start your application
|
|
1205
|
+
python your_app.py
|
|
1206
|
+
|
|
1207
|
+
# View distributed traces
|
|
1208
|
+
open http://localhost:16686
|
|
1209
|
+
|
|
1210
|
+
# View Prometheus metrics
|
|
1211
|
+
curl http://localhost:9090/metrics | grep tool_
|
|
1212
|
+
```
|
|
1213
|
+
|
|
1214
|
+
**What Gets Traced (Automatic Spans)**
|
|
1215
|
+
|
|
1216
|
+
Every execution layer creates standardized OpenTelemetry spans:
|
|
1217
|
+
|
|
1218
|
+
| Span Name | When Created | Key Attributes |
|
|
1219
|
+
|-----------|--------------|----------------|
|
|
1220
|
+
| `tool.execute` | Every tool execution | `tool.name`, `tool.namespace`, `tool.duration_ms`, `tool.cached`, `tool.error`, `tool.success` |
|
|
1221
|
+
| `tool.cache.lookup` | Cache lookup | `cache.hit` (true/false), `cache.operation=lookup` |
|
|
1222
|
+
| `tool.cache.set` | Cache write | `cache.ttl`, `cache.operation=set` |
|
|
1223
|
+
| `tool.retry.attempt` | Each retry | `retry.attempt`, `retry.max_attempts`, `retry.success` |
|
|
1224
|
+
| `tool.circuit_breaker.check` | Circuit state check | `circuit.state` (CLOSED/OPEN/HALF_OPEN) |
|
|
1225
|
+
| `tool.rate_limit.check` | Rate limit check | `rate_limit.allowed` (true/false) |
|
|
1226
|
+
|
|
1227
|
+
**Example trace hierarchy:**
|
|
1228
|
+
```
|
|
1229
|
+
tool.execute (weather_api)
|
|
1230
|
+
├── tool.cache.lookup (miss)
|
|
1231
|
+
├── tool.retry.attempt (0)
|
|
1232
|
+
│ └── tool.execute (actual API call)
|
|
1233
|
+
├── tool.retry.attempt (1) [if first failed]
|
|
1234
|
+
└── tool.cache.set (store result)
|
|
1235
|
+
```
|
|
1236
|
+
|
|
1237
|
+
**What Gets Metered (Automatic Metrics)**
|
|
1238
|
+
|
|
1239
|
+
Standard Prometheus metrics exposed at `/metrics`:
|
|
1240
|
+
|
|
1241
|
+
| Metric | Type | Labels | Use For |
|
|
1242
|
+
|--------|------|--------|---------|
|
|
1243
|
+
| `tool_executions_total` | Counter | `tool`, `namespace`, `status` | Error rate, request volume |
|
|
1244
|
+
| `tool_execution_duration_seconds` | Histogram | `tool`, `namespace` | P50/P95/P99 latency |
|
|
1245
|
+
| `tool_cache_operations_total` | Counter | `tool`, `operation`, `result` | Cache hit rate |
|
|
1246
|
+
| `tool_retry_attempts_total` | Counter | `tool`, `attempt`, `success` | Retry frequency |
|
|
1247
|
+
| `tool_circuit_breaker_state` | Gauge | `tool` | Circuit health (0=CLOSED, 1=OPEN, 2=HALF_OPEN) |
|
|
1248
|
+
| `tool_circuit_breaker_failures_total` | Counter | `tool` | Failure count |
|
|
1249
|
+
| `tool_rate_limit_checks_total` | Counter | `tool`, `allowed` | Rate limit hits |
|
|
1250
|
+
|
|
1251
|
+
**Useful PromQL Queries**
|
|
1252
|
+
|
|
1253
|
+
```promql
|
|
1254
|
+
# Error rate per tool (last 5 minutes)
|
|
1255
|
+
sum by (tool) (rate(tool_executions_total{status="error"}[5m]))
|
|
1256
|
+
/ sum by (tool) (rate(tool_executions_total[5m]))
|
|
1257
|
+
|
|
1258
|
+
# P95 latency
|
|
1259
|
+
histogram_quantile(0.95, rate(tool_execution_duration_seconds_bucket[5m]))
|
|
1260
|
+
|
|
1261
|
+
# Cache hit rate
|
|
1262
|
+
sum by (tool) (rate(tool_cache_operations_total{result="hit"}[5m]))
|
|
1263
|
+
/ sum by (tool) (rate(tool_cache_operations_total{operation="lookup"}[5m]))
|
|
1264
|
+
|
|
1265
|
+
# Tools currently circuit broken
|
|
1266
|
+
tool_circuit_breaker_state == 1
|
|
1267
|
+
|
|
1268
|
+
# Retry rate (how often tools need retries)
|
|
1269
|
+
sum by (tool) (rate(tool_retry_attempts_total{attempt!="0"}[5m]))
|
|
1270
|
+
/ sum by (tool) (rate(tool_executions_total[5m]))
|
|
1271
|
+
```
|
|
1272
|
+
|
|
1273
|
+
**Configuration**
|
|
1274
|
+
|
|
1275
|
+
Configure via environment variables:
|
|
1276
|
+
|
|
1277
|
+
```bash
|
|
1278
|
+
# OTLP endpoint (where traces are sent)
|
|
1279
|
+
export OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
|
1280
|
+
|
|
1281
|
+
# Service name (shown in traces)
|
|
1282
|
+
export OTEL_SERVICE_NAME=production-api
|
|
1283
|
+
|
|
1284
|
+
# Sampling (reduce overhead in high-traffic scenarios)
|
|
1285
|
+
export OTEL_TRACES_SAMPLER=traceidratio
|
|
1286
|
+
export OTEL_TRACES_SAMPLER_ARG=0.1 # Sample 10% of traces
|
|
1287
|
+
```
|
|
1288
|
+
|
|
1289
|
+
Or in code:
|
|
1290
|
+
|
|
1291
|
+
```python
|
|
1292
|
+
status = setup_observability(
|
|
1293
|
+
service_name="my-service",
|
|
1294
|
+
enable_tracing=True,
|
|
1295
|
+
enable_metrics=True,
|
|
1296
|
+
metrics_port=9090,
|
|
1297
|
+
metrics_host="0.0.0.0" # Allow external Prometheus scraping
|
|
1298
|
+
)
|
|
1299
|
+
|
|
1300
|
+
# Check status
|
|
1301
|
+
if status["tracing_enabled"]:
|
|
1302
|
+
print("Traces exporting to OTLP endpoint")
|
|
1303
|
+
if status["metrics_server_started"]:
|
|
1304
|
+
print("Metrics available at http://localhost:9090/metrics")
|
|
1305
|
+
```
|
|
1306
|
+
|
|
1307
|
+
**Production Integration**
|
|
1308
|
+
|
|
1309
|
+
**With Grafana + Prometheus:**
|
|
1310
|
+
```yaml
|
|
1311
|
+
# prometheus.yml
|
|
1312
|
+
scrape_configs:
|
|
1313
|
+
- job_name: 'chuk-tool-processor'
|
|
1314
|
+
scrape_interval: 15s
|
|
1315
|
+
static_configs:
|
|
1316
|
+
- targets: ['app:9090']
|
|
1317
|
+
```
|
|
1318
|
+
|
|
1319
|
+
**With OpenTelemetry Collector:**
|
|
1320
|
+
```yaml
|
|
1321
|
+
# otel-collector-config.yaml
|
|
1322
|
+
receivers:
|
|
1323
|
+
otlp:
|
|
1324
|
+
protocols:
|
|
1325
|
+
grpc:
|
|
1326
|
+
endpoint: 0.0.0.0:4317
|
|
1327
|
+
|
|
1328
|
+
exporters:
|
|
1329
|
+
jaeger:
|
|
1330
|
+
endpoint: jaeger:14250
|
|
1331
|
+
prometheus:
|
|
1332
|
+
endpoint: 0.0.0.0:8889
|
|
1333
|
+
|
|
1334
|
+
service:
|
|
1335
|
+
pipelines:
|
|
1336
|
+
traces:
|
|
1337
|
+
receivers: [otlp]
|
|
1338
|
+
exporters: [jaeger]
|
|
1339
|
+
```
|
|
1340
|
+
|
|
1341
|
+
**With Cloud Providers:**
|
|
1342
|
+
```bash
|
|
1343
|
+
# AWS X-Ray
|
|
1344
|
+
export OTEL_TRACES_SAMPLER=xray
|
|
1345
|
+
|
|
1346
|
+
# Google Cloud Trace
|
|
1347
|
+
export OTEL_EXPORTER_OTLP_ENDPOINT=https://cloudtrace.googleapis.com/v1/projects/PROJECT_ID/traces
|
|
1348
|
+
|
|
1349
|
+
# Datadog
|
|
1350
|
+
export OTEL_EXPORTER_OTLP_ENDPOINT=http://datadog-agent:4317
|
|
1351
|
+
```
|
|
1352
|
+
|
|
1353
|
+
**Why This Matters**
|
|
1354
|
+
|
|
1355
|
+
❌ **Without telemetry:**
|
|
1356
|
+
- "Why is this tool slow?" → No idea
|
|
1357
|
+
- "Is caching helping?" → Guessing
|
|
1358
|
+
- "Did that retry work?" → Check logs manually
|
|
1359
|
+
- "Is the circuit breaker working?" → Hope so
|
|
1360
|
+
- "Which tool is failing?" → Debug blindly
|
|
1361
|
+
|
|
1362
|
+
✅ **With telemetry:**
|
|
1363
|
+
- See exact execution timeline in Jaeger
|
|
1364
|
+
- Monitor cache hit rate in Grafana
|
|
1365
|
+
- Alert when retry rate spikes
|
|
1366
|
+
- Dashboard shows circuit breaker states
|
|
1367
|
+
- Metrics pinpoint the failing tool immediately
|
|
1368
|
+
|
|
1369
|
+
**Learn More**
|
|
1370
|
+
|
|
1371
|
+
📖 **Complete Guide**: See [`OBSERVABILITY.md`](OBSERVABILITY.md) for:
|
|
1372
|
+
- Complete span and metric specifications
|
|
1373
|
+
- Architecture and implementation details
|
|
1374
|
+
- Integration guides (Jaeger, Grafana, OTEL Collector)
|
|
1375
|
+
- Testing observability features
|
|
1376
|
+
- Environment variable configuration
|
|
1377
|
+
|
|
1378
|
+
🎯 **Working Example**: See `examples/observability_demo.py` for a complete demonstration with retries, caching, and circuit breakers
|
|
1379
|
+
|
|
1380
|
+
**Benefits**
|
|
1381
|
+
|
|
1382
|
+
✅ **Drop-in** - One function call, zero code changes
|
|
1383
|
+
✅ **Automatic** - All execution layers instrumented
|
|
1384
|
+
✅ **Standard** - OTEL + Prometheus (works with existing tools)
|
|
1385
|
+
✅ **Production-ready** - Ops teams get exactly what they expect
|
|
1386
|
+
✅ **Optional** - Gracefully degrades if packages not installed
|
|
1387
|
+
✅ **Zero-overhead** - No performance impact when disabled
|
|
1388
|
+
|
|
1101
1389
|
### Error Handling
|
|
1102
1390
|
|
|
1103
1391
|
```python
|
|
@@ -1328,6 +1616,7 @@ Check out the [`examples/`](examples/) directory for complete working examples:
|
|
|
1328
1616
|
- **Execution strategies**: `examples/execution_strategies_demo.py` - InProcess vs Subprocess
|
|
1329
1617
|
- **Production wrappers**: `examples/wrappers_demo.py` - Caching, retries, rate limiting
|
|
1330
1618
|
- **Streaming tools**: `examples/streaming_demo.py` - Real-time incremental results
|
|
1619
|
+
- **Observability**: `examples/observability_demo.py` - OpenTelemetry + Prometheus integration
|
|
1331
1620
|
|
|
1332
1621
|
### MCP Integration (Real-World)
|
|
1333
1622
|
- **Notion + OAuth**: `examples/notion_oauth.py` - Complete OAuth 2.1 flow with HTTP Streamable
|