chuk-tool-processor 0.7.0__py3-none-any.whl → 0.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chuk-tool-processor might be problematic.
- chuk_tool_processor/__init__.py +114 -0
- chuk_tool_processor/core/__init__.py +31 -0
- chuk_tool_processor/core/exceptions.py +218 -12
- chuk_tool_processor/core/processor.py +391 -43
- chuk_tool_processor/execution/wrappers/__init__.py +42 -0
- chuk_tool_processor/execution/wrappers/caching.py +43 -10
- chuk_tool_processor/execution/wrappers/circuit_breaker.py +370 -0
- chuk_tool_processor/execution/wrappers/rate_limiting.py +31 -1
- chuk_tool_processor/execution/wrappers/retry.py +93 -53
- chuk_tool_processor/logging/__init__.py +5 -8
- chuk_tool_processor/logging/context.py +2 -5
- chuk_tool_processor/mcp/__init__.py +3 -0
- chuk_tool_processor/mcp/mcp_tool.py +8 -3
- chuk_tool_processor/mcp/models.py +87 -0
- chuk_tool_processor/mcp/setup_mcp_http_streamable.py +38 -2
- chuk_tool_processor/mcp/setup_mcp_sse.py +38 -2
- chuk_tool_processor/mcp/setup_mcp_stdio.py +92 -12
- chuk_tool_processor/mcp/stream_manager.py +109 -6
- chuk_tool_processor/mcp/transport/http_streamable_transport.py +18 -5
- chuk_tool_processor/mcp/transport/sse_transport.py +16 -3
- chuk_tool_processor/models/__init__.py +20 -0
- chuk_tool_processor/models/tool_call.py +34 -1
- chuk_tool_processor/models/tool_export_mixin.py +4 -4
- chuk_tool_processor/models/tool_spec.py +350 -0
- chuk_tool_processor/models/validated_tool.py +22 -2
- chuk_tool_processor/observability/__init__.py +30 -0
- chuk_tool_processor/observability/metrics.py +312 -0
- chuk_tool_processor/observability/setup.py +105 -0
- chuk_tool_processor/observability/tracing.py +346 -0
- chuk_tool_processor/py.typed +0 -0
- chuk_tool_processor/registry/interface.py +7 -7
- chuk_tool_processor/registry/providers/__init__.py +2 -1
- chuk_tool_processor/registry/tool_export.py +1 -6
- chuk_tool_processor-0.10.dist-info/METADATA +2326 -0
- chuk_tool_processor-0.10.dist-info/RECORD +69 -0
- chuk_tool_processor-0.7.0.dist-info/METADATA +0 -1230
- chuk_tool_processor-0.7.0.dist-info/RECORD +0 -61
- {chuk_tool_processor-0.7.0.dist-info → chuk_tool_processor-0.10.dist-info}/WHEEL +0 -0
- {chuk_tool_processor-0.7.0.dist-info → chuk_tool_processor-0.10.dist-info}/top_level.txt +0 -0
chuk_tool_processor/execution/wrappers/caching.py

@@ -29,6 +29,24 @@ from chuk_tool_processor.models.tool_result import ToolResult
 
 logger = get_logger("chuk_tool_processor.execution.wrappers.caching")
 
+# Optional observability imports
+try:
+    from chuk_tool_processor.observability.metrics import get_metrics
+    from chuk_tool_processor.observability.tracing import trace_cache_operation
+
+    _observability_available = True
+except ImportError:
+    _observability_available = False
+
+    # No-op functions when observability not available
+    def get_metrics():
+        return None
+
+    def trace_cache_operation(*_args, **_kwargs):
+        from contextlib import nullcontext
+
+        return nullcontext()
+
 
 # --------------------------------------------------------------------------- #
 # Cache primitives
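The fallback trace_cache_operation above returns contextlib.nullcontext(), so call sites can keep using it as a context manager whether or not the observability extras are installed. A minimal sketch of that degradation path, assuming nothing beyond the async cache.get(tool, key) call shown in the hunks below:

    # Sketch: the same "with" statement works with the real tracer or the no-op fallback.
    from contextlib import nullcontext

    def trace_cache_operation(*_args, **_kwargs):
        return nullcontext()  # no-op stand-in when the observability extra is missing

    async def lookup(cache, tool: str, key: str):
        with trace_cache_operation("lookup", tool):
            return await cache.get(tool, key)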
@@ -428,8 +446,17 @@ class CachingToolExecutor:
                 uncached.append((idx, call))
                 continue
 
-            … (2 removed lines not captured in this diff view)
+            # Use idempotency_key if available, otherwise hash arguments
+            cache_key = call.idempotency_key or self._hash_arguments(call.arguments)
+
+            # Trace cache lookup operation
+            with trace_cache_operation("lookup", call.tool):
+                cached_val = await self.cache.get(call.tool, cache_key)
+
+            # Record metrics
+            metrics = get_metrics()
+            if metrics:
+                metrics.record_cache_operation(call.tool, "lookup", hit=(cached_val is not None))
 
             if cached_val is None:
                 # Cache miss
@@ -480,19 +507,25 @@ class CachingToolExecutor:
             # ------------------------------------------------------------------
             if use_cache:
                 cache_tasks = []
+                metrics = get_metrics()
+
                 for (_idx, call), result in zip(uncached, uncached_results, strict=False):
                     if result.error is None and self._is_cacheable(call.tool):
                         ttl = self._ttl_for(call.tool)
                         logger.debug(f"Caching result for {call.tool} with TTL={ttl}s")
 
-                        # … (8 removed lines not captured in this diff view)
+                        # Use idempotency_key if available, otherwise hash arguments
+                        cache_key = call.idempotency_key or self._hash_arguments(call.arguments)
+
+                        # Trace and record cache set operation
+                        # Bind loop variables to avoid B023 error
+                        async def cache_with_trace(tool=call.tool, key=cache_key, value=result.result, ttl_val=ttl):
+                            with trace_cache_operation("set", tool, attributes={"ttl": ttl_val}):
+                                await self.cache.set(tool, key, value, ttl=ttl_val)
+                            if metrics:
+                                metrics.record_cache_operation(tool, "set")
+
+                        cache_tasks.append(cache_with_trace())
 
                     # Flag as non-cached so callers can tell
                     if hasattr(result, "cached"):
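The cache_with_trace coroutine binds call.tool, cache_key, result.result, and ttl as default arguments because Python closures capture loop variables by reference, not by value; without the binding, every task queued in the loop would see the values from the last iteration (the late-binding bug that the B023 lint rule flags). A standalone illustration of the difference (not part of the package):

    import asyncio

    async def main() -> None:
        coros = []
        for name in ["alpha", "beta"]:
            async def buggy():
                return name              # late binding: resolves when awaited, after the loop ends

            async def fixed(name=name):
                return name              # default argument freezes the value per iteration

            coros.append((buggy(), fixed()))

        for buggy_coro, fixed_coro in coros:
            print(await buggy_coro, await fixed_coro)
        # Prints "beta alpha" then "beta beta": only the bound version kept each iteration's value.

    asyncio.run(main())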
chuk_tool_processor/execution/wrappers/circuit_breaker.py (new file)

@@ -0,0 +1,370 @@
+# chuk_tool_processor/execution/wrappers/circuit_breaker.py
+"""
+Circuit breaker pattern for tool execution.
+
+Prevents cascading failures by tracking failure rates and temporarily
+blocking calls to failing tools. Implements a state machine:
+
+    CLOSED → OPEN → HALF_OPEN → CLOSED (or back to OPEN)
+
+States:
+- CLOSED: Normal operation, requests pass through
+- OPEN: Too many failures, requests blocked immediately
+- HALF_OPEN: Testing if service recovered, limited requests allowed
+"""
+
+from __future__ import annotations
+
+import asyncio
+import time
+from datetime import UTC, datetime
+from enum import Enum
+from typing import Any
+
+from chuk_tool_processor.core.exceptions import ToolCircuitOpenError
+from chuk_tool_processor.logging import get_logger
+from chuk_tool_processor.models.tool_call import ToolCall
+from chuk_tool_processor.models.tool_result import ToolResult
+
+logger = get_logger("chuk_tool_processor.execution.wrappers.circuit_breaker")
+
+# Optional observability imports
+try:
+    from chuk_tool_processor.observability.metrics import get_metrics
+    from chuk_tool_processor.observability.tracing import trace_circuit_breaker
+
+    _observability_available = True
+except ImportError:
+    _observability_available = False
+
+    # No-op functions when observability not available
+    def get_metrics():
+        return None
+
+    def trace_circuit_breaker(*_args, **_kwargs):
+        from contextlib import nullcontext
+
+        return nullcontext()
+
+
+# --------------------------------------------------------------------------- #
+# Circuit breaker state
+# --------------------------------------------------------------------------- #
+class CircuitState(str, Enum):
+    """Circuit breaker states."""
+
+    CLOSED = "closed"  # Normal operation
+    OPEN = "open"  # Blocking requests due to failures
+    HALF_OPEN = "half_open"  # Testing recovery with limited requests
+
+
+class CircuitBreakerConfig:
+    """Configuration for circuit breaker behavior."""
+
+    def __init__(
+        self,
+        failure_threshold: int = 5,
+        success_threshold: int = 2,
+        reset_timeout: float = 60.0,
+        half_open_max_calls: int = 1,
+        timeout_threshold: float | None = None,
+    ):
+        """
+        Initialize circuit breaker configuration.
+
+        Args:
+            failure_threshold: Number of failures before opening circuit
+            success_threshold: Number of successes in HALF_OPEN to close circuit
+            reset_timeout: Seconds to wait before trying HALF_OPEN
+            half_open_max_calls: Max concurrent calls in HALF_OPEN state
+            timeout_threshold: Optional timeout (s) to consider as failure
+        """
+        self.failure_threshold = failure_threshold
+        self.success_threshold = success_threshold
+        self.reset_timeout = reset_timeout
+        self.half_open_max_calls = half_open_max_calls
+        self.timeout_threshold = timeout_threshold
+
+
+class CircuitBreakerState:
+    """Per-tool circuit breaker state tracking."""
+
+    def __init__(self, config: CircuitBreakerConfig):
+        self.config = config
+        self.state = CircuitState.CLOSED
+        self.failure_count = 0
+        self.success_count = 0
+        self.last_failure_time: float | None = None
+        self.opened_at: float | None = None
+        self.half_open_calls = 0
+        self._lock = asyncio.Lock()
+
+    async def record_success(self) -> None:
+        """Record a successful call."""
+        async with self._lock:
+            if self.state == CircuitState.HALF_OPEN:
+                self.success_count += 1
+                logger.debug(f"Circuit HALF_OPEN: success {self.success_count}/{self.config.success_threshold}")
+
+                # Enough successes? Close the circuit
+                if self.success_count >= self.config.success_threshold:
+                    logger.info("Circuit breaker: Transitioning to CLOSED (service recovered)")
+                    self.state = CircuitState.CLOSED
+                    self.failure_count = 0
+                    self.success_count = 0
+                    self.opened_at = None
+                    self.half_open_calls = 0
+            else:
+                # In CLOSED state, just reset failure count
+                self.failure_count = 0
+
+    async def record_failure(self) -> None:
+        """Record a failed call."""
+        async with self._lock:
+            self.failure_count += 1
+            self.last_failure_time = time.monotonic()
+            logger.debug(f"Circuit: failure {self.failure_count}/{self.config.failure_threshold}")
+
+            if self.state == CircuitState.CLOSED:
+                # Check if we should open
+                if self.failure_count >= self.config.failure_threshold:
+                    logger.warning(f"Circuit breaker: OPENING after {self.failure_count} failures")
+                    self.state = CircuitState.OPEN
+                    self.opened_at = time.monotonic()
+            elif self.state == CircuitState.HALF_OPEN:
+                # Failed during test → back to OPEN
+                logger.warning("Circuit breaker: Back to OPEN (test failed)")
+                self.state = CircuitState.OPEN
+                self.success_count = 0
+                self.opened_at = time.monotonic()
+                self.half_open_calls = 0
+
+    async def can_execute(self) -> bool:
+        """Check if a call should be allowed through."""
+        async with self._lock:
+            if self.state == CircuitState.CLOSED:
+                return True
+
+            if self.state == CircuitState.HALF_OPEN:
+                # Limit concurrent calls in HALF_OPEN
+                if self.half_open_calls < self.config.half_open_max_calls:
+                    self.half_open_calls += 1
+                    return True
+                return False
+
+            # OPEN state: check if we should try HALF_OPEN
+            if self.opened_at is not None:
+                elapsed = time.monotonic() - self.opened_at
+                if elapsed >= self.config.reset_timeout:
+                    logger.info("Circuit breaker: Transitioning to HALF_OPEN (testing recovery)")
+                    self.state = CircuitState.HALF_OPEN
+                    self.half_open_calls = 1
+                    self.success_count = 0
+                    return True
+
+            return False
+
+    async def release_half_open_slot(self) -> None:
+        """Release a HALF_OPEN slot after call completes."""
+        async with self._lock:
+            if self.state == CircuitState.HALF_OPEN:
+                self.half_open_calls = max(0, self.half_open_calls - 1)
+
+    def get_state(self) -> dict[str, Any]:
+        """Get current state as dict."""
+        return {
+            "state": self.state.value,
+            "failure_count": self.failure_count,
+            "success_count": self.success_count,
+            "opened_at": self.opened_at,
+            "time_until_half_open": (
+                max(0, self.config.reset_timeout - (time.monotonic() - self.opened_at))
+                if self.opened_at and self.state == CircuitState.OPEN
+                else None
+            ),
+        }
+
+
+# --------------------------------------------------------------------------- #
+# Circuit breaker executor wrapper
+# --------------------------------------------------------------------------- #
+class CircuitBreakerExecutor:
+    """
+    Executor wrapper that implements circuit breaker pattern.
+
+    Tracks failures per tool and opens circuit breakers to prevent
+    cascading failures when tools are consistently failing.
+    """
+
+    def __init__(
+        self,
+        executor: Any,
+        *,
+        default_config: CircuitBreakerConfig | None = None,
+        tool_configs: dict[str, CircuitBreakerConfig] | None = None,
+    ):
+        """
+        Initialize circuit breaker executor.
+
+        Args:
+            executor: Underlying executor to wrap
+            default_config: Default circuit breaker configuration
+            tool_configs: Per-tool circuit breaker configurations
+        """
+        self.executor = executor
+        self.default_config = default_config or CircuitBreakerConfig()
+        self.tool_configs = tool_configs or {}
+        self._states: dict[str, CircuitBreakerState] = {}
+        self._states_lock = asyncio.Lock()
+
+    async def _get_state(self, tool: str) -> CircuitBreakerState:
+        """Get or create circuit breaker state for a tool."""
+        if tool not in self._states:
+            async with self._states_lock:
+                if tool not in self._states:
+                    config = self.tool_configs.get(tool, self.default_config)
+                    self._states[tool] = CircuitBreakerState(config)
+        return self._states[tool]
+
+    async def execute(
+        self,
+        calls: list[ToolCall],
+        *,
+        timeout: float | None = None,
+        use_cache: bool = True,
+    ) -> list[ToolResult]:
+        """
+        Execute tool calls with circuit breaker protection.
+
+        Args:
+            calls: List of tool calls to execute
+            timeout: Optional timeout for execution
+            use_cache: Whether to use cached results
+
+        Returns:
+            List of tool results
+        """
+        if not calls:
+            return []
+
+        results: list[ToolResult] = []
+
+        for call in calls:
+            state = await self._get_state(call.tool)
+
+            # Record circuit breaker state
+            metrics = get_metrics()
+            if metrics:
+                metrics.record_circuit_breaker_state(call.tool, state.state.value)
+
+            # Check if circuit allows execution with tracing
+            with trace_circuit_breaker(call.tool, state.state.value):
+                can_execute = await state.can_execute()
+
+            if not can_execute:
+                # Circuit is OPEN - reject immediately
+                state_info = state.get_state()
+                logger.warning(f"Circuit breaker OPEN for {call.tool} (failures: {state.failure_count})")
+
+                reset_time = state_info.get("time_until_half_open")
+                error = ToolCircuitOpenError(
+                    tool_name=call.tool,
+                    failure_count=state.failure_count,
+                    reset_timeout=reset_time,
+                )
+
+                now = datetime.now(UTC)
+                results.append(
+                    ToolResult(
+                        tool=call.tool,
+                        result=None,
+                        error=str(error),
+                        start_time=now,
+                        end_time=now,
+                        machine="circuit_breaker",
+                        pid=0,
+                    )
+                )
+                continue
+
+            # Execute the call
+            start_time = time.monotonic()
+            try:
+                # Execute single call
+                executor_kwargs = {"timeout": timeout}
+                if hasattr(self.executor, "use_cache"):
+                    executor_kwargs["use_cache"] = use_cache
+
+                result_list = await self.executor.execute([call], **executor_kwargs)
+                result = result_list[0]
+
+                # Check if successful
+                duration = time.monotonic() - start_time
+
+                # Determine success/failure
+                is_timeout = state.config.timeout_threshold is not None and duration > state.config.timeout_threshold
+                is_error = result.error is not None
+
+                if is_error or is_timeout:
+                    await state.record_failure()
+                    # Record circuit breaker failure metric
+                    if metrics:
+                        metrics.record_circuit_breaker_failure(call.tool)
+                else:
+                    await state.record_success()
+
+                results.append(result)
+
+            except Exception as e:
+                # Exception during execution
+                await state.record_failure()
+
+                now = datetime.now(UTC)
+                results.append(
+                    ToolResult(
+                        tool=call.tool,
+                        result=None,
+                        error=f"Circuit breaker caught exception: {str(e)}",
+                        start_time=now,
+                        end_time=now,
+                        machine="circuit_breaker",
+                        pid=0,
+                    )
+                )
+
+            finally:
+                # Release HALF_OPEN slot if applicable
+                if state.state == CircuitState.HALF_OPEN:
+                    await state.release_half_open_slot()
+
+        return results
+
+    async def get_circuit_states(self) -> dict[str, dict[str, Any]]:
+        """
+        Get current state of all circuit breakers.
+
+        Returns:
+            Dict mapping tool name to state info
+        """
+        states = {}
+        async with self._states_lock:
+            for tool, state in self._states.items():
+                states[tool] = state.get_state()
+        return states
+
+    async def reset_circuit(self, tool: str) -> None:
+        """
+        Manually reset a circuit breaker.
+
+        Args:
+            tool: Tool name to reset
+        """
+        if tool in self._states:
+            state = self._states[tool]
+            async with state._lock:
+                state.state = CircuitState.CLOSED
+                state.failure_count = 0
+                state.success_count = 0
+                state.opened_at = None
+                state.half_open_calls = 0
+            logger.info(f"Manually reset circuit breaker for {tool}")
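Going only by the signatures added above (CircuitBreakerConfig, CircuitBreakerExecutor.execute, get_circuit_states, reset_circuit), a usage sketch might look like this; the base_executor object and the ToolCall(tool=..., arguments=...) constructor call are assumptions for illustration and are not shown in this diff:

    import asyncio

    from chuk_tool_processor.execution.wrappers.circuit_breaker import (
        CircuitBreakerConfig,
        CircuitBreakerExecutor,
    )
    from chuk_tool_processor.models.tool_call import ToolCall

    async def main(base_executor) -> None:
        # base_executor: any executor exposing "async def execute(calls, *, timeout=None)";
        # how it is constructed is outside this diff.
        executor = CircuitBreakerExecutor(
            base_executor,
            default_config=CircuitBreakerConfig(failure_threshold=5, reset_timeout=60.0),
            tool_configs={"flaky_tool": CircuitBreakerConfig(failure_threshold=2, reset_timeout=10.0)},
        )

        results = await executor.execute([ToolCall(tool="flaky_tool", arguments={"x": 1})])
        print(results[0].error)  # once the circuit opens, a ToolCircuitOpenError message appears here

        print(await executor.get_circuit_states())  # per-tool state, failure counts, time until HALF_OPEN
        await executor.reset_circuit("flaky_tool")  # manual reset back to CLOSED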
chuk_tool_processor/execution/wrappers/rate_limiting.py

@@ -25,6 +25,24 @@ from chuk_tool_processor.models.tool_result import ToolResult
 
 logger = get_logger("chuk_tool_processor.execution.wrappers.rate_limiting")
 
+# Optional observability imports
+try:
+    from chuk_tool_processor.observability.metrics import get_metrics
+    from chuk_tool_processor.observability.tracing import trace_rate_limit
+
+    _observability_available = True
+except ImportError:
+    _observability_available = False
+
+    # No-op functions when observability not available
+    def get_metrics():
+        return None
+
+    def trace_rate_limit(*_args, **_kwargs):
+        from contextlib import nullcontext
+
+        return nullcontext()
+
 
 # --------------------------------------------------------------------------- #
 # Core limiter
@@ -220,8 +238,20 @@ class RateLimitedToolExecutor:
             return []
 
         # Block for each call *before* dispatching to the wrapped executor
+        metrics = get_metrics()
+
         for c in calls:
-            … (1 removed line not captured in this diff view)
+            # Check limits first for metrics
+            global_limited, tool_limited = await self.limiter.check_limits(c.tool)
+            allowed = not (global_limited or tool_limited)
+
+            # Trace rate limit check
+            with trace_rate_limit(c.tool, allowed):
+                await self.limiter.wait(c.tool)
+
+            # Record metrics
+            if metrics:
+                metrics.record_rate_limit_check(c.tool, allowed)
 
         # Check if the executor has a use_cache parameter
         if hasattr(self.executor, "execute"):
chuk_tool_processor/execution/wrappers/retry.py

@@ -21,6 +21,24 @@ from chuk_tool_processor.models.tool_result import ToolResult
 
 logger = get_logger("chuk_tool_processor.execution.wrappers.retry")
 
+# Optional observability imports
+try:
+    from chuk_tool_processor.observability.metrics import get_metrics
+    from chuk_tool_processor.observability.tracing import trace_retry_attempt
+
+    _observability_available = True
+except ImportError:
+    _observability_available = False
+
+    # No-op functions when observability not available
+    def get_metrics():
+        return None
+
+    def trace_retry_attempt(*_args, **_kwargs):
+        from contextlib import nullcontext
+
+        return nullcontext()
+
 
 # --------------------------------------------------------------------------- #
 # Retry configuration
@@ -36,6 +54,7 @@ class RetryConfig:
         jitter: bool = True,
         retry_on_exceptions: list[type[Exception]] | None = None,
         retry_on_error_substrings: list[str] | None = None,
+        skip_retry_on_error_substrings: list[str] | None = None,
     ):
         if max_retries < 0:
             raise ValueError("max_retries cannot be negative")
@@ -45,6 +64,7 @@ class RetryConfig:
         self.jitter = jitter
         self.retry_on_exceptions = retry_on_exceptions or []
         self.retry_on_error_substrings = retry_on_error_substrings or []
+        self.skip_retry_on_error_substrings = skip_retry_on_error_substrings or []
 
     # --------------------------------------------------------------------- #
     # Decision helpers
@@ -60,6 +80,14 @@ class RetryConfig:
         if attempt >= self.max_retries:
             return False
 
+        # Check skip list first - these errors should never be retried
+        # (e.g., OAuth errors that need to be handled at transport layer)
+        if error_str and self.skip_retry_on_error_substrings:
+            error_lower = error_str.lower()
+            if any(skip_pattern.lower() in error_lower for skip_pattern in self.skip_retry_on_error_substrings):
+                logger.debug(f"Skipping retry for error matching skip pattern: {error_str[:100]}")
+                return False
+
         # Nothing specified → always retry until max_retries reached
         if not self.retry_on_exceptions and not self.retry_on_error_substrings:
             return True
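A short sketch of the resulting precedence, using only the constructor arguments and the should_retry(attempt, error_str=...) call shape visible in these hunks; it assumes the pre-existing retry_on_error_substrings matching behaves as its name suggests, and the substring values are illustrative:

    from chuk_tool_processor.execution.wrappers.retry import RetryConfig

    cfg = RetryConfig(
        max_retries=3,
        retry_on_error_substrings=["timeout"],
        skip_retry_on_error_substrings=["oauth", "invalid_grant"],
    )

    # The skip list wins even when a retry pattern also matches:
    print(cfg.should_retry(0, error_str="OAuth refresh failed: timeout"))  # False - matches a skip pattern
    print(cfg.should_retry(0, error_str="upstream timeout"))               # True  - retryable substring
    print(cfg.should_retry(3, error_str="upstream timeout"))               # False - attempts exhausted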
@@ -167,63 +195,73 @@ class RetryableToolExecutor:
             # Execute one attempt
             # ---------------------------------------------------------------- #
             start_time = datetime.now(UTC)
-            try:
-                kwargs = {"timeout": remaining} if remaining is not None else {}
-                if hasattr(self.executor, "use_cache"):
-                    kwargs["use_cache"] = use_cache
 
-                … (6 removed lines not captured in this diff view)
+            # Trace retry attempt
+            with trace_retry_attempt(call.tool, attempt, cfg.max_retries):
+                try:
+                    kwargs = {"timeout": remaining} if remaining is not None else {}
+                    if hasattr(self.executor, "use_cache"):
+                        kwargs["use_cache"] = use_cache
+
+                    result = (await self.executor.execute([call], **kwargs))[0]
+                    pid = result.pid
+                    machine = result.machine
+
+                    # Record retry metrics
+                    metrics = get_metrics()
+                    success = result.error is None
+
+                    if metrics:
+                        metrics.record_retry_attempt(call.tool, attempt, success)
+
+                    # Success?
+                    if success:
+                        result.attempts = attempt + 1
+                        return result
+
+                    # Error: decide on retry
+                    last_error = result.error
+                    if cfg.should_retry(attempt, error_str=result.error):
+                        delay = cfg.get_delay(attempt)
+                        # never overshoot the deadline
+                        if deadline is not None:
+                            delay = min(delay, max(deadline - time.monotonic(), 0))
+                        if delay:
+                            await asyncio.sleep(delay)
+                        attempt += 1
+                        continue
+
+                    # No more retries wanted
+                    result.error = self._wrap_error(last_error, attempt, cfg)
                     result.attempts = attempt + 1
                     return result
 
-                # … (26 removed lines not captured in this diff view)
-                    delay = min(delay, max(deadline - time.monotonic(), 0))
-                if delay:
-                    await asyncio.sleep(delay)
-                attempt += 1
-                continue
-
-            end_time = datetime.now(UTC)
-            return ToolResult(
-                tool=call.tool,
-                result=None,
-                error=self._wrap_error(err_str, attempt, cfg),
-                start_time=start_time,
-                end_time=end_time,
-                machine=machine,
-                pid=pid,
-                attempts=attempt + 1,
-            )
+                # ---------------------------------------------------------------- #
+                # Exception path
+                # ---------------------------------------------------------------- #
+                except Exception as exc:  # noqa: BLE001
+                    err_str = str(exc)
+                    last_error = err_str
+                    if cfg.should_retry(attempt, error=exc, error_str=err_str):
+                        delay = cfg.get_delay(attempt)
+                        if deadline is not None:
+                            delay = min(delay, max(deadline - time.monotonic(), 0))
+                        if delay:
+                            await asyncio.sleep(delay)
+                        attempt += 1
+                        continue
+
+                    end_time = datetime.now(UTC)
+                    return ToolResult(
+                        tool=call.tool,
+                        result=None,
+                        error=self._wrap_error(err_str, attempt, cfg),
+                        start_time=start_time,
+                        end_time=end_time,
+                        machine=machine,
+                        pid=pid,
+                        attempts=attempt + 1,
+                    )
 
     # --------------------------------------------------------------------- #
     # Helpers
@@ -246,6 +284,7 @@ def retryable(
     jitter: bool = True,
     retry_on_exceptions: list[type[Exception]] | None = None,
     retry_on_error_substrings: list[str] | None = None,
+    skip_retry_on_error_substrings: list[str] | None = None,
 ):
     """
     Class decorator that attaches a :class:`RetryConfig` to a *tool* class.
@@ -267,6 +306,7 @@ def retryable(
         jitter=jitter,
         retry_on_exceptions=retry_on_exceptions,
         retry_on_error_substrings=retry_on_error_substrings,
+        skip_retry_on_error_substrings=skip_retry_on_error_substrings,
     )
     return cls
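At the call site the new knob is therefore also available through the @retryable class decorator; a sketch, where the tool class itself and the max_retries parameter (present before this release) are assumptions for illustration:

    from chuk_tool_processor.execution.wrappers.retry import retryable

    @retryable(
        max_retries=3,                                   # assumed pre-existing parameter
        retry_on_error_substrings=["timeout", "unavailable"],
        skip_retry_on_error_substrings=["oauth"],        # new: matching errors are never retried
    )
    class SearchTool:
        """Illustrative tool class; the decorator only attaches a RetryConfig to it."""

        async def execute(self, query: str) -> dict:
            ...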