chuk-tool-processor 0.6.13__py3-none-any.whl → 0.9.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of chuk-tool-processor might be problematic. Click here for more details.
- chuk_tool_processor/core/__init__.py +31 -0
- chuk_tool_processor/core/exceptions.py +218 -12
- chuk_tool_processor/core/processor.py +38 -7
- chuk_tool_processor/execution/strategies/__init__.py +6 -0
- chuk_tool_processor/execution/strategies/subprocess_strategy.py +2 -1
- chuk_tool_processor/execution/wrappers/__init__.py +42 -0
- chuk_tool_processor/execution/wrappers/caching.py +48 -13
- chuk_tool_processor/execution/wrappers/circuit_breaker.py +370 -0
- chuk_tool_processor/execution/wrappers/rate_limiting.py +31 -1
- chuk_tool_processor/execution/wrappers/retry.py +93 -53
- chuk_tool_processor/logging/metrics.py +2 -2
- chuk_tool_processor/mcp/mcp_tool.py +5 -5
- chuk_tool_processor/mcp/setup_mcp_http_streamable.py +44 -2
- chuk_tool_processor/mcp/setup_mcp_sse.py +44 -2
- chuk_tool_processor/mcp/setup_mcp_stdio.py +2 -0
- chuk_tool_processor/mcp/stream_manager.py +130 -75
- chuk_tool_processor/mcp/transport/__init__.py +10 -0
- chuk_tool_processor/mcp/transport/http_streamable_transport.py +193 -108
- chuk_tool_processor/mcp/transport/models.py +100 -0
- chuk_tool_processor/mcp/transport/sse_transport.py +155 -59
- chuk_tool_processor/mcp/transport/stdio_transport.py +58 -10
- chuk_tool_processor/models/__init__.py +20 -0
- chuk_tool_processor/models/tool_call.py +34 -1
- chuk_tool_processor/models/tool_spec.py +350 -0
- chuk_tool_processor/models/validated_tool.py +22 -2
- chuk_tool_processor/observability/__init__.py +30 -0
- chuk_tool_processor/observability/metrics.py +312 -0
- chuk_tool_processor/observability/setup.py +105 -0
- chuk_tool_processor/observability/tracing.py +345 -0
- chuk_tool_processor/plugins/discovery.py +1 -1
- chuk_tool_processor-0.9.7.dist-info/METADATA +1813 -0
- {chuk_tool_processor-0.6.13.dist-info → chuk_tool_processor-0.9.7.dist-info}/RECORD +34 -27
- chuk_tool_processor-0.6.13.dist-info/METADATA +0 -698
- {chuk_tool_processor-0.6.13.dist-info → chuk_tool_processor-0.9.7.dist-info}/WHEEL +0 -0
- {chuk_tool_processor-0.6.13.dist-info → chuk_tool_processor-0.9.7.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
# chuk_tool_processor/execution/wrappers/circuit_breaker.py
|
|
2
|
+
"""
|
|
3
|
+
Circuit breaker pattern for tool execution.
|
|
4
|
+
|
|
5
|
+
Prevents cascading failures by tracking failure rates and temporarily
|
|
6
|
+
blocking calls to failing tools. Implements a state machine:
|
|
7
|
+
|
|
8
|
+
CLOSED → OPEN → HALF_OPEN → CLOSED (or back to OPEN)
|
|
9
|
+
|
|
10
|
+
States:
|
|
11
|
+
- CLOSED: Normal operation, requests pass through
|
|
12
|
+
- OPEN: Too many failures, requests blocked immediately
|
|
13
|
+
- HALF_OPEN: Testing if service recovered, limited requests allowed
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import time
|
|
20
|
+
from datetime import UTC, datetime
|
|
21
|
+
from enum import Enum
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from chuk_tool_processor.core.exceptions import ToolCircuitOpenError
|
|
25
|
+
from chuk_tool_processor.logging import get_logger
|
|
26
|
+
from chuk_tool_processor.models.tool_call import ToolCall
|
|
27
|
+
from chuk_tool_processor.models.tool_result import ToolResult
|
|
28
|
+
|
|
29
|
+
logger = get_logger("chuk_tool_processor.execution.wrappers.circuit_breaker")
|
|
30
|
+
|
|
31
|
+
# Observability hooks are optional: prefer the real metrics/tracing helpers and
# fall back to inert stand-ins when importing them raises ImportError.
try:
    from chuk_tool_processor.observability.metrics import get_metrics
    from chuk_tool_processor.observability.tracing import trace_circuit_breaker
except ImportError:
    _observability_available = False

    def get_metrics():
        """Stand-in used without observability: there is no collector, so return None."""
        return None

    def trace_circuit_breaker(*_args, **_kwargs):
        """Stand-in tracer: ignore all arguments and hand back a do-nothing context manager."""
        from contextlib import nullcontext

        return nullcontext()

else:
    _observability_available = True
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# --------------------------------------------------------------------------- #
|
|
51
|
+
# Circuit breaker state
|
|
52
|
+
# --------------------------------------------------------------------------- #
|
|
53
|
+
class CircuitState(str, Enum):
    """
    The three states a circuit breaker can be in.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    # Normal operation
    CLOSED = "closed"
    # Blocking requests due to failures
    OPEN = "open"
    # Testing recovery with limited requests
    HALF_OPEN = "half_open"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class CircuitBreakerConfig:
    """
    Configuration for circuit breaker behavior.

    A plain value holder: the constructor stores its arguments as attributes
    of the same name and performs no validation.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        success_threshold: int = 2,
        reset_timeout: float = 60.0,
        half_open_max_calls: int = 1,
        timeout_threshold: float | None = None,
    ):
        """
        Initialize circuit breaker configuration.

        Args:
            failure_threshold: Number of failures before opening circuit
            success_threshold: Number of successes in HALF_OPEN to close circuit
            reset_timeout: Seconds to wait before trying HALF_OPEN
            half_open_max_calls: Max concurrent calls in HALF_OPEN state
            timeout_threshold: Optional timeout (s) to consider as failure
        """
        self.failure_threshold = failure_threshold
        self.success_threshold = success_threshold
        self.reset_timeout = reset_timeout
        self.half_open_max_calls = half_open_max_calls
        self.timeout_threshold = timeout_threshold

    def __repr__(self) -> str:
        """Return a constructor-style representation listing every setting."""
        return (
            f"{type(self).__name__}("
            f"failure_threshold={self.failure_threshold!r}, "
            f"success_threshold={self.success_threshold!r}, "
            f"reset_timeout={self.reset_timeout!r}, "
            f"half_open_max_calls={self.half_open_max_calls!r}, "
            f"timeout_threshold={self.timeout_threshold!r})"
        )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class CircuitBreakerState:
    """
    Per-tool circuit breaker state tracking.

    Holds the current ``CircuitState`` together with the counters that drive
    the CLOSED → OPEN → HALF_OPEN → CLOSED transitions. The async methods
    mutate the state while holding an internal ``asyncio.Lock``;
    ``get_state`` is a synchronous read that does not take the lock.
    """

    def __init__(self, config: CircuitBreakerConfig):
        """
        Start in the CLOSED state with every counter cleared.

        Args:
            config: Thresholds and timeouts governing this breaker
        """
        self.config = config
        self.state = CircuitState.CLOSED
        # Failures recorded since the counter was last cleared
        self.failure_count = 0
        # Successes recorded while HALF_OPEN
        self.success_count = 0
        # time.monotonic() reading taken at the most recent failure
        self.last_failure_time: float | None = None
        # time.monotonic() reading taken when the circuit last moved to OPEN
        self.opened_at: float | None = None
        # Calls currently admitted while HALF_OPEN
        self.half_open_calls = 0
        self._lock = asyncio.Lock()

    async def record_success(self) -> None:
        """
        Record a successful call.

        While HALF_OPEN the success is counted, and once ``success_threshold``
        is reached the circuit closes and all counters are cleared. In any
        other state the failure counter is simply reset to zero.
        """
        async with self._lock:
            if self.state == CircuitState.HALF_OPEN:
                self.success_count += 1
                logger.debug(f"Circuit HALF_OPEN: success {self.success_count}/{self.config.success_threshold}")

                # Enough successes? Close the circuit
                if self.success_count >= self.config.success_threshold:
                    logger.info("Circuit breaker: Transitioning to CLOSED (service recovered)")
                    self.state = CircuitState.CLOSED
                    self.failure_count = 0
                    self.success_count = 0
                    self.opened_at = None
                    self.half_open_calls = 0
            else:
                # Not HALF_OPEN: a success clears the accumulated failures
                self.failure_count = 0

    async def record_failure(self) -> None:
        """
        Record a failed call.

        The failure is always counted and timestamped. While CLOSED the
        circuit opens once ``failure_threshold`` is reached; while HALF_OPEN a
        single failure sends the circuit straight back to OPEN.
        """
        async with self._lock:
            self.failure_count += 1
            self.last_failure_time = time.monotonic()
            logger.debug(f"Circuit: failure {self.failure_count}/{self.config.failure_threshold}")

            if self.state == CircuitState.CLOSED:
                # Check if we should open
                if self.failure_count >= self.config.failure_threshold:
                    logger.warning(f"Circuit breaker: OPENING after {self.failure_count} failures")
                    self.state = CircuitState.OPEN
                    self.opened_at = time.monotonic()
            elif self.state == CircuitState.HALF_OPEN:
                # Failed during test → back to OPEN, restarting the reset timer
                logger.warning("Circuit breaker: Back to OPEN (test failed)")
                self.state = CircuitState.OPEN
                self.success_count = 0
                self.opened_at = time.monotonic()
                self.half_open_calls = 0

    async def can_execute(self) -> bool:
        """
        Check if a call should be allowed through.

        Returns:
            True when CLOSED, when HALF_OPEN with a free slot (the slot is
            claimed as a side effect), or when OPEN and ``reset_timeout`` has
            elapsed (the circuit then moves to HALF_OPEN and the caller holds
            the first slot). False otherwise.
        """
        async with self._lock:
            if self.state == CircuitState.CLOSED:
                return True

            if self.state == CircuitState.HALF_OPEN:
                # Limit concurrent calls in HALF_OPEN
                if self.half_open_calls < self.config.half_open_max_calls:
                    self.half_open_calls += 1
                    return True
                return False

            # OPEN state: check if we should try HALF_OPEN
            if self.opened_at is not None:
                elapsed = time.monotonic() - self.opened_at
                if elapsed >= self.config.reset_timeout:
                    logger.info("Circuit breaker: Transitioning to HALF_OPEN (testing recovery)")
                    self.state = CircuitState.HALF_OPEN
                    # The caller that triggered the transition takes the first slot
                    self.half_open_calls = 1
                    self.success_count = 0
                    return True

            return False

    async def release_half_open_slot(self) -> None:
        """Release a HALF_OPEN slot after call completes (no-op in other states)."""
        async with self._lock:
            if self.state == CircuitState.HALF_OPEN:
                self.half_open_calls = max(0, self.half_open_calls - 1)

    def get_state(self) -> dict[str, Any]:
        """
        Get current state as dict.

        Returns:
            Dict with ``state``, ``failure_count``, ``success_count``,
            ``opened_at`` and ``time_until_half_open``. The last entry is the
            number of seconds left before the reset timeout elapses (never
            negative) while the circuit is OPEN, and None otherwise.
        """
        remaining: float | None = None
        # Compare against None explicitly, as can_execute() does: 0.0 is a
        # valid monotonic reading and must not be mistaken for "never opened".
        if self.state == CircuitState.OPEN and self.opened_at is not None:
            remaining = max(0, self.config.reset_timeout - (time.monotonic() - self.opened_at))

        return {
            "state": self.state.value,
            "failure_count": self.failure_count,
            "success_count": self.success_count,
            "opened_at": self.opened_at,
            "time_until_half_open": remaining,
        }
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
# --------------------------------------------------------------------------- #
|
|
189
|
+
# Circuit breaker executor wrapper
|
|
190
|
+
# --------------------------------------------------------------------------- #
|
|
191
|
+
class CircuitBreakerExecutor:
    """
    Executor wrapper that implements circuit breaker pattern.

    Tracks failures per tool and opens circuit breakers to prevent
    cascading failures when tools are consistently failing. Calls are
    forwarded to the wrapped executor one at a time, in order.
    """

    def __init__(
        self,
        executor: Any,
        *,
        default_config: CircuitBreakerConfig | None = None,
        tool_configs: dict[str, CircuitBreakerConfig] | None = None,
    ):
        """
        Initialize circuit breaker executor.

        Args:
            executor: Underlying executor to wrap
            default_config: Default circuit breaker configuration; a default
                ``CircuitBreakerConfig()`` is created when omitted
            tool_configs: Per-tool circuit breaker configurations, keyed by
                tool name
        """
        self.executor = executor
        self.default_config = default_config or CircuitBreakerConfig()
        self.tool_configs = tool_configs or {}
        self._states: dict[str, CircuitBreakerState] = {}
        self._states_lock = asyncio.Lock()

    async def _get_state(self, tool: str) -> CircuitBreakerState:
        """Get or create circuit breaker state for a tool."""
        if tool not in self._states:
            async with self._states_lock:
                # Re-check under the lock: another task may have created it meanwhile
                if tool not in self._states:
                    config = self.tool_configs.get(tool, self.default_config)
                    self._states[tool] = CircuitBreakerState(config)
        return self._states[tool]

    @staticmethod
    def _error_result(tool: str, error: str, start_time: datetime, end_time: datetime) -> ToolResult:
        """Build the error-only result this wrapper produces on its own behalf."""
        return ToolResult(
            tool=tool,
            result=None,
            error=error,
            start_time=start_time,
            end_time=end_time,
            machine="circuit_breaker",
            pid=0,
        )

    async def execute(
        self,
        calls: list[ToolCall],
        *,
        timeout: float | None = None,
        use_cache: bool = True,
    ) -> list[ToolResult]:
        """
        Execute tool calls with circuit breaker protection.

        Each call is checked against its tool's circuit. Rejected calls get an
        error result without reaching the wrapped executor; admitted calls are
        executed and their outcome is fed back into the circuit. A call counts
        as a failure when its result carries an error, when it exceeds the
        configured ``timeout_threshold``, or when the wrapped executor raises.

        Args:
            calls: List of tool calls to execute
            timeout: Optional timeout, passed through to the wrapped executor
            use_cache: Whether to use cached results; forwarded only when the
                wrapped executor has a ``use_cache`` attribute

        Returns:
            List of tool results, one per call, in call order
        """
        if not calls:
            return []

        results: list[ToolResult] = []

        for call in calls:
            state = await self._get_state(call.tool)

            # Record circuit breaker state
            metrics = get_metrics()
            if metrics:
                metrics.record_circuit_breaker_state(call.tool, state.state.value)

            # Check if circuit allows execution with tracing
            with trace_circuit_breaker(call.tool, state.state.value):
                can_execute = await state.can_execute()

            if not can_execute:
                # Circuit refused the call - reject immediately
                state_info = state.get_state()
                logger.warning(f"Circuit breaker OPEN for {call.tool} (failures: {state.failure_count})")

                error = ToolCircuitOpenError(
                    tool_name=call.tool,
                    failure_count=state.failure_count,
                    reset_timeout=state_info.get("time_until_half_open"),
                )

                now = datetime.now(UTC)
                results.append(self._error_result(call.tool, str(error), now, now))
                continue

            # can_execute() claims a HALF_OPEN slot only when it leaves the
            # circuit HALF_OPEN. Remember that now, so a call admitted while
            # CLOSED never releases a slot it did not take.
            holds_half_open_slot = state.state == CircuitState.HALF_OPEN

            # Execute the call
            started_at = datetime.now(UTC)
            start_time = time.monotonic()
            try:
                executor_kwargs: dict[str, Any] = {"timeout": timeout}
                if hasattr(self.executor, "use_cache"):
                    executor_kwargs["use_cache"] = use_cache

                result_list = await self.executor.execute([call], **executor_kwargs)
                result = result_list[0]

                duration = time.monotonic() - start_time

                # Determine success/failure
                is_timeout = state.config.timeout_threshold is not None and duration > state.config.timeout_threshold
                is_error = result.error is not None

                if is_error or is_timeout:
                    await state.record_failure()
                    # Record circuit breaker failure metric
                    if metrics:
                        metrics.record_circuit_breaker_failure(call.tool)
                else:
                    await state.record_success()

                results.append(result)

            except Exception as e:
                # Exception during execution: count it like any other failure,
                # including in the metrics.
                await state.record_failure()
                if metrics:
                    metrics.record_circuit_breaker_failure(call.tool)

                results.append(
                    self._error_result(
                        call.tool,
                        f"Circuit breaker caught exception: {str(e)}",
                        started_at,
                        datetime.now(UTC),
                    )
                )

            finally:
                # Release the HALF_OPEN slot this call claimed, if any. The
                # release is a no-op once the circuit has left HALF_OPEN.
                if holds_half_open_slot:
                    await state.release_half_open_slot()

        return results

    async def get_circuit_states(self) -> dict[str, dict[str, Any]]:
        """
        Get current state of all circuit breakers.

        Returns:
            Dict mapping tool name to state info
        """
        async with self._states_lock:
            return {tool: state.get_state() for tool, state in self._states.items()}

    async def reset_circuit(self, tool: str) -> None:
        """
        Manually reset a circuit breaker.

        Does nothing when no circuit has been created for ``tool`` yet.

        Args:
            tool: Tool name to reset
        """
        if tool in self._states:
            state = self._states[tool]
            async with state._lock:
                state.state = CircuitState.CLOSED
                state.failure_count = 0
                state.success_count = 0
                state.opened_at = None
                state.half_open_calls = 0
                logger.info(f"Manually reset circuit breaker for {tool}")
|
|
@@ -25,6 +25,24 @@ from chuk_tool_processor.models.tool_result import ToolResult
|
|
|
25
25
|
|
|
26
26
|
logger = get_logger("chuk_tool_processor.execution.wrappers.rate_limiting")
|
|
27
27
|
|
|
28
|
+
# Observability hooks are optional: prefer the real metrics/tracing helpers and
# fall back to inert stand-ins when importing them raises ImportError.
try:
    from chuk_tool_processor.observability.metrics import get_metrics
    from chuk_tool_processor.observability.tracing import trace_rate_limit
except ImportError:
    _observability_available = False

    def get_metrics():
        """Stand-in used without observability: there is no collector, so return None."""
        return None

    def trace_rate_limit(*_args, **_kwargs):
        """Stand-in tracer: ignore all arguments and hand back a do-nothing context manager."""
        from contextlib import nullcontext

        return nullcontext()

else:
    _observability_available = True
|
|
45
|
+
|
|
28
46
|
|
|
29
47
|
# --------------------------------------------------------------------------- #
|
|
30
48
|
# Core limiter
|
|
@@ -220,8 +238,20 @@ class RateLimitedToolExecutor:
|
|
|
220
238
|
return []
|
|
221
239
|
|
|
222
240
|
# Block for each call *before* dispatching to the wrapped executor
|
|
241
|
+
metrics = get_metrics()
|
|
242
|
+
|
|
223
243
|
for c in calls:
|
|
224
|
-
|
|
244
|
+
# Check limits first for metrics
|
|
245
|
+
global_limited, tool_limited = await self.limiter.check_limits(c.tool)
|
|
246
|
+
allowed = not (global_limited or tool_limited)
|
|
247
|
+
|
|
248
|
+
# Trace rate limit check
|
|
249
|
+
with trace_rate_limit(c.tool, allowed):
|
|
250
|
+
await self.limiter.wait(c.tool)
|
|
251
|
+
|
|
252
|
+
# Record metrics
|
|
253
|
+
if metrics:
|
|
254
|
+
metrics.record_rate_limit_check(c.tool, allowed)
|
|
225
255
|
|
|
226
256
|
# Check if the executor has a use_cache parameter
|
|
227
257
|
if hasattr(self.executor, "execute"):
|
|
@@ -21,6 +21,24 @@ from chuk_tool_processor.models.tool_result import ToolResult
|
|
|
21
21
|
|
|
22
22
|
logger = get_logger("chuk_tool_processor.execution.wrappers.retry")
|
|
23
23
|
|
|
24
|
+
# Observability hooks are optional: prefer the real metrics/tracing helpers and
# fall back to inert stand-ins when importing them raises ImportError.
try:
    from chuk_tool_processor.observability.metrics import get_metrics
    from chuk_tool_processor.observability.tracing import trace_retry_attempt
except ImportError:
    _observability_available = False

    def get_metrics():
        """Stand-in used without observability: there is no collector, so return None."""
        return None

    def trace_retry_attempt(*_args, **_kwargs):
        """Stand-in tracer: ignore all arguments and hand back a do-nothing context manager."""
        from contextlib import nullcontext

        return nullcontext()

else:
    _observability_available = True
|
|
41
|
+
|
|
24
42
|
|
|
25
43
|
# --------------------------------------------------------------------------- #
|
|
26
44
|
# Retry configuration
|
|
@@ -36,6 +54,7 @@ class RetryConfig:
|
|
|
36
54
|
jitter: bool = True,
|
|
37
55
|
retry_on_exceptions: list[type[Exception]] | None = None,
|
|
38
56
|
retry_on_error_substrings: list[str] | None = None,
|
|
57
|
+
skip_retry_on_error_substrings: list[str] | None = None,
|
|
39
58
|
):
|
|
40
59
|
if max_retries < 0:
|
|
41
60
|
raise ValueError("max_retries cannot be negative")
|
|
@@ -45,6 +64,7 @@ class RetryConfig:
|
|
|
45
64
|
self.jitter = jitter
|
|
46
65
|
self.retry_on_exceptions = retry_on_exceptions or []
|
|
47
66
|
self.retry_on_error_substrings = retry_on_error_substrings or []
|
|
67
|
+
self.skip_retry_on_error_substrings = skip_retry_on_error_substrings or []
|
|
48
68
|
|
|
49
69
|
# --------------------------------------------------------------------- #
|
|
50
70
|
# Decision helpers
|
|
@@ -60,6 +80,14 @@ class RetryConfig:
|
|
|
60
80
|
if attempt >= self.max_retries:
|
|
61
81
|
return False
|
|
62
82
|
|
|
83
|
+
# Check skip list first - these errors should never be retried
|
|
84
|
+
# (e.g., OAuth errors that need to be handled at transport layer)
|
|
85
|
+
if error_str and self.skip_retry_on_error_substrings:
|
|
86
|
+
error_lower = error_str.lower()
|
|
87
|
+
if any(skip_pattern.lower() in error_lower for skip_pattern in self.skip_retry_on_error_substrings):
|
|
88
|
+
logger.debug(f"Skipping retry for error matching skip pattern: {error_str[:100]}")
|
|
89
|
+
return False
|
|
90
|
+
|
|
63
91
|
# Nothing specified → always retry until max_retries reached
|
|
64
92
|
if not self.retry_on_exceptions and not self.retry_on_error_substrings:
|
|
65
93
|
return True
|
|
@@ -167,63 +195,73 @@ class RetryableToolExecutor:
|
|
|
167
195
|
# Execute one attempt
|
|
168
196
|
# ---------------------------------------------------------------- #
|
|
169
197
|
start_time = datetime.now(UTC)
|
|
170
|
-
try:
|
|
171
|
-
kwargs = {"timeout": remaining} if remaining is not None else {}
|
|
172
|
-
if hasattr(self.executor, "use_cache"):
|
|
173
|
-
kwargs["use_cache"] = use_cache
|
|
174
198
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
199
|
+
# Trace retry attempt
|
|
200
|
+
with trace_retry_attempt(call.tool, attempt, cfg.max_retries):
|
|
201
|
+
try:
|
|
202
|
+
kwargs = {"timeout": remaining} if remaining is not None else {}
|
|
203
|
+
if hasattr(self.executor, "use_cache"):
|
|
204
|
+
kwargs["use_cache"] = use_cache
|
|
205
|
+
|
|
206
|
+
result = (await self.executor.execute([call], **kwargs))[0]
|
|
207
|
+
pid = result.pid
|
|
208
|
+
machine = result.machine
|
|
209
|
+
|
|
210
|
+
# Record retry metrics
|
|
211
|
+
metrics = get_metrics()
|
|
212
|
+
success = result.error is None
|
|
213
|
+
|
|
214
|
+
if metrics:
|
|
215
|
+
metrics.record_retry_attempt(call.tool, attempt, success)
|
|
216
|
+
|
|
217
|
+
# Success?
|
|
218
|
+
if success:
|
|
219
|
+
result.attempts = attempt + 1
|
|
220
|
+
return result
|
|
221
|
+
|
|
222
|
+
# Error: decide on retry
|
|
223
|
+
last_error = result.error
|
|
224
|
+
if cfg.should_retry(attempt, error_str=result.error):
|
|
225
|
+
delay = cfg.get_delay(attempt)
|
|
226
|
+
# never overshoot the deadline
|
|
227
|
+
if deadline is not None:
|
|
228
|
+
delay = min(delay, max(deadline - time.monotonic(), 0))
|
|
229
|
+
if delay:
|
|
230
|
+
await asyncio.sleep(delay)
|
|
231
|
+
attempt += 1
|
|
232
|
+
continue
|
|
233
|
+
|
|
234
|
+
# No more retries wanted
|
|
235
|
+
result.error = self._wrap_error(last_error, attempt, cfg)
|
|
181
236
|
result.attempts = attempt + 1
|
|
182
237
|
return result
|
|
183
238
|
|
|
184
|
-
#
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
delay = min(delay, max(deadline - time.monotonic(), 0))
|
|
211
|
-
if delay:
|
|
212
|
-
await asyncio.sleep(delay)
|
|
213
|
-
attempt += 1
|
|
214
|
-
continue
|
|
215
|
-
|
|
216
|
-
end_time = datetime.now(UTC)
|
|
217
|
-
return ToolResult(
|
|
218
|
-
tool=call.tool,
|
|
219
|
-
result=None,
|
|
220
|
-
error=self._wrap_error(err_str, attempt, cfg),
|
|
221
|
-
start_time=start_time,
|
|
222
|
-
end_time=end_time,
|
|
223
|
-
machine=machine,
|
|
224
|
-
pid=pid,
|
|
225
|
-
attempts=attempt + 1,
|
|
226
|
-
)
|
|
239
|
+
# ---------------------------------------------------------------- #
|
|
240
|
+
# Exception path
|
|
241
|
+
# ---------------------------------------------------------------- #
|
|
242
|
+
except Exception as exc: # noqa: BLE001
|
|
243
|
+
err_str = str(exc)
|
|
244
|
+
last_error = err_str
|
|
245
|
+
if cfg.should_retry(attempt, error=exc, error_str=err_str):
|
|
246
|
+
delay = cfg.get_delay(attempt)
|
|
247
|
+
if deadline is not None:
|
|
248
|
+
delay = min(delay, max(deadline - time.monotonic(), 0))
|
|
249
|
+
if delay:
|
|
250
|
+
await asyncio.sleep(delay)
|
|
251
|
+
attempt += 1
|
|
252
|
+
continue
|
|
253
|
+
|
|
254
|
+
end_time = datetime.now(UTC)
|
|
255
|
+
return ToolResult(
|
|
256
|
+
tool=call.tool,
|
|
257
|
+
result=None,
|
|
258
|
+
error=self._wrap_error(err_str, attempt, cfg),
|
|
259
|
+
start_time=start_time,
|
|
260
|
+
end_time=end_time,
|
|
261
|
+
machine=machine,
|
|
262
|
+
pid=pid,
|
|
263
|
+
attempts=attempt + 1,
|
|
264
|
+
)
|
|
227
265
|
|
|
228
266
|
# --------------------------------------------------------------------- #
|
|
229
267
|
# Helpers
|
|
@@ -246,6 +284,7 @@ def retryable(
|
|
|
246
284
|
jitter: bool = True,
|
|
247
285
|
retry_on_exceptions: list[type[Exception]] | None = None,
|
|
248
286
|
retry_on_error_substrings: list[str] | None = None,
|
|
287
|
+
skip_retry_on_error_substrings: list[str] | None = None,
|
|
249
288
|
):
|
|
250
289
|
"""
|
|
251
290
|
Class decorator that attaches a :class:`RetryConfig` to a *tool* class.
|
|
@@ -267,6 +306,7 @@ def retryable(
|
|
|
267
306
|
jitter=jitter,
|
|
268
307
|
retry_on_exceptions=retry_on_exceptions,
|
|
269
308
|
retry_on_error_substrings=retry_on_error_substrings,
|
|
309
|
+
skip_retry_on_error_substrings=skip_retry_on_error_substrings,
|
|
270
310
|
)
|
|
271
311
|
return cls
|
|
272
312
|
|
|
@@ -45,7 +45,7 @@ class MetricsLogger:
|
|
|
45
45
|
cached: Whether the result was retrieved from cache
|
|
46
46
|
attempts: Number of execution attempts
|
|
47
47
|
"""
|
|
48
|
-
self.logger.
|
|
48
|
+
self.logger.debug(
|
|
49
49
|
f"Tool execution metric: {tool}",
|
|
50
50
|
extra={
|
|
51
51
|
"context": {
|
|
@@ -76,7 +76,7 @@ class MetricsLogger:
|
|
|
76
76
|
duration: Parsing duration in seconds
|
|
77
77
|
num_calls: Number of tool calls parsed
|
|
78
78
|
"""
|
|
79
|
-
self.logger.
|
|
79
|
+
self.logger.debug(
|
|
80
80
|
f"Parser metric: {parser}",
|
|
81
81
|
extra={
|
|
82
82
|
"context": {
|
|
@@ -370,7 +370,7 @@ class MCPTool:
|
|
|
370
370
|
self._circuit_open = False
|
|
371
371
|
self._circuit_open_time = None
|
|
372
372
|
self.connection_state = ConnectionState.HEALTHY
|
|
373
|
-
logger.
|
|
373
|
+
logger.debug(f"Circuit breaker closed for tool '{self.tool_name}' after successful execution")
|
|
374
374
|
|
|
375
375
|
async def _record_failure(self, is_connection_error: bool = False) -> None:
|
|
376
376
|
"""Record a failed execution."""
|
|
@@ -407,7 +407,7 @@ class MCPTool:
|
|
|
407
407
|
self._circuit_open = False
|
|
408
408
|
self._circuit_open_time = None
|
|
409
409
|
self.connection_state = ConnectionState.HEALTHY
|
|
410
|
-
logger.
|
|
410
|
+
logger.debug(f"Circuit breaker reset for tool '{self.tool_name}' after timeout")
|
|
411
411
|
return False
|
|
412
412
|
|
|
413
413
|
return True
|
|
@@ -462,12 +462,12 @@ class MCPTool:
|
|
|
462
462
|
self._circuit_open_time = None
|
|
463
463
|
self._consecutive_failures = 0
|
|
464
464
|
self.connection_state = ConnectionState.HEALTHY
|
|
465
|
-
logger.
|
|
465
|
+
logger.debug(f"Circuit breaker manually reset for tool '{self.tool_name}'")
|
|
466
466
|
|
|
467
467
|
def disable_resilience(self) -> None:
|
|
468
468
|
"""Disable resilience features for this tool instance."""
|
|
469
469
|
self.enable_resilience = False
|
|
470
|
-
logger.
|
|
470
|
+
logger.debug(f"Resilience features disabled for tool '{self.tool_name}'")
|
|
471
471
|
|
|
472
472
|
def set_stream_manager(self, stream_manager: StreamManager | None) -> None:
|
|
473
473
|
"""
|
|
@@ -482,7 +482,7 @@ class MCPTool:
|
|
|
482
482
|
if self._circuit_open:
|
|
483
483
|
self._circuit_open = False
|
|
484
484
|
self._circuit_open_time = None
|
|
485
|
-
logger.
|
|
485
|
+
logger.debug(f"Circuit breaker closed for tool '{self.tool_name}' due to new stream manager")
|
|
486
486
|
else:
|
|
487
487
|
self.connection_state = ConnectionState.DISCONNECTED
|
|
488
488
|
|