foundry-mcp 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- foundry_mcp/__init__.py +7 -0
- foundry_mcp/cli/__init__.py +80 -0
- foundry_mcp/cli/__main__.py +9 -0
- foundry_mcp/cli/agent.py +96 -0
- foundry_mcp/cli/commands/__init__.py +37 -0
- foundry_mcp/cli/commands/cache.py +137 -0
- foundry_mcp/cli/commands/dashboard.py +148 -0
- foundry_mcp/cli/commands/dev.py +446 -0
- foundry_mcp/cli/commands/journal.py +377 -0
- foundry_mcp/cli/commands/lifecycle.py +274 -0
- foundry_mcp/cli/commands/modify.py +824 -0
- foundry_mcp/cli/commands/plan.py +633 -0
- foundry_mcp/cli/commands/pr.py +393 -0
- foundry_mcp/cli/commands/review.py +652 -0
- foundry_mcp/cli/commands/session.py +479 -0
- foundry_mcp/cli/commands/specs.py +856 -0
- foundry_mcp/cli/commands/tasks.py +807 -0
- foundry_mcp/cli/commands/testing.py +676 -0
- foundry_mcp/cli/commands/validate.py +982 -0
- foundry_mcp/cli/config.py +98 -0
- foundry_mcp/cli/context.py +259 -0
- foundry_mcp/cli/flags.py +266 -0
- foundry_mcp/cli/logging.py +212 -0
- foundry_mcp/cli/main.py +44 -0
- foundry_mcp/cli/output.py +122 -0
- foundry_mcp/cli/registry.py +110 -0
- foundry_mcp/cli/resilience.py +178 -0
- foundry_mcp/cli/transcript.py +217 -0
- foundry_mcp/config.py +850 -0
- foundry_mcp/core/__init__.py +144 -0
- foundry_mcp/core/ai_consultation.py +1636 -0
- foundry_mcp/core/cache.py +195 -0
- foundry_mcp/core/capabilities.py +446 -0
- foundry_mcp/core/concurrency.py +898 -0
- foundry_mcp/core/context.py +540 -0
- foundry_mcp/core/discovery.py +1603 -0
- foundry_mcp/core/error_collection.py +728 -0
- foundry_mcp/core/error_store.py +592 -0
- foundry_mcp/core/feature_flags.py +592 -0
- foundry_mcp/core/health.py +749 -0
- foundry_mcp/core/journal.py +694 -0
- foundry_mcp/core/lifecycle.py +412 -0
- foundry_mcp/core/llm_config.py +1350 -0
- foundry_mcp/core/llm_patterns.py +510 -0
- foundry_mcp/core/llm_provider.py +1569 -0
- foundry_mcp/core/logging_config.py +374 -0
- foundry_mcp/core/metrics_persistence.py +584 -0
- foundry_mcp/core/metrics_registry.py +327 -0
- foundry_mcp/core/metrics_store.py +641 -0
- foundry_mcp/core/modifications.py +224 -0
- foundry_mcp/core/naming.py +123 -0
- foundry_mcp/core/observability.py +1216 -0
- foundry_mcp/core/otel.py +452 -0
- foundry_mcp/core/otel_stubs.py +264 -0
- foundry_mcp/core/pagination.py +255 -0
- foundry_mcp/core/progress.py +317 -0
- foundry_mcp/core/prometheus.py +577 -0
- foundry_mcp/core/prompts/__init__.py +464 -0
- foundry_mcp/core/prompts/fidelity_review.py +546 -0
- foundry_mcp/core/prompts/markdown_plan_review.py +511 -0
- foundry_mcp/core/prompts/plan_review.py +623 -0
- foundry_mcp/core/providers/__init__.py +225 -0
- foundry_mcp/core/providers/base.py +476 -0
- foundry_mcp/core/providers/claude.py +460 -0
- foundry_mcp/core/providers/codex.py +619 -0
- foundry_mcp/core/providers/cursor_agent.py +642 -0
- foundry_mcp/core/providers/detectors.py +488 -0
- foundry_mcp/core/providers/gemini.py +405 -0
- foundry_mcp/core/providers/opencode.py +616 -0
- foundry_mcp/core/providers/opencode_wrapper.js +302 -0
- foundry_mcp/core/providers/package-lock.json +24 -0
- foundry_mcp/core/providers/package.json +25 -0
- foundry_mcp/core/providers/registry.py +607 -0
- foundry_mcp/core/providers/test_provider.py +171 -0
- foundry_mcp/core/providers/validation.py +729 -0
- foundry_mcp/core/rate_limit.py +427 -0
- foundry_mcp/core/resilience.py +600 -0
- foundry_mcp/core/responses.py +934 -0
- foundry_mcp/core/review.py +366 -0
- foundry_mcp/core/security.py +438 -0
- foundry_mcp/core/spec.py +1650 -0
- foundry_mcp/core/task.py +1289 -0
- foundry_mcp/core/testing.py +450 -0
- foundry_mcp/core/validation.py +2081 -0
- foundry_mcp/dashboard/__init__.py +32 -0
- foundry_mcp/dashboard/app.py +119 -0
- foundry_mcp/dashboard/components/__init__.py +17 -0
- foundry_mcp/dashboard/components/cards.py +88 -0
- foundry_mcp/dashboard/components/charts.py +234 -0
- foundry_mcp/dashboard/components/filters.py +136 -0
- foundry_mcp/dashboard/components/tables.py +195 -0
- foundry_mcp/dashboard/data/__init__.py +11 -0
- foundry_mcp/dashboard/data/stores.py +433 -0
- foundry_mcp/dashboard/launcher.py +289 -0
- foundry_mcp/dashboard/views/__init__.py +12 -0
- foundry_mcp/dashboard/views/errors.py +217 -0
- foundry_mcp/dashboard/views/metrics.py +174 -0
- foundry_mcp/dashboard/views/overview.py +160 -0
- foundry_mcp/dashboard/views/providers.py +83 -0
- foundry_mcp/dashboard/views/sdd_workflow.py +255 -0
- foundry_mcp/dashboard/views/tool_usage.py +139 -0
- foundry_mcp/prompts/__init__.py +9 -0
- foundry_mcp/prompts/workflows.py +525 -0
- foundry_mcp/resources/__init__.py +9 -0
- foundry_mcp/resources/specs.py +591 -0
- foundry_mcp/schemas/__init__.py +38 -0
- foundry_mcp/schemas/sdd-spec-schema.json +386 -0
- foundry_mcp/server.py +164 -0
- foundry_mcp/tools/__init__.py +10 -0
- foundry_mcp/tools/unified/__init__.py +71 -0
- foundry_mcp/tools/unified/authoring.py +1487 -0
- foundry_mcp/tools/unified/context_helpers.py +98 -0
- foundry_mcp/tools/unified/documentation_helpers.py +198 -0
- foundry_mcp/tools/unified/environment.py +939 -0
- foundry_mcp/tools/unified/error.py +462 -0
- foundry_mcp/tools/unified/health.py +225 -0
- foundry_mcp/tools/unified/journal.py +841 -0
- foundry_mcp/tools/unified/lifecycle.py +632 -0
- foundry_mcp/tools/unified/metrics.py +777 -0
- foundry_mcp/tools/unified/plan.py +745 -0
- foundry_mcp/tools/unified/pr.py +294 -0
- foundry_mcp/tools/unified/provider.py +629 -0
- foundry_mcp/tools/unified/review.py +685 -0
- foundry_mcp/tools/unified/review_helpers.py +299 -0
- foundry_mcp/tools/unified/router.py +102 -0
- foundry_mcp/tools/unified/server.py +580 -0
- foundry_mcp/tools/unified/spec.py +808 -0
- foundry_mcp/tools/unified/task.py +2202 -0
- foundry_mcp/tools/unified/test.py +370 -0
- foundry_mcp/tools/unified/verification.py +520 -0
- foundry_mcp-0.3.3.dist-info/METADATA +337 -0
- foundry_mcp-0.3.3.dist-info/RECORD +135 -0
- foundry_mcp-0.3.3.dist-info/WHEEL +4 -0
- foundry_mcp-0.3.3.dist-info/entry_points.txt +3 -0
- foundry_mcp-0.3.3.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,600 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resilience primitives for MCP tool operations.
|
|
3
|
+
|
|
4
|
+
Provides timeout budgets, retry patterns, circuit breakers, and health checks
|
|
5
|
+
for building robust MCP tools that handle failures gracefully.
|
|
6
|
+
|
|
7
|
+
Timeout Budget Categories
|
|
8
|
+
=========================
|
|
9
|
+
|
|
10
|
+
Use the appropriate timeout category based on operation type:
|
|
11
|
+
|
|
12
|
+
FAST_TIMEOUT (5s) - Cache lookups, simple queries
|
|
13
|
+
MEDIUM_TIMEOUT (30s) - Database operations, API calls
|
|
14
|
+
SLOW_TIMEOUT (120s) - File processing, complex operations
|
|
15
|
+
BACKGROUND_TIMEOUT (600s) - Batch jobs, large transfers
|
|
16
|
+
|
|
17
|
+
Example usage:
|
|
18
|
+
|
|
19
|
+
from foundry_mcp.core.resilience import (
|
|
20
|
+
MEDIUM_TIMEOUT,
|
|
21
|
+
with_timeout,
|
|
22
|
+
retry_with_backoff,
|
|
23
|
+
CircuitBreaker,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
@mcp.tool()
|
|
27
|
+
@with_timeout(MEDIUM_TIMEOUT, "Database query timed out")
|
|
28
|
+
async def query_database(query: str) -> dict:
|
|
29
|
+
result = await db.execute(query)
|
|
30
|
+
return asdict(success_response(data={"result": result}))
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from dataclasses import dataclass, field
|
|
34
|
+
from datetime import datetime
|
|
35
|
+
from enum import Enum
|
|
36
|
+
from functools import wraps
|
|
37
|
+
from threading import Lock
|
|
38
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar
|
|
39
|
+
import asyncio
|
|
40
|
+
import random
|
|
41
|
+
import time
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Timeout Budget Constants
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
# All timeout budgets below are expressed in seconds. Each category has a
# default value and a companion ``*_MAX`` value.
# NOTE(review): the ``*_MAX`` constants are not referenced anywhere in the
# visible part of this module -- presumably they are upper bounds for
# caller-supplied overrides; confirm against the call sites.

#: Fast operations: cache lookups, simple queries (default 5s, max 10s)
FAST_TIMEOUT: float = 5.0
FAST_TIMEOUT_MAX: float = 10.0

#: Medium operations: database ops, API calls (default 30s, max 60s)
MEDIUM_TIMEOUT: float = 30.0
MEDIUM_TIMEOUT_MAX: float = 60.0

#: Slow operations: file processing, complex operations (default 120s, max 300s)
SLOW_TIMEOUT: float = 120.0
SLOW_TIMEOUT_MAX: float = 300.0

#: Background operations: batch jobs, large transfers (default 600s, max 3600s)
BACKGROUND_TIMEOUT: float = 600.0
BACKGROUND_TIMEOUT_MAX: float = 3600.0


# Generic placeholder for the return type of the callables that the
# decorators and retry helper in this module wrap.
T = TypeVar("T")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# Timeout Error
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class TimeoutException(Exception):
    """Raised when an operation does not finish within its time budget.

    Only ``message`` is forwarded to ``Exception``; the remaining context is
    stored as plain attributes so handlers can inspect it.

    Attributes:
        timeout_seconds: The budget, in seconds, that was exceeded
            (``None`` when the raiser did not supply it).
        operation: Name of the operation that ran out of time
            (``None`` when the raiser did not supply it).
    """

    def __init__(
        self,
        message: str,
        timeout_seconds: Optional[float] = None,
        operation: Optional[str] = None,
    ):
        super().__init__(message)
        # Extra context travels alongside the message rather than in ``args``.
        self.timeout_seconds, self.operation = timeout_seconds, operation
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
# Timeout Decorator
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def with_timeout(
    seconds: float,
    error_message: Optional[str] = None,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Build a decorator that bounds the run time of an async function.

    The wrapped coroutine is awaited through ``asyncio.wait_for``. Any
    ``asyncio.TimeoutError`` that surfaces while creating or awaiting it is
    translated into a ``TimeoutException`` carrying the budget and the
    function name.

    Args:
        seconds: Time budget in seconds.
        error_message: Message for the raised ``TimeoutException``. When
            omitted (or empty), a message naming the function and the budget
            is generated.

    Returns:
        A decorator producing an async wrapper with the same metadata as the
        decorated function.

    Raises:
        TimeoutException: From the wrapper, when the budget is exceeded.

    Example:
        >>> @with_timeout(30, "Database query timed out")
        ... async def query_database(query: str):
        ...     return await db.execute(query)
    """

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> T:
            try:
                pending = func(*args, **kwargs)
                outcome = await asyncio.wait_for(pending, timeout=seconds)
            except asyncio.TimeoutError:
                if error_message:
                    text = error_message
                else:
                    text = f"{func.__name__} timed out after {seconds}s"
                raise TimeoutException(
                    text,
                    timeout_seconds=seconds,
                    operation=func.__name__,
                )
            return outcome

        return wrapper

    return decorator
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# Retry with Backoff
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def retry_with_backoff(
    func: Callable[..., T],
    *,
    max_retries: int = 3,
    base_delay: float = 1.0,
    max_delay: float = 60.0,
    exponential_base: float = 2.0,
    jitter: bool = True,
    retryable_exceptions: Optional[List[Type[Exception]]] = None,
) -> T:
    """Call ``func`` and retry it with exponential backoff on failure.

    ``func`` is invoked up to ``max_retries + 1`` times. Between attempts the
    calling thread sleeps (``time.sleep``) for
    ``base_delay * exponential_base ** attempt`` seconds, optionally scaled
    by a random factor in ``[0.5, 1.5)`` and never longer than ``max_delay``.

    Args:
        func: Function to retry (should take no arguments; use lambda for args).
        max_retries: Maximum number of retry attempts (default 3).
        base_delay: Initial delay in seconds (default 1.0).
        max_delay: Maximum delay cap in seconds (default 60.0). The cap is
            enforced on the final, jittered delay.
        exponential_base: Multiplier for each retry (default 2.0).
        jitter: Add randomness to delay (default True).
        retryable_exceptions: List of exceptions to retry on. ``None`` or an
            empty list means every ``Exception``.

    Returns:
        Result from the function on success.

    Raises:
        Exception: The exception from the last attempt, re-raised unchanged,
            once all retries are exhausted. Exceptions outside
            ``retryable_exceptions`` propagate immediately.
        RuntimeError: If ``max_retries`` is negative, in which case ``func``
            is never called.

    Example:
        >>> result = retry_with_backoff(
        ...     lambda: http_client.get(url),
        ...     max_retries=3,
        ...     retryable_exceptions=[ConnectionError, TimeoutException],
        ... )
    """
    retry_on = tuple(retryable_exceptions or [Exception])

    for attempt in range(max_retries + 1):
        try:
            return func()
        except retry_on:
            if attempt >= max_retries:
                # Out of attempts: a bare ``raise`` keeps the original
                # exception object and traceback, and does not depend on
                # the exception instance being truthy.
                raise

            delay = min(base_delay * (exponential_base**attempt), max_delay)

            if jitter:
                # Spread retries out to avoid a thundering herd, then
                # re-apply the cap so jitter cannot push past ``max_delay``.
                delay = min(delay * (0.5 + random.random()), max_delay)

            time.sleep(delay)

    # Only reachable when the loop body never ran (negative ``max_retries``).
    raise RuntimeError("retry_with_backoff: unexpected state")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def retryable(
    max_retries: int = 3,
    delay: float = 1.0,
    exceptions: Tuple[Type[Exception], ...] = (Exception,),
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Build a decorator that retries a function via ``retry_with_backoff``.

    Every call to the decorated function is routed through
    ``retry_with_backoff`` with that helper's remaining options left at
    their defaults.

    Args:
        max_retries: Maximum retry attempts (default 3).
        delay: Base delay in seconds (default 1.0).
        exceptions: Tuple of exception types that trigger a retry.

    Returns:
        A decorator producing a wrapper with the same metadata as the
        decorated function.

    Example:
        >>> @retryable(max_retries=3, exceptions=(ConnectionError,))
        ... def call_api(endpoint: str):
        ...     return http_client.get(endpoint)
    """

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            # The retry helper expects a zero-argument callable, so bind the
            # caller's arguments here.
            def attempt() -> T:
                return func(*args, **kwargs)

            return retry_with_backoff(
                attempt,
                retryable_exceptions=list(exceptions),
                base_delay=delay,
                max_retries=max_retries,
            )

        return wrapper

    return decorator
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# ---------------------------------------------------------------------------
|
|
249
|
+
# Circuit Breaker
|
|
250
|
+
# ---------------------------------------------------------------------------
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
class CircuitState(Enum):
    """States a ``CircuitBreaker`` can be in.

    CLOSED: Normal operation, requests flow through.
    OPEN: Failures exceeded threshold, requests rejected.
    HALF_OPEN: Testing recovery, limited requests allowed.

    Member values are lowercase strings; ``CircuitBreaker.get_status``
    reports the current state through ``.value``.
    """

    # Normal operation: calls are allowed.
    CLOSED = "closed"
    # Tripped: calls are rejected until the recovery timeout has elapsed.
    OPEN = "open"
    # Probing: a bounded number of trial calls is allowed.
    HALF_OPEN = "half_open"
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
class CircuitBreakerError(Exception):
    """Raised when a circuit breaker refuses to let a call through.

    Only ``message`` is forwarded to ``Exception``; the remaining context is
    stored as plain attributes so handlers can inspect it.

    Attributes:
        breaker_name: Name of the circuit breaker that rejected the call.
        state: State of the breaker as supplied by the raiser.
        retry_after: Seconds until the recovery timeout, as supplied by the
            raiser.
    """

    def __init__(
        self,
        message: str,
        breaker_name: Optional[str] = None,
        state: Optional[CircuitState] = None,
        retry_after: Optional[float] = None,
    ):
        super().__init__(message)
        # All three context values default to None when not supplied.
        self.breaker_name, self.state = breaker_name, state
        self.retry_after = retry_after
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
@dataclass
class CircuitBreaker:
    """Circuit breaker for external dependencies.

    Prevents cascade failures by tracking failures and temporarily
    blocking requests when a dependency is unhealthy. All state reads and
    writes in the methods below happen while holding an internal lock.

    States:
        CLOSED: Normal operation, requests pass through.
        OPEN: Too many failures, requests rejected immediately.
        HALF_OPEN: Testing recovery, limited requests allowed.

    Attributes:
        name: Identifier for this circuit breaker.
        failure_threshold: Failures before opening circuit (default 5).
        recovery_timeout: Seconds before testing recovery (default 30).
        half_open_max_calls: Test calls allowed in half-open (default 3).
        state: Current ``CircuitState``.
        failure_count: Failures recorded since the last reset to zero.
        last_failure_time: ``time.time()`` value of the most recent failure.
        half_open_calls: Trial calls admitted during the current half-open
            phase.

    Example:
        >>> breaker = CircuitBreaker(name="database")
        >>>
        >>> if breaker.can_execute():
        ...     try:
        ...         result = db.query()
        ...         breaker.record_success()
        ...     except Exception:
        ...         breaker.record_failure()
        ...         raise
        ... else:
        ...     raise CircuitBreakerError("Database circuit open")
    """

    name: str = "default"
    failure_threshold: int = 5
    recovery_timeout: float = 30.0
    half_open_max_calls: int = 3

    # Internal state: not accepted by __init__ (init=False); every new
    # instance starts from the field defaults below.
    state: CircuitState = field(default=CircuitState.CLOSED, init=False)
    failure_count: int = field(default=0, init=False)
    last_failure_time: float = field(default=0.0, init=False)
    half_open_calls: int = field(default=0, init=False)
    _lock: Lock = field(default_factory=Lock, init=False)

    def can_execute(self) -> bool:
        """Check if request should proceed.

        In OPEN state the breaker moves to HALF_OPEN once
        ``recovery_timeout`` seconds have passed since the last recorded
        failure, and the call that triggers the transition is admitted as
        the first trial call. In HALF_OPEN state at most
        ``half_open_max_calls`` trial calls are admitted.

        Returns:
            True if request can proceed, False if circuit is open.
        """
        with self._lock:
            if self.state == CircuitState.CLOSED:
                return True

            if self.state == CircuitState.OPEN:
                # Check if recovery timeout has elapsed
                if time.time() - self.last_failure_time >= self.recovery_timeout:
                    self.state = CircuitState.HALF_OPEN
                    # The call admitted here is itself a trial call, so it
                    # must be counted; starting from 0 would let
                    # half_open_max_calls + 1 probes through.
                    self.half_open_calls = 1
                    return True
                return False

            if self.state == CircuitState.HALF_OPEN:
                if self.half_open_calls < self.half_open_max_calls:
                    self.half_open_calls += 1
                    return True
                return False

            # Defensive fallback for a state value outside CircuitState.
            return False

    def record_success(self) -> None:
        """Record successful call.

        In CLOSED (or OPEN) state a success resets ``failure_count``. In
        HALF_OPEN state the circuit closes once a success is recorded after
        ``half_open_calls`` has reached ``half_open_max_calls``; the counter
        itself is only ever advanced by ``can_execute()``.
        """
        with self._lock:
            if self.state == CircuitState.HALF_OPEN:
                # Check if enough trial calls have been admitted for recovery
                # (counter already incremented in can_execute)
                if self.half_open_calls >= self.half_open_max_calls:
                    # Recovery successful
                    self.state = CircuitState.CLOSED
                    self.failure_count = 0
            else:
                # Reset failure count on success
                self.failure_count = 0

    def record_failure(self) -> None:
        """Record failed call.

        Increments the failure count and stamps ``last_failure_time``. The
        circuit opens when the count reaches ``failure_threshold``; in
        HALF_OPEN state any failure returns the circuit to OPEN.
        """
        with self._lock:
            self.failure_count += 1
            self.last_failure_time = time.time()

            if self.state == CircuitState.HALF_OPEN:
                # Recovery failed, back to open
                self.state = CircuitState.OPEN

            elif self.failure_count >= self.failure_threshold:
                self.state = CircuitState.OPEN

    def reset(self) -> None:
        """Reset circuit breaker to closed state and clear all counters."""
        with self._lock:
            self.state = CircuitState.CLOSED
            self.failure_count = 0
            self.half_open_calls = 0
            self.last_failure_time = 0.0

    def get_status(self) -> Dict[str, Any]:
        """Get current circuit breaker status.

        Returns:
            Dict with ``name``, ``state`` (the enum's string value),
            ``failure_count``, ``failure_threshold``, ``recovery_timeout``
            and ``retry_after_seconds``. The last entry is the remaining
            recovery time (never negative) while OPEN and ``None`` otherwise.
        """
        with self._lock:
            retry_after = None
            if self.state == CircuitState.OPEN:
                elapsed = time.time() - self.last_failure_time
                retry_after = max(0.0, self.recovery_timeout - elapsed)

            return {
                "name": self.name,
                "state": self.state.value,
                "failure_count": self.failure_count,
                "failure_threshold": self.failure_threshold,
                "recovery_timeout": self.recovery_timeout,
                "retry_after_seconds": retry_after,
            }
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def with_circuit_breaker(
    breaker: CircuitBreaker,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Build a decorator that guards a function with ``breaker``.

    Each call first asks ``breaker.can_execute()``. Admitted calls report
    their outcome back through ``record_success()`` or ``record_failure()``
    (the latter for any ``Exception``, which is then re-raised). Rejected
    calls raise without invoking the function.

    Args:
        breaker: CircuitBreaker instance to use.

    Returns:
        A decorator producing a wrapper with the same metadata as the
        decorated function.

    Raises:
        CircuitBreakerError: From the wrapper, when the breaker rejects the
            call. Carries the breaker name, its state and the
            ``retry_after_seconds`` value from ``breaker.get_status()``.

    Example:
        >>> db_breaker = CircuitBreaker(name="database", failure_threshold=3)
        >>>
        >>> @with_circuit_breaker(db_breaker)
        ... def query_database(sql: str):
        ...     return db.execute(sql)
    """

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            if breaker.can_execute():
                try:
                    outcome = func(*args, **kwargs)
                    breaker.record_success()
                    return outcome
                except Exception:
                    breaker.record_failure()
                    raise

            # Rejected: raised outside the try block, so the rejection is
            # not itself counted as a failure.
            snapshot = breaker.get_status()
            raise CircuitBreakerError(
                f"Circuit breaker '{breaker.name}' is open",
                breaker_name=breaker.name,
                state=breaker.state,
                retry_after=snapshot.get("retry_after_seconds"),
            )

        return wrapper

    return decorator
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
# ---------------------------------------------------------------------------
|
|
472
|
+
# Health Check Utilities
|
|
473
|
+
# ---------------------------------------------------------------------------
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
@dataclass
class HealthStatus:
    """Result of a single health probe against one dependency.

    Attributes:
        name: Dependency identifier.
        healthy: Whether dependency is healthy.
        latency_ms: Check latency in milliseconds.
        last_check: Timestamp of the check.
        error: Error message if unhealthy.
    """

    # Identifier of the dependency that was probed.
    name: str
    # True when the probe completed without raising or timing out.
    healthy: bool
    # Time spent in the probe, in milliseconds.
    latency_ms: float
    # When the probe finished; health_check() fills this with
    # datetime.utcnow(), i.e. a naive datetime.
    last_check: datetime
    # Failure description; stays None for healthy results.
    error: Optional[str] = None
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
async def health_check(
    name: str,
    check_func: Callable[[], Any],
    timeout: float = FAST_TIMEOUT,
) -> HealthStatus:
    """Check health of a dependency with timeout.

    ``check_func`` is called with no arguments. If it returns an awaitable
    (coroutine, Future/Task, or any object implementing ``__await__``), that
    awaitable is awaited under ``timeout``. A plain synchronous callable
    runs inline to completion, so ``timeout`` cannot interrupt it.

    This function does not raise for probe failures: every ``Exception``
    from the probe is converted into an unhealthy ``HealthStatus``.

    Args:
        name: Identifier for the dependency.
        check_func: Sync or async zero-argument callable that tests
            dependency health; it signals failure by raising.
        timeout: Maximum time to wait for an awaitable check, in seconds
            (default FAST_TIMEOUT).

    Returns:
        HealthStatus with check results. ``error`` holds the timeout
        message, the exception text, or the exception class name when the
        text is empty.

    Example:
        >>> status = await health_check(
        ...     "database",
        ...     lambda: db.execute("SELECT 1"),
        ... )
        >>> if not status.healthy:
        ...     logger.warning(f"DB unhealthy: {status.error}")
    """
    # Function-scope import so this block does not depend on the module's
    # top-level import list.
    import inspect

    start = time.perf_counter()

    def report(healthy: bool, error: Optional[str] = None) -> HealthStatus:
        # Latency and timestamp are taken at the moment the result is built.
        return HealthStatus(
            name=name,
            healthy=healthy,
            latency_ms=(time.perf_counter() - start) * 1000,
            last_check=datetime.utcnow(),
            error=error,
        )

    try:
        result = check_func()
        # isawaitable() also catches Futures/Tasks and custom awaitables,
        # which iscoroutine() would let through un-awaited (and therefore
        # falsely healthy).
        if inspect.isawaitable(result):
            await asyncio.wait_for(result, timeout=timeout)
    except asyncio.TimeoutError:
        return report(False, f"Health check timed out after {timeout}s")
    except Exception as exc:
        # Some exceptions stringify to "", which would hide the failure
        # cause; fall back to the exception class name.
        return report(False, str(exc) or type(exc).__name__)

    return report(True)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
async def check_dependencies(
    checks: Dict[str, Callable[[], Any]],
    timeout_per_check: float = FAST_TIMEOUT,
) -> Dict[str, Any]:
    """Probe several dependencies and summarise the results.

    One ``health_check`` coroutine is created per entry in ``checks`` and
    all of them are awaited together through ``asyncio.gather``.

    Args:
        checks: Dict mapping dependency names to check functions.
        timeout_per_check: Timeout handed to each individual check.

    Returns:
        Dict with:
            ``status``: ``"healthy"`` when nothing failed, else ``"degraded"``.
            ``dependencies``: per-name dict of ``healthy``, ``latency_ms``
                (rounded to two decimals) and ``error``.
            ``unhealthy``: names of the failing dependencies.
            ``checked_at``: ISO-8601 string from ``datetime.utcnow()``.

    Example:
        >>> results = await check_dependencies({
        ...     "database": lambda: db.execute("SELECT 1"),
        ...     "cache": lambda: cache.ping(),
        ...     "api": lambda: http.get(health_url),
        ... })
        >>> if results["status"] == "degraded":
        ...     logger.warning(f"Unhealthy: {results['unhealthy']}")
    """
    probes = [
        health_check(dep_name, probe, timeout_per_check)
        for dep_name, probe in checks.items()
    ]
    outcomes = await asyncio.gather(*probes, return_exceptions=False)

    per_dependency: Dict[str, Dict[str, Any]] = {
        outcome.name: {
            "healthy": outcome.healthy,
            "latency_ms": round(outcome.latency_ms, 2),
            "error": outcome.error,
        }
        for outcome in outcomes
    }
    failing: List[str] = [
        outcome.name for outcome in outcomes if not outcome.healthy
    ]

    overall = "degraded" if failing else "healthy"
    return {
        "status": overall,
        "dependencies": per_dependency,
        "unhealthy": failing,
        "checked_at": datetime.utcnow().isoformat(),
    }
|