kailash 0.6.3__py3-none-any.whl → 0.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kailash/__init__.py +3 -3
- kailash/api/custom_nodes_secure.py +3 -3
- kailash/api/gateway.py +1 -1
- kailash/api/studio.py +1 -1
- kailash/api/workflow_api.py +2 -2
- kailash/core/resilience/bulkhead.py +475 -0
- kailash/core/resilience/circuit_breaker.py +92 -10
- kailash/core/resilience/health_monitor.py +578 -0
- kailash/edge/discovery.py +86 -0
- kailash/mcp_server/__init__.py +309 -33
- kailash/mcp_server/advanced_features.py +1022 -0
- kailash/mcp_server/ai_registry_server.py +27 -2
- kailash/mcp_server/auth.py +789 -0
- kailash/mcp_server/client.py +645 -378
- kailash/mcp_server/discovery.py +1593 -0
- kailash/mcp_server/errors.py +673 -0
- kailash/mcp_server/oauth.py +1727 -0
- kailash/mcp_server/protocol.py +1126 -0
- kailash/mcp_server/registry_integration.py +587 -0
- kailash/mcp_server/server.py +1228 -96
- kailash/mcp_server/transports.py +1169 -0
- kailash/mcp_server/utils/__init__.py +6 -1
- kailash/mcp_server/utils/cache.py +250 -7
- kailash/middleware/auth/auth_manager.py +3 -3
- kailash/middleware/communication/api_gateway.py +1 -1
- kailash/middleware/communication/realtime.py +1 -1
- kailash/middleware/mcp/enhanced_server.py +1 -1
- kailash/nodes/__init__.py +2 -0
- kailash/nodes/admin/audit_log.py +6 -6
- kailash/nodes/admin/permission_check.py +8 -8
- kailash/nodes/admin/role_management.py +32 -28
- kailash/nodes/admin/schema.sql +6 -1
- kailash/nodes/admin/schema_manager.py +13 -13
- kailash/nodes/admin/security_event.py +15 -15
- kailash/nodes/admin/tenant_isolation.py +3 -3
- kailash/nodes/admin/transaction_utils.py +3 -3
- kailash/nodes/admin/user_management.py +21 -21
- kailash/nodes/ai/a2a.py +11 -11
- kailash/nodes/ai/ai_providers.py +9 -12
- kailash/nodes/ai/embedding_generator.py +13 -14
- kailash/nodes/ai/intelligent_agent_orchestrator.py +19 -19
- kailash/nodes/ai/iterative_llm_agent.py +2 -2
- kailash/nodes/ai/llm_agent.py +210 -33
- kailash/nodes/ai/self_organizing.py +2 -2
- kailash/nodes/alerts/discord.py +4 -4
- kailash/nodes/api/graphql.py +6 -6
- kailash/nodes/api/http.py +10 -10
- kailash/nodes/api/rate_limiting.py +4 -4
- kailash/nodes/api/rest.py +15 -15
- kailash/nodes/auth/mfa.py +3 -3
- kailash/nodes/auth/risk_assessment.py +2 -2
- kailash/nodes/auth/session_management.py +5 -5
- kailash/nodes/auth/sso.py +143 -0
- kailash/nodes/base.py +8 -2
- kailash/nodes/base_async.py +16 -2
- kailash/nodes/base_with_acl.py +2 -2
- kailash/nodes/cache/__init__.py +9 -0
- kailash/nodes/cache/cache.py +1172 -0
- kailash/nodes/cache/cache_invalidation.py +874 -0
- kailash/nodes/cache/redis_pool_manager.py +595 -0
- kailash/nodes/code/async_python.py +2 -1
- kailash/nodes/code/python.py +194 -30
- kailash/nodes/compliance/data_retention.py +6 -6
- kailash/nodes/compliance/gdpr.py +5 -5
- kailash/nodes/data/__init__.py +10 -0
- kailash/nodes/data/async_sql.py +1956 -129
- kailash/nodes/data/optimistic_locking.py +906 -0
- kailash/nodes/data/readers.py +8 -8
- kailash/nodes/data/redis.py +378 -0
- kailash/nodes/data/sql.py +314 -3
- kailash/nodes/data/streaming.py +21 -0
- kailash/nodes/enterprise/__init__.py +8 -0
- kailash/nodes/enterprise/audit_logger.py +285 -0
- kailash/nodes/enterprise/batch_processor.py +22 -3
- kailash/nodes/enterprise/data_lineage.py +1 -1
- kailash/nodes/enterprise/mcp_executor.py +205 -0
- kailash/nodes/enterprise/service_discovery.py +150 -0
- kailash/nodes/enterprise/tenant_assignment.py +108 -0
- kailash/nodes/logic/async_operations.py +2 -2
- kailash/nodes/logic/convergence.py +1 -1
- kailash/nodes/logic/operations.py +1 -1
- kailash/nodes/monitoring/__init__.py +11 -1
- kailash/nodes/monitoring/health_check.py +456 -0
- kailash/nodes/monitoring/log_processor.py +817 -0
- kailash/nodes/monitoring/metrics_collector.py +627 -0
- kailash/nodes/monitoring/performance_benchmark.py +137 -11
- kailash/nodes/rag/advanced.py +7 -7
- kailash/nodes/rag/agentic.py +49 -2
- kailash/nodes/rag/conversational.py +3 -3
- kailash/nodes/rag/evaluation.py +3 -3
- kailash/nodes/rag/federated.py +3 -3
- kailash/nodes/rag/graph.py +3 -3
- kailash/nodes/rag/multimodal.py +3 -3
- kailash/nodes/rag/optimized.py +5 -5
- kailash/nodes/rag/privacy.py +3 -3
- kailash/nodes/rag/query_processing.py +6 -6
- kailash/nodes/rag/realtime.py +1 -1
- kailash/nodes/rag/registry.py +1 -1
- kailash/nodes/rag/router.py +1 -1
- kailash/nodes/rag/similarity.py +7 -7
- kailash/nodes/rag/strategies.py +4 -4
- kailash/nodes/security/abac_evaluator.py +6 -6
- kailash/nodes/security/behavior_analysis.py +5 -5
- kailash/nodes/security/credential_manager.py +1 -1
- kailash/nodes/security/rotating_credentials.py +11 -11
- kailash/nodes/security/threat_detection.py +8 -8
- kailash/nodes/testing/credential_testing.py +2 -2
- kailash/nodes/transform/processors.py +5 -5
- kailash/runtime/local.py +163 -9
- kailash/runtime/parameter_injection.py +425 -0
- kailash/runtime/parameter_injector.py +657 -0
- kailash/runtime/testing.py +2 -2
- kailash/testing/fixtures.py +2 -2
- kailash/workflow/builder.py +99 -14
- kailash/workflow/builder_improvements.py +207 -0
- kailash/workflow/input_handling.py +170 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/METADATA +22 -9
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/RECORD +122 -95
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/WHEEL +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/entry_points.txt +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/licenses/LICENSE +0 -0
- {kailash-0.6.3.dist-info → kailash-0.6.5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,673 @@
|
|
1
|
+
"""Enhanced error handling for MCP implementations.
|
2
|
+
|
3
|
+
This module provides structured error codes, error recovery strategies,
|
4
|
+
and enhanced error handling capabilities that build on top of the
|
5
|
+
official MCP SDK error handling.
|
6
|
+
|
7
|
+
Features:
|
8
|
+
- Structured error codes following MCP protocol
|
9
|
+
- Error recovery and retry strategies
|
10
|
+
- Circuit breaker patterns
|
11
|
+
- Error aggregation and reporting
|
12
|
+
- Graceful degradation mechanisms
|
13
|
+
|
14
|
+
Examples:
|
15
|
+
Structured error handling:
|
16
|
+
|
17
|
+
>>> try:
|
18
|
+
... result = await client.call_tool("search", {"query": "test"})
|
19
|
+
... except MCPError as e:
|
20
|
+
... if e.is_retryable():
|
21
|
+
... await asyncio.sleep(e.get_retry_delay())
|
22
|
+
... # retry logic
|
23
|
+
... else:
|
24
|
+
... logger.error(f"Non-retryable error: {e}")
|
25
|
+
|
26
|
+
Error recovery with circuit breaker:
|
27
|
+
|
28
|
+
>>> circuit_breaker = CircuitBreaker(failure_threshold=5, timeout=60)
|
29
|
+
>>> async with circuit_breaker:
|
30
|
+
... result = await risky_operation()
|
31
|
+
"""
|
32
|
+
|
33
|
+
import asyncio
|
34
|
+
import json
|
35
|
+
import logging
|
36
|
+
import time
|
37
|
+
from abc import ABC, abstractmethod
|
38
|
+
from enum import Enum
|
39
|
+
from typing import Any, Dict, List, Optional, Type, Union
|
40
|
+
|
41
|
+
logger = logging.getLogger(__name__)
|
42
|
+
|
43
|
+
|
44
|
+
class MCPErrorCode(Enum):
    """Numeric error codes carried by :class:`MCPError`.

    The members are grouped into three bands:

    * the predefined JSON-RPC codes (``-32700`` and ``-32600`` .. ``-32603``),
    * MCP-specific codes placed in the ``-32099`` .. ``-32000`` range,
    * application-level codes, which use positive numbers.
    """

    # --- Predefined JSON-RPC codes -------------------------------------
    PARSE_ERROR = -32700
    INVALID_REQUEST = -32600
    METHOD_NOT_FOUND = -32601
    INVALID_PARAMS = -32602
    INTERNAL_ERROR = -32603

    # --- MCP-specific codes (reserved range -32099 to -32000) ----------
    TRANSPORT_ERROR = -32001
    AUTHENTICATION_FAILED = -32002
    AUTHORIZATION_FAILED = -32003
    RATE_LIMITED = -32004
    TOOL_NOT_FOUND = -32005
    TOOL_EXECUTION_FAILED = -32006
    RESOURCE_NOT_FOUND = -32007
    RESOURCE_ACCESS_FAILED = -32008
    SERVER_UNAVAILABLE = -32009
    PROTOCOL_VERSION_MISMATCH = -32010
    CAPABILITY_NOT_SUPPORTED = -32011
    SESSION_EXPIRED = -32012
    CIRCUIT_BREAKER_OPEN = -32013

    # --- Application-level codes (positive numbers) --------------------
    VALIDATION_ERROR = 1001
    BUSINESS_LOGIC_ERROR = 1002
    EXTERNAL_SERVICE_ERROR = 1003
    DATA_INTEGRITY_ERROR = 1004
    QUOTA_EXCEEDED = 1005
    REQUEST_TIMEOUT = 1006
    REQUEST_CANCELLED = 1007
|
77
|
+
|
78
|
+
|
79
|
+
class MCPError(Exception):
    """Enhanced MCP error with structured information.

    Extends the basic exception with an :class:`MCPErrorCode`, retry
    information, optional extra data and a creation timestamp.

    Args:
        message: Human-readable error message.
        error_code: Structured error code, either an ``MCPErrorCode`` member
            or the integer value of one.
        data: Additional error data; ``None`` or an empty dict is stored as
            an empty dict.
        retryable: Whether the error is retryable.
        retry_after: Suggested retry delay in seconds.
        cause: Underlying exception, if any. It is stored on ``cause`` and,
            when it is an exception instance, also chained as ``__cause__``
            so tracebacks show it.

    Raises:
        ValueError: If ``error_code`` is an integer that is not the value of
            any ``MCPErrorCode`` member.

    Examples:
        Create structured error:

        >>> error = MCPError(
        ...     "Tool execution failed",
        ...     error_code=MCPErrorCode.TOOL_EXECUTION_FAILED,
        ...     data={"tool": "search", "reason": "timeout"},
        ...     retryable=True,
        ...     retry_after=5
        ... )
    """

    # Fallback delays (seconds) used by get_retry_delay() when the error
    # carries no explicit retry_after. Built once instead of on every call.
    _DEFAULT_RETRY_DELAYS = {
        MCPErrorCode.RATE_LIMITED: 60.0,
        MCPErrorCode.SERVER_UNAVAILABLE: 30.0,
        MCPErrorCode.TRANSPORT_ERROR: 5.0,
        MCPErrorCode.TOOL_EXECUTION_FAILED: 2.0,
        MCPErrorCode.EXTERNAL_SERVICE_ERROR: 10.0,
    }

    # Codes reported as "high" severity by get_severity().
    _HIGH_SEVERITY = frozenset(
        {
            MCPErrorCode.AUTHENTICATION_FAILED,
            MCPErrorCode.AUTHORIZATION_FAILED,
            MCPErrorCode.DATA_INTEGRITY_ERROR,
            MCPErrorCode.PROTOCOL_VERSION_MISMATCH,
        }
    )

    # Codes reported as "medium" severity by get_severity().
    _MEDIUM_SEVERITY = frozenset(
        {
            MCPErrorCode.TOOL_NOT_FOUND,
            MCPErrorCode.RESOURCE_NOT_FOUND,
            MCPErrorCode.VALIDATION_ERROR,
            MCPErrorCode.BUSINESS_LOGIC_ERROR,
        }
    )

    def __init__(
        self,
        message: str,
        error_code: Union[MCPErrorCode, int] = MCPErrorCode.INTERNAL_ERROR,
        data: Optional[Dict[str, Any]] = None,
        retryable: bool = False,
        retry_after: Optional[float] = None,
        cause: Optional[Exception] = None,
    ):
        """Initialize MCP error."""
        super().__init__(message)
        self.message = message
        # Plain integers are normalised to the enum; unknown values raise
        # ValueError from the enum lookup.
        self.error_code = (
            error_code
            if isinstance(error_code, MCPErrorCode)
            else MCPErrorCode(error_code)
        )
        self.data = data or {}
        self.retryable = retryable
        self.retry_after = retry_after
        self.cause = cause
        # Chain the underlying exception so it appears in tracebacks. The
        # isinstance guard keeps construction from failing if a caller
        # passes something that is not an exception.
        if isinstance(cause, BaseException):
            self.__cause__ = cause
        self.timestamp = time.time()

    def to_dict(self) -> Dict[str, Any]:
        """Convert error to dictionary format for JSON-RPC.

        Returns:
            A dict with ``"code"`` (the integer enum value) and ``"message"``;
            ``"data"`` is included only when the error has non-empty data.
        """
        error_dict = {"code": self.error_code.value, "message": self.message}

        if self.data:
            error_dict["data"] = self.data

        return error_dict

    def is_retryable(self) -> bool:
        """Check if error is retryable."""
        return self.retryable

    def get_retry_delay(self) -> float:
        """Get suggested retry delay in seconds.

        Returns:
            ``retry_after`` when it was provided; otherwise a per-code
            default, falling back to ``1.0`` for codes without one.
        """
        if self.retry_after is not None:
            return self.retry_after

        return self._DEFAULT_RETRY_DELAYS.get(self.error_code, 1.0)

    def get_severity(self) -> str:
        """Get error severity level.

        Returns:
            ``"high"``, ``"medium"`` or ``"low"`` depending on the error code.
        """
        if self.error_code in self._HIGH_SEVERITY:
            return "high"
        if self.error_code in self._MEDIUM_SEVERITY:
            return "medium"
        return "low"
|
178
|
+
|
179
|
+
|
180
|
+
class TransportError(MCPError):
    """Transport-related error.

    Defaults to ``MCPErrorCode.TRANSPORT_ERROR`` and ``retryable=True``
    unless overridden through ``kwargs``. The transport type is stored in
    the error data under ``"transport_type"``.

    Args:
        message: Human-readable error message.
        transport_type: Name of the transport involved.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, transport_type: str = "unknown", **kwargs):
        kwargs.setdefault("error_code", MCPErrorCode.TRANSPORT_ERROR)
        kwargs.setdefault("retryable", True)
        # Build a fresh dict: this leaves the caller's ``data`` untouched
        # and tolerates an explicit ``data=None``.
        kwargs["data"] = {
            **(kwargs.get("data") or {}),
            "transport_type": transport_type,
        }
        super().__init__(message, **kwargs)
|
188
|
+
|
189
|
+
|
190
|
+
class AuthenticationError(MCPError):
    """Authentication-related error.

    Defaults to ``MCPErrorCode.AUTHENTICATION_FAILED`` and
    ``retryable=False`` unless overridden through ``kwargs``. The auth type
    is stored in the error data under ``"auth_type"``.

    Args:
        message: Human-readable error message.
        auth_type: Name of the authentication mechanism involved.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, auth_type: str = "unknown", **kwargs):
        kwargs.setdefault("error_code", MCPErrorCode.AUTHENTICATION_FAILED)
        kwargs.setdefault("retryable", False)
        # Build a fresh dict: this leaves the caller's ``data`` untouched
        # and tolerates an explicit ``data=None``.
        kwargs["data"] = {**(kwargs.get("data") or {}), "auth_type": auth_type}
        super().__init__(message, **kwargs)
|
198
|
+
|
199
|
+
|
200
|
+
class AuthorizationError(MCPError):
    """Authorization-related error.

    Defaults to ``MCPErrorCode.AUTHORIZATION_FAILED`` and
    ``retryable=False`` unless overridden through ``kwargs``. The required
    permission is stored in the error data under ``"required_permission"``.

    Args:
        message: Human-readable error message.
        required_permission: Permission that was needed for the operation.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, required_permission: str = "", **kwargs):
        kwargs.setdefault("error_code", MCPErrorCode.AUTHORIZATION_FAILED)
        kwargs.setdefault("retryable", False)
        # Build a fresh dict: this leaves the caller's ``data`` untouched
        # and tolerates an explicit ``data=None``.
        kwargs["data"] = {
            **(kwargs.get("data") or {}),
            "required_permission": required_permission,
        }
        super().__init__(message, **kwargs)
|
208
|
+
|
209
|
+
|
210
|
+
class RateLimitError(MCPError):
    """Error signalling that a rate limit was hit.

    Uses ``MCPErrorCode.RATE_LIMITED`` and ``retryable=True`` unless the
    caller overrides them through ``kwargs``; ``retry_after`` is always
    forwarded to :class:`MCPError`.

    Args:
        message: Human-readable error message.
        retry_after: Suggested wait before retrying, in seconds.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, retry_after: float = 60.0, **kwargs):
        # Start from this class's defaults, then let explicit kwargs win.
        options = {"error_code": MCPErrorCode.RATE_LIMITED, "retryable": True}
        options.update(kwargs)
        options["retry_after"] = retry_after
        super().__init__(message, **options)
|
218
|
+
|
219
|
+
|
220
|
+
class ToolError(MCPError):
    """Tool-related error.

    Defaults to ``MCPErrorCode.TOOL_EXECUTION_FAILED`` and
    ``retryable=True`` unless overridden through ``kwargs``. The tool name
    is stored in the error data under ``"tool_name"``.

    Args:
        message: Human-readable error message.
        tool_name: Name of the tool involved.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, tool_name: str = "", **kwargs):
        kwargs.setdefault("error_code", MCPErrorCode.TOOL_EXECUTION_FAILED)
        kwargs.setdefault("retryable", True)
        # Build a fresh dict: this leaves the caller's ``data`` untouched
        # and tolerates an explicit ``data=None``.
        kwargs["data"] = {**(kwargs.get("data") or {}), "tool_name": tool_name}
        super().__init__(message, **kwargs)
|
228
|
+
|
229
|
+
|
230
|
+
class ResourceError(MCPError):
    """Resource-related error.

    Defaults to ``MCPErrorCode.RESOURCE_ACCESS_FAILED`` and
    ``retryable=True`` unless overridden through ``kwargs``. The resource
    URI is stored in the error data under ``"resource_uri"``.

    Args:
        message: Human-readable error message.
        resource_uri: URI of the resource involved.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, resource_uri: str = "", **kwargs):
        kwargs.setdefault("error_code", MCPErrorCode.RESOURCE_ACCESS_FAILED)
        kwargs.setdefault("retryable", True)
        # Build a fresh dict: this leaves the caller's ``data`` untouched
        # and tolerates an explicit ``data=None``.
        kwargs["data"] = {
            **(kwargs.get("data") or {}),
            "resource_uri": resource_uri,
        }
        super().__init__(message, **kwargs)
|
238
|
+
|
239
|
+
|
240
|
+
class ServiceDiscoveryError(MCPError):
    """Service discovery related error.

    Defaults to ``MCPErrorCode.SERVER_UNAVAILABLE`` and ``retryable=True``
    unless overridden through ``kwargs``. The discovery type is stored in
    the error data under ``"discovery_type"``.

    Args:
        message: Human-readable error message.
        discovery_type: Name of the discovery mechanism involved.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, discovery_type: str = "unknown", **kwargs):
        kwargs.setdefault("error_code", MCPErrorCode.SERVER_UNAVAILABLE)
        kwargs.setdefault("retryable", True)
        # Build a fresh dict: this leaves the caller's ``data`` untouched
        # and tolerates an explicit ``data=None``.
        kwargs["data"] = {
            **(kwargs.get("data") or {}),
            "discovery_type": discovery_type,
        }
        super().__init__(message, **kwargs)
|
248
|
+
|
249
|
+
|
250
|
+
class ValidationError(MCPError):
    """Error raised for validation failures.

    Uses ``MCPErrorCode.VALIDATION_ERROR`` and ``retryable=False`` unless
    the caller overrides them through ``kwargs``.

    Args:
        message: Human-readable error message.
        **kwargs: Forwarded to :class:`MCPError`.
    """

    def __init__(self, message: str, **kwargs):
        # Start from this class's defaults, then let explicit kwargs win.
        options = {"error_code": MCPErrorCode.VALIDATION_ERROR, "retryable": False}
        options.update(kwargs)
        super().__init__(message, **options)
|
257
|
+
|
258
|
+
|
259
|
+
class RetryStrategy(ABC):
    """Interface that every retry strategy implements.

    A strategy answers two questions for a failed attempt: whether another
    attempt should be made, and how long to wait before making it.
    """

    @abstractmethod
    def should_retry(self, error: MCPError, attempt: int) -> bool:
        """Return whether the operation should be attempted again.

        Args:
            error: The error raised by the failed attempt.
            attempt: Number of the attempt that just failed.
        """

    @abstractmethod
    def get_delay(self, error: MCPError, attempt: int) -> float:
        """Return how many seconds to wait before the next attempt.

        Args:
            error: The error raised by the failed attempt.
            attempt: Number of the attempt that just failed.
        """
|
271
|
+
|
272
|
+
|
273
|
+
class ExponentialBackoffRetry(RetryStrategy):
    """Exponential backoff retry strategy.

    Args:
        max_attempts: Maximum retry attempts
        base_delay: Base delay in seconds
        max_delay: Maximum delay in seconds
        backoff_factor: Exponential backoff factor
        jitter: Add random jitter to prevent thundering herd

    Examples:
        Create retry strategy:

        >>> retry = ExponentialBackoffRetry(
        ...     max_attempts=5,
        ...     base_delay=1.0,
        ...     max_delay=60.0,
        ...     backoff_factor=2.0,
        ...     jitter=True
        ... )
    """

    def __init__(
        self,
        max_attempts: int = 3,
        base_delay: float = 1.0,
        max_delay: float = 60.0,
        backoff_factor: float = 2.0,
        jitter: bool = True,
    ):
        """Initialize exponential backoff retry."""
        self.max_attempts = max_attempts
        self.base_delay = base_delay
        self.max_delay = max_delay
        self.backoff_factor = backoff_factor
        self.jitter = jitter

    def should_retry(self, error: MCPError, attempt: int) -> bool:
        """Check if operation should be retried.

        A retry is allowed only while ``attempt`` is below ``max_attempts``,
        the error reports itself as retryable, and its severity is not
        ``"high"``.
        """
        return (
            attempt < self.max_attempts
            and error.is_retryable()
            and error.get_severity() != "high"
        )

    def get_delay(self, error: MCPError, attempt: int) -> float:
        """Calculate the delay before the next attempt, in seconds.

        When the error carries an explicit ``retry_after`` it is used as
        given (capped at ``max_delay``) and is never jittered: the jitter
        factor only scales a delay down, so applying it would schedule the
        retry before the requested wait has elapsed.

        Otherwise the delay is ``base_delay * backoff_factor ** (attempt - 1)``
        capped at ``max_delay`` and, when ``jitter`` is enabled, scaled by a
        random factor in ``[0.5, 1.0)``.
        """
        # Explicit hint from the error: treat it as a lower bound.
        if error.retry_after is not None:
            return min(error.retry_after, self.max_delay)

        delay = self.base_delay * (self.backoff_factor ** (attempt - 1))

        # Apply maximum delay limit
        delay = min(delay, self.max_delay)

        # Add jitter to prevent thundering herd
        if self.jitter:
            import random

            delay *= 0.5 + random.random() * 0.5

        return delay
|
336
|
+
|
337
|
+
|
338
|
+
class CircuitBreakerRetry(RetryStrategy):
    """Circuit breaker retry strategy.

    Implements the circuit breaker pattern to prevent cascading failures.
    The breaker moves between three states stored in ``state``:
    ``"closed"``, ``"open"`` and ``"half-open"``.

    Args:
        failure_threshold: Number of failures before opening circuit
        timeout: Time to wait before trying to close circuit
        success_threshold: Number of successes needed to close circuit

    Examples:
        Create circuit breaker:

        >>> circuit_breaker = CircuitBreakerRetry(
        ...     failure_threshold=5,
        ...     timeout=60.0,
        ...     success_threshold=3
        ... )
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        timeout: float = 60.0,
        success_threshold: int = 3,
    ):
        """Initialize circuit breaker."""
        self.failure_threshold = failure_threshold
        self.timeout = timeout
        self.success_threshold = success_threshold

        # Circuit breaker state
        self.failure_count = 0
        self.success_count = 0
        self.last_failure_time = 0
        self.state = "closed"  # closed, open, half-open

    def should_retry(self, error: MCPError, attempt: int) -> bool:
        """Check if operation should be retried based on circuit state.

        Note that this call can change state: an open circuit whose timeout
        has elapsed is switched to ``"half-open"`` here.
        """
        now = time.time()

        if self.state == "open":
            # Check if timeout has passed
            if now - self.last_failure_time > self.timeout:
                self.state = "half-open"
                self.success_count = 0
                return True
            return False

        if self.state == "half-open":
            # Allow limited retries to test if service recovered
            return self.success_count < self.success_threshold

        # closed
        return error.is_retryable()

    def get_delay(self, error: MCPError, attempt: int) -> float:
        """Get delay based on circuit state.

        While the circuit is open this is the time left until the timeout
        elapses, never less than zero. In any other state the error's own
        suggested retry delay is returned.
        """
        if self.state == "open":
            remaining = self.timeout - (time.time() - self.last_failure_time)
            # Once the timeout has passed the difference goes negative;
            # a delay must not be negative.
            return max(0.0, remaining)
        return error.get_retry_delay()

    def on_success(self):
        """Record successful operation.

        In the half-open state successes are counted and the circuit closes
        once ``success_threshold`` is reached. In the closed state a success
        clears the failure count, so only failures without an intervening
        success add up towards ``failure_threshold``.
        """
        if self.state == "half-open":
            self.success_count += 1
            if self.success_count >= self.success_threshold:
                self.state = "closed"
                self.failure_count = 0
        elif self.state == "closed":
            # Without this reset, unrelated failures spread over the whole
            # lifetime of the breaker would eventually trip it.
            self.failure_count = 0

    def on_failure(self, error: MCPError):
        """Record failed operation.

        Any failure while half-open re-opens the circuit; otherwise the
        circuit opens once ``failure_count`` reaches ``failure_threshold``.
        """
        self.failure_count += 1
        self.last_failure_time = time.time()

        if self.state == "half-open":
            self.state = "open"
        elif self.failure_count >= self.failure_threshold:
            self.state = "open"
|
419
|
+
|
420
|
+
|
421
|
+
class RetryableOperation:
    """Wrapper for operations with retry logic.

    Args:
        retry_strategy: Retry strategy to use
        logger: Optional logger for retry events

    Examples:
        Execute operation with retries:

        >>> retry_op = RetryableOperation(
        ...     ExponentialBackoffRetry(max_attempts=5)
        ... )
        >>> result = await retry_op.execute(risky_function, arg1, arg2)
    """

    def __init__(
        self, retry_strategy: RetryStrategy, logger: Optional[logging.Logger] = None
    ):
        """Initialize retryable operation."""
        self.retry_strategy = retry_strategy
        self.logger = logger or logging.getLogger(__name__)

    async def execute(self, func, *args, **kwargs):
        """Execute function with retry logic.

        ``func`` is called with the given arguments; if the call returns an
        awaitable (a coroutine function, or a plain callable such as a
        lambda that returns a coroutine) the awaitable is awaited inside the
        retry loop so that its failures are retried as well.

        Args:
            func: Function to execute (can be sync or async)
            *args: Function arguments
            **kwargs: Function keyword arguments

        Returns:
            Function result

        Raises:
            MCPError: If the strategy declines a further retry (the last
                ``MCPError`` is re-raised), or if ``func`` raises any other
                ``Exception``, which is wrapped in a non-retryable
                ``MCPError`` chained to the original.
        """
        import inspect

        attempt = 0

        while True:
            attempt += 1

            try:
                # Execute function (handle both sync and async)
                result = func(*args, **kwargs)
                if inspect.isawaitable(result):
                    result = await result

                # Record success for circuit breaker
                if isinstance(self.retry_strategy, CircuitBreakerRetry):
                    self.retry_strategy.on_success()

                return result

            except MCPError as error:
                # Record failure for circuit breaker
                if isinstance(self.retry_strategy, CircuitBreakerRetry):
                    self.retry_strategy.on_failure(error)

                # Check if we should retry
                if not self.retry_strategy.should_retry(error, attempt):
                    self.logger.error(
                        f"Operation failed after {attempt} attempts: {error}"
                    )
                    raise

                # Calculate retry delay
                delay = self.retry_strategy.get_delay(error, attempt)
                self.logger.warning(
                    f"Operation failed (attempt {attempt}), retrying in {delay:.2f}s: {error}"
                )

                # Wait before retry
                await asyncio.sleep(delay)

            except Exception as error:
                # Convert unexpected errors to MCPError; these are never
                # retried.
                mcp_error = MCPError(
                    f"Unexpected error: {error}",
                    error_code=MCPErrorCode.INTERNAL_ERROR,
                    cause=error,
                    retryable=False,
                )
                self.logger.error(f"Unexpected error in retryable operation: {error}")
                raise mcp_error from error
|
510
|
+
|
511
|
+
|
512
|
+
class ErrorAggregator:
    """Aggregates and reports errors for monitoring.

    Keeps the most recent ``max_errors`` errors for statistics and trends,
    plus a running per-code count in ``error_counts`` that is not trimmed.

    Args:
        max_errors: Maximum number of recent errors retained in ``errors``.

    Examples:
        Track errors:

        >>> aggregator = ErrorAggregator()
        >>> aggregator.record_error(error)
        >>> stats = aggregator.get_error_stats()
    """

    def __init__(self, max_errors: int = 1000):
        """Initialize error aggregator."""
        self.max_errors = max_errors
        self.errors: List[MCPError] = []
        self.error_counts: Dict[MCPErrorCode, int] = {}

    def record_error(self, error: MCPError):
        """Record an error occurrence."""
        self.errors.append(error)

        # Keep only recent errors. Deleting the excess from the front
        # (rather than slicing with ``[-max_errors:]``) also trims correctly
        # when ``max_errors`` is 0, where ``[-0:]`` would keep everything.
        excess = len(self.errors) - self.max_errors
        if excess > 0:
            del self.errors[:excess]

        # Update counts
        self.error_counts[error.error_code] = (
            self.error_counts.get(error.error_code, 0) + 1
        )

    def get_error_stats(self, time_window: Optional[float] = None) -> Dict[str, Any]:
        """Get error statistics.

        Args:
            time_window: Time window in seconds (None for all errors)

        Returns:
            ``{"total_errors": 0}`` when no retained error matches.
            Otherwise a dict with ``total_errors``, ``error_rate`` (errors
            per second over ``time_window``, or over 3600 seconds when no
            window is given; the divisor is at least 1), ``error_codes`` and
            ``severity_levels`` (counts per value), ``most_common_error``
            (a ``(code, count)`` tuple), ``retryable_errors`` and
            ``time_window``.
        """
        from collections import Counter

        now = time.time()

        # Filter errors by time window. ``is not None`` so that an explicit
        # window of 0 is honoured instead of meaning "all errors".
        if time_window is not None:
            recent_errors = [e for e in self.errors if now - e.timestamp <= time_window]
        else:
            recent_errors = self.errors

        if not recent_errors:
            return {"total_errors": 0}

        # Count once and reuse for both the breakdown and the top entry.
        code_counts = Counter(e.error_code for e in recent_errors)
        severity_counts = Counter(e.get_severity() for e in recent_errors)

        return {
            "total_errors": len(recent_errors),
            "error_rate": len(recent_errors)
            / max(time_window or 3600, 1),  # per second
            "error_codes": dict(code_counts),
            "severity_levels": dict(severity_counts),
            "most_common_error": code_counts.most_common(1)[0],
            "retryable_errors": sum(1 for e in recent_errors if e.is_retryable()),
            "time_window": time_window,
        }

    def get_error_trends(self, bucket_size: float = 300) -> List[Dict[str, Any]]:
        """Get error trends over time.

        Buckets start at the oldest retained error's timestamp and continue
        until both the current time and every retained error are covered.

        Args:
            bucket_size: Time bucket size in seconds; must be positive.

        Returns:
            List of time buckets, each a dict with ``start_time``,
            ``end_time``, ``error_count`` and ``error_codes`` (the distinct
            codes seen in that bucket). Empty when no errors are retained.

        Raises:
            ValueError: If ``bucket_size`` is not positive (the bucket
                cursor would never advance).
        """
        if bucket_size <= 0:
            raise ValueError(f"bucket_size must be positive, got {bucket_size}")

        if not self.errors:
            return []

        now = time.time()

        # Sort once so each bucket is a contiguous slice instead of a full
        # scan of every error per bucket.
        ordered = sorted(self.errors, key=lambda e: e.timestamp)
        total = len(ordered)

        buckets = []
        bucket_start = ordered[0].timestamp
        cursor = 0

        # Continue while time remains before ``now`` or errors remain
        # unassigned; the second condition covers errors stamped at (or
        # after) ``now``, which a pure time check would drop.
        while bucket_start < now or cursor < total:
            bucket_end = bucket_start + bucket_size

            first = cursor
            while cursor < total and ordered[cursor].timestamp < bucket_end:
                cursor += 1
            bucket_errors = ordered[first:cursor]

            buckets.append(
                {
                    "start_time": bucket_start,
                    "end_time": bucket_end,
                    "error_count": len(bucket_errors),
                    "error_codes": list({e.error_code for e in bucket_errors}),
                }
            )

            bucket_start = bucket_end

        return buckets
|
621
|
+
|
622
|
+
|
623
|
+
# Convenience functions
|
624
|
+
def create_retry_operation(
    strategy: str = "exponential", **strategy_kwargs
) -> RetryableOperation:
    """Build a :class:`RetryableOperation` for a named retry strategy.

    Args:
        strategy: Either ``"exponential"`` or ``"circuit_breaker"``.
        **strategy_kwargs: Passed to the chosen strategy's constructor.

    Returns:
        A ``RetryableOperation`` wrapping the newly created strategy.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    # Map each supported name to the class that implements it.
    strategy_classes = {
        "exponential": ExponentialBackoffRetry,
        "circuit_breaker": CircuitBreakerRetry,
    }

    strategy_cls = strategy_classes.get(strategy)
    if strategy_cls is None:
        raise ValueError(f"Unknown retry strategy: {strategy}")

    return RetryableOperation(strategy_cls(**strategy_kwargs))
|
644
|
+
|
645
|
+
|
646
|
+
def wrap_with_error_handling(func):
    """Decorator to wrap functions with MCP error handling.

    The returned wrapper is always a coroutine function, whether ``func``
    is sync or async. ``MCPError`` raised by ``func`` propagates unchanged;
    any other ``Exception`` is converted to a retryable ``MCPError`` with
    code ``INTERNAL_ERROR``, chained to the original exception.

    The wrapper keeps ``func``'s name, docstring and other metadata.

    Args:
        func: Sync or async callable to wrap.

    Returns:
        An async wrapper around ``func``.

    Examples:
        >>> @wrap_with_error_handling
        ... async def risky_operation():
        ...     # This might fail
        ...     return "success"
    """
    import functools

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        try:
            if asyncio.iscoroutinefunction(func):
                return await func(*args, **kwargs)
            return func(*args, **kwargs)
        except MCPError:
            raise  # Re-raise MCP errors as-is
        except Exception as e:
            # Convert to MCP error, keeping the original in the chain
            raise MCPError(
                f"Operation failed: {e}",
                error_code=MCPErrorCode.INTERNAL_ERROR,
                cause=e,
                retryable=True,
            ) from e

    return wrapper
|