code-puppy 0.0.127__py3-none-any.whl → 0.0.128__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_puppy/__init__.py +1 -0
- code_puppy/agent.py +65 -69
- code_puppy/agents/agent_code_puppy.py +0 -3
- code_puppy/agents/runtime_manager.py +212 -0
- code_puppy/command_line/command_handler.py +56 -25
- code_puppy/command_line/mcp_commands.py +1298 -0
- code_puppy/command_line/meta_command_handler.py +3 -2
- code_puppy/command_line/model_picker_completion.py +21 -8
- code_puppy/main.py +52 -157
- code_puppy/mcp/__init__.py +23 -0
- code_puppy/mcp/async_lifecycle.py +237 -0
- code_puppy/mcp/circuit_breaker.py +218 -0
- code_puppy/mcp/config_wizard.py +437 -0
- code_puppy/mcp/dashboard.py +291 -0
- code_puppy/mcp/error_isolation.py +360 -0
- code_puppy/mcp/examples/retry_example.py +208 -0
- code_puppy/mcp/health_monitor.py +549 -0
- code_puppy/mcp/managed_server.py +346 -0
- code_puppy/mcp/manager.py +701 -0
- code_puppy/mcp/registry.py +412 -0
- code_puppy/mcp/retry_manager.py +321 -0
- code_puppy/mcp/server_registry_catalog.py +751 -0
- code_puppy/mcp/status_tracker.py +355 -0
- code_puppy/messaging/spinner/textual_spinner.py +6 -2
- code_puppy/model_factory.py +19 -4
- code_puppy/models.json +8 -6
- code_puppy/tui/app.py +19 -27
- code_puppy/tui/tests/test_agent_command.py +22 -15
- {code_puppy-0.0.127.data → code_puppy-0.0.128.data}/data/code_puppy/models.json +8 -6
- {code_puppy-0.0.127.dist-info → code_puppy-0.0.128.dist-info}/METADATA +2 -3
- {code_puppy-0.0.127.dist-info → code_puppy-0.0.128.dist-info}/RECORD +34 -18
- {code_puppy-0.0.127.dist-info → code_puppy-0.0.128.dist-info}/WHEEL +0 -0
- {code_puppy-0.0.127.dist-info → code_puppy-0.0.128.dist-info}/entry_points.txt +0 -0
- {code_puppy-0.0.127.dist-info → code_puppy-0.0.128.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MCP Dashboard Implementation
|
|
3
|
+
|
|
4
|
+
Provides visual status dashboard for MCP servers using Rich tables.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from datetime import datetime, timedelta
|
|
8
|
+
from typing import Dict, List, Any, Optional
|
|
9
|
+
from rich.table import Table
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich import box
|
|
12
|
+
|
|
13
|
+
from .status_tracker import ServerState, Event
|
|
14
|
+
from .manager import get_mcp_manager
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MCPDashboard:
|
|
18
|
+
"""Visual dashboard for MCP server status monitoring"""
|
|
19
|
+
|
|
20
|
+
def __init__(self):
    """Create the dashboard together with the Rich console it prints to."""
    # Default-configured console; print_dashboard() writes through it.
    rich_console = Console()
    self.console = rich_console
|
|
23
|
+
|
|
24
|
+
def render_dashboard(self) -> Table:
    """
    Build the main MCP server status table.

    One row is added per server returned by the MCP manager. When the
    manager reports no servers a single placeholder row is added, and when
    the lookup or a row renderer raises, a single error row carrying the
    exception text is added instead of propagating the failure.

    Returns:
        Table: Rich table with server status information
    """
    # Local import: rich is already a dependency of this module, and this
    # keeps the helper in scope without touching the file-level imports.
    from rich.markup import escape

    table = Table(
        title="MCP Server Status Dashboard",
        box=box.ROUNDED,
        show_header=True,
        header_style="bold blue",
        title_style="bold cyan",
    )

    # Columns, in display order. Only "Name" is allowed to grow.
    table.add_column("Name", style="white", no_wrap=True, min_width=10)
    table.add_column("Type", style="white", no_wrap=True, width=8)
    table.add_column("State", style="white", no_wrap=True, width=8)
    table.add_column("Health", style="white", no_wrap=True, width=8)
    table.add_column("Uptime", style="white", no_wrap=True, width=10)
    table.add_column("Latency", style="white", no_wrap=True, width=10)

    try:
        manager = get_mcp_manager()
        servers = manager.list_servers()

        if not servers:
            # Empty state
            table.add_row(
                "[dim]No servers configured[/dim]",
                "-", "-", "-", "-", "-",
            )
        else:
            for server in servers:
                table.add_row(*self.render_server_row(server))

    except Exception as e:
        # Deliberately broad: the dashboard is a best-effort view and shows
        # the failure in the table rather than raising to the caller.
        # The message is escaped because Rich parses square brackets in
        # cell strings as markup tags; an unescaped "[...]" in the
        # exception text could be swallowed or make rendering fail.
        table.add_row(
            "[red]Error loading servers[/red]",
            "-", "-", "-", "-", f"[red]{escape(str(e))}[/red]",
        )

    return table
|
|
73
|
+
|
|
74
|
+
def render_server_row(self, server) -> List[str]:
    """
    Render a single server row for the dashboard

    Args:
        server: Object exposing ``name``, ``id``, ``type``, ``state``,
            ``health``, ``start_time`` and ``latency_ms`` attributes

    Returns:
        List[str]: Six cell strings in column order: name, type, state,
        health, uptime, latency
    """
    # Local import: rich is already a dependency of this module.
    from rich.markup import escape

    # Name and type are free text, so they are escaped: Rich would
    # otherwise treat any "[...]" inside them as markup tags.
    # Falls back to the first 8 characters of the id when the name is empty.
    name = escape(str(server.name or server.id[:8]))
    server_type = escape(server.type.upper()) if server.type else "UNK"

    # These two already return Rich markup and must not be escaped.
    state_indicator = self.render_state_indicator(server.state)
    health_indicator = self.render_health_indicator(server.health)

    # A falsy start time means "not started": show a dash.
    uptime_str = self.format_uptime(server.start_time) if server.start_time else "-"

    # Compared against None explicitly so that a latency of 0 is still shown.
    latency_str = self.format_latency(server.latency_ms) if server.latency_ms is not None else "-"

    return [
        name,
        server_type,
        state_indicator,
        health_indicator,
        uptime_str,
        latency_str,
    ]
|
|
110
|
+
|
|
111
|
+
def render_health_indicator(self, health: Optional[Dict]) -> str:
    """
    Turn a health report into a one-character coloured marker.

    Args:
        health: Health status dictionary (``is_healthy`` and ``error``
            keys are read) or None

    Returns:
        str: Rich markup string: dim "?" when nothing is reported, green
        check when healthy, red cross when unhealthy with an error, and
        yellow "?" when unhealthy without an error
    """
    # None and an empty mapping both count as "nothing reported".
    if not health:
        return "[dim]?[/dim]"

    healthy, failure = health.get('is_healthy', False), health.get('error')

    if healthy:
        return "[green]✓[/green]"

    # Not healthy: distinguish a recorded error from an undetermined state.
    return "[red]✗[/red]" if failure else "[yellow]?[/yellow]"
|
|
133
|
+
|
|
134
|
+
def render_state_indicator(self, state: ServerState) -> str:
    """
    Map a server state to its coloured symbol-plus-label marker.

    Args:
        state: Current server state

    Returns:
        str: Rich markup string for the state, or a dim "? Unk" marker
        for any state that has no entry in the lookup table
    """
    markers = {
        ServerState.RUNNING: "[green]✓ Run[/green]",
        ServerState.STOPPED: "[red]✗ Stop[/red]",
        ServerState.ERROR: "[red]⚠ Err[/red]",
        ServerState.STARTING: "[yellow]⏳ Start[/yellow]",
        ServerState.STOPPING: "[yellow]⏳ Stop[/yellow]",
        ServerState.QUARANTINED: "[yellow]⏸ Quar[/yellow]",
    }

    try:
        return markers[state]
    except KeyError:
        # State without a dedicated marker.
        return "[dim]? Unk[/dim]"
|
|
154
|
+
|
|
155
|
+
def render_metrics_summary(self, metrics: Dict) -> str:
    """
    Render a summary of server metrics

    Recognised keys are ``request_count``, ``error_rate`` (a fraction,
    e.g. 0.07 for 7%) and ``avg_response_time`` (formatted as
    milliseconds). A key whose value is None is treated the same as a
    missing key, so a partially populated metrics dict does not raise.

    Args:
        metrics: Dictionary of server metrics

    Returns:
        str: The available parts joined with " | "; "No metrics" when
        the dict is empty or None, "No data" when none of the recognised
        keys carries a value
    """
    if not metrics:
        return "No metrics"

    parts = []

    # Request count
    request_count = metrics.get('request_count')
    if request_count is not None:
        parts.append(f"Req: {request_count}")

    # Error rate, coloured by severity: above 10% red, above 5% yellow.
    error_rate = metrics.get('error_rate')
    if error_rate is not None:
        if error_rate > 0.1:
            colour = "red"
        elif error_rate > 0.05:
            colour = "yellow"
        else:
            colour = "green"
        parts.append(f"[{colour}]Err: {error_rate:.1%}[/{colour}]")

    # Average response time
    avg_time = metrics.get('avg_response_time')
    if avg_time is not None:
        parts.append(f"Avg: {avg_time:.0f}ms")

    return " | ".join(parts) if parts else "No data"
|
|
190
|
+
|
|
191
|
+
def format_uptime(self, start_time: datetime) -> str:
    """
    Format uptime duration in human readable format

    The two most significant non-zero units are shown: seconds only below
    one minute, then "Xm Ys", "Xh Ym" and "Xd Yh". A zero minor unit is
    omitted (e.g. "5m", "3d").

    Args:
        start_time: Server start timestamp, naive or timezone-aware

    Returns:
        str: Formatted uptime string (e.g., "2h 15m"); "-" when no start
        time is given, "0s" when the start time lies in the future, and
        "?" when the elapsed time cannot be computed
    """
    if not start_time:
        return "-"

    try:
        # Take "now" in the same timezone-awareness as start_time.
        # Subtracting an aware datetime from a naive one raises TypeError,
        # which previously turned every aware start time into "?".
        now = datetime.now(tz=start_time.tzinfo)
        elapsed = (now - start_time).total_seconds()
    except Exception:
        # Best-effort formatter: anything that is not a usable datetime
        # is displayed as unknown rather than breaking the dashboard.
        return "?"

    # Handle negative uptime (clock skew, etc.)
    if elapsed < 0:
        return "0s"

    minutes, seconds = divmod(int(elapsed), 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)

    if days:
        leading, trailing = f"{days}d", (f"{hours}h" if hours else "")
    elif hours:
        leading, trailing = f"{hours}h", (f"{minutes}m" if minutes else "")
    elif minutes:
        leading, trailing = f"{minutes}m", (f"{seconds}s" if seconds else "")
    else:
        return f"{seconds}s"

    return f"{leading} {trailing}" if trailing else leading
|
|
240
|
+
|
|
241
|
+
def format_latency(self, latency_ms: float) -> str:
    """
    Format latency in human readable format

    Args:
        latency_ms: Latency in milliseconds

    Returns:
        str: "-" for None, "invalid" for negative or NaN input, "error"
        for values that cannot be compared as numbers; otherwise Rich
        markup coloured by speed: green below 50ms, yellow below 200ms,
        red from 200ms up, shown in seconds from 1000ms and as "timeout"
        from 30000ms
    """
    if latency_ms is None:
        return "-"

    try:
        # NaN is the only value unequal to itself. It fails every "<" test
        # below, so without this check it would be rendered as "nans".
        if latency_ms != latency_ms or latency_ms < 0:
            return "invalid"
        if latency_ms < 50:  # Fast
            return f"[green]{latency_ms:.0f}ms[/green]"
        if latency_ms < 200:  # Acceptable
            return f"[yellow]{latency_ms:.0f}ms[/yellow]"
        if latency_ms < 1000:  # Slow
            return f"[red]{latency_ms:.0f}ms[/red]"
        if latency_ms >= 30000:  # Timeout (30s+)
            return "[red]timeout[/red]"

        # Very slow: between 1s and 30s, shown in seconds.
        return f"[red]{latency_ms / 1000:.1f}s[/red]"

    except (ValueError, TypeError):
        return "error"
|
|
271
|
+
|
|
272
|
+
def print_dashboard(self) -> None:
    """Render the dashboard table and write it to this instance's console."""
    rendered = self.render_dashboard()
    out = self.console
    out.print(rendered)
    # Argument-less print adds a blank line of spacing after the table.
    out.print()
|
|
277
|
+
|
|
278
|
+
def get_dashboard_string(self) -> str:
    """
    Render the dashboard into a string instead of printing it.

    Returns:
        str: The output captured from an 80-column console
    """
    # A throwaway console, separate from self.console; everything printed
    # inside the capture() context is collected instead of being emitted.
    scratch_console = Console(file=None, width=80)

    with scratch_console.capture() as captured:
        dashboard = self.render_dashboard()
        scratch_console.print(dashboard)

    return captured.get()
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MCP Error Isolation System
|
|
3
|
+
|
|
4
|
+
This module provides error isolation for MCP server calls to prevent
|
|
5
|
+
server errors from crashing the application. It implements quarantine
|
|
6
|
+
logic with exponential backoff for failed servers.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import logging
|
|
11
|
+
from datetime import datetime, timedelta
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any, Callable, Dict, Optional
|
|
14
|
+
from enum import Enum
|
|
15
|
+
import traceback
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class ErrorStats:
    """Per-server error counters and quarantine bookkeeping."""

    # Incremented on every recorded error.
    total_errors: int = 0
    # Incremented on every recorded error, set back to 0 on a recorded success.
    consecutive_errors: int = 0
    # Timestamp of the most recent recorded error, None before the first one.
    last_error: Optional[datetime] = None
    # Error category value -> number of errors recorded in that category.
    error_types: Dict[str, int] = field(default_factory=dict)
    # How many times the server has been placed in quarantine.
    quarantine_count: int = 0
    # End of the current quarantine, None when the server is not quarantined.
    quarantine_until: Optional[datetime] = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ErrorCategory(Enum):
    """Buckets an isolated error can be sorted into; values are the stored keys."""

    # Transport-level categories
    NETWORK = "network"
    PROTOCOL = "protocol"
    # Remote-side categories
    SERVER = "server"
    RATE_LIMIT = "rate_limit"
    AUTHENTICATION = "authentication"
    # Fallback when no other category matches
    UNKNOWN = "unknown"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class MCPErrorIsolator:
|
|
43
|
+
"""
|
|
44
|
+
Isolates MCP server errors to prevent application crashes.
|
|
45
|
+
|
|
46
|
+
Features:
|
|
47
|
+
- Quarantine servers after consecutive failures
|
|
48
|
+
- Exponential backoff for quarantine duration
|
|
49
|
+
- Error categorization and tracking
|
|
50
|
+
- Automatic recovery after successful calls
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
def __init__(self, quarantine_threshold: int = 5, max_quarantine_minutes: int = 30):
    """
    Set up an isolator with empty per-server statistics.

    Args:
        quarantine_threshold: Number of consecutive errors to trigger quarantine
        max_quarantine_minutes: Maximum quarantine duration in minutes
    """
    self.quarantine_threshold = quarantine_threshold
    # Stored as a timedelta; the backoff calculation converts it to seconds.
    self.max_quarantine_duration = timedelta(minutes=max_quarantine_minutes)
    # server_id -> ErrorStats, filled in lazily as servers are first seen.
    self.server_stats: Dict[str, ErrorStats] = {}
    # Guards the statistics updates made by the async methods.
    self._lock = asyncio.Lock()

    startup_message = (
        f"MCPErrorIsolator initialized with threshold={quarantine_threshold}, "
        f"max_quarantine={max_quarantine_minutes}min"
    )
    logger.info(startup_message)
|
|
70
|
+
|
|
71
|
+
async def isolated_call(self, server_id: str, func: Callable, *args, **kwargs) -> Any:
    """
    Execute a function call with error isolation.

    The quarantine check and the success/error bookkeeping each take the
    internal lock; the lock is not held while ``func`` itself runs.

    Args:
        server_id: ID of the MCP server making the call
        func: Function to execute. May be a coroutine function or any
            callable; if the call returns an awaitable it is awaited
        *args: Arguments for the function
        **kwargs: Keyword arguments for the function

    Returns:
        Result of the function call

    Raises:
        QuarantinedServerError: If the server is currently quarantined
        Exception: Any exception raised by ``func`` is recorded against
            the server and then re-raised unchanged
    """
    # Local import keeps this method self-contained.
    import inspect

    async with self._lock:
        # Check if server is quarantined
        if self.is_quarantined(server_id):
            quarantine_until = self.server_stats[server_id].quarantine_until
            raise QuarantinedServerError(
                f"Server {server_id} is quarantined until {quarantine_until}"
            )

    try:
        # Inspect the result rather than the callable: a plain callable
        # (lambda, wrapper, callable object) that returns a coroutine must
        # still be awaited, otherwise the un-run coroutine would be handed
        # back and counted as a success.
        result = func(*args, **kwargs)
        if inspect.isawaitable(result):
            result = await result

        # Record success
        async with self._lock:
            await self._record_success(server_id)

        return result

    except Exception as error:
        # Record and categorize the error
        async with self._lock:
            await self._record_error(server_id, error)

        # Re-raise the error
        raise
|
|
115
|
+
|
|
116
|
+
async def quarantine_server(self, server_id: str, duration: int) -> None:
    """
    Put a server into quarantine by hand.

    Creates the server's statistics record if it does not exist yet and
    counts this as one more quarantine.

    Args:
        server_id: ID of the server to quarantine
        duration: Quarantine duration in seconds
    """
    async with self._lock:
        record = self._get_or_create_stats(server_id)
        release_at = datetime.now() + timedelta(seconds=duration)
        record.quarantine_until = release_at
        record.quarantine_count += 1

        logger.warning(
            f"Server {server_id} quarantined for {duration}s "
            f"(count: {record.quarantine_count})"
        )
|
|
133
|
+
|
|
134
|
+
def is_quarantined(self, server_id: str) -> bool:
|
|
135
|
+
"""
|
|
136
|
+
Check if a server is currently quarantined.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
server_id: ID of the server to check
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
True if the server is quarantined, False otherwise
|
|
143
|
+
"""
|
|
144
|
+
if server_id not in self.server_stats:
|
|
145
|
+
return False
|
|
146
|
+
|
|
147
|
+
stats = self.server_stats[server_id]
|
|
148
|
+
if stats.quarantine_until is None:
|
|
149
|
+
return False
|
|
150
|
+
|
|
151
|
+
# Check if quarantine has expired
|
|
152
|
+
if datetime.now() >= stats.quarantine_until:
|
|
153
|
+
stats.quarantine_until = None
|
|
154
|
+
return False
|
|
155
|
+
|
|
156
|
+
return True
|
|
157
|
+
|
|
158
|
+
async def release_quarantine(self, server_id: str) -> None:
    """
    Lift a server's quarantine by hand.

    Servers without a statistics record are left alone.

    Args:
        server_id: ID of the server to release
    """
    async with self._lock:
        if server_id not in self.server_stats:
            return

        self.server_stats[server_id].quarantine_until = None
        # NOTE(review): indentation is lost in the diff view this was read
        # from; the log call is assumed to sit inside the known-server
        # branch, directly after the reset — confirm against the file.
        logger.info(f"Server {server_id} released from quarantine")
|
|
169
|
+
|
|
170
|
+
def get_error_stats(self, server_id: str) -> ErrorStats:
    """
    Look up the error statistics of a server.

    Args:
        server_id: ID of the server

    Returns:
        The stored ErrorStats object for a known server; for an unknown
        server, a fresh all-defaults ErrorStats that is not stored
    """
    if server_id in self.server_stats:
        return self.server_stats[server_id]

    # Unknown server: hand back an empty record without registering it.
    return ErrorStats()
|
|
184
|
+
|
|
185
|
+
def should_quarantine(self, server_id: str) -> bool:
|
|
186
|
+
"""
|
|
187
|
+
Check if a server should be quarantined based on error count.
|
|
188
|
+
|
|
189
|
+
Args:
|
|
190
|
+
server_id: ID of the server to check
|
|
191
|
+
|
|
192
|
+
Returns:
|
|
193
|
+
True if the server should be quarantined
|
|
194
|
+
"""
|
|
195
|
+
if server_id not in self.server_stats:
|
|
196
|
+
return False
|
|
197
|
+
|
|
198
|
+
stats = self.server_stats[server_id]
|
|
199
|
+
return stats.consecutive_errors >= self.quarantine_threshold
|
|
200
|
+
|
|
201
|
+
def _get_or_create_stats(self, server_id: str) -> ErrorStats:
    """Return the server's stats record, registering an empty one on first use."""
    registry = self.server_stats
    if server_id in registry:
        return registry[server_id]

    created = registry[server_id] = ErrorStats()
    return created
|
|
206
|
+
|
|
207
|
+
async def _record_success(self, server_id: str) -> None:
    """Note a successful call: the consecutive-error streak goes back to zero."""
    # Only the streak is reset; the other counters are left as they are.
    self._get_or_create_stats(server_id).consecutive_errors = 0

    logger.debug(f"Success recorded for server {server_id}, consecutive errors reset")
|
|
213
|
+
|
|
214
|
+
async def _record_error(self, server_id: str, error: Exception) -> None:
    """
    Book a failed call and quarantine the server once the threshold is hit.

    Args:
        server_id: ID of the server the error belongs to
        error: The exception raised by the call
    """
    record = self._get_or_create_stats(server_id)

    # Counters and timestamp first, so the log line below shows the new streak.
    record.total_errors += 1
    record.consecutive_errors += 1
    record.last_error = datetime.now()

    # Tally the error under its category's string value.
    category_name = self._categorize_error(error).value
    record.error_types[category_name] = record.error_types.get(category_name, 0) + 1

    logger.warning(
        f"Error recorded for server {server_id}: {category_name} - {str(error)} "
        f"(consecutive: {record.consecutive_errors})"
    )

    if not self.should_quarantine(server_id):
        return

    # The backoff is computed from the count BEFORE this quarantine is added.
    backoff_seconds = self._calculate_quarantine_duration(record.quarantine_count)
    record.quarantine_until = datetime.now() + timedelta(seconds=backoff_seconds)
    record.quarantine_count += 1

    logger.error(
        f"Server {server_id} quarantined for {backoff_seconds}s "
        f"after {record.consecutive_errors} consecutive errors "
        f"(quarantine count: {record.quarantine_count})"
    )
|
|
244
|
+
|
|
245
|
+
def _categorize_error(self, error: Exception) -> ErrorCategory:
    """
    Classify an exception by keyword matching.

    The lower-cased exception class name and the lower-cased message are
    searched for substrings, category by category; the first category
    with any hit wins.

    Args:
        error: The exception to categorize

    Returns:
        ErrorCategory enum value, UNKNOWN when nothing matches
    """
    type_name = type(error).__name__.lower()
    message = str(error).lower()

    # (category, substrings looked for in the class name, substrings
    # looked for in the message). Order matters: e.g. a message containing
    # "timeout" is classified NETWORK before the SERVER row is consulted.
    rules = (
        (
            ErrorCategory.NETWORK,
            ('connection', 'timeout', 'network', 'socket', 'dns', 'ssl'),
            ('connection', 'timeout', 'network', 'unreachable', 'refused'),
        ),
        (
            ErrorCategory.PROTOCOL,
            ('json', 'decode', 'parse', 'schema', 'validation', 'protocol'),
            ('json', 'decode', 'parse', 'invalid', 'malformed', 'schema'),
        ),
        (
            ErrorCategory.AUTHENTICATION,
            ('auth', 'permission', 'unauthorized', 'forbidden'),
            ('401', '403', 'unauthorized', 'forbidden', 'authentication', 'permission'),
        ),
        (
            ErrorCategory.RATE_LIMIT,
            ('rate', 'limit', 'throttle'),
            ('429', 'rate limit', 'too many requests', 'throttle'),
        ),
        (
            ErrorCategory.SERVER,
            ('server', 'internal'),
            (
                '500', '501', '502', '503', '504', '505', 'internal server error',
                'bad gateway', 'service unavailable', 'gateway timeout',
            ),
        ),
    )

    for category, type_keywords, message_keywords in rules:
        in_type = any(keyword in type_name for keyword in type_keywords)
        if in_type or any(keyword in message for keyword in message_keywords):
            return category

    # Default to unknown
    return ErrorCategory.UNKNOWN
|
|
312
|
+
|
|
313
|
+
def _calculate_quarantine_duration(self, quarantine_count: int) -> int:
    """
    Work out how long the next quarantine lasts.

    The duration starts at 30 seconds and doubles with each previous
    quarantine, capped at the configured maximum.

    Args:
        quarantine_count: Number of times this server has been quarantined

    Returns:
        Quarantine duration in seconds
    """
    # 30s, 60s, 120s, 240s, ... for counts 0, 1, 2, 3, ...
    uncapped = 30 * (2 ** quarantine_count)

    # The maximum is stored as a timedelta; compare in whole seconds.
    ceiling = int(self.max_quarantine_duration.total_seconds())
    capped = min(uncapped, ceiling)

    logger.debug(
        f"Calculated quarantine duration: {capped}s "
        f"(count: {quarantine_count}, max: {ceiling}s)"
    )

    return capped
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class QuarantinedServerError(Exception):
    """Signals that a call was refused because the target server is quarantined."""
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# Global isolator instance
|
|
347
|
+
_isolator_instance: Optional[MCPErrorIsolator] = None
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def get_error_isolator() -> MCPErrorIsolator:
    """
    Return the module-wide MCPErrorIsolator, creating it on first use.

    Returns:
        MCPErrorIsolator instance (the same object on every call)
    """
    global _isolator_instance
    if _isolator_instance is not None:
        return _isolator_instance

    # First call: build the isolator with its default settings and keep it.
    _isolator_instance = MCPErrorIsolator()
    return _isolator_instance
|