code-puppy 0.0.135__py3-none-any.whl → 0.0.136__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_puppy/agent.py +15 -17
- code_puppy/agents/agent_manager.py +320 -9
- code_puppy/agents/base_agent.py +58 -2
- code_puppy/agents/runtime_manager.py +68 -42
- code_puppy/command_line/command_handler.py +82 -33
- code_puppy/command_line/mcp/__init__.py +10 -0
- code_puppy/command_line/mcp/add_command.py +183 -0
- code_puppy/command_line/mcp/base.py +35 -0
- code_puppy/command_line/mcp/handler.py +133 -0
- code_puppy/command_line/mcp/help_command.py +146 -0
- code_puppy/command_line/mcp/install_command.py +176 -0
- code_puppy/command_line/mcp/list_command.py +94 -0
- code_puppy/command_line/mcp/logs_command.py +126 -0
- code_puppy/command_line/mcp/remove_command.py +82 -0
- code_puppy/command_line/mcp/restart_command.py +92 -0
- code_puppy/command_line/mcp/search_command.py +117 -0
- code_puppy/command_line/mcp/start_all_command.py +126 -0
- code_puppy/command_line/mcp/start_command.py +98 -0
- code_puppy/command_line/mcp/status_command.py +185 -0
- code_puppy/command_line/mcp/stop_all_command.py +109 -0
- code_puppy/command_line/mcp/stop_command.py +79 -0
- code_puppy/command_line/mcp/test_command.py +107 -0
- code_puppy/command_line/mcp/utils.py +129 -0
- code_puppy/command_line/mcp/wizard_utils.py +259 -0
- code_puppy/command_line/model_picker_completion.py +21 -4
- code_puppy/command_line/prompt_toolkit_completion.py +9 -0
- code_puppy/main.py +23 -17
- code_puppy/mcp/__init__.py +42 -16
- code_puppy/mcp/async_lifecycle.py +51 -49
- code_puppy/mcp/blocking_startup.py +125 -113
- code_puppy/mcp/captured_stdio_server.py +63 -70
- code_puppy/mcp/circuit_breaker.py +63 -47
- code_puppy/mcp/config_wizard.py +169 -136
- code_puppy/mcp/dashboard.py +79 -71
- code_puppy/mcp/error_isolation.py +147 -100
- code_puppy/mcp/examples/retry_example.py +55 -42
- code_puppy/mcp/health_monitor.py +152 -141
- code_puppy/mcp/managed_server.py +100 -93
- code_puppy/mcp/manager.py +168 -156
- code_puppy/mcp/registry.py +148 -110
- code_puppy/mcp/retry_manager.py +63 -61
- code_puppy/mcp/server_registry_catalog.py +271 -225
- code_puppy/mcp/status_tracker.py +80 -80
- code_puppy/mcp/system_tools.py +47 -52
- code_puppy/messaging/message_queue.py +20 -13
- code_puppy/messaging/renderers.py +30 -15
- code_puppy/state_management.py +103 -0
- code_puppy/tui/app.py +64 -7
- code_puppy/tui/components/chat_view.py +3 -3
- code_puppy/tui/components/human_input_modal.py +12 -8
- code_puppy/tui/screens/__init__.py +2 -2
- code_puppy/tui/screens/mcp_install_wizard.py +208 -179
- code_puppy/tui/tests/test_agent_command.py +3 -3
- {code_puppy-0.0.135.dist-info → code_puppy-0.0.136.dist-info}/METADATA +1 -1
- {code_puppy-0.0.135.dist-info → code_puppy-0.0.136.dist-info}/RECORD +59 -41
- code_puppy/command_line/mcp_commands.py +0 -1789
- {code_puppy-0.0.135.data → code_puppy-0.0.136.data}/data/code_puppy/models.json +0 -0
- {code_puppy-0.0.135.dist-info → code_puppy-0.0.136.dist-info}/WHEEL +0 -0
- {code_puppy-0.0.135.dist-info → code_puppy-0.0.136.dist-info}/entry_points.txt +0 -0
- {code_puppy-0.0.135.dist-info → code_puppy-0.0.136.dist-info}/licenses/LICENSE +0 -0
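
The largest structural change in this release is the removal of the 1,789-line code_puppy/command_line/mcp_commands.py in favor of the new code_puppy/command_line/mcp/ package, which gives each /mcp subcommand (add, list, start, stop, status, logs, and so on) its own module alongside base.py and handler.py. The sketch below only illustrates the general shape of that kind of split — a small command base class plus a dispatcher that routes subcommands to per-command objects. All names in it (Command, ListCommand, Handler, handle) are illustrative assumptions, not code-puppy's actual classes.

```python
# Illustrative sketch of a per-command layout (hypothetical names, not
# code-puppy's actual API): one class per subcommand plus a tiny dispatcher.
from abc import ABC, abstractmethod
from typing import Dict, List


class Command(ABC):
    """One subcommand such as 'list' or 'start' (assumed interface)."""

    name: str = ""

    @abstractmethod
    def execute(self, args: List[str]) -> str:
        ...


class ListCommand(Command):
    name = "list"

    def execute(self, args: List[str]) -> str:
        return "no servers registered"  # placeholder output


class Handler:
    """Routes '/mcp <subcommand> ...' lines to the matching command object."""

    def __init__(self, commands: List[Command]) -> None:
        self._commands: Dict[str, Command] = {c.name: c for c in commands}

    def handle(self, line: str) -> str:
        parts = line.split()
        if len(parts) < 2 or parts[1] not in self._commands:
            return "usage: /mcp <" + "|".join(sorted(self._commands)) + "> ..."
        return self._commands[parts[1]].execute(parts[2:])


if __name__ == "__main__":
    handler = Handler([ListCommand()])
    print(handler.handle("/mcp list"))
```

Splitting a monolithic command file this way keeps each subcommand independently testable and keeps the dispatcher itself trivial, which matches the one-module-per-command layout visible in the file list above.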
code_puppy/mcp/health_monitor.py
CHANGED
@@ -11,12 +11,11 @@ import time
 from collections import defaultdict, deque
 from dataclasses import dataclass
 from datetime import datetime
-from typing import
-import httpx
-import json
+from typing import Callable, Dict, List, Optional
 
-
+import httpx
 
+from .managed_server import ManagedMCPServer
 
 logger = logging.getLogger(__name__)
 
@@ -24,6 +23,7 @@ logger = logging.getLogger(__name__)
 @dataclass
 class HealthStatus:
     """Status of a health check for an MCP server."""
+
     timestamp: datetime
     is_healthy: bool
     latency_ms: Optional[float]
@@ -34,6 +34,7 @@ class HealthCheckResult:
 @dataclass
 class HealthCheckResult:
     """Result of performing a health check."""
+
     success: bool
     latency_ms: float
     error: Optional[str]
@@ -42,7 +43,7 @@ class HealthCheckResult:
 class HealthMonitor:
     """
     Continuous health monitoring system for MCP servers.
-
+
     Features:
     - Background monitoring tasks using asyncio
     - Server type-specific health checks
@@ -50,22 +51,22 @@ class HealthMonitor:
     - Custom health check registration
     - Automatic recovery triggering on consecutive failures
     - Configurable check intervals
-
+
     Example usage:
         monitor = HealthMonitor(check_interval=30)
        await monitor.start_monitoring("server-1", managed_server)
-
+
         # Check current health
         is_healthy = monitor.is_healthy("server-1")
-
+
         # Get health history
         history = monitor.get_health_history("server-1", limit=50)
     """
-
+
     def __init__(self, check_interval: int = 30):
         """
         Initialize the health monitor.
-
+
         Args:
             check_interval: Interval between health checks in seconds
         """
@@ -75,22 +76,22 @@ class HealthMonitor:
         self.custom_health_checks: Dict[str, Callable] = {}
         self.consecutive_failures: Dict[str, int] = defaultdict(int)
         self.last_check_time: Dict[str, datetime] = {}
-
+
         # Register default health checks for each server type
         self._register_default_health_checks()
-
+
         logger.info(f"Health monitor initialized with {check_interval}s check interval")
-
+
     def _register_default_health_checks(self) -> None:
         """Register default health check methods for each server type."""
         self.register_health_check("sse", self._check_sse_health)
         self.register_health_check("http", self._check_http_health)
         self.register_health_check("stdio", self._check_stdio_health)
-
+
     async def start_monitoring(self, server_id: str, server: ManagedMCPServer) -> None:
         """
         Start continuous health monitoring for a server.
-
+
         Args:
             server_id: Unique identifier for the server
             server: The managed MCP server instance to monitor
@@ -98,16 +99,15 @@ class HealthMonitor:
         if server_id in self.monitoring_tasks:
             logger.warning(f"Server {server_id} is already being monitored")
             return
-
+
         logger.info(f"Starting health monitoring for server {server_id}")
-
+
         # Create background monitoring task
         task = asyncio.create_task(
-            self._monitoring_loop(server_id, server),
-            name=f"health_monitor_{server_id}"
+            self._monitoring_loop(server_id, server), name=f"health_monitor_{server_id}"
         )
         self.monitoring_tasks[server_id] = task
-
+
         # Perform initial health check
         try:
             health_status = await self.check_health(server)
@@ -119,14 +119,14 @@ class HealthMonitor:
                 is_healthy=False,
                 latency_ms=None,
                 error=str(e),
-                check_type="initial"
+                check_type="initial",
             )
             self._record_health_status(server_id, error_status)
-
+
     async def stop_monitoring(self, server_id: str) -> None:
         """
         Stop health monitoring for a server.
-
+
         Args:
             server_id: Unique identifier for the server
         """
@@ -138,36 +138,38 @@ class HealthMonitor:
                 await task
             except asyncio.CancelledError:
                 pass
-
+
             # Clean up tracking data
             self.consecutive_failures.pop(server_id, None)
             self.last_check_time.pop(server_id, None)
         else:
             logger.warning(f"No monitoring task found for server {server_id}")
-
+
     async def check_health(self, server: ManagedMCPServer) -> HealthStatus:
         """
         Perform a health check for a server.
-
+
         Args:
             server: The managed MCP server to check
-
+
         Returns:
             HealthStatus object with check results
         """
         server_type = server.config.type.lower()
         check_func = self.custom_health_checks.get(server_type)
-
+
         if not check_func:
-            logger.warning(
+            logger.warning(
+                f"No health check function registered for server type: {server_type}"
+            )
             return HealthStatus(
                 timestamp=datetime.now(),
                 is_healthy=False,
                 latency_ms=None,
                 error=f"No health check registered for type '{server_type}'",
-                check_type="unknown"
+                check_type="unknown",
             )
-
+
         try:
             result = await self.perform_health_check(server)
             return HealthStatus(
@@ -175,7 +177,7 @@ class HealthMonitor:
                 is_healthy=result.success,
                 latency_ms=result.latency_ms,
                 error=result.error,
-                check_type=server_type
+                check_type=server_type,
             )
         except Exception as e:
             logger.error(f"Health check failed for server {server.config.id}: {e}")
@@ -184,39 +186,39 @@ class HealthMonitor:
                 is_healthy=False,
                 latency_ms=None,
                 error=str(e),
-                check_type=server_type
+                check_type=server_type,
             )
-
+
     async def perform_health_check(self, server: ManagedMCPServer) -> HealthCheckResult:
         """
         Perform the actual health check based on server type.
-
+
         Args:
             server: The managed MCP server to check
-
+
         Returns:
             HealthCheckResult with timing and success information
         """
         server_type = server.config.type.lower()
         check_func = self.custom_health_checks.get(server_type)
-
+
         if not check_func:
             return HealthCheckResult(
                 success=False,
                 latency_ms=0.0,
-                error=f"No health check function for type '{server_type}'"
+                error=f"No health check function for type '{server_type}'",
             )
-
+
         start_time = time.time()
         try:
             result = await check_func(server)
             latency_ms = (time.time() - start_time) * 1000
-
+
             if isinstance(result, bool):
                 return HealthCheckResult(
                     success=result,
                     latency_ms=latency_ms,
-                    error=None if result else "Health check returned False"
+                    error=None if result else "Health check returned False",
                 )
             elif isinstance(result, HealthCheckResult):
                 # Update latency if not already set
@@ -227,21 +229,17 @@ class HealthMonitor:
                 return HealthCheckResult(
                     success=False,
                     latency_ms=latency_ms,
-                    error=f"Invalid health check result type: {type(result)}"
+                    error=f"Invalid health check result type: {type(result)}",
                 )
-
+
         except Exception as e:
             latency_ms = (time.time() - start_time) * 1000
-            return HealthCheckResult(
-
-                latency_ms=latency_ms,
-                error=str(e)
-            )
-
+            return HealthCheckResult(success=False, latency_ms=latency_ms, error=str(e))
+
     def register_health_check(self, server_type: str, check_func: Callable) -> None:
         """
         Register a custom health check function for a server type.
-
+
         Args:
             server_type: The server type ("sse", "http", "stdio")
             check_func: Async function that takes a ManagedMCPServer and returns
@@ -249,15 +247,17 @@ class HealthMonitor:
         """
         self.custom_health_checks[server_type.lower()] = check_func
         logger.info(f"Registered health check for server type: {server_type}")
-
-    def get_health_history(
+
+    def get_health_history(
+        self, server_id: str, limit: int = 100
+    ) -> List[HealthStatus]:
         """
         Get health check history for a server.
-
+
         Args:
             server_id: Unique identifier for the server
             limit: Maximum number of history entries to return
-
+
         Returns:
             List of HealthStatus objects, most recent first
         """
@@ -267,48 +267,48 @@ class HealthMonitor:
         # Reverse to get most recent first
         result.reverse()
         return result
-
+
     def is_healthy(self, server_id: str) -> bool:
         """
         Check if a server is currently healthy based on latest status.
-
+
         Args:
             server_id: Unique identifier for the server
-
+
         Returns:
             True if server is healthy, False otherwise
         """
         history = self.health_history.get(server_id)
         if not history:
             return False
-
+
         # Get most recent health status
         latest_status = history[-1]
         return latest_status.is_healthy
-
+
     async def _monitoring_loop(self, server_id: str, server: ManagedMCPServer) -> None:
         """
         Main monitoring loop that runs in the background.
-
+
         Args:
             server_id: Unique identifier for the server
             server: The managed MCP server to monitor
         """
         logger.info(f"Starting monitoring loop for server {server_id}")
-
+
         while True:
             try:
                 # Wait for check interval
                 await asyncio.sleep(self.check_interval)
-
+
                 # Skip if server is not enabled
                 if not server.is_enabled():
                     continue
-
+
                 # Perform health check
                 health_status = await self.check_health(server)
                 self._record_health_status(server_id, health_status)
-
+
                 # Handle consecutive failures
                 if not health_status.is_healthy:
                     self.consecutive_failures[server_id] += 1
@@ -316,17 +316,19 @@ class HealthMonitor:
                         f"Health check failed for {server_id}: {health_status.error} "
                         f"(consecutive failures: {self.consecutive_failures[server_id]})"
                     )
-
+
                     # Trigger recovery on consecutive failures
                     await self._handle_consecutive_failures(server_id, server)
                 else:
                     # Reset consecutive failure count on success
                     if self.consecutive_failures[server_id] > 0:
-                        logger.info(
+                        logger.info(
+                            f"Server {server_id} recovered after health check success"
+                        )
                         self.consecutive_failures[server_id] = 0
-
+
                 self.last_check_time[server_id] = datetime.now()
-
+
             except asyncio.CancelledError:
                 logger.info(f"Monitoring loop cancelled for server {server_id}")
                 break
@@ -334,216 +336,225 @@ class HealthMonitor:
                 logger.error(f"Error in monitoring loop for {server_id}: {e}")
                 # Continue monitoring despite errors
                 await asyncio.sleep(5)  # Brief delay before retrying
-
+
     def _record_health_status(self, server_id: str, status: HealthStatus) -> None:
         """
         Record a health status in the history.
-
+
         Args:
             server_id: Unique identifier for the server
             status: The health status to record
         """
         self.health_history[server_id].append(status)
-
+
         # Log health status changes
         if status.is_healthy:
-            logger.debug(
+            logger.debug(
+                f"Server {server_id} health check passed ({status.latency_ms:.1f}ms)"
+            )
         else:
             logger.warning(f"Server {server_id} health check failed: {status.error}")
-
-    async def _handle_consecutive_failures(
+
+    async def _handle_consecutive_failures(
+        self, server_id: str, server: ManagedMCPServer
+    ) -> None:
         """
         Handle consecutive health check failures.
-
+
         Args:
             server_id: Unique identifier for the server
             server: The managed MCP server
         """
         failure_count = self.consecutive_failures[server_id]
-
+
         # Trigger recovery actions based on failure count
         if failure_count >= 3:
-            logger.error(
-
+            logger.error(
+                f"Server {server_id} has {failure_count} consecutive failures, triggering recovery"
+            )
+
             try:
                 # Attempt to recover the server
                 await self._trigger_recovery(server_id, server, failure_count)
             except Exception as e:
                 logger.error(f"Recovery failed for server {server_id}: {e}")
-
+
             # Quarantine server after many consecutive failures
             if failure_count >= 5:
-                logger.critical(
+                logger.critical(
+                    f"Quarantining server {server_id} after {failure_count} consecutive failures"
+                )
                 try:
                     # Calculate quarantine duration with exponential backoff
-                    quarantine_duration = min(
+                    quarantine_duration = min(
+                        30 * (2 ** (failure_count - 5)), 1800
+                    )  # Max 30 minutes
                     server.quarantine(quarantine_duration)
                 except Exception as e:
                     logger.error(f"Failed to quarantine server {server_id}: {e}")
-
-    async def _trigger_recovery(
+
+    async def _trigger_recovery(
+        self, server_id: str, server: ManagedMCPServer, failure_count: int
+    ) -> None:
         """
         Trigger recovery actions for a failing server.
-
+
         Args:
             server_id: Unique identifier for the server
             server: The managed MCP server
             failure_count: Number of consecutive failures
         """
-        logger.info(
-
+        logger.info(
+            f"Triggering recovery for server {server_id} (failure count: {failure_count})"
+        )
+
         try:
             # For now, just disable and re-enable the server
             # In the future, this could include more sophisticated recovery actions
             server.disable()
             await asyncio.sleep(1)  # Brief delay
             server.enable()
-
+
             logger.info(f"Recovery attempt completed for server {server_id}")
-
+
         except Exception as e:
             logger.error(f"Recovery action failed for server {server_id}: {e}")
             raise
-
+
     async def _check_sse_health(self, server: ManagedMCPServer) -> HealthCheckResult:
         """
         Health check for SSE servers using GET request.
-
+
         Args:
             server: The managed MCP server to check
-
+
         Returns:
             HealthCheckResult with check results
         """
         try:
             config = server.config.config
-            url = config.get(
+            url = config.get("url")
             if not url:
                 return HealthCheckResult(
                     success=False,
                     latency_ms=0.0,
-                    error="No URL configured for SSE server"
+                    error="No URL configured for SSE server",
                 )
-
+
             # Add health endpoint if available, otherwise use base URL
-            health_url =
-
+            health_url = (
+                f"{url.rstrip('/')}/health" if not url.endswith("/health") else url
+            )
+
             async with httpx.AsyncClient(timeout=10.0) as client:
                 response = await client.get(health_url)
-
+
                 if response.status_code == 404:
                     # Try base URL if health endpoint doesn't exist
                     response = await client.get(url)
-
+
                 success = 200 <= response.status_code < 400
-                error =
-
+                error = (
+                    None
+                    if success
+                    else f"HTTP {response.status_code}: {response.reason_phrase}"
+                )
+
                 return HealthCheckResult(
                     success=success,
                     latency_ms=0.0,  # Will be filled by perform_health_check
-                    error=error
+                    error=error,
                 )
-
+
         except Exception as e:
-            return HealthCheckResult(
-
-                latency_ms=0.0,
-                error=str(e)
-            )
-
+            return HealthCheckResult(success=False, latency_ms=0.0, error=str(e))
+
     async def _check_http_health(self, server: ManagedMCPServer) -> HealthCheckResult:
         """
         Health check for HTTP servers using GET request.
-
+
         Args:
             server: The managed MCP server to check
-
+
         Returns:
             HealthCheckResult with check results
         """
         # HTTP servers use the same check as SSE servers
         return await self._check_sse_health(server)
-
+
     async def _check_stdio_health(self, server: ManagedMCPServer) -> HealthCheckResult:
         """
         Health check for stdio servers using ping command.
-
+
         Args:
             server: The managed MCP server to check
-
+
         Returns:
             HealthCheckResult with check results
         """
         try:
             # Get the pydantic server instance
-
-
+            server.get_pydantic_server()
+
             # Try to get available tools as a health check
             # This requires the server to be responsive
             try:
                 # Attempt to list tools - this is a good health check for MCP servers
                 # Note: This is a simplified check. In a real implementation,
                 # we'd need to send an actual MCP message
-
+
                 # For now, we'll check if we can create the server instance
                 # and if it appears to be configured correctly
                 config = server.config.config
-                command = config.get(
-
+                command = config.get("command")
+
                 if not command:
                     return HealthCheckResult(
                         success=False,
                         latency_ms=0.0,
-                        error="No command configured for stdio server"
+                        error="No command configured for stdio server",
                     )
-
+
                 # Basic validation that command exists
                 import shutil
+
                 if not shutil.which(command):
                     return HealthCheckResult(
                         success=False,
                         latency_ms=0.0,
-                        error=f"Command '{command}' not found in PATH"
+                        error=f"Command '{command}' not found in PATH",
                     )
-
+
                 # If we get here, basic checks passed
-                return HealthCheckResult(
-
-                    latency_ms=0.0,
-                    error=None
-                )
-
+                return HealthCheckResult(success=True, latency_ms=0.0, error=None)
+
             except Exception as e:
                 return HealthCheckResult(
                     success=False,
                     latency_ms=0.0,
-                    error=f"Server communication failed: {str(e)}"
+                    error=f"Server communication failed: {str(e)}",
                 )
-
+
         except Exception as e:
-            return HealthCheckResult(
-
-                latency_ms=0.0,
-                error=str(e)
-            )
-
+            return HealthCheckResult(success=False, latency_ms=0.0, error=str(e))
+
     async def shutdown(self) -> None:
         """
         Shutdown all monitoring tasks gracefully.
         """
         logger.info("Shutting down health monitor")
-
+
         # Cancel all monitoring tasks
         tasks = list(self.monitoring_tasks.values())
         for task in tasks:
             task.cancel()
-
+
         # Wait for all tasks to complete
         if tasks:
             await asyncio.gather(*tasks, return_exceptions=True)
-
+
         self.monitoring_tasks.clear()
         self.consecutive_failures.clear()
         self.last_check_time.clear()
-
-        logger.info("Health monitor shutdown complete")
+
+        logger.info("Health monitor shutdown complete")