code-puppy 0.0.126__py3-none-any.whl → 0.0.128__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- code_puppy/__init__.py +1 -0
- code_puppy/agent.py +65 -69
- code_puppy/agents/agent_code_puppy.py +0 -3
- code_puppy/agents/runtime_manager.py +212 -0
- code_puppy/command_line/command_handler.py +56 -25
- code_puppy/command_line/mcp_commands.py +1298 -0
- code_puppy/command_line/meta_command_handler.py +3 -2
- code_puppy/command_line/model_picker_completion.py +21 -8
- code_puppy/main.py +52 -157
- code_puppy/mcp/__init__.py +23 -0
- code_puppy/mcp/async_lifecycle.py +237 -0
- code_puppy/mcp/circuit_breaker.py +218 -0
- code_puppy/mcp/config_wizard.py +437 -0
- code_puppy/mcp/dashboard.py +291 -0
- code_puppy/mcp/error_isolation.py +360 -0
- code_puppy/mcp/examples/retry_example.py +208 -0
- code_puppy/mcp/health_monitor.py +549 -0
- code_puppy/mcp/managed_server.py +346 -0
- code_puppy/mcp/manager.py +701 -0
- code_puppy/mcp/registry.py +412 -0
- code_puppy/mcp/retry_manager.py +321 -0
- code_puppy/mcp/server_registry_catalog.py +751 -0
- code_puppy/mcp/status_tracker.py +355 -0
- code_puppy/messaging/spinner/textual_spinner.py +6 -2
- code_puppy/model_factory.py +19 -4
- code_puppy/models.json +22 -4
- code_puppy/tui/app.py +19 -27
- code_puppy/tui/tests/test_agent_command.py +22 -15
- {code_puppy-0.0.126.data → code_puppy-0.0.128.data}/data/code_puppy/models.json +22 -4
- {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/METADATA +2 -3
- {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/RECORD +34 -18
- {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/WHEEL +0 -0
- {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/entry_points.txt +0 -0
- {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/licenses/LICENSE +0 -0
code_puppy/mcp/health_monitor.py (new file)
@@ -0,0 +1,549 @@
+"""
+Health monitoring system for MCP servers.
+
+This module provides continuous health monitoring for MCP servers with
+automatic recovery actions when consecutive failures are detected.
+"""
+
+import asyncio
+import logging
+import time
+from collections import defaultdict, deque
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, Callable, Dict, List, Optional
+import httpx
+import json
+
+from .managed_server import ManagedMCPServer, ServerState
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class HealthStatus:
+    """Status of a health check for an MCP server."""
+    timestamp: datetime
+    is_healthy: bool
+    latency_ms: Optional[float]
+    error: Optional[str]
+    check_type: str  # "ping", "list_tools", "get_request", etc.
+
+
+@dataclass
+class HealthCheckResult:
+    """Result of performing a health check."""
+    success: bool
+    latency_ms: float
+    error: Optional[str]
+
+
+class HealthMonitor:
+    """
+    Continuous health monitoring system for MCP servers.
+
+    Features:
+    - Background monitoring tasks using asyncio
+    - Server type-specific health checks
+    - Health history tracking with configurable limit
+    - Custom health check registration
+    - Automatic recovery triggering on consecutive failures
+    - Configurable check intervals
+
+    Example usage:
+        monitor = HealthMonitor(check_interval=30)
+        await monitor.start_monitoring("server-1", managed_server)
+
+        # Check current health
+        is_healthy = monitor.is_healthy("server-1")
+
+        # Get health history
+        history = monitor.get_health_history("server-1", limit=50)
+    """
+
+    def __init__(self, check_interval: int = 30):
+        """
+        Initialize the health monitor.
+
+        Args:
+            check_interval: Interval between health checks in seconds
+        """
+        self.check_interval = check_interval
+        self.monitoring_tasks: Dict[str, asyncio.Task] = {}
+        self.health_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
+        self.custom_health_checks: Dict[str, Callable] = {}
+        self.consecutive_failures: Dict[str, int] = defaultdict(int)
+        self.last_check_time: Dict[str, datetime] = {}
+
+        # Register default health checks for each server type
+        self._register_default_health_checks()
+
+        logger.info(f"Health monitor initialized with {check_interval}s check interval")
+
+    def _register_default_health_checks(self) -> None:
+        """Register default health check methods for each server type."""
+        self.register_health_check("sse", self._check_sse_health)
+        self.register_health_check("http", self._check_http_health)
+        self.register_health_check("stdio", self._check_stdio_health)
+
+    async def start_monitoring(self, server_id: str, server: ManagedMCPServer) -> None:
+        """
+        Start continuous health monitoring for a server.
+
+        Args:
+            server_id: Unique identifier for the server
+            server: The managed MCP server instance to monitor
+        """
+        if server_id in self.monitoring_tasks:
+            logger.warning(f"Server {server_id} is already being monitored")
+            return
+
+        logger.info(f"Starting health monitoring for server {server_id}")
+
+        # Create background monitoring task
+        task = asyncio.create_task(
+            self._monitoring_loop(server_id, server),
+            name=f"health_monitor_{server_id}"
+        )
+        self.monitoring_tasks[server_id] = task
+
+        # Perform initial health check
+        try:
+            health_status = await self.check_health(server)
+            self._record_health_status(server_id, health_status)
+        except Exception as e:
+            logger.error(f"Initial health check failed for {server_id}: {e}")
+            error_status = HealthStatus(
+                timestamp=datetime.now(),
+                is_healthy=False,
+                latency_ms=None,
+                error=str(e),
+                check_type="initial"
+            )
+            self._record_health_status(server_id, error_status)
+
+    async def stop_monitoring(self, server_id: str) -> None:
+        """
+        Stop health monitoring for a server.
+
+        Args:
+            server_id: Unique identifier for the server
+        """
+        task = self.monitoring_tasks.pop(server_id, None)
+        if task:
+            logger.info(f"Stopping health monitoring for server {server_id}")
+            task.cancel()
+            try:
+                await task
+            except asyncio.CancelledError:
+                pass
+
+            # Clean up tracking data
+            self.consecutive_failures.pop(server_id, None)
+            self.last_check_time.pop(server_id, None)
+        else:
+            logger.warning(f"No monitoring task found for server {server_id}")
+
+    async def check_health(self, server: ManagedMCPServer) -> HealthStatus:
+        """
+        Perform a health check for a server.
+
+        Args:
+            server: The managed MCP server to check
+
+        Returns:
+            HealthStatus object with check results
+        """
+        server_type = server.config.type.lower()
+        check_func = self.custom_health_checks.get(server_type)
+
+        if not check_func:
+            logger.warning(f"No health check function registered for server type: {server_type}")
+            return HealthStatus(
+                timestamp=datetime.now(),
+                is_healthy=False,
+                latency_ms=None,
+                error=f"No health check registered for type '{server_type}'",
+                check_type="unknown"
+            )
+
+        try:
+            result = await self.perform_health_check(server)
+            return HealthStatus(
+                timestamp=datetime.now(),
+                is_healthy=result.success,
+                latency_ms=result.latency_ms,
+                error=result.error,
+                check_type=server_type
+            )
+        except Exception as e:
+            logger.error(f"Health check failed for server {server.config.id}: {e}")
+            return HealthStatus(
+                timestamp=datetime.now(),
+                is_healthy=False,
+                latency_ms=None,
+                error=str(e),
+                check_type=server_type
+            )
+
+    async def perform_health_check(self, server: ManagedMCPServer) -> HealthCheckResult:
+        """
+        Perform the actual health check based on server type.
+
+        Args:
+            server: The managed MCP server to check
+
+        Returns:
+            HealthCheckResult with timing and success information
+        """
+        server_type = server.config.type.lower()
+        check_func = self.custom_health_checks.get(server_type)
+
+        if not check_func:
+            return HealthCheckResult(
+                success=False,
+                latency_ms=0.0,
+                error=f"No health check function for type '{server_type}'"
+            )
+
+        start_time = time.time()
+        try:
+            result = await check_func(server)
+            latency_ms = (time.time() - start_time) * 1000
+
+            if isinstance(result, bool):
+                return HealthCheckResult(
+                    success=result,
+                    latency_ms=latency_ms,
+                    error=None if result else "Health check returned False"
+                )
+            elif isinstance(result, HealthCheckResult):
+                # Update latency if not already set
+                if result.latency_ms == 0.0:
+                    result.latency_ms = latency_ms
+                return result
+            else:
+                return HealthCheckResult(
+                    success=False,
+                    latency_ms=latency_ms,
+                    error=f"Invalid health check result type: {type(result)}"
+                )
+
+        except Exception as e:
+            latency_ms = (time.time() - start_time) * 1000
+            return HealthCheckResult(
+                success=False,
+                latency_ms=latency_ms,
+                error=str(e)
+            )
+
+    def register_health_check(self, server_type: str, check_func: Callable) -> None:
+        """
+        Register a custom health check function for a server type.
+
+        Args:
+            server_type: The server type ("sse", "http", "stdio")
+            check_func: Async function that takes a ManagedMCPServer and returns
+                bool or HealthCheckResult
+        """
+        self.custom_health_checks[server_type.lower()] = check_func
+        logger.info(f"Registered health check for server type: {server_type}")
+
+    def get_health_history(self, server_id: str, limit: int = 100) -> List[HealthStatus]:
+        """
+        Get health check history for a server.
+
+        Args:
+            server_id: Unique identifier for the server
+            limit: Maximum number of history entries to return
+
+        Returns:
+            List of HealthStatus objects, most recent first
+        """
+        history = self.health_history.get(server_id, deque())
+        # Convert deque to list and limit results
+        result = list(history)[-limit:] if limit > 0 else list(history)
+        # Reverse to get most recent first
+        result.reverse()
+        return result
+
+    def is_healthy(self, server_id: str) -> bool:
+        """
+        Check if a server is currently healthy based on latest status.
+
+        Args:
+            server_id: Unique identifier for the server
+
+        Returns:
+            True if server is healthy, False otherwise
+        """
+        history = self.health_history.get(server_id)
+        if not history:
+            return False
+
+        # Get most recent health status
+        latest_status = history[-1]
+        return latest_status.is_healthy
+
+    async def _monitoring_loop(self, server_id: str, server: ManagedMCPServer) -> None:
+        """
+        Main monitoring loop that runs in the background.
+
+        Args:
+            server_id: Unique identifier for the server
+            server: The managed MCP server to monitor
+        """
+        logger.info(f"Starting monitoring loop for server {server_id}")
+
+        while True:
+            try:
+                # Wait for check interval
+                await asyncio.sleep(self.check_interval)
+
+                # Skip if server is not enabled
+                if not server.is_enabled():
+                    continue
+
+                # Perform health check
+                health_status = await self.check_health(server)
+                self._record_health_status(server_id, health_status)
+
+                # Handle consecutive failures
+                if not health_status.is_healthy:
+                    self.consecutive_failures[server_id] += 1
+                    logger.warning(
+                        f"Health check failed for {server_id}: {health_status.error} "
+                        f"(consecutive failures: {self.consecutive_failures[server_id]})"
+                    )
+
+                    # Trigger recovery on consecutive failures
+                    await self._handle_consecutive_failures(server_id, server)
+                else:
+                    # Reset consecutive failure count on success
+                    if self.consecutive_failures[server_id] > 0:
+                        logger.info(f"Server {server_id} recovered after health check success")
+                        self.consecutive_failures[server_id] = 0
+
+                self.last_check_time[server_id] = datetime.now()
+
+            except asyncio.CancelledError:
+                logger.info(f"Monitoring loop cancelled for server {server_id}")
+                break
+            except Exception as e:
+                logger.error(f"Error in monitoring loop for {server_id}: {e}")
+                # Continue monitoring despite errors
+                await asyncio.sleep(5)  # Brief delay before retrying
+
+    def _record_health_status(self, server_id: str, status: HealthStatus) -> None:
+        """
+        Record a health status in the history.
+
+        Args:
+            server_id: Unique identifier for the server
+            status: The health status to record
+        """
+        self.health_history[server_id].append(status)
+
+        # Log health status changes
+        if status.is_healthy:
+            logger.debug(f"Server {server_id} health check passed ({status.latency_ms:.1f}ms)")
+        else:
+            logger.warning(f"Server {server_id} health check failed: {status.error}")
+
+    async def _handle_consecutive_failures(self, server_id: str, server: ManagedMCPServer) -> None:
+        """
+        Handle consecutive health check failures.
+
+        Args:
+            server_id: Unique identifier for the server
+            server: The managed MCP server
+        """
+        failure_count = self.consecutive_failures[server_id]
+
+        # Trigger recovery actions based on failure count
+        if failure_count >= 3:
+            logger.error(f"Server {server_id} has {failure_count} consecutive failures, triggering recovery")
+
+            try:
+                # Attempt to recover the server
+                await self._trigger_recovery(server_id, server, failure_count)
+            except Exception as e:
+                logger.error(f"Recovery failed for server {server_id}: {e}")
+
+        # Quarantine server after many consecutive failures
+        if failure_count >= 5:
+            logger.critical(f"Quarantining server {server_id} after {failure_count} consecutive failures")
+            try:
+                # Calculate quarantine duration with exponential backoff
+                quarantine_duration = min(30 * (2 ** (failure_count - 5)), 1800)  # Max 30 minutes
+                server.quarantine(quarantine_duration)
+            except Exception as e:
+                logger.error(f"Failed to quarantine server {server_id}: {e}")
+
+    async def _trigger_recovery(self, server_id: str, server: ManagedMCPServer, failure_count: int) -> None:
+        """
+        Trigger recovery actions for a failing server.
+
+        Args:
+            server_id: Unique identifier for the server
+            server: The managed MCP server
+            failure_count: Number of consecutive failures
+        """
+        logger.info(f"Triggering recovery for server {server_id} (failure count: {failure_count})")
+
+        try:
+            # For now, just disable and re-enable the server
+            # In the future, this could include more sophisticated recovery actions
+            server.disable()
+            await asyncio.sleep(1)  # Brief delay
+            server.enable()
+
+            logger.info(f"Recovery attempt completed for server {server_id}")
+
+        except Exception as e:
+            logger.error(f"Recovery action failed for server {server_id}: {e}")
+            raise
+
+    async def _check_sse_health(self, server: ManagedMCPServer) -> HealthCheckResult:
+        """
+        Health check for SSE servers using GET request.
+
+        Args:
+            server: The managed MCP server to check
+
+        Returns:
+            HealthCheckResult with check results
+        """
+        try:
+            config = server.config.config
+            url = config.get('url')
+            if not url:
+                return HealthCheckResult(
+                    success=False,
+                    latency_ms=0.0,
+                    error="No URL configured for SSE server"
+                )
+
+            # Add health endpoint if available, otherwise use base URL
+            health_url = f"{url.rstrip('/')}/health" if not url.endswith('/health') else url
+
+            async with httpx.AsyncClient(timeout=10.0) as client:
+                response = await client.get(health_url)
+
+                if response.status_code == 404:
+                    # Try base URL if health endpoint doesn't exist
+                    response = await client.get(url)
+
+                success = 200 <= response.status_code < 400
+                error = None if success else f"HTTP {response.status_code}: {response.reason_phrase}"
+
+                return HealthCheckResult(
+                    success=success,
+                    latency_ms=0.0,  # Will be filled by perform_health_check
+                    error=error
+                )
+
+        except Exception as e:
+            return HealthCheckResult(
+                success=False,
+                latency_ms=0.0,
+                error=str(e)
+            )
+
+    async def _check_http_health(self, server: ManagedMCPServer) -> HealthCheckResult:
+        """
+        Health check for HTTP servers using GET request.
+
+        Args:
+            server: The managed MCP server to check
+
+        Returns:
+            HealthCheckResult with check results
+        """
+        # HTTP servers use the same check as SSE servers
+        return await self._check_sse_health(server)
+
+    async def _check_stdio_health(self, server: ManagedMCPServer) -> HealthCheckResult:
+        """
+        Health check for stdio servers using ping command.
+
+        Args:
+            server: The managed MCP server to check
+
+        Returns:
+            HealthCheckResult with check results
+        """
+        try:
+            # Get the pydantic server instance
+            pydantic_server = server.get_pydantic_server()
+
+            # Try to get available tools as a health check
+            # This requires the server to be responsive
+            try:
+                # Attempt to list tools - this is a good health check for MCP servers
+                # Note: This is a simplified check. In a real implementation,
+                # we'd need to send an actual MCP message
+
+                # For now, we'll check if we can create the server instance
+                # and if it appears to be configured correctly
+                config = server.config.config
+                command = config.get('command')
+
+                if not command:
+                    return HealthCheckResult(
+                        success=False,
+                        latency_ms=0.0,
+                        error="No command configured for stdio server"
+                    )
+
+                # Basic validation that command exists
+                import shutil
+                if not shutil.which(command):
+                    return HealthCheckResult(
+                        success=False,
+                        latency_ms=0.0,
+                        error=f"Command '{command}' not found in PATH"
+                    )
+
+                # If we get here, basic checks passed
+                return HealthCheckResult(
+                    success=True,
+                    latency_ms=0.0,
+                    error=None
+                )
+
+            except Exception as e:
+                return HealthCheckResult(
+                    success=False,
+                    latency_ms=0.0,
+                    error=f"Server communication failed: {str(e)}"
+                )
+
+        except Exception as e:
+            return HealthCheckResult(
+                success=False,
+                latency_ms=0.0,
+                error=str(e)
+            )
+
+    async def shutdown(self) -> None:
+        """
+        Shutdown all monitoring tasks gracefully.
+        """
+        logger.info("Shutting down health monitor")
+
+        # Cancel all monitoring tasks
+        tasks = list(self.monitoring_tasks.values())
+        for task in tasks:
+            task.cancel()
+
+        # Wait for all tasks to complete
+        if tasks:
+            await asyncio.gather(*tasks, return_exceptions=True)
+
+        self.monitoring_tasks.clear()
+        self.consecutive_failures.clear()
+        self.last_check_time.clear()
+
+        logger.info("Health monitor shutdown complete")
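The new module drives its own recovery policy: after three consecutive failed checks the monitor disables and re-enables the server, and from the fifth failure onward it quarantines it for min(30 * 2**(failures - 5), 1800) seconds, i.e. 30 s, 60 s, 120 s and so on, capped at 30 minutes. The sketch below expands the class docstring's usage example into a small end-to-end flow. It is illustrative only and not part of the diff: `server` stands for a ManagedMCPServer obtained elsewhere (for example via the new MCP manager, whose API is not shown here), and the custom stdio check merely demonstrates the register_health_check contract, an async callable that receives the managed server and returns a bool or a HealthCheckResult.

# Usage sketch (not part of the package diff above).
# `server` is assumed to be a ManagedMCPServer created elsewhere.
import asyncio

from code_puppy.mcp.health_monitor import HealthCheckResult, HealthMonitor
from code_puppy.mcp.managed_server import ManagedMCPServer


async def quick_stdio_check(server: ManagedMCPServer) -> HealthCheckResult:
    # Succeed only if a command is configured; HealthMonitor fills in
    # latency_ms when it is left at 0.0.
    command = server.config.config.get("command")
    return HealthCheckResult(
        success=bool(command),
        latency_ms=0.0,
        error=None if command else "no command configured",
    )


async def monitor_one_server(server: ManagedMCPServer) -> None:
    monitor = HealthMonitor(check_interval=30)

    # Replace the built-in stdio check with the custom one above.
    monitor.register_health_check("stdio", quick_stdio_check)

    # Starts the background loop and runs an immediate first check.
    await monitor.start_monitoring("server-1", server)

    await asyncio.sleep(90)  # let a few checks run
    print("healthy now:", monitor.is_healthy("server-1"))
    for status in monitor.get_health_history("server-1", limit=5):
        print(status.timestamp, status.check_type, status.is_healthy, status.error)

    await monitor.stop_monitoring("server-1")
    await monitor.shutdown()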