code-puppy 0.0.126__py3-none-any.whl → 0.0.128__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (34)
  1. code_puppy/__init__.py +1 -0
  2. code_puppy/agent.py +65 -69
  3. code_puppy/agents/agent_code_puppy.py +0 -3
  4. code_puppy/agents/runtime_manager.py +212 -0
  5. code_puppy/command_line/command_handler.py +56 -25
  6. code_puppy/command_line/mcp_commands.py +1298 -0
  7. code_puppy/command_line/meta_command_handler.py +3 -2
  8. code_puppy/command_line/model_picker_completion.py +21 -8
  9. code_puppy/main.py +52 -157
  10. code_puppy/mcp/__init__.py +23 -0
  11. code_puppy/mcp/async_lifecycle.py +237 -0
  12. code_puppy/mcp/circuit_breaker.py +218 -0
  13. code_puppy/mcp/config_wizard.py +437 -0
  14. code_puppy/mcp/dashboard.py +291 -0
  15. code_puppy/mcp/error_isolation.py +360 -0
  16. code_puppy/mcp/examples/retry_example.py +208 -0
  17. code_puppy/mcp/health_monitor.py +549 -0
  18. code_puppy/mcp/managed_server.py +346 -0
  19. code_puppy/mcp/manager.py +701 -0
  20. code_puppy/mcp/registry.py +412 -0
  21. code_puppy/mcp/retry_manager.py +321 -0
  22. code_puppy/mcp/server_registry_catalog.py +751 -0
  23. code_puppy/mcp/status_tracker.py +355 -0
  24. code_puppy/messaging/spinner/textual_spinner.py +6 -2
  25. code_puppy/model_factory.py +19 -4
  26. code_puppy/models.json +22 -4
  27. code_puppy/tui/app.py +19 -27
  28. code_puppy/tui/tests/test_agent_command.py +22 -15
  29. {code_puppy-0.0.126.data → code_puppy-0.0.128.data}/data/code_puppy/models.json +22 -4
  30. {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/METADATA +2 -3
  31. {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/RECORD +34 -18
  32. {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/WHEEL +0 -0
  33. {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/entry_points.txt +0 -0
  34. {code_puppy-0.0.126.dist-info → code_puppy-0.0.128.dist-info}/licenses/LICENSE +0 -0
code_puppy/mcp/health_monitor.py (new file)
@@ -0,0 +1,549 @@
+ """
+ Health monitoring system for MCP servers.
+
+ This module provides continuous health monitoring for MCP servers with
+ automatic recovery actions when consecutive failures are detected.
+ """
+
+ import asyncio
+ import logging
+ import shutil
+ import time
+ from collections import defaultdict, deque
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import Callable, Dict, List, Optional
+
+ import httpx
+
+ from .managed_server import ManagedMCPServer
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class HealthStatus:
+     """Status of a health check for an MCP server."""
+     timestamp: datetime
+     is_healthy: bool
+     latency_ms: Optional[float]
+     error: Optional[str]
+     check_type: str  # "ping", "list_tools", "get_request", etc.
+
+
+ @dataclass
+ class HealthCheckResult:
+     """Result of performing a health check."""
+     success: bool
+     latency_ms: float
+     error: Optional[str]
+
+
+ class HealthMonitor:
+     """
+     Continuous health monitoring system for MCP servers.
+
+     Features:
+     - Background monitoring tasks using asyncio
+     - Server type-specific health checks
+     - Health history tracking with configurable limit
+     - Custom health check registration
+     - Automatic recovery triggering on consecutive failures
+     - Configurable check intervals
+
+     Example usage:
+         monitor = HealthMonitor(check_interval=30)
+         await monitor.start_monitoring("server-1", managed_server)
+
+         # Check current health
+         is_healthy = monitor.is_healthy("server-1")
+
+         # Get health history
+         history = monitor.get_health_history("server-1", limit=50)
+     """
+
+     def __init__(self, check_interval: int = 30):
+         """
+         Initialize the health monitor.
+
+         Args:
+             check_interval: Interval between health checks in seconds
+         """
+         self.check_interval = check_interval
+         self.monitoring_tasks: Dict[str, asyncio.Task] = {}
+         self.health_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
+         self.custom_health_checks: Dict[str, Callable] = {}
+         self.consecutive_failures: Dict[str, int] = defaultdict(int)
+         self.last_check_time: Dict[str, datetime] = {}
+
+         # Register default health checks for each server type
+         self._register_default_health_checks()
+
+         logger.info(f"Health monitor initialized with {check_interval}s check interval")
+
+     def _register_default_health_checks(self) -> None:
+         """Register default health check methods for each server type."""
+         self.register_health_check("sse", self._check_sse_health)
+         self.register_health_check("http", self._check_http_health)
+         self.register_health_check("stdio", self._check_stdio_health)
+
+     async def start_monitoring(self, server_id: str, server: ManagedMCPServer) -> None:
+         """
+         Start continuous health monitoring for a server.
+
+         Args:
+             server_id: Unique identifier for the server
+             server: The managed MCP server instance to monitor
+         """
+         if server_id in self.monitoring_tasks:
+             logger.warning(f"Server {server_id} is already being monitored")
+             return
+
+         logger.info(f"Starting health monitoring for server {server_id}")
+
+         # Create background monitoring task
+         task = asyncio.create_task(
+             self._monitoring_loop(server_id, server),
+             name=f"health_monitor_{server_id}"
+         )
+         self.monitoring_tasks[server_id] = task
+
+         # Perform initial health check
+         try:
+             health_status = await self.check_health(server)
+             self._record_health_status(server_id, health_status)
+         except Exception as e:
+             logger.error(f"Initial health check failed for {server_id}: {e}")
+             error_status = HealthStatus(
+                 timestamp=datetime.now(),
+                 is_healthy=False,
+                 latency_ms=None,
+                 error=str(e),
+                 check_type="initial"
+             )
+             self._record_health_status(server_id, error_status)
+
+     async def stop_monitoring(self, server_id: str) -> None:
+         """
+         Stop health monitoring for a server.
+
+         Args:
+             server_id: Unique identifier for the server
+         """
+         task = self.monitoring_tasks.pop(server_id, None)
+         if task:
+             logger.info(f"Stopping health monitoring for server {server_id}")
+             task.cancel()
+             try:
+                 await task
+             except asyncio.CancelledError:
+                 pass
+
+             # Clean up tracking data
+             self.consecutive_failures.pop(server_id, None)
+             self.last_check_time.pop(server_id, None)
+         else:
+             logger.warning(f"No monitoring task found for server {server_id}")
+
+     async def check_health(self, server: ManagedMCPServer) -> HealthStatus:
+         """
+         Perform a health check for a server.
+
+         Args:
+             server: The managed MCP server to check
+
+         Returns:
+             HealthStatus object with check results
+         """
+         server_type = server.config.type.lower()
+         check_func = self.custom_health_checks.get(server_type)
+
+         if not check_func:
+             logger.warning(f"No health check function registered for server type: {server_type}")
+             return HealthStatus(
+                 timestamp=datetime.now(),
+                 is_healthy=False,
+                 latency_ms=None,
+                 error=f"No health check registered for type '{server_type}'",
+                 check_type="unknown"
+             )
+
+         try:
+             result = await self.perform_health_check(server)
+             return HealthStatus(
+                 timestamp=datetime.now(),
+                 is_healthy=result.success,
+                 latency_ms=result.latency_ms,
+                 error=result.error,
+                 check_type=server_type
+             )
+         except Exception as e:
+             logger.error(f"Health check failed for server {server.config.id}: {e}")
+             return HealthStatus(
+                 timestamp=datetime.now(),
+                 is_healthy=False,
+                 latency_ms=None,
+                 error=str(e),
+                 check_type=server_type
+             )
+
+     async def perform_health_check(self, server: ManagedMCPServer) -> HealthCheckResult:
+         """
+         Perform the actual health check based on server type.
+
+         Args:
+             server: The managed MCP server to check
+
+         Returns:
+             HealthCheckResult with timing and success information
+         """
+         server_type = server.config.type.lower()
+         check_func = self.custom_health_checks.get(server_type)
+
+         if not check_func:
+             return HealthCheckResult(
+                 success=False,
+                 latency_ms=0.0,
+                 error=f"No health check function for type '{server_type}'"
+             )
+
+         start_time = time.time()
+         try:
+             result = await check_func(server)
+             latency_ms = (time.time() - start_time) * 1000
+
+             if isinstance(result, bool):
+                 return HealthCheckResult(
+                     success=result,
+                     latency_ms=latency_ms,
+                     error=None if result else "Health check returned False"
+                 )
+             elif isinstance(result, HealthCheckResult):
+                 # Update latency if not already set
+                 if result.latency_ms == 0.0:
+                     result.latency_ms = latency_ms
+                 return result
+             else:
+                 return HealthCheckResult(
+                     success=False,
+                     latency_ms=latency_ms,
+                     error=f"Invalid health check result type: {type(result)}"
+                 )
+
+         except Exception as e:
+             latency_ms = (time.time() - start_time) * 1000
+             return HealthCheckResult(
+                 success=False,
+                 latency_ms=latency_ms,
+                 error=str(e)
+             )
+
+     def register_health_check(self, server_type: str, check_func: Callable) -> None:
+         """
+         Register a custom health check function for a server type.
+
+         Args:
+             server_type: The server type ("sse", "http", "stdio")
+             check_func: Async function that takes a ManagedMCPServer and returns
+                 bool or HealthCheckResult
+         """
+         self.custom_health_checks[server_type.lower()] = check_func
+         logger.info(f"Registered health check for server type: {server_type}")
+
+     def get_health_history(self, server_id: str, limit: int = 100) -> List[HealthStatus]:
+         """
+         Get health check history for a server.
+
+         Args:
+             server_id: Unique identifier for the server
+             limit: Maximum number of history entries to return
+
+         Returns:
+             List of HealthStatus objects, most recent first
+         """
+         history = self.health_history.get(server_id, deque())
+         # Convert deque to list and limit results
+         result = list(history)[-limit:] if limit > 0 else list(history)
+         # Reverse to get most recent first
+         result.reverse()
+         return result
+
+     def is_healthy(self, server_id: str) -> bool:
+         """
+         Check if a server is currently healthy based on latest status.
+
+         Args:
+             server_id: Unique identifier for the server
+
+         Returns:
+             True if server is healthy, False otherwise
+         """
+         history = self.health_history.get(server_id)
+         if not history:
+             return False
+
+         # Get most recent health status
+         latest_status = history[-1]
+         return latest_status.is_healthy
+
+     async def _monitoring_loop(self, server_id: str, server: ManagedMCPServer) -> None:
+         """
+         Main monitoring loop that runs in the background.
+
+         Args:
+             server_id: Unique identifier for the server
+             server: The managed MCP server to monitor
+         """
+         logger.info(f"Starting monitoring loop for server {server_id}")
+
+         while True:
+             try:
+                 # Wait for check interval
+                 await asyncio.sleep(self.check_interval)
+
+                 # Skip if server is not enabled
+                 if not server.is_enabled():
+                     continue
+
+                 # Perform health check
+                 health_status = await self.check_health(server)
+                 self._record_health_status(server_id, health_status)
+
+                 # Handle consecutive failures
+                 if not health_status.is_healthy:
+                     self.consecutive_failures[server_id] += 1
+                     logger.warning(
+                         f"Health check failed for {server_id}: {health_status.error} "
+                         f"(consecutive failures: {self.consecutive_failures[server_id]})"
+                     )
+
+                     # Trigger recovery on consecutive failures
+                     await self._handle_consecutive_failures(server_id, server)
+                 else:
+                     # Reset consecutive failure count on success
+                     if self.consecutive_failures[server_id] > 0:
+                         logger.info(f"Server {server_id} recovered after health check success")
+                     self.consecutive_failures[server_id] = 0
+
+                 self.last_check_time[server_id] = datetime.now()
+
+             except asyncio.CancelledError:
+                 logger.info(f"Monitoring loop cancelled for server {server_id}")
+                 break
+             except Exception as e:
+                 logger.error(f"Error in monitoring loop for {server_id}: {e}")
+                 # Continue monitoring despite errors
+                 await asyncio.sleep(5)  # Brief delay before retrying
+
+     def _record_health_status(self, server_id: str, status: HealthStatus) -> None:
+         """
+         Record a health status in the history.
+
+         Args:
+             server_id: Unique identifier for the server
+             status: The health status to record
+         """
+         self.health_history[server_id].append(status)
+
+         # Log health status changes
+         if status.is_healthy:
+             logger.debug(f"Server {server_id} health check passed ({status.latency_ms:.1f}ms)")
+         else:
+             logger.warning(f"Server {server_id} health check failed: {status.error}")
+
+     async def _handle_consecutive_failures(self, server_id: str, server: ManagedMCPServer) -> None:
+         """
+         Handle consecutive health check failures.
+
+         Args:
+             server_id: Unique identifier for the server
+             server: The managed MCP server
+         """
+         failure_count = self.consecutive_failures[server_id]
+
+         # Trigger recovery actions based on failure count
+         if failure_count >= 3:
+             logger.error(f"Server {server_id} has {failure_count} consecutive failures, triggering recovery")
+
+             try:
+                 # Attempt to recover the server
+                 await self._trigger_recovery(server_id, server, failure_count)
+             except Exception as e:
+                 logger.error(f"Recovery failed for server {server_id}: {e}")
+
+         # Quarantine server after many consecutive failures
+         if failure_count >= 5:
+             logger.critical(f"Quarantining server {server_id} after {failure_count} consecutive failures")
+             try:
+                 # Calculate quarantine duration with exponential backoff
+                 quarantine_duration = min(30 * (2 ** (failure_count - 5)), 1800)  # Max 30 minutes
+                 server.quarantine(quarantine_duration)
+             except Exception as e:
+                 logger.error(f"Failed to quarantine server {server_id}: {e}")
+
+     async def _trigger_recovery(self, server_id: str, server: ManagedMCPServer, failure_count: int) -> None:
+         """
+         Trigger recovery actions for a failing server.
+
+         Args:
+             server_id: Unique identifier for the server
+             server: The managed MCP server
+             failure_count: Number of consecutive failures
+         """
+         logger.info(f"Triggering recovery for server {server_id} (failure count: {failure_count})")
+
+         try:
+             # For now, just disable and re-enable the server
+             # In the future, this could include more sophisticated recovery actions
+             server.disable()
+             await asyncio.sleep(1)  # Brief delay
+             server.enable()
+
+             logger.info(f"Recovery attempt completed for server {server_id}")
+
+         except Exception as e:
+             logger.error(f"Recovery action failed for server {server_id}: {e}")
+             raise
+
+     async def _check_sse_health(self, server: ManagedMCPServer) -> HealthCheckResult:
+         """
+         Health check for SSE servers using a GET request.
+
+         Args:
+             server: The managed MCP server to check
+
+         Returns:
+             HealthCheckResult with check results
+         """
+         try:
+             config = server.config.config
+             url = config.get('url')
+             if not url:
+                 return HealthCheckResult(
+                     success=False,
+                     latency_ms=0.0,
+                     error="No URL configured for SSE server"
+                 )
+
+             # Add health endpoint if available, otherwise use base URL
+             health_url = f"{url.rstrip('/')}/health" if not url.endswith('/health') else url
+
+             async with httpx.AsyncClient(timeout=10.0) as client:
+                 response = await client.get(health_url)
+
+                 if response.status_code == 404:
+                     # Try base URL if health endpoint doesn't exist
+                     response = await client.get(url)
+
+                 success = 200 <= response.status_code < 400
+                 error = None if success else f"HTTP {response.status_code}: {response.reason_phrase}"
+
+                 return HealthCheckResult(
+                     success=success,
+                     latency_ms=0.0,  # Will be filled by perform_health_check
+                     error=error
+                 )
+
+         except Exception as e:
+             return HealthCheckResult(
+                 success=False,
+                 latency_ms=0.0,
+                 error=str(e)
+             )
+
+     async def _check_http_health(self, server: ManagedMCPServer) -> HealthCheckResult:
+         """
+         Health check for HTTP servers using a GET request.
+
+         Args:
+             server: The managed MCP server to check
+
+         Returns:
+             HealthCheckResult with check results
+         """
+         # HTTP servers use the same check as SSE servers
+         return await self._check_sse_health(server)
+
+     async def _check_stdio_health(self, server: ManagedMCPServer) -> HealthCheckResult:
+         """
+         Health check for stdio servers by validating their configuration.
+
+         Note: This is a simplified check. A full implementation would send
+         an actual MCP message (e.g. a ping or tools/list request) and wait
+         for a response.
+
+         Args:
+             server: The managed MCP server to check
+
+         Returns:
+             HealthCheckResult with check results
+         """
+         try:
+             # Instantiating the pydantic server validates the basic setup;
+             # a failure here is caught by the outer exception handler
+             server.get_pydantic_server()
+
+             try:
+                 # Check that the server appears to be configured correctly
+                 config = server.config.config
+                 command = config.get('command')
+
+                 if not command:
+                     return HealthCheckResult(
+                         success=False,
+                         latency_ms=0.0,
+                         error="No command configured for stdio server"
+                     )
+
+                 # Basic validation that the command exists on PATH
+                 if not shutil.which(command):
+                     return HealthCheckResult(
+                         success=False,
+                         latency_ms=0.0,
+                         error=f"Command '{command}' not found in PATH"
+                     )
+
+                 # If we get here, basic checks passed
+                 return HealthCheckResult(
+                     success=True,
+                     latency_ms=0.0,
+                     error=None
+                 )
+
+             except Exception as e:
+                 return HealthCheckResult(
+                     success=False,
+                     latency_ms=0.0,
+                     error=f"Server communication failed: {str(e)}"
+                 )
+
+         except Exception as e:
+             return HealthCheckResult(
+                 success=False,
+                 latency_ms=0.0,
+                 error=str(e)
+             )
+
+     async def shutdown(self) -> None:
+         """
+         Shutdown all monitoring tasks gracefully.
+         """
+         logger.info("Shutting down health monitor")
+
+         # Cancel all monitoring tasks
+         tasks = list(self.monitoring_tasks.values())
+         for task in tasks:
+             task.cancel()
+
+         # Wait for all tasks to complete
+         if tasks:
+             await asyncio.gather(*tasks, return_exceptions=True)
+
+         self.monitoring_tasks.clear()
+         self.consecutive_failures.clear()
+         self.last_check_time.clear()
+
+         logger.info("Health monitor shutdown complete")