mcp-mesh 0.7.12__py3-none-any.whl → 0.7.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _mcp_mesh/__init__.py +1 -1
- _mcp_mesh/engine/__init__.py +1 -22
- _mcp_mesh/engine/async_mcp_client.py +88 -25
- _mcp_mesh/engine/decorator_registry.py +10 -9
- _mcp_mesh/engine/dependency_injector.py +64 -53
- _mcp_mesh/engine/mesh_llm_agent.py +119 -5
- _mcp_mesh/engine/mesh_llm_agent_injector.py +30 -0
- _mcp_mesh/engine/session_aware_client.py +3 -3
- _mcp_mesh/engine/unified_mcp_proxy.py +82 -90
- _mcp_mesh/pipeline/api_heartbeat/api_dependency_resolution.py +0 -89
- _mcp_mesh/pipeline/api_heartbeat/api_fast_heartbeat_check.py +3 -3
- _mcp_mesh/pipeline/api_heartbeat/api_heartbeat_pipeline.py +30 -28
- _mcp_mesh/pipeline/mcp_heartbeat/dependency_resolution.py +16 -18
- _mcp_mesh/pipeline/mcp_heartbeat/fast_heartbeat_check.py +5 -5
- _mcp_mesh/pipeline/mcp_heartbeat/heartbeat_orchestrator.py +3 -3
- _mcp_mesh/pipeline/mcp_heartbeat/heartbeat_pipeline.py +6 -6
- _mcp_mesh/pipeline/mcp_heartbeat/heartbeat_send.py +1 -1
- _mcp_mesh/pipeline/mcp_heartbeat/llm_tools_resolution.py +15 -11
- _mcp_mesh/pipeline/mcp_heartbeat/registry_connection.py +3 -3
- _mcp_mesh/pipeline/mcp_startup/fastapiserver_setup.py +37 -268
- _mcp_mesh/pipeline/mcp_startup/lifespan_factory.py +142 -0
- _mcp_mesh/pipeline/mcp_startup/startup_orchestrator.py +57 -93
- _mcp_mesh/pipeline/shared/registry_connection.py +1 -1
- _mcp_mesh/shared/health_check_manager.py +313 -0
- _mcp_mesh/shared/logging_config.py +190 -7
- _mcp_mesh/shared/registry_client_wrapper.py +8 -8
- _mcp_mesh/shared/sse_parser.py +19 -17
- _mcp_mesh/tracing/execution_tracer.py +26 -1
- _mcp_mesh/tracing/fastapi_tracing_middleware.py +3 -4
- _mcp_mesh/tracing/trace_context_helper.py +25 -6
- {mcp_mesh-0.7.12.dist-info → mcp_mesh-0.7.14.dist-info}/METADATA +1 -1
- {mcp_mesh-0.7.12.dist-info → mcp_mesh-0.7.14.dist-info}/RECORD +38 -39
- mesh/__init__.py +3 -1
- mesh/decorators.py +81 -43
- mesh/helpers.py +72 -4
- mesh/types.py +48 -4
- _mcp_mesh/engine/full_mcp_proxy.py +0 -641
- _mcp_mesh/engine/mcp_client_proxy.py +0 -457
- _mcp_mesh/shared/health_check_cache.py +0 -246
- {mcp_mesh-0.7.12.dist-info → mcp_mesh-0.7.14.dist-info}/WHEEL +0 -0
- {mcp_mesh-0.7.12.dist-info → mcp_mesh-0.7.14.dist-info}/licenses/LICENSE +0 -0
@@ -120,11 +120,15 @@ class DebounceCoordinator:
 
     def _execute_processing(self) -> None:
         """Execute the processing (called by timer)."""
-
+        # Copy orchestrator reference under lock to prevent race with cleanup()
+        with self._lock:
+            orchestrator = self._orchestrator
 
-
-
-
+        if orchestrator is None:
+            self.logger.error("❌ No orchestrator set for processing")
+            return
+
+        try:
 
             self.logger.info(
                 f"🚀 Debounce delay ({self.delay_seconds}s) complete, processing all decorators"
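The guard added here is the standard copy-under-lock fix for a check-then-use race: `cleanup()` on another thread can null out `self._orchestrator`, so the timer callback snapshots the reference while holding the lock and works only with the local copy afterwards. A minimal sketch of the pattern, with hypothetical `Coordinator` and `process()` names standing in for the real classes:

import threading


class Coordinator:
    """Hypothetical stand-in for DebounceCoordinator."""

    def __init__(self, orchestrator) -> None:
        self._lock = threading.Lock()
        self._orchestrator = orchestrator  # may be cleared by cleanup()

    def cleanup(self) -> None:
        # Called from another thread during shutdown.
        with self._lock:
            self._orchestrator = None

    def _execute_processing(self) -> None:
        # Snapshot the reference under the lock; a concurrent cleanup()
        # can no longer swap it to None between the check and the use.
        with self._lock:
            orchestrator = self._orchestrator
        if orchestrator is None:
            return  # lost the race with cleanup(); nothing to do
        orchestrator.process()  # the local reference stays valid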
@@ -160,10 +164,10 @@ class DebounceCoordinator:
         # Execute appropriate pipeline based on type
         if pipeline_type == "mcp":
             # Phase 1: Run async MCP pipeline setup
-            result = asyncio.run(
+            result = asyncio.run(orchestrator.process_once())
         elif pipeline_type == "api":
             # Phase 1: Run async API pipeline setup
-            result = asyncio.run(
+            result = asyncio.run(orchestrator.process_api_once())
         else:
             raise RuntimeError(f"Unsupported pipeline type: {pipeline_type}")
 
@@ -177,8 +181,16 @@ class DebounceCoordinator:
             # For API services, ONLY do dependency injection - user controls their FastAPI server
             # Dependency injection is already complete from pipeline execution
             # Optionally start heartbeat in background (non-blocking)
-
-
+            from ..api_heartbeat.api_lifespan_integration import (
+                api_heartbeat_lifespan_task,
+            )
+
+            self._setup_heartbeat_background(
+                heartbeat_config,
+                pipeline_context,
+                api_heartbeat_lifespan_task,
+                id_field="service_id",
+                label="API service",
             )
             self.logger.info(
                 "✅ API dependency injection complete - user's FastAPI server can now start"
@@ -186,8 +198,14 @@ class DebounceCoordinator:
             return  # Don't block - let user's uvicorn run
         elif fastapi_app and binding_config:
             # For MCP agents - use same daemon thread pattern as API apps
-
-
+            from ..mcp_heartbeat.lifespan_integration import (
+                heartbeat_lifespan_task,
+            )
+
+            self._setup_heartbeat_background(
+                heartbeat_config,
+                pipeline_context,
+                heartbeat_lifespan_task,
             )
 
             # Check if server was already reused from immediate uvicorn start
@@ -268,9 +286,9 @@ class DebounceCoordinator:
         self.logger.info("🏁 Auto-run disabled - single execution mode")
 
         if pipeline_type == "mcp":
-            result = asyncio.run(
+            result = asyncio.run(orchestrator.process_once())
         elif pipeline_type == "api":
-            result = asyncio.run(
+            result = asyncio.run(orchestrator.process_api_once())
         else:
             raise RuntimeError(f"Unsupported pipeline type: {pipeline_type}")
 
@@ -311,119 +329,65 @@ class DebounceCoordinator:
             self.logger.error(f"❌ FastAPI server error: {e}")
             raise
 
-    def
-        self,
+    def _setup_heartbeat_background(
+        self,
+        heartbeat_config: dict[str, Any],
+        pipeline_context: dict[str, Any],
+        heartbeat_task_fn: Any,
+        id_field: str = "agent_id",
+        label: str = "MCP agent",
     ) -> None:
-        """
-
-        # Populate heartbeat context with current pipeline context
-        heartbeat_config["context"] = pipeline_context
-        service_id = heartbeat_config.get("service_id", "unknown")
-        standalone_mode = heartbeat_config.get("standalone_mode", False)
-
-        if standalone_mode:
-            self.logger.info(
-                f"📝 API service '{service_id}' configured in standalone mode - no heartbeat"
-            )
-            return
-
-        self.logger.info(
-            f"🔗 Setting up background API heartbeat for service '{service_id}'"
-        )
-
-        # Import heartbeat functionality
-        import asyncio
-        import threading
-
-        from ..api_heartbeat.api_lifespan_integration import (
-            api_heartbeat_lifespan_task,
-        )
-
-        def run_heartbeat():
-            """Run heartbeat in separate thread with its own event loop."""
-            self.logger.debug(
-                f"Starting background heartbeat thread for {service_id}"
-            )
-            try:
-                # Create new event loop for this thread
-                loop = asyncio.new_event_loop()
-                asyncio.set_event_loop(loop)
-
-                # Run heartbeat task
-                loop.run_until_complete(
-                    api_heartbeat_lifespan_task(heartbeat_config)
-                )
-            except Exception as e:
-                self.logger.error(f"❌ Background heartbeat error: {e}")
-            finally:
-                loop.close()
-
-        # Start heartbeat in daemon thread (won't prevent process exit)
-        heartbeat_thread = threading.Thread(target=run_heartbeat, daemon=True)
-        heartbeat_thread.start()
+        """
+        Setup heartbeat to run in background thread.
 
-
-            f"💓 Background API heartbeat thread started for service '{service_id}'"
-        )
+        Unified implementation for both API services and MCP agents.
 
-
-
-
+        Args:
+            heartbeat_config: Heartbeat configuration dict
+            pipeline_context: Pipeline context to populate into config
+            heartbeat_task_fn: Async function to run (api or mcp heartbeat task)
+            id_field: Config key for ID ("agent_id" or "service_id")
+            label: Label for log messages ("MCP agent" or "API service")
+        """
+        import asyncio
+        import threading
 
-    def _setup_mcp_heartbeat_background(
-        self, heartbeat_config: dict[str, Any], pipeline_context: dict[str, Any]
-    ) -> None:
-        """Setup MCP heartbeat to run in background - same pattern as API apps."""
         try:
-            # Populate heartbeat context with current pipeline context
             heartbeat_config["context"] = pipeline_context
-
+            entity_id = heartbeat_config.get(id_field, "unknown")
             standalone_mode = heartbeat_config.get("standalone_mode", False)
 
             if standalone_mode:
                 self.logger.info(
-                    f"
+                    f"{label} '{entity_id}' configured in standalone mode - no heartbeat"
                 )
                 return
 
             self.logger.info(
-                f"
+                f"Setting up background heartbeat for {label} '{entity_id}'"
             )
 
-            # Import heartbeat functionality
-            import asyncio
-            import threading
-
-            from ..mcp_heartbeat.lifespan_integration import heartbeat_lifespan_task
-
            def run_heartbeat():
                """Run heartbeat in separate thread with its own event loop."""
-                self.logger.debug(
-                    f"Starting background heartbeat thread for {agent_id}"
-                )
+                self.logger.debug(f"Starting background heartbeat thread for {entity_id}")
                try:
-                    # Create new event loop for this thread
                    loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(loop)
-
-                    # Run heartbeat task
-                    loop.run_until_complete(heartbeat_lifespan_task(heartbeat_config))
+                    loop.run_until_complete(heartbeat_task_fn(heartbeat_config))
                except Exception as e:
-                    self.logger.error(f"
+                    self.logger.error(f"Background heartbeat error: {e}")
                finally:
                    loop.close()
 
-            # Start heartbeat in daemon thread (won't prevent process exit)
            heartbeat_thread = threading.Thread(target=run_heartbeat, daemon=True)
            heartbeat_thread.start()
 
            self.logger.info(
-                f"
+                f"Background heartbeat thread started for {label} '{entity_id}'"
            )
 
        except Exception as e:
-            self.logger.warning(f"
-            # Don't fail - heartbeat is optional for MCP agents
+            self.logger.warning(f"Could not setup {label} heartbeat: {e}")
 
        # Graceful shutdown is now handled by FastAPI lifespan in simple_shutdown.py
 
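This hunk folds the duplicated `_setup_api_heartbeat_background` and `_setup_mcp_heartbeat_background` methods into one parameterized helper while keeping the same execution model: the async heartbeat runs on a private event loop inside a daemon thread, so it neither blocks the caller nor keeps the process alive at exit. A runnable sketch of that model; `heartbeat_task` here is a stand-in for the real lifespan tasks, not mcp-mesh code:

import asyncio
import threading


async def heartbeat_task(config: dict) -> None:
    # Stand-in for api_heartbeat_lifespan_task / heartbeat_lifespan_task.
    while True:
        await asyncio.sleep(config.get("interval", 30))
        # ... send a heartbeat to the registry here ...


def start_background_heartbeat(config: dict) -> threading.Thread:
    def run() -> None:
        # A new thread has no event loop, so create a private one; it
        # never competes with the user's main-thread loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(heartbeat_task(config))
        finally:
            loop.close()

    # daemon=True: the thread will not prevent process exit.
    thread = threading.Thread(target=run, daemon=True)
    thread.start()
    return thread

Calling asyncio.run() inside the thread would achieve the same loop lifecycle in one call; the explicit new_event_loop/close pair mirrors what the diff keeps.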
@@ -65,7 +65,7 @@ class RegistryConnectionStep(PipelineStep):
             result.add_context("registry_wrapper", registry_wrapper)
 
             result.message = f"Connected to registry at {registry_url}"
-            self.logger.
+            self.logger.trace(f"🔗 Registry connection established: {registry_url}")
 
         except Exception as e:
             result.status = PipelineStatus.FAILED
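Note that `self.logger.trace(...)` is not a standard `logging` method; a TRACE level is presumably installed by the package's own logging setup (logging_config.py grows by roughly 190 lines in this release). A generic sketch of how such a level is commonly registered, offered as an assumption rather than mcp-mesh's actual implementation:

import logging

TRACE = 5  # below DEBUG (10); the value mcp-mesh uses is an assumption


def install_trace_level() -> None:
    """Register a TRACE level plus a Logger.trace() convenience method."""
    logging.addLevelName(TRACE, "TRACE")

    def trace(self, message, *args, **kwargs):
        if self.isEnabledFor(TRACE):
            self._log(TRACE, message, args, **kwargs)

    # Monkey-patch onto the Logger class so every logger gains .trace().
    logging.Logger.trace = trace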
@@ -0,0 +1,313 @@
+"""
+Health check manager with TTL caching and K8s response helpers.
+
+Consolidates health check storage, caching, and Kubernetes endpoint response
+generation into a single module.
+"""
+
+import logging
+import time
+from collections.abc import Awaitable, Callable
+from datetime import UTC, datetime
+from typing import Any
+
+from .support_types import HealthStatus, HealthStatusType
+
+logger = logging.getLogger(__name__)
+
+# =============================================================================
+# Health Result Storage (moved from DecoratorRegistry)
+# =============================================================================
+
+# Simple storage for the latest health check result dict
+# Format: {"status": "healthy/degraded/unhealthy", "agent": "...", ...}
+_health_check_result: dict | None = None
+
+
+def store_health_check_result(result: dict) -> None:
+    """Store health check result for K8s endpoints."""
+    global _health_check_result
+    _health_check_result = result
+    logger.debug(f"Stored health check result: {result.get('status', 'unknown')}")
+
+
+def get_health_check_result() -> dict | None:
+    """Get stored health check result."""
+    return _health_check_result
+
+
+def clear_health_check_result() -> None:
+    """Clear stored health check result."""
+    global _health_check_result
+    _health_check_result = None
+    logger.debug("Cleared health check result")
+
+
+# =============================================================================
+# TTL-Based Health Cache
+# =============================================================================
+
+# Global cache for HealthStatus objects with per-key TTL
+# Format: {"health:agent_id": (HealthStatus, expiry_timestamp)}
+_health_cache: dict[str, tuple[HealthStatus, float]] = {}
+_max_cache_size = 100
+
+
+async def get_health_status_with_cache(
+    agent_id: str,
+    health_check_fn: Callable[[], Awaitable[Any]] | None,
+    agent_config: dict[str, Any],
+    startup_context: dict[str, Any],
+    ttl: int = 15,
+) -> HealthStatus:
+    """
+    Get health status with TTL caching.
+
+    User health check can return:
+    - bool: True = HEALTHY, False = UNHEALTHY
+    - dict: {"status": "healthy/degraded/unhealthy", "checks": {...}, "errors": [...]}
+    - HealthStatus: Full object
+
+    Args:
+        agent_id: Unique identifier for the agent
+        health_check_fn: Optional async function for health check
+        agent_config: Agent configuration dict
+        startup_context: Full startup context with capabilities
+        ttl: Cache TTL in seconds (default: 15)
+
+    Returns:
+        HealthStatus from cache or fresh check
+    """
+    cache_key = f"health:{agent_id}"
+    current_time = time.time()
+
+    # Check cache
+    if cache_key in _health_cache:
+        cached_status, expiry_time = _health_cache[cache_key]
+        if current_time < expiry_time:
+            logger.debug(f"Health check cache HIT for agent '{agent_id}'")
+            return cached_status
+        else:
+            logger.debug(f"Health check cache EXPIRED for agent '{agent_id}'")
+            del _health_cache[cache_key]
+
+    logger.debug(f"Health check cache MISS for agent '{agent_id}'")
+
+    # Execute health check
+    health_status = await _execute_health_check(
+        agent_id, health_check_fn, agent_config, startup_context
+    )
+
+    # Store in cache
+    expiry_time = current_time + ttl
+    _health_cache[cache_key] = (health_status, expiry_time)
+    logger.debug(f"Cached health status for '{agent_id}' with TTL={ttl}s")
+
+    # Enforce max cache size
+    if len(_health_cache) > _max_cache_size:
+        oldest_key = min(_health_cache.keys(), key=lambda k: _health_cache[k][1])
+        del _health_cache[oldest_key]
+        logger.debug("Evicted oldest cache entry to maintain max size")
+
+    return health_status
+
+
+async def _execute_health_check(
+    agent_id: str,
+    health_check_fn: Callable[[], Awaitable[Any]] | None,
+    agent_config: dict[str, Any],
+    startup_context: dict[str, Any],
+) -> HealthStatus:
+    """Execute health check function and build HealthStatus."""
+    capabilities = _get_capabilities(startup_context, agent_config)
+
+    if health_check_fn:
+        try:
+            logger.debug(f"Executing health check for agent '{agent_id}'")
+            user_result = await health_check_fn()
+            status_type, checks, errors = _parse_health_result(user_result)
+
+            logger.info(f"Health check for '{agent_id}': {status_type.value}")
+
+        except Exception as e:
+            logger.warning(f"Health check failed for agent '{agent_id}': {e}")
+            status_type = HealthStatusType.DEGRADED
+            checks = {"health_check_execution": False}
+            errors = [f"Health check failed: {str(e)}"]
+    else:
+        # No health check provided - default to HEALTHY
+        logger.debug(f"No health check for '{agent_id}', using default HEALTHY")
+        status_type = HealthStatusType.HEALTHY
+        checks = {}
+        errors = []
+
+    return HealthStatus(
+        agent_name=agent_id,
+        status=status_type,
+        capabilities=capabilities,
+        checks=checks,
+        errors=errors,
+        timestamp=datetime.now(UTC),
+        version=agent_config.get("version", "1.0.0"),
+        metadata=agent_config,
+        uptime_seconds=0,
+    )
+
+
+def _get_capabilities(
+    startup_context: dict[str, Any],
+    agent_config: dict[str, Any],
+) -> list[str]:
+    """Get capabilities from context with fallbacks."""
+    capabilities = startup_context.get("capabilities", [])
+    if not capabilities:
+        capabilities = agent_config.get("capabilities", [])
+    if not capabilities:
+        capabilities = ["default"]
+    return capabilities
+
+
+def _parse_health_result(
+    user_result: Any,
+) -> tuple[HealthStatusType, dict, list]:
+    """Parse user health check result into status, checks, errors."""
+    if isinstance(user_result, bool):
+        status_type = (
+            HealthStatusType.HEALTHY if user_result else HealthStatusType.UNHEALTHY
+        )
+        checks = {"health_check": user_result}
+        errors = [] if user_result else ["Health check returned False"]
+
+    elif isinstance(user_result, dict):
+        status_str = user_result.get("status", "healthy").lower()
+        status_map = {
+            "healthy": HealthStatusType.HEALTHY,
+            "degraded": HealthStatusType.DEGRADED,
+            "unhealthy": HealthStatusType.UNHEALTHY,
+        }
+        status_type = status_map.get(status_str, HealthStatusType.UNKNOWN)
+        checks = user_result.get("checks", {})
+        errors = user_result.get("errors", [])
+
+    elif isinstance(user_result, HealthStatus):
+        status_type = user_result.status
+        checks = user_result.checks
+        errors = user_result.errors
+
+    else:
+        logger.warning(f"Unexpected health check result type: {type(user_result)}")
+        status_type = HealthStatusType.UNHEALTHY
+        checks = {"health_check_return_type": False}
+        errors = [f"Invalid return type: {type(user_result)}"]
+
+    return status_type, checks, errors
+
+
+def clear_health_cache(agent_id: str | None = None) -> None:
+    """Clear health cache for a specific agent or all agents."""
+    if agent_id:
+        cache_key = f"health:{agent_id}"
+        if cache_key in _health_cache:
+            del _health_cache[cache_key]
+        logger.debug(f"Cleared health cache for agent '{agent_id}'")
+    else:
+        _health_cache.clear()
+        logger.debug("Cleared entire health cache")
+
+
+def get_cache_stats() -> dict[str, Any]:
+    """Get cache statistics for monitoring."""
+    return {
+        "size": len(_health_cache),
+        "maxsize": _max_cache_size,
+        "ttl": 15,
+        "cached_agents": [key.replace("health:", "") for key in _health_cache.keys()],
+    }
+
+
+# =============================================================================
+# K8s Response Helpers
+# =============================================================================
+
+
+def build_health_response(
+    agent_name: str,
+    health_status: HealthStatus | None = None,
+) -> tuple[dict, int]:
+    """
+    Build /health endpoint response with appropriate HTTP status code.
+
+    Returns:
+        Tuple of (response_dict, http_status_code)
+    """
+    if health_status:
+        status = health_status.status.value
+        response = {
+            "status": status,
+            "agent": agent_name,
+            "checks": health_status.checks,
+            "errors": health_status.errors,
+            "timestamp": health_status.timestamp.isoformat(),
+        }
+    else:
+        # Use stored result if available
+        stored = get_health_check_result()
+        if stored:
+            status = stored.get("status", "starting")
+            response = stored
+        else:
+            status = "starting"
+            response = {"status": "starting", "message": "Agent is starting"}
+
+    # K8s expects 200 for healthy, 503 for everything else
+    http_status = 200 if status == "healthy" else 503
+    return response, http_status
+
+
+def build_ready_response(
+    agent_name: str,
+    mcp_wrappers_count: int = 0,
+) -> tuple[dict, int]:
+    """
+    Build /ready endpoint response with appropriate HTTP status code.
+
+    Returns:
+        Tuple of (response_dict, http_status_code)
+    """
+    stored = get_health_check_result()
+
+    if stored:
+        status = stored.get("status", "starting")
+        if status == "healthy":
+            return {
+                "ready": True,
+                "agent": agent_name,
+                "status": status,
+                "mcp_wrappers": mcp_wrappers_count,
+                "timestamp": datetime.now(UTC).isoformat(),
+            }, 200
+        else:
+            return {
+                "ready": False,
+                "agent": agent_name,
+                "status": status,
+                "reason": f"Service is {status}",
+                "errors": stored.get("errors", []),
+            }, 503
+    else:
+        # No health check configured - assume ready
+        return {
+            "ready": True,
+            "agent": agent_name,
+            "mcp_wrappers": mcp_wrappers_count,
+            "timestamp": datetime.now(UTC).isoformat(),
+        }, 200
+
+
+def build_livez_response(agent_name: str) -> dict:
+    """Build /livez endpoint response (always returns 200)."""
+    return {
+        "alive": True,
+        "agent": agent_name,
+        "timestamp": datetime.now(UTC).isoformat(),
+    }
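The response builders pair each JSON body with the status code Kubernetes probes expect: 200 only when the status is "healthy", 503 otherwise, and liveness always 200. A hypothetical sketch of wiring these helpers into probe routes; the FastAPI app, AGENT_ID, and empty config dicts are illustrative, since mcp-mesh registers its endpoints internally:

from fastapi import FastAPI
from fastapi.responses import JSONResponse

from _mcp_mesh.shared.health_check_manager import (
    build_health_response,
    build_livez_response,
    build_ready_response,
    get_health_status_with_cache,
)

app = FastAPI()
AGENT_ID = "my-agent"  # placeholder


@app.get("/health")
async def health() -> JSONResponse:
    # The TTL cache (15s default) keeps frequent kubelet probes from
    # re-running a potentially expensive user health check.
    status = await get_health_status_with_cache(
        agent_id=AGENT_ID,
        health_check_fn=None,  # or an async def returning bool/dict/HealthStatus
        agent_config={},
        startup_context={},
    )
    body, code = build_health_response(AGENT_ID, status)
    return JSONResponse(content=body, status_code=code)


@app.get("/ready")
async def ready() -> JSONResponse:
    body, code = build_ready_response(AGENT_ID)
    return JSONResponse(content=body, status_code=code)


@app.get("/livez")
async def livez() -> dict:
    return build_livez_response(AGENT_ID)

Note that build_ready_response reports ready when no health result has been stored yet, which keeps agents without a configured health check schedulable.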