mcp-hangar 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_hangar/__init__.py +139 -0
- mcp_hangar/application/__init__.py +1 -0
- mcp_hangar/application/commands/__init__.py +67 -0
- mcp_hangar/application/commands/auth_commands.py +118 -0
- mcp_hangar/application/commands/auth_handlers.py +296 -0
- mcp_hangar/application/commands/commands.py +59 -0
- mcp_hangar/application/commands/handlers.py +189 -0
- mcp_hangar/application/discovery/__init__.py +21 -0
- mcp_hangar/application/discovery/discovery_metrics.py +283 -0
- mcp_hangar/application/discovery/discovery_orchestrator.py +497 -0
- mcp_hangar/application/discovery/lifecycle_manager.py +315 -0
- mcp_hangar/application/discovery/security_validator.py +414 -0
- mcp_hangar/application/event_handlers/__init__.py +50 -0
- mcp_hangar/application/event_handlers/alert_handler.py +191 -0
- mcp_hangar/application/event_handlers/audit_handler.py +203 -0
- mcp_hangar/application/event_handlers/knowledge_base_handler.py +120 -0
- mcp_hangar/application/event_handlers/logging_handler.py +69 -0
- mcp_hangar/application/event_handlers/metrics_handler.py +152 -0
- mcp_hangar/application/event_handlers/persistent_audit_store.py +217 -0
- mcp_hangar/application/event_handlers/security_handler.py +604 -0
- mcp_hangar/application/mcp/tooling.py +158 -0
- mcp_hangar/application/ports/__init__.py +9 -0
- mcp_hangar/application/ports/observability.py +237 -0
- mcp_hangar/application/queries/__init__.py +52 -0
- mcp_hangar/application/queries/auth_handlers.py +237 -0
- mcp_hangar/application/queries/auth_queries.py +118 -0
- mcp_hangar/application/queries/handlers.py +227 -0
- mcp_hangar/application/read_models/__init__.py +11 -0
- mcp_hangar/application/read_models/provider_views.py +139 -0
- mcp_hangar/application/sagas/__init__.py +11 -0
- mcp_hangar/application/sagas/group_rebalance_saga.py +137 -0
- mcp_hangar/application/sagas/provider_failover_saga.py +266 -0
- mcp_hangar/application/sagas/provider_recovery_saga.py +172 -0
- mcp_hangar/application/services/__init__.py +9 -0
- mcp_hangar/application/services/provider_service.py +208 -0
- mcp_hangar/application/services/traced_provider_service.py +211 -0
- mcp_hangar/bootstrap/runtime.py +328 -0
- mcp_hangar/context.py +178 -0
- mcp_hangar/domain/__init__.py +117 -0
- mcp_hangar/domain/contracts/__init__.py +57 -0
- mcp_hangar/domain/contracts/authentication.py +225 -0
- mcp_hangar/domain/contracts/authorization.py +229 -0
- mcp_hangar/domain/contracts/event_store.py +178 -0
- mcp_hangar/domain/contracts/metrics_publisher.py +59 -0
- mcp_hangar/domain/contracts/persistence.py +383 -0
- mcp_hangar/domain/contracts/provider_runtime.py +146 -0
- mcp_hangar/domain/discovery/__init__.py +20 -0
- mcp_hangar/domain/discovery/conflict_resolver.py +267 -0
- mcp_hangar/domain/discovery/discovered_provider.py +185 -0
- mcp_hangar/domain/discovery/discovery_service.py +412 -0
- mcp_hangar/domain/discovery/discovery_source.py +192 -0
- mcp_hangar/domain/events.py +433 -0
- mcp_hangar/domain/exceptions.py +525 -0
- mcp_hangar/domain/model/__init__.py +70 -0
- mcp_hangar/domain/model/aggregate.py +58 -0
- mcp_hangar/domain/model/circuit_breaker.py +152 -0
- mcp_hangar/domain/model/event_sourced_api_key.py +413 -0
- mcp_hangar/domain/model/event_sourced_provider.py +423 -0
- mcp_hangar/domain/model/event_sourced_role_assignment.py +268 -0
- mcp_hangar/domain/model/health_tracker.py +183 -0
- mcp_hangar/domain/model/load_balancer.py +185 -0
- mcp_hangar/domain/model/provider.py +810 -0
- mcp_hangar/domain/model/provider_group.py +656 -0
- mcp_hangar/domain/model/tool_catalog.py +105 -0
- mcp_hangar/domain/policies/__init__.py +19 -0
- mcp_hangar/domain/policies/provider_health.py +187 -0
- mcp_hangar/domain/repository.py +249 -0
- mcp_hangar/domain/security/__init__.py +85 -0
- mcp_hangar/domain/security/input_validator.py +710 -0
- mcp_hangar/domain/security/rate_limiter.py +387 -0
- mcp_hangar/domain/security/roles.py +237 -0
- mcp_hangar/domain/security/sanitizer.py +387 -0
- mcp_hangar/domain/security/secrets.py +501 -0
- mcp_hangar/domain/services/__init__.py +20 -0
- mcp_hangar/domain/services/audit_service.py +376 -0
- mcp_hangar/domain/services/image_builder.py +328 -0
- mcp_hangar/domain/services/provider_launcher.py +1046 -0
- mcp_hangar/domain/value_objects.py +1138 -0
- mcp_hangar/errors.py +818 -0
- mcp_hangar/fastmcp_server.py +1105 -0
- mcp_hangar/gc.py +134 -0
- mcp_hangar/infrastructure/__init__.py +79 -0
- mcp_hangar/infrastructure/async_executor.py +133 -0
- mcp_hangar/infrastructure/auth/__init__.py +37 -0
- mcp_hangar/infrastructure/auth/api_key_authenticator.py +388 -0
- mcp_hangar/infrastructure/auth/event_sourced_store.py +567 -0
- mcp_hangar/infrastructure/auth/jwt_authenticator.py +360 -0
- mcp_hangar/infrastructure/auth/middleware.py +340 -0
- mcp_hangar/infrastructure/auth/opa_authorizer.py +243 -0
- mcp_hangar/infrastructure/auth/postgres_store.py +659 -0
- mcp_hangar/infrastructure/auth/projections.py +366 -0
- mcp_hangar/infrastructure/auth/rate_limiter.py +311 -0
- mcp_hangar/infrastructure/auth/rbac_authorizer.py +323 -0
- mcp_hangar/infrastructure/auth/sqlite_store.py +624 -0
- mcp_hangar/infrastructure/command_bus.py +112 -0
- mcp_hangar/infrastructure/discovery/__init__.py +110 -0
- mcp_hangar/infrastructure/discovery/docker_source.py +289 -0
- mcp_hangar/infrastructure/discovery/entrypoint_source.py +249 -0
- mcp_hangar/infrastructure/discovery/filesystem_source.py +383 -0
- mcp_hangar/infrastructure/discovery/kubernetes_source.py +247 -0
- mcp_hangar/infrastructure/event_bus.py +260 -0
- mcp_hangar/infrastructure/event_sourced_repository.py +443 -0
- mcp_hangar/infrastructure/event_store.py +396 -0
- mcp_hangar/infrastructure/knowledge_base/__init__.py +259 -0
- mcp_hangar/infrastructure/knowledge_base/contracts.py +202 -0
- mcp_hangar/infrastructure/knowledge_base/memory.py +177 -0
- mcp_hangar/infrastructure/knowledge_base/postgres.py +545 -0
- mcp_hangar/infrastructure/knowledge_base/sqlite.py +513 -0
- mcp_hangar/infrastructure/metrics_publisher.py +36 -0
- mcp_hangar/infrastructure/observability/__init__.py +10 -0
- mcp_hangar/infrastructure/observability/langfuse_adapter.py +534 -0
- mcp_hangar/infrastructure/persistence/__init__.py +33 -0
- mcp_hangar/infrastructure/persistence/audit_repository.py +371 -0
- mcp_hangar/infrastructure/persistence/config_repository.py +398 -0
- mcp_hangar/infrastructure/persistence/database.py +333 -0
- mcp_hangar/infrastructure/persistence/database_common.py +330 -0
- mcp_hangar/infrastructure/persistence/event_serializer.py +280 -0
- mcp_hangar/infrastructure/persistence/event_upcaster.py +166 -0
- mcp_hangar/infrastructure/persistence/in_memory_event_store.py +150 -0
- mcp_hangar/infrastructure/persistence/recovery_service.py +312 -0
- mcp_hangar/infrastructure/persistence/sqlite_event_store.py +386 -0
- mcp_hangar/infrastructure/persistence/unit_of_work.py +409 -0
- mcp_hangar/infrastructure/persistence/upcasters/README.md +13 -0
- mcp_hangar/infrastructure/persistence/upcasters/__init__.py +7 -0
- mcp_hangar/infrastructure/query_bus.py +153 -0
- mcp_hangar/infrastructure/saga_manager.py +401 -0
- mcp_hangar/logging_config.py +209 -0
- mcp_hangar/metrics.py +1007 -0
- mcp_hangar/models.py +31 -0
- mcp_hangar/observability/__init__.py +54 -0
- mcp_hangar/observability/health.py +487 -0
- mcp_hangar/observability/metrics.py +319 -0
- mcp_hangar/observability/tracing.py +433 -0
- mcp_hangar/progress.py +542 -0
- mcp_hangar/retry.py +613 -0
- mcp_hangar/server/__init__.py +120 -0
- mcp_hangar/server/__main__.py +6 -0
- mcp_hangar/server/auth_bootstrap.py +340 -0
- mcp_hangar/server/auth_cli.py +335 -0
- mcp_hangar/server/auth_config.py +305 -0
- mcp_hangar/server/bootstrap.py +735 -0
- mcp_hangar/server/cli.py +161 -0
- mcp_hangar/server/config.py +224 -0
- mcp_hangar/server/context.py +215 -0
- mcp_hangar/server/http_auth_middleware.py +165 -0
- mcp_hangar/server/lifecycle.py +467 -0
- mcp_hangar/server/state.py +117 -0
- mcp_hangar/server/tools/__init__.py +16 -0
- mcp_hangar/server/tools/discovery.py +186 -0
- mcp_hangar/server/tools/groups.py +75 -0
- mcp_hangar/server/tools/health.py +301 -0
- mcp_hangar/server/tools/provider.py +939 -0
- mcp_hangar/server/tools/registry.py +320 -0
- mcp_hangar/server/validation.py +113 -0
- mcp_hangar/stdio_client.py +229 -0
- mcp_hangar-0.2.0.dist-info/METADATA +347 -0
- mcp_hangar-0.2.0.dist-info/RECORD +160 -0
- mcp_hangar-0.2.0.dist-info/WHEEL +4 -0
- mcp_hangar-0.2.0.dist-info/entry_points.txt +2 -0
- mcp_hangar-0.2.0.dist-info/licenses/LICENSE +21 -0
mcp_hangar/models.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Core data models for the MCP registry with explicit state management.
|
|
2
|
+
|
|
3
|
+
This module provides backward compatibility imports for legacy code.
|
|
4
|
+
New code should import directly from the domain layer.
|
|
5
|
+
|
|
6
|
+
Deprecated imports (use domain layer instead):
|
|
7
|
+
- ProviderState -> from mcp_hangar.domain.value_objects import ProviderState
|
|
8
|
+
- MCPError, ProviderStartError, etc. -> from mcp_hangar.domain.exceptions import ...
|
|
9
|
+
- ToolSchema -> from mcp_hangar.domain.model import ToolSchema
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
# Re-export all exceptions from the canonical location for backward compatibility
|
|
13
|
+
from .domain.exceptions import (
|
|
14
|
+
ClientError,
|
|
15
|
+
MCPError,
|
|
16
|
+
ProviderDegradedError,
|
|
17
|
+
ProviderError,
|
|
18
|
+
ProviderNotFoundError,
|
|
19
|
+
ProviderNotReadyError,
|
|
20
|
+
ProviderStartError,
|
|
21
|
+
ToolError,
|
|
22
|
+
ToolInvocationError,
|
|
23
|
+
ToolNotFoundError,
|
|
24
|
+
ValidationError,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Re-export ToolSchema from the canonical location
|
|
28
|
+
from .domain.model import ToolSchema
|
|
29
|
+
|
|
30
|
+
# Re-export ProviderState from the canonical location
|
|
31
|
+
from .domain.value_objects import ProviderState
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Observability module for MCP Hangar.
|
|
2
|
+
|
|
3
|
+
Provides unified observability stack:
|
|
4
|
+
- OpenTelemetry tracing
|
|
5
|
+
- Extended metrics
|
|
6
|
+
- Health endpoints
|
|
7
|
+
- Log correlation
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
from mcp_hangar.observability import init_tracing, get_tracer
|
|
11
|
+
|
|
12
|
+
init_tracing(service_name="mcp-hangar")
|
|
13
|
+
tracer = get_tracer(__name__)
|
|
14
|
+
|
|
15
|
+
with tracer.start_as_current_span("operation") as span:
|
|
16
|
+
span.set_attribute("provider.id", provider_id)
|
|
17
|
+
# ... do work
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from mcp_hangar.observability.health import get_health_endpoint, HealthCheck, HealthEndpoint, HealthStatus
|
|
21
|
+
from mcp_hangar.observability.metrics import CircuitState, get_observability_metrics, ObservabilityMetrics
|
|
22
|
+
from mcp_hangar.observability.tracing import (
|
|
23
|
+
extract_trace_context,
|
|
24
|
+
get_current_span_id,
|
|
25
|
+
get_current_trace_id,
|
|
26
|
+
get_tracer,
|
|
27
|
+
init_tracing,
|
|
28
|
+
inject_trace_context,
|
|
29
|
+
shutdown_tracing,
|
|
30
|
+
trace_span,
|
|
31
|
+
trace_tool_invocation,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
# Tracing
|
|
36
|
+
"init_tracing",
|
|
37
|
+
"shutdown_tracing",
|
|
38
|
+
"get_tracer",
|
|
39
|
+
"trace_tool_invocation",
|
|
40
|
+
"trace_span",
|
|
41
|
+
"inject_trace_context",
|
|
42
|
+
"extract_trace_context",
|
|
43
|
+
"get_current_trace_id",
|
|
44
|
+
"get_current_span_id",
|
|
45
|
+
# Metrics
|
|
46
|
+
"ObservabilityMetrics",
|
|
47
|
+
"get_observability_metrics",
|
|
48
|
+
"CircuitState",
|
|
49
|
+
# Health
|
|
50
|
+
"HealthStatus",
|
|
51
|
+
"HealthCheck",
|
|
52
|
+
"HealthEndpoint",
|
|
53
|
+
"get_health_endpoint",
|
|
54
|
+
]
|
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
"""Health check endpoints and status for MCP Hangar.
|
|
2
|
+
|
|
3
|
+
Provides Kubernetes-compatible health endpoints:
|
|
4
|
+
- /health/live - Liveness probe (is the process alive?)
|
|
5
|
+
- /health/ready - Readiness probe (can it serve traffic?)
|
|
6
|
+
- /health/startup - Startup probe (has it finished initializing?)
|
|
7
|
+
|
|
8
|
+
Also provides detailed health status for dashboards.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from enum import Enum
|
|
14
|
+
import threading
|
|
15
|
+
import time
|
|
16
|
+
from typing import Any, Callable, Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
from mcp_hangar.logging_config import get_logger
|
|
19
|
+
|
|
20
|
+
logger = get_logger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HealthStatus(Enum):
    """Outcome categories a health check (or an aggregate of checks) can report.

    Each member's value is the lowercase string used when a status is
    serialized (see the ``to_dict`` methods in this module).
    """

    # The check passed.
    HEALTHY = "healthy"
    # A non-critical check failed.
    DEGRADED = "degraded"
    # A critical check failed.
    UNHEALTHY = "unhealthy"
    # No outcome is known.
    UNKNOWN = "unknown"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class HealthCheckResult:
    """Outcome of running one health check once."""

    # Name of the check that produced this result.
    name: str
    # Status reported by the check.
    status: HealthStatus
    # Human-readable explanation of the outcome.
    message: str = ""
    # How long the check took, in milliseconds.
    duration_ms: float = 0.0
    # Free-form extra data; a fresh dict per instance.
    details: Dict[str, Any] = field(default_factory=dict)
    # Wall-clock time (``time.time()``) at which the result object was created.
    timestamp: float = field(default_factory=time.time)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict view of this result.

        The status is emitted as its string value and the duration is
        rounded to two decimal places; all other fields are passed through.
        """
        payload: Dict[str, Any] = {"name": self.name}
        payload["status"] = self.status.value
        payload["message"] = self.message
        payload["duration_ms"] = round(self.duration_ms, 2)
        payload["details"] = self.details
        payload["timestamp"] = self.timestamp
        return payload
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class HealthCheck:
    """Definition of a single named health check.

    A check wraps a zero-argument callable and knows how to run it with a
    timeout and turn the outcome into a :class:`HealthCheckResult`.
    """

    # Unique name of the check; used as the result name.
    name: str
    # Zero-argument callable. It may be a plain function (run in the default
    # executor), a coroutine function, or a plain callable that returns an
    # awaitable. A truthy outcome means the check passed.
    check_fn: Callable[[], Any]
    # Human-readable description of what the check verifies.
    description: str = ""
    # Maximum time the whole check may take before it is reported as failed.
    timeout_seconds: float = 5.0
    critical: bool = True  # If False, failure degrades but doesn't make unhealthy

    async def execute(self) -> HealthCheckResult:
        """Execute the health check.

        The callable is run under ``timeout_seconds``. A truthy outcome gives
        a HEALTHY result; a falsy outcome, a timeout, or any ``Exception``
        gives UNHEALTHY when the check is critical and DEGRADED otherwise.
        This method does not raise for failures of the check itself.

        Returns:
            HealthCheckResult with status and timing.
        """
        start = time.perf_counter()
        try:
            passed = await asyncio.wait_for(self._invoke(), timeout=self.timeout_seconds)
        except asyncio.TimeoutError:
            return self._failure(start, f"Check timed out after {self.timeout_seconds}s")
        except Exception as e:
            return self._failure(
                start,
                f"Check failed: {str(e)}",
                details={"error_type": type(e).__name__},
            )

        if not passed:
            return self._failure(start, "Check returned false")

        return HealthCheckResult(
            name=self.name,
            status=HealthStatus.HEALTHY,
            message="Check passed",
            duration_ms=(time.perf_counter() - start) * 1000,
        )

    async def _invoke(self) -> bool:
        """Run ``check_fn`` and return its outcome as a boolean."""
        import inspect

        if asyncio.iscoroutinefunction(self.check_fn):
            outcome = await self.check_fn()
        else:
            # Run sync function in thread pool so it cannot block the loop.
            loop = asyncio.get_running_loop()
            outcome = await loop.run_in_executor(None, self.check_fn)
            # A callable that is not itself a coroutine function (lambda,
            # object with async __call__) may still hand back an awaitable.
            # Such an object is always truthy, so it must be awaited rather
            # than being mistaken for a passing result.
            if inspect.isawaitable(outcome):
                outcome = await outcome
        return bool(outcome)

    def _failure(
        self,
        start: float,
        message: str,
        details: Optional[Dict[str, Any]] = None,
    ) -> HealthCheckResult:
        """Build the failed result for this check, honoring ``critical``."""
        return HealthCheckResult(
            name=self.name,
            status=HealthStatus.UNHEALTHY if self.critical else HealthStatus.DEGRADED,
            message=message,
            duration_ms=(time.perf_counter() - start) * 1000,
            details=details if details is not None else {},
        )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@dataclass
class HealthResponse:
    """Aggregated answer to a health probe."""

    # Overall status reported for the probe.
    status: HealthStatus
    # Individual check results that make up the response.
    checks: List[HealthCheckResult]
    # Version string reported alongside the status.
    version: str = "unknown"
    # Uptime in seconds at the moment the response was built.
    uptime_seconds: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly dict view of this response.

        Every check is serialized through its own ``to_dict``; the uptime is
        rounded to one decimal place. The ``timestamp`` entry is taken from
        ``time.time()`` when this method runs, not when the response was
        created.
        """
        serialized_checks = [check.to_dict() for check in self.checks]
        payload: Dict[str, Any] = {"status": self.status.value}
        payload["checks"] = serialized_checks
        payload["version"] = self.version
        payload["uptime_seconds"] = round(self.uptime_seconds, 1)
        payload["timestamp"] = time.time()
        return payload
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class HealthEndpoint:
    """Health endpoint manager.

    Holds the registered health checks and answers the liveness, readiness
    and startup probes, plus a detailed status view for dashboards. The list
    of checks and the cache of last results are guarded by a lock.
    """

    def __init__(self) -> None:
        self._checks: List[HealthCheck] = []
        self._startup_complete = False
        self._start_time = time.time()
        self._lock = threading.Lock()
        self._last_results: Dict[str, HealthCheckResult] = {}

    def register_check(self, check: HealthCheck) -> None:
        """Register a health check.

        A check whose name is already registered is ignored.

        Args:
            check: HealthCheck to register.
        """
        with self._lock:
            # Avoid duplicates
            if any(existing.name == check.name for existing in self._checks):
                return
            self._checks.append(check)
            logger.debug("health_check_registered", name=check.name)

    def unregister_check(self, name: str) -> None:
        """Unregister a health check by name and drop its cached result.

        Args:
            name: Name of check to remove.
        """
        with self._lock:
            self._checks = [c for c in self._checks if c.name != name]
            self._last_results.pop(name, None)

    def mark_startup_complete(self) -> None:
        """Mark that startup is complete."""
        self._startup_complete = True
        logger.info("startup_marked_complete")

    @property
    def uptime_seconds(self) -> float:
        """Seconds elapsed since this endpoint object was created."""
        return time.time() - self._start_time

    async def check_liveness(self) -> HealthResponse:
        """Liveness probe - is the process alive?

        Runs no registered checks: being able to answer at all is the
        signal, so the response is always HEALTHY.

        Returns:
            HealthResponse with liveness status.
        """
        return HealthResponse(
            status=HealthStatus.HEALTHY,
            checks=[
                HealthCheckResult(
                    name="liveness",
                    status=HealthStatus.HEALTHY,
                    message="Process is alive",
                )
            ],
            version=self._get_version(),
            uptime_seconds=self.uptime_seconds,
        )

    async def check_readiness(self) -> HealthResponse:
        """Readiness probe - can we serve traffic?

        Runs all registered health checks and aggregates them with the same
        rule as the detailed status view.

        Returns:
            HealthResponse with aggregated status.
        """
        results = await self._run_all_checks()
        return HealthResponse(
            status=self._aggregate_status(results),
            checks=results,
            version=self._get_version(),
            uptime_seconds=self.uptime_seconds,
        )

    async def check_startup(self) -> HealthResponse:
        """Startup probe - has initialization completed?

        Reports UNHEALTHY until mark_startup_complete() has been called.

        Returns:
            HealthResponse with startup status.
        """
        if self._startup_complete:
            status = HealthStatus.HEALTHY
            message = "Startup complete"
        else:
            status = HealthStatus.UNHEALTHY
            message = "Startup in progress"

        return HealthResponse(
            status=status,
            checks=[
                HealthCheckResult(
                    name="startup",
                    status=status,
                    message=message,
                )
            ],
            version=self._get_version(),
            uptime_seconds=self.uptime_seconds,
        )

    async def get_detailed_status(self) -> Dict[str, Any]:
        """Get detailed health status for dashboards.

        Runs all registered checks and reports per-status counts, the mean
        check duration, each serialized result, and endpoint metadata.

        Returns:
            Detailed status dictionary.
        """
        results = await self._run_all_checks()

        total = len(results)
        healthy = sum(1 for r in results if r.status == HealthStatus.HEALTHY)
        degraded = sum(1 for r in results if r.status == HealthStatus.DEGRADED)
        unhealthy = sum(1 for r in results if r.status == HealthStatus.UNHEALTHY)

        # Guard against division by zero when nothing is registered.
        avg_duration = sum(r.duration_ms for r in results) / total if total > 0 else 0

        return {
            "status": self._aggregate_status(results).value,
            "summary": {
                "total_checks": total,
                "healthy": healthy,
                "degraded": degraded,
                "unhealthy": unhealthy,
                "avg_check_duration_ms": round(avg_duration, 2),
            },
            "checks": [r.to_dict() for r in results],
            "version": self._get_version(),
            "uptime_seconds": round(self.uptime_seconds, 1),
            "startup_complete": self._startup_complete,
            "timestamp": time.time(),
        }

    async def _run_all_checks(self) -> List[HealthCheckResult]:
        """Run all registered health checks concurrently.

        Returns:
            List of check results, in registration order.
        """
        # Snapshot under the lock so checks can be (un)registered while the
        # awaited work below is in flight.
        with self._lock:
            checks = list(self._checks)

        if not checks:
            return []

        outcomes = await asyncio.gather(
            *(check.execute() for check in checks),
            return_exceptions=True,
        )

        processed: List[HealthCheckResult] = []
        for check, outcome in zip(checks, outcomes):
            # With return_exceptions=True gather can also hand back
            # BaseException instances such as CancelledError, which are not
            # Exception subclasses; those must not be treated as results.
            if isinstance(outcome, BaseException):
                processed.append(
                    HealthCheckResult(
                        name=check.name,
                        status=HealthStatus.UNHEALTHY,
                        message=f"Check execution failed: {outcome}",
                        details={"error_type": type(outcome).__name__},
                    )
                )
            else:
                processed.append(outcome)

        # Cache results
        with self._lock:
            self._last_results.update({r.name: r for r in processed})

        return processed

    def _aggregate_status(self, results: List[HealthCheckResult]) -> HealthStatus:
        """Aggregate check results into overall status.

        Any UNHEALTHY result wins, then any DEGRADED result; an empty list
        or all-passing results aggregate to HEALTHY.
        """
        statuses = {r.status for r in results}
        if HealthStatus.UNHEALTHY in statuses:
            return HealthStatus.UNHEALTHY
        if HealthStatus.DEGRADED in statuses:
            return HealthStatus.DEGRADED
        return HealthStatus.HEALTHY

    def _get_version(self) -> str:
        """Get MCP Hangar version, or "unknown" if it cannot be imported."""
        try:
            # Imported lazily, at call time rather than module import time.
            from mcp_hangar import __version__

            return __version__
        except (ImportError, AttributeError):
            return "unknown"

    def get_last_result(self, name: str) -> Optional[HealthCheckResult]:
        """Get the last result for a specific check.

        Args:
            name: Check name.

        Returns:
            Last result or None.
        """
        with self._lock:
            return self._last_results.get(name)
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# Global singleton
_health_endpoint: Optional[HealthEndpoint] = None
# Serializes creation and reset of the singleton so concurrent first callers
# cannot each build their own endpoint (and lose checks registered on one).
_health_endpoint_lock = threading.Lock()


def get_health_endpoint() -> HealthEndpoint:
    """Get the health endpoint singleton, creating it on first use.

    Returns:
        HealthEndpoint instance.
    """
    global _health_endpoint
    endpoint = _health_endpoint
    if endpoint is None:
        with _health_endpoint_lock:
            # Re-check under the lock: another thread may have created the
            # instance between the unlocked read and acquiring the lock.
            endpoint = _health_endpoint
            if endpoint is None:
                endpoint = _health_endpoint = HealthEndpoint()
    return endpoint


def reset_health_endpoint() -> None:
    """Reset health endpoint singleton (for testing).

    The next call to get_health_endpoint() builds a fresh instance.
    """
    global _health_endpoint
    with _health_endpoint_lock:
        _health_endpoint = None
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
# Built-in health checks
|
|
401
|
+
def create_provider_health_check(providers_dict: Any) -> HealthCheck:
    """Create health check for provider availability.

    The returned check is non-critical (a failure degrades rather than marks
    unhealthy). It passes when there are no providers, or when at least half
    of them are in the "ready" state.

    Args:
        providers_dict: Providers dictionary or dict-like object. It is read
            each time the check runs, so later changes are picked up.

    Returns:
        HealthCheck instance.
    """

    def _is_ready(provider: Any) -> bool:
        state = getattr(provider, "state", None)
        if state is None:
            return False
        # Accept a state that stringifies to "ready" as well as an enum-like
        # state whose ``value`` is "ready".
        # NOTE(review): the provider state type is defined outside this file;
        # confirm how it stringifies.
        return "ready" in (str(state), str(getattr(state, "value", "")))

    def check() -> bool:
        if not providers_dict:
            return True  # No providers = healthy (vacuous)

        # This is a sync check, so HealthCheck.execute runs it in an executor
        # thread. Snapshot the values first so a provider being added or
        # removed meanwhile cannot break the iteration.
        providers = list(providers_dict.values())
        total = len(providers)
        ready = sum(1 for provider in providers if _is_ready(provider))

        # At least 50% providers should be ready
        return total == 0 or (ready / total) >= 0.5

    return HealthCheck(
        name="providers",
        check_fn=check,
        description="Check that at least 50% of providers are ready",
        critical=False,  # Degraded, not unhealthy
    )
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def create_memory_health_check(
    threshold_mb: int = 1024,
) -> HealthCheck:
    """Create health check for memory usage.

    The returned check is non-critical and passes while the process's
    resident set size is below ``threshold_mb``. Where ``/proc/self/statm``
    is readable the current RSS is used; otherwise the check falls back to
    ``ru_maxrss``, which is the peak RSS, so on those platforms a past spike
    keeps the check failing. If the ``resource`` module is unavailable the
    check passes.

    Args:
        threshold_mb: Memory threshold in MB.

    Returns:
        HealthCheck instance.
    """
    bytes_per_mb = 1024 * 1024

    def _rss_mb() -> float:
        """Return the process RSS in MB (current if possible, else peak)."""
        import platform
        import resource

        try:
            # Second field of statm is the resident set size in pages.
            with open("/proc/self/statm", encoding="ascii") as statm:
                resident_pages = int(statm.read().split()[1])
            return resident_pages * resource.getpagesize() / bytes_per_mb
        except (OSError, ValueError, IndexError):
            pass  # No usable procfs here; fall back to the peak value.

        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        # macOS reports in bytes, Linux in KB
        if platform.system() == "Darwin":
            return peak / bytes_per_mb
        return peak / 1024

    def check() -> bool:
        try:
            return _rss_mb() < threshold_mb
        except (ImportError, AttributeError):
            return True  # Can't check, assume healthy

    return HealthCheck(
        name="memory",
        check_fn=check,
        description=f"Check memory usage is below {threshold_mb}MB",
        critical=False,
    )
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def create_event_loop_health_check() -> HealthCheck:
    """Build the critical ``event_loop`` check.

    The check awaits a one-millisecond sleep and then reports success; it is
    configured with a one-second timeout.

    Returns:
        HealthCheck instance.
    """

    async def _loop_round_trip() -> bool:
        # Suspend briefly and resume; getting back here is the whole test.
        await asyncio.sleep(0.001)
        return True

    event_loop_check = HealthCheck(
        name="event_loop",
        check_fn=_loop_round_trip,
        description="Check event loop is responsive",
        timeout_seconds=1.0,
        critical=True,
    )
    return event_loop_check
|