foundry-mcp 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- foundry_mcp/__init__.py +7 -0
- foundry_mcp/cli/__init__.py +80 -0
- foundry_mcp/cli/__main__.py +9 -0
- foundry_mcp/cli/agent.py +96 -0
- foundry_mcp/cli/commands/__init__.py +37 -0
- foundry_mcp/cli/commands/cache.py +137 -0
- foundry_mcp/cli/commands/dashboard.py +148 -0
- foundry_mcp/cli/commands/dev.py +446 -0
- foundry_mcp/cli/commands/journal.py +377 -0
- foundry_mcp/cli/commands/lifecycle.py +274 -0
- foundry_mcp/cli/commands/modify.py +824 -0
- foundry_mcp/cli/commands/plan.py +633 -0
- foundry_mcp/cli/commands/pr.py +393 -0
- foundry_mcp/cli/commands/review.py +652 -0
- foundry_mcp/cli/commands/session.py +479 -0
- foundry_mcp/cli/commands/specs.py +856 -0
- foundry_mcp/cli/commands/tasks.py +807 -0
- foundry_mcp/cli/commands/testing.py +676 -0
- foundry_mcp/cli/commands/validate.py +982 -0
- foundry_mcp/cli/config.py +98 -0
- foundry_mcp/cli/context.py +259 -0
- foundry_mcp/cli/flags.py +266 -0
- foundry_mcp/cli/logging.py +212 -0
- foundry_mcp/cli/main.py +44 -0
- foundry_mcp/cli/output.py +122 -0
- foundry_mcp/cli/registry.py +110 -0
- foundry_mcp/cli/resilience.py +178 -0
- foundry_mcp/cli/transcript.py +217 -0
- foundry_mcp/config.py +850 -0
- foundry_mcp/core/__init__.py +144 -0
- foundry_mcp/core/ai_consultation.py +1636 -0
- foundry_mcp/core/cache.py +195 -0
- foundry_mcp/core/capabilities.py +446 -0
- foundry_mcp/core/concurrency.py +898 -0
- foundry_mcp/core/context.py +540 -0
- foundry_mcp/core/discovery.py +1603 -0
- foundry_mcp/core/error_collection.py +728 -0
- foundry_mcp/core/error_store.py +592 -0
- foundry_mcp/core/feature_flags.py +592 -0
- foundry_mcp/core/health.py +749 -0
- foundry_mcp/core/journal.py +694 -0
- foundry_mcp/core/lifecycle.py +412 -0
- foundry_mcp/core/llm_config.py +1350 -0
- foundry_mcp/core/llm_patterns.py +510 -0
- foundry_mcp/core/llm_provider.py +1569 -0
- foundry_mcp/core/logging_config.py +374 -0
- foundry_mcp/core/metrics_persistence.py +584 -0
- foundry_mcp/core/metrics_registry.py +327 -0
- foundry_mcp/core/metrics_store.py +641 -0
- foundry_mcp/core/modifications.py +224 -0
- foundry_mcp/core/naming.py +123 -0
- foundry_mcp/core/observability.py +1216 -0
- foundry_mcp/core/otel.py +452 -0
- foundry_mcp/core/otel_stubs.py +264 -0
- foundry_mcp/core/pagination.py +255 -0
- foundry_mcp/core/progress.py +317 -0
- foundry_mcp/core/prometheus.py +577 -0
- foundry_mcp/core/prompts/__init__.py +464 -0
- foundry_mcp/core/prompts/fidelity_review.py +546 -0
- foundry_mcp/core/prompts/markdown_plan_review.py +511 -0
- foundry_mcp/core/prompts/plan_review.py +623 -0
- foundry_mcp/core/providers/__init__.py +225 -0
- foundry_mcp/core/providers/base.py +476 -0
- foundry_mcp/core/providers/claude.py +460 -0
- foundry_mcp/core/providers/codex.py +619 -0
- foundry_mcp/core/providers/cursor_agent.py +642 -0
- foundry_mcp/core/providers/detectors.py +488 -0
- foundry_mcp/core/providers/gemini.py +405 -0
- foundry_mcp/core/providers/opencode.py +616 -0
- foundry_mcp/core/providers/opencode_wrapper.js +302 -0
- foundry_mcp/core/providers/package-lock.json +24 -0
- foundry_mcp/core/providers/package.json +25 -0
- foundry_mcp/core/providers/registry.py +607 -0
- foundry_mcp/core/providers/test_provider.py +171 -0
- foundry_mcp/core/providers/validation.py +729 -0
- foundry_mcp/core/rate_limit.py +427 -0
- foundry_mcp/core/resilience.py +600 -0
- foundry_mcp/core/responses.py +934 -0
- foundry_mcp/core/review.py +366 -0
- foundry_mcp/core/security.py +438 -0
- foundry_mcp/core/spec.py +1650 -0
- foundry_mcp/core/task.py +1289 -0
- foundry_mcp/core/testing.py +450 -0
- foundry_mcp/core/validation.py +2081 -0
- foundry_mcp/dashboard/__init__.py +32 -0
- foundry_mcp/dashboard/app.py +119 -0
- foundry_mcp/dashboard/components/__init__.py +17 -0
- foundry_mcp/dashboard/components/cards.py +88 -0
- foundry_mcp/dashboard/components/charts.py +234 -0
- foundry_mcp/dashboard/components/filters.py +136 -0
- foundry_mcp/dashboard/components/tables.py +195 -0
- foundry_mcp/dashboard/data/__init__.py +11 -0
- foundry_mcp/dashboard/data/stores.py +433 -0
- foundry_mcp/dashboard/launcher.py +289 -0
- foundry_mcp/dashboard/views/__init__.py +12 -0
- foundry_mcp/dashboard/views/errors.py +217 -0
- foundry_mcp/dashboard/views/metrics.py +174 -0
- foundry_mcp/dashboard/views/overview.py +160 -0
- foundry_mcp/dashboard/views/providers.py +83 -0
- foundry_mcp/dashboard/views/sdd_workflow.py +255 -0
- foundry_mcp/dashboard/views/tool_usage.py +139 -0
- foundry_mcp/prompts/__init__.py +9 -0
- foundry_mcp/prompts/workflows.py +525 -0
- foundry_mcp/resources/__init__.py +9 -0
- foundry_mcp/resources/specs.py +591 -0
- foundry_mcp/schemas/__init__.py +38 -0
- foundry_mcp/schemas/sdd-spec-schema.json +386 -0
- foundry_mcp/server.py +164 -0
- foundry_mcp/tools/__init__.py +10 -0
- foundry_mcp/tools/unified/__init__.py +71 -0
- foundry_mcp/tools/unified/authoring.py +1487 -0
- foundry_mcp/tools/unified/context_helpers.py +98 -0
- foundry_mcp/tools/unified/documentation_helpers.py +198 -0
- foundry_mcp/tools/unified/environment.py +939 -0
- foundry_mcp/tools/unified/error.py +462 -0
- foundry_mcp/tools/unified/health.py +225 -0
- foundry_mcp/tools/unified/journal.py +841 -0
- foundry_mcp/tools/unified/lifecycle.py +632 -0
- foundry_mcp/tools/unified/metrics.py +777 -0
- foundry_mcp/tools/unified/plan.py +745 -0
- foundry_mcp/tools/unified/pr.py +294 -0
- foundry_mcp/tools/unified/provider.py +629 -0
- foundry_mcp/tools/unified/review.py +685 -0
- foundry_mcp/tools/unified/review_helpers.py +299 -0
- foundry_mcp/tools/unified/router.py +102 -0
- foundry_mcp/tools/unified/server.py +580 -0
- foundry_mcp/tools/unified/spec.py +808 -0
- foundry_mcp/tools/unified/task.py +2202 -0
- foundry_mcp/tools/unified/test.py +370 -0
- foundry_mcp/tools/unified/verification.py +520 -0
- foundry_mcp-0.3.3.dist-info/METADATA +337 -0
- foundry_mcp-0.3.3.dist-info/RECORD +135 -0
- foundry_mcp-0.3.3.dist-info/WHEEL +4 -0
- foundry_mcp-0.3.3.dist-info/entry_points.txt +3 -0
- foundry_mcp-0.3.3.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,749 @@
|
|
|
1
|
+
"""Health check system for foundry-mcp.
|
|
2
|
+
|
|
3
|
+
Provides Kubernetes-style health probes (liveness, readiness, health)
|
|
4
|
+
with pluggable dependency checkers and configurable thresholds.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from foundry_mcp.core.health import (
|
|
8
|
+
get_health_manager,
|
|
9
|
+
HealthStatus,
|
|
10
|
+
check_liveness,
|
|
11
|
+
check_readiness,
|
|
12
|
+
check_health,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
# Quick checks
|
|
16
|
+
if check_liveness().is_healthy:
|
|
17
|
+
print("Server is alive")
|
|
18
|
+
|
|
19
|
+
# Full health check with details
|
|
20
|
+
result = check_health()
|
|
21
|
+
print(f"Status: {result.status.value}")
|
|
22
|
+
print(f"Dependencies: {result.dependencies}")
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import logging
import shutil
import threading
import time
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class HealthStatus(str, Enum):
    """Three-level health verdict used by every probe in this module.

    Members are also plain strings, so they compare equal to their values
    and serialize without extra handling.
    """

    # Everything checked is working normally.
    HEALTHY = "healthy"
    # Still usable, but at least one check raised a warning.
    DEGRADED = "degraded"
    # At least one check failed outright.
    UNHEALTHY = "unhealthy"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class DependencyHealth:
    """Outcome of probing one dependency.

    ``healthy`` and ``status`` are stored independently: a dependency may be
    reported as healthy while its status is DEGRADED (a warning).
    """

    name: str  # identifier of the dependency that was probed
    healthy: bool  # False means the probe counts as a failure
    status: HealthStatus  # finer-grained verdict than ``healthy``
    message: str = ""  # human-readable summary; omitted from dicts when empty
    latency_ms: Optional[float] = None  # probe duration, when it was measured
    details: Dict[str, Any] = field(default_factory=dict)  # extra probe data

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly mapping of this result.

        ``name``, ``healthy`` and ``status`` are always present. ``message``
        and ``details`` appear only when non-empty, and ``latency_ms``
        (rounded to two decimals) only when it is not ``None``.
        """
        return {
            "name": self.name,
            "healthy": self.healthy,
            "status": self.status.value,
            **({"message": self.message} if self.message else {}),
            **(
                {"latency_ms": round(self.latency_ms, 2)}
                if self.latency_ms is not None
                else {}
            ),
            **({"details": self.details} if self.details else {}),
        }
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class HealthResult:
    """Aggregated outcome of a liveness, readiness or full health probe."""

    status: HealthStatus  # overall verdict for the probe
    is_healthy: bool  # True for HEALTHY and for DEGRADED-but-usable results
    message: str = ""  # human-readable summary; omitted from dicts when empty
    timestamp: float = field(default_factory=time.time)  # time.time() at creation
    dependencies: List[DependencyHealth] = field(default_factory=list)  # per-check results
    details: Dict[str, Any] = field(default_factory=dict)  # extra aggregate data

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-friendly mapping of this result.

        ``status``, ``is_healthy`` and ``timestamp`` are always included;
        ``message``, ``dependencies`` (each converted with its own
        ``to_dict``) and ``details`` are added only when non-empty.
        """
        payload: Dict[str, Any] = {}
        payload.update(
            status=self.status.value,
            is_healthy=self.is_healthy,
            timestamp=self.timestamp,
        )
        if self.message:
            payload.update(message=self.message)
        if self.dependencies:
            payload.update(
                dependencies=[dep.to_dict() for dep in self.dependencies]
            )
        if self.details:
            payload.update(details=self.details)
        return payload
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class DependencyChecker(Protocol):
    """Structural interface satisfied by every dependency health checker.

    Any object exposing a ``name`` and a ``check`` method of this shape
    matches the protocol; no inheritance is required.
    """

    @property
    def name(self) -> str:
        """Return the unique identifier of the dependency being checked."""
        ...

    def check(self, timeout: float = 5.0) -> DependencyHealth:
        """Probe the dependency and report the outcome.

        Args:
            timeout: Time budget for the probe, in seconds.

        Returns:
            A DependencyHealth describing what the probe found.
        """
        ...
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# =============================================================================
|
|
114
|
+
# Built-in Dependency Checkers
|
|
115
|
+
# =============================================================================
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class SpecsDirectoryChecker:
    """Verify that the specs directory exists, is a directory and is listable."""

    name = "specs_directory"

    def __init__(self, specs_dir: Optional[Path] = None):
        """Store the directory to probe.

        Args:
            specs_dir: Directory to check. When ``None``, the first call to
                ``check`` looks it up from the foundry-mcp configuration and
                remembers the value on this instance.
        """
        self.specs_dir = specs_dir

    def _diagnose(self) -> Optional[str]:
        """Return a description of the first problem found, or ``None``."""
        if self.specs_dir is None:
            # Imported lazily so the configuration module is only needed when
            # no explicit directory was supplied.
            from foundry_mcp.config import get_config

            config = get_config()
            self.specs_dir = config.specs_dir if config else None

        target = self.specs_dir
        if target is None:
            return "specs_dir not configured"
        if not target.exists():
            return f"specs_dir does not exist: {target}"
        if not target.is_dir():
            return f"specs_dir is not a directory: {target}"

        # Listing the directory is the readability probe.
        try:
            list(target.iterdir())
        except PermissionError:
            return f"specs_dir not readable: {target}"
        return None

    def check(self, timeout: float = 5.0) -> DependencyHealth:
        """Probe the specs directory.

        Args:
            timeout: Accepted for interface compatibility; this checker does
                not use it.

        Returns:
            HEALTHY (with the path in ``details``) when the directory is
            accessible, otherwise UNHEALTHY with a message naming the
            problem. Unexpected errors are reported as UNHEALTHY, not raised.
        """
        began = time.perf_counter()

        def report(ok: bool, message: str, **extra: Any) -> DependencyHealth:
            # Latency is measured at the moment the result is built.
            return DependencyHealth(
                name=self.name,
                healthy=ok,
                status=HealthStatus.HEALTHY if ok else HealthStatus.UNHEALTHY,
                message=message,
                latency_ms=(time.perf_counter() - began) * 1000,
                **extra,
            )

        try:
            problem = self._diagnose()
            if problem is not None:
                return report(False, problem)
            return report(
                True,
                "specs_dir accessible",
                details={"path": str(self.specs_dir)},
            )
        except Exception as exc:
            return report(False, f"Error checking specs_dir: {exc}")
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class DiskSpaceChecker:
    """Compare free disk space against a failure and a warning threshold."""

    name = "disk_space"

    def __init__(
        self,
        path: Optional[Path] = None,
        threshold_mb: int = 100,
        warning_mb: int = 500,
    ):
        """Configure the probe.

        Args:
            path: Location whose filesystem is measured; defaults to the
                current directory (``Path(".")``).
            threshold_mb: Free space below this is reported as UNHEALTHY.
            warning_mb: Free space below this (but not below
                ``threshold_mb``) is reported as DEGRADED.
        """
        self.path = path or Path(".")
        self.threshold_mb = threshold_mb
        self.warning_mb = warning_mb

    def check(self, timeout: float = 5.0) -> DependencyHealth:
        """Measure free space and grade it against the thresholds.

        Args:
            timeout: Accepted for interface compatibility; this checker does
                not use it.

        Returns:
            A DependencyHealth whose ``details`` carry the path, free and
            total space (bytes divided by 1024*1024) and the failure
            threshold. Errors while measuring are reported as UNHEALTHY
            without ``details`` instead of being raised.
        """
        began = time.perf_counter()
        try:
            usage = shutil.disk_usage(self.path)
            free_mb = usage.free / (1024 * 1024)

            measurements = {
                "path": str(self.path),
                "free_mb": round(free_mb, 2),
                "total_mb": round(usage.total / (1024 * 1024), 2),
                "threshold_mb": self.threshold_mb,
            }

            # Decide the verdict first, then build the result in one place.
            if free_mb < self.threshold_mb:
                ok, verdict, summary = (
                    False,
                    HealthStatus.UNHEALTHY,
                    f"Disk space critically low: {free_mb:.1f}MB free",
                )
            elif free_mb < self.warning_mb:
                ok, verdict, summary = (
                    True,
                    HealthStatus.DEGRADED,
                    f"Disk space low: {free_mb:.1f}MB free",
                )
            else:
                ok, verdict, summary = (
                    True,
                    HealthStatus.HEALTHY,
                    f"Disk space OK: {free_mb:.1f}MB free",
                )

            return DependencyHealth(
                name=self.name,
                healthy=ok,
                status=verdict,
                message=summary,
                latency_ms=(time.perf_counter() - began) * 1000,
                details=measurements,
            )
        except Exception as exc:
            return DependencyHealth(
                name=self.name,
                healthy=False,
                status=HealthStatus.UNHEALTHY,
                message=f"Error checking disk space: {exc}",
                latency_ms=(time.perf_counter() - began) * 1000,
            )
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
class OpenTelemetryChecker:
    """Report whether OpenTelemetry tracing is switched on.

    Tracing is optional, so this checker never marks the dependency as
    unhealthy: disabled tracing is HEALTHY and a failed probe is DEGRADED.
    """

    name = "opentelemetry"

    def check(self, timeout: float = 5.0) -> DependencyHealth:
        """Ask the observability manager whether tracing is enabled.

        Args:
            timeout: Accepted for interface compatibility; this checker does
                not use it.

        Returns:
            A healthy DependencyHealth whose ``details["enabled"]`` reflects
            the tracing state, or a healthy-but-DEGRADED one if the lookup
            raised.
        """
        began = time.perf_counter()
        try:
            # Imported lazily so an import failure lands in the handler below.
            from foundry_mcp.core.observability import get_observability_manager

            tracing_on = bool(get_observability_manager().is_tracing_enabled())
            summary = (
                "OpenTelemetry tracing enabled"
                if tracing_on
                else "OpenTelemetry tracing disabled (optional)"
            )
            # Disabled is a valid state, so both outcomes are HEALTHY.
            return DependencyHealth(
                name=self.name,
                healthy=True,
                status=HealthStatus.HEALTHY,
                message=summary,
                latency_ms=(time.perf_counter() - began) * 1000,
                details={"enabled": tracing_on},
            )
        except Exception as exc:
            # Tracing problems must not fail the overall health check.
            return DependencyHealth(
                name=self.name,
                healthy=True,
                status=HealthStatus.DEGRADED,
                message=f"OpenTelemetry check failed: {exc}",
                latency_ms=(time.perf_counter() - began) * 1000,
            )
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class PrometheusChecker:
    """Report whether Prometheus metrics are switched on.

    Metrics are optional, so this checker never marks the dependency as
    unhealthy: disabled metrics are HEALTHY and a failed probe is DEGRADED.
    """

    name = "prometheus"

    def check(self, timeout: float = 5.0) -> DependencyHealth:
        """Ask the observability manager whether metrics are enabled.

        Args:
            timeout: Accepted for interface compatibility; this checker does
                not use it.

        Returns:
            A healthy DependencyHealth whose ``details["enabled"]`` reflects
            the metrics state, or a healthy-but-DEGRADED one if the lookup
            raised.
        """
        began = time.perf_counter()
        try:
            # Imported lazily so an import failure lands in the handler below.
            from foundry_mcp.core.observability import get_observability_manager

            metrics_on = bool(get_observability_manager().is_metrics_enabled())
            summary = (
                "Prometheus metrics enabled"
                if metrics_on
                else "Prometheus metrics disabled (optional)"
            )
            # Disabled is a valid state, so both outcomes are HEALTHY.
            return DependencyHealth(
                name=self.name,
                healthy=True,
                status=HealthStatus.HEALTHY,
                message=summary,
                latency_ms=(time.perf_counter() - began) * 1000,
                details={"enabled": metrics_on},
            )
        except Exception as exc:
            # Metrics problems must not fail the overall health check.
            return DependencyHealth(
                name=self.name,
                healthy=True,
                status=HealthStatus.DEGRADED,
                message=f"Prometheus check failed: {exc}",
                latency_ms=(time.perf_counter() - began) * 1000,
            )
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
class AIProviderChecker:
    """Report which AI providers are currently available.

    Providers are optional, so every outcome is reported with
    ``healthy=True``; the status is DEGRADED when none are available or the
    probe itself fails.
    """

    name = "ai_provider"

    def check(self, timeout: float = 5.0) -> DependencyHealth:
        """Query the provider registry and summarize availability.

        Args:
            timeout: Accepted for interface compatibility; this checker does
                not use it.

        Returns:
            HEALTHY with the provider list and statuses when at least one
            provider is available, DEGRADED when none are or the query
            raised, and HEALTHY when the provider module cannot be imported.
        """
        began = time.perf_counter()

        def elapsed_ms() -> float:
            return (time.perf_counter() - began) * 1000

        try:
            from foundry_mcp.core.providers import (
                available_providers,
                get_provider_statuses,
            )

            usable = available_providers()
            per_provider = get_provider_statuses()

            if not usable:
                # An empty provider set is a warning, not a failure.
                return DependencyHealth(
                    name=self.name,
                    healthy=True,
                    status=HealthStatus.DEGRADED,
                    message="No AI providers available (optional)",
                    latency_ms=elapsed_ms(),
                    details={"available": [], "statuses": {}},
                )

            return DependencyHealth(
                name=self.name,
                healthy=True,
                status=HealthStatus.HEALTHY,
                message=f"AI providers available: {', '.join(usable)}",
                latency_ms=elapsed_ms(),
                details={
                    "available": usable,
                    # Per the original author's note, this maps provider
                    # name to a bool rather than to an enum value.
                    "statuses": per_provider,
                },
            )
        except ImportError:
            # The provider package is optional; its absence is not a problem.
            return DependencyHealth(
                name=self.name,
                healthy=True,
                status=HealthStatus.HEALTHY,
                message="AI provider module not available (optional)",
                latency_ms=elapsed_ms(),
            )
        except Exception as exc:
            return DependencyHealth(
                name=self.name,
                healthy=True,
                status=HealthStatus.DEGRADED,
                message=f"AI provider check failed: {exc}",
                latency_ms=elapsed_ms(),
            )
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
# =============================================================================
|
|
406
|
+
# Health Manager
|
|
407
|
+
# =============================================================================
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
@dataclass
class HealthConfig:
    """Settings that control the health probes.

    Attributes:
        enabled: When False, every probe short-circuits to a healthy result.
        liveness_timeout: Timeout handed to liveness checkers (seconds).
        readiness_timeout: Timeout handed to readiness checkers (seconds).
        health_timeout: Timeout handed to full health checkers (seconds).
        disk_space_threshold_mb: Free disk space below which the disk check
            is unhealthy.
        disk_space_warning_mb: Free disk space below which the disk check
            is degraded.
    """

    enabled: bool = True
    liveness_timeout: float = 1.0
    readiness_timeout: float = 5.0
    health_timeout: float = 10.0
    disk_space_threshold_mb: int = 100
    disk_space_warning_mb: int = 500

    @classmethod
    def from_toml_dict(cls, data: Dict[str, Any]) -> "HealthConfig":
        """Build a config from a parsed TOML table.

        Keys missing from ``data`` fall back to the values below; keys that
        are not listed are ignored. Values are used as given, without type
        conversion.
        """
        fallbacks = (
            ("enabled", True),
            ("liveness_timeout", 1.0),
            ("readiness_timeout", 5.0),
            ("health_timeout", 10.0),
            ("disk_space_threshold_mb", 100),
            ("disk_space_warning_mb", 500),
        )
        settings = {key: data.get(key, fallback) for key, fallback in fallbacks}
        return cls(**settings)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
class HealthManager:
    """Manages health checks for the foundry-mcp server.

    Provides three levels of health checks:
    - Liveness: Is the process running? (always true if this code executes)
    - Readiness: Can the server handle requests? (checks critical deps)
    - Health: Full health status (all dependencies)

    When ``config.enabled`` is False every probe returns a healthy result
    without running any checker.
    """

    def __init__(self, config: Optional[HealthConfig] = None):
        """Create a manager and install the default checkers.

        Args:
            config: Probe settings; a default ``HealthConfig`` is used when
                omitted.
        """
        self.config = config or HealthConfig()
        self._liveness_checkers: List[DependencyChecker] = []
        self._readiness_checkers: List[DependencyChecker] = []
        self._health_checkers: List[DependencyChecker] = []
        self._setup_default_checkers()

    def _setup_default_checkers(self) -> None:
        """Install the built-in checkers.

        Readiness uses the specs-directory and disk-space checkers. The full
        health check reuses those same two instances and adds the
        OpenTelemetry, Prometheus and AI-provider checkers. No liveness
        checker is installed by default.
        """
        # Readiness checks - critical for serving requests
        specs_checker = SpecsDirectoryChecker()
        disk_checker = DiskSpaceChecker(
            threshold_mb=self.config.disk_space_threshold_mb,
            warning_mb=self.config.disk_space_warning_mb,
        )

        self._readiness_checkers = [specs_checker, disk_checker]

        # Health checks - full system status
        self._health_checkers = [
            specs_checker,
            disk_checker,
            OpenTelemetryChecker(),
            PrometheusChecker(),
            AIProviderChecker(),
        ]

    def register_checker(
        self,
        checker: DependencyChecker,
        *,
        liveness: bool = False,
        readiness: bool = False,
        health: bool = True,
    ) -> None:
        """Register a custom dependency checker.

        Args:
            checker: The dependency checker to register
            liveness: Include in liveness checks
            readiness: Include in readiness checks
            health: Include in full health checks (default True)
        """
        if liveness:
            self._liveness_checkers.append(checker)
        if readiness:
            self._readiness_checkers.append(checker)
        if health:
            self._health_checkers.append(checker)

    @staticmethod
    def _disabled_result() -> HealthResult:
        """Return the healthy result used when health checks are disabled."""
        return HealthResult(
            status=HealthStatus.HEALTHY,
            is_healthy=True,
            message="Health checks disabled",
        )

    @staticmethod
    def _run_checkers(
        checkers: List[DependencyChecker], timeout: float
    ) -> List[DependencyHealth]:
        """Run each checker and collect one result per checker, in order.

        The timeout is only forwarded to ``checker.check``; honoring it is
        up to the checker. A checker that raises is recorded as an UNHEALTHY
        dependency instead of aborting the whole probe.
        """
        results: List[DependencyHealth] = []
        for checker in checkers:
            try:
                results.append(checker.check(timeout=timeout))
            except Exception as exc:
                results.append(
                    DependencyHealth(
                        # Fall back to the class name so a checker without a
                        # ``name`` cannot break the failure path itself.
                        name=getattr(checker, "name", type(checker).__name__),
                        healthy=False,
                        status=HealthStatus.UNHEALTHY,
                        message=f"Check failed: {exc}",
                    )
                )
        return results

    def check_liveness(self) -> HealthResult:
        """Check if the server is alive.

        Liveness checks are intentionally minimal - if this code runs,
        we're alive. Custom checkers can be added for process-level health.

        Returns:
            HealthResult indicating liveness status
        """
        if not self.config.enabled:
            return self._disabled_result()

        dependencies = self._run_checkers(
            self._liveness_checkers, self.config.liveness_timeout
        )

        unhealthy = [d for d in dependencies if not d.healthy]
        if unhealthy:
            return HealthResult(
                status=HealthStatus.UNHEALTHY,
                is_healthy=False,
                message=f"Liveness check failed: {unhealthy[0].message}",
                dependencies=dependencies,
            )

        # With no liveness checkers registered the list is empty and the
        # server is simply reported as alive.
        return HealthResult(
            status=HealthStatus.HEALTHY,
            is_healthy=True,
            message="Server is alive",
            dependencies=dependencies,
        )

    def check_readiness(self) -> HealthResult:
        """Check if the server is ready to handle requests.

        Readiness checks verify critical dependencies are available.

        Returns:
            HealthResult indicating readiness status
        """
        if not self.config.enabled:
            return self._disabled_result()

        dependencies = self._run_checkers(
            self._readiness_checkers, self.config.readiness_timeout
        )

        # Any unhealthy dependency blocks readiness; degraded ones only warn.
        unhealthy = [d for d in dependencies if not d.healthy]
        degraded = [d for d in dependencies if d.status == HealthStatus.DEGRADED]

        if unhealthy:
            return HealthResult(
                status=HealthStatus.UNHEALTHY,
                is_healthy=False,
                message=f"Not ready: {unhealthy[0].message}",
                dependencies=dependencies,
            )

        if degraded:
            return HealthResult(
                status=HealthStatus.DEGRADED,
                is_healthy=True,  # Still ready, but degraded
                message=f"Ready with warnings: {degraded[0].message}",
                dependencies=dependencies,
            )

        return HealthResult(
            status=HealthStatus.HEALTHY,
            is_healthy=True,
            message="Server is ready",
            dependencies=dependencies,
        )

    def check_health(self) -> HealthResult:
        """Perform a full health check of all dependencies.

        Returns:
            HealthResult with complete system health status. ``details``
            holds ``unhealthy_count``, ``degraded_count`` and
            ``healthy_count``, which always sum to the number of
            dependencies checked.
        """
        if not self.config.enabled:
            return self._disabled_result()

        dependencies = self._run_checkers(
            self._health_checkers, self.config.health_timeout
        )

        # Each dependency lands in exactly one bucket. A dependency reported
        # as not healthy counts as unhealthy even if its status says
        # DEGRADED, so it is never counted twice.
        unhealthy = [d for d in dependencies if not d.healthy]
        degraded = [
            d
            for d in dependencies
            if d.healthy and d.status == HealthStatus.DEGRADED
        ]
        counts = {
            "unhealthy_count": len(unhealthy),
            "degraded_count": len(degraded),
            "healthy_count": len(dependencies) - len(unhealthy) - len(degraded),
        }

        if unhealthy:
            return HealthResult(
                status=HealthStatus.UNHEALTHY,
                is_healthy=False,
                message=f"Unhealthy: {len(unhealthy)} failed check(s)",
                dependencies=dependencies,
                details=counts,
            )

        if degraded:
            return HealthResult(
                status=HealthStatus.DEGRADED,
                is_healthy=True,
                message=f"Degraded: {len(degraded)} warning(s)",
                dependencies=dependencies,
                details=counts,
            )

        return HealthResult(
            status=HealthStatus.HEALTHY,
            is_healthy=True,
            message="All systems healthy",
            dependencies=dependencies,
            details=counts,
        )
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
# =============================================================================
|
|
689
|
+
# Global Manager Instance
|
|
690
|
+
# =============================================================================
|
|
691
|
+
|
|
692
|
+
# Process-wide HealthManager singleton; created lazily by get_health_manager()
# and cleared by reset_health_manager().
_health_manager: Optional[HealthManager] = None
# Serializes creation and reset of the singleton across threads.
_manager_lock = threading.Lock()
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def get_health_manager(config: Optional[HealthConfig] = None) -> HealthManager:
    """Return the shared health manager, creating it on first use.

    Args:
        config: Settings for the manager. Only consulted when the manager is
            actually created; ignored once one already exists.

    Returns:
        The module-wide HealthManager instance.
    """
    global _health_manager

    # Fast path: an existing manager is returned without taking the lock.
    if _health_manager is not None:
        return _health_manager

    with _manager_lock:
        # Re-check under the lock in case another caller created it first.
        if _health_manager is None:
            _health_manager = HealthManager(config)
        return _health_manager
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def reset_health_manager() -> None:
    """Discard the shared health manager (intended for tests).

    The next call to ``get_health_manager`` builds a fresh instance.
    """
    global _health_manager
    # Hold the creation lock so a reset cannot interleave with creation.
    with _manager_lock:
        _health_manager = None
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
# =============================================================================
|
|
721
|
+
# Convenience Functions
|
|
722
|
+
# =============================================================================
|
|
723
|
+
|
|
724
|
+
|
|
725
|
+
def check_liveness() -> HealthResult:
    """Run the liveness probe on the shared health manager.

    Returns:
        HealthResult telling whether the server is alive.
    """
    manager = get_health_manager()
    return manager.check_liveness()
|
|
732
|
+
|
|
733
|
+
|
|
734
|
+
def check_readiness() -> HealthResult:
    """Run the readiness probe on the shared health manager.

    Returns:
        HealthResult telling whether the server is ready for requests.
    """
    manager = get_health_manager()
    return manager.check_readiness()
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
def check_health() -> HealthResult:
    """Run the full health check on the shared health manager.

    Returns:
        HealthResult covering every registered health checker.
    """
    manager = get_health_manager()
    return manager.check_health()
|