svc-infra 0.1.589__py3-none-any.whl → 0.1.706__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of svc-infra might be problematic. Click here for more details.
- svc_infra/__init__.py +58 -2
- svc_infra/apf_payments/README.md +732 -0
- svc_infra/apf_payments/models.py +133 -42
- svc_infra/apf_payments/provider/__init__.py +4 -0
- svc_infra/apf_payments/provider/aiydan.py +871 -0
- svc_infra/apf_payments/provider/base.py +30 -9
- svc_infra/apf_payments/provider/stripe.py +156 -62
- svc_infra/apf_payments/schemas.py +19 -10
- svc_infra/apf_payments/service.py +211 -68
- svc_infra/apf_payments/settings.py +27 -3
- svc_infra/api/__init__.py +61 -0
- svc_infra/api/fastapi/__init__.py +15 -0
- svc_infra/api/fastapi/admin/__init__.py +3 -0
- svc_infra/api/fastapi/admin/add.py +245 -0
- svc_infra/api/fastapi/apf_payments/router.py +145 -46
- svc_infra/api/fastapi/apf_payments/setup.py +26 -8
- svc_infra/api/fastapi/auth/__init__.py +65 -0
- svc_infra/api/fastapi/auth/_cookies.py +6 -2
- svc_infra/api/fastapi/auth/add.py +27 -14
- svc_infra/api/fastapi/auth/gaurd.py +104 -13
- svc_infra/api/fastapi/auth/mfa/models.py +3 -1
- svc_infra/api/fastapi/auth/mfa/pre_auth.py +10 -6
- svc_infra/api/fastapi/auth/mfa/router.py +15 -8
- svc_infra/api/fastapi/auth/mfa/security.py +1 -2
- svc_infra/api/fastapi/auth/mfa/utils.py +2 -1
- svc_infra/api/fastapi/auth/mfa/verify.py +9 -2
- svc_infra/api/fastapi/auth/policy.py +0 -1
- svc_infra/api/fastapi/auth/providers.py +3 -1
- svc_infra/api/fastapi/auth/routers/apikey_router.py +6 -6
- svc_infra/api/fastapi/auth/routers/oauth_router.py +214 -75
- svc_infra/api/fastapi/auth/routers/session_router.py +67 -0
- svc_infra/api/fastapi/auth/security.py +31 -10
- svc_infra/api/fastapi/auth/sender.py +8 -1
- svc_infra/api/fastapi/auth/settings.py +2 -0
- svc_infra/api/fastapi/auth/state.py +3 -1
- svc_infra/api/fastapi/auth/ws_security.py +275 -0
- svc_infra/api/fastapi/billing/router.py +73 -0
- svc_infra/api/fastapi/billing/setup.py +19 -0
- svc_infra/api/fastapi/cache/add.py +9 -5
- svc_infra/api/fastapi/db/__init__.py +5 -1
- svc_infra/api/fastapi/db/http.py +3 -1
- svc_infra/api/fastapi/db/nosql/__init__.py +39 -1
- svc_infra/api/fastapi/db/nosql/mongo/add.py +47 -32
- svc_infra/api/fastapi/db/nosql/mongo/crud_router.py +30 -11
- svc_infra/api/fastapi/db/sql/__init__.py +5 -1
- svc_infra/api/fastapi/db/sql/add.py +71 -26
- svc_infra/api/fastapi/db/sql/crud_router.py +210 -22
- svc_infra/api/fastapi/db/sql/health.py +3 -1
- svc_infra/api/fastapi/db/sql/session.py +18 -0
- svc_infra/api/fastapi/db/sql/users.py +29 -5
- svc_infra/api/fastapi/dependencies/ratelimit.py +130 -0
- svc_infra/api/fastapi/docs/add.py +173 -0
- svc_infra/api/fastapi/docs/landing.py +4 -2
- svc_infra/api/fastapi/docs/scoped.py +62 -15
- svc_infra/api/fastapi/dual/__init__.py +12 -2
- svc_infra/api/fastapi/dual/dualize.py +1 -1
- svc_infra/api/fastapi/dual/protected.py +126 -4
- svc_infra/api/fastapi/dual/public.py +25 -0
- svc_infra/api/fastapi/dual/router.py +40 -13
- svc_infra/api/fastapi/dx.py +33 -2
- svc_infra/api/fastapi/ease.py +10 -2
- svc_infra/api/fastapi/http/concurrency.py +2 -1
- svc_infra/api/fastapi/http/conditional.py +3 -1
- svc_infra/api/fastapi/middleware/debug.py +4 -1
- svc_infra/api/fastapi/middleware/errors/catchall.py +6 -2
- svc_infra/api/fastapi/middleware/errors/exceptions.py +1 -1
- svc_infra/api/fastapi/middleware/errors/handlers.py +54 -8
- svc_infra/api/fastapi/middleware/graceful_shutdown.py +104 -0
- svc_infra/api/fastapi/middleware/idempotency.py +197 -70
- svc_infra/api/fastapi/middleware/idempotency_store.py +187 -0
- svc_infra/api/fastapi/middleware/optimistic_lock.py +42 -0
- svc_infra/api/fastapi/middleware/ratelimit.py +143 -31
- svc_infra/api/fastapi/middleware/ratelimit_store.py +111 -0
- svc_infra/api/fastapi/middleware/request_id.py +27 -11
- svc_infra/api/fastapi/middleware/request_size_limit.py +36 -0
- svc_infra/api/fastapi/middleware/timeout.py +177 -0
- svc_infra/api/fastapi/openapi/apply.py +5 -3
- svc_infra/api/fastapi/openapi/conventions.py +9 -2
- svc_infra/api/fastapi/openapi/mutators.py +165 -20
- svc_infra/api/fastapi/openapi/pipeline.py +1 -1
- svc_infra/api/fastapi/openapi/security.py +3 -1
- svc_infra/api/fastapi/ops/add.py +75 -0
- svc_infra/api/fastapi/pagination.py +47 -20
- svc_infra/api/fastapi/routers/__init__.py +43 -15
- svc_infra/api/fastapi/routers/ping.py +1 -0
- svc_infra/api/fastapi/setup.py +188 -56
- svc_infra/api/fastapi/tenancy/add.py +19 -0
- svc_infra/api/fastapi/tenancy/context.py +112 -0
- svc_infra/api/fastapi/versioned.py +101 -0
- svc_infra/app/README.md +5 -5
- svc_infra/app/__init__.py +3 -1
- svc_infra/app/env.py +69 -1
- svc_infra/app/logging/add.py +9 -2
- svc_infra/app/logging/formats.py +12 -5
- svc_infra/billing/__init__.py +23 -0
- svc_infra/billing/async_service.py +147 -0
- svc_infra/billing/jobs.py +241 -0
- svc_infra/billing/models.py +177 -0
- svc_infra/billing/quotas.py +103 -0
- svc_infra/billing/schemas.py +36 -0
- svc_infra/billing/service.py +123 -0
- svc_infra/bundled_docs/README.md +5 -0
- svc_infra/bundled_docs/__init__.py +1 -0
- svc_infra/bundled_docs/getting-started.md +6 -0
- svc_infra/cache/__init__.py +9 -0
- svc_infra/cache/add.py +170 -0
- svc_infra/cache/backend.py +7 -6
- svc_infra/cache/decorators.py +81 -15
- svc_infra/cache/demo.py +2 -2
- svc_infra/cache/keys.py +24 -4
- svc_infra/cache/recache.py +26 -14
- svc_infra/cache/resources.py +14 -5
- svc_infra/cache/tags.py +19 -44
- svc_infra/cache/utils.py +3 -1
- svc_infra/cli/__init__.py +52 -8
- svc_infra/cli/__main__.py +4 -0
- svc_infra/cli/cmds/__init__.py +39 -2
- svc_infra/cli/cmds/db/nosql/mongo/mongo_cmds.py +7 -4
- svc_infra/cli/cmds/db/nosql/mongo/mongo_scaffold_cmds.py +7 -5
- svc_infra/cli/cmds/db/ops_cmds.py +270 -0
- svc_infra/cli/cmds/db/sql/alembic_cmds.py +103 -18
- svc_infra/cli/cmds/db/sql/sql_export_cmds.py +88 -0
- svc_infra/cli/cmds/db/sql/sql_scaffold_cmds.py +3 -3
- svc_infra/cli/cmds/docs/docs_cmds.py +142 -0
- svc_infra/cli/cmds/dx/__init__.py +12 -0
- svc_infra/cli/cmds/dx/dx_cmds.py +116 -0
- svc_infra/cli/cmds/health/__init__.py +179 -0
- svc_infra/cli/cmds/health/health_cmds.py +8 -0
- svc_infra/cli/cmds/help.py +4 -0
- svc_infra/cli/cmds/jobs/__init__.py +1 -0
- svc_infra/cli/cmds/jobs/jobs_cmds.py +47 -0
- svc_infra/cli/cmds/obs/obs_cmds.py +36 -15
- svc_infra/cli/cmds/sdk/__init__.py +0 -0
- svc_infra/cli/cmds/sdk/sdk_cmds.py +112 -0
- svc_infra/cli/foundation/runner.py +6 -2
- svc_infra/data/add.py +61 -0
- svc_infra/data/backup.py +58 -0
- svc_infra/data/erasure.py +45 -0
- svc_infra/data/fixtures.py +42 -0
- svc_infra/data/retention.py +61 -0
- svc_infra/db/__init__.py +15 -0
- svc_infra/db/crud_schema.py +9 -9
- svc_infra/db/inbox.py +67 -0
- svc_infra/db/nosql/__init__.py +3 -0
- svc_infra/db/nosql/core.py +30 -9
- svc_infra/db/nosql/indexes.py +3 -1
- svc_infra/db/nosql/management.py +1 -1
- svc_infra/db/nosql/mongo/README.md +13 -13
- svc_infra/db/nosql/mongo/client.py +19 -2
- svc_infra/db/nosql/mongo/settings.py +6 -2
- svc_infra/db/nosql/repository.py +35 -15
- svc_infra/db/nosql/resource.py +20 -3
- svc_infra/db/nosql/scaffold.py +9 -3
- svc_infra/db/nosql/service.py +3 -1
- svc_infra/db/nosql/types.py +6 -2
- svc_infra/db/ops.py +384 -0
- svc_infra/db/outbox.py +108 -0
- svc_infra/db/sql/apikey.py +37 -9
- svc_infra/db/sql/authref.py +9 -3
- svc_infra/db/sql/constants.py +12 -8
- svc_infra/db/sql/core.py +2 -2
- svc_infra/db/sql/management.py +11 -8
- svc_infra/db/sql/repository.py +99 -26
- svc_infra/db/sql/resource.py +5 -0
- svc_infra/db/sql/scaffold.py +6 -2
- svc_infra/db/sql/service.py +15 -5
- svc_infra/db/sql/templates/models_schemas/auth/models.py.tmpl +7 -56
- svc_infra/db/sql/templates/models_schemas/auth/schemas.py.tmpl +1 -1
- svc_infra/db/sql/templates/setup/env_async.py.tmpl +34 -12
- svc_infra/db/sql/templates/setup/env_sync.py.tmpl +29 -7
- svc_infra/db/sql/tenant.py +88 -0
- svc_infra/db/sql/uniq_hooks.py +9 -3
- svc_infra/db/sql/utils.py +138 -51
- svc_infra/db/sql/versioning.py +14 -0
- svc_infra/deploy/__init__.py +538 -0
- svc_infra/documents/__init__.py +100 -0
- svc_infra/documents/add.py +264 -0
- svc_infra/documents/ease.py +233 -0
- svc_infra/documents/models.py +114 -0
- svc_infra/documents/storage.py +264 -0
- svc_infra/dx/add.py +65 -0
- svc_infra/dx/changelog.py +74 -0
- svc_infra/dx/checks.py +68 -0
- svc_infra/exceptions.py +141 -0
- svc_infra/health/__init__.py +864 -0
- svc_infra/http/__init__.py +13 -0
- svc_infra/http/client.py +105 -0
- svc_infra/jobs/builtins/outbox_processor.py +40 -0
- svc_infra/jobs/builtins/webhook_delivery.py +95 -0
- svc_infra/jobs/easy.py +33 -0
- svc_infra/jobs/loader.py +50 -0
- svc_infra/jobs/queue.py +116 -0
- svc_infra/jobs/redis_queue.py +256 -0
- svc_infra/jobs/runner.py +79 -0
- svc_infra/jobs/scheduler.py +53 -0
- svc_infra/jobs/worker.py +40 -0
- svc_infra/loaders/__init__.py +186 -0
- svc_infra/loaders/base.py +142 -0
- svc_infra/loaders/github.py +311 -0
- svc_infra/loaders/models.py +147 -0
- svc_infra/loaders/url.py +235 -0
- svc_infra/logging/__init__.py +374 -0
- svc_infra/mcp/svc_infra_mcp.py +91 -33
- svc_infra/obs/README.md +2 -0
- svc_infra/obs/add.py +65 -9
- svc_infra/obs/cloud_dash.py +2 -1
- svc_infra/obs/grafana/dashboards/http-overview.json +45 -0
- svc_infra/obs/metrics/__init__.py +52 -0
- svc_infra/obs/metrics/asgi.py +13 -7
- svc_infra/obs/metrics/http.py +9 -5
- svc_infra/obs/metrics/sqlalchemy.py +13 -9
- svc_infra/obs/metrics.py +53 -0
- svc_infra/obs/settings.py +6 -2
- svc_infra/security/add.py +217 -0
- svc_infra/security/audit.py +212 -0
- svc_infra/security/audit_service.py +74 -0
- svc_infra/security/headers.py +52 -0
- svc_infra/security/hibp.py +101 -0
- svc_infra/security/jwt_rotation.py +105 -0
- svc_infra/security/lockout.py +102 -0
- svc_infra/security/models.py +287 -0
- svc_infra/security/oauth_models.py +73 -0
- svc_infra/security/org_invites.py +130 -0
- svc_infra/security/passwords.py +79 -0
- svc_infra/security/permissions.py +171 -0
- svc_infra/security/session.py +98 -0
- svc_infra/security/signed_cookies.py +100 -0
- svc_infra/storage/__init__.py +93 -0
- svc_infra/storage/add.py +253 -0
- svc_infra/storage/backends/__init__.py +11 -0
- svc_infra/storage/backends/local.py +339 -0
- svc_infra/storage/backends/memory.py +216 -0
- svc_infra/storage/backends/s3.py +353 -0
- svc_infra/storage/base.py +239 -0
- svc_infra/storage/easy.py +185 -0
- svc_infra/storage/settings.py +195 -0
- svc_infra/testing/__init__.py +685 -0
- svc_infra/utils.py +7 -3
- svc_infra/webhooks/__init__.py +69 -0
- svc_infra/webhooks/add.py +339 -0
- svc_infra/webhooks/encryption.py +115 -0
- svc_infra/webhooks/fastapi.py +39 -0
- svc_infra/webhooks/router.py +55 -0
- svc_infra/webhooks/service.py +70 -0
- svc_infra/webhooks/signing.py +34 -0
- svc_infra/websocket/__init__.py +79 -0
- svc_infra/websocket/add.py +140 -0
- svc_infra/websocket/client.py +282 -0
- svc_infra/websocket/config.py +69 -0
- svc_infra/websocket/easy.py +76 -0
- svc_infra/websocket/exceptions.py +61 -0
- svc_infra/websocket/manager.py +344 -0
- svc_infra/websocket/models.py +49 -0
- svc_infra-0.1.706.dist-info/LICENSE +21 -0
- svc_infra-0.1.706.dist-info/METADATA +356 -0
- svc_infra-0.1.706.dist-info/RECORD +357 -0
- svc_infra-0.1.589.dist-info/METADATA +0 -79
- svc_infra-0.1.589.dist-info/RECORD +0 -234
- {svc_infra-0.1.589.dist-info → svc_infra-0.1.706.dist-info}/WHEEL +0 -0
- {svc_infra-0.1.589.dist-info → svc_infra-0.1.706.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,864 @@
|
|
|
1
|
+
"""Health check utilities for svc-infra applications.
|
|
2
|
+
|
|
3
|
+
This module provides comprehensive health check infrastructure for
|
|
4
|
+
containerized deployments, including:
|
|
5
|
+
|
|
6
|
+
- **Startup probes**: Wait for dependencies before accepting traffic
|
|
7
|
+
- **Readiness probes**: Check if the service can handle requests
|
|
8
|
+
- **Liveness probes**: Verify the service is still running
|
|
9
|
+
- **Dependency checks**: Built-in checks for common services
|
|
10
|
+
|
|
11
|
+
Designed for Kubernetes, Docker, and PaaS deployments where proper
|
|
12
|
+
health probes prevent routing traffic to unhealthy instances.
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
>>> from svc_infra.health import (
|
|
16
|
+
... HealthRegistry,
|
|
17
|
+
... check_database,
|
|
18
|
+
... check_redis,
|
|
19
|
+
... check_url,
|
|
20
|
+
... add_health_routes,
|
|
21
|
+
... )
|
|
22
|
+
>>>
|
|
23
|
+
>>> # Create registry with checks
|
|
24
|
+
>>> registry = HealthRegistry()
|
|
25
|
+
>>> registry.add("database", check_database(os.getenv("DATABASE_URL")))
|
|
26
|
+
>>> registry.add("redis", check_redis(os.getenv("REDIS_URL")))
|
|
27
|
+
>>> registry.add("api", check_url("http://api-service:8080/health"))
|
|
28
|
+
>>>
|
|
29
|
+
>>> # Add to FastAPI app
|
|
30
|
+
>>> add_health_routes(app, registry)
|
|
31
|
+
>>>
|
|
32
|
+
>>> # Or wait for dependencies at startup
|
|
33
|
+
>>> await registry.wait_until_healthy(timeout=60, interval=2)
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import asyncio
|
|
39
|
+
import time
|
|
40
|
+
from dataclasses import dataclass, field
|
|
41
|
+
from enum import StrEnum
|
|
42
|
+
from typing import Any, Awaitable, Callable, Optional
|
|
43
|
+
|
|
44
|
+
import httpx
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class HealthStatus(StrEnum):
|
|
48
|
+
"""Health check status values."""
|
|
49
|
+
|
|
50
|
+
HEALTHY = "healthy"
|
|
51
|
+
UNHEALTHY = "unhealthy"
|
|
52
|
+
DEGRADED = "degraded" # Partially working
|
|
53
|
+
UNKNOWN = "unknown" # Check hasn't run yet
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class HealthCheckResult:
    """Outcome of one health check run."""

    name: str
    status: HealthStatus
    latency_ms: float
    message: Optional[str] = None
    details: Optional[dict[str, Any]] = None

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly mapping of this result.

        The latency is rounded to two decimal places. ``message`` and
        ``details`` are included only when they are truthy.
        """
        payload: dict[str, Any] = dict(
            name=self.name,
            status=self.status,
            latency_ms=round(self.latency_ms, 2),
        )
        optional_fields = (("message", self.message), ("details", self.details))
        payload.update({key: value for key, value in optional_fields if value})
        return payload
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Type alias for health check functions
|
|
81
|
+
# A health check is a zero-argument callable whose awaited result is a
# HealthCheckResult.
HealthCheckFn = Callable[[], Awaitable[HealthCheckResult]]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@dataclass
class HealthCheck:
    """A health check function together with its registration settings."""

    name: str
    check_fn: HealthCheckFn
    # When False, a failure of this check does not fail overall health.
    critical: bool = True
    # Time limit for one run of the check, in seconds.
    timeout: float = 5.0
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class HealthRegistry:
    """Registry of health checks for a service.

    The registry manages multiple health checks and provides methods to:
    - Run all checks and aggregate results
    - Wait for all critical checks to pass (startup probe)
    - Determine overall service health

    Results returned by the registry carry the name the check was
    registered under, whatever name the check function itself set.

    Example:
        >>> registry = HealthRegistry()
        >>> registry.add("database", check_database(db_url), critical=True)
        >>> registry.add("cache", check_redis(redis_url), critical=False)
        >>>
        >>> # Run all checks
        >>> result = await registry.check_all()
        >>> print(result.status)  # "healthy" or "unhealthy"
        >>>
        >>> # Wait for startup
        >>> await registry.wait_until_healthy(timeout=60)
    """

    def __init__(self) -> None:
        """Initialize empty health registry."""
        self._checks: dict[str, HealthCheck] = {}

    def add(
        self,
        name: str,
        check_fn: HealthCheckFn,
        *,
        critical: bool = True,
        timeout: float = 5.0,
    ) -> None:
        """
        Register a health check.

        Args:
            name: Unique name for this check (e.g., "database", "redis")
            check_fn: Async function that returns HealthCheckResult
            critical: If True, failure means service is unhealthy
            timeout: Maximum time to wait for this check (seconds)

        Raises:
            ValueError: If a check with this name already exists
        """
        if name in self._checks:
            raise ValueError(f"Health check '{name}' already registered")
        self._checks[name] = HealthCheck(
            name=name,
            check_fn=check_fn,
            critical=critical,
            timeout=timeout,
        )

    def remove(self, name: str) -> bool:
        """
        Remove a health check by name.

        Args:
            name: Name of the check to remove

        Returns:
            True if check was removed, False if not found
        """
        if name in self._checks:
            del self._checks[name]
            return True
        return False

    def clear(self) -> None:
        """Remove all registered health checks."""
        self._checks.clear()

    @property
    def checks(self) -> list[HealthCheck]:
        """Get list of all registered checks."""
        return list(self._checks.values())

    async def check_one(self, name: str) -> HealthCheckResult:
        """
        Run a single health check by name.

        The check function is awaited under the check's own timeout. A
        timeout or any exception raised by the check is converted into an
        UNHEALTHY result instead of propagating.

        Args:
            name: Name of the check to run

        Returns:
            HealthCheckResult for the check. Its ``name`` is the registered
            name and its ``latency_ms`` is measured by the registry.

        Raises:
            KeyError: If check not found
        """
        if name not in self._checks:
            raise KeyError(f"Health check '{name}' not found")

        check = self._checks[name]
        start = time.perf_counter()

        try:
            result = await asyncio.wait_for(check.check_fn(), timeout=check.timeout)
            # Overwrite latency with the registry's own timing.
            result.latency_ms = (time.perf_counter() - start) * 1000
            # Check functions set their own name (e.g. "database" or a host);
            # report the registered name so the success path agrees with the
            # timeout/error paths below.
            result.name = name
            return result
        except asyncio.TimeoutError:
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=f"Check timed out after {check.timeout}s",
            )
        except Exception as e:
            # Boundary handler: a failing check must not break the probe.
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=str(e),
            )

    async def check_all(self) -> "AggregatedHealthResult":
        """
        Run all registered health checks concurrently.

        Overall status:
        - UNHEALTHY if any critical check reports UNHEALTHY
        - DEGRADED if only non-critical checks report UNHEALTHY
        - HEALTHY otherwise (including when no checks are registered)

        Returns:
            AggregatedHealthResult with overall status and individual results
        """
        if not self._checks:
            return AggregatedHealthResult(
                status=HealthStatus.HEALTHY,
                checks=[],
                message="No health checks registered",
            )

        # Snapshot the names so results can be paired with their checks.
        check_names = list(self._checks.keys())
        results = await asyncio.gather(
            *[self.check_one(name) for name in check_names],
            return_exceptions=False,
        )

        critical_failed = False
        non_critical_failed = False

        for registered_name, result in zip(check_names, results):
            if result.status != HealthStatus.UNHEALTHY:
                continue
            # A check removed while running is treated as non-critical.
            check = self._checks.get(registered_name)
            if check and check.critical:
                critical_failed = True
            else:
                non_critical_failed = True

        if critical_failed:
            overall_status = HealthStatus.UNHEALTHY
        elif non_critical_failed:
            overall_status = HealthStatus.DEGRADED
        else:
            overall_status = HealthStatus.HEALTHY

        return AggregatedHealthResult(
            status=overall_status,
            checks=results,
        )

    async def wait_until_healthy(
        self,
        *,
        timeout: float = 60.0,
        interval: float = 2.0,
        check_names: Optional[list[str]] = None,
    ) -> bool:
        """
        Wait until all (or specified) critical checks pass.

        Useful for startup scripts to wait for dependencies before
        the main application starts accepting traffic.

        At least one attempt is always made, even when ``timeout`` is zero
        or negative. When ``check_names`` is given, an attempt fails as soon
        as one named check is UNHEALTHY or is not registered. Otherwise an
        attempt passes when the aggregated status is HEALTHY or DEGRADED.

        Args:
            timeout: Maximum time to wait (seconds)
            interval: Time between check attempts (seconds)
            check_names: Specific checks to wait for (None = all critical)

        Returns:
            True if all checks passed, False if timeout reached

        Example:
            >>> # Wait up to 60 seconds for database
            >>> if not await registry.wait_until_healthy(timeout=60):
            ...     raise RuntimeError("Dependencies not ready")
        """
        deadline = time.monotonic() + timeout
        first_attempt = True

        # The first attempt runs unconditionally so that timeout <= 0 still
        # performs one probe instead of returning False untested.
        while first_attempt or time.monotonic() < deadline:
            first_attempt = False

            if check_names:
                # Check specific checks
                all_healthy = True
                for name in check_names:
                    try:
                        check_result = await self.check_one(name)
                    except KeyError:
                        # Unknown name: fail this attempt and retry later.
                        all_healthy = False
                        break
                    if check_result.status == HealthStatus.UNHEALTHY:
                        all_healthy = False
                        break
            else:
                # Check all critical checks
                agg_result = await self.check_all()
                all_healthy = agg_result.status in (
                    HealthStatus.HEALTHY,
                    HealthStatus.DEGRADED,
                )

            if all_healthy:
                return True

            # Wait before next attempt, never sleeping past the deadline.
            remaining = deadline - time.monotonic()
            await asyncio.sleep(min(interval, max(0, remaining)))

        return False
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
@dataclass
class AggregatedHealthResult:
    """Overall health verdict plus the individual results behind it."""

    status: HealthStatus
    checks: list[HealthCheckResult] = field(default_factory=list)
    message: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly mapping of the aggregate.

        Each entry of ``checks`` is serialized through its own ``to_dict``.
        ``message`` is included only when it is truthy.
        """
        serialized_checks = []
        for check in self.checks:
            serialized_checks.append(check.to_dict())

        payload: dict[str, Any] = {
            "status": self.status,
            "checks": serialized_checks,
        }
        if not self.message:
            return payload
        payload["message"] = self.message
        return payload
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
# =============================================================================
|
|
338
|
+
# Built-in Health Check Functions
|
|
339
|
+
# =============================================================================
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def check_database(url: Optional[str]) -> HealthCheckFn:
    """
    Create a health check for a PostgreSQL database.

    Uses a simple "SELECT 1" query over asyncpg to verify connectivity.

    The URL is normalized for asyncpg once, when the check is created:
    ``postgres://`` becomes ``postgresql://`` and any ``+driver`` suffix on
    the scheme (e.g. ``postgresql+asyncpg://``, ``postgresql+psycopg2://``)
    is removed. Only the scheme is rewritten; the rest of the URL, including
    credentials, is left untouched.

    Args:
        url: Database URL (postgres:// or postgresql://)

    Returns:
        Async health check function. Its result is named "database" and is
        HEALTHY when the query succeeds, UNKNOWN when asyncpg is not
        installed, and UNHEALTHY otherwise (missing URL, connection timeout
        after 5 seconds, or any other error).

    Example:
        >>> check = check_database(os.getenv("DATABASE_URL"))
        >>> registry.add("database", check, critical=True)
    """
    dsn: Optional[str] = None
    if url:
        scheme, separator, remainder = url.partition("://")
        if separator:
            # Drop a SQLAlchemy-style "+driver" suffix; asyncpg only accepts
            # the bare postgres/postgresql schemes.
            dialect = scheme.split("+", 1)[0]
            if dialect == "postgres":
                dialect = "postgresql"
            dsn = f"{dialect}://{remainder}"
        else:
            # Not URL-shaped: hand it to asyncpg unchanged.
            dsn = url

    async def _check() -> HealthCheckResult:
        if not dsn:
            return HealthCheckResult(
                name="database",
                status=HealthStatus.UNHEALTHY,
                latency_ms=0,
                message="DATABASE_URL not configured",
            )

        start = time.perf_counter()
        try:
            # Use asyncpg directly for lightweight check
            import asyncpg

            conn = await asyncio.wait_for(
                asyncpg.connect(dsn),
                timeout=5.0,
            )
            try:
                await conn.fetchval("SELECT 1")
            finally:
                await conn.close()

            return HealthCheckResult(
                name="database",
                status=HealthStatus.HEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
            )
        except asyncio.TimeoutError:
            return HealthCheckResult(
                name="database",
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message="Connection timeout",
            )
        except ImportError:
            # asyncpg is not installed, so the database cannot be probed;
            # report UNKNOWN rather than guessing.
            return HealthCheckResult(
                name="database",
                status=HealthStatus.UNKNOWN,
                latency_ms=0,
                message="asyncpg not installed",
            )
        except Exception as e:
            # Boundary handler: any other failure means the check failed.
            return HealthCheckResult(
                name="database",
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=str(e),
            )

    return _check
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def check_redis(url: Optional[str]) -> HealthCheckFn:
    """
    Create a health check for Redis.

    Uses PING command to verify connectivity.

    Args:
        url: Redis URL (redis://)

    Returns:
        Async health check function. Its result is named "redis" and is
        HEALTHY when PING succeeds, UNKNOWN when redis-py is not installed,
        and UNHEALTHY otherwise (missing URL, timeout after 5 seconds,
        falsy PING reply, or any other error).

    Example:
        >>> check = check_redis(os.getenv("REDIS_URL"))
        >>> registry.add("redis", check, critical=False)
    """

    async def _check() -> HealthCheckResult:
        if not url:
            return HealthCheckResult(
                name="redis",
                status=HealthStatus.UNHEALTHY,
                latency_ms=0,
                message="REDIS_URL not configured",
            )

        start = time.perf_counter()
        try:
            import redis.asyncio as redis_async

            client = redis_async.from_url(url, socket_connect_timeout=5.0)
            try:
                pong = await asyncio.wait_for(client.ping(), timeout=5.0)
                if pong:
                    return HealthCheckResult(
                        name="redis",
                        status=HealthStatus.HEALTHY,
                        latency_ms=(time.perf_counter() - start) * 1000,
                    )
                else:
                    return HealthCheckResult(
                        name="redis",
                        status=HealthStatus.UNHEALTHY,
                        latency_ms=(time.perf_counter() - start) * 1000,
                        message="PING returned False",
                    )
            finally:
                # Newer redis-py releases expose aclose(); older ones only
                # have close(). Without the fallback, the missing attribute
                # would turn a successful PING into an UNHEALTHY result.
                close_client = getattr(client, "aclose", None) or client.close
                await close_client()
        except asyncio.TimeoutError:
            return HealthCheckResult(
                name="redis",
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message="Connection timeout",
            )
        except ImportError:
            # redis-py is not installed, so Redis cannot be probed.
            return HealthCheckResult(
                name="redis",
                status=HealthStatus.UNKNOWN,
                latency_ms=0,
                message="redis-py not installed",
            )
        except Exception as e:
            # Boundary handler: any other failure means the check failed.
            return HealthCheckResult(
                name="redis",
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=str(e),
            )

    return _check
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def check_url(
    url: str,
    *,
    method: str = "GET",
    expected_status: int = 200,
    timeout: float = 5.0,
    headers: Optional[dict[str, str]] = None,
) -> HealthCheckFn:
    """
    Create a health check for an HTTP endpoint.

    The result is named after the URL's host. Credentials, port, and IPv6
    brackets are not part of the name. If no host can be extracted, the
    name falls back to "http".

    Args:
        url: URL to check
        method: HTTP method (default: GET)
        expected_status: Expected HTTP status code (default: 200)
        timeout: Request timeout in seconds
        headers: Optional headers to include

    Returns:
        Async health check function. Its result is HEALTHY when the response
        status equals ``expected_status`` and UNHEALTHY otherwise (other
        status code, timeout, connection failure, or any other error).

    Example:
        >>> check = check_url("http://api:8080/health")
        >>> registry.add("api", check)
        >>>
        >>> # With custom options
        >>> check = check_url(
        ...     "http://service:8080/ready",
        ...     expected_status=204,
        ...     headers={"Authorization": "Bearer token"},
        ... )
    """
    from urllib.parse import urlparse

    # Use the parsed hostname rather than splitting netloc on ":", which
    # would return the username for "user:pw@host" and "[" for IPv6 hosts.
    try:
        name = urlparse(url).hostname or "http"
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string URL: keep a generic name.
        name = "http"

    async def _check() -> HealthCheckResult:
        start = time.perf_counter()
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.request(
                    method=method,
                    url=url,
                    headers=headers,
                )

                if response.status_code == expected_status:
                    return HealthCheckResult(
                        name=name,
                        status=HealthStatus.HEALTHY,
                        latency_ms=(time.perf_counter() - start) * 1000,
                        details={"status_code": response.status_code},
                    )
                else:
                    return HealthCheckResult(
                        name=name,
                        status=HealthStatus.UNHEALTHY,
                        latency_ms=(time.perf_counter() - start) * 1000,
                        message=f"Expected status {expected_status}, got {response.status_code}",
                        details={"status_code": response.status_code},
                    )
        except httpx.TimeoutException:
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=f"Request timeout after {timeout}s",
            )
        except httpx.ConnectError as e:
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=f"Connection failed: {e}",
            )
        except Exception as e:
            # Boundary handler: any other failure means the check failed.
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=str(e),
            )

    return _check
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
def check_tcp(
    host: str,
    port: int,
    *,
    timeout: float = 5.0,
) -> HealthCheckFn:
    """
    Create a health check for a TCP port.

    Useful for checking if a service is listening on a port
    without needing protocol-specific logic. The check is HEALTHY as soon
    as the TCP connection is established; closing the probe socket
    afterwards is best-effort and never changes the outcome.

    Args:
        host: Hostname or IP address
        port: Port number
        timeout: Connection timeout in seconds (also bounds socket teardown)

    Returns:
        Async health check function. The result is named ``"{host}:{port}"``.

    Example:
        >>> check = check_tcp("database", 5432)
        >>> registry.add("postgres-port", check)
    """
    name = f"{host}:{port}"

    async def _check() -> HealthCheckResult:
        start = time.perf_counter()

        # Only the connection attempt decides health, so keep the try body
        # limited to it.
        try:
            _, writer = await asyncio.wait_for(
                asyncio.open_connection(host, port),
                timeout=timeout,
            )
        except asyncio.TimeoutError:
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=f"Connection timeout after {timeout}s",
            )
        except OSError as e:
            return HealthCheckResult(
                name=name,
                status=HealthStatus.UNHEALTHY,
                latency_ms=(time.perf_counter() - start) * 1000,
                message=str(e),
            )

        # Capture latency before teardown so it reflects connect time only.
        latency_ms = (time.perf_counter() - start) * 1000

        writer.close()
        try:
            # Bound the teardown too: a peer that never completes the close
            # handshake must not hang the health check.
            await asyncio.wait_for(writer.wait_closed(), timeout=timeout)
        except (asyncio.TimeoutError, OSError):
            # Deliberate best-effort: the port accepted the connection, so a
            # reset or stall while closing does not make the service unhealthy.
            pass

        return HealthCheckResult(
            name=name,
            status=HealthStatus.HEALTHY,
            latency_ms=latency_ms,
        )

    return _check
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
# =============================================================================
|
|
644
|
+
# FastAPI Integration
|
|
645
|
+
# =============================================================================
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def add_health_routes(
    app: Any,  # FastAPI
    registry: HealthRegistry,
    *,
    prefix: str = "/_health",
    include_in_schema: bool = False,
    detailed_on_failure: bool = True,
) -> None:
    """
    Mount health probe endpoints on a FastAPI application.

    Four GET routes are registered under ``prefix``:
    - `/live` - Liveness probe; answers ``{"status": "ok"}`` without running checks
    - `/ready` - Readiness probe; runs ``registry.check_all()``
    - `/startup` - Startup probe; also runs ``registry.check_all()``
    - `/checks/{name}` - Runs a single check via ``registry.check_one(name)``

    For `/ready` and `/startup`, an aggregated status of HEALTHY or DEGRADED
    yields 200; anything else yields 503. For `/checks/{name}`, only HEALTHY
    yields 200, and an unknown name (``KeyError``) yields 404.

    Args:
        app: FastAPI application instance
        registry: HealthRegistry with registered checks
        prefix: URL prefix for health routes
        include_in_schema: Include in OpenAPI schema
        detailed_on_failure: When True, 503 responses from `/ready` and
            `/startup` carry ``result.to_dict()``; otherwise they carry only
            ``{"status": "unhealthy"}``

    Example:
        >>> from fastapi import FastAPI
        >>> from svc_infra.health import HealthRegistry, check_database, add_health_routes
        >>>
        >>> app = FastAPI()
        >>> registry = HealthRegistry()
        >>> registry.add("database", check_database(os.getenv("DATABASE_URL")))
        >>> add_health_routes(app, registry)
    """
    from starlette.responses import JSONResponse

    from svc_infra.api.fastapi.dual.public import public_router

    router = public_router(
        prefix=prefix,
        tags=["health"],
        include_in_schema=include_in_schema,
    )

    # Aggregated statuses that still count as "able to serve traffic".
    serving_statuses = (HealthStatus.HEALTHY, HealthStatus.DEGRADED)

    def _unavailable(report: Any) -> JSONResponse:
        # Shared 503 builder for the readiness and startup probes.
        if detailed_on_failure:
            return JSONResponse(report.to_dict(), status_code=503)
        return JSONResponse({"status": "unhealthy"}, status_code=503)

    # Endpoint names and docstrings are left untouched: the router may expose
    # them (operation ids / descriptions) when include_in_schema is enabled.

    @router.get("/live")
    async def liveness() -> JSONResponse:
        """Liveness probe - always returns 200 if process is running."""
        return JSONResponse({"status": "ok"})

    @router.get("/ready")
    async def readiness() -> JSONResponse:
        """Readiness probe - checks all dependencies."""
        report = await registry.check_all()

        # Degraded is still ready; the payload itself reports the issue.
        if report.status in serving_statuses:
            return JSONResponse(report.to_dict(), status_code=200)
        return _unavailable(report)

    @router.get("/startup")
    async def startup() -> JSONResponse:
        """Startup probe - checks critical dependencies only."""
        report = await registry.check_all()

        # Startup passes unless the aggregated status falls outside
        # HEALTHY/DEGRADED.
        if report.status not in serving_statuses:
            return _unavailable(report)
        return JSONResponse({"status": "ok"}, status_code=200)

    @router.get("/checks/{name}")
    async def check_single(name: str) -> JSONResponse:
        """Run a single health check by name."""
        try:
            outcome = await registry.check_one(name)
            http_status = 503
            if outcome.status == HealthStatus.HEALTHY:
                http_status = 200
            return JSONResponse(outcome.to_dict(), status_code=http_status)
        except KeyError:
            not_found = {"error": f"Health check '{name}' not found"}
            return JSONResponse(not_found, status_code=404)

    app.include_router(router)
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def add_startup_probe(
    app: Any,  # FastAPI
    checks: list[HealthCheckFn],
    *,
    timeout: float = 60.0,
    interval: float = 2.0,
) -> None:
    """
    Register a startup hook that blocks until dependency checks pass.

    Every entry in ``checks`` is added to a private HealthRegistry as a
    critical check named ``startup_<index>``. On the app's "startup" event
    the hook calls ``wait_until_healthy``; if that returns a falsy value,
    the checks are run once more to collect the failing names and startup
    is aborted with a RuntimeError.

    Args:
        app: FastAPI application instance
        checks: List of health check functions to wait for
        timeout: Maximum time to wait for all checks (seconds)
        interval: Time between check attempts (seconds)

    Raises:
        RuntimeError: Raised from the startup hook (not from this call) when
            dependencies aren't ready within timeout

    Example:
        >>> from fastapi import FastAPI
        >>> from svc_infra.health import check_database, check_redis, add_startup_probe
        >>>
        >>> app = FastAPI()
        >>> add_startup_probe(
        ...     app,
        ...     checks=[
        ...         check_database(os.getenv("DATABASE_URL")),
        ...         check_redis(os.getenv("REDIS_URL")),
        ...     ],
        ...     timeout=60,
        ... )
    """
    startup_registry = HealthRegistry()
    for position, probe in enumerate(checks):
        startup_registry.add(f"startup_{position}", probe, critical=True)

    @app.on_event("startup")
    async def _wait_for_dependencies() -> None:
        import logging

        log = logging.getLogger("svc_infra.health")
        log.info(f"Waiting for {len(checks)} dependencies (timeout={timeout}s)...")

        ready = await startup_registry.wait_until_healthy(
            timeout=timeout,
            interval=interval,
        )
        if ready:
            log.info("All dependencies ready")
            return

        # Timed out: run the checks one final time purely to name the
        # offenders in the log line and the raised error.
        final_report = await startup_registry.check_all()
        failed = [
            entry.name
            for entry in final_report.checks
            if entry.status == HealthStatus.UNHEALTHY
        ]
        error_msg = f"Dependencies not ready after {timeout}s: {failed}"
        log.error(error_msg)
        raise RuntimeError(error_msg)
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def add_dependency_health(
    app: Any,  # FastAPI
    name: str,
    check_fn: HealthCheckFn,
    *,
    critical: bool = True,
) -> None:
    """
    Attach one dependency check to the app's shared health registry.

    The registry is stored on ``app.state._health_registry``. The first call
    for a given app creates it and mounts the health routes with
    ``add_health_routes`` defaults; later calls reuse the stored registry
    and only add the new check.

    Args:
        app: FastAPI application instance
        name: Name for the health check
        check_fn: Async function that returns HealthCheckResult
        critical: Whether failure means service is unhealthy

    Raises:
        ValueError: If ``app`` has no ``state`` attribute

    Example:
        >>> # Add checks incrementally
        >>> add_dependency_health(app, "database", check_database(db_url))
        >>> add_dependency_health(app, "cache", check_redis(redis_url), critical=False)
    """
    if not hasattr(app, "state"):
        raise ValueError("App must have a 'state' attribute (FastAPI/Starlette)")

    state = app.state
    try:
        shared_registry = state._health_registry
    except AttributeError:
        # First registration for this app: create the registry, remember it
        # on app state, then expose it over HTTP.
        shared_registry = HealthRegistry()
        state._health_registry = shared_registry
        add_health_routes(app, shared_registry)

    shared_registry.add(name, check_fn, critical=critical)
|
|
840
|
+
|
|
841
|
+
|
|
842
|
+
# =============================================================================
|
|
843
|
+
# Exports
|
|
844
|
+
# =============================================================================
|
|
845
|
+
|
|
846
|
+
# Public API of the health module, grouped by role.
__all__ = [
    # Result / status data types
    "HealthStatus", "HealthCheckResult", "HealthCheck",
    "HealthCheckFn", "AggregatedHealthResult",
    # Check registry
    "HealthRegistry",
    # Ready-made check factories
    "check_database", "check_redis", "check_url", "check_tcp",
    # FastAPI wiring helpers
    "add_health_routes", "add_startup_probe", "add_dependency_health",
]
|