aegis-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aegis-stack might be problematic. Click here for more details.
- aegis/__init__.py +5 -0
- aegis/__main__.py +374 -0
- aegis/core/CLAUDE.md +365 -0
- aegis/core/__init__.py +6 -0
- aegis/core/components.py +115 -0
- aegis/core/dependency_resolver.py +119 -0
- aegis/core/template_generator.py +163 -0
- aegis/templates/CLAUDE.md +306 -0
- aegis/templates/cookiecutter-aegis-project/cookiecutter.json +27 -0
- aegis/templates/cookiecutter-aegis-project/hooks/post_gen_project.py +172 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/.dockerignore +71 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/.env.example.j2 +70 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/.gitignore +127 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/Dockerfile +53 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/Makefile +211 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/README.md.j2 +196 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/__init__.py +5 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/__init__.py +6 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/health.py +321 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/load_test.py +638 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/main.py +41 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/__init__.py +0 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/__init__.py +0 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/health.py +134 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/models.py.j2 +247 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/routing.py.j2 +14 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/tasks.py.j2 +596 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/hooks.py +133 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/main.py +16 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/middleware/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/middleware/cors.py +20 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/shutdown/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/shutdown/cleanup.py +14 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/startup/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/startup/component_health.py.j2 +190 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/__init__.py +0 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/core/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/core/theme.py +46 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/main.py +687 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/scheduler/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/scheduler/main.py +138 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/CLAUDE.md +213 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/__init__.py +6 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/constants.py.j2 +30 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/pools.py +78 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/load_test.py +48 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/media.py +41 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/system.py +36 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/registry.py +139 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/__init__.py +119 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/load_tasks.py +526 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/simple_system_tasks.py +32 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/system_tasks.py +279 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/config.py.j2 +119 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/constants.py +60 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/db.py +67 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/log.py +85 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/entrypoints/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/entrypoints/webserver.py +40 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/entrypoints/{% if cookiecutter.include_scheduler == "yes" %}scheduler.py{% endif %} +21 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/integrations/__init__.py +0 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/integrations/main.py +61 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/py.typed +0 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/load_test.py +661 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/load_test_models.py +269 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/shared/__init__.py +15 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/shared/models.py +26 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/__init__.py +52 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/alerts.py +94 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/health.py.j2 +1105 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/models.py +169 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/ui.py +52 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docker-compose.yml.j2 +195 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/api.md +191 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/components/scheduler.md +414 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/development.md +215 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/health.md +240 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/javascripts/mermaid-config.js +62 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/stylesheets/mermaid.css +95 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/mkdocs.yml.j2 +62 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/pyproject.toml.j2 +156 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/scripts/entrypoint.sh +87 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/scripts/entrypoint.sh.j2 +104 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/scripts/gen_docs.py +16 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/api/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/api/test_health_endpoints.py.j2 +239 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/components/test_scheduler.py +76 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/conftest.py.j2 +81 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/__init__.py +1 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_component_integration.py.j2 +376 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_health_logic.py.j2 +633 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_load_test_models.py +665 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_load_test_service.py +602 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_system_service.py +96 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_worker_health_registration.py.j2 +224 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/test_core.py +50 -0
- aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/uv.lock +1673 -0
- aegis_stack-0.1.0.dist-info/METADATA +114 -0
- aegis_stack-0.1.0.dist-info/RECORD +103 -0
- aegis_stack-0.1.0.dist-info/WHEEL +4 -0
- aegis_stack-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,1105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
System health monitoring functions.
|
|
3
|
+
|
|
4
|
+
Pure functions for system health checking, monitoring, and status reporting.
|
|
5
|
+
All functions use Pydantic models for type safety and validation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
from collections.abc import Awaitable, Callable
|
|
10
|
+
from datetime import UTC, datetime
|
|
11
|
+
import os
|
|
12
|
+
import sqlite3
|
|
13
|
+
import sys
|
|
14
|
+
from typing import Any, cast
|
|
15
|
+
|
|
16
|
+
import psutil
|
|
17
|
+
|
|
18
|
+
from app.core.config import settings
|
|
19
|
+
from app.core.log import logger
|
|
20
|
+
|
|
21
|
+
from .alerts import send_critical_alert, send_health_alert
|
|
22
|
+
from .models import ComponentStatus, ComponentStatusType, SystemStatus
|
|
23
|
+
|
|
24
|
+
# Global registry for custom health checks
|
|
25
|
+
_health_checks: dict[str, Callable[[], Awaitable[ComponentStatus]]] = {}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def format_bytes(size: int) -> str:
    """Render a byte count as a human-readable string (B/KB/MB/GB/TB)."""
    if size == 0:
        return "0 B"

    amount = float(size)
    for unit in ('B', 'KB', 'MB', 'GB'):
        if amount < 1024.0:
            # Whole bytes are printed without a decimal point; larger units
            # get one decimal place.
            return f"{int(amount)} {unit}" if unit == 'B' else f"{amount:.1f} {unit}"
        amount /= 1024.0
    # Anything that survived all four divisions is reported in terabytes.
    return f"{amount:.1f} TB"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def propagate_status(child_statuses: list[ComponentStatusType]) -> ComponentStatusType:
    """
    Roll a list of child component statuses up into a single parent status.

    Severity order (worst wins):
    UNHEALTHY > WARNING > INFO > HEALTHY.

    Args:
        child_statuses: List of ComponentStatusType from child components

    Returns:
        ComponentStatusType for the parent component. An empty list is
        treated as healthy.
    """
    # Walk the severities from worst to best and return the first one
    # present among the children.
    for severity in (
        ComponentStatusType.UNHEALTHY,
        ComponentStatusType.WARNING,
        ComponentStatusType.INFO,
    ):
        if severity in child_statuses:
            return severity
    # No elevated status found (or no children at all).
    return ComponentStatusType.HEALTHY
|
|
73
|
+
|
|
74
|
+
# Cache for system metrics to improve performance.
# Maps metric name -> (last ComponentStatus, time it was computed); freshness
# is judged against settings.SYSTEM_METRICS_CACHE_SECONDS by the consumer.
_system_metrics_cache: dict[str, tuple[ComponentStatus, datetime]] = {}
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def register_health_check(
    name: str, check_fn: Callable[[], Awaitable[ComponentStatus]]
) -> None:
    """
    Add a custom health check to the global registry.

    Registering under an existing name replaces the previous check.

    Args:
        name: Unique name for the health check
        check_fn: Async callable producing a ComponentStatus (a bool result
            is also tolerated by the runner)
    """
    _health_checks[name] = check_fn
    logger.info(f"Registered custom health check: {name}")
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
async def get_system_status() -> SystemStatus:
    """
    Get comprehensive system status.

    Runs every registered custom health check concurrently, gathers the
    (cached) memory/disk/cpu metrics, nests those metrics under the
    "backend" component — creating a virtual backend if none is registered —
    and finally wraps all components under a single "aegis" root node.

    Returns:
        SystemStatus with all component health information organized as Aegis tree
    """
    logger.info("Running system health checks")
    start_time = datetime.now(UTC)

    # Run custom component checks (these are top-level components).
    # Tasks are created first so all checks run concurrently.
    component_results = {}
    component_tasks = []
    for name, check_fn in _health_checks.items():
        task = asyncio.create_task(_run_health_check(name, check_fn))
        component_tasks.append((name, task))

    # Collect component results
    for name, task in component_tasks:
        try:
            component_results[name] = await task
        except Exception as e:
            # A check that raises is recorded as an unhealthy component
            # rather than failing the whole status report.
            logger.error(f"Component check failed for {name}: {e}")
            component_results[name] = ComponentStatus(
                name=name,
                status=ComponentStatusType.UNHEALTHY,
                message=f"Health check failed: {str(e)}",
                response_time_ms=None,
            )

    # Get system metrics (with caching for performance)
    system_metrics = await _get_cached_system_metrics(start_time)

    # Group system metrics under backend component if it exists
    if "backend" in component_results:
        # Backend exists - recreate with system metrics as sub-components
        backend_component = component_results["backend"]

        # Propagate status from system metrics and original backend status
        system_metrics_statuses = [
            getattr(metric, 'status', ComponentStatusType.HEALTHY)
            for metric in system_metrics.values()
        ]
        original_backend_status = getattr(
            backend_component, 'status', ComponentStatusType.HEALTHY
        )
        all_backend_statuses = system_metrics_statuses + [original_backend_status]

        backend_status = propagate_status(all_backend_statuses)

        # The ComponentStatus is rebuilt (not mutated) so the metrics become
        # sub_components and the rolled-up status is applied.
        component_results["backend"] = ComponentStatus(
            name=backend_component.name,
            status=backend_status,
            message=backend_component.message,
            response_time_ms=backend_component.response_time_ms,
            metadata=backend_component.metadata,
            sub_components=system_metrics,
        )
    else:
        # Backend doesn't exist - create a virtual backend component to hold
        # system metrics
        backend_healthy = all(metric.healthy for metric in system_metrics.values())

        # Propagate status from system metrics only
        system_metrics_statuses = [
            getattr(metric, 'status', ComponentStatusType.HEALTHY)
            for metric in system_metrics.values()
        ]
        backend_status = propagate_status(system_metrics_statuses)

        backend_message = (
            "System container metrics"
            if backend_healthy
            else "System container has issues"
        )

        component_results["backend"] = ComponentStatus(
            name="backend",
            status=backend_status,
            message=backend_message,
            response_time_ms=None,
            metadata={"type": "system_container", "virtual": True},
            sub_components=system_metrics,
        )

    # Calculate overall health (including sub-components).
    # NOTE(review): only one level of sub_components is inspected here;
    # deeper nesting would not affect overall_healthy.
    all_statuses = list(component_results.values())
    for component in component_results.values():
        all_statuses.extend(component.sub_components.values())
    overall_healthy = all(status.healthy for status in all_statuses)

    # Create Aegis root structure with components underneath.
    # (Same computation as overall_healthy above.)
    aegis_healthy = all(status.healthy for status in all_statuses)

    # Propagate status from all top-level components
    component_statuses = [
        getattr(component, 'status', ComponentStatusType.HEALTHY)
        for component in component_results.values()
    ]
    aegis_status = propagate_status(component_statuses)

    aegis_message = (
        "Aegis Stack application" if aegis_healthy else "Aegis Stack has issues"
    )

    root_components = {
        "aegis": ComponentStatus(
            name="aegis",
            status=aegis_status,
            message=aegis_message,
            response_time_ms=None,
            metadata={"type": "application_root", "version": "1.0"},
            sub_components=component_results,
        )
    }

    # Get system information
    system_info = _get_system_info()

    status = SystemStatus(
        components=root_components,
        overall_healthy=overall_healthy,
        timestamp=start_time,
        system_info=system_info,
    )

    # Log unhealthy components
    if not overall_healthy:
        logger.warning(
            f"System unhealthy: {status.unhealthy_components}",
            extra={"unhealthy_components": status.unhealthy_components},
        )

    return status
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
async def is_system_healthy() -> bool:
    """Convenience wrapper: report whether the full system status is healthy."""
    return (await get_system_status()).overall_healthy
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
async def check_system_status() -> None:
    """
    Scheduled health check function for use in APScheduler jobs.

    This function gets the system status and logs any issues.
    Can be extended to send alerts to Slack, email, etc.

    Never raises: failures of the status-gathering itself are logged and
    reported through a critical alert instead.
    """
    logger.info("🩺 Running scheduled system health check")

    try:
        status = await get_system_status()

        if status.overall_healthy:
            log_msg = (
                f"✅ System healthy: {len(status.healthy_top_level_components)}/"
                f"{status.total_components} components OK"
            )
            logger.info(log_msg)
        else:
            logger.warning(
                f"⚠️ System issues detected: "
                f"{len(status.unhealthy_components)} unhealthy components",
                extra={
                    "unhealthy_components": status.unhealthy_components,
                    "health_percentage": status.health_percentage,
                },
            )

            # Log details for each unhealthy component
            for component_name in status.unhealthy_components:
                component = status.components[component_name]
                logger.error(
                    f"❌ {component_name}: {component.message}",
                    extra={"component": component.name, "metadata": component.metadata},
                )

        # Send health alerts.
        # NOTE(review): this runs on the healthy path too — presumably
        # send_health_alert decides internally whether to notify; confirm.
        await send_health_alert(status)

    except Exception as e:
        logger.error(f"💥 System health check failed: {e}")
        # Send critical alert about monitoring failure
        await send_critical_alert(f"Health monitoring failed: {e}", str(e))
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
async def _get_cached_system_metrics(
    current_time: datetime,
) -> dict[str, ComponentStatus]:
    """
    Get system metrics with caching for better performance.

    Args:
        current_time: Timestamp used both to judge cache freshness and to
            stamp any newly cached results.

    Returns:
        Mapping of metric name ("memory"/"disk"/"cpu") to its ComponentStatus.
    """
    cache_duration = settings.SYSTEM_METRICS_CACHE_SECONDS
    system_metric_checks = {
        "memory": _check_memory,
        "disk": _check_disk_space,
        "cpu": _check_cpu_usage,
    }

    system_metrics = {}
    tasks = []

    for name, check_fn in system_metric_checks.items():
        # Check if we have a valid cached result
        if name in _system_metrics_cache:
            cached_result, cached_time = _system_metrics_cache[name]
            age_seconds = (current_time - cached_time).total_seconds()

            if age_seconds < cache_duration:
                # Use cached result
                system_metrics[name] = cached_result
                continue

        # Need to run the check; stale/missing entries are refreshed
        # concurrently and written back to the cache by the helper.
        task = asyncio.create_task(
            _run_health_check_with_cache(name, check_fn, current_time)
        )
        tasks.append((name, task))

    # Collect results from non-cached checks
    for name, task in tasks:
        try:
            system_metrics[name] = await task
        except Exception as e:
            # A failed metric check degrades that one metric to UNHEALTHY
            # rather than aborting the whole collection.
            logger.error(f"System metric check failed for {name}: {e}")
            system_metrics[name] = ComponentStatus(
                name=name,
                status=ComponentStatusType.UNHEALTHY,
                message=f"Health check failed: {str(e)}",
                response_time_ms=None,
            )

    return system_metrics
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
async def _run_health_check_with_cache(
    name: str, check_fn: Callable[[], Awaitable[ComponentStatus]], timestamp: datetime
) -> ComponentStatus:
    """Execute a health check and store its outcome in the metrics cache."""
    status = await _run_health_check(name, check_fn)
    # Cache entries are (result, time-of-run); freshness is judged by callers.
    _system_metrics_cache[name] = (status, timestamp)
    return status
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
async def _run_health_check(
    name: str, check_fn: Callable[[], Awaitable[ComponentStatus]]
) -> ComponentStatus:
    """Execute one health check, measuring wall-clock response time in ms."""
    started = datetime.now(UTC)

    def _elapsed_ms() -> float:
        # Milliseconds since the check was started.
        return (datetime.now(UTC) - started).total_seconds() * 1000

    try:
        outcome = await check_fn()
    except Exception as e:
        # A raised exception maps to an unhealthy component.
        return ComponentStatus(
            name=name,
            status=ComponentStatusType.UNHEALTHY,
            message=f"Error: {str(e)}",
            response_time_ms=_elapsed_ms(),
        )

    elapsed = _elapsed_ms()
    if isinstance(outcome, ComponentStatus):
        outcome.response_time_ms = elapsed
        return outcome

    # Tolerate checks that return a truthy/falsy value instead of a model.
    healthy = bool(outcome)
    return ComponentStatus(
        name=name,
        status=(
            ComponentStatusType.HEALTHY if healthy
            else ComponentStatusType.UNHEALTHY
        ),
        message="OK" if healthy else "Failed",
        response_time_ms=elapsed,
    )
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def _get_system_info() -> dict[str, Any]:
    """
    Collect general runtime information.

    Returns:
        Dict with "python_version", "platform" ("windows"/"unix") and
        "containerized" ("docker"/"false"), or {"error": ...} if
        collection fails.
    """
    try:
        return {
            "python_version": (
                f"{sys.version_info.major}."
                f"{sys.version_info.minor}."
                f"{sys.version_info.micro}"
            ),
            # BUG FIX: psutil.WINDOWS is a bool, so the previous expression
            # `psutil.WINDOWS if psutil.WINDOWS else "unix"` put the literal
            # True in the dict on Windows. Map the flag to an explicit name.
            "platform": "windows" if psutil.WINDOWS else "unix",
            # Presence of /.dockerenv is the conventional Docker sentinel.
            "containerized": "docker" if os.path.exists("/.dockerenv") else "false",
        }
    except Exception as e:
        logger.warning(f"Failed to get system info: {e}")
        return {"error": str(e)}
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
async def _check_memory() -> ComponentStatus:
    """Report current memory utilisation against the configured threshold."""
    try:
        # psutil call runs off the event-loop thread to avoid blocking.
        memory = await asyncio.to_thread(psutil.virtual_memory)
        percent = memory.percent

        threshold = settings.MEMORY_THRESHOLD_PERCENT
        # 80% of the unhealthy threshold marks the warning band.
        if percent >= threshold:
            level = ComponentStatusType.UNHEALTHY
        elif percent >= threshold * 0.8:
            level = ComponentStatusType.WARNING
        else:
            level = ComponentStatusType.HEALTHY

        return ComponentStatus(
            name="memory",
            status=level,
            message=f"Memory usage: {percent:.1f}%",
            response_time_ms=None,
            metadata={
                "percent_used": percent,
                "total_gb": round(memory.total / (1024**3), 2),
                "available_gb": round(memory.available / (1024**3), 2),
                "threshold_percent": threshold,
            },
        )
    except Exception as e:
        return ComponentStatus(
            name="memory",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Failed to check memory: {e}",
            response_time_ms=None,
        )
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
async def _check_disk_space() -> ComponentStatus:
    """Report root-filesystem usage against the configured threshold."""
    try:
        # psutil call runs off the event-loop thread to avoid blocking.
        disk = await asyncio.to_thread(psutil.disk_usage, "/")
        percent = (disk.used / disk.total) * 100

        threshold = settings.DISK_THRESHOLD_PERCENT
        # 80% of the unhealthy threshold marks the warning band.
        if percent >= threshold:
            level = ComponentStatusType.UNHEALTHY
        elif percent >= threshold * 0.8:
            level = ComponentStatusType.WARNING
        else:
            level = ComponentStatusType.HEALTHY

        return ComponentStatus(
            name="disk",
            status=level,
            message=f"Disk usage: {percent:.1f}%",
            response_time_ms=None,
            metadata={
                "percent_used": percent,
                "total_gb": round(disk.total / (1024**3), 2),
                "free_gb": round(disk.free / (1024**3), 2),
                "threshold_percent": threshold,
            },
        )
    except Exception as e:
        return ComponentStatus(
            name="disk",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Failed to check disk space: {e}",
            response_time_ms=None,
        )
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
async def _check_cpu_usage() -> ComponentStatus:
    """Report an instantaneous CPU-usage sample against the configured threshold."""
    try:
        # interval=None gives a non-blocking sample (usage since last call).
        percent = await asyncio.to_thread(psutil.cpu_percent, None)

        threshold = settings.CPU_THRESHOLD_PERCENT
        # 80% of the unhealthy threshold marks the warning band.
        if percent >= threshold:
            level = ComponentStatusType.UNHEALTHY
        elif percent >= threshold * 0.8:
            level = ComponentStatusType.WARNING
        else:
            level = ComponentStatusType.HEALTHY

        return ComponentStatus(
            name="cpu",
            status=level,
            message=f"CPU usage: {percent:.1f}%",
            response_time_ms=None,
            metadata={
                "percent_used": percent,
                "cpu_count": psutil.cpu_count(),
                "threshold_percent": threshold,
            },
        )
    except Exception as e:
        return ComponentStatus(
            name="cpu",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Failed to check CPU usage: {e}",
            response_time_ms=None,
        )
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
{% if cookiecutter.include_redis == "yes" %}
|
|
493
|
+
async def check_cache_health() -> ComponentStatus:
|
|
494
|
+
"""
|
|
495
|
+
Check cache connectivity and basic functionality.
|
|
496
|
+
|
|
497
|
+
Returns:
|
|
498
|
+
ComponentStatus indicating cache health
|
|
499
|
+
"""
|
|
500
|
+
try:
|
|
501
|
+
import redis.asyncio as aioredis
|
|
502
|
+
|
|
503
|
+
# Create Redis connection with timeout
|
|
504
|
+
redis_connection = aioredis.from_url( # type: ignore[no-untyped-call]
|
|
505
|
+
settings.REDIS_URL,
|
|
506
|
+
db=settings.REDIS_DB,
|
|
507
|
+
socket_timeout=settings.HEALTH_CHECK_TIMEOUT_SECONDS,
|
|
508
|
+
socket_connect_timeout=settings.HEALTH_CHECK_TIMEOUT_SECONDS,
|
|
509
|
+
)
|
|
510
|
+
redis_client: aioredis.Redis = cast(aioredis.Redis, redis_connection)
|
|
511
|
+
|
|
512
|
+
start_time = datetime.now(UTC)
|
|
513
|
+
|
|
514
|
+
# Test basic connectivity with ping
|
|
515
|
+
await redis_client.ping()
|
|
516
|
+
|
|
517
|
+
# Test basic set/get functionality
|
|
518
|
+
test_key = "health_check:test"
|
|
519
|
+
test_value = f"test_{start_time.timestamp()}"
|
|
520
|
+
await redis_client.set(test_key, test_value, ex=10) # Expire in 10 seconds
|
|
521
|
+
retrieved_value = await redis_client.get(test_key)
|
|
522
|
+
|
|
523
|
+
# Cleanup test key
|
|
524
|
+
await redis_client.delete(test_key)
|
|
525
|
+
await redis_client.aclose()
|
|
526
|
+
|
|
527
|
+
# Verify test worked
|
|
528
|
+
if retrieved_value.decode() != test_value:
|
|
529
|
+
raise Exception("Redis set/get test failed")
|
|
530
|
+
|
|
531
|
+
# Get Redis info for metadata
|
|
532
|
+
redis_info_connection = aioredis.from_url( # type: ignore[no-untyped-call]
|
|
533
|
+
settings.REDIS_URL, db=settings.REDIS_DB
|
|
534
|
+
)
|
|
535
|
+
redis_info_client: aioredis.Redis = cast(aioredis.Redis, redis_info_connection)
|
|
536
|
+
info = await redis_info_client.info()
|
|
537
|
+
await redis_info_client.aclose()
|
|
538
|
+
|
|
539
|
+
return ComponentStatus(
|
|
540
|
+
name="cache",
|
|
541
|
+
status=ComponentStatusType.HEALTHY,
|
|
542
|
+
message="Redis cache connection and operations successful",
|
|
543
|
+
response_time_ms=None, # Will be set by caller
|
|
544
|
+
metadata={
|
|
545
|
+
"implementation": "redis",
|
|
546
|
+
"version": info.get("redis_version", "unknown"),
|
|
547
|
+
"connected_clients": info.get("connected_clients", 0),
|
|
548
|
+
"used_memory_human": info.get("used_memory_human", "unknown"),
|
|
549
|
+
"uptime_in_seconds": info.get("uptime_in_seconds", 0),
|
|
550
|
+
"url": settings.REDIS_URL,
|
|
551
|
+
"db": settings.REDIS_DB,
|
|
552
|
+
},
|
|
553
|
+
)
|
|
554
|
+
|
|
555
|
+
except ImportError:
|
|
556
|
+
return ComponentStatus(
|
|
557
|
+
name="cache",
|
|
558
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
559
|
+
message="Cache library not installed",
|
|
560
|
+
response_time_ms=None,
|
|
561
|
+
metadata={
|
|
562
|
+
"implementation": "redis",
|
|
563
|
+
"error": "Redis library not available",
|
|
564
|
+
},
|
|
565
|
+
)
|
|
566
|
+
except Exception as e:
|
|
567
|
+
return ComponentStatus(
|
|
568
|
+
name="cache",
|
|
569
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
570
|
+
message=f"Cache health check failed: {str(e)}",
|
|
571
|
+
response_time_ms=None,
|
|
572
|
+
metadata={
|
|
573
|
+
"implementation": "redis",
|
|
574
|
+
"url": settings.REDIS_URL,
|
|
575
|
+
"db": settings.REDIS_DB,
|
|
576
|
+
"error": str(e),
|
|
577
|
+
},
|
|
578
|
+
)
|
|
579
|
+
{% endif %}
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
{% if cookiecutter.include_database == "yes" %}
|
|
583
|
+
async def check_database_health() -> ComponentStatus:
|
|
584
|
+
"""
|
|
585
|
+
Check database connectivity and basic functionality.
|
|
586
|
+
|
|
587
|
+
Returns:
|
|
588
|
+
ComponentStatus indicating database health
|
|
589
|
+
"""
|
|
590
|
+
try:
|
|
591
|
+
# Import db_session from generated project
|
|
592
|
+
from app.core.db import db_session
|
|
593
|
+
from pathlib import Path
|
|
594
|
+
|
|
595
|
+
# Check if database file exists for SQLite
|
|
596
|
+
db_url = settings.DATABASE_URL
|
|
597
|
+
if db_url.startswith("sqlite:///"):
|
|
598
|
+
# Extract path from SQLite URL
|
|
599
|
+
db_path = db_url.replace("sqlite:///", "").lstrip("./")
|
|
600
|
+
|
|
601
|
+
# Check if database file exists
|
|
602
|
+
if not Path(db_path).exists():
|
|
603
|
+
return ComponentStatus(
|
|
604
|
+
name="database",
|
|
605
|
+
status=ComponentStatusType.WARNING,
|
|
606
|
+
message="Database not initialized - file does not exist",
|
|
607
|
+
response_time_ms=None,
|
|
608
|
+
metadata={
|
|
609
|
+
"implementation": "sqlite",
|
|
610
|
+
"database_exists": False,
|
|
611
|
+
"expected_path": db_path,
|
|
612
|
+
"url": settings.DATABASE_URL,
|
|
613
|
+
"recommendation": (
|
|
614
|
+
"Run database migrations or create database file"
|
|
615
|
+
),
|
|
616
|
+
},
|
|
617
|
+
)
|
|
618
|
+
|
|
619
|
+
# Test database connection with simple query and collect enhanced metadata
|
|
620
|
+
enhanced_metadata = {
|
|
621
|
+
"implementation": "sqlite",
|
|
622
|
+
"url": settings.DATABASE_URL,
|
|
623
|
+
"database_exists": True,
|
|
624
|
+
"engine_echo": settings.DATABASE_ENGINE_ECHO,
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
# Collect additional metadata for SQLite databases
|
|
628
|
+
if db_url.startswith("sqlite:///"):
|
|
629
|
+
try:
|
|
630
|
+
# Add SQLite version
|
|
631
|
+
enhanced_metadata["version"] = sqlite3.sqlite_version
|
|
632
|
+
|
|
633
|
+
# Extract and add file size information
|
|
634
|
+
db_path = db_url.replace("sqlite:///", "").lstrip("./")
|
|
635
|
+
if Path(db_path).exists():
|
|
636
|
+
file_size = Path(db_path).stat().st_size
|
|
637
|
+
enhanced_metadata["file_size_bytes"] = file_size
|
|
638
|
+
enhanced_metadata["file_size_human"] = format_bytes(file_size)
|
|
639
|
+
|
|
640
|
+
# Get engine and connection pool information
|
|
641
|
+
from app.core.db import engine
|
|
642
|
+
if hasattr(engine.pool, 'size'):
|
|
643
|
+
enhanced_metadata["connection_pool_size"] = engine.pool.size()
|
|
644
|
+
else:
|
|
645
|
+
# SQLite typically uses NullPool or StaticPool with size 1
|
|
646
|
+
enhanced_metadata["connection_pool_size"] = 1
|
|
647
|
+
|
|
648
|
+
except Exception as e:
|
|
649
|
+
# If any enhanced metadata collection fails, log but don't break
|
|
650
|
+
# health check
|
|
651
|
+
logger.debug(
|
|
652
|
+
"Failed to collect enhanced database metadata", exc_info=True
|
|
653
|
+
)
|
|
654
|
+
|
|
655
|
+
# Test database connection and collect PRAGMA settings
|
|
656
|
+
with db_session(autocommit=False) as session:
|
|
657
|
+
# Execute a simple query to test connectivity
|
|
658
|
+
from sqlalchemy import text
|
|
659
|
+
session.execute(text("SELECT 1"))
|
|
660
|
+
|
|
661
|
+
# Collect SQLite PRAGMA settings for additional metadata
|
|
662
|
+
if db_url.startswith("sqlite:///"):
|
|
663
|
+
try:
|
|
664
|
+
pragma_settings = {}
|
|
665
|
+
|
|
666
|
+
# Get foreign keys setting
|
|
667
|
+
result = session.execute(text("PRAGMA foreign_keys")).fetchone()
|
|
668
|
+
if result:
|
|
669
|
+
pragma_settings["foreign_keys"] = bool(result[0])
|
|
670
|
+
|
|
671
|
+
# Get journal mode
|
|
672
|
+
result = session.execute(text("PRAGMA journal_mode")).fetchone()
|
|
673
|
+
if result:
|
|
674
|
+
journal_mode = result[0].lower()
|
|
675
|
+
pragma_settings["journal_mode"] = journal_mode
|
|
676
|
+
enhanced_metadata["wal_enabled"] = journal_mode == "wal"
|
|
677
|
+
|
|
678
|
+
# Add cache size if available
|
|
679
|
+
result = session.execute(text("PRAGMA cache_size")).fetchone()
|
|
680
|
+
if result:
|
|
681
|
+
pragma_settings["cache_size"] = result[0]
|
|
682
|
+
|
|
683
|
+
enhanced_metadata["pragma_settings"] = pragma_settings
|
|
684
|
+
|
|
685
|
+
except Exception as e:
|
|
686
|
+
# PRAGMA queries can fail in some SQLite configurations
|
|
687
|
+
logger.debug(
|
|
688
|
+
"Failed to collect SQLite PRAGMA settings", exc_info=True
|
|
689
|
+
)
|
|
690
|
+
|
|
691
|
+
# No need to commit since we're just testing connectivity
|
|
692
|
+
|
|
693
|
+
return ComponentStatus(
|
|
694
|
+
name="database",
|
|
695
|
+
status=ComponentStatusType.HEALTHY,
|
|
696
|
+
message="Database connection successful",
|
|
697
|
+
response_time_ms=None, # Will be set by caller
|
|
698
|
+
metadata=enhanced_metadata,
|
|
699
|
+
)
|
|
700
|
+
|
|
701
|
+
except ImportError:
|
|
702
|
+
return ComponentStatus(
|
|
703
|
+
name="database",
|
|
704
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
705
|
+
message="Database module not available",
|
|
706
|
+
response_time_ms=None,
|
|
707
|
+
metadata={
|
|
708
|
+
"implementation": "sqlite",
|
|
709
|
+
"error": "Database module not imported or configured",
|
|
710
|
+
},
|
|
711
|
+
)
|
|
712
|
+
except Exception as e:
|
|
713
|
+
# Check if it's a file not found error
|
|
714
|
+
error_str = str(e).lower()
|
|
715
|
+
if "unable to open database file" in error_str or "no such file" in error_str:
|
|
716
|
+
return ComponentStatus(
|
|
717
|
+
name="database",
|
|
718
|
+
status=ComponentStatusType.WARNING,
|
|
719
|
+
message="Database file not accessible",
|
|
720
|
+
response_time_ms=None,
|
|
721
|
+
metadata={
|
|
722
|
+
"implementation": "sqlite",
|
|
723
|
+
"url": settings.DATABASE_URL,
|
|
724
|
+
"error": str(e),
|
|
725
|
+
"recommendation": "Check database file path and permissions",
|
|
726
|
+
},
|
|
727
|
+
)
|
|
728
|
+
|
|
729
|
+
return ComponentStatus(
|
|
730
|
+
name="database",
|
|
731
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
732
|
+
message=f"Database connection failed: {str(e)}",
|
|
733
|
+
response_time_ms=None,
|
|
734
|
+
metadata={
|
|
735
|
+
"implementation": "sqlite",
|
|
736
|
+
"url": settings.DATABASE_URL,
|
|
737
|
+
"error": str(e),
|
|
738
|
+
},
|
|
739
|
+
)
|
|
740
|
+
{% endif %}
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
{% if cookiecutter.include_worker == "yes" %}
|
|
744
|
+
async def check_worker_health() -> ComponentStatus:
|
|
745
|
+
"""
|
|
746
|
+
Check arq worker status using arq's native health checks and queue configuration.
|
|
747
|
+
|
|
748
|
+
Returns:
|
|
749
|
+
ComponentStatus indicating worker infrastructure health with queue
|
|
750
|
+
sub-components
|
|
751
|
+
"""
|
|
752
|
+
try:
|
|
753
|
+
import re
|
|
754
|
+
|
|
755
|
+
import redis.asyncio as aioredis
|
|
756
|
+
|
|
757
|
+
# Create Redis connection
|
|
758
|
+
# Step 1: Call untyped function with explicit ignore
|
|
759
|
+
redis_connection = aioredis.from_url( # type: ignore[no-untyped-call]
|
|
760
|
+
settings.REDIS_URL,
|
|
761
|
+
db=settings.REDIS_DB
|
|
762
|
+
)
|
|
763
|
+
# Step 2: Cast the result to proper type
|
|
764
|
+
redis_client: aioredis.Redis = cast(aioredis.Redis, redis_connection)
|
|
765
|
+
|
|
766
|
+
# Get queue metadata from WorkerSettings classes via dynamic discovery
|
|
767
|
+
from app.components.worker.registry import get_all_queue_metadata
|
|
768
|
+
functional_queues = get_all_queue_metadata()
|
|
769
|
+
|
|
770
|
+
# Check each queue and create sub-components
|
|
771
|
+
queue_sub_components = {}
|
|
772
|
+
total_queued = 0
|
|
773
|
+
total_completed = 0
|
|
774
|
+
total_failed = 0
|
|
775
|
+
total_retried = 0
|
|
776
|
+
total_ongoing = 0
|
|
777
|
+
overall_healthy = True
|
|
778
|
+
active_workers = 0
|
|
779
|
+
|
|
780
|
+
for queue_type, queue_config in functional_queues.items():
|
|
781
|
+
queue_name = queue_config["queue_name"]
|
|
782
|
+
|
|
783
|
+
try:
|
|
784
|
+
# Get queue length (actual queued jobs)
|
|
785
|
+
queue_length_result = redis_client.llen(queue_name)
|
|
786
|
+
if hasattr(queue_length_result, '__await__'):
|
|
787
|
+
queue_length = await queue_length_result
|
|
788
|
+
else:
|
|
789
|
+
queue_length = queue_length_result
|
|
790
|
+
total_queued += queue_length
|
|
791
|
+
|
|
792
|
+
# Look for arq health check key for this queue
|
|
793
|
+
# arq health check key format: {queue_name}:health-check
|
|
794
|
+
health_check_key = f"{queue_name}:health-check"
|
|
795
|
+
health_check_data = await redis_client.get(health_check_key)
|
|
796
|
+
|
|
797
|
+
# Parse arq health check data if available
|
|
798
|
+
j_complete = j_failed = j_retried = j_ongoing = 0
|
|
799
|
+
worker_alive = False
|
|
800
|
+
last_health_check = None
|
|
801
|
+
|
|
802
|
+
if health_check_data:
|
|
803
|
+
health_string = health_check_data.decode()
|
|
804
|
+
# Parse format: "Mar-01 17:41:22 j_complete=0 j_failed=0 ..."
|
|
805
|
+
logger.debug(
|
|
806
|
+
f"Raw health check data for {queue_type}: {health_string}"
|
|
807
|
+
)
|
|
808
|
+
|
|
809
|
+
# Extract timestamp (first part before job stats)
|
|
810
|
+
timestamp_match = re.match(r"^(\w+-\d+ \d+:\d+:\d+)", health_string)
|
|
811
|
+
if timestamp_match:
|
|
812
|
+
last_health_check = timestamp_match.group(1)
|
|
813
|
+
|
|
814
|
+
# Extract job statistics using regex
|
|
815
|
+
j_complete_match = re.search(r"j_complete=(\d+)", health_string)
|
|
816
|
+
j_failed_match = re.search(r"j_failed=(\d+)", health_string)
|
|
817
|
+
j_retried_match = re.search(r"j_retried=(\d+)", health_string)
|
|
818
|
+
j_ongoing_match = re.search(r"j_ongoing=(\d+)", health_string)
|
|
819
|
+
|
|
820
|
+
if j_complete_match:
|
|
821
|
+
j_complete = int(j_complete_match.group(1))
|
|
822
|
+
total_completed += j_complete
|
|
823
|
+
if j_failed_match:
|
|
824
|
+
j_failed = int(j_failed_match.group(1))
|
|
825
|
+
total_failed += j_failed
|
|
826
|
+
if j_retried_match:
|
|
827
|
+
j_retried = int(j_retried_match.group(1))
|
|
828
|
+
total_retried += j_retried
|
|
829
|
+
if j_ongoing_match:
|
|
830
|
+
j_ongoing = int(j_ongoing_match.group(1))
|
|
831
|
+
total_ongoing += j_ongoing
|
|
832
|
+
|
|
833
|
+
worker_alive = True
|
|
834
|
+
active_workers += 1
|
|
835
|
+
|
|
836
|
+
# Create queue status message
|
|
837
|
+
status_parts = []
|
|
838
|
+
if not worker_alive:
|
|
839
|
+
status_parts.append("worker offline - no health check data")
|
|
840
|
+
elif j_ongoing > 0:
|
|
841
|
+
status_parts.append(f"{j_ongoing} processing")
|
|
842
|
+
elif queue_length > 0:
|
|
843
|
+
status_parts.append(f"{queue_length} queued")
|
|
844
|
+
else:
|
|
845
|
+
status_parts.append("idle")
|
|
846
|
+
|
|
847
|
+
# Add job statistics to status if worker is alive
|
|
848
|
+
if worker_alive and (j_complete > 0 or j_failed > 0):
|
|
849
|
+
if j_failed > 0:
|
|
850
|
+
failure_rate = (j_failed / max(j_complete + j_failed, 1)) * 100
|
|
851
|
+
status_parts.append(f"{j_failed} failed ({failure_rate:.1f}%)")
|
|
852
|
+
if j_complete > 0:
|
|
853
|
+
status_parts.append(f"{j_complete} completed")
|
|
854
|
+
|
|
855
|
+
# Check if queue has no functions configured (empty functions list)
|
|
856
|
+
queue_functions = queue_config.get("functions", [])
|
|
857
|
+
has_functions = len(queue_functions) > 0
|
|
858
|
+
|
|
859
|
+
# Determine queue status based on worker health and failure rate
|
|
860
|
+
failure_rate = (
|
|
861
|
+
(j_failed / max(j_complete + j_failed, 1)) * 100
|
|
862
|
+
if worker_alive
|
|
863
|
+
else 100
|
|
864
|
+
)
|
|
865
|
+
|
|
866
|
+
if not worker_alive and not has_functions:
|
|
867
|
+
# Queue configured but no functions - show as INFO
|
|
868
|
+
queue_status = ComponentStatusType.INFO
|
|
869
|
+
status_parts = ["configured - no functions defined"]
|
|
870
|
+
elif not worker_alive:
|
|
871
|
+
queue_status = ComponentStatusType.UNHEALTHY
|
|
872
|
+
elif failure_rate > 25: # Unhealthy threshold at 25%
|
|
873
|
+
queue_status = ComponentStatusType.UNHEALTHY
|
|
874
|
+
elif failure_rate > 10: # Warning threshold at 10%
|
|
875
|
+
queue_status = ComponentStatusType.WARNING
|
|
876
|
+
else:
|
|
877
|
+
queue_status = ComponentStatusType.HEALTHY
|
|
878
|
+
|
|
879
|
+
queue_message = (
|
|
880
|
+
f"{queue_config['description']}: {', '.join(status_parts)}"
|
|
881
|
+
)
|
|
882
|
+
|
|
883
|
+
# Update overall health based on this queue
|
|
884
|
+
if queue_status == ComponentStatusType.UNHEALTHY:
|
|
885
|
+
overall_healthy = False
|
|
886
|
+
|
|
887
|
+
queue_metadata = {
|
|
888
|
+
"queue_type": queue_type,
|
|
889
|
+
"queue_name": queue_name,
|
|
890
|
+
"queued_jobs": queue_length,
|
|
891
|
+
"max_concurrency": queue_config["max_jobs"],
|
|
892
|
+
"timeout_seconds": queue_config["timeout"],
|
|
893
|
+
"description": queue_config["description"],
|
|
894
|
+
"worker_alive": worker_alive,
|
|
895
|
+
"health_check_key": health_check_key,
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
# Add arq health check statistics if available
|
|
899
|
+
if worker_alive:
|
|
900
|
+
queue_metadata.update(
|
|
901
|
+
{
|
|
902
|
+
"jobs_completed": j_complete,
|
|
903
|
+
"jobs_failed": j_failed,
|
|
904
|
+
"jobs_retried": j_retried,
|
|
905
|
+
"jobs_ongoing": j_ongoing,
|
|
906
|
+
"failure_rate_percent": round(failure_rate, 1),
|
|
907
|
+
"last_health_check": last_health_check,
|
|
908
|
+
}
|
|
909
|
+
)
|
|
910
|
+
else:
|
|
911
|
+
queue_metadata["offline_reason"] = "Health check key not found"
|
|
912
|
+
|
|
913
|
+
queue_sub_components[queue_type] = ComponentStatus(
|
|
914
|
+
name=queue_type,
|
|
915
|
+
status=queue_status,
|
|
916
|
+
message=queue_message,
|
|
917
|
+
response_time_ms=None,
|
|
918
|
+
metadata=queue_metadata,
|
|
919
|
+
sub_components={},
|
|
920
|
+
)
|
|
921
|
+
|
|
922
|
+
except aioredis.ConnectionError as e:
|
|
923
|
+
logger.error(f"Redis connection failed for {queue_type}: {e}")
|
|
924
|
+
overall_healthy = False
|
|
925
|
+
|
|
926
|
+
# Extract more specific connection error details
|
|
927
|
+
error_details = str(e).lower()
|
|
928
|
+
if "connection refused" in error_details:
|
|
929
|
+
connection_issue = "Redis server not running"
|
|
930
|
+
elif (
|
|
931
|
+
"name or service not known" in error_details
|
|
932
|
+
or "nodename nor servname" in error_details
|
|
933
|
+
):
|
|
934
|
+
connection_issue = "Redis server DNS resolution failed"
|
|
935
|
+
elif "timeout" in error_details:
|
|
936
|
+
connection_issue = "Redis server connection timeout"
|
|
937
|
+
else:
|
|
938
|
+
connection_issue = "Redis server unreachable"
|
|
939
|
+
|
|
940
|
+
queue_sub_components[queue_type] = ComponentStatus(
|
|
941
|
+
name=queue_type,
|
|
942
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
943
|
+
message=f"{connection_issue} - worker offline",
|
|
944
|
+
response_time_ms=None,
|
|
945
|
+
metadata={
|
|
946
|
+
"queue_type": queue_type,
|
|
947
|
+
"queue_name": queue_name,
|
|
948
|
+
"error_type": "redis_connection_error",
|
|
949
|
+
"error": str(e),
|
|
950
|
+
"connection_issue": connection_issue,
|
|
951
|
+
"recommendation": (
|
|
952
|
+
"Check Redis server status and network connectivity"
|
|
953
|
+
),
|
|
954
|
+
},
|
|
955
|
+
sub_components={},
|
|
956
|
+
)
|
|
957
|
+
except aioredis.ResponseError as e:
|
|
958
|
+
if "WRONGTYPE" in str(e):
|
|
959
|
+
logger.error(f"Redis data corruption for {queue_type}: {e}")
|
|
960
|
+
message = f"Redis data corruption detected"
|
|
961
|
+
recommendation = "Clear Redis cache to fix data type conflicts"
|
|
962
|
+
error_type = "redis_key_type_error"
|
|
963
|
+
else:
|
|
964
|
+
logger.error(f"Redis operation failed for {queue_type}: {e}")
|
|
965
|
+
message = f"Redis operation failed"
|
|
966
|
+
recommendation = "Check Redis configuration and permissions"
|
|
967
|
+
error_type = "redis_response_error"
|
|
968
|
+
|
|
969
|
+
overall_healthy = False
|
|
970
|
+
queue_sub_components[queue_type] = ComponentStatus(
|
|
971
|
+
name=queue_type,
|
|
972
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
973
|
+
message=message,
|
|
974
|
+
response_time_ms=None,
|
|
975
|
+
metadata={
|
|
976
|
+
"queue_type": queue_type,
|
|
977
|
+
"queue_name": queue_name,
|
|
978
|
+
"error_type": error_type,
|
|
979
|
+
"error": str(e),
|
|
980
|
+
"recommendation": recommendation,
|
|
981
|
+
},
|
|
982
|
+
sub_components={},
|
|
983
|
+
)
|
|
984
|
+
except Exception as e:
|
|
985
|
+
logger.error(
|
|
986
|
+
f"Unexpected error checking {queue_type} queue health: {e}"
|
|
987
|
+
)
|
|
988
|
+
overall_healthy = False
|
|
989
|
+
queue_sub_components[queue_type] = ComponentStatus(
|
|
990
|
+
name=queue_type,
|
|
991
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
992
|
+
message=f"Health check failed: {type(e).__name__}",
|
|
993
|
+
response_time_ms=None,
|
|
994
|
+
metadata={
|
|
995
|
+
"queue_type": queue_type,
|
|
996
|
+
"queue_name": queue_name,
|
|
997
|
+
"error_type": "unexpected_error",
|
|
998
|
+
"error": str(e),
|
|
999
|
+
"exception_class": type(e).__name__,
|
|
1000
|
+
},
|
|
1001
|
+
sub_components={},
|
|
1002
|
+
)
|
|
1003
|
+
|
|
1004
|
+
await redis_client.aclose()
|
|
1005
|
+
|
|
1006
|
+
# Create main worker status message
|
|
1007
|
+
message_parts = []
|
|
1008
|
+
if active_workers == 0:
|
|
1009
|
+
message_parts.append("No active workers")
|
|
1010
|
+
overall_healthy = False
|
|
1011
|
+
else:
|
|
1012
|
+
message_parts.append(
|
|
1013
|
+
f"{active_workers}/{len(functional_queues)} workers active"
|
|
1014
|
+
)
|
|
1015
|
+
|
|
1016
|
+
if total_queued > 0:
|
|
1017
|
+
message_parts.append(f"{total_queued} queued")
|
|
1018
|
+
if total_ongoing > 0:
|
|
1019
|
+
message_parts.append(f"{total_ongoing} processing")
|
|
1020
|
+
if total_failed > 0:
|
|
1021
|
+
failure_rate = (total_failed / max(total_completed + total_failed, 1)) * 100
|
|
1022
|
+
message_parts.append(f"{total_failed} failed ({failure_rate:.1f}%)")
|
|
1023
|
+
|
|
1024
|
+
main_message = f"arq worker infrastructure: {', '.join(message_parts)}"
|
|
1025
|
+
|
|
1026
|
+
# Create a "queues" intermediate component that contains all queue
|
|
1027
|
+
# sub-components - determine status from child statuses
|
|
1028
|
+
queue_statuses = [queue.status for queue in queue_sub_components.values()]
|
|
1029
|
+
queues_status = propagate_status(queue_statuses)
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
queues_message = f"{len(functional_queues)} functional queues configured"
|
|
1033
|
+
if active_workers < len(functional_queues):
|
|
1034
|
+
queues_message += f" ({active_workers} active)"
|
|
1035
|
+
|
|
1036
|
+
queues_component = ComponentStatus(
|
|
1037
|
+
name="queues",
|
|
1038
|
+
status=queues_status,
|
|
1039
|
+
message=queues_message,
|
|
1040
|
+
response_time_ms=None,
|
|
1041
|
+
metadata={
|
|
1042
|
+
"configured_queues": len(functional_queues),
|
|
1043
|
+
"active_workers": active_workers,
|
|
1044
|
+
"queue_types": list(functional_queues.keys()),
|
|
1045
|
+
},
|
|
1046
|
+
sub_components=queue_sub_components,
|
|
1047
|
+
)
|
|
1048
|
+
|
|
1049
|
+
# Determine worker status based on overall health and queues status
|
|
1050
|
+
if not overall_healthy:
|
|
1051
|
+
worker_status = ComponentStatusType.UNHEALTHY
|
|
1052
|
+
else:
|
|
1053
|
+
worker_status = propagate_status([queues_status])
|
|
1054
|
+
|
|
1055
|
+
return ComponentStatus(
|
|
1056
|
+
name="worker",
|
|
1057
|
+
status=worker_status,
|
|
1058
|
+
message=main_message,
|
|
1059
|
+
response_time_ms=None,
|
|
1060
|
+
metadata={
|
|
1061
|
+
"total_queued": total_queued,
|
|
1062
|
+
"total_completed": total_completed,
|
|
1063
|
+
"total_failed": total_failed,
|
|
1064
|
+
"total_retried": total_retried,
|
|
1065
|
+
"total_ongoing": total_ongoing,
|
|
1066
|
+
"overall_failure_rate_percent": round(
|
|
1067
|
+
(total_failed / max(total_completed + total_failed, 1)) * 100, 1
|
|
1068
|
+
)
|
|
1069
|
+
if total_completed + total_failed > 0
|
|
1070
|
+
else 0,
|
|
1071
|
+
"redis_url": settings.REDIS_URL,
|
|
1072
|
+
"queue_configuration": {
|
|
1073
|
+
queue_type: {
|
|
1074
|
+
"description": config["description"],
|
|
1075
|
+
"max_jobs": config["max_jobs"],
|
|
1076
|
+
"timeout_seconds": config["timeout"],
|
|
1077
|
+
}
|
|
1078
|
+
for queue_type, config in functional_queues.items()
|
|
1079
|
+
},
|
|
1080
|
+
},
|
|
1081
|
+
sub_components={"queues": queues_component},
|
|
1082
|
+
)
|
|
1083
|
+
|
|
1084
|
+
except ImportError:
|
|
1085
|
+
return ComponentStatus(
|
|
1086
|
+
name="worker",
|
|
1087
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
1088
|
+
message="Redis library not available for worker health check",
|
|
1089
|
+
response_time_ms=None,
|
|
1090
|
+
sub_components={},
|
|
1091
|
+
)
|
|
1092
|
+
except Exception as e:
|
|
1093
|
+
logger.error(f"Worker health check failed: {e}")
|
|
1094
|
+
return ComponentStatus(
|
|
1095
|
+
name="worker",
|
|
1096
|
+
status=ComponentStatusType.UNHEALTHY,
|
|
1097
|
+
message=f"Worker health check failed: {str(e)}",
|
|
1098
|
+
response_time_ms=None,
|
|
1099
|
+
metadata={
|
|
1100
|
+
"error": str(e),
|
|
1101
|
+
"redis_url": settings.REDIS_URL,
|
|
1102
|
+
},
|
|
1103
|
+
sub_components={},
|
|
1104
|
+
)
|
|
1105
|
+
{% endif %}
|