aegis-stack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aegis-stack might be problematic. Click here for more details.

Files changed (103) hide show
  1. aegis/__init__.py +5 -0
  2. aegis/__main__.py +374 -0
  3. aegis/core/CLAUDE.md +365 -0
  4. aegis/core/__init__.py +6 -0
  5. aegis/core/components.py +115 -0
  6. aegis/core/dependency_resolver.py +119 -0
  7. aegis/core/template_generator.py +163 -0
  8. aegis/templates/CLAUDE.md +306 -0
  9. aegis/templates/cookiecutter-aegis-project/cookiecutter.json +27 -0
  10. aegis/templates/cookiecutter-aegis-project/hooks/post_gen_project.py +172 -0
  11. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/.dockerignore +71 -0
  12. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/.env.example.j2 +70 -0
  13. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/.gitignore +127 -0
  14. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/Dockerfile +53 -0
  15. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/Makefile +211 -0
  16. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/README.md.j2 +196 -0
  17. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/__init__.py +5 -0
  18. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/__init__.py +6 -0
  19. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/health.py +321 -0
  20. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/load_test.py +638 -0
  21. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/cli/main.py +41 -0
  22. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/__init__.py +0 -0
  23. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/__init__.py +0 -0
  24. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/health.py +134 -0
  25. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/models.py.j2 +247 -0
  26. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/routing.py.j2 +14 -0
  27. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/api/tasks.py.j2 +596 -0
  28. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/hooks.py +133 -0
  29. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/main.py +16 -0
  30. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/middleware/__init__.py +1 -0
  31. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/middleware/cors.py +20 -0
  32. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/shutdown/__init__.py +1 -0
  33. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/shutdown/cleanup.py +14 -0
  34. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/startup/__init__.py +1 -0
  35. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/backend/startup/component_health.py.j2 +190 -0
  36. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/__init__.py +0 -0
  37. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/core/__init__.py +1 -0
  38. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/core/theme.py +46 -0
  39. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/frontend/main.py +687 -0
  40. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/scheduler/__init__.py +1 -0
  41. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/scheduler/main.py +138 -0
  42. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/CLAUDE.md +213 -0
  43. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/__init__.py +6 -0
  44. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/constants.py.j2 +30 -0
  45. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/pools.py +78 -0
  46. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/__init__.py +1 -0
  47. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/load_test.py +48 -0
  48. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/media.py +41 -0
  49. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/queues/system.py +36 -0
  50. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/registry.py +139 -0
  51. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/__init__.py +119 -0
  52. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/load_tasks.py +526 -0
  53. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/simple_system_tasks.py +32 -0
  54. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/components/worker/tasks/system_tasks.py +279 -0
  55. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/config.py.j2 +119 -0
  56. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/constants.py +60 -0
  57. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/db.py +67 -0
  58. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/core/log.py +85 -0
  59. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/entrypoints/__init__.py +1 -0
  60. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/entrypoints/webserver.py +40 -0
  61. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/entrypoints/{% if cookiecutter.include_scheduler == /"yes/" %}scheduler.py{% endif %}" +21 -0
  62. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/integrations/__init__.py +0 -0
  63. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/integrations/main.py +61 -0
  64. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/py.typed +0 -0
  65. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/__init__.py +1 -0
  66. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/load_test.py +661 -0
  67. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/load_test_models.py +269 -0
  68. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/shared/__init__.py +15 -0
  69. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/shared/models.py +26 -0
  70. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/__init__.py +52 -0
  71. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/alerts.py +94 -0
  72. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/health.py.j2 +1105 -0
  73. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/models.py +169 -0
  74. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/app/services/system/ui.py +52 -0
  75. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docker-compose.yml.j2 +195 -0
  76. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/api.md +191 -0
  77. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/components/scheduler.md +414 -0
  78. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/development.md +215 -0
  79. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/health.md +240 -0
  80. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/javascripts/mermaid-config.js +62 -0
  81. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/docs/stylesheets/mermaid.css +95 -0
  82. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/mkdocs.yml.j2 +62 -0
  83. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/pyproject.toml.j2 +156 -0
  84. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/scripts/entrypoint.sh +87 -0
  85. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/scripts/entrypoint.sh.j2 +104 -0
  86. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/scripts/gen_docs.py +16 -0
  87. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/api/__init__.py +1 -0
  88. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/api/test_health_endpoints.py.j2 +239 -0
  89. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/components/test_scheduler.py +76 -0
  90. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/conftest.py.j2 +81 -0
  91. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/__init__.py +1 -0
  92. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_component_integration.py.j2 +376 -0
  93. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_health_logic.py.j2 +633 -0
  94. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_load_test_models.py +665 -0
  95. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_load_test_service.py +602 -0
  96. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_system_service.py +96 -0
  97. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/services/test_worker_health_registration.py.j2 +224 -0
  98. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/tests/test_core.py +50 -0
  99. aegis/templates/cookiecutter-aegis-project/{{cookiecutter.project_slug}}/uv.lock +1673 -0
  100. aegis_stack-0.1.0.dist-info/METADATA +114 -0
  101. aegis_stack-0.1.0.dist-info/RECORD +103 -0
  102. aegis_stack-0.1.0.dist-info/WHEEL +4 -0
  103. aegis_stack-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,1105 @@
1
+ """
2
+ System health monitoring functions.
3
+
4
+ Pure functions for system health checking, monitoring, and status reporting.
5
+ All functions use Pydantic models for type safety and validation.
6
+ """
7
+
8
+ import asyncio
9
+ from collections.abc import Awaitable, Callable
10
+ from datetime import UTC, datetime
11
+ import os
12
+ import sqlite3
13
+ import sys
14
+ from typing import Any, cast
15
+
16
+ import psutil
17
+
18
+ from app.core.config import settings
19
+ from app.core.log import logger
20
+
21
+ from .alerts import send_critical_alert, send_health_alert
22
+ from .models import ComponentStatus, ComponentStatusType, SystemStatus
23
+
24
# Global registry for custom health checks, keyed by check name.
# Entries are added by register_health_check() and every registered check is
# run by get_system_status() as a top-level component.
_health_checks: dict[str, Callable[[], Awaitable[ComponentStatus]]] = {}
26
+
27
+
28
def format_bytes(size: int) -> str:
    """Format a byte count as a human-readable string.

    Values below 1024 are shown as whole bytes ("512 B"). Larger values are
    divided by 1024 per unit step and shown with one decimal place
    ("1.5 KB"). Anything of 1024 GB or more is reported in TB, without
    further scaling.

    Negative sizes are scaled by their magnitude and keep their sign, so
    ``format_bytes(-2048)`` gives ``"-2.0 KB"`` instead of ``"-2048 B"``.

    Args:
        size: Number of bytes.

    Returns:
        The formatted size followed by its unit suffix.
    """
    if size == 0:
        return "0 B"

    # Scale the magnitude only; comparing a negative value against 1024
    # would always stop at the first unit.
    sign = "-" if size < 0 else ""
    magnitude = float(abs(size))

    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            if unit == "B":
                # Plain bytes are shown without a fractional part.
                return f"{sign}{int(magnitude)} {unit}"
            return f"{sign}{magnitude:.1f} {unit}"
        magnitude /= 1024.0
    return f"{sign}{magnitude:.1f} TB"
42
+
43
+
44
def propagate_status(child_statuses: list[ComponentStatusType]) -> ComponentStatusType:
    """
    Derive a parent's status from the statuses of its children.

    The most severe status present among the children wins:
        1. UNHEALTHY
        2. WARNING
        3. INFO
        4. HEALTHY (also used when there are no children, or when none of
           the statuses above is present)

    Args:
        child_statuses: List of ComponentStatusType from child components

    Returns:
        ComponentStatusType for the parent component
    """
    # Walk the severities from worst to mildest; the first one found wins.
    for severity in (
        ComponentStatusType.UNHEALTHY,
        ComponentStatusType.WARNING,
        ComponentStatusType.INFO,
    ):
        if any(child == severity for child in child_statuses):
            return severity

    # Empty input, all-healthy input and any other value fall through here.
    return ComponentStatusType.HEALTHY
73
+
74
# Cache for system metrics to improve performance.
# Maps a metric name to (result, timestamp). Entries are written by
# _run_health_check_with_cache() and reused by _get_cached_system_metrics()
# while they are younger than settings.SYSTEM_METRICS_CACHE_SECONDS.
_system_metrics_cache: dict[str, tuple[ComponentStatus, datetime]] = {}
76
+
77
+
78
def register_health_check(
    name: str, check_fn: Callable[[], Awaitable[ComponentStatus]]
) -> None:
    """
    Add a custom health check to the global registry.

    Registering a name that already exists replaces the earlier check.

    Args:
        name: Unique name for the health check
        check_fn: Async callable producing a ComponentStatus; any other
            return value is interpreted by its truthiness when the check
            is run
    """
    _health_checks.update({name: check_fn})
    logger.info(f"Registered custom health check: {name}")
90
+
91
+
92
async def get_system_status() -> SystemStatus:
    """
    Collect the health of every component into one SystemStatus.

    Registered custom checks become top-level components. The memory, disk
    and cpu metrics are attached as sub-components of "backend" (a virtual
    backend is created when none is registered), and everything is nested
    under a single "aegis" root component.

    Returns:
        SystemStatus with all component health information organized as Aegis tree
    """
    logger.info("Running system health checks")
    start_time = datetime.now(UTC)

    # Start every registered check first so they run concurrently.
    component_results = {}
    pending_checks = [
        (check_name, asyncio.create_task(_run_health_check(check_name, check)))
        for check_name, check in _health_checks.items()
    ]

    # Await them in registration order.
    for check_name, pending in pending_checks:
        try:
            component_results[check_name] = await pending
        except Exception as e:
            logger.error(f"Component check failed for {check_name}: {e}")
            component_results[check_name] = ComponentStatus(
                name=check_name,
                status=ComponentStatusType.UNHEALTHY,
                message=f"Health check failed: {str(e)}",
                response_time_ms=None,
            )

    # System metrics (served from the cache when fresh enough).
    system_metrics = await _get_cached_system_metrics(start_time)
    metric_statuses = [
        getattr(metric, "status", ComponentStatusType.HEALTHY)
        for metric in system_metrics.values()
    ]

    if "backend" in component_results:
        # A real backend check exists: rebuild it with the metrics attached
        # and a status that also reflects those metrics.
        registered_backend = component_results["backend"]
        own_status = getattr(
            registered_backend, "status", ComponentStatusType.HEALTHY
        )
        component_results["backend"] = ComponentStatus(
            name=registered_backend.name,
            status=propagate_status(metric_statuses + [own_status]),
            message=registered_backend.message,
            response_time_ms=registered_backend.response_time_ms,
            metadata=registered_backend.metadata,
            sub_components=system_metrics,
        )
    else:
        # No backend registered: create a virtual one to hold the metrics.
        metrics_ok = all(metric.healthy for metric in system_metrics.values())
        if metrics_ok:
            virtual_message = "System container metrics"
        else:
            virtual_message = "System container has issues"

        component_results["backend"] = ComponentStatus(
            name="backend",
            status=propagate_status(metric_statuses),
            message=virtual_message,
            response_time_ms=None,
            metadata={"type": "system_container", "virtual": True},
            sub_components=system_metrics,
        )

    # Overall health covers top-level components and their direct
    # sub-components.
    everything = list(component_results.values())
    for top_level in component_results.values():
        everything.extend(top_level.sub_components.values())
    overall_healthy = all(entry.healthy for entry in everything)

    # The root's status is propagated from the top-level components only.
    root_status = propagate_status(
        [
            getattr(top_level, "status", ComponentStatusType.HEALTHY)
            for top_level in component_results.values()
        ]
    )
    if overall_healthy:
        root_message = "Aegis Stack application"
    else:
        root_message = "Aegis Stack has issues"

    aegis_root = ComponentStatus(
        name="aegis",
        status=root_status,
        message=root_message,
        response_time_ms=None,
        metadata={"type": "application_root", "version": "1.0"},
        sub_components=component_results,
    )

    status = SystemStatus(
        components={"aegis": aegis_root},
        overall_healthy=overall_healthy,
        timestamp=start_time,
        system_info=_get_system_info(),
    )

    if not overall_healthy:
        logger.warning(
            f"System unhealthy: {status.unhealthy_components}",
            extra={"unhealthy_components": status.unhealthy_components},
        )

    return status
226
+
227
+
228
async def is_system_healthy() -> bool:
    """Run a full status collection and report only the overall verdict."""
    return (await get_system_status()).overall_healthy
232
+
233
+
234
def _find_component(
    components: dict[str, ComponentStatus], name: str
) -> ComponentStatus | None:
    """Return the component called *name* from a status tree, or None.

    The given level is checked first, then each component's
    ``sub_components`` is searched depth-first.
    """
    if name in components:
        return components[name]
    for child in components.values():
        match = _find_component(child.sub_components, name)
        if match is not None:
            return match
    return None


async def check_system_status() -> None:
    """
    Scheduled health check function for use in APScheduler jobs.

    Gets the system status, logs a summary, logs each unhealthy component
    and sends a health alert when the system is not healthy. If the check
    itself fails, the error is logged and a critical alert is sent instead.
    Can be extended to send alerts to Slack, email, etc.
    """
    logger.info("🩺 Running scheduled system health check")

    try:
        status = await get_system_status()

        if status.overall_healthy:
            log_msg = (
                f"✅ System healthy: {len(status.healthy_top_level_components)}/"
                f"{status.total_components} components OK"
            )
            logger.info(log_msg)
        else:
            logger.warning(
                f"⚠️ System issues detected: "
                f"{len(status.unhealthy_components)} unhealthy components",
                extra={
                    "unhealthy_components": status.unhealthy_components,
                    "health_percentage": status.health_percentage,
                },
            )

            # Log details for each unhealthy component.
            # get_system_status() puts only the "aegis" root in
            # status.components, so a nested component must be looked up in
            # the tree. Indexing the top-level dict directly would raise
            # KeyError and turn an ordinary unhealthy report into a
            # "monitoring failed" critical alert.
            for component_name in status.unhealthy_components:
                component = _find_component(status.components, component_name)
                if component is None:
                    # NOTE(review): the naming scheme of unhealthy_components
                    # is defined in the models module; confirm names match
                    # component keys.
                    logger.error(f"❌ {component_name}: status details unavailable")
                    continue
                logger.error(
                    f"❌ {component_name}: {component.message}",
                    extra={"component": component.name, "metadata": component.metadata},
                )

            # Send health alerts
            await send_health_alert(status)

    except Exception as e:
        logger.error(f"💥 System health check failed: {e}")
        # Send critical alert about monitoring failure
        await send_critical_alert(f"Health monitoring failed: {e}", str(e))
277
+
278
+
279
async def _get_cached_system_metrics(
    current_time: datetime,
) -> dict[str, ComponentStatus]:
    """
    Return the memory, disk and cpu metrics, reusing cached results.

    A cached result is reused while its age relative to ``current_time`` is
    below settings.SYSTEM_METRICS_CACHE_SECONDS; otherwise the check is run
    again and the cache entry refreshed.

    Args:
        current_time: Reference time used for cache age and for stamping
            refreshed entries

    Returns:
        Mapping of metric name to its ComponentStatus
    """
    max_age_seconds = settings.SYSTEM_METRICS_CACHE_SECONDS
    metric_checks = {
        "memory": _check_memory,
        "disk": _check_disk_space,
        "cpu": _check_cpu_usage,
    }

    system_metrics = {}
    refreshing = []

    for metric_name, metric_check in metric_checks.items():
        cache_entry = _system_metrics_cache.get(metric_name)
        if cache_entry is not None:
            cached_status, cached_at = cache_entry
            if (current_time - cached_at).total_seconds() < max_age_seconds:
                # Still fresh: serve the cached result.
                system_metrics[metric_name] = cached_status
                continue

        # Missing or stale: schedule a fresh check.
        refreshing.append(
            (
                metric_name,
                asyncio.create_task(
                    _run_health_check_with_cache(
                        metric_name, metric_check, current_time
                    )
                ),
            )
        )

    # Gather the results of the checks that had to be re-run.
    for metric_name, refresh_task in refreshing:
        try:
            system_metrics[metric_name] = await refresh_task
        except Exception as e:
            logger.error(f"System metric check failed for {metric_name}: {e}")
            system_metrics[metric_name] = ComponentStatus(
                name=metric_name,
                status=ComponentStatusType.UNHEALTHY,
                message=f"Health check failed: {str(e)}",
                response_time_ms=None,
            )

    return system_metrics
324
+
325
+
326
async def _run_health_check_with_cache(
    name: str, check_fn: Callable[[], Awaitable[ComponentStatus]], timestamp: datetime
) -> ComponentStatus:
    """
    Run a health check and store its result in the system metrics cache.

    Args:
        name: Cache key and component name for the check
        check_fn: Async check to execute
        timestamp: Time recorded alongside the cached result

    Returns:
        The ComponentStatus produced by the check
    """
    fresh_status = await _run_health_check(name, check_fn)
    _system_metrics_cache[name] = (fresh_status, timestamp)
    return fresh_status
333
+
334
+
335
async def _run_health_check(
    name: str, check_fn: Callable[[], Awaitable[ComponentStatus]]
) -> ComponentStatus:
    """
    Execute one health check and record how long it took.

    A ComponentStatus returned by the check gets its ``response_time_ms``
    filled in. Any other return value is judged by truthiness and wrapped in
    a new ComponentStatus. An exception raised by the check is converted to
    an UNHEALTHY status carrying the error text.

    Args:
        name: Component name used when a status has to be built here
        check_fn: Async check to execute

    Returns:
        ComponentStatus including the measured response time in milliseconds
    """
    started_at = datetime.now(UTC)

    def elapsed_ms() -> float:
        # Milliseconds elapsed since the check was started.
        return (datetime.now(UTC) - started_at).total_seconds() * 1000

    try:
        outcome = await check_fn()
        duration_ms = elapsed_ms()

        if not isinstance(outcome, ComponentStatus):
            # The check returned a plain value: treat it as pass/fail.
            passed = bool(outcome)
            return ComponentStatus(
                name=name,
                status=(
                    ComponentStatusType.HEALTHY
                    if passed
                    else ComponentStatusType.UNHEALTHY
                ),
                message="OK" if passed else "Failed",
                response_time_ms=duration_ms,
            )

        outcome.response_time_ms = duration_ms
        return outcome
    except Exception as e:
        return ComponentStatus(
            name=name,
            status=ComponentStatusType.UNHEALTHY,
            message=f"Error: {str(e)}",
            response_time_ms=elapsed_ms(),
        )
367
+
368
+
369
def _get_system_info() -> dict[str, Any]:
    """
    Get general system information.

    Returns:
        A dict with:
            - "python_version": "major.minor.micro" of the running interpreter
            - "platform": "windows" when psutil reports Windows, else "unix"
            - "containerized": "docker" when /.dockerenv exists, else "false"
        If gathering the information fails, a dict with a single "error"
        key holding the error text is returned instead.
    """
    try:
        version = sys.version_info
        return {
            "python_version": f"{version.major}.{version.minor}.{version.micro}",
            # psutil.WINDOWS is a boolean flag, so it must be mapped to a
            # label; returning the flag itself would put True in this field.
            "platform": "windows" if psutil.WINDOWS else "unix",
            "containerized": "docker" if os.path.exists("/.dockerenv") else "false",
        }
    except Exception as e:
        logger.warning(f"Failed to get system info: {e}")
        return {"error": str(e)}
384
+
385
+
386
async def _check_memory() -> ComponentStatus:
    """
    Check system memory usage against settings.MEMORY_THRESHOLD_PERCENT.

    Usage at or above the threshold is UNHEALTHY, at or above 80% of the
    threshold is WARNING, anything lower is HEALTHY. A failure to read the
    memory figures is reported as UNHEALTHY.
    """
    try:
        # psutil is called in a worker thread so the event loop is not blocked.
        snapshot = await asyncio.to_thread(psutil.virtual_memory)
        used_percent = snapshot.percent
        threshold = settings.MEMORY_THRESHOLD_PERCENT

        status = ComponentStatusType.HEALTHY
        if used_percent >= threshold:
            status = ComponentStatusType.UNHEALTHY
        elif used_percent >= threshold * 0.8:
            status = ComponentStatusType.WARNING

        gib = 1024**3
        details = {
            "percent_used": used_percent,
            "total_gb": round(snapshot.total / gib, 2),
            "available_gb": round(snapshot.available / gib, 2),
            "threshold_percent": settings.MEMORY_THRESHOLD_PERCENT,
        }
        return ComponentStatus(
            name="memory",
            status=status,
            message=f"Memory usage: {used_percent:.1f}%",
            response_time_ms=None,
            metadata=details,
        )
    except Exception as e:
        return ComponentStatus(
            name="memory",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Failed to check memory: {e}",
            response_time_ms=None,
        )
420
+
421
+
422
async def _check_disk_space() -> ComponentStatus:
    """
    Check disk usage of "/" against settings.DISK_THRESHOLD_PERCENT.

    Usage at or above the threshold is UNHEALTHY, at or above 80% of the
    threshold is WARNING, anything lower is HEALTHY. A failure to read the
    disk figures is reported as UNHEALTHY.
    """
    try:
        # psutil is called in a worker thread so the event loop is not blocked.
        usage = await asyncio.to_thread(psutil.disk_usage, "/")
        used_percent = (usage.used / usage.total) * 100
        threshold = settings.DISK_THRESHOLD_PERCENT

        status = ComponentStatusType.HEALTHY
        if used_percent >= threshold:
            status = ComponentStatusType.UNHEALTHY
        elif used_percent >= threshold * 0.8:
            status = ComponentStatusType.WARNING

        gib = 1024**3
        details = {
            "percent_used": used_percent,
            "total_gb": round(usage.total / gib, 2),
            "free_gb": round(usage.free / gib, 2),
            "threshold_percent": settings.DISK_THRESHOLD_PERCENT,
        }
        return ComponentStatus(
            name="disk",
            status=status,
            message=f"Disk usage: {used_percent:.1f}%",
            response_time_ms=None,
            metadata=details,
        )
    except Exception as e:
        return ComponentStatus(
            name="disk",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Failed to check disk space: {e}",
            response_time_ms=None,
        )
456
+
457
+
458
async def _check_cpu_usage() -> ComponentStatus:
    """
    Check CPU usage (instant sampling) against settings.CPU_THRESHOLD_PERCENT.

    Usage at or above the threshold is UNHEALTHY, at or above 80% of the
    threshold is WARNING, anything lower is HEALTHY. A failure to read the
    CPU figures is reported as UNHEALTHY.
    """
    try:
        # None is passed so psutil returns an immediate reading instead of
        # sampling over an interval.
        used_percent = await asyncio.to_thread(psutil.cpu_percent, None)
        threshold = settings.CPU_THRESHOLD_PERCENT

        status = ComponentStatusType.HEALTHY
        if used_percent >= threshold:
            status = ComponentStatusType.UNHEALTHY
        elif used_percent >= threshold * 0.8:
            status = ComponentStatusType.WARNING

        details = {
            "percent_used": used_percent,
            "cpu_count": psutil.cpu_count(),
            "threshold_percent": settings.CPU_THRESHOLD_PERCENT,
        }
        return ComponentStatus(
            name="cpu",
            status=status,
            message=f"CPU usage: {used_percent:.1f}%",
            response_time_ms=None,
            metadata=details,
        )
    except Exception as e:
        return ComponentStatus(
            name="cpu",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Failed to check CPU usage: {e}",
            response_time_ms=None,
        )
490
+
491
+
492
+ {% if cookiecutter.include_redis == "yes" %}
493
async def check_cache_health() -> ComponentStatus:
    """
    Check cache connectivity and basic functionality.

    Connects to Redis using settings.REDIS_URL / settings.REDIS_DB with
    settings.HEALTH_CHECK_TIMEOUT_SECONDS as socket timeouts, then pings the
    server, writes and reads back a short-lived test key, deletes it and
    collects server INFO for metadata. The same client is used for every
    step and is always closed, including when a step fails.

    Returns:
        ComponentStatus indicating cache health. HEALTHY when every step
        succeeds; UNHEALTHY when the redis library cannot be imported or
        any step raises.
    """
    try:
        import redis.asyncio as aioredis

        # Create Redis connection with timeout
        redis_connection = aioredis.from_url(  # type: ignore[no-untyped-call]
            settings.REDIS_URL,
            db=settings.REDIS_DB,
            socket_timeout=settings.HEALTH_CHECK_TIMEOUT_SECONDS,
            socket_connect_timeout=settings.HEALTH_CHECK_TIMEOUT_SECONDS,
        )
        redis_client: aioredis.Redis = cast(aioredis.Redis, redis_connection)

        try:
            start_time = datetime.now(UTC)

            # Test basic connectivity with ping
            await redis_client.ping()

            # Test basic set/get functionality
            test_key = "health_check:test"
            test_value = f"test_{start_time.timestamp()}"
            await redis_client.set(test_key, test_value, ex=10)  # Expire in 10 seconds
            retrieved_value = await redis_client.get(test_key)

            # Cleanup test key
            await redis_client.delete(test_key)

            # The value comes back as bytes unless the client is configured
            # to decode responses, and as None if the key is missing; handle
            # all three so a mismatch is reported as such.
            if isinstance(retrieved_value, bytes):
                retrieved_value = retrieved_value.decode()
            if retrieved_value != test_value:
                raise Exception("Redis set/get test failed")

            # Get Redis info for metadata over the same, timeout-protected
            # connection rather than opening a second one without timeouts.
            info = await redis_client.info()
        finally:
            # Always release the connection, even when a step above failed.
            await redis_client.aclose()

        return ComponentStatus(
            name="cache",
            status=ComponentStatusType.HEALTHY,
            message="Redis cache connection and operations successful",
            response_time_ms=None,  # Will be set by caller
            metadata={
                "implementation": "redis",
                "version": info.get("redis_version", "unknown"),
                "connected_clients": info.get("connected_clients", 0),
                "used_memory_human": info.get("used_memory_human", "unknown"),
                "uptime_in_seconds": info.get("uptime_in_seconds", 0),
                # NOTE(review): REDIS_URL may embed credentials; confirm it
                # is safe to expose in health metadata.
                "url": settings.REDIS_URL,
                "db": settings.REDIS_DB,
            },
        )

    except ImportError:
        return ComponentStatus(
            name="cache",
            status=ComponentStatusType.UNHEALTHY,
            message="Cache library not installed",
            response_time_ms=None,
            metadata={
                "implementation": "redis",
                "error": "Redis library not available",
            },
        )
    except Exception as e:
        return ComponentStatus(
            name="cache",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Cache health check failed: {str(e)}",
            response_time_ms=None,
            metadata={
                "implementation": "redis",
                "url": settings.REDIS_URL,
                "db": settings.REDIS_DB,
                "error": str(e),
            },
        )
579
+ {% endif %}
580
+
581
+
582
+ {% if cookiecutter.include_database == "yes" %}
583
async def check_database_health() -> ComponentStatus:
    """
    Check database connectivity and basic functionality.

    For SQLite URLs (``sqlite:///...``) the database file is looked up on disk
    before a connection is attempted, and extra metadata (SQLite version, file
    size, pool size, PRAGMA settings) is collected on a best-effort basis:
    failures while collecting it are logged at debug level and never change
    the reported status.

    Returns:
        ComponentStatus indicating database health:
        - WARNING if the SQLite file is missing or cannot be opened
        - HEALTHY if ``SELECT 1`` succeeds
        - UNHEALTHY if the database module cannot be imported or the
          connection fails for any other reason
    """
    sqlite_prefix = "sqlite:///"

    try:
        # Imported locally so this block does not rely on module-level imports.
        import sqlite3
        from pathlib import Path

        # Import db_session from generated project
        from app.core.db import db_session

        db_url = settings.DATABASE_URL
        is_sqlite = db_url.startswith(sqlite_prefix)

        # Resolve the on-disk file once. Only the URL prefix is removed:
        # str.lstrip("./") would strip a *set of characters* and turn
        # "/var/db/app.db" or "../data/app.db" into a different path.
        # Path() already normalises a leading "./".
        db_file = None
        if is_sqlite:
            raw_path = db_url[len(sqlite_prefix):]
            # An in-memory database has no file to look for.
            if raw_path != ":memory:":
                db_file = Path(raw_path)

        # Check if database file exists
        if db_file is not None and not db_file.exists():
            return ComponentStatus(
                name="database",
                status=ComponentStatusType.WARNING,
                message="Database not initialized - file does not exist",
                response_time_ms=None,
                metadata={
                    "implementation": "sqlite",
                    "database_exists": False,
                    "expected_path": str(db_file),
                    "url": settings.DATABASE_URL,
                    "recommendation": (
                        "Run database migrations or create database file"
                    ),
                },
            )

        # Test database connection with simple query and collect enhanced metadata
        enhanced_metadata = {
            "implementation": "sqlite",
            "url": settings.DATABASE_URL,
            "database_exists": True,
            "engine_echo": settings.DATABASE_ENGINE_ECHO,
        }

        # Collect additional metadata for SQLite databases
        if is_sqlite:
            try:
                # Add SQLite version
                enhanced_metadata["version"] = sqlite3.sqlite_version

                # Add file size information (re-checked: the file may have
                # disappeared since the existence test above)
                if db_file is not None and db_file.exists():
                    file_size = db_file.stat().st_size
                    enhanced_metadata["file_size_bytes"] = file_size
                    enhanced_metadata["file_size_human"] = format_bytes(file_size)

                # Get engine and connection pool information
                from app.core.db import engine

                if hasattr(engine.pool, "size"):
                    enhanced_metadata["connection_pool_size"] = engine.pool.size()
                else:
                    # SQLite typically uses NullPool or StaticPool with size 1
                    enhanced_metadata["connection_pool_size"] = 1

            except Exception:
                # If any enhanced metadata collection fails, log but don't break
                # health check
                logger.debug(
                    "Failed to collect enhanced database metadata", exc_info=True
                )

        # Test database connection and collect PRAGMA settings
        with db_session(autocommit=False) as session:
            # Execute a simple query to test connectivity
            from sqlalchemy import text

            session.execute(text("SELECT 1"))

            # Collect SQLite PRAGMA settings for additional metadata
            if is_sqlite:
                try:
                    pragma_settings = {}

                    # Get foreign keys setting
                    result = session.execute(text("PRAGMA foreign_keys")).fetchone()
                    if result:
                        pragma_settings["foreign_keys"] = bool(result[0])

                    # Get journal mode
                    result = session.execute(text("PRAGMA journal_mode")).fetchone()
                    if result:
                        journal_mode = result[0].lower()
                        pragma_settings["journal_mode"] = journal_mode
                        enhanced_metadata["wal_enabled"] = journal_mode == "wal"

                    # Add cache size if available
                    result = session.execute(text("PRAGMA cache_size")).fetchone()
                    if result:
                        pragma_settings["cache_size"] = result[0]

                    enhanced_metadata["pragma_settings"] = pragma_settings

                except Exception:
                    # PRAGMA queries can fail in some SQLite configurations
                    logger.debug(
                        "Failed to collect SQLite PRAGMA settings", exc_info=True
                    )

            # No need to commit since we're just testing connectivity

        return ComponentStatus(
            name="database",
            status=ComponentStatusType.HEALTHY,
            message="Database connection successful",
            response_time_ms=None,  # Will be set by caller
            metadata=enhanced_metadata,
        )

    except ImportError:
        return ComponentStatus(
            name="database",
            status=ComponentStatusType.UNHEALTHY,
            message="Database module not available",
            response_time_ms=None,
            metadata={
                "implementation": "sqlite",
                "error": "Database module not imported or configured",
            },
        )
    except Exception as e:
        # Check if it's a file not found error
        error_str = str(e).lower()
        if "unable to open database file" in error_str or "no such file" in error_str:
            return ComponentStatus(
                name="database",
                status=ComponentStatusType.WARNING,
                message="Database file not accessible",
                response_time_ms=None,
                metadata={
                    "implementation": "sqlite",
                    "url": settings.DATABASE_URL,
                    "error": str(e),
                    "recommendation": "Check database file path and permissions",
                },
            )

        return ComponentStatus(
            name="database",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Database connection failed: {str(e)}",
            response_time_ms=None,
            metadata={
                "implementation": "sqlite",
                "url": settings.DATABASE_URL,
                "error": str(e),
            },
        )
740
+ {% endif %}
741
+
742
+
743
+ {% if cookiecutter.include_worker == "yes" %}
744
async def check_worker_health() -> ComponentStatus:
    """
    Check arq worker status using arq's native health checks and queue configuration.

    For every configured queue this reads the number of pending jobs and the
    "<queue_name>:health-check" key from Redis. A queue whose health-check
    key is missing is reported as offline (or INFO if it has no functions
    configured); otherwise its status is derived from the failure rate parsed
    out of the health-check string (> 25% UNHEALTHY, > 10% WARNING).

    Returns:
        ComponentStatus indicating worker infrastructure health with queue
        sub-components
    """
    try:
        import inspect
        import re

        import redis.asyncio as aioredis

        # Get queue metadata from WorkerSettings classes via dynamic discovery.
        # Imported before the client is created so an ImportError cannot leave
        # an open Redis client behind.
        from app.components.worker.registry import get_all_queue_metadata

        # Create Redis connection
        # Step 1: Call untyped function with explicit ignore
        redis_connection = aioredis.from_url(  # type: ignore[no-untyped-call]
            settings.REDIS_URL,
            db=settings.REDIS_DB,
        )
        # Step 2: Cast the result to proper type
        redis_client: aioredis.Redis = cast(aioredis.Redis, redis_connection)

        # Check each queue and create sub-components
        queue_sub_components = {}
        total_queued = 0
        total_completed = 0
        total_failed = 0
        total_retried = 0
        total_ongoing = 0
        overall_healthy = True
        active_workers = 0

        try:
            functional_queues = get_all_queue_metadata()

            for queue_type, queue_config in functional_queues.items():
                queue_name = queue_config["queue_name"]

                try:
                    # Get queue length (actual queued jobs). arq keeps pending
                    # jobs in a sorted set, so the size is ZCARD; LLEN would
                    # fail with WRONGTYPE as soon as the queue is non-empty.
                    queue_length_result = redis_client.zcard(queue_name)
                    if inspect.isawaitable(queue_length_result):
                        queue_length = await queue_length_result
                    else:
                        queue_length = queue_length_result
                    total_queued += queue_length

                    # Look for arq health check key for this queue
                    # arq health check key format: {queue_name}:health-check
                    health_check_key = f"{queue_name}:health-check"
                    health_check_data = await redis_client.get(health_check_key)

                    # Parse arq health check data if available
                    j_complete = j_failed = j_retried = j_ongoing = 0
                    worker_alive = False
                    last_health_check = None

                    if health_check_data:
                        health_string = health_check_data.decode()
                        # Parse format: "Mar-01 17:41:22 j_complete=0 j_failed=0 ..."
                        logger.debug(
                            f"Raw health check data for {queue_type}: {health_string}"
                        )

                        # Extract timestamp (first part before job stats)
                        timestamp_match = re.match(
                            r"^(\w+-\d+ \d+:\d+:\d+)", health_string
                        )
                        if timestamp_match:
                            last_health_check = timestamp_match.group(1)

                        # Extract job statistics using regex
                        j_complete_match = re.search(r"j_complete=(\d+)", health_string)
                        j_failed_match = re.search(r"j_failed=(\d+)", health_string)
                        j_retried_match = re.search(r"j_retried=(\d+)", health_string)
                        j_ongoing_match = re.search(r"j_ongoing=(\d+)", health_string)

                        if j_complete_match:
                            j_complete = int(j_complete_match.group(1))
                            total_completed += j_complete
                        if j_failed_match:
                            j_failed = int(j_failed_match.group(1))
                            total_failed += j_failed
                        if j_retried_match:
                            j_retried = int(j_retried_match.group(1))
                            total_retried += j_retried
                        if j_ongoing_match:
                            j_ongoing = int(j_ongoing_match.group(1))
                            total_ongoing += j_ongoing

                        worker_alive = True
                        active_workers += 1

                    # Failure rate drives both the status text and the queue
                    # status; an offline worker counts as 100% failing.
                    failure_rate = (
                        (j_failed / max(j_complete + j_failed, 1)) * 100
                        if worker_alive
                        else 100
                    )

                    # Create queue status message
                    status_parts = []
                    if not worker_alive:
                        status_parts.append("worker offline - no health check data")
                    elif j_ongoing > 0:
                        status_parts.append(f"{j_ongoing} processing")
                    elif queue_length > 0:
                        status_parts.append(f"{queue_length} queued")
                    else:
                        status_parts.append("idle")

                    # Add job statistics to status if worker is alive
                    if worker_alive:
                        if j_failed > 0:
                            status_parts.append(
                                f"{j_failed} failed ({failure_rate:.1f}%)"
                            )
                        if j_complete > 0:
                            status_parts.append(f"{j_complete} completed")

                    # Check if queue has no functions configured (empty functions list)
                    has_functions = len(queue_config.get("functions", [])) > 0

                    # Determine queue status based on worker health and failure rate
                    if not worker_alive and not has_functions:
                        # Queue configured but no functions - show as INFO
                        queue_status = ComponentStatusType.INFO
                        status_parts = ["configured - no functions defined"]
                    elif not worker_alive:
                        queue_status = ComponentStatusType.UNHEALTHY
                    elif failure_rate > 25:  # Unhealthy threshold at 25%
                        queue_status = ComponentStatusType.UNHEALTHY
                    elif failure_rate > 10:  # Warning threshold at 10%
                        queue_status = ComponentStatusType.WARNING
                    else:
                        queue_status = ComponentStatusType.HEALTHY

                    queue_message = (
                        f"{queue_config['description']}: {', '.join(status_parts)}"
                    )

                    # Update overall health based on this queue
                    if queue_status == ComponentStatusType.UNHEALTHY:
                        overall_healthy = False

                    queue_metadata = {
                        "queue_type": queue_type,
                        "queue_name": queue_name,
                        "queued_jobs": queue_length,
                        "max_concurrency": queue_config["max_jobs"],
                        "timeout_seconds": queue_config["timeout"],
                        "description": queue_config["description"],
                        "worker_alive": worker_alive,
                        "health_check_key": health_check_key,
                    }

                    # Add arq health check statistics if available
                    if worker_alive:
                        queue_metadata.update(
                            {
                                "jobs_completed": j_complete,
                                "jobs_failed": j_failed,
                                "jobs_retried": j_retried,
                                "jobs_ongoing": j_ongoing,
                                "failure_rate_percent": round(failure_rate, 1),
                                "last_health_check": last_health_check,
                            }
                        )
                    else:
                        queue_metadata["offline_reason"] = "Health check key not found"

                    queue_sub_components[queue_type] = ComponentStatus(
                        name=queue_type,
                        status=queue_status,
                        message=queue_message,
                        response_time_ms=None,
                        metadata=queue_metadata,
                        sub_components={},
                    )

                except aioredis.ConnectionError as e:
                    logger.error(f"Redis connection failed for {queue_type}: {e}")
                    overall_healthy = False

                    # Extract more specific connection error details
                    error_details = str(e).lower()
                    if "connection refused" in error_details:
                        connection_issue = "Redis server not running"
                    elif (
                        "name or service not known" in error_details
                        or "nodename nor servname" in error_details
                    ):
                        connection_issue = "Redis server DNS resolution failed"
                    elif "timeout" in error_details:
                        connection_issue = "Redis server connection timeout"
                    else:
                        connection_issue = "Redis server unreachable"

                    queue_sub_components[queue_type] = ComponentStatus(
                        name=queue_type,
                        status=ComponentStatusType.UNHEALTHY,
                        message=f"{connection_issue} - worker offline",
                        response_time_ms=None,
                        metadata={
                            "queue_type": queue_type,
                            "queue_name": queue_name,
                            "error_type": "redis_connection_error",
                            "error": str(e),
                            "connection_issue": connection_issue,
                            "recommendation": (
                                "Check Redis server status and network connectivity"
                            ),
                        },
                        sub_components={},
                    )
                except aioredis.ResponseError as e:
                    if "WRONGTYPE" in str(e):
                        logger.error(f"Redis data corruption for {queue_type}: {e}")
                        message = "Redis data corruption detected"
                        recommendation = "Clear Redis cache to fix data type conflicts"
                        error_type = "redis_key_type_error"
                    else:
                        logger.error(f"Redis operation failed for {queue_type}: {e}")
                        message = "Redis operation failed"
                        recommendation = "Check Redis configuration and permissions"
                        error_type = "redis_response_error"

                    overall_healthy = False
                    queue_sub_components[queue_type] = ComponentStatus(
                        name=queue_type,
                        status=ComponentStatusType.UNHEALTHY,
                        message=message,
                        response_time_ms=None,
                        metadata={
                            "queue_type": queue_type,
                            "queue_name": queue_name,
                            "error_type": error_type,
                            "error": str(e),
                            "recommendation": recommendation,
                        },
                        sub_components={},
                    )
                except Exception as e:
                    logger.error(
                        f"Unexpected error checking {queue_type} queue health: {e}"
                    )
                    overall_healthy = False
                    queue_sub_components[queue_type] = ComponentStatus(
                        name=queue_type,
                        status=ComponentStatusType.UNHEALTHY,
                        message=f"Health check failed: {type(e).__name__}",
                        response_time_ms=None,
                        metadata={
                            "queue_type": queue_type,
                            "queue_name": queue_name,
                            "error_type": "unexpected_error",
                            "error": str(e),
                            "exception_class": type(e).__name__,
                        },
                        sub_components={},
                    )
        finally:
            # Always release the client, including when discovery or a queue
            # config lookup raises outside the per-queue handlers above.
            await redis_client.aclose()

        # Create main worker status message
        message_parts = []
        if active_workers == 0:
            message_parts.append("No active workers")
            overall_healthy = False
        else:
            message_parts.append(
                f"{active_workers}/{len(functional_queues)} workers active"
            )

        total_finished = total_completed + total_failed
        overall_failure_rate = (total_failed / max(total_finished, 1)) * 100

        if total_queued > 0:
            message_parts.append(f"{total_queued} queued")
        if total_ongoing > 0:
            message_parts.append(f"{total_ongoing} processing")
        if total_failed > 0:
            message_parts.append(
                f"{total_failed} failed ({overall_failure_rate:.1f}%)"
            )

        main_message = f"arq worker infrastructure: {', '.join(message_parts)}"

        # Create a "queues" intermediate component that contains all queue
        # sub-components - determine status from child statuses
        queue_statuses = [queue.status for queue in queue_sub_components.values()]
        queues_status = propagate_status(queue_statuses)

        queues_message = f"{len(functional_queues)} functional queues configured"
        if active_workers < len(functional_queues):
            queues_message += f" ({active_workers} active)"

        queues_component = ComponentStatus(
            name="queues",
            status=queues_status,
            message=queues_message,
            response_time_ms=None,
            metadata={
                "configured_queues": len(functional_queues),
                "active_workers": active_workers,
                "queue_types": list(functional_queues.keys()),
            },
            sub_components=queue_sub_components,
        )

        # Determine worker status based on overall health and queues status
        if not overall_healthy:
            worker_status = ComponentStatusType.UNHEALTHY
        else:
            worker_status = propagate_status([queues_status])

        return ComponentStatus(
            name="worker",
            status=worker_status,
            message=main_message,
            response_time_ms=None,
            metadata={
                "total_queued": total_queued,
                "total_completed": total_completed,
                "total_failed": total_failed,
                "total_retried": total_retried,
                "total_ongoing": total_ongoing,
                "overall_failure_rate_percent": (
                    round(overall_failure_rate, 1) if total_finished > 0 else 0
                ),
                # NOTE(review): REDIS_URL may embed credentials - confirm this
                # metadata is never exposed to untrusted clients.
                "redis_url": settings.REDIS_URL,
                "queue_configuration": {
                    queue_type: {
                        "description": config["description"],
                        "max_jobs": config["max_jobs"],
                        "timeout_seconds": config["timeout"],
                    }
                    for queue_type, config in functional_queues.items()
                },
            },
            sub_components={"queues": queues_component},
        )

    except ImportError:
        return ComponentStatus(
            name="worker",
            status=ComponentStatusType.UNHEALTHY,
            message="Redis library not available for worker health check",
            response_time_ms=None,
            sub_components={},
        )
    except Exception as e:
        logger.error(f"Worker health check failed: {e}")
        return ComponentStatus(
            name="worker",
            status=ComponentStatusType.UNHEALTHY,
            message=f"Worker health check failed: {str(e)}",
            response_time_ms=None,
            metadata={
                "error": str(e),
                "redis_url": settings.REDIS_URL,
            },
            sub_components={},
        )
1105
+ {% endif %}