procler 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- procler/__init__.py +3 -0
- procler/__main__.py +6 -0
- procler/api/__init__.py +5 -0
- procler/api/app.py +261 -0
- procler/api/deps.py +21 -0
- procler/api/routes/__init__.py +5 -0
- procler/api/routes/config.py +290 -0
- procler/api/routes/groups.py +62 -0
- procler/api/routes/logs.py +43 -0
- procler/api/routes/processes.py +185 -0
- procler/api/routes/recipes.py +69 -0
- procler/api/routes/snippets.py +134 -0
- procler/api/routes/ws.py +459 -0
- procler/cli.py +1478 -0
- procler/config/__init__.py +65 -0
- procler/config/changelog.py +148 -0
- procler/config/loader.py +256 -0
- procler/config/schema.py +315 -0
- procler/core/__init__.py +54 -0
- procler/core/context_base.py +117 -0
- procler/core/context_docker.py +384 -0
- procler/core/context_local.py +287 -0
- procler/core/daemon_detector.py +325 -0
- procler/core/events.py +74 -0
- procler/core/groups.py +419 -0
- procler/core/health.py +280 -0
- procler/core/log_tailer.py +262 -0
- procler/core/process_manager.py +1277 -0
- procler/core/recipes.py +330 -0
- procler/core/snippets.py +231 -0
- procler/core/variable_substitution.py +65 -0
- procler/db.py +96 -0
- procler/logging.py +41 -0
- procler/models.py +130 -0
- procler/py.typed +0 -0
- procler/settings.py +29 -0
- procler/static/assets/AboutView-BwZnsfpW.js +4 -0
- procler/static/assets/AboutView-UHbxWXcS.css +1 -0
- procler/static/assets/Code-HTS-H1S6.js +74 -0
- procler/static/assets/ConfigView-CGJcmp9G.css +1 -0
- procler/static/assets/ConfigView-aVtbRDf8.js +1 -0
- procler/static/assets/DashboardView-C5jw9Nsd.css +1 -0
- procler/static/assets/DashboardView-Dab7Cu9v.js +1 -0
- procler/static/assets/DataTable-z39TOAa4.js +746 -0
- procler/static/assets/DescriptionsItem-B2E8YbqJ.js +74 -0
- procler/static/assets/Divider-Dk-6aD2Y.js +42 -0
- procler/static/assets/Empty-MuygEHZM.js +24 -0
- procler/static/assets/Grid-CZ9QVKAT.js +1 -0
- procler/static/assets/GroupsView-BALG7i1X.js +1 -0
- procler/static/assets/GroupsView-gXAI1CVC.css +1 -0
- procler/static/assets/Input-e0xaxoWE.js +259 -0
- procler/static/assets/PhArrowsClockwise.vue-DqDg31az.js +1 -0
- procler/static/assets/PhCheckCircle.vue-Fwj9sh9m.js +1 -0
- procler/static/assets/PhEye.vue-JcPHciC2.js +1 -0
- procler/static/assets/PhPlay.vue-CZm7Gy3u.js +1 -0
- procler/static/assets/PhPlus.vue-yTWqKlSh.js +1 -0
- procler/static/assets/PhStop.vue-DxsqwIki.js +1 -0
- procler/static/assets/PhTrash.vue-DcqQbN1_.js +125 -0
- procler/static/assets/PhXCircle.vue-BXWmrabV.js +1 -0
- procler/static/assets/ProcessDetailView-DDbtIWq9.css +1 -0
- procler/static/assets/ProcessDetailView-DPtdNV-q.js +1 -0
- procler/static/assets/ProcessesView-B3a6Umur.js +1 -0
- procler/static/assets/ProcessesView-goLmghbJ.css +1 -0
- procler/static/assets/RecipesView-D2VxdneD.js +166 -0
- procler/static/assets/RecipesView-DXnFDCK4.css +1 -0
- procler/static/assets/Select-BBR17AHq.js +317 -0
- procler/static/assets/SnippetsView-B3a9q3AI.css +1 -0
- procler/static/assets/SnippetsView-DBCB2yGq.js +1 -0
- procler/static/assets/Spin-BXTjvFUk.js +90 -0
- procler/static/assets/Tag-Bh_qV63A.js +71 -0
- procler/static/assets/changelog-KkTT4H9-.js +1 -0
- procler/static/assets/groups-Zu-_v8ey.js +1 -0
- procler/static/assets/index-BsN-YMXq.css +1 -0
- procler/static/assets/index-BzW1XhyH.js +1282 -0
- procler/static/assets/procler-DOrSB1Vj.js +1 -0
- procler/static/assets/recipes-1w5SseGb.js +1 -0
- procler/static/index.html +17 -0
- procler/static/procler.png +0 -0
- procler-0.2.0.dist-info/METADATA +545 -0
- procler-0.2.0.dist-info/RECORD +83 -0
- procler-0.2.0.dist-info/WHEEL +4 -0
- procler-0.2.0.dist-info/entry_points.txt +2 -0
- procler-0.2.0.dist-info/licenses/LICENSE +21 -0
procler/core/groups.py
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
"""Process group management."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from ..config import DependencyCondition, get_config
|
|
9
|
+
from .health import get_health_checker
|
|
10
|
+
from .process_manager import get_process_manager
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class GroupManager:
    """Manages process groups with ordered operations and dependency support.

    Public methods return a result dict with a ``"success"`` key and either a
    ``"data"`` payload or ``"error"`` details.
    """

    def __init__(self):
        self._process_manager = get_process_manager()
        self._health_checker = get_health_checker()
        # Strong references to fire-and-forget tasks. The event loop only keeps
        # weak references to tasks, so an otherwise unreferenced task may be
        # garbage-collected before it completes.
        self._background_tasks: set[asyncio.Task] = set()

    # ------------------------------------------------------------------
    # Result builders (shared by the public methods below)
    # ------------------------------------------------------------------

    @staticmethod
    def _group_not_found(name: str) -> dict[str, Any]:
        """Build the error result returned when *name* is not a configured group."""
        return {
            "success": False,
            "error": f"Group '{name}' not found",
            "error_code": "group_not_found",
            "suggestion": "Run 'procler group list' to see available groups",
        }

    @staticmethod
    def _process_not_in_config(proc_name: str) -> dict[str, Any]:
        """Build the per-process failure entry for a group member missing from config."""
        return {
            "process": proc_name,
            "success": False,
            "error": f"Process '{proc_name}' not defined in config",
            "error_code": "process_not_in_config",
            "suggestion": "Add process definition to .procler/config.yaml",
        }

    @staticmethod
    def _describe_group(name: str, group) -> dict[str, Any]:
        """Serialize one group definition for list/get responses."""
        return {
            "name": name,
            "description": group.description,
            "processes": group.processes,
            "stop_order": group.get_stop_order(),
        }

    @staticmethod
    def _step_result(proc_name: str, result: dict[str, Any]) -> dict[str, Any]:
        """Condense a process-manager start/stop result into a per-process entry."""
        return {
            "process": proc_name,
            "success": result["success"],
            # ``or {}`` also covers a "data" key that is present but None.
            "status": (result.get("data") or {}).get("status"),
            "error": result.get("error"),
        }

    def _spawn_background(self, coro) -> None:
        """Schedule *coro* as a task and keep it referenced until it finishes."""
        task = asyncio.create_task(coro)
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def list_groups(self) -> dict[str, Any]:
        """List all defined groups."""
        config = get_config()
        groups_data = [self._describe_group(name, group) for name, group in config.groups.items()]

        return {
            "success": True,
            "data": {
                "groups": groups_data,
                "count": len(groups_data),
            },
        }

    def get_group(self, name: str) -> dict[str, Any]:
        """Get a specific group by name."""
        config = get_config()

        if name not in config.groups:
            return self._group_not_found(name)

        return {
            "success": True,
            "data": {
                "group": self._describe_group(name, config.groups[name]),
            },
        }

    # ------------------------------------------------------------------
    # Start / stop / status
    # ------------------------------------------------------------------

    async def start_group(
        self,
        name: str,
        respect_dependencies: bool = True,
        dependency_timeout: float = 60.0,
    ) -> dict[str, Any]:
        """
        Start all processes in a group in order.

        Processes are handled one at a time in the order listed by the group.
        A failure for one process is recorded in the results and the remaining
        processes are still attempted.

        Args:
            name: Group name
            respect_dependencies: If True, wait for process dependencies before starting
            dependency_timeout: Max seconds to wait for each dependency

        Returns:
            Result dict whose ``"success"`` is true only when every process
            started; ``data.results`` holds one entry per process. An unknown
            group yields a ``group_not_found`` error result.
        """
        config = get_config()

        if name not in config.groups:
            return self._group_not_found(name)

        group = config.groups[name]
        results: list[dict[str, Any]] = []

        for proc_name in group.processes:
            # Check if process is defined in config
            if proc_name not in config.processes:
                results.append(self._process_not_in_config(proc_name))
                continue

            proc_def = config.processes[proc_name]

            # Check dependencies before starting
            if respect_dependencies and proc_def.depends_on:
                dep_result = await self._wait_for_dependencies(proc_name, proc_def, config, dependency_timeout)
                if not dep_result["success"]:
                    results.append(
                        {
                            "process": proc_name,
                            "success": False,
                            "error": dep_result["error"],
                            "dependency_failures": dep_result.get("failures", []),
                        }
                    )
                    continue

            # Ensure process exists in runtime DB
            await self._ensure_process_in_db(proc_name, proc_def)

            # Start the process
            result = await self._process_manager.start(proc_name)

            # Start health checking if configured. This is scheduled in the
            # background regardless of the start result, as before.
            if proc_def.healthcheck:
                self._health_checker.register_process(proc_name, proc_def.healthcheck)
                self._spawn_background(self._health_checker.start_checking(proc_name, proc_def.healthcheck))

            results.append(self._step_result(proc_name, result))

        return {
            "success": all(entry["success"] for entry in results),
            "data": {
                "group": name,
                "action": "started",
                "results": results,
            },
        }

    async def _wait_for_dependencies(
        self,
        proc_name: str,
        proc_def,
        config,
        timeout: float,
    ) -> dict[str, Any]:
        """Check that all dependencies of a process are ready.

        Each dependency must be defined in config and currently report status
        ``"running"``; that status is checked once, not polled. Only
        dependencies with the HEALTHY condition are actually waited on, for up
        to *timeout* seconds each.

        Returns:
            ``{"success": True}`` when every dependency is satisfied, otherwise
            a failure dict with an ``"error"`` message and a ``"failures"``
            list containing one entry per unsatisfied dependency.
        """
        failures: list[dict[str, Any]] = []

        for dep in proc_def.get_dependencies():
            dep_name = dep.name

            # Check if dependency exists
            if dep_name not in config.processes:
                failures.append(
                    {
                        "dependency": dep_name,
                        "error": "Dependency not defined in config",
                    }
                )
                continue

            dep_def = config.processes[dep_name]

            # Check if dependency is running
            status_result = await self._process_manager.status(dep_name)
            if not status_result["success"]:
                failures.append(
                    {
                        "dependency": dep_name,
                        "error": f"Could not get status: {status_result.get('error')}",
                    }
                )
                continue

            # ``or {}`` also covers keys that are present but None.
            proc_info = (status_result.get("data") or {}).get("process") or {}
            proc_status = proc_info.get("status")

            if proc_status != "running":
                failures.append(
                    {
                        "dependency": dep_name,
                        "error": f"Dependency not running (status: {proc_status})",
                        "condition": dep.condition.value,
                    }
                )
                continue

            # Remaining checks only apply to the 'healthy' condition
            if dep.condition != DependencyCondition.HEALTHY:
                continue

            if not dep_def.healthcheck:
                failures.append(
                    {
                        "dependency": dep_name,
                        "error": "Dependency requires healthy condition but has no healthcheck",
                    }
                )
                continue

            # Wait for healthy status
            is_healthy = await self._health_checker.wait_for_healthy(dep_name, dep_def.healthcheck, timeout)
            if not is_healthy:
                failures.append(
                    {
                        "dependency": dep_name,
                        "error": f"Dependency not healthy after {timeout}s",
                        "condition": "healthy",
                    }
                )

        if failures:
            return {
                "success": False,
                "error": f"Dependencies not satisfied for '{proc_name}'",
                "failures": failures,
            }

        return {"success": True}

    async def stop_group(self, name: str) -> dict[str, Any]:
        """Stop all processes in a group in stop order.

        The order comes from the group's ``get_stop_order()``. For each
        process, health checking is stopped before the process itself.

        Returns:
            Result dict whose ``"success"`` is true only when every process
            stopped; ``data.results`` holds one entry per process. An unknown
            group yields a ``group_not_found`` error result.
        """
        config = get_config()

        if name not in config.groups:
            return self._group_not_found(name)

        group = config.groups[name]
        results: list[dict[str, Any]] = []

        for proc_name in group.get_stop_order():
            # Check if process is defined
            if proc_name not in config.processes:
                results.append(self._process_not_in_config(proc_name))
                continue

            # Stop health checking first
            await self._health_checker.stop_checking(proc_name)

            # Ensure process exists in runtime DB
            await self._ensure_process_in_db(proc_name, config.processes[proc_name])

            # Stop the process
            result = await self._process_manager.stop(proc_name)
            results.append(self._step_result(proc_name, result))

        return {
            "success": all(entry["success"] for entry in results),
            "data": {
                "group": name,
                "action": "stopped",
                "results": results,
            },
        }

    async def status_group(self, name: str) -> dict[str, Any]:
        """Get status of all processes in a group, including health information.

        Returns:
            A successful result with one status entry per group member.
            Members missing from config are reported as ``"not_defined"`` and
            members whose status lookup failed as ``"unknown"``. An unknown
            group yields a ``group_not_found`` error result.
        """
        config = get_config()

        if name not in config.groups:
            return self._group_not_found(name)

        group = config.groups[name]
        statuses: list[dict[str, Any]] = []

        for proc_name in group.processes:
            if proc_name not in config.processes:
                statuses.append(
                    {
                        "process": proc_name,
                        "status": "not_defined",
                        "error": "Not defined in config",
                    }
                )
                continue

            proc_def = config.processes[proc_name]

            # Ensure process exists in runtime DB
            await self._ensure_process_in_db(proc_name, proc_def)

            result = await self._process_manager.status(proc_name)
            if not result["success"]:
                statuses.append(
                    {
                        "process": proc_name,
                        "status": "unknown",
                        "error": result.get("error"),
                    }
                )
                continue

            proc_data = result["data"]["process"]
            status_entry = {
                "process": proc_name,
                "status": proc_data.get("status", "unknown"),
                "pid": proc_data.get("pid"),
                "uptime_seconds": proc_data.get("uptime_seconds"),
                "linux_state": proc_data.get("linux_state"),
            }

            # Include health status if process has health check
            if proc_def.healthcheck:
                status_entry["health"] = self._health_checker.to_dict(proc_name)

            # Include dependency info
            if proc_def.depends_on:
                status_entry["depends_on"] = [
                    {"name": d.name, "condition": d.condition.value} for d in proc_def.get_dependencies()
                ]

            statuses.append(status_entry)

        return {
            "success": True,
            "data": {
                "group": name,
                "description": group.description,
                "statuses": statuses,
            },
        }

    async def _ensure_process_in_db(self, name: str, proc_def) -> None:
        """Ensure a process from config exists in the runtime database.

        Does nothing when a record with this name already exists; otherwise a
        new record is created from the config definition.
        """
        from datetime import datetime

        from sqler.query import SQLerField as F

        from ..db import init_database
        from ..models import Process

        init_database()

        # Check if already exists
        existing = Process.query().filter(F("name") == name).all()
        if existing:
            return

        # One timestamp for both fields so a fresh record has
        # created_at == updated_at.
        now = datetime.now().isoformat()

        # Create from config definition
        process = Process(
            name=name,
            command=proc_def.command,
            context_type=proc_def.context.value,
            container_name=proc_def.container,
            cwd=proc_def.cwd,
            tags=proc_def.tags if proc_def.tags else None,
            created_at=now,
            updated_at=now,
        )
        process.save()
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
# Singleton
|
|
405
|
+
_group_manager: GroupManager | None = None


def get_group_manager() -> GroupManager:
    """Return the shared GroupManager, creating it on first use."""
    global _group_manager
    manager = _group_manager
    if manager is None:
        # First call: build the instance and cache it at module level.
        manager = _group_manager = GroupManager()
    return manager
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def reset_group_manager() -> None:
    """Drop the cached GroupManager singleton (intended for tests)."""
    global _group_manager
    # Clearing the module-level reference makes the next lookup build a new one.
    _group_manager = None
|
procler/core/health.py
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
"""Health check management for processes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from ..config import HealthCheckDef
|
|
13
|
+
from .context_local import get_local_context
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class HealthStatus(str, Enum):
    """Possible health check states of a process.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    # Never checked
    UNKNOWN = "unknown"
    # In start_period grace window
    STARTING = "starting"
    # Passing health checks
    HEALTHY = "healthy"
    # Failed health checks
    UNHEALTHY = "unhealthy"
    # Process not running
    DEAD = "dead"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class HealthState:
    """Mutable record of the latest health information for one process."""

    # Current health status (required).
    status: HealthStatus
    # When the most recent check ran; None until a check has been recorded.
    last_check: datetime | None = None
    # Number of failed checks in a row.
    consecutive_failures: int = 0
    # Error text from the most recent failure, if any.
    last_error: str | None = None
    # Total number of checks recorded.
    check_count: int = 0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class HealthChecker:
    """
    Manages health checks for processes.

    Runs health check commands at intervals and tracks pass/fail state per
    process name. Commands are executed through the context returned by
    ``get_local_context()``. Supports a start_period grace period and a
    configurable number of consecutive failures (``retries``) before a process
    is marked unhealthy.
    """

    def __init__(self):
        self._health_states: dict[str, HealthState] = {}
        self._check_tasks: dict[str, asyncio.Task] = {}
        self._callbacks: dict[str, list[Callable[[str, HealthStatus], None]]] = {}
        self._local_context = get_local_context()

    def get_health(self, process_name: str) -> HealthState:
        """Get current health state for a process.

        For an unregistered process a fresh UNKNOWN state is returned; it is
        not stored.
        """
        return self._health_states.get(process_name, HealthState(status=HealthStatus.UNKNOWN))

    def register_process(
        self,
        process_name: str,
        healthcheck: HealthCheckDef,
        on_status_change: Callable[[str, HealthStatus], None] | None = None,
    ) -> None:
        """
        Register a process for health checking.

        Any previous state for the process is replaced by a fresh STARTING
        state. Callbacks accumulate across repeated registrations.

        Args:
            process_name: Name of the process
            healthcheck: Health check definition from config (not read here)
            on_status_change: Optional callback for status changes
        """
        # Initialize state
        self._health_states[process_name] = HealthState(
            status=HealthStatus.STARTING,
            last_check=None,
            consecutive_failures=0,
        )

        if on_status_change:
            self._callbacks.setdefault(process_name, []).append(on_status_change)

    async def start_checking(
        self,
        process_name: str,
        healthcheck: HealthCheckDef,
    ) -> None:
        """Start the health check loop for a process.

        A loop already running for this process is cancelled and awaited
        before the new one is scheduled.
        """
        # Cancel any existing check task
        existing = self._check_tasks.get(process_name)
        if existing is not None:
            existing.cancel()
            try:
                await existing
            except asyncio.CancelledError:
                pass

        # Start new check task
        self._check_tasks[process_name] = asyncio.create_task(self._health_check_loop(process_name, healthcheck))

    async def stop_checking(self, process_name: str) -> None:
        """Stop health checking for a process and mark its state DEAD."""
        # Remove the entry before awaiting so that a task registered for the
        # same name while we wait is not deleted by mistake.
        task = self._check_tasks.pop(process_name, None)
        if task is not None:
            task.cancel()
            try:
                await task
            except asyncio.CancelledError:
                pass

        if process_name in self._health_states:
            self._health_states[process_name].status = HealthStatus.DEAD

    async def _health_check_loop(
        self,
        process_name: str,
        healthcheck: HealthCheckDef,
    ) -> None:
        """Main health check loop for a process.

        Sleeps for the start period, then runs the check command every
        interval until cancelled or until the process has no registered state.
        """
        start_period = healthcheck.get_start_period_seconds()
        interval = healthcheck.get_interval_seconds()
        timeout = healthcheck.get_timeout_seconds()
        retries = healthcheck.retries

        # Wait for start period
        if start_period > 0:
            await asyncio.sleep(start_period)

        # Update status to unknown (ready to check)
        if process_name in self._health_states:
            self._health_states[process_name].status = HealthStatus.UNKNOWN

        while True:
            try:
                # Run the health check command
                result = await self._local_context.exec_command(
                    healthcheck.test,
                    timeout=timeout,
                )

                state = self._health_states.get(process_name)
                if state is None:
                    # State was removed (e.g. by reset()); nothing left to update.
                    break

                state.last_check = datetime.now()
                state.check_count += 1
                old_status = state.status

                if result.exit_code == 0:
                    # Health check passed
                    state.consecutive_failures = 0
                    state.last_error = None
                    state.status = HealthStatus.HEALTHY
                else:
                    # Health check failed
                    state.consecutive_failures += 1
                    state.last_error = result.stderr or f"Exit code: {result.exit_code}"

                    # Only flip to UNHEALTHY once the failure threshold is reached.
                    if state.consecutive_failures >= retries:
                        state.status = HealthStatus.UNHEALTHY

                # Notify callbacks if status changed
                if old_status != state.status:
                    self._notify_status_change(process_name, state.status)

                # Wait for next interval
                await asyncio.sleep(interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                # Record the error but continue checking
                if process_name in self._health_states:
                    self._health_states[process_name].last_error = str(e)
                await asyncio.sleep(interval)

    def _notify_status_change(self, process_name: str, status: HealthStatus) -> None:
        """Notify all callbacks of a status change.

        A callback that raises is logged and skipped; the remaining callbacks
        still run.
        """
        import logging

        for callback in self._callbacks.get(process_name, []):
            try:
                callback(process_name, status)
            except Exception:
                # Don't let callback errors break health checking
                logging.getLogger(__name__).exception(
                    "Health status callback failed for process %r (status=%s)",
                    process_name,
                    status,
                )

    async def run_single_check(
        self,
        process_name: str,
        healthcheck: HealthCheckDef,
    ) -> dict[str, Any]:
        """
        Run a single health check and return result.

        Useful for manual health check triggers. Does not update the tracked
        health state; ``process_name`` is currently unused.

        Returns:
            Dict with ``success`` (exit code was 0), ``exit_code``, ``stdout``,
            ``stderr`` and an ISO-format ``timestamp``.
        """
        timeout = healthcheck.get_timeout_seconds()

        result = await self._local_context.exec_command(
            healthcheck.test,
            timeout=timeout,
        )

        return {
            "success": result.exit_code == 0,
            "exit_code": result.exit_code,
            "stdout": result.stdout,
            "stderr": result.stderr,
            "timestamp": datetime.now().isoformat(),
        }

    async def wait_for_healthy(
        self,
        process_name: str,
        healthcheck: HealthCheckDef,
        timeout: float = 60.0,
    ) -> bool:
        """
        Wait until a process becomes healthy or timeout.

        Repeatedly runs a single check, at most once per second (or per check
        interval if that is shorter), until one passes or the deadline is hit.

        Args:
            process_name: Name of the process
            healthcheck: Health check definition
            timeout: Maximum time to wait in seconds

        Returns:
            True if healthy, False if timeout or unhealthy
        """
        # Use the event loop's monotonic clock so wall-clock adjustments
        # cannot shorten or extend the wait.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + timeout
        check_interval = min(healthcheck.get_interval_seconds(), 1.0)

        while True:
            if loop.time() >= deadline:
                return False

            result = await self.run_single_check(process_name, healthcheck)
            if result["success"]:
                if process_name in self._health_states:
                    self._health_states[process_name].status = HealthStatus.HEALTHY
                return True

            remaining = deadline - loop.time()
            if remaining <= 0:
                return False

            # Never sleep past the deadline.
            await asyncio.sleep(min(check_interval, remaining))

    def to_dict(self, process_name: str) -> dict[str, Any]:
        """Get health state as a dictionary."""
        state = self.get_health(process_name)
        return {
            "status": state.status.value,
            "last_check": state.last_check.isoformat() if state.last_check else None,
            "consecutive_failures": state.consecutive_failures,
            "last_error": state.last_error,
            "check_count": state.check_count,
        }

    def reset(self) -> None:
        """Reset all health state (for testing)."""
        # Cancel all tasks without awaiting them; each loop exits when it
        # receives the cancellation.
        for task in self._check_tasks.values():
            if not task.done():
                task.cancel()
        self._check_tasks.clear()
        self._health_states.clear()
        self._callbacks.clear()
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# Singleton
|
|
264
|
+
_health_checker: HealthChecker | None = None


def get_health_checker() -> HealthChecker:
    """Return the shared HealthChecker, creating it on first use."""
    global _health_checker
    checker = _health_checker
    if checker is None:
        # First call: build the instance and cache it at module level.
        checker = _health_checker = HealthChecker()
    return checker
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def reset_health_checker() -> None:
    """Reset and drop the cached HealthChecker singleton (intended for tests)."""
    global _health_checker
    current = _health_checker
    if current:
        # Let the existing instance cancel its tasks and clear its state first.
        current.reset()
    _health_checker = None
|