mlx-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_stack/__init__.py +5 -0
- mlx_stack/_version.py +24 -0
- mlx_stack/cli/__init__.py +5 -0
- mlx_stack/cli/bench.py +221 -0
- mlx_stack/cli/config.py +166 -0
- mlx_stack/cli/down.py +109 -0
- mlx_stack/cli/init.py +180 -0
- mlx_stack/cli/install.py +165 -0
- mlx_stack/cli/logs.py +234 -0
- mlx_stack/cli/main.py +187 -0
- mlx_stack/cli/models.py +304 -0
- mlx_stack/cli/profile.py +65 -0
- mlx_stack/cli/pull.py +134 -0
- mlx_stack/cli/recommend.py +397 -0
- mlx_stack/cli/status.py +111 -0
- mlx_stack/cli/up.py +163 -0
- mlx_stack/cli/watch.py +252 -0
- mlx_stack/core/__init__.py +1 -0
- mlx_stack/core/benchmark.py +1182 -0
- mlx_stack/core/catalog.py +560 -0
- mlx_stack/core/config.py +471 -0
- mlx_stack/core/deps.py +323 -0
- mlx_stack/core/hardware.py +304 -0
- mlx_stack/core/launchd.py +531 -0
- mlx_stack/core/litellm_gen.py +188 -0
- mlx_stack/core/log_rotation.py +231 -0
- mlx_stack/core/log_viewer.py +386 -0
- mlx_stack/core/models.py +639 -0
- mlx_stack/core/paths.py +79 -0
- mlx_stack/core/process.py +887 -0
- mlx_stack/core/pull.py +815 -0
- mlx_stack/core/scoring.py +611 -0
- mlx_stack/core/stack_down.py +317 -0
- mlx_stack/core/stack_init.py +524 -0
- mlx_stack/core/stack_status.py +229 -0
- mlx_stack/core/stack_up.py +856 -0
- mlx_stack/core/watchdog.py +744 -0
- mlx_stack/data/__init__.py +1 -0
- mlx_stack/data/catalog/__init__.py +1 -0
- mlx_stack/data/catalog/deepseek-r1-32b.yaml +46 -0
- mlx_stack/data/catalog/deepseek-r1-8b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-12b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-27b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-4b.yaml +45 -0
- mlx_stack/data/catalog/llama3.3-8b.yaml +44 -0
- mlx_stack/data/catalog/nemotron-49b.yaml +41 -0
- mlx_stack/data/catalog/nemotron-8b.yaml +44 -0
- mlx_stack/data/catalog/qwen3-8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-0.8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-14b.yaml +46 -0
- mlx_stack/data/catalog/qwen3.5-32b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-3b.yaml +44 -0
- mlx_stack/data/catalog/qwen3.5-72b.yaml +42 -0
- mlx_stack/data/catalog/qwen3.5-8b.yaml +45 -0
- mlx_stack/py.typed +1 -0
- mlx_stack/utils/__init__.py +1 -0
- mlx_stack-0.1.0.dist-info/METADATA +397 -0
- mlx_stack-0.1.0.dist-info/RECORD +61 -0
- mlx_stack-0.1.0.dist-info/WHEEL +4 -0
- mlx_stack-0.1.0.dist-info/entry_points.txt +2 -0
- mlx_stack-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,744 @@
|
|
|
1
|
+
"""Watchdog health monitor for mlx-stack.
|
|
2
|
+
|
|
3
|
+
Implements a long-running health monitor that:
|
|
4
|
+
- Polls service health at configurable intervals (default 30s)
|
|
5
|
+
- Auto-restarts crashed services (PID file exists, process dead)
|
|
6
|
+
- Does NOT restart stopped services (no PID file)
|
|
7
|
+
- Tracks flap detection with configurable max-restarts threshold
|
|
8
|
+
- Uses exponential backoff on restart delay
|
|
9
|
+
- Acquires lock only during restart operations
|
|
10
|
+
- Supports daemon mode (os.fork + os.setsid + PID file)
|
|
11
|
+
- Handles SIGTERM/SIGINT for clean shutdown
|
|
12
|
+
- Triggers log rotation each poll cycle
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import logging
|
|
18
|
+
import os
|
|
19
|
+
import signal
|
|
20
|
+
import time
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from mlx_stack.core.config import ConfigCorruptError, get_value
|
|
25
|
+
from mlx_stack.core.log_rotation import rotate_log
|
|
26
|
+
from mlx_stack.core.paths import get_data_home, get_logs_dir
|
|
27
|
+
from mlx_stack.core.process import (
|
|
28
|
+
LockError,
|
|
29
|
+
acquire_lock,
|
|
30
|
+
is_process_alive,
|
|
31
|
+
read_pid_file,
|
|
32
|
+
remove_pid_file,
|
|
33
|
+
start_service,
|
|
34
|
+
write_pid_file,
|
|
35
|
+
)
|
|
36
|
+
from mlx_stack.core.stack_status import (
|
|
37
|
+
ServiceStatus,
|
|
38
|
+
run_status,
|
|
39
|
+
)
|
|
40
|
+
from mlx_stack.core.stack_up import (
|
|
41
|
+
LITELLM_SERVICE_NAME,
|
|
42
|
+
build_litellm_command,
|
|
43
|
+
build_vllm_command,
|
|
44
|
+
load_stack_definition,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
# --------------------------------------------------------------------------- #
|
|
50
|
+
# Constants
|
|
51
|
+
# --------------------------------------------------------------------------- #
|
|
52
|
+
|
|
53
|
+
WATCHDOG_SERVICE_NAME = "watchdog"  # service name used for the watchdog's own PID file
|
|
54
|
+
WATCHDOG_PID_FILE = "watchdog.pid"  # NOTE(review): not referenced in the visible part of this module
|
|
55
|
+
|
|
56
|
+
DEFAULT_INTERVAL = 30  # default seconds between health polls (run_watchdog)
|
|
57
|
+
DEFAULT_MAX_RESTARTS = 5  # default restart count at which a service is marked flapping
|
|
58
|
+
DEFAULT_RESTART_DELAY = 10  # default base delay (seconds) for restart backoff
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# --------------------------------------------------------------------------- #
|
|
62
|
+
# Exceptions
|
|
63
|
+
# --------------------------------------------------------------------------- #
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class WatchdogError(Exception):
    """Fatal watchdog failure.

    Raised by this module when the stack definition cannot be loaded,
    when another watchdog is already running, or when forking fails.
    """
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# --------------------------------------------------------------------------- #
|
|
71
|
+
# Data classes
|
|
72
|
+
# --------------------------------------------------------------------------- #
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class RestartRecord:
    """A single restart attempt made for one service."""

    # Name of the service the restart was attempted for.
    service: str
    # Time of the attempt; poll_cycle fills this from time.time().
    timestamp: float
    # Human-readable explanation of why the restart was triggered.
    reason: str
    # Ordinal of this attempt (the tracker's restart_count at that moment).
    attempt: int
    # Whether the restart call reported success.
    success: bool
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@dataclass
class ServiceTracker:
    """Per-service bookkeeping for restarts, backoff and flap detection."""

    # Monotonic-clock stamps of recent restart attempts (pruned by check_flapping).
    restart_timestamps: list[float] = field(default_factory=list)
    # Total restart attempts counted since the last flap reset.
    restart_count: int = 0
    # True once the service has been judged to be flapping.
    is_flapping: bool = False
    # Monotonic-clock stamp of the latest attempt; 0.0 means "never restarted".
    last_restart_time: float = 0.0
    # Failed restart attempts in a row; drives the exponential backoff.
    consecutive_failures: int = 0
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class WatchdogState:
    """Mutable state shared across watchdog poll cycles."""

    # Set by the signal handler; the main loop exits when this becomes True.
    shutdown_requested: bool = False
    # Restart bookkeeping, keyed by service name.
    service_trackers: dict[str, ServiceTracker] = field(default_factory=dict)
    # Every restart attempt made so far, in chronological order.
    restart_log: list[RestartRecord] = field(default_factory=list)
    # Whether the watchdog was started in daemon mode.
    is_daemon: bool = False
    # Number of poll cycles executed so far.
    cycle_count: int = 0
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class PollResult:
    """Outcome of one watchdog poll cycle."""

    # Service statuses observed at the start of the cycle.
    statuses: list[ServiceStatus]
    # How many restarts were tried during the cycle.
    restarts_attempted: int = 0
    # How many of those restarts reported success.
    restarts_succeeded: int = 0
    # Number of log files rotated during the cycle.
    rotations_performed: int = 0
    # Names of crashed services skipped because they are flapping.
    flapping_services: list[str] = field(default_factory=list)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# --------------------------------------------------------------------------- #
|
|
120
|
+
# Stack definition helpers
|
|
121
|
+
# --------------------------------------------------------------------------- #
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _load_stack_for_watchdog(stack_name: str = "default") -> dict[str, Any]:
    """Fetch the stack definition the watchdog will monitor.

    Args:
        stack_name: Name of the stack definition to load.

    Returns:
        The stack definition as returned by ``load_stack_definition``.

    Raises:
        WatchdogError: If loading fails for any reason; the underlying
            error text is appended to the message.
    """
    try:
        definition = load_stack_definition(stack_name)
    except Exception as exc:
        # The original traceback is deliberately suppressed (``from None``);
        # only the error text survives, inside the message.
        raise WatchdogError(
            f"No stack configuration found. Run 'mlx-stack init' first.\n{exc}"
        ) from None
    return definition
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _get_tier_by_name(
    stack: dict[str, Any],
    service_name: str,
) -> dict[str, Any] | None:
    """Find a tier definition by service name.

    Tolerates a missing or empty ``tiers`` key (an empty YAML key parses
    as ``None``) and skips entries that are not dicts, so a malformed
    stack file yields ``None`` instead of an exception.

    Args:
        stack: The stack definition dict.
        service_name: The service/tier name to look up.

    Returns:
        The first tier dict whose ``name`` equals ``service_name``,
        else None.
    """
    # ``or []`` also covers an explicit ``tiers: null`` / empty list.
    tiers = stack.get("tiers") or []
    for tier in tiers:
        if isinstance(tier, dict) and tier.get("name") == service_name:
            return tier
    return None
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# --------------------------------------------------------------------------- #
|
|
163
|
+
# Flap detection
|
|
164
|
+
# --------------------------------------------------------------------------- #
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def check_flapping(
    tracker: ServiceTracker,
    max_restarts: int,
    window_seconds: float = 600.0,
) -> bool:
    """Decide whether a service has been restarted too often recently.

    Restart stamps older than the rolling window are dropped from the
    tracker first. The service counts as flapping when at least
    ``max_restarts`` stamps remain; in that case ``tracker.is_flapping``
    is set. The flag is never cleared here.

    Args:
        tracker: The service's restart tracker (its timestamp list is replaced).
        max_restarts: Number of in-window restarts at which flapping is declared.
        window_seconds: Rolling window length in seconds (default 10 min).

    Returns:
        True if the service is flapping.
    """
    oldest_allowed = time.monotonic() - window_seconds

    # Keep only the restarts that happened inside the window.
    recent = [stamp for stamp in tracker.restart_timestamps if stamp > oldest_allowed]
    tracker.restart_timestamps = recent

    flapping = len(recent) >= max_restarts
    if flapping:
        tracker.is_flapping = True
    return flapping
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def reset_flap_state(
    tracker: ServiceTracker,
    stable_period: float = 300.0,
) -> bool:
    """Clear the flapping flag once no restart has happened for a while.

    Nothing happens unless the tracker is currently flapping and has a
    recorded restart time (``last_restart_time`` of 0.0 means "never").
    When at least ``stable_period`` seconds have passed since the last
    restart, the flag, the timestamp list and both counters are reset.

    Args:
        tracker: The service's restart tracker.
        stable_period: Seconds without a restart required for a reset.

    Returns:
        True if the flap state was reset.
    """
    if not tracker.is_flapping or tracker.last_restart_time == 0.0:
        return False

    quiet_for = time.monotonic() - tracker.last_restart_time
    stable = quiet_for >= stable_period
    if not stable:
        return False

    tracker.is_flapping = False
    tracker.restart_timestamps.clear()
    tracker.restart_count = 0
    tracker.consecutive_failures = 0
    return True
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# --------------------------------------------------------------------------- #
|
|
234
|
+
# Restart delay with exponential backoff
|
|
235
|
+
# --------------------------------------------------------------------------- #
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def calculate_restart_delay(
    base_delay: float,
    consecutive_failures: int,
    max_delay: float = 300.0,
) -> float:
    """Calculate the restart delay with exponential backoff.

    The delay doubles with every consecutive failure after the first
    (``base_delay * 2 ** (consecutive_failures - 1)``) and is capped at
    ``max_delay``.

    Args:
        base_delay: The base restart delay in seconds.
        consecutive_failures: Number of consecutive failures.
        max_delay: Maximum delay cap in seconds.

    Returns:
        The delay in seconds before the next restart attempt. With zero
        or negative ``consecutive_failures`` this is ``base_delay``
        unchanged (the cap is not applied in that case).
    """
    if consecutive_failures <= 0:
        return base_delay

    # 2**1023 is the largest power of two that converts to a float; a
    # bigger exponent makes ``float * int`` raise OverflowError. Clamping
    # leaves every realistic result unchanged because the cap applies
    # long before this point.
    exponent = min(consecutive_failures - 1, 1023)
    delay = base_delay * (2 ** exponent)
    return min(delay, max_delay)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# --------------------------------------------------------------------------- #
|
|
261
|
+
# Service restart
|
|
262
|
+
# --------------------------------------------------------------------------- #
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def restart_service(
    service_name: str,
    stack: dict[str, Any],
    tracker: ServiceTracker,
    vllm_binary: str | None = None,
    litellm_binary: str | None = None,
) -> bool:
    """Restart a crashed service using the stack definition.

    The lock is held only for the duration of the restart. The old PID
    file is removed only after the lock is obtained, so a lock failure
    leaves the service in "crashed" state for the next poll cycle.

    If the start attempt fails and leaves no PID file behind, the stale
    PID is written back. Without this the service would look "stopped"
    (no PID file) and the watchdog would never retry it.

    Args:
        service_name: Name of the service to restart.
        stack: The stack definition dict.
        tracker: The service's restart tracker (not used by this function).
        vllm_binary: Path to vllm-mlx binary (resolved via PATH if None).
        litellm_binary: Path to litellm binary (resolved via PATH if None).

    Returns:
        True if the restart succeeded, False otherwise (including when
        the lock could not be acquired).
    """
    import shutil

    # Resolve binaries; fall back to the bare name if not found on PATH.
    if vllm_binary is None:
        vllm_binary = shutil.which("vllm-mlx") or "vllm-mlx"
    if litellm_binary is None:
        litellm_binary = shutil.which("litellm") or "litellm"

    try:
        with acquire_lock():
            # Remember the stale PID before removing the file so the
            # "crashed" marker can be restored if the start fails.
            stale_pid = _read_pid_quietly(service_name)
            remove_pid_file(service_name)

            started = False
            try:
                if service_name == LITELLM_SERVICE_NAME:
                    started = _restart_litellm(service_name, stack, litellm_binary)
                else:
                    started = _restart_tier(service_name, stack, vllm_binary)
            finally:
                if not started:
                    _restore_crash_marker(service_name, stale_pid)
            return started
    except LockError:
        logger.warning(
            "Could not acquire lock to restart '%s' — another operation is in progress.",
            service_name,
        )
        return False
    except Exception:
        logger.exception("Failed to restart service '%s'.", service_name)
        return False


def _read_pid_quietly(service_name: str) -> int | None:
    """Return the PID recorded for *service_name*, or None if absent or unreadable."""
    try:
        return read_pid_file(service_name)
    except Exception:
        return None


def _restore_crash_marker(service_name: str, stale_pid: int | None) -> None:
    """Write the stale PID back after a failed restart if no PID file exists."""
    if stale_pid is None:
        return
    # NOTE(review): assumes start_service may leave its own PID file on
    # failure; if it did, keep that one rather than overwriting it.
    if _read_pid_quietly(service_name) is not None:
        return
    try:
        write_pid_file(service_name, stale_pid)
    except Exception:
        logger.exception(
            "Could not restore PID file for '%s' after failed restart.",
            service_name,
        )
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _restart_tier(
    service_name: str,
    stack: dict[str, Any],
    vllm_binary: str,
) -> bool:
    """Start the vllm-mlx process for one tier of the stack.

    Args:
        service_name: Tier name; also used as the service name.
        stack: The stack definition dict.
        vllm_binary: Path to vllm-mlx binary.

    Returns:
        True if ``start_service`` completed without raising. False if
        the tier is not in the stack or the start raised.
    """
    tier_def = _get_tier_by_name(stack, service_name)
    if tier_def is None:
        logger.error("Tier '%s' not found in stack definition.", service_name)
        return False

    # Built outside the try on purpose: errors here propagate to the caller.
    launch_kwargs = {
        "service_name": service_name,
        "cmd": build_vllm_command(tier_def, vllm_binary),
        "port": tier_def["port"],
    }

    try:
        start_service(**launch_kwargs)
    except Exception:
        logger.exception("Failed to start tier '%s'.", service_name)
        return False
    return True
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _restart_litellm(
    service_name: str,
    stack: dict[str, Any],
    litellm_binary: str,
) -> bool:
    """Restart the LiteLLM proxy service.

    The port comes from the ``litellm-port`` config value (default 4000
    if it is missing, corrupt or not an integer). The proxy config file
    is ``litellm.yaml`` inside the data home. If an ``openrouter-key``
    is configured it is passed as ``OPENROUTER_API_KEY``.

    Args:
        service_name: The service name (litellm).
        stack: The stack definition dict (not used by this function).
        litellm_binary: Path to litellm binary.

    Returns:
        True if ``start_service`` completed without raising.
    """
    try:
        litellm_port = int(get_value("litellm-port"))
    except (ConfigCorruptError, TypeError, ValueError):
        # TypeError covers a None value — presumably possible for an
        # unset key; confirm against get_value.
        litellm_port = 4000

    litellm_config_path = get_data_home() / "litellm.yaml"

    cmd = build_litellm_command(litellm_binary, litellm_port, litellm_config_path)

    # Build env with OpenRouter key if configured. The key is optional,
    # so a failure to read it must not block the restart.
    env: dict[str, str] | None = None
    try:
        raw_key = get_value("openrouter-key")
    except Exception:
        logger.debug(
            "Could not read 'openrouter-key'; starting LiteLLM without it.",
            exc_info=True,
        )
        raw_key = None
    # Test truthiness before str(): str(None) is the truthy "None",
    # which would otherwise be exported as the API key.
    if raw_key:
        env = {"OPENROUTER_API_KEY": str(raw_key)}

    try:
        start_service(
            service_name=service_name,
            cmd=cmd,
            port=litellm_port,
            env=env,
        )
    except Exception:
        logger.exception("Failed to start LiteLLM.")
        return False
    return True
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
# --------------------------------------------------------------------------- #
|
|
400
|
+
# Log rotation during poll
|
|
401
|
+
# --------------------------------------------------------------------------- #
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def rotate_service_logs() -> int:
    """Rotate all service log files that exceed the configured threshold.

    Limits come from the ``log-max-size-mb`` and ``log-max-files``
    config values (defaults 50 and 5 if missing, corrupt or not
    integers). Every regular ``*.log`` file directly inside the logs
    directory is passed to ``rotate_log``.

    Rotation is best-effort: a missing or unreadable logs directory and
    per-file rotation errors are logged and skipped, so the poll loop
    calling this keeps running.

    Returns:
        Number of files that were rotated.
    """
    try:
        max_size_mb = int(get_value("log-max-size-mb"))
    except (ConfigCorruptError, TypeError, ValueError):
        # TypeError covers a None value — presumably possible for an
        # unset key; confirm against get_value.
        max_size_mb = 50

    try:
        max_files = int(get_value("log-max-files"))
    except (ConfigCorruptError, TypeError, ValueError):
        max_files = 5

    logs_dir = get_logs_dir()

    # List up front: the directory is not being iterated while rotation
    # renames or creates files in it, and listing errors are caught here.
    try:
        candidates = [
            path
            for path in logs_dir.iterdir()
            if path.suffix == ".log" and path.is_file()
        ]
    except FileNotFoundError:
        return 0
    except OSError:
        logger.exception("Could not list log directory %s.", logs_dir)
        return 0

    rotated_count = 0
    for log_path in candidates:
        try:
            if rotate_log(log_path, max_size_mb=max_size_mb, max_files=max_files):
                rotated_count += 1
        except Exception:
            logger.exception("Error rotating log %s.", log_path)

    return rotated_count
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# --------------------------------------------------------------------------- #
|
|
437
|
+
# Daemon mode
|
|
438
|
+
# --------------------------------------------------------------------------- #
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def check_existing_watchdog() -> int | None:
    """Report the PID of a live watchdog, cleaning up a stale PID file.

    Returns:
        The recorded PID if that process is alive. None if there is no
        PID file, it cannot be read, or the recorded process is dead
        (in which case the stale file is removed).
    """
    try:
        recorded_pid = read_pid_file(WATCHDOG_SERVICE_NAME)
    except Exception:
        # An unreadable PID file is treated as "no watchdog running".
        return None

    if recorded_pid is not None:
        if is_process_alive(recorded_pid):
            return recorded_pid
        # The file points at a dead process, so remove it.
        remove_pid_file(WATCHDOG_SERVICE_NAME)
    return None
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def daemonize() -> None:
    """Fork the process to run as a daemon.

    Uses the double-fork technique with os.setsid() to detach from the
    terminal, redirects stdin/stdout/stderr to the null device and
    writes the daemon PID to the watchdog PID file. The calling process
    and the intermediate child exit via ``os._exit(0)``; only the
    grandchild returns from this function.

    Raises:
        WatchdogError: If a fork, the session creation or the stdio
            redirection fails.
    """
    import sys

    # os._exit() skips stdio flushing, so anything still buffered in the
    # exiting parent would be lost. Flush before forking.
    for stream in (sys.stdout, sys.stderr):
        if stream is None:
            continue
        try:
            stream.flush()
        except (OSError, ValueError):
            # Closed or broken stream: there is nothing to preserve.
            pass

    try:
        pid = os.fork()
        if pid > 0:
            # Parent exits
            os._exit(0)
    except OSError as exc:
        msg = f"First fork failed: {exc}"
        raise WatchdogError(msg) from None

    # Become session leader
    try:
        os.setsid()
    except OSError as exc:
        msg = f"Could not create a new session: {exc}"
        raise WatchdogError(msg) from None

    try:
        pid = os.fork()
        if pid > 0:
            # First child exits
            os._exit(0)
    except OSError as exc:
        msg = f"Second fork failed: {exc}"
        raise WatchdogError(msg) from None

    # Redirect stdin/stdout/stderr to the null device
    try:
        devnull = os.open(os.devnull, os.O_RDWR)
        try:
            for fd in (0, 1, 2):
                os.dup2(devnull, fd)
        finally:
            # If a standard fd was closed beforehand, os.open() may have
            # returned 0, 1 or 2; closing it would undo the redirection.
            if devnull > 2:
                os.close(devnull)
    except OSError as exc:
        msg = f"Could not redirect standard streams: {exc}"
        raise WatchdogError(msg) from None

    # Write daemon PID
    write_pid_file(WATCHDOG_SERVICE_NAME, os.getpid())
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def remove_watchdog_pid() -> None:
    """Delete the PID file registered under the watchdog's service name."""
    remove_pid_file(WATCHDOG_SERVICE_NAME)
|
|
507
|
+
|
|
508
|
+
|
|
509
|
+
# --------------------------------------------------------------------------- #
|
|
510
|
+
# Signal handling
|
|
511
|
+
# --------------------------------------------------------------------------- #
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def setup_signal_handlers(state: WatchdogState) -> None:
    """Install one shutdown handler for both SIGTERM and SIGINT.

    The handler only sets ``state.shutdown_requested``; the main loop
    is expected to notice the flag and stop.

    Args:
        state: The watchdog state whose shutdown flag the handler sets.
    """

    def _request_shutdown(signum: int, frame: Any) -> None:
        state.shutdown_requested = True

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _request_shutdown)
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
# --------------------------------------------------------------------------- #
|
|
528
|
+
# Main poll cycle
|
|
529
|
+
# --------------------------------------------------------------------------- #
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def poll_cycle(
    state: WatchdogState,
    stack: dict[str, Any],
    interval: int,
    max_restarts: int,
    restart_delay: int,
    vllm_binary: str | None = None,
    litellm_binary: str | None = None,
) -> PollResult:
    """Run one health check pass over all services.

    Steps, in order:

    1. Fetch the current status of every service.
    2. For each service reported as "crashed", try a restart unless it
       is flapping or its backoff delay has not yet elapsed.
    3. Rotate oversized logs.

    If the status call reports that there is no stack, the function
    returns right after step 1 and does not rotate logs.

    Args:
        state: The watchdog state; trackers, restart log and cycle
            counter are updated in place.
        stack: The stack definition.
        interval: Poll interval; not used inside this function.
        max_restarts: Restart count at which a service is marked flapping.
        restart_delay: Base restart delay in seconds.
        vllm_binary: Path to vllm-mlx binary.
        litellm_binary: Path to litellm binary.

    Returns:
        PollResult with the observed statuses and the actions taken.
    """
    state.cycle_count += 1

    snapshot = run_status()
    outcome = PollResult(statuses=snapshot.services)
    if snapshot.no_stack:
        return outcome

    for svc in snapshot.services:
        name = svc.tier
        tracker = state.service_trackers.setdefault(name, ServiceTracker())

        # Give a flapping service another chance once it has been quiet.
        reset_flap_state(tracker)

        if svc.status != "crashed":
            # A healthy service ends any running failure streak.
            if svc.status == "healthy" and tracker.consecutive_failures > 0:
                tracker.consecutive_failures = 0
            continue

        # From here on the service is crashed.
        if tracker.is_flapping:
            outcome.flapping_services.append(name)
            continue

        if check_flapping(tracker, max_restarts):
            outcome.flapping_services.append(name)
            logger.warning(
                "Service '%s' marked as flapping after %d restarts. "
                "Stopping auto-restart.",
                name,
                max_restarts,
            )
            continue

        backoff = calculate_restart_delay(
            base_delay=float(restart_delay),
            consecutive_failures=tracker.consecutive_failures,
        )

        # Respect the backoff; a service never restarted has no wait.
        now = time.monotonic()
        if tracker.last_restart_time > 0 and now - tracker.last_restart_time < backoff:
            continue

        outcome.restarts_attempted += 1
        tracker.restart_count += 1

        restarted = restart_service(
            service_name=name,
            stack=stack,
            tracker=tracker,
            vllm_binary=vllm_binary,
            litellm_binary=litellm_binary,
        )

        state.restart_log.append(
            RestartRecord(
                service=name,
                timestamp=time.time(),
                reason="crashed (PID file exists, process dead)",
                attempt=tracker.restart_count,
                success=restarted,
            )
        )

        # Failed attempts are stamped too, so they count towards flapping.
        tracker.last_restart_time = time.monotonic()
        tracker.restart_timestamps.append(time.monotonic())

        if restarted:
            outcome.restarts_succeeded += 1
            tracker.consecutive_failures = 0
        else:
            tracker.consecutive_failures += 1

    outcome.rotations_performed = rotate_service_logs()

    return outcome
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
# --------------------------------------------------------------------------- #
|
|
652
|
+
# Main watchdog loop
|
|
653
|
+
# --------------------------------------------------------------------------- #
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def run_watchdog(
    interval: int = DEFAULT_INTERVAL,
    max_restarts: int = DEFAULT_MAX_RESTARTS,
    restart_delay: int = DEFAULT_RESTART_DELAY,
    daemon: bool = False,
    stack_name: str = "default",
    status_callback: Any = None,
    restart_callback: Any = None,
) -> WatchdogState:
    """Run the watchdog health monitor main loop.

    Loads the stack, refuses to start if another watchdog is alive,
    optionally daemonizes, installs SIGTERM/SIGINT handlers and then
    polls until shutdown is requested. The watchdog PID file is removed
    on the way out, including when the loop exits with an exception.

    Args:
        interval: Seconds between health polls.
        max_restarts: Max restarts before marking as flapping.
        restart_delay: Base delay in seconds before restart.
        daemon: Whether to daemonize.
        stack_name: Stack definition name.
        status_callback: Called with (PollResult, WatchdogState) each cycle.
        restart_callback: Called with (RestartRecord) for each restart event.

    Returns:
        The final WatchdogState.

    Raises:
        WatchdogError: On fatal errors (no stack, already running, etc.).
    """
    # Check stack prerequisite
    stack = _load_stack_for_watchdog(stack_name)

    # Check for existing watchdog
    existing_pid = check_existing_watchdog()
    if existing_pid is not None:
        msg = (
            f"A watchdog is already running (PID {existing_pid}). "
            "Stop it before starting a new one."
        )
        raise WatchdogError(msg)

    state = WatchdogState(is_daemon=daemon)

    # Daemon mode (daemonize() writes the PID file itself)
    if daemon:
        daemonize()

    # Set up signal handlers
    setup_signal_handlers(state)

    # Write PID for non-daemon mode too (for single-instance check)
    if not daemon:
        write_pid_file(WATCHDOG_SERVICE_NAME, os.getpid())

    # Resolve binaries once so every poll cycle uses the same paths
    import shutil

    vllm_binary = shutil.which("vllm-mlx") or "vllm-mlx"
    litellm_binary = shutil.which("litellm") or "litellm"

    try:
        while not state.shutdown_requested:
            result = poll_cycle(
                state=state,
                stack=stack,
                interval=interval,
                max_restarts=max_restarts,
                restart_delay=restart_delay,
                vllm_binary=vllm_binary,
                litellm_binary=litellm_binary,
            )

            # Callbacks
            if status_callback is not None:
                status_callback(result, state)

            if restart_callback is not None and result.restarts_attempted > 0:
                # The records for this cycle are the newest N log entries.
                for record in state.restart_log[-result.restarts_attempted:]:
                    restart_callback(record)

            # Sleep in small increments so we can check the shutdown flag.
            # The remaining time is computed once per iteration, so
            # time.sleep() never receives a negative value (which raises
            # ValueError) if the deadline passes between the check and
            # the sleep call.
            sleep_end = time.monotonic() + interval
            while not state.shutdown_requested:
                remaining = sleep_end - time.monotonic()
                if remaining <= 0:
                    break
                time.sleep(min(0.5, remaining))

    finally:
        # Clean shutdown
        remove_watchdog_pid()

    return state
|