mlx-stack 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mlx_stack/__init__.py +5 -0
- mlx_stack/_version.py +24 -0
- mlx_stack/cli/__init__.py +5 -0
- mlx_stack/cli/bench.py +221 -0
- mlx_stack/cli/config.py +166 -0
- mlx_stack/cli/down.py +109 -0
- mlx_stack/cli/init.py +180 -0
- mlx_stack/cli/install.py +165 -0
- mlx_stack/cli/logs.py +234 -0
- mlx_stack/cli/main.py +187 -0
- mlx_stack/cli/models.py +304 -0
- mlx_stack/cli/profile.py +65 -0
- mlx_stack/cli/pull.py +134 -0
- mlx_stack/cli/recommend.py +397 -0
- mlx_stack/cli/status.py +111 -0
- mlx_stack/cli/up.py +163 -0
- mlx_stack/cli/watch.py +252 -0
- mlx_stack/core/__init__.py +1 -0
- mlx_stack/core/benchmark.py +1182 -0
- mlx_stack/core/catalog.py +560 -0
- mlx_stack/core/config.py +471 -0
- mlx_stack/core/deps.py +323 -0
- mlx_stack/core/hardware.py +304 -0
- mlx_stack/core/launchd.py +531 -0
- mlx_stack/core/litellm_gen.py +188 -0
- mlx_stack/core/log_rotation.py +231 -0
- mlx_stack/core/log_viewer.py +386 -0
- mlx_stack/core/models.py +639 -0
- mlx_stack/core/paths.py +79 -0
- mlx_stack/core/process.py +887 -0
- mlx_stack/core/pull.py +815 -0
- mlx_stack/core/scoring.py +611 -0
- mlx_stack/core/stack_down.py +317 -0
- mlx_stack/core/stack_init.py +524 -0
- mlx_stack/core/stack_status.py +229 -0
- mlx_stack/core/stack_up.py +856 -0
- mlx_stack/core/watchdog.py +744 -0
- mlx_stack/data/__init__.py +1 -0
- mlx_stack/data/catalog/__init__.py +1 -0
- mlx_stack/data/catalog/deepseek-r1-32b.yaml +46 -0
- mlx_stack/data/catalog/deepseek-r1-8b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-12b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-27b.yaml +45 -0
- mlx_stack/data/catalog/gemma3-4b.yaml +45 -0
- mlx_stack/data/catalog/llama3.3-8b.yaml +44 -0
- mlx_stack/data/catalog/nemotron-49b.yaml +41 -0
- mlx_stack/data/catalog/nemotron-8b.yaml +44 -0
- mlx_stack/data/catalog/qwen3-8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-0.8b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-14b.yaml +46 -0
- mlx_stack/data/catalog/qwen3.5-32b.yaml +45 -0
- mlx_stack/data/catalog/qwen3.5-3b.yaml +44 -0
- mlx_stack/data/catalog/qwen3.5-72b.yaml +42 -0
- mlx_stack/data/catalog/qwen3.5-8b.yaml +45 -0
- mlx_stack/py.typed +1 -0
- mlx_stack/utils/__init__.py +1 -0
- mlx_stack-0.1.0.dist-info/METADATA +397 -0
- mlx_stack-0.1.0.dist-info/RECORD +61 -0
- mlx_stack-0.1.0.dist-info/WHEEL +4 -0
- mlx_stack-0.1.0.dist-info/entry_points.txt +2 -0
- mlx_stack-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,887 @@
|
|
|
1
|
+
"""Process management module for mlx-stack.
|
|
2
|
+
|
|
3
|
+
Handles starting, stopping, and health-checking of vllm-mlx and LiteLLM
|
|
4
|
+
subprocesses. Manages PID files in ~/.mlx-stack/pids/, log file redirection
|
|
5
|
+
to ~/.mlx-stack/logs/, HTTP health checks with exponential backoff,
|
|
6
|
+
lockfile (fcntl.flock) for concurrent invocation prevention,
|
|
7
|
+
SIGTERM/SIGKILL shutdown with grace period, stale PID detection and cleanup,
|
|
8
|
+
and port conflict detection.
|
|
9
|
+
|
|
10
|
+
This is the infrastructure module used by up, down, and status commands.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import fcntl
|
|
16
|
+
import os
|
|
17
|
+
import signal
|
|
18
|
+
import subprocess
|
|
19
|
+
import time
|
|
20
|
+
from contextlib import contextmanager
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Iterator
|
|
24
|
+
|
|
25
|
+
import httpx
|
|
26
|
+
import psutil
|
|
27
|
+
|
|
28
|
+
from mlx_stack.core.paths import (
|
|
29
|
+
ensure_data_home,
|
|
30
|
+
get_lock_path,
|
|
31
|
+
get_logs_dir,
|
|
32
|
+
get_pids_dir,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# --------------------------------------------------------------------------- #
|
|
36
|
+
# Constants
|
|
37
|
+
# --------------------------------------------------------------------------- #
|
|
38
|
+
|
|
39
|
+
# Seconds a process is given to exit after SIGTERM before SIGKILL is sent.
SHUTDOWN_GRACE_PERIOD = 10

# Defaults for polling a freshly started service until it reports healthy.
HEALTH_CHECK_TIMEOUT = 120  # overall time budget, in seconds
HEALTH_CHECK_INITIAL_DELAY = 0.5  # first pause between attempts, in seconds
HEALTH_CHECK_MAX_DELAY = 10.0  # upper bound for the pause, in seconds
HEALTH_CHECK_BACKOFF_FACTOR = 2.0  # the pause is multiplied by this after each attempt

# Settings for the one-shot health probe used when reporting status.
STATUS_CHECK_TIMEOUT = 5.0  # HTTP timeout per service, in seconds
STATUS_DEGRADED_THRESHOLD = 2.0  # a slower 200 response is reported as degraded
|
|
51
|
+
|
|
52
|
+
# --------------------------------------------------------------------------- #
|
|
53
|
+
# Exceptions
|
|
54
|
+
# --------------------------------------------------------------------------- #
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class ProcessError(Exception):
    """Base error for a process-management operation that could not complete."""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class LockError(ProcessError):
    """Signals that the exclusive lockfile could not be acquired."""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class HealthCheckError(ProcessError):
    """Signals that a service never became healthy within the allowed retries."""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class PortConflictError(ProcessError):
    """Signals that a required TCP port is already occupied.

    The constructor arguments are kept as the attributes ``port``, ``pid``
    and ``name`` so callers can inspect the conflict programmatically.
    """

    def __init__(self, port: int, pid: int | None = None, name: str | None = None) -> None:
        """Store the conflict details and compose the error message.

        Args:
            port: The port that is already in use.
            pid: PID of the process holding the port, if known.
            name: Name of the process holding the port, if known.
        """
        self.port = port
        self.pid = pid
        self.name = name

        # Optional fragments are only included when the detail was supplied.
        fragments = [
            f"Port {port} is already in use",
            f"by PID {pid}" if pid is not None else None,
            f"({name})" if name is not None else None,
        ]
        super().__init__(" ".join(piece for piece in fragments if piece is not None))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# --------------------------------------------------------------------------- #
|
|
85
|
+
# Data classes
|
|
86
|
+
# --------------------------------------------------------------------------- #
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass(frozen=True)
class ServiceInfo:
    """Immutable description of one managed service."""

    # Service identifier.
    name: str
    # Process ID, or None when there is none to report.
    pid: int | None
    # TCP port associated with the service.
    port: int
    # Location of the service's log file, if any.
    log_path: Path | None
    # Location of the service's PID file, if any.
    pid_path: Path | None
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class ShutdownResult:
    """Outcome of shutting down one service.

    Attributes:
        name: Name of the service that was stopped.
        pid: PID that was signalled.
        graceful: True for a graceful stop; False is rendered as "forced".
    """

    def __init__(self, name: str, pid: int, graceful: bool) -> None:
        """Record which service/PID was stopped and how."""
        self.name, self.pid, self.graceful = name, pid, graceful

    def __repr__(self) -> str:
        """Return a debug representation naming the shutdown method."""
        if self.graceful:
            method = "graceful"
        else:
            method = "forced"
        return f"ShutdownResult(name={self.name!r}, pid={self.pid}, method={method!r})"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass(frozen=True)
class HealthCheckResult:
    """Immutable outcome of one health probe against a service."""

    # Whether the probe counted as a success.
    healthy: bool
    # Round-trip time in seconds; None when no response was received.
    response_time: float | None
    # HTTP status code of the response; None when no response was received.
    status_code: int | None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# --------------------------------------------------------------------------- #
|
|
123
|
+
# PID file management
|
|
124
|
+
# --------------------------------------------------------------------------- #
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _ensure_pids_dir() -> Path:
    """Create the PID directory (and any missing parents) and return it."""
    directory = get_pids_dir()
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _ensure_logs_dir() -> Path:
    """Create the logs directory (and any missing parents) and return it."""
    directory = get_logs_dir()
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def write_pid_file(service_name: str, pid: int) -> Path:
    """Record a service's PID in ``<pids dir>/<service_name>.pid``.

    The file holds only the decimal PID: no newline, no padding.

    Args:
        service_name: Name of the service (e.g. 'fast', 'litellm').
        pid: The process ID to store.

    Returns:
        Path of the PID file that was written.

    Raises:
        ProcessError: If writing the file fails with an OSError.
    """
    target = _ensure_pids_dir() / f"{service_name}.pid"
    try:
        target.write_text(str(pid))
    except OSError as exc:
        msg = f"Could not write PID file for '{service_name}': {exc}"
        raise ProcessError(msg) from None
    return target
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def read_pid_file(service_name: str) -> int | None:
    """Read a PID from a service's PID file.

    Args:
        service_name: Name of the service.

    Returns:
        The PID as a positive integer, or None if the file doesn't exist
        or is empty.

    Raises:
        ProcessError: If the PID file cannot be read, contains non-numeric
            content, or contains a non-positive PID.
    """
    pid_path = get_pids_dir() / f"{service_name}.pid"

    # Read first and handle "missing" afterwards: a separate exists() check
    # can race with another invocation deleting the file.
    try:
        content = pid_path.read_text().strip()
    except FileNotFoundError:
        return None
    except UnicodeDecodeError:
        # Undecodable bytes can never be a PID; report as corrupt content.
        msg = (
            f"PID file for '{service_name}' contains non-numeric content: "
            "<undecodable bytes>"
        )
        raise ProcessError(msg) from None
    except OSError as exc:
        msg = f"Could not read PID file for '{service_name}': {exc}"
        raise ProcessError(msg) from None

    if not content:
        return None

    try:
        pid = int(content)
    except ValueError:
        msg = (
            f"PID file for '{service_name}' contains non-numeric content: "
            f"{content!r}"
        )
        raise ProcessError(msg) from None

    # os.kill() treats 0 and negative PIDs as process-group / broadcast
    # targets, so such a value must never be handed to the signalling code.
    if pid <= 0:
        msg = f"PID file for '{service_name}' contains an invalid PID: {content!r}"
        raise ProcessError(msg)

    return pid
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def remove_pid_file(service_name: str) -> bool:
    """Remove a service's PID file (best effort).

    Args:
        service_name: Name of the service.

    Returns:
        True if the file was removed, False if it didn't exist or could
        not be removed. Removal failures never raise.
    """
    pid_path = get_pids_dir() / f"{service_name}.pid"
    try:
        pid_path.unlink()
    except FileNotFoundError:
        # Nothing to remove (also covers a concurrent removal).
        return False
    except OSError:
        # Best-effort removal: the failure is swallowed, but it is not
        # reported as a successful removal either.
        return False
    return True
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def list_pid_files() -> dict[str, Path]:
    """Collect every ``*.pid`` file in the PID directory.

    Returns:
        A mapping from service name (the file name without ``.pid``) to
        the PID file's path; empty when the directory does not exist.
    """
    directory = get_pids_dir()
    if not directory.exists():
        return {}
    return {entry.stem: entry for entry in directory.glob("*.pid")}
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# --------------------------------------------------------------------------- #
|
|
234
|
+
# Process state checks
|
|
235
|
+
# --------------------------------------------------------------------------- #
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def is_process_alive(pid: int) -> bool:
    """Report whether ``pid`` refers to a running, non-zombie process.

    Args:
        pid: The process ID to check.

    Returns:
        True if the PID exists and its status is not zombie. False if it
        does not exist, is a zombie, or psutil raises NoSuchProcess or
        AccessDenied while querying it.
    """
    try:
        if not psutil.pid_exists(pid):
            return False
        state = psutil.Process(pid).status()
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return False
    return state != psutil.STATUS_ZOMBIE
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def is_stale_pid(service_name: str) -> bool:
    """Tell whether a service's PID file no longer matches a live process.

    Args:
        service_name: Name of the service.

    Returns:
        True if the PID file is corrupt, or holds a PID whose process is
        not alive. False if there is no PID to check or the process is
        still running.
    """
    try:
        recorded = read_pid_file(service_name)
    except ProcessError:
        # An unreadable/corrupt PID file counts as stale.
        stale = True
    else:
        stale = recorded is not None and not is_process_alive(recorded)
    return stale
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def cleanup_stale_pid(service_name: str) -> bool:
    """Delete a service's PID file when it is stale.

    Args:
        service_name: Name of the service.

    Returns:
        True if the PID file was judged stale and removal was attempted,
        False if the process is alive or there is no PID file.
    """
    if not is_stale_pid(service_name):
        return False
    remove_pid_file(service_name)
    return True
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# --------------------------------------------------------------------------- #
|
|
291
|
+
# Lockfile management
|
|
292
|
+
# --------------------------------------------------------------------------- #
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
@contextmanager
def acquire_lock() -> Iterator[None]:
    """Hold an exclusive, non-blocking ``flock`` for the duration of the context.

    The lock is taken on the file returned by ``get_lock_path()``, which is
    created if missing. It is released, and the descriptor closed, when the
    context exits, whether normally or through an exception.

    Yields:
        None once the lock is held.

    Raises:
        LockError: If ``flock`` fails, which is reported as another
            operation already holding the lock.
    """
    ensure_data_home()
    lock_path = get_lock_path()

    # O_CREAT without O_TRUNC: create the file if needed, never modify it.
    handle = os.open(str(lock_path), os.O_WRONLY | os.O_CREAT, 0o644)
    holding = False
    try:
        try:
            fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError:
            msg = (
                "Another mlx-stack operation is already running. "
                "Wait for it to finish or remove the lock file: "
                f"{lock_path}"
            )
            raise LockError(msg) from None
        holding = True
        yield
    finally:
        if holding:
            try:
                fcntl.flock(handle, fcntl.LOCK_UN)
            except OSError:
                pass  # closing the descriptor below drops the lock anyway
        os.close(handle)
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# --------------------------------------------------------------------------- #
|
|
336
|
+
# Health checks
|
|
337
|
+
# --------------------------------------------------------------------------- #
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def http_health_check(
    port: int,
    path: str = "/v1/models",
    timeout: float = STATUS_CHECK_TIMEOUT,
    host: str = "127.0.0.1",
) -> HealthCheckResult:
    """Issue one HTTP GET against a service and summarise the outcome.

    Args:
        port: The port to probe.
        path: The HTTP path to request.
        timeout: Request timeout in seconds.
        host: The host to connect to.

    Returns:
        A HealthCheckResult. ``healthy`` is True only for an HTTP 200
        response. When the request fails with an httpx error or OSError,
        both ``response_time`` and ``status_code`` are None.
    """
    url = f"http://{host}:{port}{path}"
    started = time.monotonic()
    try:
        response = httpx.get(url, timeout=timeout)
    except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPError, OSError):
        return HealthCheckResult(healthy=False, response_time=None, status_code=None)

    elapsed = time.monotonic() - started
    code = response.status_code
    return HealthCheckResult(healthy=code == 200, response_time=elapsed, status_code=code)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def wait_for_healthy(
    port: int,
    path: str = "/v1/models",
    total_timeout: float = HEALTH_CHECK_TIMEOUT,
    initial_delay: float = HEALTH_CHECK_INITIAL_DELAY,
    max_delay: float = HEALTH_CHECK_MAX_DELAY,
    backoff_factor: float = HEALTH_CHECK_BACKOFF_FACTOR,
    host: str = "127.0.0.1",
) -> HealthCheckResult:
    """Wait for a service to become healthy with exponential backoff.

    Polls the service's health endpoint repeatedly, with an increasing
    pause between attempts, until the service answers with HTTP 200 or
    the total timeout is used up.

    Args:
        port: The port to check.
        path: The HTTP path to request.
        total_timeout: Maximum total time to wait in seconds.
        initial_delay: Initial delay between retries in seconds.
        max_delay: Maximum delay between retries in seconds.
        backoff_factor: Multiplier for exponential backoff.
        host: The host to connect to.

    Returns:
        The HealthCheckResult of the first healthy probe.

    Raises:
        HealthCheckError: If the service does not become healthy
            within the total timeout.
    """
    deadline = time.monotonic() + total_timeout
    delay = initial_delay

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Cap each request at 5s, but never let it outlive the deadline.
        result = http_health_check(
            port=port,
            path=path,
            timeout=min(5.0, remaining),
            host=host,
        )
        if result.healthy:
            return result

        # Pause before the next attempt, again respecting the deadline.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(delay, remaining))
        delay = min(delay * backoff_factor, max_delay)

    msg = (
        f"Health check timed out after {total_timeout}s waiting for "
        f"http://{host}:{port}{path}"
    )
    raise HealthCheckError(msg)
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
# --------------------------------------------------------------------------- #
|
|
447
|
+
# Port conflict detection
|
|
448
|
+
# --------------------------------------------------------------------------- #
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def _socket_bind_check(port: int) -> bool:
    """Probe whether a TCP port on 127.0.0.1 is taken by trying to bind it.

    A bind attempt is used instead of psutil.net_connections because the
    latter can fail with AccessDenied on macOS.

    Args:
        port: The TCP port to probe.

    Returns:
        True if the bind failed (port in use), False if it succeeded.
    """
    import socket

    # The context manager closes the probe socket on every path.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        try:
            probe.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 0)
            probe.bind(("127.0.0.1", port))
        except OSError:
            return True  # Port is in use
        else:
            return False  # Port is available
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _find_pid_on_port(port: int) -> tuple[int, str] | None:
    """Look up which process is listening on a port, using psutil.

    Args:
        port: The TCP port to look up.

    Returns:
        ``(pid, process_name)`` for the first listening connection on the
        port that has a known PID; the name is ``"<unknown>"`` when the
        process cannot be inspected. None if nothing matches or psutil
        cannot enumerate connections.
    """
    try:
        for conn in psutil.net_connections(kind="inet"):
            if not conn.laddr or conn.laddr.port != port or conn.status != "LISTEN":
                continue
            owner_pid = conn.pid
            if owner_pid is None:
                continue  # listener without a visible owner; keep looking
            try:
                owner_name = psutil.Process(owner_pid).name()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                owner_name = "<unknown>"
            return (owner_pid, owner_name)
    except (psutil.AccessDenied, OSError):
        pass
    return None
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def check_port_conflict(port: int) -> tuple[int, str] | None:
    """Determine whether a port is taken and, if so, by whom.

    Works in two steps so that detection does not depend on psutil being
    allowed to list connections:
      1. A socket bind attempt decides whether the port is in use.
      2. Only if it is, psutil is asked for the owning PID and name.

    Args:
        port: The TCP port to check.

    Returns:
        None if the port is free. Otherwise ``(pid, process_name)``, or
        ``(0, "<unknown>")`` when the owner cannot be identified.
    """
    port_busy = _socket_bind_check(port)
    if not port_busy:
        return None

    owner = _find_pid_on_port(port)
    return (0, "<unknown>") if owner is None else owner
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def detect_port_conflict(port: int) -> None:
    """Raise PortConflictError if a port is already in use.

    Args:
        port: The TCP port to check.

    Raises:
        PortConflictError: If the port is in use. ``pid`` and ``name`` are
            left as None when the owning process could not be identified.
    """
    conflict = check_port_conflict(port)
    if conflict is None:
        return

    pid, name = conflict
    if pid == 0:
        # check_port_conflict uses PID 0 as its "owner unknown" sentinel;
        # passing it on would produce the message "by PID 0 (<unknown>)".
        raise PortConflictError(port=port)
    raise PortConflictError(port=port, pid=pid, name=name)
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
# --------------------------------------------------------------------------- #
|
|
547
|
+
# Subprocess management
|
|
548
|
+
# --------------------------------------------------------------------------- #
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def start_service(
    service_name: str,
    cmd: list[str],
    port: int,
    env: dict[str, str] | None = None,
    log_dir: Path | None = None,
) -> ServiceInfo:
    """Start a subprocess for a service with log redirection and PID tracking.

    The process is started in a new session so it is detached from the
    CLI's lifecycle. Stdout and stderr are appended to
    ``<log_dir>/<service_name>.log``.

    This function does not check whether ``port`` is free; ``port`` is only
    recorded in the returned ServiceInfo.

    Args:
        service_name: Name for the service (used for PID/log files).
        cmd: The command to execute as a list of strings.
        port: The port the service will listen on.
        env: Optional environment variables for the subprocess.
            Merged over the current environment.
        log_dir: Directory for log files. Defaults to the logs directory
            returned by ``get_logs_dir()``.

    Returns:
        A ServiceInfo with the started service's details.

    Raises:
        ProcessError: If the log file cannot be opened, the process cannot
            be spawned, or the PID file cannot be written (in which case
            the spawned process is terminated first).
    """
    # Ensure directories exist
    if log_dir is None:
        log_dir = _ensure_logs_dir()
    else:
        log_dir.mkdir(parents=True, exist_ok=True)

    log_path = log_dir / f"{service_name}.log"

    # Build environment
    process_env: dict[str, str] = dict(os.environ)
    if env:
        process_env.update(env)

    try:
        log_file = open(log_path, "a")  # noqa: SIM115
    except OSError as exc:
        msg = f"Could not open log file for '{service_name}': {exc}"
        raise ProcessError(msg) from None

    # The parent only needs its handle while spawning; the child gets its own
    # descriptors. The with-block closes it on every path, including
    # unexpected (non-OSError) failures from Popen.
    with log_file:
        try:
            proc = subprocess.Popen(
                cmd,
                stdout=log_file,
                stderr=log_file,
                env=process_env,
                start_new_session=True,
            )
        except OSError as exc:
            msg = f"Could not start service '{service_name}': {exc}"
            raise ProcessError(msg) from None

    # Write PID file: if this fails, kill the spawned process so no
    # unmanaged subprocess is left behind.
    try:
        pid_path = write_pid_file(service_name, proc.pid)
    except ProcessError:
        try:
            proc.terminate()
            proc.wait(timeout=5)
        except (OSError, subprocess.TimeoutExpired):
            # Graceful stop failed or timed out; force it and reap the child.
            try:
                proc.kill()
                proc.wait(timeout=5)
            except (OSError, subprocess.TimeoutExpired):
                pass  # best effort: nothing more can be done here
        msg = (
            f"Could not write PID file for '{service_name}' after "
            f"starting process (PID {proc.pid}). "
            f"The process has been terminated to prevent orphans."
        )
        raise ProcessError(msg) from None

    return ServiceInfo(
        name=service_name,
        pid=proc.pid,
        port=port,
        log_path=log_path,
        pid_path=pid_path,
    )
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
# --------------------------------------------------------------------------- #
|
|
646
|
+
# Service shutdown
|
|
647
|
+
# --------------------------------------------------------------------------- #
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def stop_service(
    service_name: str,
    grace_period: float = SHUTDOWN_GRACE_PERIOD,
) -> ShutdownResult | None:
    """Shut down the process recorded in a service's PID file.

    SIGTERM is sent first; if the process is still alive once the grace
    period has elapsed, SIGKILL follows. The PID file is deleted only when
    the process is confirmed gone.

    Args:
        service_name: Name of the service to stop.
        grace_period: Seconds to wait after SIGTERM before SIGKILL.

    Returns:
        A ShutdownResult when a live process was signalled. This is also
        returned when termination could not be confirmed, in which case the
        PID file is left in place. None when there was nothing to stop:
        no PID file, a corrupt PID file (which is removed), or a recorded
        process that is already dead (its stale PID file is removed).
    """
    try:
        recorded_pid = read_pid_file(service_name)
    except ProcessError:
        # Corrupt PID file: discard it, there is nothing to signal.
        remove_pid_file(service_name)
        return None

    if recorded_pid is None:
        return None

    if is_process_alive(recorded_pid):
        was_graceful, is_confirmed = _terminate_process(recorded_pid, grace_period)
        # Keep the PID file if the process may still be running.
        if is_confirmed:
            remove_pid_file(service_name)
        return ShutdownResult(name=service_name, pid=recorded_pid, graceful=was_graceful)

    # Stale PID: the process is already gone, just clean up.
    remove_pid_file(service_name)
    return None
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def _terminate_process(pid: int, grace_period: float) -> tuple[bool, bool]:
    """Terminate a process with SIGTERM, escalating to SIGKILL if needed.

    After SIGKILL the process is polled for about 2.5 seconds to verify
    that it is really gone before returning.

    Args:
        pid: Process ID to terminate.
        grace_period: Seconds to wait after SIGTERM.

    Returns:
        A tuple of (graceful, confirmed):
        - graceful: True if SIGTERM was sufficient (or the process was
          already gone), False otherwise.
        - confirmed: True if the process is confirmed dead, False if it
          may still be running (SIGKILL did not take effect in time, or
          this user is not permitted to signal the process).
    """
    try:
        os.kill(pid, signal.SIGTERM)
    except PermissionError:
        # The process exists but may not be signalled by this user, so it
        # is neither stopped nor confirmed dead.
        return False, False
    except OSError:
        # Process may have already exited
        return True, True

    # Wait for process to exit
    deadline = time.monotonic() + grace_period
    while time.monotonic() < deadline:
        if not is_process_alive(pid):
            return True, True
        time.sleep(0.2)

    # Grace period expired: send SIGKILL
    try:
        os.kill(pid, signal.SIGKILL)
    except PermissionError:
        return False, False
    except OSError:
        # Process may have exited between check and kill
        return True, True

    # Wait for SIGKILL to take effect: 25 polls, 0.1s apart
    for _ in range(25):
        if not is_process_alive(pid):
            return False, True
        time.sleep(0.1)

    # Process is still alive after SIGKILL, so it is not confirmed dead
    return False, False
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
# --------------------------------------------------------------------------- #
|
|
743
|
+
# Service status
|
|
744
|
+
# --------------------------------------------------------------------------- #
|
|
745
|
+
|
|
746
|
+
|
|
747
|
+
def get_service_status(
    service_name: str,
    port: int,
    health_path: str = "/v1/models",
) -> dict[str, Any]:
    """Report the current state of a managed service.

    One of five states is reported:
    - healthy: process alive, HTTP 200 within STATUS_DEGRADED_THRESHOLD
    - degraded: process alive, HTTP 200 but slower than that threshold
    - down: process alive but the probe did not yield HTTP 200
    - crashed: PID file present but corrupt, or its process is dead
    - stopped: no PID file, or the PID file is empty

    Args:
        service_name: Name of the service.
        port: The port the service listens on.
        health_path: The HTTP path for health checks.

    Returns:
        A dict with keys: status, pid, uptime (seconds or None),
        response_time (seconds or None).
    """

    def snapshot(
        status: str,
        pid: int | None = None,
        uptime: float | None = None,
        response_time: float | None = None,
    ) -> dict[str, Any]:
        # Single place that fixes the shape and key order of the result.
        return {
            "status": status,
            "pid": pid,
            "uptime": uptime,
            "response_time": response_time,
        }

    pid_file = get_pids_dir() / f"{service_name}.pid"
    if not pid_file.exists():
        return snapshot("stopped")

    try:
        pid = read_pid_file(service_name)
    except ProcessError:
        # Corrupt PID file
        return snapshot("crashed")

    if pid is None:
        return snapshot("stopped")

    if not is_process_alive(pid):
        return snapshot("crashed", pid=pid)

    # Process is alive: classify it by how the HTTP probe goes.
    uptime = _get_uptime_from_pid_file(service_name)
    probe = http_health_check(
        port=port,
        path=health_path,
        timeout=STATUS_CHECK_TIMEOUT,
    )

    if not probe.healthy or probe.response_time is None:
        status = "down"
    elif probe.response_time > STATUS_DEGRADED_THRESHOLD:
        status = "degraded"
    else:
        status = "healthy"

    return snapshot(status, pid=pid, uptime=uptime, response_time=probe.response_time)
|
|
832
|
+
|
|
833
|
+
|
|
834
|
+
def _get_uptime_from_pid_file(service_name: str) -> float | None:
    """Estimate uptime as the age of the service's PID file.

    Args:
        service_name: Name of the service.

    Returns:
        Seconds elapsed since the PID file was last modified, or None if
        the file is missing or cannot be stat'ed.
    """
    pid_file = get_pids_dir() / f"{service_name}.pid"
    try:
        # stat() on a missing file raises FileNotFoundError, an OSError.
        written_at = pid_file.stat().st_mtime
    except OSError:
        return None
    return time.time() - written_at
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
def format_uptime(seconds: float | None) -> str:
    """Format uptime seconds into a human-readable string.

    At most the two largest units are shown (e.g. '45s', '3m 20s',
    '2h 15m', '1d 4h'); a zero-valued second unit is omitted. Fractions
    are truncated, and negative input (e.g. a timestamp in the future) is
    treated as zero.

    Args:
        seconds: Uptime in seconds, or None.

    Returns:
        Human-readable uptime string, or '-' when ``seconds`` is None.
    """
    if seconds is None:
        return "-"

    # Clamp so a negative duration never renders as e.g. "-5s".
    total = max(int(seconds), 0)

    minutes, secs = divmod(total, 60)
    if minutes == 0:
        return f"{secs}s"

    hours, mins = divmod(minutes, 60)
    if hours == 0:
        return f"{minutes}m {secs}s" if secs else f"{minutes}m"

    days, hrs = divmod(hours, 24)
    if days == 0:
        return f"{hours}h {mins}m" if mins else f"{hours}h"

    return f"{days}d {hrs}h" if hrs else f"{days}d"
|