mlx-stack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. mlx_stack/__init__.py +5 -0
  2. mlx_stack/_version.py +24 -0
  3. mlx_stack/cli/__init__.py +5 -0
  4. mlx_stack/cli/bench.py +221 -0
  5. mlx_stack/cli/config.py +166 -0
  6. mlx_stack/cli/down.py +109 -0
  7. mlx_stack/cli/init.py +180 -0
  8. mlx_stack/cli/install.py +165 -0
  9. mlx_stack/cli/logs.py +234 -0
  10. mlx_stack/cli/main.py +187 -0
  11. mlx_stack/cli/models.py +304 -0
  12. mlx_stack/cli/profile.py +65 -0
  13. mlx_stack/cli/pull.py +134 -0
  14. mlx_stack/cli/recommend.py +397 -0
  15. mlx_stack/cli/status.py +111 -0
  16. mlx_stack/cli/up.py +163 -0
  17. mlx_stack/cli/watch.py +252 -0
  18. mlx_stack/core/__init__.py +1 -0
  19. mlx_stack/core/benchmark.py +1182 -0
  20. mlx_stack/core/catalog.py +560 -0
  21. mlx_stack/core/config.py +471 -0
  22. mlx_stack/core/deps.py +323 -0
  23. mlx_stack/core/hardware.py +304 -0
  24. mlx_stack/core/launchd.py +531 -0
  25. mlx_stack/core/litellm_gen.py +188 -0
  26. mlx_stack/core/log_rotation.py +231 -0
  27. mlx_stack/core/log_viewer.py +386 -0
  28. mlx_stack/core/models.py +639 -0
  29. mlx_stack/core/paths.py +79 -0
  30. mlx_stack/core/process.py +887 -0
  31. mlx_stack/core/pull.py +815 -0
  32. mlx_stack/core/scoring.py +611 -0
  33. mlx_stack/core/stack_down.py +317 -0
  34. mlx_stack/core/stack_init.py +524 -0
  35. mlx_stack/core/stack_status.py +229 -0
  36. mlx_stack/core/stack_up.py +856 -0
  37. mlx_stack/core/watchdog.py +744 -0
  38. mlx_stack/data/__init__.py +1 -0
  39. mlx_stack/data/catalog/__init__.py +1 -0
  40. mlx_stack/data/catalog/deepseek-r1-32b.yaml +46 -0
  41. mlx_stack/data/catalog/deepseek-r1-8b.yaml +45 -0
  42. mlx_stack/data/catalog/gemma3-12b.yaml +45 -0
  43. mlx_stack/data/catalog/gemma3-27b.yaml +45 -0
  44. mlx_stack/data/catalog/gemma3-4b.yaml +45 -0
  45. mlx_stack/data/catalog/llama3.3-8b.yaml +44 -0
  46. mlx_stack/data/catalog/nemotron-49b.yaml +41 -0
  47. mlx_stack/data/catalog/nemotron-8b.yaml +44 -0
  48. mlx_stack/data/catalog/qwen3-8b.yaml +45 -0
  49. mlx_stack/data/catalog/qwen3.5-0.8b.yaml +45 -0
  50. mlx_stack/data/catalog/qwen3.5-14b.yaml +46 -0
  51. mlx_stack/data/catalog/qwen3.5-32b.yaml +45 -0
  52. mlx_stack/data/catalog/qwen3.5-3b.yaml +44 -0
  53. mlx_stack/data/catalog/qwen3.5-72b.yaml +42 -0
  54. mlx_stack/data/catalog/qwen3.5-8b.yaml +45 -0
  55. mlx_stack/py.typed +1 -0
  56. mlx_stack/utils/__init__.py +1 -0
  57. mlx_stack-0.1.0.dist-info/METADATA +397 -0
  58. mlx_stack-0.1.0.dist-info/RECORD +61 -0
  59. mlx_stack-0.1.0.dist-info/WHEEL +4 -0
  60. mlx_stack-0.1.0.dist-info/entry_points.txt +2 -0
  61. mlx_stack-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,887 @@
1
+ """Process management module for mlx-stack.
2
+
3
+ Handles starting, stopping, and health-checking of vllm-mlx and LiteLLM
4
+ subprocesses. Manages PID files in ~/.mlx-stack/pids/, log file redirection
5
+ to ~/.mlx-stack/logs/, HTTP health checks with exponential backoff,
6
+ lockfile (fcntl.flock) for concurrent invocation prevention,
7
+ SIGTERM/SIGKILL shutdown with grace period, stale PID detection and cleanup,
8
+ and port conflict detection.
9
+
10
+ This is the infrastructure module used by up, down, and status commands.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import fcntl
16
+ import os
17
+ import signal
18
+ import subprocess
19
+ import time
20
+ from contextlib import contextmanager
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+ from typing import Any, Iterator
24
+
25
+ import httpx
26
+ import psutil
27
+
28
+ from mlx_stack.core.paths import (
29
+ ensure_data_home,
30
+ get_lock_path,
31
+ get_logs_dir,
32
+ get_pids_dir,
33
+ )
34
+
35
+ # --------------------------------------------------------------------------- #
36
+ # Constants
37
+ # --------------------------------------------------------------------------- #
38
+
39
# Default grace period (seconds) granted after SIGTERM before escalating to
# SIGKILL; used as the default for stop_service().
SHUTDOWN_GRACE_PERIOD = 10

# Defaults for wait_for_healthy(), the polling loop with exponential backoff.
HEALTH_CHECK_TIMEOUT = 120  # total timeout in seconds
HEALTH_CHECK_INITIAL_DELAY = 0.5  # initial retry delay in seconds
HEALTH_CHECK_MAX_DELAY = 10.0  # maximum retry delay in seconds
HEALTH_CHECK_BACKOFF_FACTOR = 2.0  # exponential backoff multiplier

# Defaults for one-shot probes: http_health_check() and get_service_status().
STATUS_CHECK_TIMEOUT = 5.0  # per-service HTTP timeout in seconds
STATUS_DEGRADED_THRESHOLD = 2.0  # response time > 2s = degraded
51
+
52
+ # --------------------------------------------------------------------------- #
53
+ # Exceptions
54
+ # --------------------------------------------------------------------------- #
55
+
56
+
57
class ProcessError(Exception):
    """Raised when a process management operation fails.

    Base class of the more specific errors defined in this module
    (LockError, HealthCheckError, PortConflictError).
    """
59
+
60
+
61
class LockError(ProcessError):
    """Raised by acquire_lock() when the lockfile cannot be acquired."""
63
+
64
+
65
class HealthCheckError(ProcessError):
    """Raised by wait_for_healthy() when a service never reports healthy in time."""
67
+
68
+
69
class PortConflictError(ProcessError):
    """Signals that a TCP port is already occupied by another process.

    Attributes:
        port: The contested port number.
        pid: PID of the process holding the port, or None when not supplied.
        name: Name of the process holding the port, or None when not supplied.
    """

    def __init__(self, port: int, pid: int | None = None, name: str | None = None) -> None:
        self.port = port
        self.pid = pid
        self.name = name

        # Grow the message with whichever owner details were provided.
        message = f"Port {port} is already in use"
        if pid is not None:
            message += f" by PID {pid}"
        if name is not None:
            message += f" ({name})"
        super().__init__(message)
82
+
83
+
84
+ # --------------------------------------------------------------------------- #
85
+ # Data classes
86
+ # --------------------------------------------------------------------------- #
87
+
88
+
89
@dataclass(frozen=True)
class ServiceInfo:
    """Immutable description of a managed service, as returned by start_service()."""

    name: str  # service name; also the stem of its PID and log file names
    pid: int | None  # process ID of the service, if known
    port: int  # port the service is expected to listen on
    log_path: Path | None  # file receiving the service's stdout/stderr
    pid_path: Path | None  # PID file written for the service
98
+
99
+
100
class ShutdownResult:
    """Outcome of stopping one service.

    Attributes:
        name: Name of the service that was stopped.
        pid: PID that was signalled.
        graceful: True when the shutdown counted as graceful, False when
            it had to be forced.
    """

    def __init__(self, name: str, pid: int, graceful: bool) -> None:
        self.name, self.pid, self.graceful = name, pid, graceful

    def __repr__(self) -> str:
        # The repr exposes the boolean as a readable "method" label.
        how = "forced"
        if self.graceful:
            how = "graceful"
        return f"ShutdownResult(name={self.name!r}, pid={self.pid}, method={how!r})"
111
+
112
+
113
@dataclass(frozen=True)
class HealthCheckResult:
    """Immutable outcome of a single HTTP health probe against a service."""

    healthy: bool  # True only when the probe received an HTTP 200 response
    response_time: float | None  # seconds, None if no response
    status_code: int | None  # HTTP status code, None if no response
120
+
121
+
122
+ # --------------------------------------------------------------------------- #
123
+ # PID file management
124
+ # --------------------------------------------------------------------------- #
125
+
126
+
127
def _ensure_pids_dir() -> Path:
    """Create the PID directory (including parents) if needed and return it."""
    directory = get_pids_dir()
    # exist_ok makes this a no-op when the directory is already there.
    directory.mkdir(exist_ok=True, parents=True)
    return directory
132
+
133
+
134
def _ensure_logs_dir() -> Path:
    """Create the logs directory (including parents) if needed and return it."""
    directory = get_logs_dir()
    # exist_ok makes this a no-op when the directory is already there.
    directory.mkdir(exist_ok=True, parents=True)
    return directory
139
+
140
+
141
def write_pid_file(service_name: str, pid: int) -> Path:
    """Record a service's PID on disk.

    The file holds only the decimal PID, with no trailing whitespace
    or newline.

    Args:
        service_name: Name of the service (e.g. 'fast', 'litellm').
        pid: The process ID to record.

    Returns:
        Path to the PID file that was written.

    Raises:
        ProcessError: If the PID file cannot be written.
    """
    target = _ensure_pids_dir() / f"{service_name}.pid"
    try:
        target.write_text(str(pid))
    except OSError as exc:
        raise ProcessError(
            f"Could not write PID file for '{service_name}': {exc}"
        ) from None
    return target
165
+
166
+
167
def read_pid_file(service_name: str) -> int | None:
    """Read a PID from a service's PID file.

    Args:
        service_name: Name of the service.

    Returns:
        The PID as a positive integer, or None if the file doesn't exist
        or is empty.

    Raises:
        ProcessError: If the PID file exists but cannot be read, contains
            non-numeric content, or contains a non-positive number.
    """
    pid_path = get_pids_dir() / f"{service_name}.pid"

    # Read directly instead of checking exists() first: the file may vanish
    # between the check and the read.
    try:
        content = pid_path.read_text().strip()
    except FileNotFoundError:
        return None
    except (OSError, UnicodeDecodeError) as exc:
        msg = f"Could not read PID file for '{service_name}': {exc}"
        raise ProcessError(msg) from None

    if not content:
        return None

    try:
        pid = int(content)
    except ValueError:
        msg = (
            f"PID file for '{service_name}' contains non-numeric content: "
            f"{content!r}"
        )
        raise ProcessError(msg) from None

    # PIDs <= 0 are never a single real process; passed to os.kill() they
    # address process groups (0, < -1) or every signalable process (-1).
    if pid <= 0:
        msg = f"PID file for '{service_name}' contains an invalid PID: {content!r}"
        raise ProcessError(msg)

    return pid
195
+
196
+
197
def remove_pid_file(service_name: str) -> bool:
    """Remove a service's PID file.

    Removal is best-effort: failures are never raised, only reported
    through the return value.

    Args:
        service_name: Name of the service.

    Returns:
        True if the file was removed, False if it didn't exist or could
        not be removed.
    """
    pid_path = get_pids_dir() / f"{service_name}.pid"
    try:
        pid_path.unlink()
    except FileNotFoundError:
        # Nothing to remove (or someone else removed it first).
        return False
    except OSError:
        # Best-effort removal: do not raise, but do not claim success either.
        return False
    return True
214
+
215
+
216
def list_pid_files() -> dict[str, Path]:
    """Enumerate the PID files currently present in the pids directory.

    Returns:
        A dict mapping service name (the file stem) to PID file path;
        empty when the directory does not exist.
    """
    directory = get_pids_dir()
    if directory.exists():
        return {entry.stem: entry for entry in directory.glob("*.pid")}
    return {}
231
+
232
+
233
+ # --------------------------------------------------------------------------- #
234
+ # Process state checks
235
+ # --------------------------------------------------------------------------- #
236
+
237
+
238
def is_process_alive(pid: int) -> bool:
    """Report whether a PID refers to a running, non-zombie process.

    Args:
        pid: The process ID to check.

    Returns:
        True if the process exists and is not a zombie, False otherwise.
    """
    try:
        if not psutil.pid_exists(pid):
            return False
        # A zombie still has a PID table entry but is no longer running.
        return psutil.Process(pid).status() != psutil.STATUS_ZOMBIE
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # NOTE(review): AccessDenied is also reported as "not alive" here.
        return False
251
+
252
+
253
def is_stale_pid(service_name: str) -> bool:
    """Tell whether a service's PID file points at a process that is gone.

    Args:
        service_name: Name of the service.

    Returns:
        True if a PID file exists but is corrupt or names a process that
        is not running; False if there is no PID (missing or empty file)
        or the process is alive.
    """
    try:
        recorded = read_pid_file(service_name)
    except ProcessError:
        # An unreadable/corrupt PID file counts as stale.
        return True

    return recorded is not None and not is_process_alive(recorded)
272
+
273
+
274
def cleanup_stale_pid(service_name: str) -> bool:
    """Delete a service's PID file when it is stale.

    Args:
        service_name: Name of the service.

    Returns:
        True if the PID file was stale and its removal was attempted,
        False if the process is alive or no PID file exists.
    """
    if not is_stale_pid(service_name):
        return False
    remove_pid_file(service_name)
    return True
288
+
289
+
290
+ # --------------------------------------------------------------------------- #
291
+ # Lockfile management
292
+ # --------------------------------------------------------------------------- #
293
+
294
+
295
@contextmanager
def acquire_lock() -> Iterator[None]:
    """Acquire an exclusive lock to prevent concurrent operations.

    Uses a non-blocking ``fcntl.flock`` on the path returned by
    ``get_lock_path()``. The lock is released when the context manager
    exits (or on process termination via OS-level FD cleanup).

    Yields:
        None when the lock is acquired.

    Raises:
        LockError: If the lock is already held by another process, or if
            the lock file cannot be opened or locked at all.
    """
    import errno

    ensure_data_home()
    lock_path = get_lock_path()

    # Open in write mode, creating if it doesn't exist. Failures here are
    # reported as LockError so callers only have one exception type to handle.
    try:
        fd = os.open(str(lock_path), os.O_WRONLY | os.O_CREAT, 0o644)
    except OSError as exc:
        msg = f"Could not open lock file {lock_path}: {exc}"
        raise LockError(msg) from None

    try:
        fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except OSError as exc:
        os.close(fd)
        # With LOCK_NB, contention is signalled by EACCES or EAGAIN
        # (depending on the OS); anything else is a genuine locking failure.
        if exc.errno in (errno.EACCES, errno.EAGAIN, errno.EWOULDBLOCK):
            msg = (
                "Another mlx-stack operation is already running. "
                "Wait for it to finish or remove the lock file: "
                f"{lock_path}"
            )
        else:
            msg = f"Could not lock {lock_path}: {exc}"
        raise LockError(msg) from None

    try:
        yield
    finally:
        # Unlock explicitly, then close; closing the descriptor would also
        # drop the lock, so an unlock failure is safe to ignore.
        try:
            fcntl.flock(fd, fcntl.LOCK_UN)
        except OSError:
            pass
        os.close(fd)
333
+
334
+
335
+ # --------------------------------------------------------------------------- #
336
+ # Health checks
337
+ # --------------------------------------------------------------------------- #
338
+
339
+
340
def http_health_check(
    port: int,
    path: str = "/v1/models",
    timeout: float = STATUS_CHECK_TIMEOUT,
    host: str = "127.0.0.1",
) -> HealthCheckResult:
    """Probe a service once over HTTP and report the outcome.

    Args:
        port: The port to check.
        path: The HTTP path to request.
        timeout: Request timeout in seconds.
        host: The host to connect to.

    Returns:
        A HealthCheckResult. ``healthy`` is True only for an HTTP 200;
        when no response is obtained, ``response_time`` and
        ``status_code`` are both None.
    """
    url = f"http://{host}:{port}{path}"
    try:
        began = time.monotonic()
        response = httpx.get(url, timeout=timeout)
        elapsed = time.monotonic() - began
    except (httpx.ConnectError, httpx.TimeoutException, httpx.HTTPError, OSError):
        # No usable response: report an unhealthy result instead of raising.
        return HealthCheckResult(healthy=False, response_time=None, status_code=None)

    code = response.status_code
    return HealthCheckResult(healthy=code == 200, response_time=elapsed, status_code=code)
373
+
374
+
375
def wait_for_healthy(
    port: int,
    path: str = "/v1/models",
    total_timeout: float = HEALTH_CHECK_TIMEOUT,
    initial_delay: float = HEALTH_CHECK_INITIAL_DELAY,
    max_delay: float = HEALTH_CHECK_MAX_DELAY,
    backoff_factor: float = HEALTH_CHECK_BACKOFF_FACTOR,
    host: str = "127.0.0.1",
) -> HealthCheckResult:
    """Wait for a service to become healthy with exponential backoff.

    Polls the service's health endpoint repeatedly with increasing delay
    between attempts until either the service responds with HTTP 200 or
    the total timeout is exceeded.

    Args:
        port: The port to check.
        path: The HTTP path to request.
        total_timeout: Maximum total time to wait in seconds.
        initial_delay: Initial delay between retries in seconds.
        max_delay: Maximum delay between retries in seconds.
        backoff_factor: Multiplier for exponential backoff.
        host: The host to connect to.

    Returns:
        The HealthCheckResult of the first successful (healthy) check.

    Raises:
        HealthCheckError: If the service does not become healthy
            within the total timeout.
    """
    deadline = time.monotonic() + total_timeout
    delay = initial_delay

    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Cap each request at 5s, and never let one run past the deadline.
        result = http_health_check(
            port=port,
            path=path,
            timeout=min(5.0, remaining),
            host=host,
        )
        if result.healthy:
            return result

        # Wait before the next retry, respecting the deadline.
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(min(delay, remaining))
        delay = min(delay * backoff_factor, max_delay)

    # Timed out
    msg = (
        f"Health check timed out after {total_timeout}s waiting for "
        f"http://{host}:{port}{path}"
    )
    raise HealthCheckError(msg)
444
+
445
+
446
+ # --------------------------------------------------------------------------- #
447
+ # Port conflict detection
448
+ # --------------------------------------------------------------------------- #
449
+
450
+
451
+ def _socket_bind_check(port: int) -> bool:
452
+ """Check if a port is available by attempting a socket bind.
453
+
454
+ This is more reliable than psutil.net_connections on macOS where
455
+ the latter can fail with AccessDenied.
456
+
457
+ Args:
458
+ port: The TCP port to check.
459
+
460
+ Returns:
461
+ True if the port is in use (bind failed), False if available.
462
+ """
463
+ import socket
464
+
465
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
466
+ try:
467
+ sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 0)
468
+ sock.bind(("127.0.0.1", port))
469
+ return False # Port is available
470
+ except OSError:
471
+ return True # Port is in use
472
+ finally:
473
+ sock.close()
474
+
475
+
476
def _find_pid_on_port(port: int) -> tuple[int, str] | None:
    """Look up which process is listening on a port, using psutil.

    Args:
        port: The TCP port to look up.

    Returns:
        A tuple of (pid, process_name) for the first listening socket on
        the port that has a known PID; the name is "<unknown>" when the
        process cannot be inspected. None if nothing is found or the
        connection table cannot be read.
    """
    try:
        for conn in psutil.net_connections(kind="inet"):
            listening_here = (
                bool(conn.laddr) and conn.laddr.port == port and conn.status == "LISTEN"
            )
            if not listening_here or conn.pid is None:
                continue
            owner_pid = conn.pid
            try:
                return (owner_pid, psutil.Process(owner_pid).name())
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                return (owner_pid, "<unknown>")
    except (psutil.AccessDenied, OSError):
        # The connection table is not readable; fall through to None.
        pass
    return None
498
+
499
+
500
def check_port_conflict(port: int) -> tuple[int, str] | None:
    """Determine whether a port is taken and, if possible, by whom.

    Works in two phases:
    1. A socket bind attempt decides whether the port is available.
    2. Only if it is taken, psutil is asked for the owning PID/process.

    Because phase 1 does not rely on psutil, a conflict is still detected
    when psutil.net_connections is restricted by macOS permissions.

    Args:
        port: The TCP port to check.

    Returns:
        None if the port is available. Otherwise a tuple of
        (pid, process_name); when the owner cannot be identified the
        tuple is (0, "<unknown>").
    """
    if _socket_bind_check(port):
        # Port is occupied: report the owner, or the placeholder if unknown.
        return _find_pid_on_port(port) or (0, "<unknown>")
    return None
528
+
529
+
530
+
531
def detect_port_conflict(port: int) -> None:
    """Raise PortConflictError if a port is already in use.

    Args:
        port: The TCP port to check.

    Raises:
        PortConflictError: If the port is in use. The error carries the
            owning PID and process name when they could be identified;
            otherwise both are left as None.
    """
    conflict = check_port_conflict(port)
    if conflict is None:
        return

    pid, name = conflict
    if pid == 0:
        # (0, "<unknown>") is check_port_conflict's "owner not identified"
        # placeholder, not a real process; don't report "by PID 0".
        raise PortConflictError(port=port)
    raise PortConflictError(port=port, pid=pid, name=name)
544
+
545
+
546
+ # --------------------------------------------------------------------------- #
547
+ # Subprocess management
548
+ # --------------------------------------------------------------------------- #
549
+
550
+
551
def start_service(
    service_name: str,
    cmd: list[str],
    port: int,
    env: dict[str, str] | None = None,
    log_dir: Path | None = None,
) -> ServiceInfo:
    """Start a subprocess for a service with log redirection and PID tracking.

    Starts the process in a new session so it is detached from the CLI's
    lifecycle. Stdout and stderr are appended to
    ``<log_dir>/<service_name>.log``.

    This function does not check whether ``port`` is free; the value is
    only recorded in the returned ServiceInfo.

    Args:
        service_name: Name for the service (used for PID/log files).
        cmd: The command to execute as a list of strings.
        port: The port the service will listen on.
        env: Optional environment variables for the subprocess.
            Merged over the current environment.
        log_dir: Directory for log files. Defaults to the directory
            returned by ``get_logs_dir()``.

    Returns:
        A ServiceInfo with the started service's details.

    Raises:
        ProcessError: If the log file cannot be opened, the process cannot
            be spawned, or the PID file cannot be written (in which case
            the spawned process is terminated first).
    """
    # Ensure directories exist
    if log_dir is None:
        log_dir = _ensure_logs_dir()
    else:
        log_dir.mkdir(parents=True, exist_ok=True)

    log_path = log_dir / f"{service_name}.log"

    # Build environment: caller-supplied values override inherited ones.
    process_env: dict[str, str] = {**os.environ, **(env or {})}

    try:
        log_file = open(log_path, "a")  # noqa: SIM115
    except OSError as exc:
        msg = f"Could not open log file for '{service_name}': {exc}"
        raise ProcessError(msg) from None

    # The parent only needs the handle while spawning; the child gets its
    # own descriptors. The `with` closes it on every exit path, including
    # unexpected exceptions from Popen.
    with log_file:
        try:
            proc = subprocess.Popen(
                cmd,
                stdout=log_file,
                stderr=log_file,
                env=process_env,
                start_new_session=True,
            )
        except OSError as exc:
            msg = f"Could not start service '{service_name}': {exc}"
            raise ProcessError(msg) from None

        # Write PID file — if this fails, kill the spawned process to
        # prevent leaked unmanaged subprocesses.
        try:
            pid_path = write_pid_file(service_name, proc.pid)
        except ProcessError:
            try:
                proc.terminate()
                proc.wait(timeout=5)
            except (OSError, subprocess.TimeoutExpired):
                try:
                    proc.kill()
                    # Reap the child so it does not linger as a zombie.
                    proc.wait(timeout=5)
                except (OSError, subprocess.TimeoutExpired):
                    pass
            msg = (
                f"Could not write PID file for '{service_name}' after "
                f"starting process (PID {proc.pid}). "
                f"The process has been terminated to prevent orphans."
            )
            raise ProcessError(msg) from None

    return ServiceInfo(
        name=service_name,
        pid=proc.pid,
        port=port,
        log_path=log_path,
        pid_path=pid_path,
    )
643
+
644
+
645
+ # --------------------------------------------------------------------------- #
646
+ # Service shutdown
647
+ # --------------------------------------------------------------------------- #
648
+
649
+
650
def stop_service(
    service_name: str,
    grace_period: float = SHUTDOWN_GRACE_PERIOD,
) -> ShutdownResult | None:
    """Stop the service recorded in a PID file.

    The process is sent SIGTERM and given ``grace_period`` seconds to
    exit; if it is still running afterwards it is sent SIGKILL. The PID
    file is removed only when termination has been confirmed.

    A corrupt PID file is not reported as an error: it is removed and
    None is returned.

    Args:
        service_name: Name of the service to stop.
        grace_period: Seconds to wait after SIGTERM before SIGKILL.

    Returns:
        A ShutdownResult if a live process was signalled, or None if there
        was nothing to stop (PID file missing, empty, corrupt, or naming a
        process that is already dead).
    """
    try:
        recorded_pid = read_pid_file(service_name)
    except ProcessError:
        # Corrupt PID file: nothing can be stopped, just discard it.
        recorded_pid = None
        discard_file = True
    else:
        # A PID whose process is gone makes the file stale.
        discard_file = recorded_pid is not None and not is_process_alive(recorded_pid)

    if discard_file:
        remove_pid_file(service_name)
        return None
    if recorded_pid is None:
        return None

    # SIGTERM first, escalating to SIGKILL if the grace period runs out.
    graceful, confirmed = _terminate_process(recorded_pid, grace_period)

    # Keep the PID file when the process might still be running.
    if confirmed:
        remove_pid_file(service_name)

    return ShutdownResult(name=service_name, pid=recorded_pid, graceful=graceful)
694
+
695
+
696
+ def _terminate_process(pid: int, grace_period: float) -> tuple[bool, bool]:
697
+ """Terminate a process with SIGTERM, escalating to SIGKILL if needed.
698
+
699
+ Verifies process termination after SIGKILL before returning
700
+ (scrutiny fix: confirmed termination).
701
+
702
+ Args:
703
+ pid: Process ID to terminate.
704
+ grace_period: Seconds to wait after SIGTERM.
705
+
706
+ Returns:
707
+ A tuple of (graceful, confirmed):
708
+ - graceful: True if SIGTERM was sufficient, False if SIGKILL used.
709
+ - confirmed: True if the process is confirmed dead, False if it
710
+ may still be running after SIGKILL.
711
+ """
712
+ try:
713
+ os.kill(pid, signal.SIGTERM)
714
+ except OSError:
715
+ # Process may have already exited
716
+ return True, True
717
+
718
+ # Wait for process to exit
719
+ deadline = time.monotonic() + grace_period
720
+ while time.monotonic() < deadline:
721
+ if not is_process_alive(pid):
722
+ return True, True
723
+ time.sleep(0.2)
724
+
725
+ # Grace period expired — send SIGKILL
726
+ try:
727
+ os.kill(pid, signal.SIGKILL)
728
+ except OSError:
729
+ # Process may have exited between check and kill
730
+ return True, True
731
+
732
+ # Wait for SIGKILL to take effect — verify process is actually dead
733
+ for _ in range(25):
734
+ if not is_process_alive(pid):
735
+ return False, True
736
+ time.sleep(0.1)
737
+
738
+ # Process is still alive after SIGKILL — not confirmed dead
739
+ return False, False
740
+
741
+
742
+ # --------------------------------------------------------------------------- #
743
+ # Service status
744
+ # --------------------------------------------------------------------------- #
745
+
746
+
747
def get_service_status(
    service_name: str,
    port: int,
    health_path: str = "/v1/models",
) -> dict[str, Any]:
    """Report the current state of a managed service.

    One of five states is reported:
    - healthy: PID alive and HTTP 200 within the degraded threshold
    - degraded: PID alive and HTTP 200, but slower than the threshold
    - down: PID alive but no HTTP 200 obtained
    - crashed: PID file exists but is corrupt or its process is dead
    - stopped: No PID file (or an empty one)

    Args:
        service_name: Name of the service.
        port: The port the service listens on.
        health_path: The HTTP path for health checks.

    Returns:
        A dict with keys: status, pid, uptime (seconds or None),
        response_time (seconds or None).
    """

    def _report(
        status: str,
        pid: int | None = None,
        uptime: float | None = None,
        response_time: float | None = None,
    ) -> dict[str, Any]:
        """Build the status dict with its fixed set of keys."""
        return {
            "status": status,
            "pid": pid,
            "uptime": uptime,
            "response_time": response_time,
        }

    # No PID file → stopped
    if not (get_pids_dir() / f"{service_name}.pid").exists():
        return _report("stopped")

    try:
        pid = read_pid_file(service_name)
    except ProcessError:
        # Corrupt PID file
        return _report("crashed")

    if pid is None:
        return _report("stopped")

    # PID file exists but process is dead → crashed
    if not is_process_alive(pid):
        return _report("crashed", pid=pid)

    # PID alive → check HTTP health
    uptime = _get_uptime_from_pid_file(service_name)
    probe = http_health_check(
        port=port,
        path=health_path,
        timeout=STATUS_CHECK_TIMEOUT,
    )

    if not (probe.healthy and probe.response_time is not None):
        status = "down"
    elif probe.response_time <= STATUS_DEGRADED_THRESHOLD:
        status = "healthy"
    else:
        status = "degraded"

    return _report(status, pid=pid, uptime=uptime, response_time=probe.response_time)
832
+
833
+
834
def _get_uptime_from_pid_file(service_name: str) -> float | None:
    """Derive a service's uptime from the age of its PID file.

    Args:
        service_name: Name of the service.

    Returns:
        Seconds elapsed since the PID file was last modified, or None if
        the file doesn't exist or cannot be inspected.
    """
    marker = get_pids_dir() / f"{service_name}.pid"
    if marker.exists():
        try:
            modified_at = marker.stat().st_mtime
            return time.time() - modified_at
        except OSError:
            # The file vanished or became unreadable after the check.
            pass
    return None
851
+
852
+
853
def format_uptime(seconds: float | None) -> str:
    """Format uptime seconds into a human-readable string.

    Only the two most significant units are shown: seconds are dropped
    once the uptime reaches an hour, minutes once it reaches a day.
    Negative durations (e.g. after a clock adjustment) are shown as '0s'.

    Args:
        seconds: Uptime in seconds, or None.

    Returns:
        Human-readable uptime string (e.g. '2h 15m') or '-'.
    """
    if seconds is None:
        return "-"

    # Truncate to whole seconds and never display a negative uptime.
    total = max(int(seconds), 0)

    minutes, secs = divmod(total, 60)
    hours, mins = divmod(minutes, 60)
    days, hrs = divmod(hours, 24)

    if days:
        return f"{days}d {hrs}h" if hrs else f"{days}d"
    if hours:
        return f"{hours}h {mins}m" if mins else f"{hours}h"
    if minutes:
        return f"{minutes}m {secs}s" if secs else f"{minutes}m"
    return f"{total}s"