mlx-stack 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. mlx_stack/__init__.py +5 -0
  2. mlx_stack/_version.py +24 -0
  3. mlx_stack/cli/__init__.py +5 -0
  4. mlx_stack/cli/bench.py +221 -0
  5. mlx_stack/cli/config.py +166 -0
  6. mlx_stack/cli/down.py +109 -0
  7. mlx_stack/cli/init.py +180 -0
  8. mlx_stack/cli/install.py +165 -0
  9. mlx_stack/cli/logs.py +234 -0
  10. mlx_stack/cli/main.py +187 -0
  11. mlx_stack/cli/models.py +304 -0
  12. mlx_stack/cli/profile.py +65 -0
  13. mlx_stack/cli/pull.py +134 -0
  14. mlx_stack/cli/recommend.py +397 -0
  15. mlx_stack/cli/status.py +111 -0
  16. mlx_stack/cli/up.py +163 -0
  17. mlx_stack/cli/watch.py +252 -0
  18. mlx_stack/core/__init__.py +1 -0
  19. mlx_stack/core/benchmark.py +1182 -0
  20. mlx_stack/core/catalog.py +560 -0
  21. mlx_stack/core/config.py +471 -0
  22. mlx_stack/core/deps.py +323 -0
  23. mlx_stack/core/hardware.py +304 -0
  24. mlx_stack/core/launchd.py +531 -0
  25. mlx_stack/core/litellm_gen.py +188 -0
  26. mlx_stack/core/log_rotation.py +231 -0
  27. mlx_stack/core/log_viewer.py +386 -0
  28. mlx_stack/core/models.py +639 -0
  29. mlx_stack/core/paths.py +79 -0
  30. mlx_stack/core/process.py +887 -0
  31. mlx_stack/core/pull.py +815 -0
  32. mlx_stack/core/scoring.py +611 -0
  33. mlx_stack/core/stack_down.py +317 -0
  34. mlx_stack/core/stack_init.py +524 -0
  35. mlx_stack/core/stack_status.py +229 -0
  36. mlx_stack/core/stack_up.py +856 -0
  37. mlx_stack/core/watchdog.py +744 -0
  38. mlx_stack/data/__init__.py +1 -0
  39. mlx_stack/data/catalog/__init__.py +1 -0
  40. mlx_stack/data/catalog/deepseek-r1-32b.yaml +46 -0
  41. mlx_stack/data/catalog/deepseek-r1-8b.yaml +45 -0
  42. mlx_stack/data/catalog/gemma3-12b.yaml +45 -0
  43. mlx_stack/data/catalog/gemma3-27b.yaml +45 -0
  44. mlx_stack/data/catalog/gemma3-4b.yaml +45 -0
  45. mlx_stack/data/catalog/llama3.3-8b.yaml +44 -0
  46. mlx_stack/data/catalog/nemotron-49b.yaml +41 -0
  47. mlx_stack/data/catalog/nemotron-8b.yaml +44 -0
  48. mlx_stack/data/catalog/qwen3-8b.yaml +45 -0
  49. mlx_stack/data/catalog/qwen3.5-0.8b.yaml +45 -0
  50. mlx_stack/data/catalog/qwen3.5-14b.yaml +46 -0
  51. mlx_stack/data/catalog/qwen3.5-32b.yaml +45 -0
  52. mlx_stack/data/catalog/qwen3.5-3b.yaml +44 -0
  53. mlx_stack/data/catalog/qwen3.5-72b.yaml +42 -0
  54. mlx_stack/data/catalog/qwen3.5-8b.yaml +45 -0
  55. mlx_stack/py.typed +1 -0
  56. mlx_stack/utils/__init__.py +1 -0
  57. mlx_stack-0.1.0.dist-info/METADATA +397 -0
  58. mlx_stack-0.1.0.dist-info/RECORD +61 -0
  59. mlx_stack-0.1.0.dist-info/WHEEL +4 -0
  60. mlx_stack-0.1.0.dist-info/entry_points.txt +2 -0
  61. mlx_stack-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,744 @@
1
+ """Watchdog health monitor for mlx-stack.
2
+
3
+ Implements a long-running health monitor that:
4
+ - Polls service health at configurable intervals (default 30s)
5
+ - Auto-restarts crashed services (PID file exists, process dead)
6
+ - Does NOT restart stopped services (no PID file)
7
+ - Tracks flap detection with configurable max-restarts threshold
8
+ - Uses exponential backoff on restart delay
9
+ - Acquires lock only during restart operations
10
+ - Supports daemon mode (os.fork + os.setsid + PID file)
11
+ - Handles SIGTERM/SIGINT for clean shutdown
12
+ - Triggers log rotation each poll cycle
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import os
19
+ import signal
20
+ import time
21
+ from dataclasses import dataclass, field
22
+ from typing import Any
23
+
24
+ from mlx_stack.core.config import ConfigCorruptError, get_value
25
+ from mlx_stack.core.log_rotation import rotate_log
26
+ from mlx_stack.core.paths import get_data_home, get_logs_dir
27
+ from mlx_stack.core.process import (
28
+ LockError,
29
+ acquire_lock,
30
+ is_process_alive,
31
+ read_pid_file,
32
+ remove_pid_file,
33
+ start_service,
34
+ write_pid_file,
35
+ )
36
+ from mlx_stack.core.stack_status import (
37
+ ServiceStatus,
38
+ run_status,
39
+ )
40
+ from mlx_stack.core.stack_up import (
41
+ LITELLM_SERVICE_NAME,
42
+ build_litellm_command,
43
+ build_vllm_command,
44
+ load_stack_definition,
45
+ )
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ # --------------------------------------------------------------------------- #
50
+ # Constants
51
+ # --------------------------------------------------------------------------- #
52
+
53
+ WATCHDOG_SERVICE_NAME = "watchdog"
54
+ WATCHDOG_PID_FILE = "watchdog.pid"
55
+
56
+ DEFAULT_INTERVAL = 30
57
+ DEFAULT_MAX_RESTARTS = 5
58
+ DEFAULT_RESTART_DELAY = 10
59
+
60
+
61
+ # --------------------------------------------------------------------------- #
62
+ # Exceptions
63
+ # --------------------------------------------------------------------------- #
64
+
65
+
66
class WatchdogError(Exception):
    """Fatal watchdog failure.

    Raised in this module when the stack definition cannot be loaded, when
    another watchdog instance is already running, or when a daemonizing
    fork fails.
    """
68
+
69
+
70
+ # --------------------------------------------------------------------------- #
71
+ # Data classes
72
+ # --------------------------------------------------------------------------- #
73
+
74
+
75
@dataclass
class RestartRecord:
    """One entry in the watchdog's restart log."""

    # Name of the service the restart was attempted for.
    service: str
    # Wall-clock time of the attempt (poll_cycle stores time.time()).
    timestamp: float
    # Human-readable explanation of why the restart was triggered.
    reason: str
    # Value of the service's restart counter at the time of the attempt.
    attempt: int
    # Whether the restart attempt reported success.
    success: bool
84
+
85
+
86
@dataclass
class ServiceTracker:
    """Per-service bookkeeping for restart history and flap detection."""

    # time.monotonic() stamps of restart attempts; check_flapping prunes
    # entries that fall outside its rolling window.
    restart_timestamps: list[float] = field(default_factory=list)
    # Number of restart attempts; cleared by reset_flap_state.
    restart_count: int = 0
    # Set by check_flapping, cleared by reset_flap_state.
    is_flapping: bool = False
    # time.monotonic() of the latest restart attempt; 0.0 means "never".
    last_restart_time: float = 0.0
    # Failed restart attempts in a row; drives the exponential backoff.
    consecutive_failures: int = 0
95
+
96
+
97
@dataclass
class WatchdogState:
    """Mutable state carried across watchdog poll cycles."""

    # Flipped to True by the signal handlers to end the main loop.
    shutdown_requested: bool = False
    # Restart/flap bookkeeping keyed by service name.
    service_trackers: dict[str, ServiceTracker] = field(default_factory=dict)
    # Chronological list of every restart attempt made by poll_cycle.
    restart_log: list[RestartRecord] = field(default_factory=list)
    # Whether the watchdog was started in daemon mode.
    is_daemon: bool = False
    # Number of poll cycles executed so far.
    cycle_count: int = 0
106
+
107
+
108
@dataclass
class PollResult:
    """Summary of what one poll cycle observed and did."""

    # Service statuses as reported by run_status() for this cycle.
    statuses: list[ServiceStatus]
    # Restart attempts made during this cycle.
    restarts_attempted: int = 0
    # Restart attempts that reported success during this cycle.
    restarts_succeeded: int = 0
    # Number of log files rotated during this cycle.
    rotations_performed: int = 0
    # Crashed services that were skipped because they are flapping.
    flapping_services: list[str] = field(default_factory=list)
117
+
118
+
119
+ # --------------------------------------------------------------------------- #
120
+ # Stack definition helpers
121
+ # --------------------------------------------------------------------------- #
122
+
123
+
124
def _load_stack_for_watchdog(stack_name: str = "default") -> dict[str, Any]:
    """Return the parsed stack definition the watchdog should monitor.

    Any failure from ``load_stack_definition`` is converted into a
    ``WatchdogError`` whose message embeds the original error text; the
    original exception is deliberately not chained (``from None``).

    Args:
        stack_name: Stack definition name.

    Returns:
        The parsed stack definition dict.

    Raises:
        WatchdogError: If the stack cannot be loaded.
    """
    try:
        definition = load_stack_definition(stack_name)
    except Exception as exc:
        raise WatchdogError(
            f"No stack configuration found. Run 'mlx-stack init' first.\n{exc}"
        ) from None
    return definition
141
+
142
+
143
+ def _get_tier_by_name(
144
+ stack: dict[str, Any],
145
+ service_name: str,
146
+ ) -> dict[str, Any] | None:
147
+ """Find a tier definition by service name.
148
+
149
+ Args:
150
+ stack: The stack definition dict.
151
+ service_name: The service/tier name to look up.
152
+
153
+ Returns:
154
+ The tier dict if found, else None.
155
+ """
156
+ for tier in stack.get("tiers", []):
157
+ if tier.get("name") == service_name:
158
+ return tier
159
+ return None
160
+
161
+
162
+ # --------------------------------------------------------------------------- #
163
+ # Flap detection
164
+ # --------------------------------------------------------------------------- #
165
+
166
+
167
def check_flapping(
    tracker: ServiceTracker,
    max_restarts: int,
    window_seconds: float = 600.0,
) -> bool:
    """Decide whether a service has used up its restart budget.

    Timestamps older than the rolling window are dropped from the tracker
    first. The service counts as flapping once the number of restarts still
    inside the window has reached ``max_restarts`` (i.e. ``>=``, so the
    threshold itself already trips the check). When that happens the
    tracker's ``is_flapping`` flag is set; it is never cleared here.

    Args:
        tracker: The service's restart tracker (mutated in place).
        max_restarts: Number of restarts within the window at which the
            service is considered flapping.
        window_seconds: The rolling window size in seconds (default 10 min).

    Returns:
        True if the service is flapping.
    """
    window_start = time.monotonic() - window_seconds

    # Keep only the restarts that are still inside the rolling window.
    recent = [stamp for stamp in tracker.restart_timestamps if stamp > window_start]
    tracker.restart_timestamps = recent

    budget_exhausted = len(recent) >= max_restarts
    if budget_exhausted:
        tracker.is_flapping = True
    return budget_exhausted
198
+
199
+
200
def reset_flap_state(
    tracker: ServiceTracker,
    stable_period: float = 300.0,
) -> bool:
    """Clear the flapping flag once no restart has happened for a while.

    Nothing is done unless the tracker is currently flapping and has a
    recorded restart time. If at least ``stable_period`` seconds have
    passed since that restart, the flapping flag, the restart timestamps,
    the restart count and the consecutive-failure count are all reset.

    Args:
        tracker: The service's restart tracker (mutated in place).
        stable_period: Seconds without a restart required for the reset.

    Returns:
        True if the flap state was reset.
    """
    # A last_restart_time of 0.0 means no restart was ever recorded.
    if not tracker.is_flapping or tracker.last_restart_time == 0.0:
        return False

    quiet_for = time.monotonic() - tracker.last_restart_time
    if quiet_for >= stable_period:
        tracker.is_flapping = False
        tracker.restart_timestamps.clear()
        tracker.restart_count = 0
        tracker.consecutive_failures = 0
        return True

    return False
231
+
232
+
233
+ # --------------------------------------------------------------------------- #
234
+ # Restart delay with exponential backoff
235
+ # --------------------------------------------------------------------------- #
236
+
237
+
238
def calculate_restart_delay(
    base_delay: float,
    consecutive_failures: int,
    max_delay: float = 300.0,
) -> float:
    """Calculate the restart delay with exponential backoff.

    The delay doubles with every consecutive failure after the first
    (``base_delay * 2 ** (consecutive_failures - 1)``) and is capped at
    ``max_delay``. With no failures the base delay is returned unchanged
    (it is not capped).

    The doubling is computed with ``math.ldexp`` so that a very long
    failure streak saturates at the cap instead of raising
    ``OverflowError`` when the factor no longer fits in a float.

    Args:
        base_delay: The base restart delay in seconds.
        consecutive_failures: Number of consecutive failures.
        max_delay: Maximum delay cap in seconds.

    Returns:
        The delay in seconds before the next restart attempt.
    """
    import math

    if consecutive_failures <= 0:
        return base_delay

    try:
        # ldexp(x, e) == x * 2**e, exact for floats.
        delay = math.ldexp(base_delay, consecutive_failures - 1)
    except OverflowError:
        # Result is beyond float range; treat it as infinite so the cap wins.
        delay = math.copysign(math.inf, base_delay)

    return min(delay, max_delay)
258
+
259
+
260
+ # --------------------------------------------------------------------------- #
261
+ # Service restart
262
+ # --------------------------------------------------------------------------- #
263
+
264
+
265
def restart_service(
    service_name: str,
    stack: dict[str, Any],
    tracker: ServiceTracker,
    vllm_binary: str | None = None,
    litellm_binary: str | None = None,
) -> bool:
    """Bring a crashed service back up from the stack definition.

    The lock is held only for the duration of the restart. The stale PID
    file is removed only after the lock has been obtained, so a service
    whose restart was skipped because of lock contention still has its PID
    file on the next poll. All failures are logged and reported as False;
    nothing is raised to the caller.

    Args:
        service_name: Name of the service to restart.
        stack: The stack definition dict.
        tracker: The service's restart tracker (not used by this function).
        vllm_binary: Path to vllm-mlx binary (resolved if None).
        litellm_binary: Path to litellm binary (resolved if None).

    Returns:
        True if the restart succeeded, False otherwise.
    """
    import shutil

    # Fall back to a PATH lookup, then to the bare command name.
    vllm_path = (
        vllm_binary
        if vllm_binary is not None
        else (shutil.which("vllm-mlx") or "vllm-mlx")
    )
    litellm_path = (
        litellm_binary
        if litellm_binary is not None
        else (shutil.which("litellm") or "litellm")
    )

    try:
        with acquire_lock():
            # Safe to discard the stale PID file now that we own the lock.
            remove_pid_file(service_name)

            if service_name != LITELLM_SERVICE_NAME:
                return _restart_tier(service_name, stack, vllm_path)
            return _restart_litellm(service_name, stack, litellm_path)
    except LockError:
        logger.warning(
            "Could not acquire lock to restart '%s' — another operation is in progress.",
            service_name,
        )
    except Exception:
        logger.exception("Failed to restart service '%s'.", service_name)

    return False
316
+
317
+
318
def _restart_tier(
    service_name: str,
    stack: dict[str, Any],
    vllm_binary: str,
) -> bool:
    """Start the vllm-mlx process for one tier of the stack.

    Args:
        service_name: The tier name.
        stack: The stack definition dict.
        vllm_binary: Path to vllm-mlx binary.

    Returns:
        True if the service was started; False if the tier is not in the
        stack definition or ``start_service`` raised.
    """
    tier_def = _get_tier_by_name(stack, service_name)
    if tier_def is None:
        logger.error("Tier '%s' not found in stack definition.", service_name)
        return False

    launch_cmd = build_vllm_command(tier_def, vllm_binary)
    tier_port = tier_def["port"]

    try:
        start_service(
            service_name=service_name,
            cmd=launch_cmd,
            port=tier_port,
        )
    except Exception:
        logger.exception("Failed to start tier '%s'.", service_name)
        return False

    return True
351
+
352
+
353
def _restart_litellm(
    service_name: str,
    stack: dict[str, Any],
    litellm_binary: str,
) -> bool:
    """Restart the LiteLLM proxy service.

    The port comes from the ``litellm-port`` config value (4000 if the
    value is missing, corrupt, or not convertible to an int) and the proxy
    config is read from ``litellm.yaml`` in the data home. If an
    ``openrouter-key`` is configured it is passed to the process as
    ``OPENROUTER_API_KEY``.

    Args:
        service_name: The service name (litellm).
        stack: The stack definition dict (not used by this function).
        litellm_binary: Path to litellm binary.

    Returns:
        True if the restart succeeded.
    """
    try:
        litellm_port = int(get_value("litellm-port"))
    except (ConfigCorruptError, TypeError, ValueError):
        # TypeError covers values int() cannot accept at all (e.g. None).
        litellm_port = 4000

    litellm_config_path = get_data_home() / "litellm.yaml"

    cmd = build_litellm_command(litellm_binary, litellm_port, litellm_config_path)

    # Build env with OpenRouter key if configured. Reading the key is
    # best-effort: a failure must not block the restart, but it is logged
    # (without the key itself) so it is not completely invisible.
    env: dict[str, str] | None = None
    try:
        openrouter_key = str(get_value("openrouter-key"))
    except Exception:
        logger.debug(
            "Could not read 'openrouter-key'; starting LiteLLM without it.",
            exc_info=True,
        )
    else:
        if openrouter_key:
            env = {"OPENROUTER_API_KEY": openrouter_key}

    try:
        start_service(
            service_name=service_name,
            cmd=cmd,
            port=litellm_port,
            env=env,
        )
        return True
    except Exception:
        logger.exception("Failed to start LiteLLM.")
        return False
397
+
398
+
399
+ # --------------------------------------------------------------------------- #
400
+ # Log rotation during poll
401
+ # --------------------------------------------------------------------------- #
402
+
403
+
404
def rotate_service_logs() -> int:
    """Rotate all service log files that exceed the configured threshold.

    Limits come from the ``log-max-size-mb`` and ``log-max-files`` config
    values, falling back to 50 MB and 5 files when a value is corrupt or
    cannot be converted to an int. Only regular ``*.log`` files directly
    inside the logs directory are considered. A failure on one file is
    logged and does not stop the remaining files from being processed.

    Returns:
        Number of files that were rotated.
    """
    try:
        max_size_mb = int(get_value("log-max-size-mb"))
    except (ConfigCorruptError, TypeError, ValueError):
        # TypeError covers values int() cannot accept at all (e.g. None);
        # this runs every poll cycle, so a bad value must not be fatal.
        max_size_mb = 50

    try:
        max_files = int(get_value("log-max-files"))
    except (ConfigCorruptError, TypeError, ValueError):
        max_files = 5

    logs_dir = get_logs_dir()
    if not logs_dir.exists():
        return 0

    rotated_count = 0
    for log_path in logs_dir.iterdir():
        if log_path.suffix != ".log" or not log_path.is_file():
            continue
        try:
            if rotate_log(log_path, max_size_mb=max_size_mb, max_files=max_files):
                rotated_count += 1
        except Exception:
            logger.exception("Error rotating log %s.", log_path)

    return rotated_count
434
+
435
+
436
+ # --------------------------------------------------------------------------- #
437
+ # Daemon mode
438
+ # --------------------------------------------------------------------------- #
439
+
440
+
441
def check_existing_watchdog() -> int | None:
    """Report the PID of a live watchdog instance, if there is one.

    A PID file that cannot be read is treated the same as no PID file. A
    PID file that points at a dead process is removed as a side effect.

    Returns:
        The PID of the running watchdog, or None.
    """
    try:
        recorded_pid = read_pid_file(WATCHDOG_SERVICE_NAME)
    except Exception:
        recorded_pid = None

    if recorded_pid is None:
        return None

    if not is_process_alive(recorded_pid):
        # Leftover from a watchdog that is no longer running.
        remove_pid_file(WATCHDOG_SERVICE_NAME)
        return None

    return recorded_pid
461
+
462
+
463
def daemonize() -> None:
    """Detach the current process and continue as a background daemon.

    Forks twice with an ``os.setsid()`` in between. The calling process and
    the intermediate child both terminate via ``os._exit(0)``, so this
    function only returns in the final (grandchild) process. There,
    stdin/stdout/stderr are pointed at the null device and the process's
    PID is written to the watchdog PID file.

    Raises:
        WatchdogError: If daemonization fails.
    """

    def _fork_and_leave_parent(label: str) -> None:
        # Fork once; the parent side exits immediately, the child returns.
        try:
            child_pid = os.fork()
        except OSError as exc:
            raise WatchdogError(f"{label} fork failed: {exc}") from None
        if child_pid > 0:
            os._exit(0)

    _fork_and_leave_parent("First")

    # Start a new session so the process has no controlling terminal.
    os.setsid()

    _fork_and_leave_parent("Second")

    # Point the three standard descriptors at the null device.
    null_fd = os.open(os.devnull, os.O_RDWR)
    for std_fd in (0, 1, 2):
        os.dup2(null_fd, std_fd)
    os.close(null_fd)

    # Record the daemon's own PID.
    write_pid_file(WATCHDOG_SERVICE_NAME, os.getpid())
502
+
503
+
504
def remove_watchdog_pid() -> None:
    """Delete the PID file registered under the watchdog service name."""
    remove_pid_file(WATCHDOG_SERVICE_NAME)
507
+
508
+
509
+ # --------------------------------------------------------------------------- #
510
+ # Signal handling
511
+ # --------------------------------------------------------------------------- #
512
+
513
+
514
def setup_signal_handlers(state: WatchdogState) -> None:
    """Install SIGTERM and SIGINT handlers that request a clean shutdown.

    The handler does nothing but set ``state.shutdown_requested``; the main
    loop is expected to notice the flag and stop on its own.

    Args:
        state: The watchdog state whose shutdown_requested flag is set.
    """

    def _request_shutdown(signum: int, frame: Any) -> None:
        state.shutdown_requested = True

    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, _request_shutdown)
525
+
526
+
527
+ # --------------------------------------------------------------------------- #
528
+ # Main poll cycle
529
+ # --------------------------------------------------------------------------- #
530
+
531
+
532
def poll_cycle(
    state: WatchdogState,
    stack: dict[str, Any],
    interval: int,
    max_restarts: int,
    restart_delay: int,
    vllm_binary: str | None = None,
    litellm_binary: str | None = None,
) -> PollResult:
    """Run one health-check pass over all services.

    Steps:
        1. Fetch the current status of every service.
        2. For each service reported as "crashed", try a restart unless it
           is flapping or its backoff delay has not yet elapsed.
        3. Rotate logs.
        4. Return what was observed and done.

    If the status check reports that there is no stack, the result is
    returned right away and neither restarts nor log rotation happen.

    Args:
        state: The watchdog state (trackers, restart log, cycle counter).
        stack: The stack definition.
        interval: Poll interval (for display; not used by this function).
        max_restarts: Maximum restarts before flap detection.
        restart_delay: Base restart delay in seconds.
        vllm_binary: Path to vllm-mlx binary.
        litellm_binary: Path to litellm binary.

    Returns:
        PollResult with status and action details.
    """
    state.cycle_count += 1

    snapshot = run_status()
    outcome = PollResult(statuses=snapshot.services)
    if snapshot.no_stack:
        return outcome

    for service in snapshot.services:
        name = service.tier
        tracker = state.service_trackers.setdefault(name, ServiceTracker())

        # Give a previously flapping service another chance once enough
        # time has passed without restarts.
        reset_flap_state(tracker)

        if service.status != "crashed":
            # A healthy service ends any running streak of failed restarts.
            if service.status == "healthy" and tracker.consecutive_failures > 0:
                tracker.consecutive_failures = 0
            continue

        # From here on the service is crashed.
        if tracker.is_flapping:
            outcome.flapping_services.append(name)
            continue

        if check_flapping(tracker, max_restarts):
            outcome.flapping_services.append(name)
            logger.warning(
                "Service '%s' marked as flapping after %d restarts. "
                "Stopping auto-restart.",
                name,
                max_restarts,
            )
            continue

        backoff = calculate_restart_delay(
            base_delay=float(restart_delay),
            consecutive_failures=tracker.consecutive_failures,
        )

        # Honour the backoff only if there was a previous restart attempt.
        now = time.monotonic()
        if tracker.last_restart_time > 0:
            since_last = now - tracker.last_restart_time
            if since_last < backoff:
                continue

        outcome.restarts_attempted += 1
        tracker.restart_count += 1

        succeeded = restart_service(
            service_name=name,
            stack=stack,
            tracker=tracker,
            vllm_binary=vllm_binary,
            litellm_binary=litellm_binary,
        )

        state.restart_log.append(
            RestartRecord(
                service=name,
                timestamp=time.time(),
                reason="crashed (PID file exists, process dead)",
                attempt=tracker.restart_count,
                success=succeeded,
            )
        )

        # Failed attempts count toward backoff and flap detection too.
        tracker.last_restart_time = time.monotonic()
        tracker.restart_timestamps.append(time.monotonic())

        if not succeeded:
            tracker.consecutive_failures += 1
            continue

        outcome.restarts_succeeded += 1
        tracker.consecutive_failures = 0

    outcome.rotations_performed = rotate_service_logs()

    return outcome
649
+
650
+
651
+ # --------------------------------------------------------------------------- #
652
+ # Main watchdog loop
653
+ # --------------------------------------------------------------------------- #
654
+
655
+
656
def run_watchdog(
    interval: int = DEFAULT_INTERVAL,
    max_restarts: int = DEFAULT_MAX_RESTARTS,
    restart_delay: int = DEFAULT_RESTART_DELAY,
    daemon: bool = False,
    stack_name: str = "default",
    status_callback: Any = None,
    restart_callback: Any = None,
) -> WatchdogState:
    """Run the watchdog health monitor main loop.

    Loads the stack, refuses to start if another watchdog is alive,
    optionally daemonizes, then polls until a shutdown is requested via
    SIGTERM/SIGINT. The watchdog PID file is removed when the loop ends,
    whether normally or through an exception.

    In daemon mode the calling process exits inside ``daemonize()``; this
    function then continues, and eventually returns, in the daemon process.

    Args:
        interval: Seconds between health polls.
        max_restarts: Max restarts before marking as flapping.
        restart_delay: Base delay in seconds before restart.
        daemon: Whether to daemonize.
        stack_name: Stack definition name.
        status_callback: Called with (PollResult, WatchdogState) each cycle.
        restart_callback: Called with (RestartRecord) for each restart event.

    Returns:
        The final WatchdogState.

    Raises:
        WatchdogError: On fatal errors (no stack, already running, etc.).
    """
    # Check stack prerequisite
    stack = _load_stack_for_watchdog(stack_name)

    # Check for existing watchdog
    existing_pid = check_existing_watchdog()
    if existing_pid is not None:
        msg = (
            f"A watchdog is already running (PID {existing_pid}). "
            "Stop it before starting a new one."
        )
        raise WatchdogError(msg)

    state = WatchdogState(is_daemon=daemon)

    # Daemon mode (daemonize() writes the PID file itself)
    if daemon:
        daemonize()

    # Set up signal handlers
    setup_signal_handlers(state)

    # Write PID for non-daemon mode too (for single-instance check)
    if not daemon:
        write_pid_file(WATCHDOG_SERVICE_NAME, os.getpid())

    # Resolve binaries once; fall back to the bare command name.
    import shutil

    vllm_binary = shutil.which("vllm-mlx") or "vllm-mlx"
    litellm_binary = shutil.which("litellm") or "litellm"

    try:
        while not state.shutdown_requested:
            result = poll_cycle(
                state=state,
                stack=stack,
                interval=interval,
                max_restarts=max_restarts,
                restart_delay=restart_delay,
                vllm_binary=vllm_binary,
                litellm_binary=litellm_binary,
            )

            # Callbacks
            if status_callback is not None:
                status_callback(result, state)

            if restart_callback is not None and result.restarts_attempted > 0:
                # The newest restart_log entries belong to this cycle.
                for record in state.restart_log[-result.restarts_attempted:]:
                    restart_callback(record)

            # Sleep in small increments so we can check the shutdown flag.
            # The remaining time is computed once per iteration and tested
            # before sleeping: reading the clock a second time could yield
            # a negative value, and time.sleep() raises ValueError on that.
            sleep_end = time.monotonic() + interval
            while not state.shutdown_requested:
                remaining = sleep_end - time.monotonic()
                if remaining <= 0:
                    break
                time.sleep(min(0.5, remaining))

    finally:
        # Clean shutdown
        remove_watchdog_pid()

    return state