claude-mpm 4.2.28__py3-none-any.whl → 4.2.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,739 @@
1
+ """
2
+ Unified Daemon Manager Service
3
+ ==============================
4
+
5
+ WHY: This service consolidates ALL daemon lifecycle operations into a single place,
6
+ eliminating duplicate code and race conditions from having daemon management logic
7
+ scattered across multiple files.
8
+
9
+ DESIGN DECISIONS:
10
+ - Single source of truth for all daemon operations
11
+ - Robust port cleanup with retry logic
12
+ - Thread-safe operations with proper locking
13
+ - Comprehensive error handling and recovery
14
+ - Supports both foreground and background/daemon modes
15
+ - Manages PID files, port conflicts, and process lifecycle
16
+
17
+ This replaces duplicate logic that was in:
18
+ - UnifiedMonitorDaemon._cleanup_port_conflicts()
19
+ - UnifiedDashboardManager._cleanup_port_conflicts()
20
+ - Various daemon startup/stop logic spread across files
21
+ """
22
+
23
+ import os
24
+ import signal
25
+ import socket
26
+ import subprocess
27
+ import sys
28
+ import tempfile
29
+ import threading
30
+ import time
31
+ from pathlib import Path
32
+ from typing import Optional, Tuple
33
+
34
+ from ...core.logging_config import get_logger
35
+
36
+
37
class DaemonManager:
    """Centralized manager for all daemon lifecycle operations.

    This is the SINGLE source of truth for:
    - Port conflict resolution
    - Process cleanup
    - Daemon startup/stop
    - PID file management
    - Service detection

    Process control relies on POSIX facilities (``os.fork``, ``os.setsid``,
    signals, and the ``lsof`` / ``ps`` command-line tools).
    """

    # Class-level lock shared by every instance. It has to be re-entrant:
    # start_daemon() calls stop_daemon() and cleanup_port_conflicts() while
    # holding it, and stop_daemon() calls cleanup_port_conflicts() as well.
    # With a plain (non-reentrant) Lock those nested acquisitions block
    # forever.
    _lock = threading.RLock()

    def __init__(
        self,
        port: int = 8765,
        host: str = "localhost",
        pid_file: Optional[str] = None,
        log_file: Optional[str] = None,
    ):
        """Initialize the daemon manager.

        Args:
            port: Port number for the daemon
            host: Host to bind to
            pid_file: Path to PID file (uses default if None)
            log_file: Path to log file for daemon mode (uses default if None)
        """
        self.port = port
        self.host = host
        self.logger = get_logger(__name__)

        # Set up paths
        self.pid_file = Path(pid_file) if pid_file else self._get_default_pid_file()
        self.log_file = Path(log_file) if log_file else self._get_default_log_file()

        # Path of the temp file the forked daemon uses to report its startup
        # status back to the parent; created in daemonize().
        self.startup_status_file: Optional[str] = None

    def _get_default_pid_file(self) -> Path:
        """Get default PID file path (``<cwd>/.claude-mpm/monitor-daemon.pid``).

        Creates the ``.claude-mpm`` directory if it does not exist yet.
        """
        claude_mpm_dir = Path.cwd() / ".claude-mpm"
        claude_mpm_dir.mkdir(exist_ok=True)
        return claude_mpm_dir / "monitor-daemon.pid"

    def _get_default_log_file(self) -> Path:
        """Get default log file path (``<cwd>/.claude-mpm/monitor-daemon.log``).

        Creates the ``.claude-mpm`` directory if it does not exist yet.
        """
        claude_mpm_dir = Path.cwd() / ".claude-mpm"
        claude_mpm_dir.mkdir(exist_ok=True)
        return claude_mpm_dir / "monitor-daemon.log"

    # ------------------------------------------------------------------
    # Port conflict resolution
    # ------------------------------------------------------------------

    def cleanup_port_conflicts(self, max_retries: int = 3) -> bool:
        """Clean up any processes using the daemon port.

        This is the SINGLE implementation for port cleanup, replacing
        duplicate logic in multiple files.

        Args:
            max_retries: Maximum number of cleanup attempts

        Returns:
            True if port is available after cleanup, False otherwise
        """
        with self._lock:
            for attempt in range(max_retries):
                if attempt > 0:
                    self.logger.info(
                        f"Port cleanup attempt {attempt + 1}/{max_retries}"
                    )

                # First check if port is actually in use
                if self._is_port_available():
                    self.logger.debug(f"Port {self.port} is available")
                    return True

                self.logger.info(f"Port {self.port} is in use, attempting cleanup")

                # Try to find and kill processes using the port
                if self._kill_processes_on_port():
                    # Wait for port to be released
                    time.sleep(2 if attempt == 0 else 3)

                    # Verify port is now free
                    if self._is_port_available():
                        self.logger.info(f"Port {self.port} successfully cleaned up")
                        return True

                if attempt < max_retries - 1:
                    # Wait longer between attempts
                    time.sleep(3)

            self.logger.error(
                f"Failed to clean up port {self.port} after {max_retries} attempts"
            )
            return False

    def _is_port_available(self) -> bool:
        """Check if the port is available for binding.

        Returns:
            True if a TCP/IPv4 socket could be bound to (host, port),
            False otherwise
        """
        try:
            # The context manager closes the probe socket even when bind()
            # fails, so a busy port does not leak a file descriptor per probe.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock:
                test_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                test_sock.bind((self.host, self.port))
            return True
        except OSError:
            return False

    def _kill_processes_on_port(self) -> bool:
        """Kill processes using the daemon port.

        Strategies are tried in order (lsof, PID file, ``ps`` scan); the
        first one that reports success ends the chain.

        Returns:
            True if a strategy succeeded, False if all failed or on error
        """
        try:
            # Try using lsof first (most reliable)
            if self._kill_using_lsof():
                return True

            # Fallback to checking our known PID file
            if self._kill_using_pid_file():
                return True

            # Try to identify claude-mpm processes
            return self._kill_claude_mpm_processes()

        except Exception as e:
            self.logger.error(f"Error killing processes on port: {e}")
            return False

    def _get_process_name(self, pid: int) -> str:
        """Return the lower-cased command name ``ps`` reports for *pid*.

        Returns an empty string when ``ps`` prints nothing for that PID.
        """
        process_info = subprocess.run(
            ["ps", "-p", str(pid), "-o", "comm="],
            capture_output=True,
            text=True,
            check=False,
        )
        return process_info.stdout.strip().lower()

    def _terminate_pid(
        self, pid: int, grace_period: float = 1.0, kill_wait: float = 0.5
    ) -> None:
        """Send SIGTERM to *pid* and escalate to SIGKILL if it survives.

        Args:
            pid: Process to terminate (must be a positive PID)
            grace_period: Seconds to wait after SIGTERM before checking
            kill_wait: Seconds to wait after SIGKILL

        Raises:
            ProcessLookupError: if the process does not exist when SIGTERM
                is sent
        """
        os.kill(pid, signal.SIGTERM)

        # Wait briefly for graceful shutdown
        time.sleep(grace_period)

        try:
            os.kill(pid, 0)  # Signal 0 only checks that the process exists
            self.logger.warning(f"Process {pid} didn't terminate, force killing")
            os.kill(pid, signal.SIGKILL)
            time.sleep(kill_wait)
        except ProcessLookupError:
            pass  # Process already dead

    def _kill_using_lsof(self) -> bool:
        """Kill processes using lsof to find them.

        Only processes whose command name contains "python" or "claude" are
        killed; the calling process itself is never signalled.

        Returns:
            True if the port users were handled or none were found.
            False if lsof is unavailable (so the caller falls back to the
            other strategies), if a foreign process owns the port, or on
            error.
        """
        try:
            # Find processes using the port
            result = subprocess.run(
                ["lsof", "-ti", f":{self.port}"],
                capture_output=True,
                text=True,
                check=False,
            )

            if result.returncode != 0 or not result.stdout.strip():
                self.logger.debug(f"No processes found using port {self.port}")
                return True

            pids = result.stdout.strip().split("\n")
            self.logger.info(f"Found processes using port {self.port}: {pids}")

            own_pid = os.getpid()

            # Kill each process
            for pid_str in pids:
                try:
                    pid = int(pid_str.strip())
                    if pid <= 0 or pid == own_pid:
                        # Never signal ourselves, and never pass a
                        # non-positive PID to os.kill (it would address a
                        # whole process group / every process).
                        continue

                    # Check if it's a Python/Claude process
                    process_name = self._get_process_name(pid)
                    if "python" in process_name or "claude" in process_name:
                        self.logger.info(f"Killing Python/Claude process {pid}")
                        self._terminate_pid(pid)
                    else:
                        self.logger.warning(
                            f"Process {pid} ({process_name}) is not a Claude MPM process"
                        )
                        return False

                except (ValueError, ProcessLookupError) as e:
                    self.logger.debug(f"Error handling PID {pid_str}: {e}")
                    continue

            return True

        except FileNotFoundError:
            # lsof not available: report failure so that
            # _kill_processes_on_port() really does try the alternatives.
            self.logger.debug("lsof not available, using alternative methods")
            return False
        except Exception as e:
            self.logger.error(f"Error using lsof: {e}")
            return False

    def _kill_using_pid_file(self) -> bool:
        """Kill process using PID file.

        Returns:
            True if successful or no PID file, False on error (including a
            PID file whose content is not a positive integer)
        """
        try:
            if not self.pid_file.exists():
                return True

            with open(self.pid_file) as f:
                pid = int(f.read().strip())

            if pid <= 0:
                # os.kill() with 0 or a negative PID targets process groups
                # or every process; a corrupted PID file must not do that.
                raise ValueError(f"invalid PID {pid} in PID file")

            self.logger.info(f"Found PID {pid} in PID file")

            # Kill the process
            try:
                self._terminate_pid(pid)
            except ProcessLookupError:
                # Process doesn't exist, just remove PID file
                pass

            # Remove PID file
            self.pid_file.unlink(missing_ok=True)
            return True

        except Exception as e:
            self.logger.error(f"Error killing process from PID file: {e}")
            return False

    def _kill_claude_mpm_processes(self) -> bool:
        """Kill any claude-mpm monitor processes.

        Scans ``ps aux`` for lines mentioning both "claude" and "monitor"
        and sends SIGTERM to each, skipping the calling process and its
        parent (whose command lines typically match the same filter).

        Returns:
            True if successful, False on error
        """
        try:
            # Look for claude-mpm monitor processes
            result = subprocess.run(
                ["ps", "aux"], capture_output=True, text=True, check=False
            )

            if result.returncode != 0:
                return False

            protected_pids = {os.getpid(), os.getppid()}
            killed_any = False

            for line in result.stdout.strip().split("\n"):
                lowered = line.lower()
                if "claude" not in lowered or "monitor" not in lowered:
                    continue

                parts = line.split()
                if len(parts) <= 1:
                    continue

                try:
                    pid = int(parts[1])  # second column of `ps aux` is the PID
                    if pid <= 0 or pid in protected_pids:
                        continue
                    self.logger.info(f"Killing claude-mpm monitor process {pid}")
                    os.kill(pid, signal.SIGTERM)
                    killed_any = True
                    time.sleep(0.5)
                except (ValueError, ProcessLookupError):
                    continue

            if killed_any:
                time.sleep(1)  # Give processes time to exit

            return True

        except Exception as e:
            self.logger.error(f"Error killing claude-mpm processes: {e}")
            return False

    # ------------------------------------------------------------------
    # Service detection
    # ------------------------------------------------------------------

    def is_our_service(self) -> Tuple[bool, Optional[int]]:
        """Check if the service on the port is our claude-mpm monitor.

        Returns:
            Tuple of (is_ours, pid) where is_ours is True if it's our service.
            pid may be None even when is_ours is True (PID lookup failed).
        """
        try:
            # First check PID file
            if self.pid_file.exists():
                try:
                    with open(self.pid_file) as f:
                        pid = int(f.read().strip())

                    if pid <= 0:
                        raise ValueError(f"invalid PID {pid} in PID file")

                    # Verify process exists
                    os.kill(pid, 0)

                    # Check if it's a Python process
                    if "python" in self._get_process_name(pid):
                        return True, pid

                except PermissionError:
                    # The process exists but belongs to another user, so it
                    # is not identified via the PID file; fall through to
                    # the health check below.
                    pass
                except (ValueError, ProcessLookupError, subprocess.CalledProcessError):
                    # PID file is unreadable/invalid or the process is gone
                    self.pid_file.unlink(missing_ok=True)

            # Check if service responds to our health endpoint
            try:
                import requests

                response = requests.get(
                    f"http://{self.host}:{self.port}/health", timeout=2
                )

                if response.status_code == 200:
                    # Try to get service info
                    payload_text = str(response.json()).lower()
                    if "claude" in payload_text or "mpm" in payload_text:
                        # It's likely our service, try to find PID
                        return True, self._find_service_pid()

            except Exception as e:
                # Best-effort probe: missing `requests`, connection errors
                # and non-JSON bodies all just mean "not identified".
                self.logger.debug(f"Health check failed: {e}")

            return False, None

        except Exception as e:
            self.logger.error(f"Error checking service ownership: {e}")
            return False, None

    def _find_service_pid(self) -> Optional[int]:
        """Find PID of service on our port using lsof.

        Returns:
            First PID reported by lsof, or None if lsof is unavailable,
            fails, or prints nothing usable
        """
        try:
            result = subprocess.run(
                ["lsof", "-ti", f":{self.port}"],
                capture_output=True,
                text=True,
                check=False,
            )

            output = result.stdout.strip()
            if result.returncode == 0 and output:
                return int(output.split("\n")[0].strip())

        except (OSError, ValueError, subprocess.SubprocessError) as e:
            self.logger.debug(f"Could not determine service PID: {e}")

        return None

    # ------------------------------------------------------------------
    # Daemon start / stop
    # ------------------------------------------------------------------

    def start_daemon(self, force_restart: bool = False) -> bool:
        """Start the daemon with automatic cleanup and retry.

        Args:
            force_restart: Force restart even if already running

        Returns:
            True if daemon started successfully (or was already running and
            force_restart is False)
        """
        with self._lock:
            # Check if already running
            if self.is_running():
                if not force_restart:
                    pid = self.get_pid()
                    self.logger.info(f"Daemon already running with PID {pid}")
                    return True

                # Stop existing daemon
                self.logger.info("Force restarting daemon")
                if not self.stop_daemon():
                    self.logger.error("Failed to stop existing daemon")
                    return False

                # Wait for cleanup
                time.sleep(2)

            # Clean up port conflicts
            if not self.cleanup_port_conflicts():
                self.logger.error(f"Cannot start daemon - port {self.port} is in use")
                return False

            # Daemonize the process
            return self.daemonize()

    def daemonize(self) -> bool:
        """Daemonize the current process using the double-fork technique.

        Returns:
            In the original (parent) process: True if the daemon confirmed
            startup, False otherwise.
            In the daemon (grandchild) process: True; the caller is then
            responsible for running the actual service.
            The intermediate child never returns.

        Raises:
            Exception: in the daemon process, whatever write_pid_file()
                raised (after the failure has been reported to the parent).
        """
        # The child changes directory to "/", so relative paths must be
        # pinned to the current directory before forking.
        self.pid_file = self.pid_file.absolute()
        self.log_file = self.log_file.absolute()

        try:
            # Clean up asyncio event loops before forking
            self._cleanup_event_loops()

            # Create status file for communication
            with tempfile.NamedTemporaryFile(
                mode="w", delete=False, suffix=".status"
            ) as f:
                self.startup_status_file = f.name
                f.write("starting")

            # First fork
            pid = os.fork()
            if pid > 0:
                # Parent process - wait for child to confirm startup
                return self._parent_wait_for_startup(pid)

        except OSError as e:
            self.logger.error(f"First fork failed: {e}")
            self._remove_status_file()
            return False

        # Child process continues...

        # Decouple from parent
        os.chdir("/")
        os.setsid()
        # NOTE(review): umask 0 makes the PID/log files created below
        # group/world-writable; consider 0o022 if that is not intended.
        os.umask(0)

        try:
            # Second fork
            pid = os.fork()
            if pid > 0:
                # First child exits. os._exit() skips atexit handlers and
                # stack unwinding, which belong to the original process and
                # must not run a second time in this forked copy.
                os._exit(0)
        except OSError as e:
            self.logger.error(f"Second fork failed: {e}")
            self._report_startup_error(f"Second fork failed: {e}")
            os._exit(1)

        # Grandchild process - the actual daemon

        # Write PID file
        try:
            self.write_pid_file()
        except Exception as e:
            # Tell the waiting parent right away instead of letting it run
            # into its timeout, then surface the failure as before.
            self._report_startup_error(f"Failed to write PID file: {e}")
            raise

        # Redirect streams
        self._redirect_streams()

        # Setup signal handlers
        self._setup_signal_handlers()

        self.logger.info(f"Daemon process started with PID {os.getpid()}")

        # Report successful startup
        self._report_startup_success()

        # Note: Daemon process continues running
        # Caller is responsible for running the actual service
        return True

    def stop_daemon(self, timeout: int = 10) -> bool:
        """Stop the daemon process.

        Sends SIGTERM, waits up to *timeout* seconds, then sends SIGKILL.

        Args:
            timeout: Maximum time to wait for daemon to stop

        Returns:
            True if stopped (or no daemon PID was recorded), False on error
        """
        with self._lock:
            try:
                pid = self.get_pid()
                if not pid:
                    self.logger.info("No daemon PID found")
                    # Still try to clean up port
                    self.cleanup_port_conflicts()
                    return True

                self.logger.info(f"Stopping daemon with PID {pid}")

                # Send SIGTERM for graceful shutdown
                try:
                    os.kill(pid, signal.SIGTERM)
                except ProcessLookupError:
                    # Process already dead
                    self.cleanup_pid_file()
                    return True

                # Wait for process to exit
                deadline = time.monotonic() + timeout
                while time.monotonic() < deadline:
                    try:
                        os.kill(pid, 0)  # Check if still alive
                        time.sleep(0.5)
                    except ProcessLookupError:
                        # Process exited
                        self.cleanup_pid_file()
                        return True

                # Force kill if still running
                self.logger.warning("Daemon didn't stop gracefully, force killing")
                try:
                    os.kill(pid, signal.SIGKILL)
                    time.sleep(1)
                except ProcessLookupError:
                    pass  # Exited between the last check and the SIGKILL

                self.cleanup_pid_file()
                return True

            except Exception as e:
                self.logger.error(f"Error stopping daemon: {e}")
                return False

    def is_running(self) -> bool:
        """Check if daemon is running.

        Returns:
            True if the PID recorded in the PID file refers to a live
            process. A stale PID file is removed as a side effect.
        """
        pid = self.get_pid()
        if not pid:
            return False

        try:
            # Signal 0 performs the existence/permission check only
            os.kill(pid, 0)
        except ProcessLookupError:
            # Process doesn't exist
            self.cleanup_pid_file()
            return False
        except PermissionError:
            # The process exists but is owned by another user
            return True

        return True

    # ------------------------------------------------------------------
    # PID file management
    # ------------------------------------------------------------------

    def get_pid(self) -> Optional[int]:
        """Get daemon PID from PID file.

        Returns:
            PID if the file exists and holds a positive integer,
            None otherwise
        """
        try:
            if not self.pid_file.exists():
                return None

            with open(self.pid_file) as f:
                pid = int(f.read().strip())

        except Exception as e:
            self.logger.error(f"Error reading PID file: {e}")
            return None

        if pid <= 0:
            # Callers pass this value to os.kill(); 0 and negative numbers
            # address process groups / all processes there.
            self.logger.error(f"Error reading PID file: invalid PID {pid}")
            return None

        return pid

    def write_pid_file(self):
        """Write current PID to PID file, creating parent directories.

        Raises:
            Exception: re-raises whatever the filesystem operation raised
                (after logging it).
        """
        try:
            self.pid_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.pid_file, "w") as f:
                f.write(str(os.getpid()))
            self.logger.debug(f"PID file written: {self.pid_file}")
        except Exception as e:
            self.logger.error(f"Error writing PID file: {e}")
            raise

    def cleanup_pid_file(self):
        """Remove PID file; errors are logged, never raised."""
        try:
            self.pid_file.unlink(missing_ok=True)
            self.logger.debug("PID file removed")
        except Exception as e:
            self.logger.error(f"Error removing PID file: {e}")

    # ------------------------------------------------------------------
    # Daemonization helpers
    # ------------------------------------------------------------------

    def _cleanup_event_loops(self):
        """Clean up asyncio event loops before forking.

        Cancels the current loop's tasks, detaches it and closes it.
        Best-effort: every failure is swallowed (and logged at debug level).
        """
        try:
            import asyncio

            try:
                loop = asyncio.get_event_loop()
                if loop and not loop.is_closed():
                    # Cancel pending tasks
                    for task in asyncio.all_tasks(loop):
                        task.cancel()

                    # Stop and close loop
                    if loop.is_running():
                        loop.stop()

                    asyncio.set_event_loop(None)
                    loop.close()

            except RuntimeError:
                # No event loop (or the loop could not be closed)
                pass

        except Exception as e:
            self.logger.debug(f"Error cleaning up event loops: {e}")

    def _redirect_streams(self):
        """Redirect standard streams for daemon mode.

        stdin is pointed at /dev/null; stdout and stderr are appended to
        ``self.log_file``. Errors are logged, never raised.
        """
        try:
            sys.stdout.flush()
            sys.stderr.flush()

            # Redirect stdin to /dev/null
            with open("/dev/null") as null_in:
                os.dup2(null_in.fileno(), sys.stdin.fileno())

            # Redirect stdout and stderr to log file. The duplicated
            # descriptors stay open after the `with` closes the original.
            self.log_file.parent.mkdir(parents=True, exist_ok=True)
            with open(self.log_file, "a") as log_out:
                os.dup2(log_out.fileno(), sys.stdout.fileno())
                os.dup2(log_out.fileno(), sys.stderr.fileno())

        except Exception as e:
            self.logger.error(f"Error redirecting streams: {e}")

    def _setup_signal_handlers(self):
        """Setup signal handlers for graceful shutdown.

        On SIGTERM or SIGINT the PID file is removed and the process exits
        with status 0.
        """

        def signal_handler(signum, frame):
            self.logger.info(f"Received signal {signum}, shutting down")
            self.cleanup_pid_file()
            sys.exit(0)

        signal.signal(signal.SIGTERM, signal_handler)
        signal.signal(signal.SIGINT, signal_handler)

    def _remove_status_file(self):
        """Delete the startup status file, if one was created (best-effort)."""
        if not self.startup_status_file:
            return
        try:
            Path(self.startup_status_file).unlink(missing_ok=True)
        except OSError as e:
            self.logger.debug(f"Error removing startup status file: {e}")

    def _parent_wait_for_startup(self, child_pid: int, timeout: float = 10.0) -> bool:
        """Parent process waits for child to confirm startup.

        Polls the startup status file until it contains ``success`` or
        ``error:<message>``. The status file is always removed before
        returning, and the already-exited first child is reaped if possible.

        Args:
            child_pid: PID of child process
            timeout: Maximum time to wait

        Returns:
            True if child started successfully
        """
        try:
            deadline = time.monotonic() + timeout

            while time.monotonic() < deadline:
                status_path = self.startup_status_file
                if not status_path or not Path(status_path).exists():
                    time.sleep(0.1)
                    continue

                try:
                    with open(status_path) as f:
                        status = f.read().strip()
                except (OSError, ValueError):
                    # File vanished or is mid-write; try again
                    status = ""

                if status == "success":
                    return True

                if status.startswith("error:"):
                    error_msg = status[6:]
                    self.logger.error(f"Daemon startup failed: {error_msg}")
                    return False

                time.sleep(0.1)

            self.logger.error("Daemon startup timed out")
            return False

        except Exception as e:
            self.logger.error(f"Error waiting for daemon startup: {e}")
            return False

        finally:
            # Cleanup status file on every path, including timeout
            self._remove_status_file()
            try:
                # Non-blocking reap of the intermediate child so it does
                # not linger as a zombie.
                os.waitpid(child_pid, os.WNOHANG)
            except OSError:
                pass  # Not our child, or already reaped

    def _write_startup_status(self, status: str, description: str):
        """Overwrite the startup status file with *status*.

        Does nothing when the status file is unset or no longer exists
        (e.g. the parent already gave up and removed it).
        """
        if self.startup_status_file and Path(self.startup_status_file).exists():
            try:
                with open(self.startup_status_file, "w") as f:
                    f.write(status)
            except Exception as e:
                self.logger.error(f"Error reporting startup {description}: {e}")

    def _report_startup_success(self):
        """Report successful startup to parent process."""
        self._write_startup_status("success", "success")

    def _report_startup_error(self, error: str):
        """Report startup error to parent process."""
        self._write_startup_status(f"error:{error}", "error")