claude-mpm 4.5.6__py3-none-any.whl → 4.5.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/__init__.py +20 -5
- claude_mpm/agents/BASE_OPS.md +10 -0
- claude_mpm/agents/PM_INSTRUCTIONS.md +28 -4
- claude_mpm/agents/agent_loader.py +19 -2
- claude_mpm/agents/base_agent_loader.py +5 -5
- claude_mpm/agents/templates/agent-manager.json +3 -3
- claude_mpm/agents/templates/agentic-coder-optimizer.json +3 -3
- claude_mpm/agents/templates/api_qa.json +1 -1
- claude_mpm/agents/templates/clerk-ops.json +3 -3
- claude_mpm/agents/templates/code_analyzer.json +3 -3
- claude_mpm/agents/templates/dart_engineer.json +294 -0
- claude_mpm/agents/templates/data_engineer.json +3 -3
- claude_mpm/agents/templates/documentation.json +2 -2
- claude_mpm/agents/templates/engineer.json +2 -2
- claude_mpm/agents/templates/gcp_ops_agent.json +2 -2
- claude_mpm/agents/templates/imagemagick.json +1 -1
- claude_mpm/agents/templates/local_ops_agent.json +363 -49
- claude_mpm/agents/templates/memory_manager.json +2 -2
- claude_mpm/agents/templates/nextjs_engineer.json +2 -2
- claude_mpm/agents/templates/ops.json +2 -2
- claude_mpm/agents/templates/php-engineer.json +1 -1
- claude_mpm/agents/templates/project_organizer.json +1 -1
- claude_mpm/agents/templates/prompt-engineer.json +6 -4
- claude_mpm/agents/templates/python_engineer.json +2 -2
- claude_mpm/agents/templates/qa.json +1 -1
- claude_mpm/agents/templates/react_engineer.json +3 -3
- claude_mpm/agents/templates/refactoring_engineer.json +3 -3
- claude_mpm/agents/templates/research.json +2 -2
- claude_mpm/agents/templates/security.json +2 -2
- claude_mpm/agents/templates/ticketing.json +2 -2
- claude_mpm/agents/templates/typescript_engineer.json +2 -2
- claude_mpm/agents/templates/vercel_ops_agent.json +2 -2
- claude_mpm/agents/templates/version_control.json +2 -2
- claude_mpm/agents/templates/web_qa.json +6 -6
- claude_mpm/agents/templates/web_ui.json +3 -3
- claude_mpm/cli/__init__.py +49 -19
- claude_mpm/cli/commands/configure.py +591 -7
- claude_mpm/cli/parsers/configure_parser.py +5 -0
- claude_mpm/core/__init__.py +53 -17
- claude_mpm/core/config.py +1 -1
- claude_mpm/core/log_manager.py +7 -0
- claude_mpm/hooks/claude_hooks/response_tracking.py +16 -11
- claude_mpm/hooks/claude_hooks/services/connection_manager_http.py +9 -11
- claude_mpm/services/__init__.py +140 -156
- claude_mpm/services/agents/deployment/deployment_config_loader.py +21 -0
- claude_mpm/services/agents/loading/base_agent_manager.py +12 -2
- claude_mpm/services/async_session_logger.py +112 -96
- claude_mpm/services/claude_session_logger.py +63 -61
- claude_mpm/services/mcp_config_manager.py +328 -38
- claude_mpm/services/mcp_gateway/__init__.py +98 -94
- claude_mpm/services/monitor/event_emitter.py +1 -1
- claude_mpm/services/orphan_detection.py +791 -0
- claude_mpm/services/project_port_allocator.py +601 -0
- claude_mpm/services/response_tracker.py +17 -6
- claude_mpm/services/session_manager.py +176 -0
- {claude_mpm-4.5.6.dist-info → claude_mpm-4.5.11.dist-info}/METADATA +1 -1
- {claude_mpm-4.5.6.dist-info → claude_mpm-4.5.11.dist-info}/RECORD +62 -58
- {claude_mpm-4.5.6.dist-info → claude_mpm-4.5.11.dist-info}/WHEEL +0 -0
- {claude_mpm-4.5.6.dist-info → claude_mpm-4.5.11.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.5.6.dist-info → claude_mpm-4.5.11.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.5.6.dist-info → claude_mpm-4.5.11.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,791 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Orphan Detection Service
|
4
|
+
========================
|
5
|
+
|
6
|
+
Detects and manages orphaned deployment processes across different deployment methods.
|
7
|
+
Provides safe cleanup capabilities with multiple safety checks to prevent accidental
|
8
|
+
termination of active services.
|
9
|
+
|
10
|
+
Part of local-ops agent improvements for process lifecycle management.
|
11
|
+
|
12
|
+
WHY: Deployments can leave orphaned processes when:
|
13
|
+
- PM2 processes outlive their parent
|
14
|
+
- Docker containers keep running after deployment fails
|
15
|
+
- State files reference dead processes
|
16
|
+
- Projects are deleted but processes remain
|
17
|
+
|
18
|
+
SAFETY PHILOSOPHY:
|
19
|
+
- Never kill processes without verification
|
20
|
+
- Require manual confirmation for high-severity cases
|
21
|
+
- Preserve Claude MPM/MCP services at all costs
|
22
|
+
- Respect process ownership boundaries
|
23
|
+
- Implement multiple safety checks before any action
|
24
|
+
|
25
|
+
DESIGN DECISIONS:
|
26
|
+
- Multi-method support: PM2, Docker, native processes
|
27
|
+
- Severity levels: low, medium, high (affects confirmation requirements)
|
28
|
+
- Age-based protection: Never touch processes < 1 minute old
|
29
|
+
- Protected port ranges: Claude Code services (8765-8785)
|
30
|
+
- Ownership verification: Cross-reference with state files
|
31
|
+
"""
|
32
|
+
|
33
|
+
import json
|
34
|
+
import subprocess
|
35
|
+
import time
|
36
|
+
from datetime import datetime, timezone
|
37
|
+
from enum import Enum
|
38
|
+
from pathlib import Path
|
39
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
40
|
+
|
41
|
+
import psutil
|
42
|
+
|
43
|
+
from .core.base import SyncBaseService
|
44
|
+
from .port_manager import PortManager
|
45
|
+
|
46
|
+
|
47
|
+
class OrphanSeverity(Enum):
    """Risk level attached to an orphaned resource."""

    # Safe to auto-cleanup (e.g., old test processes)
    LOW = "low"

    # Needs user awareness (e.g., untracked deployments)
    MEDIUM = "medium"

    # Requires explicit confirmation (e.g., running production services)
    HIGH = "high"
|
53
|
+
|
54
|
+
|
55
|
+
class OrphanType(Enum):
    """Category of orphaned resource a scan can report."""

    # State file references dead process
    DEAD_PID = "dead_pid"

    # Process for non-existent project
    DELETED_PROJECT = "deleted_project"

    # Process on managed port without state
    UNTRACKED_PROCESS = "untracked_process"

    # PM2 process not in any state file
    PM2_ORPHAN = "pm2_orphan"

    # Docker container not in any state file
    DOCKER_ORPHAN = "docker_orphan"

    # Deployment hasn't been updated in days
    STALE_DEPLOYMENT = "stale_deployment"
|
64
|
+
|
65
|
+
|
66
|
+
class OrphanInfo:
    """Record describing one orphaned resource found by a scan."""

    def __init__(
        self,
        orphan_type: OrphanType,
        severity: OrphanSeverity,
        description: str,
        details: Dict[str, Any],
        cleanup_action: Optional[str] = None,
    ):
        """
        Build the record and stamp it with the current UTC time.

        Args:
            orphan_type: Type of orphan
            severity: Severity level
            description: Human-readable description
            details: Additional details (PID, port, etc.)
            cleanup_action: Suggested cleanup action
        """
        # Timezone-aware timestamp taken at construction time.
        self.detected_at = datetime.now(timezone.utc)

        self.orphan_type = orphan_type
        self.severity = severity
        self.description = description
        self.details = details
        self.cleanup_action = cleanup_action

    def to_dict(self) -> Dict[str, Any]:
        """
        Produce a plain-dict view of the record.

        Enum fields are replaced by their string values and the detection
        time by its ISO-8601 text; ``details`` is passed through as-is.

        Returns:
            Dictionary with the keys type, severity, description, details,
            cleanup_action and detected_at.
        """
        payload: Dict[str, Any] = {}
        payload["type"] = self.orphan_type.value
        payload["severity"] = self.severity.value
        payload["description"] = self.description
        payload["details"] = self.details
        payload["cleanup_action"] = self.cleanup_action
        payload["detected_at"] = self.detected_at.isoformat()
        return payload
|
104
|
+
|
105
|
+
|
106
|
+
class OrphanDetectionService(SyncBaseService):
|
107
|
+
"""
|
108
|
+
Service for detecting and managing orphaned deployment processes.
|
109
|
+
|
110
|
+
Capabilities:
|
111
|
+
- Scan for orphaned PM2 processes
|
112
|
+
- Scan for orphaned Docker containers
|
113
|
+
- Detect untracked processes on managed ports
|
114
|
+
- Verify state file integrity
|
115
|
+
- Safe cleanup with multiple safety checks
|
116
|
+
"""
|
117
|
+
|
118
|
+
# Minimum process age before considering for cleanup (safety measure)
|
119
|
+
MIN_PROCESS_AGE_SECONDS = 60 # 1 minute
|
120
|
+
|
121
|
+
# Protected port ranges (Claude Code services)
|
122
|
+
PROTECTED_PORT_RANGES = [(8765, 8785)]
|
123
|
+
|
124
|
+
# Protected process patterns
|
125
|
+
PROTECTED_PATTERNS = [
|
126
|
+
"claude-mpm",
|
127
|
+
"claude_mpm",
|
128
|
+
"socketio_daemon",
|
129
|
+
"mcp-",
|
130
|
+
"monitor",
|
131
|
+
]
|
132
|
+
|
133
|
+
# Port range for user projects
|
134
|
+
USER_PORT_RANGE_START = 3000
|
135
|
+
USER_PORT_RANGE_END = 9999
|
136
|
+
|
137
|
+
def __init__(self, project_root: Optional[Path] = None):
    """
    Set up the paths and helpers the service works with.

    Args:
        project_root: Project directory (default: current working directory)
    """
    super().__init__(service_name="OrphanDetectionService")

    # Per-project state lives under <project_root>/.claude-mpm
    base_dir = project_root or Path.cwd()
    self.project_root = base_dir.resolve()
    self.state_dir = self.project_root / ".claude-mpm"
    self.state_file = self.state_dir / "deployment-state.json"

    # The global registry lives under the user's home directory
    registry_dir = Path.home() / ".claude-mpm"
    self.global_registry_dir = registry_dir
    self.global_registry_file = registry_dir / "global-port-registry.json"

    # Port manager for process checks
    self.port_manager = PortManager(project_root=self.project_root)
|
158
|
+
|
159
|
+
def initialize(self) -> bool:
    """
    Mark the service as initialized and log the outcome.

    Returns:
        True if initialization successful, False if it raised
    """
    try:
        self._initialized = True
        self.log_info("OrphanDetectionService initialized successfully")
    except Exception as e:
        self.log_error(f"Failed to initialize: {e}")
        return False
    return True
|
173
|
+
|
174
|
+
def shutdown(self) -> None:
    """Flag the service as shut down and log that it happened."""
    self._shutdown = True
    self.log_info("OrphanDetectionService shutdown")
|
178
|
+
|
179
|
+
def _is_protected_process(self, cmdline: str) -> bool:
|
180
|
+
"""
|
181
|
+
Check if process is protected (Claude MPM services).
|
182
|
+
|
183
|
+
Args:
|
184
|
+
cmdline: Process command line
|
185
|
+
|
186
|
+
Returns:
|
187
|
+
True if process is protected
|
188
|
+
"""
|
189
|
+
cmdline_lower = cmdline.lower()
|
190
|
+
return any(pattern in cmdline_lower for pattern in self.PROTECTED_PATTERNS)
|
191
|
+
|
192
|
+
def _is_protected_port(self, port: int) -> bool:
|
193
|
+
"""
|
194
|
+
Check if port is in protected range.
|
195
|
+
|
196
|
+
Args:
|
197
|
+
port: Port number
|
198
|
+
|
199
|
+
Returns:
|
200
|
+
True if port is protected
|
201
|
+
"""
|
202
|
+
for start, end in self.PROTECTED_PORT_RANGES:
|
203
|
+
if start <= port <= end:
|
204
|
+
return True
|
205
|
+
return False
|
206
|
+
|
207
|
+
def _get_process_age(self, pid: int) -> Optional[float]:
    """
    Compute how long a process has been running.

    Args:
        pid: Process ID

    Returns:
        Age in seconds, or None if the process does not exist or its
        creation time cannot be read (access denied)
    """
    try:
        started_at = psutil.Process(pid).create_time()
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return None
    return time.time() - started_at
|
223
|
+
|
224
|
+
def _is_process_safe_to_kill(self, pid: int, cmdline: str) -> Tuple[bool, str]:
|
225
|
+
"""
|
226
|
+
Check if a process is safe to kill.
|
227
|
+
|
228
|
+
Args:
|
229
|
+
pid: Process ID
|
230
|
+
cmdline: Process command line
|
231
|
+
|
232
|
+
Returns:
|
233
|
+
Tuple of (is_safe, reason)
|
234
|
+
"""
|
235
|
+
# Check if protected
|
236
|
+
if self._is_protected_process(cmdline):
|
237
|
+
return False, "Protected Claude MPM/MCP service"
|
238
|
+
|
239
|
+
# Check process age
|
240
|
+
age = self._get_process_age(pid)
|
241
|
+
if age is None:
|
242
|
+
return False, "Cannot determine process age"
|
243
|
+
|
244
|
+
if age < self.MIN_PROCESS_AGE_SECONDS:
|
245
|
+
return (
|
246
|
+
False,
|
247
|
+
f"Process too young ({age:.1f}s < {self.MIN_PROCESS_AGE_SECONDS}s)",
|
248
|
+
)
|
249
|
+
|
250
|
+
return True, "Safe to cleanup"
|
251
|
+
|
252
|
+
def scan_dead_pids(self) -> List[OrphanInfo]:
    """
    Find deployments in the project state file whose PID no longer exists.

    Deployments without a PID are ignored. Any error while reading or
    walking the state file is logged and ends the scan early.

    Returns:
        List of orphaned state entries
    """
    found = []

    try:
        if not self.state_file.exists():
            return found

        with open(self.state_file) as handle:
            state = json.load(handle)

        for service_name, deployment in state.get("deployments", {}).items():
            pid = deployment.get("pid")

            # Nothing to report without a PID, or while the process is alive.
            if not pid or psutil.pid_exists(pid):
                continue

            entry_details = {
                "service_name": service_name,
                "pid": pid,
                "port": deployment.get("port"),
                "state_file": str(self.state_file),
            }
            found.append(
                OrphanInfo(
                    orphan_type=OrphanType.DEAD_PID,
                    severity=OrphanSeverity.LOW,
                    description=f"State file references dead process (PID: {pid})",
                    details=entry_details,
                    cleanup_action="Remove from state file",
                )
            )

    except Exception as e:
        self.log_error(f"Error scanning dead PIDs: {e}")

    return found
|
297
|
+
|
298
|
+
def scan_deleted_projects(self) -> List[OrphanInfo]:
    """
    Find global-registry port allocations whose project path is gone.

    Any error while reading or walking the registry is logged and ends
    the scan early.

    Returns:
        List of orphaned project entries
    """
    found = []

    try:
        if not self.global_registry_file.exists():
            return found

        with open(self.global_registry_file) as handle:
            registry = json.load(handle)

        for port_str, allocation in registry.get("allocations", {}).items():
            project_path = Path(allocation.get("project_path", ""))

            # The allocation is still valid while its directory exists.
            if project_path.exists():
                continue

            entry_details = {
                "port": int(port_str),
                "project_path": str(project_path),
                "service_name": allocation.get("service_name"),
            }
            found.append(
                OrphanInfo(
                    orphan_type=OrphanType.DELETED_PROJECT,
                    severity=OrphanSeverity.MEDIUM,
                    description="Port allocated to deleted project",
                    details=entry_details,
                    cleanup_action="Remove from global registry",
                )
            )

    except Exception as e:
        self.log_error(f"Error scanning deleted projects: {e}")

    return found
|
339
|
+
|
340
|
+
def scan_untracked_processes(self) -> List[OrphanInfo]:
    """
    Scan for processes on managed ports without state tracking.

    A process is reported when it listens on a port inside
    ``USER_PORT_RANGE_START``..``USER_PORT_RANGE_END``, the port is not
    protected, the port has no allocation in the global registry, and the
    process command line does not match a protected pattern.

    Listening sockets whose owning PID is not available are skipped: they
    cannot be attributed to a process, and ``psutil.Process(None)`` would
    silently resolve to the *current* process instead.

    Returns:
        List of untracked processes
    """
    orphans = []

    try:
        # Load global registry to know which ports are managed
        managed_ports = set()
        if self.global_registry_file.exists():
            with open(self.global_registry_file) as f:
                registry = json.load(f)
            managed_ports = {int(p) for p in registry.get("allocations", {})}

        # Scan all network connections
        for conn in psutil.net_connections(kind="inet"):
            if conn.status != "LISTEN":
                continue

            port = conn.laddr.port

            # Skip if not in user port range
            if not (self.USER_PORT_RANGE_START <= port <= self.USER_PORT_RANGE_END):
                continue

            # Skip protected ports
            if self._is_protected_port(port):
                continue

            # Skip ports tracked in the global registry
            if port in managed_ports:
                continue

            # The owning PID can be None (e.g. insufficient privileges).
            # Never fall through to psutil.Process(None): that is the
            # current process, not the owner of the socket.
            if conn.pid is None:
                self.log_debug(
                    f"Skipping port {port}: owning process could not be determined"
                )
                continue

            try:
                process = psutil.Process(conn.pid)
                cmdline = " ".join(process.cmdline())

                # Skip protected processes
                if self._is_protected_process(cmdline):
                    continue

                orphans.append(
                    OrphanInfo(
                        orphan_type=OrphanType.UNTRACKED_PROCESS,
                        severity=OrphanSeverity.MEDIUM,
                        description=f"Process on port {port} not tracked in state files",
                        details={
                            "pid": conn.pid,
                            "port": port,
                            "process_name": process.name(),
                            "cmdline": cmdline[:100],
                        },
                        cleanup_action="Investigate and add to state or cleanup",
                    )
                )

            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process vanished or is not inspectable; nothing to report.
                pass

    except Exception as e:
        self.log_error(f"Error scanning untracked processes: {e}")

    return orphans
|
406
|
+
|
407
|
+
def scan_pm2_orphans(self) -> List[OrphanInfo]:
    """
    Scan for orphaned PM2 processes.

    Runs ``pm2 jlist`` and reports every PM2 process whose name is not
    tracked in the project state file and whose script path does not
    match a protected pattern. A missing ``pm2`` executable or a non-zero
    exit status yields an empty result.

    Returns:
        List of orphaned PM2 processes
    """
    orphans = []

    try:
        # Get all PM2 processes
        result = subprocess.run(
            ["pm2", "jlist"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )

        if result.returncode != 0:
            self.log_debug("PM2 not available or no processes")
            return orphans

        pm2_processes = json.loads(result.stdout)

        # Load all state files to find tracked PM2 processes
        tracked_pm2_names = self._get_tracked_pm2_processes()

        for proc in pm2_processes:
            name = proc.get("name")
            pid = proc.get("pid")

            # Skip if tracked in any state file
            if name in tracked_pm2_names:
                continue

            # "pm2_env" / "pm_exec_path" may be absent or null in the JSON.
            pm2_env = proc.get("pm2_env") or {}

            # Skip protected processes
            script = pm2_env.get("pm_exec_path") or ""
            if self._is_protected_process(script):
                continue

            orphans.append(
                OrphanInfo(
                    orphan_type=OrphanType.PM2_ORPHAN,
                    severity=OrphanSeverity.HIGH,  # High severity - running service
                    description=f"PM2 process '{name}' not tracked in any state file",
                    details={
                        "pm2_name": name,
                        "pid": pid,
                        "status": pm2_env.get("status"),
                        "restart_count": pm2_env.get("restart_time", 0),
                    },
                    # Interpolate the process name so the suggested command
                    # is directly usable.
                    cleanup_action=f"pm2 delete {name}",
                )
            )

    except FileNotFoundError:
        # pm2 executable is not installed: not an error, nothing to scan.
        self.log_debug("PM2 not available or no processes")
    except subprocess.TimeoutExpired:
        self.log_warning("PM2 command timed out")
    except json.JSONDecodeError:
        self.log_warning("Failed to parse PM2 output")
    except Exception as e:
        self.log_error(f"Error scanning PM2 orphans: {e}")

    return orphans
|
473
|
+
|
474
|
+
def scan_docker_orphans(self) -> List[OrphanInfo]:
    """
    Scan for orphaned Docker containers.

    Runs ``docker ps`` with one JSON object per line and reports every
    running container whose ID and name are both untracked in the project
    state file and whose name does not match a protected pattern. A
    missing ``docker`` executable or a non-zero exit status yields an
    empty result; lines that are not valid JSON are skipped.

    Returns:
        List of orphaned Docker containers
    """
    orphans = []

    try:
        # Get all running Docker containers
        result = subprocess.run(
            ["docker", "ps", "--format", "{{json .}}"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )

        if result.returncode != 0:
            self.log_debug("Docker not available or no containers")
            return orphans

        # Load tracked Docker containers
        tracked_containers = self._get_tracked_docker_containers()

        for line in result.stdout.strip().split("\n"):
            if not line:
                continue

            try:
                container = json.loads(line)
            except json.JSONDecodeError:
                continue

            container_id = container.get("ID")
            container_name = container.get("Names")

            # Skip if tracked
            if (
                container_id in tracked_containers
                or container_name in tracked_containers
            ):
                continue

            # Skip protected containers. Same pattern check as for PM2 and
            # native processes; a missing name must not abort the scan.
            if self._is_protected_process(container_name or ""):
                continue

            orphans.append(
                OrphanInfo(
                    orphan_type=OrphanType.DOCKER_ORPHAN,
                    severity=OrphanSeverity.HIGH,
                    description=f"Docker container '{container_name}' not tracked in any state file",
                    details={
                        "container_id": container_id,
                        "container_name": container_name,
                        "image": container.get("Image"),
                        "status": container.get("Status"),
                    },
                    cleanup_action=f"docker stop {container_id}",
                )
            )

    except FileNotFoundError:
        # docker executable is not installed: not an error, nothing to scan.
        self.log_debug("Docker not available or no containers")
    except subprocess.TimeoutExpired:
        self.log_warning("Docker command timed out")
    except Exception as e:
        self.log_error(f"Error scanning Docker orphans: {e}")

    return orphans
|
547
|
+
|
548
|
+
def _get_tracked_pm2_processes(self) -> Set[str]:
|
549
|
+
"""
|
550
|
+
Get set of PM2 process names tracked in state files.
|
551
|
+
|
552
|
+
Returns:
|
553
|
+
Set of PM2 process names
|
554
|
+
"""
|
555
|
+
tracked = set()
|
556
|
+
|
557
|
+
# Check project state
|
558
|
+
if self.state_file.exists():
|
559
|
+
try:
|
560
|
+
with open(self.state_file) as f:
|
561
|
+
state = json.load(f)
|
562
|
+
|
563
|
+
for deployment in state.get("deployments", {}).values():
|
564
|
+
if deployment.get("method") == "pm2":
|
565
|
+
process_name = deployment.get("process_name")
|
566
|
+
if process_name:
|
567
|
+
tracked.add(process_name)
|
568
|
+
|
569
|
+
except Exception as e:
|
570
|
+
self.log_warning(f"Error reading state file: {e}")
|
571
|
+
|
572
|
+
# TODO: Could also scan other projects' state files for comprehensive check
|
573
|
+
|
574
|
+
return tracked
|
575
|
+
|
576
|
+
def _get_tracked_docker_containers(self) -> Set[str]:
|
577
|
+
"""
|
578
|
+
Get set of Docker containers tracked in state files.
|
579
|
+
|
580
|
+
Returns:
|
581
|
+
Set of container IDs and names
|
582
|
+
"""
|
583
|
+
tracked = set()
|
584
|
+
|
585
|
+
# Check project state
|
586
|
+
if self.state_file.exists():
|
587
|
+
try:
|
588
|
+
with open(self.state_file) as f:
|
589
|
+
state = json.load(f)
|
590
|
+
|
591
|
+
for deployment in state.get("deployments", {}).values():
|
592
|
+
if deployment.get("method") == "docker":
|
593
|
+
container_id = deployment.get("container_id")
|
594
|
+
container_name = deployment.get("container_name")
|
595
|
+
|
596
|
+
if container_id:
|
597
|
+
tracked.add(container_id)
|
598
|
+
if container_name:
|
599
|
+
tracked.add(container_name)
|
600
|
+
|
601
|
+
except Exception as e:
|
602
|
+
self.log_warning(f"Error reading state file: {e}")
|
603
|
+
|
604
|
+
return tracked
|
605
|
+
|
606
|
+
def scan_all_orphans(self) -> Dict[str, List[OrphanInfo]]:
    """
    Run every individual scan and gather the findings by category.

    The total number of findings is logged once all scans have run.

    Returns:
        Dictionary mapping orphan types to lists of orphans
    """
    # Category label paired with the scan that fills it, in run order.
    scanners = (
        ("dead_pids", self.scan_dead_pids),
        ("deleted_projects", self.scan_deleted_projects),
        ("untracked_processes", self.scan_untracked_processes),
        ("pm2_orphans", self.scan_pm2_orphans),
        ("docker_orphans", self.scan_docker_orphans),
    )
    results = {label: run_scan() for label, run_scan in scanners}

    total = sum(map(len, results.values()))
    self.log_info(f"Orphan scan complete: found {total} potential orphans")

    return results
|
625
|
+
|
626
|
+
def cleanup_orphan(
    self,
    orphan: OrphanInfo,
    force: bool = False,
) -> Tuple[bool, str]:
    """
    Route an orphan to the cleanup routine for its type.

    High-severity orphans are refused unless ``force`` is set. Orphan
    types without a cleanup routine produce an "Unknown orphan type"
    failure. Exceptions raised by a routine are logged and returned as a
    failure message.

    Args:
        orphan: Orphan info
        force: Skip safety checks (use with extreme caution)

    Returns:
        Tuple of (success, message)
    """
    # High severity orphans require explicit confirmation
    if not force and orphan.severity == OrphanSeverity.HIGH:
        return False, "High severity orphan requires explicit force=True"

    try:
        kind = orphan.orphan_type

        # State/registry edits: these routines take no force flag.
        record_handlers = (
            (OrphanType.DEAD_PID, self._cleanup_dead_pid),
            (OrphanType.DELETED_PROJECT, self._cleanup_deleted_project),
        )
        for expected, handler in record_handlers:
            if kind == expected:
                return handler(orphan)

        # Live-resource cleanups: these routines receive the force flag.
        resource_handlers = (
            (OrphanType.UNTRACKED_PROCESS, self._cleanup_untracked_process),
            (OrphanType.PM2_ORPHAN, self._cleanup_pm2_orphan),
            (OrphanType.DOCKER_ORPHAN, self._cleanup_docker_orphan),
        )
        for expected, handler in resource_handlers:
            if kind == expected:
                return handler(orphan, force)

        return False, f"Unknown orphan type: {kind}"

    except Exception as e:
        self.log_error(f"Error cleaning up orphan: {e}")
        return False, str(e)
|
666
|
+
|
667
|
+
def _cleanup_dead_pid(self, orphan: OrphanInfo) -> Tuple[bool, str]:
|
668
|
+
"""Clean up dead PID entry from state file."""
|
669
|
+
try:
|
670
|
+
with open(self.state_file) as f:
|
671
|
+
state = json.load(f)
|
672
|
+
|
673
|
+
service_name = orphan.details.get("service_name")
|
674
|
+
if service_name in state.get("deployments", {}):
|
675
|
+
del state["deployments"][service_name]
|
676
|
+
|
677
|
+
with open(self.state_file, "w") as f:
|
678
|
+
json.dump(state, f, indent=2)
|
679
|
+
|
680
|
+
return True, f"Removed dead PID entry for {service_name}"
|
681
|
+
|
682
|
+
return False, "Entry not found in state file"
|
683
|
+
|
684
|
+
except Exception as e:
|
685
|
+
return False, f"Failed to cleanup: {e}"
|
686
|
+
|
687
|
+
def _cleanup_deleted_project(self, orphan: OrphanInfo) -> Tuple[bool, str]:
|
688
|
+
"""Clean up deleted project entry from global registry."""
|
689
|
+
try:
|
690
|
+
with open(self.global_registry_file) as f:
|
691
|
+
registry = json.load(f)
|
692
|
+
|
693
|
+
port = str(orphan.details.get("port"))
|
694
|
+
if port in registry.get("allocations", {}):
|
695
|
+
del registry["allocations"][port]
|
696
|
+
|
697
|
+
with open(self.global_registry_file, "w") as f:
|
698
|
+
json.dump(registry, f, indent=2)
|
699
|
+
|
700
|
+
return True, f"Removed deleted project entry for port {port}"
|
701
|
+
|
702
|
+
return False, "Entry not found in global registry"
|
703
|
+
|
704
|
+
except Exception as e:
|
705
|
+
return False, f"Failed to cleanup: {e}"
|
706
|
+
|
707
|
+
def _cleanup_untracked_process(
|
708
|
+
self,
|
709
|
+
orphan: OrphanInfo,
|
710
|
+
force: bool,
|
711
|
+
) -> Tuple[bool, str]:
|
712
|
+
"""Clean up untracked process."""
|
713
|
+
pid = orphan.details.get("pid")
|
714
|
+
cmdline = orphan.details.get("cmdline", "")
|
715
|
+
|
716
|
+
# Safety check
|
717
|
+
is_safe, reason = self._is_process_safe_to_kill(pid, cmdline)
|
718
|
+
if not is_safe and not force:
|
719
|
+
return False, f"Safety check failed: {reason}"
|
720
|
+
|
721
|
+
try:
|
722
|
+
process = psutil.Process(pid)
|
723
|
+
process.terminate()
|
724
|
+
|
725
|
+
# Wait for graceful termination
|
726
|
+
process.wait(timeout=5)
|
727
|
+
|
728
|
+
return True, f"Terminated untracked process {pid}"
|
729
|
+
|
730
|
+
except psutil.TimeoutExpired:
|
731
|
+
if force:
|
732
|
+
process.kill()
|
733
|
+
return True, f"Force killed untracked process {pid}"
|
734
|
+
return False, "Process did not terminate gracefully"
|
735
|
+
|
736
|
+
except Exception as e:
|
737
|
+
return False, f"Failed to terminate process: {e}"
|
738
|
+
|
739
|
+
def _cleanup_pm2_orphan(
|
740
|
+
self,
|
741
|
+
orphan: OrphanInfo,
|
742
|
+
force: bool,
|
743
|
+
) -> Tuple[bool, str]:
|
744
|
+
"""Clean up orphaned PM2 process."""
|
745
|
+
if not force:
|
746
|
+
return False, "PM2 cleanup requires force=True"
|
747
|
+
|
748
|
+
pm2_name = orphan.details.get("pm2_name")
|
749
|
+
|
750
|
+
try:
|
751
|
+
result = subprocess.run(
|
752
|
+
["pm2", "delete", pm2_name],
|
753
|
+
capture_output=True,
|
754
|
+
text=True,
|
755
|
+
timeout=10,
|
756
|
+
check=False,
|
757
|
+
)
|
758
|
+
|
759
|
+
if result.returncode == 0:
|
760
|
+
return True, f"Deleted PM2 process '{pm2_name}'"
|
761
|
+
return False, f"PM2 delete failed: {result.stderr}"
|
762
|
+
|
763
|
+
except Exception as e:
|
764
|
+
return False, f"Failed to delete PM2 process: {e}"
|
765
|
+
|
766
|
+
def _cleanup_docker_orphan(
|
767
|
+
self,
|
768
|
+
orphan: OrphanInfo,
|
769
|
+
force: bool,
|
770
|
+
) -> Tuple[bool, str]:
|
771
|
+
"""Clean up orphaned Docker container."""
|
772
|
+
if not force:
|
773
|
+
return False, "Docker cleanup requires force=True"
|
774
|
+
|
775
|
+
container_id = orphan.details.get("container_id")
|
776
|
+
|
777
|
+
try:
|
778
|
+
result = subprocess.run(
|
779
|
+
["docker", "stop", container_id],
|
780
|
+
capture_output=True,
|
781
|
+
text=True,
|
782
|
+
timeout=30,
|
783
|
+
check=False,
|
784
|
+
)
|
785
|
+
|
786
|
+
if result.returncode == 0:
|
787
|
+
return True, f"Stopped Docker container {container_id}"
|
788
|
+
return False, f"Docker stop failed: {result.stderr}"
|
789
|
+
|
790
|
+
except Exception as e:
|
791
|
+
return False, f"Failed to stop Docker container: {e}"
|