claude-mpm 4.13.2__py3-none-any.whl → 4.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of claude-mpm might be problematic. Click here for more details.
- claude_mpm/VERSION +1 -1
- claude_mpm/cli/__init__.py +10 -0
- claude_mpm/cli/commands/local_deploy.py +536 -0
- claude_mpm/cli/parsers/base_parser.py +7 -0
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/interactive_session.py +3 -0
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +35 -0
- claude_mpm/services/core/models/health.py +189 -0
- claude_mpm/services/core/models/process.py +258 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +223 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +235 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +254 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +371 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/METADATA +1 -1
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/RECORD +44 -12
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.14.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Process Management Interfaces for Claude MPM Framework
|
|
3
|
+
=======================================================
|
|
4
|
+
|
|
5
|
+
WHY: This module defines interfaces for local process management operations,
|
|
6
|
+
enabling the local-ops-agent to spawn, track, and manage background processes
|
|
7
|
+
with proper isolation, state persistence, and port conflict prevention.
|
|
8
|
+
|
|
9
|
+
DESIGN DECISION: Process management interfaces are separated from other service
|
|
10
|
+
interfaces to maintain clear boundaries between deployment operations and other
|
|
11
|
+
system services.
|
|
12
|
+
|
|
13
|
+
ARCHITECTURE:
|
|
14
|
+
- ILocalProcessManager: Core interface for process lifecycle management
|
|
15
|
+
- IDeploymentStateManager: Interface for persistent state tracking
|
|
16
|
+
- Process data models defined in models/process.py
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
state_manager = DeploymentStateManager(state_file_path)
|
|
20
|
+
process_manager = LocalProcessManager(state_manager)
|
|
21
|
+
|
|
22
|
+
config = StartConfig(
|
|
23
|
+
command=["npm", "run", "dev"],
|
|
24
|
+
working_directory="/path/to/project",
|
|
25
|
+
port=3000
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
deployment = process_manager.start(config)
|
|
29
|
+
print(f"Started process {deployment.process_id} on port {deployment.port}")
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from abc import ABC, abstractmethod
|
|
33
|
+
from typing import Dict, List, Optional
|
|
34
|
+
|
|
35
|
+
from claude_mpm.services.core.models.process import (
|
|
36
|
+
DeploymentState,
|
|
37
|
+
ProcessInfo,
|
|
38
|
+
ProcessStatus,
|
|
39
|
+
StartConfig,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class IDeploymentStateManager(ABC):
    """
    Contract for persisting and querying deployment state.

    WHY: Processes have to remain trackable across restarts, and orphaned
    processes must be avoided, so deployment state is persisted. Hiding the
    storage mechanism behind this interface allows different backends
    (JSON file, database, etc.).

    DESIGN DECISION: Offers low-level operations (load/save) next to
    high-level query operations so that different use cases are covered.
    File locking is used to prevent corruption from concurrent access.

    Thread Safety: Every operation must be thread-safe with proper locking.
    """

    @abstractmethod
    def load_state(self) -> Dict[str, DeploymentState]:
        """
        Read every deployment state from persistent storage.

        Returns:
            Mapping of deployment_id to its DeploymentState

        Raises:
            StateCorruptionError: When the state file is corrupted
            IOError: When the state file cannot be read
        """

    @abstractmethod
    def save_state(
        self,
        states: Dict[str, DeploymentState],
    ) -> None:
        """
        Write every deployment state to persistent storage.

        Args:
            states: Mapping of deployment_id to its DeploymentState

        Raises:
            IOError: When the state file cannot be written
        """

    @abstractmethod
    def get_deployment(
        self,
        deployment_id: str,
    ) -> Optional[DeploymentState]:
        """
        Look up a single deployment by its ID.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            The matching DeploymentState, or None when no match exists
        """

    @abstractmethod
    def get_all_deployments(self) -> List[DeploymentState]:
        """
        Return every tracked deployment.

        Returns:
            List containing all DeploymentState objects
        """

    @abstractmethod
    def get_deployments_by_status(
        self,
        status: ProcessStatus,
    ) -> List[DeploymentState]:
        """
        Return the deployments that currently have the given status.

        Args:
            status: ProcessStatus used as the filter

        Returns:
            List of the DeploymentState objects that match
        """

    @abstractmethod
    def get_deployment_by_port(
        self,
        port: int,
    ) -> Optional[DeploymentState]:
        """
        Find the deployment that is using a given port.

        Args:
            port: Port number to look for

        Returns:
            The matching DeploymentState, or None when no match exists
        """

    @abstractmethod
    def get_deployments_by_project(
        self,
        working_directory: str,
    ) -> List[DeploymentState]:
        """
        Return the deployments belonging to one project directory.

        Args:
            working_directory: Path of the project directory

        Returns:
            List of the DeploymentState objects that match
        """

    @abstractmethod
    def add_deployment(
        self,
        deployment: DeploymentState,
    ) -> None:
        """
        Insert a deployment into state, or update it if already present.

        Args:
            deployment: DeploymentState to add or update

        Raises:
            IOError: When the state cannot be persisted
        """

    @abstractmethod
    def remove_deployment(
        self,
        deployment_id: str,
    ) -> bool:
        """
        Delete a deployment from state.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True when the deployment was removed, False when it was not found

        Raises:
            IOError: When the state cannot be persisted
        """

    @abstractmethod
    def update_deployment_status(
        self,
        deployment_id: str,
        status: ProcessStatus,
    ) -> bool:
        """
        Change the status recorded for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            status: The new ProcessStatus

        Returns:
            True when updated, False when the deployment was not found

        Raises:
            IOError: When the state cannot be persisted
        """

    @abstractmethod
    def cleanup_dead_pids(self) -> int:
        """
        Drop deployments whose process IDs are dead.

        WHY: A process can crash or be killed externally. This method
        removes the stale state entries of processes that no longer exist.

        Returns:
            Count of dead PIDs that were cleaned up

        Raises:
            IOError: When the state cannot be persisted
        """
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class ILocalProcessManager(ABC):
    """
    Contract for managing the lifecycle of local processes.

    WHY: Spawning, tracking, and terminating background processes are
    complex operations. Abstracting them behind this interface allows
    different implementations and improves testability.

    DESIGN DECISION: Exposes high-level operations (start, stop, restart)
    that deal with all the complexity internally, including port checking,
    process group isolation, and state tracking.

    Process Lifecycle:
    1. Start: Spawn process with isolation and port checking
    2. Monitor: Track status and update state
    3. Stop: Graceful shutdown with fallback to force kill
    4. Cleanup: Remove state and release resources
    """

    @abstractmethod
    def start(
        self,
        config: StartConfig,
    ) -> DeploymentState:
        """
        Launch a new background process.

        WHY: Process spawning, port allocation, and state tracking are
        combined into one operation to ensure consistency.

        Args:
            config: Configuration of the process to start

        Returns:
            DeploymentState carrying the process information

        Raises:
            ProcessSpawnError: When the process cannot be spawned
            PortConflictError: When the requested port is unavailable and
                no alternative was found
            ValueError: When the configuration is invalid
        """

    @abstractmethod
    def stop(
        self,
        deployment_id: str,
        timeout: int = 10,
        force: bool = False,
    ) -> bool:
        """
        Shut down a running process.

        WHY: Gives a graceful shutdown with a configurable timeout, plus a
        force option for stuck processes.

        Args:
            deployment_id: Unique deployment identifier
            timeout: Seconds to wait for the graceful shutdown
            force: When True, kill immediately without waiting

        Returns:
            True when the process stopped successfully

        Raises:
            ValueError: When deployment_id is not found
        """

    @abstractmethod
    def restart(
        self,
        deployment_id: str,
        timeout: int = 10,
    ) -> DeploymentState:
        """
        Restart a process: stop it, then start it with the same config.

        Args:
            deployment_id: Unique deployment identifier
            timeout: Seconds to wait for the graceful shutdown

        Returns:
            The new DeploymentState after the restart

        Raises:
            ValueError: When deployment_id is not found
            ProcessSpawnError: When the restart fails
        """

    @abstractmethod
    def get_status(
        self,
        deployment_id: str,
    ) -> Optional[ProcessInfo]:
        """
        Report the current status and runtime information of a process.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            ProcessInfo with the current status, or None when not found
        """

    @abstractmethod
    def list_processes(
        self,
        status_filter: Optional[ProcessStatus] = None,
    ) -> List[ProcessInfo]:
        """
        Enumerate the managed processes.

        Args:
            status_filter: Optional status to filter by

        Returns:
            List of ProcessInfo for every matching process
        """

    @abstractmethod
    def is_port_available(
        self,
        port: int,
    ) -> bool:
        """
        Tell whether a port is free to use.

        WHY: Preventing port conflicts is critical for reliable
        deployments. This check happens before the process is spawned.

        Args:
            port: Port number to check

        Returns:
            True when the port is available
        """

    @abstractmethod
    def find_available_port(
        self,
        preferred_port: int,
        max_attempts: int = 10,
    ) -> Optional[int]:
        """
        Search for an available port, beginning at preferred_port.

        WHY: Linear probing is used to find an alternative when the
        preferred port is unavailable. Protected port ranges are respected.

        Args:
            preferred_port: Port number to start from
            max_attempts: Maximum number of ports to try

        Returns:
            An available port number, or None when none was found
        """

    @abstractmethod
    def cleanup_orphans(self) -> int:
        """
        Remove orphaned process state entries.

        WHY: A process can crash or be killed externally, which leaves
        stale state behind. This method identifies and cleans up such
        orphans.

        Returns:
            Count of orphaned entries that were cleaned up
        """

    @abstractmethod
    def generate_deployment_id(
        self,
        project_name: str,
        port: Optional[int] = None,
    ) -> str:
        """
        Produce a unique deployment ID.

        WHY: Keeps ID generation consistent, with an optional port suffix
        for projects that have multiple deployments.

        Args:
            project_name: Name of the project
            port: Optional port number to include in the ID

        Returns:
            The unique deployment identifier
        """
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
__all__ = [
|
|
370
|
+
"IDeploymentStateManager",
|
|
371
|
+
"ILocalProcessManager",
|
|
372
|
+
]
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Restart Management Interfaces for Claude MPM Framework
|
|
3
|
+
========================================================
|
|
4
|
+
|
|
5
|
+
WHY: This module defines interfaces for auto-restart functionality with crash
|
|
6
|
+
detection, intelligent restart policies, and circuit breaker patterns.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Restart interfaces are separated to enable modular restart
|
|
9
|
+
management with different crash detection strategies and restart policies.
|
|
10
|
+
|
|
11
|
+
ARCHITECTURE:
|
|
12
|
+
- ICrashDetector: Interface for detecting process crashes and failures
|
|
13
|
+
- IRestartPolicy: Interface for restart decision logic with backoff
|
|
14
|
+
- IRestartManager: Interface for orchestrating the full restart workflow
|
|
15
|
+
|
|
16
|
+
USAGE:
|
|
17
|
+
crash_detector = CrashDetector(health_manager)
|
|
18
|
+
restart_policy = RestartPolicy(config)
|
|
19
|
+
restart_manager = RestartManager(
|
|
20
|
+
process_manager=process_manager,
|
|
21
|
+
health_manager=health_manager,
|
|
22
|
+
config=config
|
|
23
|
+
)
|
|
24
|
+
restart_manager.enable_auto_restart(deployment_id)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from abc import ABC, abstractmethod
|
|
28
|
+
from typing import TYPE_CHECKING, Callable, Optional
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from claude_mpm.services.core.models.restart import RestartHistory
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ICrashDetector(ABC):
    """
    Contract for detecting process crashes and failures.

    WHY: Detecting a crash means watching health status changes, process
    exits, and zombie states. This interface abstracts the detection
    strategy so crash monitoring stays flexible.

    DESIGN DECISION: Integrates with IHealthCheckManager through callbacks
    in order to receive real-time status updates. Crash history is tracked
    per deployment to enable pattern detection.

    Thread Safety: Implementations must be thread-safe for concurrent
    monitoring.
    """

    @abstractmethod
    def register_crash_callback(
        self,
        callback: Callable[[str, str], None],
    ) -> None:
        """
        Add a callback that is invoked whenever a crash is detected.

        Args:
            callback: Function that is called with (deployment_id, reason)
        """

    @abstractmethod
    def start_monitoring(
        self,
        deployment_id: str,
    ) -> None:
        """
        Begin watching a deployment for crashes.

        WHY: Allows monitoring to be targeted at specific deployments.

        Args:
            deployment_id: Unique deployment identifier

        Raises:
            ValueError: When deployment_id is not found
        """

    @abstractmethod
    def stop_monitoring(
        self,
        deployment_id: str,
    ) -> None:
        """
        Cease watching a deployment.

        Args:
            deployment_id: Unique deployment identifier
        """

    @abstractmethod
    def is_monitoring(
        self,
        deployment_id: str,
    ) -> bool:
        """
        Tell whether a deployment is currently monitored.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True when the deployment is being monitored
        """

    @abstractmethod
    def get_crash_count(
        self,
        deployment_id: str,
    ) -> int:
        """
        Report how many crashes were detected for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            The number of crashes detected
        """
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class IRestartPolicy(ABC):
    """
    Contract for restart decision logic with exponential backoff.

    WHY: A restart policy prevents restart loops by means of exponential
    backoff, a maximum number of attempts, and circuit breaker patterns.
    This interface abstracts the decision-making so different strategies
    can be used.

    DESIGN DECISION: Implements exponential backoff with configurable
    parameters and circuit breaker state transitions
    (CLOSED -> OPEN -> HALF_OPEN).

    Circuit Breaker States:
    - CLOSED: Normal operation, restarts allowed
    - OPEN: Circuit breaker tripped, restarts blocked
    - HALF_OPEN: Testing if service recovered
    """

    @abstractmethod
    def should_restart(
        self,
        deployment_id: str,
    ) -> bool:
        """
        Decide whether a deployment should be restarted.

        WHY: This is the central decision point; it takes the attempt
        count, the circuit breaker state, and the backoff timing into
        account.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True when the restart should proceed
        """

    @abstractmethod
    def calculate_backoff(
        self,
        deployment_id: str,
    ) -> float:
        """
        Compute the backoff time, in seconds, before the next restart.

        WHY: Exponential backoff is used to prevent restart storms.
        Formula: min(initial * (multiplier ** (attempt - 1)), max_backoff)

        NOTE(review): the formula alone does not yield 0 for the first
        attempt, so implementations presumably special-case it - confirm.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            The backoff time in seconds (0 if first attempt)
        """

    @abstractmethod
    def record_restart_attempt(
        self,
        deployment_id: str,
        success: bool,
        failure_reason: Optional[str] = None,
    ) -> None:
        """
        Log a restart attempt and update the circuit breaker state.

        Args:
            deployment_id: Unique deployment identifier
            success: Whether the restart succeeded
            failure_reason: Optional reason for the failure
        """

    @abstractmethod
    def reset_restart_history(
        self,
        deployment_id: str,
    ) -> None:
        """
        Wipe the restart history of a deployment.

        WHY: Restart attempts are cleared after a successful recovery or
        after manual intervention.

        Args:
            deployment_id: Unique deployment identifier
        """

    @abstractmethod
    def get_circuit_breaker_state(
        self,
        deployment_id: str,
    ) -> str:
        """
        Report the current circuit breaker state.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            The circuit breaker state (CLOSED, OPEN, HALF_OPEN)
        """

    @abstractmethod
    def get_restart_attempt_count(
        self,
        deployment_id: str,
    ) -> int:
        """
        Report how many restart attempts a deployment has had.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            The number of restart attempts
        """
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class IRestartManager(ABC):
    """
    Contract for orchestrating the complete restart workflow.

    WHY: Managing restarts means coordinating crash detection, policy
    evaluation, process restart, and health verification. This interface
    offers a high-level API for both automatic and manual restarts.

    DESIGN DECISION: Supplies automatic (background) as well as manual
    (on-demand) restart operations, and integrates with all components:
    CrashDetector, RestartPolicy, ProcessManager, and HealthCheckManager.

    Restart Workflow:
    1. Detect crash (via CrashDetector callback)
    2. Check restart policy (max attempts, circuit breaker)
    3. Wait for backoff period
    4. Execute restart (preserve original StartConfig)
    5. Verify health after restart
    6. Record attempt and update circuit breaker
    """

    @abstractmethod
    def enable_auto_restart(
        self,
        deployment_id: str,
    ) -> None:
        """
        Turn on automatic restarts for a deployment.

        WHY: Gives hands-free recovery from crashes. Monitoring is started
        via CrashDetector and restart callbacks are registered.

        Args:
            deployment_id: Unique deployment identifier

        Raises:
            ValueError: When deployment_id is not found
        """

    @abstractmethod
    def disable_auto_restart(
        self,
        deployment_id: str,
    ) -> None:
        """
        Turn off automatic restarts for a deployment.

        Args:
            deployment_id: Unique deployment identifier
        """

    @abstractmethod
    def is_auto_restart_enabled(
        self,
        deployment_id: str,
    ) -> bool:
        """
        Tell whether auto-restart is enabled for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True when auto-restart is enabled
        """

    @abstractmethod
    def restart_deployment(
        self,
        deployment_id: str,
        manual: bool = False,
    ) -> bool:
        """
        Restart a deployment, triggered either manually or automatically.

        WHY: A single unified restart operation that respects the policy
        constraints and performs health verification.

        Args:
            deployment_id: Unique deployment identifier
            manual: When True, bypass some policy checks (e.g., circuit
                breaker)

        Returns:
            True when the restart succeeded

        Raises:
            ValueError: When deployment_id is not found
        """

    @abstractmethod
    def get_restart_history(
        self,
        deployment_id: str,
    ) -> Optional["RestartHistory"]:
        """
        Fetch the restart history of a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            The RestartHistory when found, None otherwise
        """

    @abstractmethod
    def clear_restart_history(
        self,
        deployment_id: str,
    ) -> None:
        """
        Erase the restart history and reset the circuit breaker.

        WHY: Lets manual intervention clear a failed restart state.

        Args:
            deployment_id: Unique deployment identifier
        """
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
__all__ = [
|
|
304
|
+
"ICrashDetector",
|
|
305
|
+
"IRestartManager",
|
|
306
|
+
"IRestartPolicy",
|
|
307
|
+
]
|