claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/BASE_ENGINEER.md +286 -0
- claude_mpm/agents/BASE_PM.md +48 -17
- claude_mpm/agents/OUTPUT_STYLE.md +329 -11
- claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
- claude_mpm/agents/agent_loader.py +17 -5
- claude_mpm/agents/frontmatter_validator.py +284 -253
- claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
- claude_mpm/agents/templates/api_qa.json +7 -1
- claude_mpm/agents/templates/clerk-ops.json +8 -1
- claude_mpm/agents/templates/code_analyzer.json +4 -1
- claude_mpm/agents/templates/dart_engineer.json +11 -1
- claude_mpm/agents/templates/data_engineer.json +11 -1
- claude_mpm/agents/templates/documentation.json +6 -1
- claude_mpm/agents/templates/engineer.json +18 -1
- claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
- claude_mpm/agents/templates/golang_engineer.json +11 -1
- claude_mpm/agents/templates/java_engineer.json +12 -2
- claude_mpm/agents/templates/local_ops_agent.json +1217 -6
- claude_mpm/agents/templates/nextjs_engineer.json +11 -1
- claude_mpm/agents/templates/ops.json +8 -1
- claude_mpm/agents/templates/php-engineer.json +11 -1
- claude_mpm/agents/templates/project_organizer.json +10 -3
- claude_mpm/agents/templates/prompt-engineer.json +5 -1
- claude_mpm/agents/templates/python_engineer.json +11 -1
- claude_mpm/agents/templates/qa.json +7 -1
- claude_mpm/agents/templates/react_engineer.json +11 -1
- claude_mpm/agents/templates/refactoring_engineer.json +8 -1
- claude_mpm/agents/templates/research.json +4 -1
- claude_mpm/agents/templates/ruby-engineer.json +11 -1
- claude_mpm/agents/templates/rust_engineer.json +11 -1
- claude_mpm/agents/templates/security.json +6 -1
- claude_mpm/agents/templates/svelte-engineer.json +225 -0
- claude_mpm/agents/templates/ticketing.json +6 -1
- claude_mpm/agents/templates/typescript_engineer.json +11 -1
- claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
- claude_mpm/agents/templates/version_control.json +8 -1
- claude_mpm/agents/templates/web_qa.json +7 -1
- claude_mpm/agents/templates/web_ui.json +11 -1
- claude_mpm/cli/__init__.py +34 -706
- claude_mpm/cli/commands/agent_manager.py +25 -12
- claude_mpm/cli/commands/agent_state_manager.py +186 -0
- claude_mpm/cli/commands/agents.py +204 -148
- claude_mpm/cli/commands/aggregate.py +7 -3
- claude_mpm/cli/commands/analyze.py +9 -4
- claude_mpm/cli/commands/analyze_code.py +7 -2
- claude_mpm/cli/commands/auto_configure.py +7 -9
- claude_mpm/cli/commands/config.py +47 -13
- claude_mpm/cli/commands/configure.py +294 -1788
- claude_mpm/cli/commands/configure_agent_display.py +261 -0
- claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
- claude_mpm/cli/commands/configure_hook_manager.py +225 -0
- claude_mpm/cli/commands/configure_models.py +18 -0
- claude_mpm/cli/commands/configure_navigation.py +167 -0
- claude_mpm/cli/commands/configure_paths.py +104 -0
- claude_mpm/cli/commands/configure_persistence.py +254 -0
- claude_mpm/cli/commands/configure_startup_manager.py +646 -0
- claude_mpm/cli/commands/configure_template_editor.py +497 -0
- claude_mpm/cli/commands/configure_validators.py +73 -0
- claude_mpm/cli/commands/local_deploy.py +537 -0
- claude_mpm/cli/commands/memory.py +54 -20
- claude_mpm/cli/commands/mpm_init.py +39 -25
- claude_mpm/cli/commands/mpm_init_handler.py +8 -3
- claude_mpm/cli/executor.py +202 -0
- claude_mpm/cli/helpers.py +105 -0
- claude_mpm/cli/interactive/__init__.py +3 -0
- claude_mpm/cli/interactive/skills_wizard.py +491 -0
- claude_mpm/cli/parsers/__init__.py +7 -1
- claude_mpm/cli/parsers/base_parser.py +98 -3
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/cli/shared/output_formatters.py +28 -19
- claude_mpm/cli/startup.py +481 -0
- claude_mpm/cli/utils.py +52 -1
- claude_mpm/commands/mpm-help.md +3 -0
- claude_mpm/commands/mpm-version.md +113 -0
- claude_mpm/commands/mpm.md +1 -0
- claude_mpm/config/agent_config.py +2 -2
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/base_service.py +13 -12
- claude_mpm/core/enums.py +452 -0
- claude_mpm/core/factories.py +1 -1
- claude_mpm/core/instruction_reinforcement_hook.py +2 -1
- claude_mpm/core/interactive_session.py +9 -3
- claude_mpm/core/logging_config.py +6 -2
- claude_mpm/core/oneshot_session.py +8 -4
- claude_mpm/core/optimized_agent_loader.py +3 -3
- claude_mpm/core/output_style_manager.py +12 -192
- claude_mpm/core/service_registry.py +5 -1
- claude_mpm/core/types.py +2 -9
- claude_mpm/core/typing_utils.py +7 -6
- claude_mpm/dashboard/static/js/dashboard.js +0 -14
- claude_mpm/dashboard/templates/index.html +3 -41
- claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
- claude_mpm/hooks/instruction_reinforcement.py +7 -2
- claude_mpm/models/resume_log.py +340 -0
- claude_mpm/services/agents/auto_config_manager.py +10 -11
- claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
- claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
- claude_mpm/services/agents/deployment/agent_validator.py +17 -1
- claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
- claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
- claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
- claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
- claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
- claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
- claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
- claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
- claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
- claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
- claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
- claude_mpm/services/agents/local_template_manager.py +1 -1
- claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
- claude_mpm/services/agents/registry/modification_tracker.py +5 -2
- claude_mpm/services/command_handler_service.py +11 -5
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +33 -0
- claude_mpm/services/core/models/agent_config.py +12 -28
- claude_mpm/services/core/models/health.py +162 -0
- claude_mpm/services/core/models/process.py +235 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/core/path_resolver.py +23 -7
- claude_mpm/services/diagnostics/__init__.py +2 -2
- claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
- claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
- claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
- claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
- claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
- claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
- claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
- claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
- claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
- claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
- claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
- claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
- claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
- claude_mpm/services/diagnostics/models.py +19 -24
- claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
- claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
- claude_mpm/services/infrastructure/monitoring/base.py +5 -13
- claude_mpm/services/infrastructure/monitoring/network.py +7 -6
- claude_mpm/services/infrastructure/monitoring/process.py +13 -12
- claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
- claude_mpm/services/infrastructure/monitoring/service.py +16 -15
- claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +372 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/mcp_config_manager.py +9 -4
- claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
- claude_mpm/services/mcp_gateway/core/base.py +18 -31
- claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
- claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
- claude_mpm/services/memory_hook_service.py +4 -1
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- claude_mpm/services/monitor/daemon_manager.py +3 -2
- claude_mpm/services/monitor/handlers/dashboard.py +2 -1
- claude_mpm/services/monitor/handlers/hooks.py +2 -1
- claude_mpm/services/monitor/management/lifecycle.py +3 -2
- claude_mpm/services/monitor/server.py +2 -1
- claude_mpm/services/session_management_service.py +3 -2
- claude_mpm/services/session_manager.py +205 -1
- claude_mpm/services/shared/async_service_base.py +16 -27
- claude_mpm/services/shared/lifecycle_service_base.py +1 -14
- claude_mpm/services/socketio/handlers/__init__.py +5 -2
- claude_mpm/services/socketio/handlers/hook.py +13 -2
- claude_mpm/services/socketio/handlers/registry.py +4 -2
- claude_mpm/services/socketio/server/main.py +10 -8
- claude_mpm/services/subprocess_launcher_service.py +14 -5
- claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
- claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
- claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
- claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
- claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
- claude_mpm/services/unified/deployment_strategies/local.py +6 -5
- claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
- claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
- claude_mpm/services/unified/interfaces.py +3 -1
- claude_mpm/services/unified/unified_analyzer.py +14 -10
- claude_mpm/services/unified/unified_config.py +2 -1
- claude_mpm/services/unified/unified_deployment.py +9 -4
- claude_mpm/services/version_service.py +104 -1
- claude_mpm/skills/__init__.py +21 -0
- claude_mpm/skills/bundled/__init__.py +6 -0
- claude_mpm/skills/bundled/api-documentation.md +393 -0
- claude_mpm/skills/bundled/async-testing.md +571 -0
- claude_mpm/skills/bundled/code-review.md +143 -0
- claude_mpm/skills/bundled/database-migration.md +199 -0
- claude_mpm/skills/bundled/docker-containerization.md +194 -0
- claude_mpm/skills/bundled/express-local-dev.md +1429 -0
- claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
- claude_mpm/skills/bundled/git-workflow.md +414 -0
- claude_mpm/skills/bundled/imagemagick.md +204 -0
- claude_mpm/skills/bundled/json-data-handling.md +223 -0
- claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
- claude_mpm/skills/bundled/pdf.md +141 -0
- claude_mpm/skills/bundled/performance-profiling.md +567 -0
- claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
- claude_mpm/skills/bundled/security-scanning.md +327 -0
- claude_mpm/skills/bundled/systematic-debugging.md +473 -0
- claude_mpm/skills/bundled/test-driven-development.md +378 -0
- claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
- claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
- claude_mpm/skills/bundled/xlsx.md +157 -0
- claude_mpm/skills/registry.py +286 -0
- claude_mpm/skills/skill_manager.py +310 -0
- claude_mpm/tools/code_tree_analyzer.py +177 -141
- claude_mpm/tools/code_tree_events.py +4 -2
- claude_mpm/utils/agent_dependency_loader.py +2 -2
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
- claude_mpm/dashboard/static/css/code-tree.css +0 -1639
- claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
- claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
- claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
- claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
- claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
- claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
- claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
- claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
- claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
- claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
- claude_mpm/services/project/analyzer_refactored.py +0 -450
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Restart Manager for Claude MPM Framework
|
|
3
|
+
=========================================
|
|
4
|
+
|
|
5
|
+
WHY: Orchestrates the complete restart workflow including crash detection,
|
|
6
|
+
policy evaluation, process restart, and health verification.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Integrates all restart components (CrashDetector,
|
|
9
|
+
RestartPolicy, ProcessManager, HealthCheckManager) to provide automatic
|
|
10
|
+
and manual restart operations with proper verification.
|
|
11
|
+
|
|
12
|
+
ARCHITECTURE:
|
|
13
|
+
- Auto-restart workflow:
|
|
14
|
+
1. CrashDetector detects crash → triggers callback
|
|
15
|
+
2. RestartManager checks policy (max attempts, circuit breaker)
|
|
16
|
+
3. Calculate and wait for backoff period
|
|
17
|
+
4. Execute restart via ProcessManager
|
|
18
|
+
5. Wait for health check verification
|
|
19
|
+
6. Record attempt and update circuit breaker
|
|
20
|
+
- Manual restart: skips the restart-policy check and the backoff wait
|
|
21
|
+
- Thread-safe operations with proper locking
|
|
22
|
+
- State persistence for restart history
|
|
23
|
+
|
|
24
|
+
USAGE:
|
|
25
|
+
config = RestartConfig(max_attempts=5, circuit_breaker_threshold=3)
|
|
26
|
+
restart_manager = RestartManager(
|
|
27
|
+
process_manager=process_manager,
|
|
28
|
+
health_manager=health_manager,
|
|
29
|
+
crash_detector=crash_detector,
|
|
30
|
+
restart_policy=restart_policy
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Enable auto-restart
|
|
34
|
+
restart_manager.enable_auto_restart(deployment_id)
|
|
35
|
+
|
|
36
|
+
# Manual restart
|
|
37
|
+
success = restart_manager.restart_deployment(deployment_id, manual=True)
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
import json
|
|
41
|
+
import threading
|
|
42
|
+
import time
|
|
43
|
+
from pathlib import Path
|
|
44
|
+
from typing import Optional, Set
|
|
45
|
+
|
|
46
|
+
from claude_mpm.core.enums import HealthStatus
|
|
47
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
48
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheckManager
|
|
49
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
50
|
+
from claude_mpm.services.core.interfaces.restart import (
|
|
51
|
+
ICrashDetector,
|
|
52
|
+
IRestartManager,
|
|
53
|
+
IRestartPolicy,
|
|
54
|
+
)
|
|
55
|
+
from claude_mpm.services.core.models.restart import RestartHistory
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class RestartManager(SyncBaseService, IRestartManager):
    """
    Orchestrates automatic and manual restart operations.

    WHY: Provides complete restart workflow by coordinating crash detection,
    policy evaluation, process restart, and health verification.

    Thread Safety: the auto-restart and in-progress sets are guarded by
    ``_lock``; writes of the history file are serialized by
    ``_history_io_lock``.
    """

    # Default pause (seconds) between restarting a process and probing health.
    DEFAULT_HEALTH_CHECK_DELAY = 5.0

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        health_manager: IHealthCheckManager,
        crash_detector: ICrashDetector,
        restart_policy: IRestartPolicy,
        state_dir: Optional[Path] = None,
        health_check_delay: float = DEFAULT_HEALTH_CHECK_DELAY,
    ):
        """
        Initialize restart manager.

        Args:
            process_manager: Process manager for restart operations
            health_manager: Health check manager for verification
            crash_detector: Crash detector for automatic restarts
            restart_policy: Restart policy for decision making
            state_dir: Directory for restart history persistence; defaults
                to ``~/.claude-mpm`` and is created if missing
            health_check_delay: Seconds to wait after a restart before the
                health check runs; values <= 0 skip the wait
        """
        super().__init__("RestartManager")
        self.process_manager = process_manager
        self.health_manager = health_manager
        self.crash_detector = crash_detector
        self.restart_policy = restart_policy
        self.health_check_delay = health_check_delay

        # State persistence
        if state_dir is None:
            state_dir = Path.home() / ".claude-mpm"
        self.state_dir = Path(state_dir)
        self.state_dir.mkdir(parents=True, exist_ok=True)
        self.history_file = self.state_dir / "restart-history.json"

        # Auto-restart tracking
        self._lock = threading.Lock()
        self._auto_restart_enabled: Set[str] = set()

        # In-progress restart tracking (prevent concurrent restarts)
        self._restart_in_progress: Set[str] = set()

        # Serializes writers of the history file so that two restarts that
        # finish at the same time cannot interleave their output.
        self._history_io_lock = threading.Lock()

    def initialize(self) -> bool:
        """
        Initialize the restart manager.

        Registers the crash callback with the crash detector and loads any
        persisted restart history.

        Returns:
            True if initialization successful
        """
        self.logger.info("Initializing RestartManager")

        # Register crash callback
        self.crash_detector.register_crash_callback(self._on_crash_detected)

        # Load restart history from disk
        self._load_restart_history()

        self.logger.info("RestartManager initialized successfully")
        return True

    def enable_auto_restart(self, deployment_id: str) -> None:
        """
        Enable automatic restarts for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Raises:
            ValueError: If deployment_id not found
        """
        # Verify deployment exists
        deployment = self.process_manager.get_status(deployment_id)
        if deployment is None:
            raise ValueError(f"Deployment not found: {deployment_id}")

        with self._lock:
            if deployment_id in self._auto_restart_enabled:
                self.logger.debug(f"Auto-restart already enabled for {deployment_id}")
                return

            # Enable auto-restart
            self._auto_restart_enabled.add(deployment_id)

            # Start crash monitoring
            self.crash_detector.start_monitoring(deployment_id)

        self.logger.info(f"Enabled auto-restart for deployment: {deployment_id}")

    def disable_auto_restart(self, deployment_id: str) -> None:
        """
        Disable automatic restarts for a deployment.

        Monitoring is stopped even if auto-restart was not enabled for the
        deployment.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._auto_restart_enabled.discard(deployment_id)

            # Stop crash monitoring
            self.crash_detector.stop_monitoring(deployment_id)

        self.logger.info(f"Disabled auto-restart for deployment: {deployment_id}")

    def is_auto_restart_enabled(self, deployment_id: str) -> bool:
        """
        Check if auto-restart is enabled for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True if auto-restart is enabled
        """
        with self._lock:
            return deployment_id in self._auto_restart_enabled

    def restart_deployment(self, deployment_id: str, manual: bool = False) -> bool:
        """
        Restart a deployment (manual or automatic trigger).

        Only one restart per deployment runs at a time; a second request
        made while one is in progress returns False immediately.

        Args:
            deployment_id: Unique deployment identifier
            manual: If True, skip the restart-policy check and the backoff
                wait

        Returns:
            True if the restart ran and the subsequent health check did not
            report UNHEALTHY. False if a restart was already in progress,
            the policy blocked it, or the restart/health check raised.

        Note:
            Exceptions raised while restarting or health-checking (including
            an unknown deployment_id) are logged, recorded as a failed
            attempt and reported as False rather than propagated. Exceptions
            raised by the policy check or backoff calculation do propagate.
        """
        with self._lock:
            # Check if restart already in progress
            if deployment_id in self._restart_in_progress:
                self.logger.warning(
                    f"Restart already in progress for {deployment_id}, skipping"
                )
                return False

            # Mark restart in progress
            self._restart_in_progress.add(deployment_id)

        try:
            # Check restart policy (unless manual override)
            if not manual and not self._await_policy_clearance(deployment_id):
                return False

            return self._execute_restart(deployment_id)

        finally:
            # Clear in-progress flag
            with self._lock:
                self._restart_in_progress.discard(deployment_id)

    def _await_policy_clearance(self, deployment_id: str) -> bool:
        """
        Consult the restart policy and sleep for its backoff period.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            False if the policy blocks the restart, True once any backoff
            has elapsed.
        """
        if not self.restart_policy.should_restart(deployment_id):
            self.logger.warning(f"Restart policy blocked restart for {deployment_id}")
            return False

        # Calculate and wait for backoff
        backoff = self.restart_policy.calculate_backoff(deployment_id)
        if backoff > 0:
            self.logger.info(
                f"Waiting {backoff:.1f}s backoff before restarting {deployment_id}"
            )
            time.sleep(backoff)
        return True

    def _execute_restart(self, deployment_id: str) -> bool:
        """
        Restart the process, verify health, and record the attempt.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True if the health status after restart is not UNHEALTHY;
            False otherwise or if any step raised.
        """
        self.logger.info(f"Restarting deployment: {deployment_id}")
        try:
            new_deployment = self.process_manager.restart(deployment_id)

            # Wait for initial health check
            self.logger.debug(
                f"Waiting for health check verification for {deployment_id}"
            )
            if self.health_check_delay > 0:
                # Brief wait for process to initialize
                time.sleep(self.health_check_delay)

            # Verify health status against the id reported by the restart
            health = self.health_manager.check_health(new_deployment.deployment_id)
            success = health.overall_status != HealthStatus.UNHEALTHY

            if success:
                self.logger.info(
                    f"Restart succeeded for {deployment_id}, "
                    f"health status: {health.overall_status.value}"
                )
                failure_reason = None
            else:
                self.logger.warning(
                    f"Restart completed but deployment unhealthy: {deployment_id}"
                )
                failure_reason = f"Health check failed: {health.overall_status.value}"

            # Record attempt
            self.restart_policy.record_restart_attempt(
                deployment_id, success, failure_reason
            )

            # Persist restart history
            self._save_restart_history()

            return success

        except Exception as e:
            # Boundary handler: this may run on the crash detector's
            # callback path, so failures are reported via the return value.
            self.logger.error(f"Restart failed for {deployment_id}: {e}", exc_info=True)

            # Record failed attempt
            self.restart_policy.record_restart_attempt(
                deployment_id, success=False, failure_reason=str(e)
            )

            # Persist restart history
            self._save_restart_history()

            return False

    def get_restart_history(self, deployment_id: str) -> Optional[RestartHistory]:
        """
        Get restart history for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Whatever the restart policy's ``get_history`` returns for this
            deployment (RestartHistory if found, None otherwise)
        """
        return self.restart_policy.get_history(deployment_id)

    def clear_restart_history(self, deployment_id: str) -> None:
        """
        Clear restart history via the restart policy and persist the result.

        Args:
            deployment_id: Unique deployment identifier
        """
        self.restart_policy.reset_restart_history(deployment_id)
        self._save_restart_history()
        self.logger.info(f"Cleared restart history for deployment: {deployment_id}")

    def _on_crash_detected(self, deployment_id: str, reason: str) -> None:
        """
        Handle crash detection callback.

        WHY: Registered with the crash detector in ``initialize``. Triggers
        an automatic (policy-checked) restart if auto-restart is enabled
        for the deployment.

        Args:
            deployment_id: Unique deployment identifier
            reason: Reason for crash detection
        """
        self.logger.warning(f"Crash detected for {deployment_id}: {reason}")

        # Check if auto-restart is enabled
        with self._lock:
            if deployment_id not in self._auto_restart_enabled:
                self.logger.debug(
                    f"Auto-restart not enabled for {deployment_id}, ignoring crash"
                )
                return

        # Trigger automatic restart (outside the lock: restart_deployment
        # acquires it itself and may sleep for the backoff period)
        self.logger.info(f"Triggering automatic restart for {deployment_id}")
        self.restart_deployment(deployment_id, manual=False)

    def _load_restart_history(self) -> None:
        """
        Load restart history from disk into the restart policy.

        WHY: Persists restart state across service restarts to maintain
        circuit breaker state and attempt counts.

        Failures are logged, never raised. A malformed entry is skipped so
        that it cannot discard the history of the other deployments.
        """
        if not self.history_file.exists():
            self.logger.debug("No restart history file found, starting fresh")
            return

        try:
            with self.history_file.open(encoding="utf-8") as f:
                data = json.load(f)
        except Exception as e:
            self.logger.error(f"Failed to load restart history: {e}", exc_info=True)
            return

        if not isinstance(data, dict):
            self.logger.error(
                "Failed to load restart history: expected a JSON object, "
                f"got {type(data).__name__}"
            )
            return

        # NOTE(review): this reaches into the policy's private ``_history``
        # mapping; no public way to restore history is used in this class.
        policy_history = getattr(self.restart_policy, "_history", None)
        if policy_history is None:
            self.logger.warning(
                "Restart policy exposes no history store, "
                "persisted restart history not restored"
            )
            return

        loaded = 0
        for deployment_id, history_data in data.items():
            try:
                policy_history[deployment_id] = RestartHistory.from_dict(history_data)
            except Exception as e:
                self.logger.error(
                    f"Failed to load restart history for {deployment_id}: {e}",
                    exc_info=True,
                )
                continue
            loaded += 1

        self.logger.info(f"Loaded restart history for {loaded} deployments")

    def _save_restart_history(self) -> None:
        """
        Save restart history to disk.

        WHY: Persists restart state to maintain circuit breaker and
        attempt counts across service restarts.

        The data is written to a sibling ``.tmp`` file and then moved over
        the real file, so an interrupted write cannot leave a truncated
        history file behind. Failures are logged, never raised.
        """
        try:
            # Collect all restart histories from restart policy
            policy_history = getattr(self.restart_policy, "_history", None) or {}
            # Snapshot the items first: other threads may record attempts
            # while the entries are being serialized.
            data = {
                deployment_id: history.to_dict()
                for deployment_id, history in list(policy_history.items())
            }

            # Write to disk
            tmp_file = self.history_file.with_name(self.history_file.name + ".tmp")
            with self._history_io_lock:
                with tmp_file.open("w", encoding="utf-8") as f:
                    json.dump(data, f, indent=2)
                tmp_file.replace(self.history_file)

            self.logger.debug(f"Saved restart history for {len(data)} deployments")

        except Exception as e:
            self.logger.error(f"Failed to save restart history: {e}", exc_info=True)

    def shutdown(self) -> bool:
        """
        Shutdown the restart manager.

        Saves the restart history, stops crash monitoring for every
        deployment with auto-restart enabled, and clears internal tracking.

        Returns:
            True if shutdown successful
        """
        # Save restart history before shutdown
        self._save_restart_history()

        with self._lock:
            # Disable all auto-restarts
            for deployment_id in list(self._auto_restart_enabled):
                self.crash_detector.stop_monitoring(deployment_id)
            self._auto_restart_enabled.clear()
            self._restart_in_progress.clear()

        self.logger.info("RestartManager shutdown successfully")
        return True
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
__all__ = ["RestartManager"]
|