claude-mpm 4.7.4__py3-none-any.whl → 4.18.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/BASE_AGENT_TEMPLATE.md +118 -0
- claude_mpm/agents/BASE_ENGINEER.md +286 -0
- claude_mpm/agents/BASE_PM.md +106 -1
- claude_mpm/agents/OUTPUT_STYLE.md +329 -11
- claude_mpm/agents/PM_INSTRUCTIONS.md +397 -459
- claude_mpm/agents/agent_loader.py +17 -5
- claude_mpm/agents/frontmatter_validator.py +284 -253
- claude_mpm/agents/templates/README.md +465 -0
- claude_mpm/agents/templates/agent-manager.json +4 -1
- claude_mpm/agents/templates/agentic-coder-optimizer.json +13 -3
- claude_mpm/agents/templates/api_qa.json +11 -2
- claude_mpm/agents/templates/circuit_breakers.md +638 -0
- claude_mpm/agents/templates/clerk-ops.json +12 -2
- claude_mpm/agents/templates/code_analyzer.json +8 -2
- claude_mpm/agents/templates/content-agent.json +358 -0
- claude_mpm/agents/templates/dart_engineer.json +15 -2
- claude_mpm/agents/templates/data_engineer.json +15 -2
- claude_mpm/agents/templates/documentation.json +10 -2
- claude_mpm/agents/templates/engineer.json +21 -1
- claude_mpm/agents/templates/gcp_ops_agent.json +12 -2
- claude_mpm/agents/templates/git_file_tracking.md +584 -0
- claude_mpm/agents/templates/golang_engineer.json +270 -0
- claude_mpm/agents/templates/imagemagick.json +4 -1
- claude_mpm/agents/templates/java_engineer.json +346 -0
- claude_mpm/agents/templates/local_ops_agent.json +1227 -6
- claude_mpm/agents/templates/memory_manager.json +4 -1
- claude_mpm/agents/templates/nextjs_engineer.json +141 -133
- claude_mpm/agents/templates/ops.json +12 -2
- claude_mpm/agents/templates/php-engineer.json +270 -174
- claude_mpm/agents/templates/pm_examples.md +474 -0
- claude_mpm/agents/templates/pm_red_flags.md +240 -0
- claude_mpm/agents/templates/product_owner.json +338 -0
- claude_mpm/agents/templates/project_organizer.json +14 -4
- claude_mpm/agents/templates/prompt-engineer.json +13 -2
- claude_mpm/agents/templates/python_engineer.json +174 -81
- claude_mpm/agents/templates/qa.json +11 -2
- claude_mpm/agents/templates/react_engineer.json +16 -3
- claude_mpm/agents/templates/refactoring_engineer.json +12 -2
- claude_mpm/agents/templates/research.json +34 -21
- claude_mpm/agents/templates/response_format.md +583 -0
- claude_mpm/agents/templates/ruby-engineer.json +129 -192
- claude_mpm/agents/templates/rust_engineer.json +270 -0
- claude_mpm/agents/templates/security.json +10 -2
- claude_mpm/agents/templates/svelte-engineer.json +225 -0
- claude_mpm/agents/templates/ticketing.json +10 -2
- claude_mpm/agents/templates/typescript_engineer.json +116 -125
- claude_mpm/agents/templates/validation_templates.md +312 -0
- claude_mpm/agents/templates/vercel_ops_agent.json +12 -2
- claude_mpm/agents/templates/version_control.json +12 -2
- claude_mpm/agents/templates/web_qa.json +11 -2
- claude_mpm/agents/templates/web_ui.json +15 -2
- claude_mpm/cli/__init__.py +34 -614
- claude_mpm/cli/commands/agent_manager.py +25 -12
- claude_mpm/cli/commands/agent_state_manager.py +186 -0
- claude_mpm/cli/commands/agents.py +235 -148
- claude_mpm/cli/commands/agents_detect.py +380 -0
- claude_mpm/cli/commands/agents_recommend.py +309 -0
- claude_mpm/cli/commands/aggregate.py +7 -3
- claude_mpm/cli/commands/analyze.py +9 -4
- claude_mpm/cli/commands/analyze_code.py +7 -2
- claude_mpm/cli/commands/auto_configure.py +570 -0
- claude_mpm/cli/commands/config.py +47 -13
- claude_mpm/cli/commands/configure.py +419 -1571
- claude_mpm/cli/commands/configure_agent_display.py +261 -0
- claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
- claude_mpm/cli/commands/configure_hook_manager.py +225 -0
- claude_mpm/cli/commands/configure_models.py +18 -0
- claude_mpm/cli/commands/configure_navigation.py +167 -0
- claude_mpm/cli/commands/configure_paths.py +104 -0
- claude_mpm/cli/commands/configure_persistence.py +254 -0
- claude_mpm/cli/commands/configure_startup_manager.py +646 -0
- claude_mpm/cli/commands/configure_template_editor.py +497 -0
- claude_mpm/cli/commands/configure_validators.py +73 -0
- claude_mpm/cli/commands/local_deploy.py +537 -0
- claude_mpm/cli/commands/memory.py +54 -20
- claude_mpm/cli/commands/mpm_init.py +585 -196
- claude_mpm/cli/commands/mpm_init_handler.py +37 -3
- claude_mpm/cli/commands/search.py +170 -4
- claude_mpm/cli/commands/upgrade.py +152 -0
- claude_mpm/cli/executor.py +202 -0
- claude_mpm/cli/helpers.py +105 -0
- claude_mpm/cli/interactive/__init__.py +3 -0
- claude_mpm/cli/interactive/skills_wizard.py +491 -0
- claude_mpm/cli/parsers/__init__.py +7 -1
- claude_mpm/cli/parsers/agents_parser.py +9 -0
- claude_mpm/cli/parsers/auto_configure_parser.py +245 -0
- claude_mpm/cli/parsers/base_parser.py +110 -3
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/cli/parsers/mpm_init_parser.py +65 -5
- claude_mpm/cli/shared/output_formatters.py +28 -19
- claude_mpm/cli/startup.py +481 -0
- claude_mpm/cli/utils.py +52 -1
- claude_mpm/commands/mpm-agents-detect.md +168 -0
- claude_mpm/commands/mpm-agents-recommend.md +214 -0
- claude_mpm/commands/mpm-agents.md +75 -1
- claude_mpm/commands/mpm-auto-configure.md +217 -0
- claude_mpm/commands/mpm-help.md +163 -0
- claude_mpm/commands/mpm-init.md +148 -3
- claude_mpm/commands/mpm-version.md +113 -0
- claude_mpm/commands/mpm.md +1 -0
- claude_mpm/config/agent_config.py +2 -2
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/constants.py +1 -0
- claude_mpm/core/base_service.py +13 -12
- claude_mpm/core/enums.py +452 -0
- claude_mpm/core/factories.py +1 -1
- claude_mpm/core/instruction_reinforcement_hook.py +2 -1
- claude_mpm/core/interactive_session.py +9 -3
- claude_mpm/core/log_manager.py +2 -0
- claude_mpm/core/logging_config.py +6 -2
- claude_mpm/core/oneshot_session.py +8 -4
- claude_mpm/core/optimized_agent_loader.py +3 -3
- claude_mpm/core/output_style_manager.py +12 -192
- claude_mpm/core/service_registry.py +5 -1
- claude_mpm/core/types.py +2 -9
- claude_mpm/core/typing_utils.py +7 -6
- claude_mpm/dashboard/static/js/dashboard.js +0 -14
- claude_mpm/dashboard/templates/index.html +3 -41
- claude_mpm/hooks/__init__.py +20 -0
- claude_mpm/hooks/claude_hooks/event_handlers.py +4 -2
- claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
- claude_mpm/hooks/claude_hooks/services/connection_manager_http.py +23 -2
- claude_mpm/hooks/failure_learning/__init__.py +60 -0
- claude_mpm/hooks/failure_learning/failure_detection_hook.py +235 -0
- claude_mpm/hooks/failure_learning/fix_detection_hook.py +217 -0
- claude_mpm/hooks/failure_learning/learning_extraction_hook.py +286 -0
- claude_mpm/hooks/instruction_reinforcement.py +7 -2
- claude_mpm/hooks/kuzu_enrichment_hook.py +263 -0
- claude_mpm/hooks/kuzu_memory_hook.py +37 -12
- claude_mpm/hooks/kuzu_response_hook.py +183 -0
- claude_mpm/models/resume_log.py +340 -0
- claude_mpm/services/agents/__init__.py +18 -5
- claude_mpm/services/agents/auto_config_manager.py +796 -0
- claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
- claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
- claude_mpm/services/agents/deployment/agent_validator.py +17 -1
- claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
- claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
- claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
- claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
- claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
- claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
- claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
- claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
- claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
- claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
- claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
- claude_mpm/services/agents/local_template_manager.py +1 -1
- claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
- claude_mpm/services/agents/observers.py +547 -0
- claude_mpm/services/agents/recommender.py +568 -0
- claude_mpm/services/agents/registry/modification_tracker.py +5 -2
- claude_mpm/services/command_handler_service.py +11 -5
- claude_mpm/services/core/__init__.py +33 -1
- claude_mpm/services/core/interfaces/__init__.py +90 -3
- claude_mpm/services/core/interfaces/agent.py +184 -0
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/project.py +121 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/memory_manager.py +11 -24
- claude_mpm/services/core/models/__init__.py +79 -0
- claude_mpm/services/core/models/agent_config.py +381 -0
- claude_mpm/services/core/models/health.py +162 -0
- claude_mpm/services/core/models/process.py +235 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/core/models/toolchain.py +306 -0
- claude_mpm/services/core/path_resolver.py +23 -7
- claude_mpm/services/diagnostics/__init__.py +2 -2
- claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
- claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
- claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
- claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
- claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
- claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
- claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
- claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
- claude_mpm/services/diagnostics/checks/mcp_services_check.py +38 -33
- claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
- claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
- claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
- claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
- claude_mpm/services/diagnostics/models.py +19 -24
- claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
- claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
- claude_mpm/services/infrastructure/monitoring/base.py +5 -13
- claude_mpm/services/infrastructure/monitoring/network.py +7 -6
- claude_mpm/services/infrastructure/monitoring/process.py +13 -12
- claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
- claude_mpm/services/infrastructure/monitoring/service.py +16 -15
- claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +372 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/mcp_config_manager.py +9 -4
- claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
- claude_mpm/services/mcp_gateway/core/base.py +18 -31
- claude_mpm/services/mcp_gateway/main.py +30 -0
- claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +206 -32
- claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
- claude_mpm/services/mcp_gateway/tools/kuzu_memory_service.py +25 -5
- claude_mpm/services/mcp_service_verifier.py +1 -1
- claude_mpm/services/memory/failure_tracker.py +563 -0
- claude_mpm/services/memory_hook_service.py +165 -4
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- claude_mpm/services/monitor/daemon_manager.py +3 -2
- claude_mpm/services/monitor/handlers/dashboard.py +2 -1
- claude_mpm/services/monitor/handlers/hooks.py +2 -1
- claude_mpm/services/monitor/management/lifecycle.py +3 -2
- claude_mpm/services/monitor/server.py +2 -1
- claude_mpm/services/project/__init__.py +23 -0
- claude_mpm/services/project/detection_strategies.py +719 -0
- claude_mpm/services/project/toolchain_analyzer.py +581 -0
- claude_mpm/services/self_upgrade_service.py +342 -0
- claude_mpm/services/session_management_service.py +3 -2
- claude_mpm/services/session_manager.py +205 -1
- claude_mpm/services/shared/async_service_base.py +16 -27
- claude_mpm/services/shared/lifecycle_service_base.py +1 -14
- claude_mpm/services/socketio/handlers/__init__.py +5 -2
- claude_mpm/services/socketio/handlers/hook.py +13 -2
- claude_mpm/services/socketio/handlers/registry.py +4 -2
- claude_mpm/services/socketio/server/main.py +10 -8
- claude_mpm/services/subprocess_launcher_service.py +14 -5
- claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
- claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
- claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
- claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
- claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
- claude_mpm/services/unified/deployment_strategies/local.py +6 -5
- claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
- claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
- claude_mpm/services/unified/interfaces.py +3 -1
- claude_mpm/services/unified/unified_analyzer.py +14 -10
- claude_mpm/services/unified/unified_config.py +2 -1
- claude_mpm/services/unified/unified_deployment.py +9 -4
- claude_mpm/services/version_service.py +104 -1
- claude_mpm/skills/__init__.py +21 -0
- claude_mpm/skills/bundled/__init__.py +6 -0
- claude_mpm/skills/bundled/api-documentation.md +393 -0
- claude_mpm/skills/bundled/async-testing.md +571 -0
- claude_mpm/skills/bundled/code-review.md +143 -0
- claude_mpm/skills/bundled/database-migration.md +199 -0
- claude_mpm/skills/bundled/docker-containerization.md +194 -0
- claude_mpm/skills/bundled/express-local-dev.md +1429 -0
- claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
- claude_mpm/skills/bundled/git-workflow.md +414 -0
- claude_mpm/skills/bundled/imagemagick.md +204 -0
- claude_mpm/skills/bundled/json-data-handling.md +223 -0
- claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
- claude_mpm/skills/bundled/pdf.md +141 -0
- claude_mpm/skills/bundled/performance-profiling.md +567 -0
- claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
- claude_mpm/skills/bundled/security-scanning.md +327 -0
- claude_mpm/skills/bundled/systematic-debugging.md +473 -0
- claude_mpm/skills/bundled/test-driven-development.md +378 -0
- claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
- claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
- claude_mpm/skills/bundled/xlsx.md +157 -0
- claude_mpm/skills/registry.py +286 -0
- claude_mpm/skills/skill_manager.py +310 -0
- claude_mpm/storage/state_storage.py +15 -15
- claude_mpm/tools/code_tree_analyzer.py +177 -141
- claude_mpm/tools/code_tree_events.py +4 -2
- claude_mpm/utils/agent_dependency_loader.py +40 -20
- claude_mpm/utils/display_helper.py +260 -0
- claude_mpm/utils/git_analyzer.py +407 -0
- claude_mpm/utils/robust_installer.py +73 -19
- {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +129 -12
- {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +295 -193
- claude_mpm/dashboard/static/css/code-tree.css +0 -1639
- claude_mpm/dashboard/static/index-hub-backup.html +0 -713
- claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
- claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
- claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
- claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
- claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
- claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
- claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
- claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
- claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
- claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
- claude_mpm/services/project/analyzer_refactored.py +0 -450
- {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
- {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.7.4.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Restart Policy for Claude MPM Framework
|
|
3
|
+
========================================
|
|
4
|
+
|
|
5
|
+
WHY: Implements intelligent restart policies with exponential backoff,
|
|
6
|
+
max attempts, and circuit breaker patterns to prevent restart loops.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses exponential backoff with configurable parameters
|
|
9
|
+
and circuit breaker state transitions (CLOSED → OPEN → HALF_OPEN).
|
|
10
|
+
Tracks restart history per deployment for policy decisions.
|
|
11
|
+
|
|
12
|
+
ARCHITECTURE:
|
|
13
|
+
- Exponential backoff: 0 for the first attempt, then initial * (multiplier ** (attempt - 2)), capped at the configured maximum
|
|
14
|
+
- Circuit breaker states: CLOSED, OPEN, HALF_OPEN
|
|
15
|
+
- Failure window tracking for circuit breaker trip detection
|
|
16
|
+
- Thread-safe restart history management
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
config = RestartConfig(
|
|
20
|
+
max_attempts=5,
|
|
21
|
+
initial_backoff_seconds=2.0,
|
|
22
|
+
circuit_breaker_threshold=3
|
|
23
|
+
)
|
|
24
|
+
policy = RestartPolicy(config)
|
|
25
|
+
|
|
26
|
+
if policy.should_restart(deployment_id):
|
|
27
|
+
backoff = policy.calculate_backoff(deployment_id)
|
|
28
|
+
time.sleep(backoff)
|
|
29
|
+
# Perform restart
|
|
30
|
+
policy.record_restart_attempt(deployment_id, success=True)
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
import threading
|
|
34
|
+
from datetime import datetime, timedelta, timezone
|
|
35
|
+
from typing import Dict, Optional
|
|
36
|
+
|
|
37
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
38
|
+
from claude_mpm.services.core.interfaces.restart import IRestartPolicy
|
|
39
|
+
from claude_mpm.services.core.models.restart import (
|
|
40
|
+
CircuitBreakerState,
|
|
41
|
+
RestartAttempt,
|
|
42
|
+
RestartConfig,
|
|
43
|
+
RestartHistory,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class RestartPolicy(SyncBaseService, IRestartPolicy):
    """
    Restart policy with exponential backoff and circuit breaker.

    WHY: Prevents restart loops through intelligent policy decisions.
    Implements exponential backoff to give services time to recover
    and circuit breaker to block restarts after repeated failures.

    Circuit breaker transitions implemented by this class:
    - CLOSED -> OPEN: the number of failures recorded inside one failure
      window reaches ``config.circuit_breaker_threshold``.
    - OPEN -> HALF_OPEN: ``should_restart`` is called once
      ``config.circuit_breaker_reset_seconds`` have elapsed since the start
      of the current failure window.
    - HALF_OPEN -> CLOSED: a restart attempt is recorded as successful.
    - HALF_OPEN -> OPEN: a restart attempt is recorded as failed.

    Thread Safety: All public methods acquire ``self._lock`` (a plain,
    non-reentrant ``threading.Lock``). Private helpers never acquire it and
    must only be called while the lock is already held.
    """

    def __init__(self, config: RestartConfig):
        """
        Initialize restart policy.

        Args:
            config: Restart configuration
        """
        super().__init__("RestartPolicy")
        self.config = config
        self._lock = threading.Lock()

        # Restart history per deployment
        self._history: Dict[str, RestartHistory] = {}

    def initialize(self) -> bool:
        """
        Initialize the restart policy.

        Only logs the effective configuration; no other state is touched.

        Returns:
            True if initialization successful
        """
        self.logger.info(
            f"Initializing RestartPolicy with config: "
            f"max_attempts={self.config.max_attempts}, "
            f"backoff={self.config.initial_backoff_seconds}s-{self.config.max_backoff_seconds}s, "
            f"circuit_breaker={self.config.circuit_breaker_threshold} failures"
        )
        return True

    def should_restart(self, deployment_id: str) -> bool:
        """
        Determine if a deployment should be restarted.

        WHY: This is the gate callers consult before every restart, so it is
        also where an OPEN circuit breaker is given the chance to move to
        HALF_OPEN once its cooldown has elapsed. Without that check here an
        OPEN breaker would block restarts indefinitely.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True if restart should proceed
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)

            # An OPEN breaker may have finished its cooldown since the last
            # recorded failure; re-evaluate before deciding to block.
            if history.circuit_breaker_state == CircuitBreakerState.OPEN:
                self._check_circuit_breaker_reset(
                    history, datetime.now(timezone.utc)
                )

            # Check circuit breaker state
            if history.circuit_breaker_state == CircuitBreakerState.OPEN:
                self.logger.warning(
                    f"Restart blocked for {deployment_id}: circuit breaker OPEN"
                )
                return False

            # Check max attempts
            attempt_count = history.get_attempt_count()
            if attempt_count >= self.config.max_attempts:
                self.logger.warning(
                    f"Restart blocked for {deployment_id}: "
                    f"max attempts reached ({attempt_count}/{self.config.max_attempts})"
                )
                return False

            # Allow restart
            self.logger.debug(
                f"Restart allowed for {deployment_id}: "
                f"attempt {attempt_count + 1}/{self.config.max_attempts}, "
                f"circuit breaker {history.circuit_breaker_state.value}"
            )
            return True

    def calculate_backoff(self, deployment_id: str) -> float:
        """
        Calculate backoff time in seconds for next restart.

        WHY: Implements exponential backoff. For attempt N, backoff = initial * (multiplier ^ (N-2)).
        Attempt 1 has no backoff (0), attempt 2 gets initial backoff, etc.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Backoff time in seconds (0 if first attempt)
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            attempt_number = history.get_attempt_count() + 1

            # First attempt has no backoff
            if attempt_number == 1:
                return 0.0

            backoff = self._compute_backoff(attempt_number)

            self.logger.debug(
                f"Calculated backoff for {deployment_id} "
                f"(attempt {attempt_number}): {backoff:.1f}s"
            )
            return backoff

    def record_restart_attempt(
        self, deployment_id: str, success: bool, failure_reason: Optional[str] = None
    ) -> None:
        """
        Record a restart attempt and update circuit breaker state.

        The attempt is stamped with the current UTC time for both its start
        and completion, and stores the backoff that applies to its attempt
        number.

        Args:
            deployment_id: Unique deployment identifier
            success: Whether restart succeeded
            failure_reason: Optional reason for failure
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            now = datetime.now(timezone.utc)

            # Number of the attempt being recorded (count before insertion + 1)
            attempt_number = history.get_attempt_count() + 1

            # Create restart attempt record
            attempt = RestartAttempt(
                attempt_number=attempt_number,
                deployment_id=deployment_id,
                started_at=now,
                completed_at=now,
                success=success,
                failure_reason=failure_reason,
                backoff_seconds=self._compute_backoff(attempt_number),
            )

            # Add to history (prepend for newest-first ordering)
            history.attempts.insert(0, attempt)

            # Update circuit breaker based on result
            if success:
                self._handle_successful_restart(history)
            else:
                self._handle_failed_restart(history, now)

            self.logger.info(
                f"Recorded restart attempt for {deployment_id}: "
                f"attempt {attempt.attempt_number}, success={success}, "
                f"circuit breaker={history.circuit_breaker_state.value}"
            )

    def reset_restart_history(self, deployment_id: str) -> None:
        """
        Reset restart history for a deployment.

        Removes the stored history entirely; a later call for the same
        deployment starts from a freshly created ``RestartHistory``.
        Unknown deployment ids are ignored.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            if deployment_id in self._history:
                del self._history[deployment_id]
                self.logger.info(
                    f"Reset restart history for deployment: {deployment_id}"
                )

    def get_circuit_breaker_state(self, deployment_id: str) -> str:
        """
        Get current circuit breaker state.

        Note: creates an empty history entry if the deployment is unknown.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            The ``.value`` of the deployment's current ``CircuitBreakerState``
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            return history.circuit_breaker_state.value

    def get_restart_attempt_count(self, deployment_id: str) -> int:
        """
        Get number of restart attempts for a deployment.

        Note: creates an empty history entry if the deployment is unknown.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Number of restart attempts
        """
        with self._lock:
            history = self._get_or_create_history(deployment_id)
            return history.get_attempt_count()

    def get_history(self, deployment_id: str) -> Optional[RestartHistory]:
        """
        Get restart history for a deployment.

        Unlike the other getters this does not create a missing entry. The
        returned object is the live instance held by the policy, not a copy.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            RestartHistory if exists, None otherwise
        """
        with self._lock:
            return self._history.get(deployment_id)

    def shutdown(self) -> bool:
        """
        Shutdown the restart policy.

        Drops the restart history of every deployment.

        Returns:
            True if shutdown successful
        """
        with self._lock:
            self._history.clear()
            self.logger.info("RestartPolicy shutdown successfully")
            return True

    def _compute_backoff(self, attempt_number: int) -> float:
        """
        Compute the capped exponential backoff for a given attempt number.

        Pure calculation shared by ``calculate_backoff`` and
        ``record_restart_attempt`` so both always agree.

        Args:
            attempt_number: 1-based number of the restart attempt

        Returns:
            0.0 for the first attempt, otherwise
            initial * (multiplier ** (attempt_number - 2)) capped at
            ``config.max_backoff_seconds``
        """
        if attempt_number <= 1:
            return 0.0

        # attempt 2 = initial, attempt 3 = initial*multiplier, etc.
        backoff = self.config.initial_backoff_seconds * (
            self.config.backoff_multiplier ** (attempt_number - 2)
        )

        # Cap at max backoff
        return min(backoff, self.config.max_backoff_seconds)

    def _get_or_create_history(self, deployment_id: str) -> RestartHistory:
        """
        Get or create restart history for a deployment.

        Caller must hold ``self._lock``.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            RestartHistory instance
        """
        if deployment_id not in self._history:
            self._history[deployment_id] = RestartHistory(deployment_id=deployment_id)
        return self._history[deployment_id]

    def _handle_successful_restart(self, history: RestartHistory) -> None:
        """
        Handle successful restart attempt.

        WHY: Success transitions circuit breaker from HALF_OPEN → CLOSED
        and resets failure window tracking. Any other state is left
        unchanged; only the failure window is cleared.

        Args:
            history: Restart history to update
        """
        # Reset circuit breaker on success
        if history.circuit_breaker_state == CircuitBreakerState.HALF_OPEN:
            history.circuit_breaker_state = CircuitBreakerState.CLOSED
            self.logger.info(
                f"Circuit breaker CLOSED for {history.deployment_id} after successful restart"
            )

        # Reset failure window
        history.failure_count_in_window = 0
        history.last_failure_window_start = None

    def _handle_failed_restart(self, history: RestartHistory, now: datetime) -> None:
        """
        Handle failed restart attempt.

        WHY: Tracks failures in time window and trips circuit breaker
        if threshold exceeded. A failure while HALF_OPEN means the probe
        restart did not recover the service, so the breaker re-opens at once
        and a new window starts at ``now`` (the cooldown evaluated by
        ``should_restart`` is measured from the window start).

        Args:
            history: Restart history to update
            now: Current timestamp
        """
        if history.circuit_breaker_state == CircuitBreakerState.HALF_OPEN:
            history.last_failure_window_start = now
            history.failure_count_in_window = 1
            history.circuit_breaker_state = CircuitBreakerState.OPEN
            self.logger.warning(
                f"Circuit breaker OPEN for {history.deployment_id}: "
                f"restart failed while HALF_OPEN"
            )
            return

        window_start = history.last_failure_window_start
        still_in_window = window_start is not None and now <= window_start + timedelta(
            seconds=self.config.circuit_breaker_window_seconds
        )

        if still_in_window:
            # Still in window, increment count
            history.failure_count_in_window += 1
        else:
            # No window yet or window expired, start new window
            history.last_failure_window_start = now
            history.failure_count_in_window = 1

        # Check if we should trip the circuit breaker
        if (
            history.failure_count_in_window >= self.config.circuit_breaker_threshold
            and history.circuit_breaker_state != CircuitBreakerState.OPEN
        ):
            history.circuit_breaker_state = CircuitBreakerState.OPEN
            self.logger.warning(
                f"Circuit breaker OPEN for {history.deployment_id}: "
                f"{history.failure_count_in_window} failures in "
                f"{self.config.circuit_breaker_window_seconds}s window"
            )

    def _check_circuit_breaker_reset(
        self, history: RestartHistory, now: datetime
    ) -> None:
        """
        Check if circuit breaker should reset to HALF_OPEN.

        WHY: After cooldown period, allow one restart attempt to test
        if service has recovered.

        The cooldown is measured from ``history.last_failure_window_start``;
        when no failure window is recorded nothing happens. The current state
        is not inspected here, so callers are expected to invoke this only
        while the breaker is OPEN. Caller must hold ``self._lock``.

        Args:
            history: Restart history to check
            now: Current timestamp
        """
        if history.last_failure_window_start is None:
            return

        # Calculate reset time
        reset_time = history.last_failure_window_start + timedelta(
            seconds=self.config.circuit_breaker_reset_seconds
        )

        # Check if cooldown period has elapsed
        if now >= reset_time:
            history.circuit_breaker_state = CircuitBreakerState.HALF_OPEN
            self.logger.info(
                f"Circuit breaker HALF_OPEN for {history.deployment_id} "
                f"after {self.config.circuit_breaker_reset_seconds}s cooldown"
            )
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
# Names exported by `from <this module> import *`.
__all__ = ["RestartPolicy"]
|