claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/BASE_ENGINEER.md +286 -0
- claude_mpm/agents/BASE_PM.md +48 -17
- claude_mpm/agents/OUTPUT_STYLE.md +329 -11
- claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
- claude_mpm/agents/agent_loader.py +17 -5
- claude_mpm/agents/frontmatter_validator.py +284 -253
- claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
- claude_mpm/agents/templates/api_qa.json +7 -1
- claude_mpm/agents/templates/clerk-ops.json +8 -1
- claude_mpm/agents/templates/code_analyzer.json +4 -1
- claude_mpm/agents/templates/dart_engineer.json +11 -1
- claude_mpm/agents/templates/data_engineer.json +11 -1
- claude_mpm/agents/templates/documentation.json +6 -1
- claude_mpm/agents/templates/engineer.json +18 -1
- claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
- claude_mpm/agents/templates/golang_engineer.json +11 -1
- claude_mpm/agents/templates/java_engineer.json +12 -2
- claude_mpm/agents/templates/local_ops_agent.json +1217 -6
- claude_mpm/agents/templates/nextjs_engineer.json +11 -1
- claude_mpm/agents/templates/ops.json +8 -1
- claude_mpm/agents/templates/php-engineer.json +11 -1
- claude_mpm/agents/templates/project_organizer.json +10 -3
- claude_mpm/agents/templates/prompt-engineer.json +5 -1
- claude_mpm/agents/templates/python_engineer.json +11 -1
- claude_mpm/agents/templates/qa.json +7 -1
- claude_mpm/agents/templates/react_engineer.json +11 -1
- claude_mpm/agents/templates/refactoring_engineer.json +8 -1
- claude_mpm/agents/templates/research.json +4 -1
- claude_mpm/agents/templates/ruby-engineer.json +11 -1
- claude_mpm/agents/templates/rust_engineer.json +11 -1
- claude_mpm/agents/templates/security.json +6 -1
- claude_mpm/agents/templates/svelte-engineer.json +225 -0
- claude_mpm/agents/templates/ticketing.json +6 -1
- claude_mpm/agents/templates/typescript_engineer.json +11 -1
- claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
- claude_mpm/agents/templates/version_control.json +8 -1
- claude_mpm/agents/templates/web_qa.json +7 -1
- claude_mpm/agents/templates/web_ui.json +11 -1
- claude_mpm/cli/__init__.py +34 -706
- claude_mpm/cli/commands/agent_manager.py +25 -12
- claude_mpm/cli/commands/agent_state_manager.py +186 -0
- claude_mpm/cli/commands/agents.py +204 -148
- claude_mpm/cli/commands/aggregate.py +7 -3
- claude_mpm/cli/commands/analyze.py +9 -4
- claude_mpm/cli/commands/analyze_code.py +7 -2
- claude_mpm/cli/commands/auto_configure.py +7 -9
- claude_mpm/cli/commands/config.py +47 -13
- claude_mpm/cli/commands/configure.py +294 -1788
- claude_mpm/cli/commands/configure_agent_display.py +261 -0
- claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
- claude_mpm/cli/commands/configure_hook_manager.py +225 -0
- claude_mpm/cli/commands/configure_models.py +18 -0
- claude_mpm/cli/commands/configure_navigation.py +167 -0
- claude_mpm/cli/commands/configure_paths.py +104 -0
- claude_mpm/cli/commands/configure_persistence.py +254 -0
- claude_mpm/cli/commands/configure_startup_manager.py +646 -0
- claude_mpm/cli/commands/configure_template_editor.py +497 -0
- claude_mpm/cli/commands/configure_validators.py +73 -0
- claude_mpm/cli/commands/local_deploy.py +537 -0
- claude_mpm/cli/commands/memory.py +54 -20
- claude_mpm/cli/commands/mpm_init.py +39 -25
- claude_mpm/cli/commands/mpm_init_handler.py +8 -3
- claude_mpm/cli/executor.py +202 -0
- claude_mpm/cli/helpers.py +105 -0
- claude_mpm/cli/interactive/__init__.py +3 -0
- claude_mpm/cli/interactive/skills_wizard.py +491 -0
- claude_mpm/cli/parsers/__init__.py +7 -1
- claude_mpm/cli/parsers/base_parser.py +98 -3
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/cli/shared/output_formatters.py +28 -19
- claude_mpm/cli/startup.py +481 -0
- claude_mpm/cli/utils.py +52 -1
- claude_mpm/commands/mpm-help.md +3 -0
- claude_mpm/commands/mpm-version.md +113 -0
- claude_mpm/commands/mpm.md +1 -0
- claude_mpm/config/agent_config.py +2 -2
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/base_service.py +13 -12
- claude_mpm/core/enums.py +452 -0
- claude_mpm/core/factories.py +1 -1
- claude_mpm/core/instruction_reinforcement_hook.py +2 -1
- claude_mpm/core/interactive_session.py +9 -3
- claude_mpm/core/logging_config.py +6 -2
- claude_mpm/core/oneshot_session.py +8 -4
- claude_mpm/core/optimized_agent_loader.py +3 -3
- claude_mpm/core/output_style_manager.py +12 -192
- claude_mpm/core/service_registry.py +5 -1
- claude_mpm/core/types.py +2 -9
- claude_mpm/core/typing_utils.py +7 -6
- claude_mpm/dashboard/static/js/dashboard.js +0 -14
- claude_mpm/dashboard/templates/index.html +3 -41
- claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
- claude_mpm/hooks/instruction_reinforcement.py +7 -2
- claude_mpm/models/resume_log.py +340 -0
- claude_mpm/services/agents/auto_config_manager.py +10 -11
- claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
- claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
- claude_mpm/services/agents/deployment/agent_validator.py +17 -1
- claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
- claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
- claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
- claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
- claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
- claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
- claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
- claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
- claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
- claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
- claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
- claude_mpm/services/agents/local_template_manager.py +1 -1
- claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
- claude_mpm/services/agents/registry/modification_tracker.py +5 -2
- claude_mpm/services/command_handler_service.py +11 -5
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +33 -0
- claude_mpm/services/core/models/agent_config.py +12 -28
- claude_mpm/services/core/models/health.py +162 -0
- claude_mpm/services/core/models/process.py +235 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/core/path_resolver.py +23 -7
- claude_mpm/services/diagnostics/__init__.py +2 -2
- claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
- claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
- claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
- claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
- claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
- claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
- claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
- claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
- claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
- claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
- claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
- claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
- claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
- claude_mpm/services/diagnostics/models.py +19 -24
- claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
- claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
- claude_mpm/services/infrastructure/monitoring/base.py +5 -13
- claude_mpm/services/infrastructure/monitoring/network.py +7 -6
- claude_mpm/services/infrastructure/monitoring/process.py +13 -12
- claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
- claude_mpm/services/infrastructure/monitoring/service.py +16 -15
- claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +372 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/mcp_config_manager.py +9 -4
- claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
- claude_mpm/services/mcp_gateway/core/base.py +18 -31
- claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
- claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
- claude_mpm/services/memory_hook_service.py +4 -1
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- claude_mpm/services/monitor/daemon_manager.py +3 -2
- claude_mpm/services/monitor/handlers/dashboard.py +2 -1
- claude_mpm/services/monitor/handlers/hooks.py +2 -1
- claude_mpm/services/monitor/management/lifecycle.py +3 -2
- claude_mpm/services/monitor/server.py +2 -1
- claude_mpm/services/session_management_service.py +3 -2
- claude_mpm/services/session_manager.py +205 -1
- claude_mpm/services/shared/async_service_base.py +16 -27
- claude_mpm/services/shared/lifecycle_service_base.py +1 -14
- claude_mpm/services/socketio/handlers/__init__.py +5 -2
- claude_mpm/services/socketio/handlers/hook.py +13 -2
- claude_mpm/services/socketio/handlers/registry.py +4 -2
- claude_mpm/services/socketio/server/main.py +10 -8
- claude_mpm/services/subprocess_launcher_service.py +14 -5
- claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
- claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
- claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
- claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
- claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
- claude_mpm/services/unified/deployment_strategies/local.py +6 -5
- claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
- claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
- claude_mpm/services/unified/interfaces.py +3 -1
- claude_mpm/services/unified/unified_analyzer.py +14 -10
- claude_mpm/services/unified/unified_config.py +2 -1
- claude_mpm/services/unified/unified_deployment.py +9 -4
- claude_mpm/services/version_service.py +104 -1
- claude_mpm/skills/__init__.py +21 -0
- claude_mpm/skills/bundled/__init__.py +6 -0
- claude_mpm/skills/bundled/api-documentation.md +393 -0
- claude_mpm/skills/bundled/async-testing.md +571 -0
- claude_mpm/skills/bundled/code-review.md +143 -0
- claude_mpm/skills/bundled/database-migration.md +199 -0
- claude_mpm/skills/bundled/docker-containerization.md +194 -0
- claude_mpm/skills/bundled/express-local-dev.md +1429 -0
- claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
- claude_mpm/skills/bundled/git-workflow.md +414 -0
- claude_mpm/skills/bundled/imagemagick.md +204 -0
- claude_mpm/skills/bundled/json-data-handling.md +223 -0
- claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
- claude_mpm/skills/bundled/pdf.md +141 -0
- claude_mpm/skills/bundled/performance-profiling.md +567 -0
- claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
- claude_mpm/skills/bundled/security-scanning.md +327 -0
- claude_mpm/skills/bundled/systematic-debugging.md +473 -0
- claude_mpm/skills/bundled/test-driven-development.md +378 -0
- claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
- claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
- claude_mpm/skills/bundled/xlsx.md +157 -0
- claude_mpm/skills/registry.py +286 -0
- claude_mpm/skills/skill_manager.py +310 -0
- claude_mpm/tools/code_tree_analyzer.py +177 -141
- claude_mpm/tools/code_tree_events.py +4 -2
- claude_mpm/utils/agent_dependency_loader.py +2 -2
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
- claude_mpm/dashboard/static/css/code-tree.css +0 -1639
- claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
- claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
- claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
- claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
- claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
- claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
- claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
- claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
- claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
- claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
- claude_mpm/services/project/analyzer_refactored.py +0 -450
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Crash Detector for Claude MPM Framework
|
|
3
|
+
========================================
|
|
4
|
+
|
|
5
|
+
WHY: Detects process crashes and failures by monitoring health status changes,
|
|
6
|
+
process exits, and zombie states. Integrates with HealthCheckManager to receive
|
|
7
|
+
real-time status updates.
|
|
8
|
+
|
|
9
|
+
DESIGN DECISION: Uses callback-based architecture to receive health status
|
|
10
|
+
changes from HealthCheckManager. Tracks crash history per deployment to
|
|
11
|
+
enable pattern detection and intelligent restart policies.
|
|
12
|
+
|
|
13
|
+
ARCHITECTURE:
|
|
14
|
+
- Subscribes to HealthCheckManager status change callbacks
|
|
15
|
+
- Detects crashes when status transitions to UNHEALTHY
|
|
16
|
+
- Tracks crash count per deployment
|
|
17
|
+
- Invokes registered crash callbacks when crash detected
|
|
18
|
+
|
|
19
|
+
USAGE:
|
|
20
|
+
crash_detector = CrashDetector(health_manager)
|
|
21
|
+
crash_detector.register_crash_callback(handle_crash)
|
|
22
|
+
crash_detector.start_monitoring(deployment_id)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import threading
|
|
26
|
+
from collections import defaultdict
|
|
27
|
+
from typing import Callable, Dict, List, Set
|
|
28
|
+
|
|
29
|
+
from claude_mpm.core.enums import HealthStatus
|
|
30
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
31
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheckManager
|
|
32
|
+
from claude_mpm.services.core.interfaces.restart import ICrashDetector
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class CrashDetector(SyncBaseService, ICrashDetector):
    """
    Detects process crashes via health status monitoring.

    WHY: Provides automated crash detection by monitoring health status
    changes. Enables reactive restart policies based on crash events.

    Thread Safety: Internal state is guarded by a non-reentrant
    ``threading.Lock``. The lock is never held while calling out to the
    health manager or to registered crash callbacks, so those callbacks may
    call back into this detector's public methods without deadlocking.
    """

    def __init__(self, health_manager: IHealthCheckManager):
        """
        Initialize crash detector.

        Args:
            health_manager: Health check manager for status monitoring
        """
        super().__init__("CrashDetector")
        self.health_manager = health_manager
        self._lock = threading.Lock()

        # Deployments being monitored
        self._monitored_deployments: Set[str] = set()

        # Last known health status per deployment
        self._last_health_status: Dict[str, HealthStatus] = {}

        # Crash count per deployment
        self._crash_count: Dict[str, int] = defaultdict(int)

        # Crash callbacks: List of functions called with (deployment_id, reason)
        self._crash_callbacks: List[Callable[[str, str], None]] = []

    def initialize(self) -> bool:
        """
        Initialize the crash detector.

        Registers ``_on_health_status_change`` with the health manager so
        that status transitions are delivered to this detector.

        Returns:
            True if initialization successful
        """
        self.logger.info("Initializing CrashDetector")

        # Register with health manager to receive status change callbacks
        self.health_manager.register_status_callback(self._on_health_status_change)

        self.logger.info("CrashDetector initialized successfully")
        return True

    def register_crash_callback(self, callback: Callable[[str, str], None]) -> None:
        """
        Register a callback to be invoked when a crash is detected.

        Args:
            callback: Function called with (deployment_id, reason)
        """
        with self._lock:
            self._crash_callbacks.append(callback)
        self.logger.debug(
            f"Registered crash callback: {self._callback_name(callback)}"
        )

    def start_monitoring(self, deployment_id: str) -> None:
        """
        Start monitoring a deployment for crashes.

        The deployment is marked as monitored first, then its initial health
        status is fetched from the health manager. Any failure while fetching
        the status is logged and the status is recorded as UNKNOWN; no
        exception is propagated to the caller.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._monitored_deployments.add(deployment_id)
            self.logger.info(
                f"Started crash monitoring for deployment: {deployment_id}"
            )

        # Get initial health status.
        # WHY outside the lock: the health check may be slow, and a health
        # manager that reports status changes synchronously would re-enter
        # _on_health_status_change, which needs the same non-reentrant lock.
        try:
            health = self.health_manager.check_health(deployment_id)
            initial_status = health.overall_status
            self.logger.debug(
                f"Initial health status for {deployment_id}: {initial_status.value}"
            )
        except Exception as e:
            self.logger.warning(
                f"Failed to get initial health status for {deployment_id}: {e}"
            )
            initial_status = HealthStatus.UNKNOWN

        with self._lock:
            # Monitoring may have been stopped while the check was running;
            # do not resurrect state for a deployment that is no longer tracked.
            if deployment_id in self._monitored_deployments:
                self._last_health_status[deployment_id] = initial_status

    def stop_monitoring(self, deployment_id: str) -> None:
        """
        Stop monitoring a deployment.

        The crash count for the deployment is kept; only the monitoring flag
        and the last known health status are dropped.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._monitored_deployments.discard(deployment_id)
            self._last_health_status.pop(deployment_id, None)
            self.logger.info(
                f"Stopped crash monitoring for deployment: {deployment_id}"
            )

    def is_monitoring(self, deployment_id: str) -> bool:
        """
        Check if a deployment is being monitored.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            True if deployment is being monitored
        """
        with self._lock:
            return deployment_id in self._monitored_deployments

    def get_crash_count(self, deployment_id: str) -> int:
        """
        Get the number of crashes detected for a deployment.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Number of crashes detected (0 if none recorded)
        """
        with self._lock:
            # .get() avoids creating a defaultdict entry for unknown ids.
            return self._crash_count.get(deployment_id, 0)

    def reset_crash_count(self, deployment_id: str) -> None:
        """
        Reset crash count for a deployment.

        WHY: Allows manual intervention to clear crash history.

        Args:
            deployment_id: Unique deployment identifier
        """
        with self._lock:
            self._crash_count[deployment_id] = 0
        self.logger.debug(f"Reset crash count for deployment: {deployment_id}")

    def shutdown(self) -> bool:
        """
        Shutdown the crash detector.

        Clears all monitored deployments, last known statuses, crash counts
        and registered callbacks.

        Returns:
            True if shutdown successful
        """
        with self._lock:
            self._monitored_deployments.clear()
            self._last_health_status.clear()
            self._crash_count.clear()
            self._crash_callbacks.clear()
        self.logger.info("CrashDetector shutdown successfully")
        return True

    def _on_health_status_change(
        self, deployment_id: str, old_status: HealthStatus, new_status: HealthStatus
    ) -> None:
        """
        Handle health status changes from HealthCheckManager.

        WHY: Callback invoked by HealthCheckManager when status changes.
        Detects crashes when status transitions to UNHEALTHY.

        Args:
            deployment_id: Unique deployment identifier
            old_status: Previous health status
            new_status: New health status
        """
        reason = None
        crash_count = 0
        callbacks: List[Callable[[str, str], None]] = []

        with self._lock:
            # Only process if we're monitoring this deployment
            if deployment_id not in self._monitored_deployments:
                return

            # Update last known status
            self._last_health_status[deployment_id] = new_status

            # Detect crash: transition from operational to UNHEALTHY
            if old_status.is_operational() and new_status.is_critical():
                reason = "Health status transitioned to UNHEALTHY"
            # Also detect: transition from UNKNOWN to UNHEALTHY (process died)
            elif (
                old_status == HealthStatus.UNKNOWN
                and new_status == HealthStatus.UNHEALTHY
            ):
                reason = "Process became unhealthy"

            if reason is not None:
                # Record the crash atomically with the status update.
                crash_count, callbacks = self._record_crash_locked(deployment_id)

        # Notify outside the lock so callbacks can safely call back into
        # this detector (e.g. get_crash_count) without deadlocking.
        if reason is not None:
            self._notify_crash(deployment_id, reason, crash_count, callbacks)

    def _handle_crash(self, deployment_id: str, reason: str) -> None:
        """
        Handle detected crash.

        WHY: Increments crash count and invokes all registered callbacks.

        Must be called WITHOUT ``self._lock`` held: the lock is taken here to
        update the crash count, and released before callbacks run.

        Args:
            deployment_id: Unique deployment identifier
            reason: Reason for crash detection
        """
        with self._lock:
            crash_count, callbacks = self._record_crash_locked(deployment_id)
        self._notify_crash(deployment_id, reason, crash_count, callbacks)

    def _record_crash_locked(self, deployment_id: str) -> tuple:
        """
        Increment the crash count for a deployment.

        Caller must hold ``self._lock``.

        Args:
            deployment_id: Unique deployment identifier

        Returns:
            Tuple of (new crash count, snapshot list of crash callbacks)
        """
        self._crash_count[deployment_id] += 1
        # Snapshot so later registrations/shutdown cannot mutate the list
        # while it is being iterated outside the lock.
        return self._crash_count[deployment_id], list(self._crash_callbacks)

    def _notify_crash(
        self,
        deployment_id: str,
        reason: str,
        crash_count: int,
        callbacks: List[Callable[[str, str], None]],
    ) -> None:
        """
        Log a detected crash and invoke the given callbacks.

        A failing callback is logged and does not prevent the remaining
        callbacks from running.

        Args:
            deployment_id: Unique deployment identifier
            reason: Reason for crash detection
            crash_count: Crash count after recording this crash
            callbacks: Callbacks to invoke with (deployment_id, reason)
        """
        self.logger.warning(
            f"Crash detected for deployment {deployment_id} "
            f"(count: {crash_count}): {reason}"
        )

        # Invoke all crash callbacks
        for callback in callbacks:
            try:
                callback(deployment_id, reason)
            except Exception as e:
                self.logger.error(
                    f"Error invoking crash callback {self._callback_name(callback)}: {e}",
                    exc_info=True,
                )

    @staticmethod
    def _callback_name(callback: Callable[[str, str], None]) -> str:
        """
        Return a printable name for a callback.

        WHY: Not every callable has ``__name__`` (e.g. ``functools.partial``
        objects or instances defining ``__call__``); fall back to ``repr``.
        """
        return getattr(callback, "__name__", repr(callback))
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
__all__ = ["CrashDetector"]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Health Check Implementations for Local Operations
|
|
3
|
+
==================================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides three-tier health monitoring for local deployments:
|
|
6
|
+
- HTTP health checks for endpoint availability
|
|
7
|
+
- Process health checks for process status
|
|
8
|
+
- Resource health checks for CPU/memory/connections
|
|
9
|
+
|
|
10
|
+
ARCHITECTURE:
|
|
11
|
+
- HttpHealthCheck: HTTP endpoint availability and response time
|
|
12
|
+
- ProcessHealthCheck: Process existence and status validation
|
|
13
|
+
- ResourceHealthCheck: CPU, memory, and connection monitoring
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from claude_mpm.services.local_ops.health_checks.http_check import HttpHealthCheck
|
|
17
|
+
from claude_mpm.services.local_ops.health_checks.process_check import (
|
|
18
|
+
ProcessHealthCheck,
|
|
19
|
+
)
|
|
20
|
+
from claude_mpm.services.local_ops.health_checks.resource_check import (
|
|
21
|
+
ResourceHealthCheck,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"HttpHealthCheck",
|
|
26
|
+
"ProcessHealthCheck",
|
|
27
|
+
"ResourceHealthCheck",
|
|
28
|
+
]
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HTTP Health Check for Claude MPM Framework
|
|
3
|
+
===========================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides HTTP endpoint health monitoring with response time measurement,
|
|
6
|
+
status code validation, and timeout handling.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses requests library with configurable timeout and retry logic.
|
|
9
|
+
Supports custom headers and SSL/TLS validation.
|
|
10
|
+
|
|
11
|
+
ARCHITECTURE:
|
|
12
|
+
- Synchronous HTTP GET requests
|
|
13
|
+
- Response time measurement with time.perf_counter()
|
|
14
|
+
- Status code validation (2xx/3xx = healthy)
|
|
15
|
+
- Timeout and connection error handling
|
|
16
|
+
- Retry logic with exponential backoff
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
http_check = HttpHealthCheck(process_manager)
|
|
20
|
+
result = http_check.check(
|
|
21
|
+
deployment_id="my-app",
|
|
22
|
+
endpoint="http://localhost:3000/health",
|
|
23
|
+
timeout=5.0
|
|
24
|
+
)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import time
|
|
28
|
+
|
|
29
|
+
import requests
|
|
30
|
+
from requests.exceptions import ConnectionError, RequestException, Timeout
|
|
31
|
+
|
|
32
|
+
from claude_mpm.core.enums import HealthStatus
|
|
33
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
34
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheck
|
|
35
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
36
|
+
from claude_mpm.services.core.models.health import HealthCheckResult
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class HttpHealthCheck(SyncBaseService, IHealthCheck):
    """
    HTTP endpoint health check implementation.

    WHY: Validates that deployed services are accessible via HTTP and
    responding within acceptable timeframes.

    Thread Safety: ``check`` keeps no per-call state on the instance; it only
    reads the configuration set in ``__init__``.
    """

    # Base delay in seconds for exponential backoff between retries:
    # the delay before retry k (0-based) is _BACKOFF_BASE_SECONDS * 2**k.
    _BACKOFF_BASE_SECONDS = 0.5

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        default_timeout: float = 5.0,
        max_retries: int = 2,
    ):
        """
        Initialize HTTP health check.

        Args:
            process_manager: Process manager for deployment lookup
            default_timeout: Default timeout in seconds
            max_retries: Maximum number of retry attempts after the first
                request (negative values are treated as 0 by ``check``)
        """
        super().__init__("HttpHealthCheck")
        self.process_manager = process_manager
        self.default_timeout = default_timeout
        self.max_retries = max_retries

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("HTTP health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "http"

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute HTTP health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - endpoint: HTTP endpoint URL. When omitted, defaults to
                  ``http://localhost:<deployment.port>/health`` if the
                  deployment has a port.
                - timeout: Request timeout in seconds. Omitted or ``None``
                  falls back to ``default_timeout`` so a check can never
                  block indefinitely.
                - headers: Custom HTTP headers
                - verify_ssl: Verify SSL certificates (default: True)
                - expected_status: Additional status code to accept as
                  healthy (default: 200). Any 2xx/3xx response is always
                  treated as healthy.

        Returns:
            HealthCheckResult with check status and details:
                - HEALTHY: response status is 2xx/3xx or ``expected_status``
                - DEGRADED: any other status code, or all attempts timed out
                - UNHEALTHY: connection failed on all attempts, or another
                  request error occurred
                - UNKNOWN: no endpoint given and the deployment has no port

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Get endpoint from kwargs, falling back to the deployment port
        endpoint = kwargs.get("endpoint")
        if not endpoint:
            if not deployment.port:
                return HealthCheckResult(
                    status=HealthStatus.UNKNOWN,
                    check_type=self.get_check_type(),
                    message="No HTTP endpoint configured for deployment",
                    details={"deployment_id": deployment_id},
                )
            endpoint = f"http://localhost:{deployment.port}/health"

        # Get optional parameters
        timeout = kwargs.get("timeout")
        if timeout is None:
            # requests treats timeout=None as "wait forever"; a health check
            # must always be bounded, so use the configured default instead.
            timeout = self.default_timeout
        headers = kwargs.get("headers", {})
        verify_ssl = kwargs.get("verify_ssl", True)
        expected_status = kwargs.get("expected_status", 200)

        # Always make at least one request, even if max_retries is negative.
        total_attempts = max(self.max_retries, 0) + 1

        # Perform HTTP check with retries
        for attempt in range(total_attempts):
            retries_left = attempt < total_attempts - 1
            try:
                start_time = time.perf_counter()
                response = requests.get(
                    endpoint, timeout=timeout, headers=headers, verify=verify_ssl
                )
                response_time = time.perf_counter() - start_time
                response_time_ms = round(response_time * 1000, 2)

                # Check status code
                if response.status_code == expected_status or (
                    200 <= response.status_code < 400
                ):
                    return HealthCheckResult(
                        status=HealthStatus.HEALTHY,
                        check_type=self.get_check_type(),
                        message="HTTP endpoint responding normally",
                        details={
                            "endpoint": endpoint,
                            "status_code": response.status_code,
                            "response_time_ms": response_time_ms,
                            "attempt": attempt + 1,
                        },
                    )
                # An unexpected status is reported immediately, not retried.
                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=self.get_check_type(),
                    message="HTTP endpoint returned unexpected status code",
                    details={
                        "endpoint": endpoint,
                        "status_code": response.status_code,
                        "expected_status": expected_status,
                        "response_time_ms": response_time_ms,
                    },
                )

            # NOTE: Timeout must be handled before ConnectionError because
            # requests' ConnectTimeout derives from both.
            except Timeout:
                if retries_left:
                    self._backoff_before_retry(
                        "HTTP check timeout", deployment_id, attempt
                    )
                    continue

                return HealthCheckResult(
                    status=HealthStatus.DEGRADED,
                    check_type=self.get_check_type(),
                    message=f"HTTP endpoint timeout after {total_attempts} attempts",
                    details={
                        "endpoint": endpoint,
                        "timeout_seconds": timeout,
                        "attempts": total_attempts,
                    },
                )

            # ConnectionError here is requests.exceptions.ConnectionError
            # (imported at module level), not the builtin of the same name.
            except ConnectionError as e:
                if retries_left:
                    self._backoff_before_retry(
                        "HTTP connection error", deployment_id, attempt
                    )
                    continue

                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="Cannot connect to HTTP endpoint",
                    details={
                        "endpoint": endpoint,
                        "error": str(e),
                        "attempts": total_attempts,
                    },
                )

            # Any other request failure is not retried.
            except RequestException as e:
                return HealthCheckResult(
                    status=HealthStatus.UNHEALTHY,
                    check_type=self.get_check_type(),
                    message="HTTP request failed",
                    details={"endpoint": endpoint, "error": str(e)},
                )

        # Should not reach here, but return unknown as fallback
        return HealthCheckResult(
            status=HealthStatus.UNKNOWN,
            check_type=self.get_check_type(),
            message="HTTP check completed with unknown result",
            details={"endpoint": endpoint},
        )

    def _backoff_before_retry(
        self, problem: str, deployment_id: str, attempt: int
    ) -> None:
        """
        Log a retry and sleep with exponential backoff.

        Args:
            problem: Short description of the failure, used as log prefix
            deployment_id: Unique deployment identifier
            attempt: Zero-based index of the attempt that just failed
        """
        self.log_debug(
            f"{problem} for {deployment_id}, "
            f"retrying (attempt {attempt + 1}/{self.max_retries})"
        )
        time.sleep(self._BACKOFF_BASE_SECONDS * (2**attempt))  # Exponential backoff
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
__all__ = ["HttpHealthCheck"]
|