claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/BASE_ENGINEER.md +286 -0
- claude_mpm/agents/BASE_PM.md +48 -17
- claude_mpm/agents/OUTPUT_STYLE.md +329 -11
- claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
- claude_mpm/agents/agent_loader.py +17 -5
- claude_mpm/agents/frontmatter_validator.py +284 -253
- claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
- claude_mpm/agents/templates/api_qa.json +7 -1
- claude_mpm/agents/templates/clerk-ops.json +8 -1
- claude_mpm/agents/templates/code_analyzer.json +4 -1
- claude_mpm/agents/templates/dart_engineer.json +11 -1
- claude_mpm/agents/templates/data_engineer.json +11 -1
- claude_mpm/agents/templates/documentation.json +6 -1
- claude_mpm/agents/templates/engineer.json +18 -1
- claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
- claude_mpm/agents/templates/golang_engineer.json +11 -1
- claude_mpm/agents/templates/java_engineer.json +12 -2
- claude_mpm/agents/templates/local_ops_agent.json +1217 -6
- claude_mpm/agents/templates/nextjs_engineer.json +11 -1
- claude_mpm/agents/templates/ops.json +8 -1
- claude_mpm/agents/templates/php-engineer.json +11 -1
- claude_mpm/agents/templates/project_organizer.json +10 -3
- claude_mpm/agents/templates/prompt-engineer.json +5 -1
- claude_mpm/agents/templates/python_engineer.json +11 -1
- claude_mpm/agents/templates/qa.json +7 -1
- claude_mpm/agents/templates/react_engineer.json +11 -1
- claude_mpm/agents/templates/refactoring_engineer.json +8 -1
- claude_mpm/agents/templates/research.json +4 -1
- claude_mpm/agents/templates/ruby-engineer.json +11 -1
- claude_mpm/agents/templates/rust_engineer.json +11 -1
- claude_mpm/agents/templates/security.json +6 -1
- claude_mpm/agents/templates/svelte-engineer.json +225 -0
- claude_mpm/agents/templates/ticketing.json +6 -1
- claude_mpm/agents/templates/typescript_engineer.json +11 -1
- claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
- claude_mpm/agents/templates/version_control.json +8 -1
- claude_mpm/agents/templates/web_qa.json +7 -1
- claude_mpm/agents/templates/web_ui.json +11 -1
- claude_mpm/cli/__init__.py +34 -706
- claude_mpm/cli/commands/agent_manager.py +25 -12
- claude_mpm/cli/commands/agent_state_manager.py +186 -0
- claude_mpm/cli/commands/agents.py +204 -148
- claude_mpm/cli/commands/aggregate.py +7 -3
- claude_mpm/cli/commands/analyze.py +9 -4
- claude_mpm/cli/commands/analyze_code.py +7 -2
- claude_mpm/cli/commands/auto_configure.py +7 -9
- claude_mpm/cli/commands/config.py +47 -13
- claude_mpm/cli/commands/configure.py +294 -1788
- claude_mpm/cli/commands/configure_agent_display.py +261 -0
- claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
- claude_mpm/cli/commands/configure_hook_manager.py +225 -0
- claude_mpm/cli/commands/configure_models.py +18 -0
- claude_mpm/cli/commands/configure_navigation.py +167 -0
- claude_mpm/cli/commands/configure_paths.py +104 -0
- claude_mpm/cli/commands/configure_persistence.py +254 -0
- claude_mpm/cli/commands/configure_startup_manager.py +646 -0
- claude_mpm/cli/commands/configure_template_editor.py +497 -0
- claude_mpm/cli/commands/configure_validators.py +73 -0
- claude_mpm/cli/commands/local_deploy.py +537 -0
- claude_mpm/cli/commands/memory.py +54 -20
- claude_mpm/cli/commands/mpm_init.py +39 -25
- claude_mpm/cli/commands/mpm_init_handler.py +8 -3
- claude_mpm/cli/executor.py +202 -0
- claude_mpm/cli/helpers.py +105 -0
- claude_mpm/cli/interactive/__init__.py +3 -0
- claude_mpm/cli/interactive/skills_wizard.py +491 -0
- claude_mpm/cli/parsers/__init__.py +7 -1
- claude_mpm/cli/parsers/base_parser.py +98 -3
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/cli/shared/output_formatters.py +28 -19
- claude_mpm/cli/startup.py +481 -0
- claude_mpm/cli/utils.py +52 -1
- claude_mpm/commands/mpm-help.md +3 -0
- claude_mpm/commands/mpm-version.md +113 -0
- claude_mpm/commands/mpm.md +1 -0
- claude_mpm/config/agent_config.py +2 -2
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/base_service.py +13 -12
- claude_mpm/core/enums.py +452 -0
- claude_mpm/core/factories.py +1 -1
- claude_mpm/core/instruction_reinforcement_hook.py +2 -1
- claude_mpm/core/interactive_session.py +9 -3
- claude_mpm/core/logging_config.py +6 -2
- claude_mpm/core/oneshot_session.py +8 -4
- claude_mpm/core/optimized_agent_loader.py +3 -3
- claude_mpm/core/output_style_manager.py +12 -192
- claude_mpm/core/service_registry.py +5 -1
- claude_mpm/core/types.py +2 -9
- claude_mpm/core/typing_utils.py +7 -6
- claude_mpm/dashboard/static/js/dashboard.js +0 -14
- claude_mpm/dashboard/templates/index.html +3 -41
- claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
- claude_mpm/hooks/instruction_reinforcement.py +7 -2
- claude_mpm/models/resume_log.py +340 -0
- claude_mpm/services/agents/auto_config_manager.py +10 -11
- claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
- claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
- claude_mpm/services/agents/deployment/agent_validator.py +17 -1
- claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
- claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
- claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
- claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
- claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
- claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
- claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
- claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
- claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
- claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
- claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
- claude_mpm/services/agents/local_template_manager.py +1 -1
- claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
- claude_mpm/services/agents/registry/modification_tracker.py +5 -2
- claude_mpm/services/command_handler_service.py +11 -5
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +33 -0
- claude_mpm/services/core/models/agent_config.py +12 -28
- claude_mpm/services/core/models/health.py +162 -0
- claude_mpm/services/core/models/process.py +235 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/core/path_resolver.py +23 -7
- claude_mpm/services/diagnostics/__init__.py +2 -2
- claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
- claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
- claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
- claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
- claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
- claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
- claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
- claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
- claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
- claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
- claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
- claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
- claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
- claude_mpm/services/diagnostics/models.py +19 -24
- claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
- claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
- claude_mpm/services/infrastructure/monitoring/base.py +5 -13
- claude_mpm/services/infrastructure/monitoring/network.py +7 -6
- claude_mpm/services/infrastructure/monitoring/process.py +13 -12
- claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
- claude_mpm/services/infrastructure/monitoring/service.py +16 -15
- claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +372 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/mcp_config_manager.py +9 -4
- claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
- claude_mpm/services/mcp_gateway/core/base.py +18 -31
- claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
- claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
- claude_mpm/services/memory_hook_service.py +4 -1
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- claude_mpm/services/monitor/daemon_manager.py +3 -2
- claude_mpm/services/monitor/handlers/dashboard.py +2 -1
- claude_mpm/services/monitor/handlers/hooks.py +2 -1
- claude_mpm/services/monitor/management/lifecycle.py +3 -2
- claude_mpm/services/monitor/server.py +2 -1
- claude_mpm/services/session_management_service.py +3 -2
- claude_mpm/services/session_manager.py +205 -1
- claude_mpm/services/shared/async_service_base.py +16 -27
- claude_mpm/services/shared/lifecycle_service_base.py +1 -14
- claude_mpm/services/socketio/handlers/__init__.py +5 -2
- claude_mpm/services/socketio/handlers/hook.py +13 -2
- claude_mpm/services/socketio/handlers/registry.py +4 -2
- claude_mpm/services/socketio/server/main.py +10 -8
- claude_mpm/services/subprocess_launcher_service.py +14 -5
- claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
- claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
- claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
- claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
- claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
- claude_mpm/services/unified/deployment_strategies/local.py +6 -5
- claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
- claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
- claude_mpm/services/unified/interfaces.py +3 -1
- claude_mpm/services/unified/unified_analyzer.py +14 -10
- claude_mpm/services/unified/unified_config.py +2 -1
- claude_mpm/services/unified/unified_deployment.py +9 -4
- claude_mpm/services/version_service.py +104 -1
- claude_mpm/skills/__init__.py +21 -0
- claude_mpm/skills/bundled/__init__.py +6 -0
- claude_mpm/skills/bundled/api-documentation.md +393 -0
- claude_mpm/skills/bundled/async-testing.md +571 -0
- claude_mpm/skills/bundled/code-review.md +143 -0
- claude_mpm/skills/bundled/database-migration.md +199 -0
- claude_mpm/skills/bundled/docker-containerization.md +194 -0
- claude_mpm/skills/bundled/express-local-dev.md +1429 -0
- claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
- claude_mpm/skills/bundled/git-workflow.md +414 -0
- claude_mpm/skills/bundled/imagemagick.md +204 -0
- claude_mpm/skills/bundled/json-data-handling.md +223 -0
- claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
- claude_mpm/skills/bundled/pdf.md +141 -0
- claude_mpm/skills/bundled/performance-profiling.md +567 -0
- claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
- claude_mpm/skills/bundled/security-scanning.md +327 -0
- claude_mpm/skills/bundled/systematic-debugging.md +473 -0
- claude_mpm/skills/bundled/test-driven-development.md +378 -0
- claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
- claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
- claude_mpm/skills/bundled/xlsx.md +157 -0
- claude_mpm/skills/registry.py +286 -0
- claude_mpm/skills/skill_manager.py +310 -0
- claude_mpm/tools/code_tree_analyzer.py +177 -141
- claude_mpm/tools/code_tree_events.py +4 -2
- claude_mpm/utils/agent_dependency_loader.py +2 -2
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
- claude_mpm/dashboard/static/css/code-tree.css +0 -1639
- claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
- claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
- claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
- claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
- claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
- claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
- claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
- claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
- claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
- claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
- claude_mpm/services/project/analyzer_refactored.py +0 -450
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Health Check Manager for Claude MPM Framework
|
|
3
|
+
==============================================
|
|
4
|
+
|
|
5
|
+
WHY: Orchestrates multiple health check types, provides background monitoring,
|
|
6
|
+
and maintains historical health data for local deployments.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses background daemon thread for continuous monitoring with
|
|
9
|
+
configurable check intervals. Aggregates results from all health check types
|
|
10
|
+
using defined priority rules.
|
|
11
|
+
|
|
12
|
+
ARCHITECTURE:
|
|
13
|
+
- Orchestrates HTTP, process, and resource health checks
|
|
14
|
+
- Background monitoring thread with configurable interval (default: 30s)
|
|
15
|
+
- Thread-safe status tracking with threading.Lock
|
|
16
|
+
- Historical health data (last 100 checks per deployment)
|
|
17
|
+
- Health status aggregation with priority:
|
|
18
|
+
1. Process UNHEALTHY = Deployment UNHEALTHY (critical)
|
|
19
|
+
2. Any check UNHEALTHY = Deployment DEGRADED (service issues)
|
|
20
|
+
3. All checks HEALTHY = Deployment HEALTHY
|
|
21
|
+
4. Otherwise = UNKNOWN
|
|
22
|
+
- Event callbacks for status changes
|
|
23
|
+
|
|
24
|
+
USAGE:
|
|
25
|
+
health_manager = HealthCheckManager(
|
|
26
|
+
process_manager=process_manager,
|
|
27
|
+
check_interval=30,
|
|
28
|
+
)
|
|
29
|
+
health_manager.start_monitoring()
|
|
30
|
+
|
|
31
|
+
# Check health on-demand
|
|
32
|
+
health = health_manager.check_health(deployment_id)
|
|
33
|
+
|
|
34
|
+
# Stop monitoring
|
|
35
|
+
health_manager.stop_monitoring()
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
import threading
|
|
39
|
+
from collections import defaultdict
|
|
40
|
+
from typing import Callable, Dict, List, Optional
|
|
41
|
+
|
|
42
|
+
from claude_mpm.core.enums import HealthStatus
|
|
43
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
44
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheckManager
|
|
45
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
46
|
+
from claude_mpm.services.core.models.health import (
|
|
47
|
+
DeploymentHealth,
|
|
48
|
+
HealthCheckResult,
|
|
49
|
+
)
|
|
50
|
+
from claude_mpm.services.local_ops.health_checks import (
|
|
51
|
+
HttpHealthCheck,
|
|
52
|
+
ProcessHealthCheck,
|
|
53
|
+
ResourceHealthCheck,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class HealthCheckManager(SyncBaseService, IHealthCheckManager):
    """
    Health check orchestration and monitoring service.

    WHY: Provides comprehensive health monitoring by coordinating multiple
    check types, maintaining historical data, and enabling background monitoring.

    Thread Safety: The monitoring flag, health history and callback list are
    guarded by a single non-reentrant ``threading.Lock``. Status callbacks are
    invoked, and the monitor thread is joined, only AFTER that lock has been
    released, so callbacks may safely call back into this manager.
    """

    def __init__(
        self,
        process_manager: ILocalProcessManager,
        check_interval: int = 30,
        history_limit: int = 100,
    ):
        """
        Initialize health check manager.

        Args:
            process_manager: Process manager for deployment lookup
            check_interval: Background check interval in seconds (default: 30)
            history_limit: Maximum historical entries per deployment (default: 100)
        """
        super().__init__("HealthCheckManager")
        self.process_manager = process_manager
        self.check_interval = check_interval
        self.history_limit = history_limit

        # Initialize health check implementations
        self.http_check = HttpHealthCheck(process_manager)
        self.process_check = ProcessHealthCheck(process_manager)
        self.resource_check = ResourceHealthCheck(process_manager)

        # Background monitoring state
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        # Stop signal of the current monitoring run; replaced by a fresh
        # Event on every start_monitoring() call.
        self._stop_event = threading.Event()
        self._lock = threading.Lock()

        # Health history: deployment_id -> List[DeploymentHealth]
        self._health_history: Dict[str, List[DeploymentHealth]] = defaultdict(list)

        # Status change callbacks
        self._status_callbacks: List[Callable] = []

    def initialize(self) -> bool:
        """
        Initialize the health check manager.

        Initializes the HTTP, process and resource checks in that order and
        stops at the first one that reports failure.

        Returns:
            True if initialization successful, False if any check failed to
            initialize or an exception was raised
        """
        try:
            # Initialize all health check implementations
            if not self.http_check.initialize():
                self.log_error("Failed to initialize HTTP health check")
                return False

            if not self.process_check.initialize():
                self.log_error("Failed to initialize process health check")
                return False

            if not self.resource_check.initialize():
                self.log_error("Failed to initialize resource health check")
                return False

            self._initialized = True
            self.log_info("Health check manager initialized")
            return True

        except Exception as e:
            self.log_error(f"Failed to initialize: {e}")
            return False

    def shutdown(self) -> None:
        """Shutdown health check manager and stop monitoring."""
        if self._monitoring:
            self.stop_monitoring()

        # Shutdown health check implementations
        self.http_check.shutdown()
        self.process_check.shutdown()
        self.resource_check.shutdown()

        self._shutdown = True
        self.log_info("Health check manager shutdown complete")

    def check_health(self, deployment_id: str, **kwargs) -> DeploymentHealth:
        """
        Execute all health checks for a deployment.

        Runs the process, resource and HTTP checks (each isolated so one
        failing check cannot prevent the others), aggregates their statuses,
        records the result in the history and notifies status callbacks when
        the overall status differs from the previous recorded one.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters passed to health checks:
                - endpoint: HTTP endpoint URL
                - timeout: HTTP timeout in seconds
                - cpu_threshold: CPU usage threshold percentage
                - memory_threshold_mb: Memory usage threshold in MB

        Returns:
            DeploymentHealth with aggregated status and check results

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Execute all health checks
        checks: List[HealthCheckResult] = []

        # 1. Process health check (most critical)
        try:
            checks.append(self.process_check.check(deployment_id, **kwargs))
        except Exception as e:
            self.log_error(f"Process health check failed: {e}")
            checks.append(self._failed_check_result("process", e))

        # 2. Resource health check
        try:
            checks.append(self.resource_check.check(deployment_id, **kwargs))
        except Exception as e:
            self.log_error(f"Resource health check failed: {e}")
            checks.append(self._failed_check_result("resource", e))

        # 3. HTTP health check (optional, only if endpoint configured)
        try:
            http_result = self.http_check.check(deployment_id, **kwargs)
        except Exception as e:
            self.log_error(f"HTTP health check failed: {e}")
            checks.append(self._failed_check_result("http", e))
        else:
            # Only add if check was actually performed (not UNKNOWN due to no endpoint)
            if http_result.status != HealthStatus.UNKNOWN or kwargs.get("endpoint"):
                checks.append(http_result)

        # Aggregate health status
        overall_status = self._aggregate_health_status(checks)

        # Create deployment health
        deployment_health = DeploymentHealth(
            deployment_id=deployment_id,
            overall_status=overall_status,
            checks=checks,
        )

        # Update health history under the lock, but only REMEMBER whether the
        # status changed; callbacks are fired after the lock is released.
        status_changed = False
        previous_status = overall_status
        with self._lock:
            history = self._health_history[deployment_id]
            history.append(deployment_health)
            # Trim history to limit (keeps the newest entries)
            if len(history) > self.history_limit:
                del history[: -self.history_limit]

            if len(history) >= 2:
                previous_status = history[-2].overall_status
                status_changed = previous_status != overall_status

        # WHY outside the lock: self._lock is not reentrant, so a callback
        # that calls back into this manager (get_health_history,
        # is_monitoring, stop_monitoring, ...) would otherwise deadlock.
        if status_changed:
            self._trigger_status_callbacks(
                deployment_id, previous_status, overall_status
            )

        return deployment_health

    def start_monitoring(self) -> None:
        """
        Start background health monitoring.

        WHY: Enables continuous health tracking without manual polling.
        Creates a daemon thread that performs periodic checks. Each run gets
        its own stop event so that a thread from a previous run, which may
        still be winding down, can never be revived by a later start.
        """
        with self._lock:
            if self._monitoring:
                self.log_warning("Health monitoring already running")
                return

            # Create and start monitoring thread
            stop_event = threading.Event()
            monitor_thread = threading.Thread(
                target=self._monitor_loop,
                args=(stop_event,),
                daemon=True,
                name="HealthMonitorThread",
            )
            monitor_thread.start()

            # Publish state only once the thread has actually started, so a
            # failed start does not leave the manager claiming to monitor.
            self._stop_event = stop_event
            self._monitor_thread = monitor_thread
            self._monitoring = True

            self.log_info(
                f"Started health monitoring with {self.check_interval}s interval"
            )

    def stop_monitoring(self) -> None:
        """
        Stop background health monitoring.

        WHY: Gracefully stops the monitoring thread and releases resources.
        Waits up to 5 seconds for the thread to finish. Does nothing if
        monitoring is not active.
        """
        with self._lock:
            if not self._monitoring:
                return

            self._monitoring = False
            self._stop_event.set()
            monitor_thread = self._monitor_thread

        # Wait for monitoring thread to stop. This is done WITHOUT holding
        # self._lock: the monitor thread needs that lock inside check_health,
        # so joining under the lock could stall until the timeout expires.
        # Joining the current thread raises RuntimeError, so skip the join
        # when stop is requested from the monitor thread itself (e.g. from a
        # status callback).
        if (
            monitor_thread is not None
            and monitor_thread is not threading.current_thread()
            and monitor_thread.is_alive()
        ):
            monitor_thread.join(timeout=5.0)

        self.log_info("Stopped health monitoring")

    def is_monitoring(self) -> bool:
        """
        Check if background monitoring is active.

        Returns:
            True if monitoring has been started and not yet stopped
        """
        with self._lock:
            return self._monitoring

    def get_health_history(
        self, deployment_id: str, limit: int = 10
    ) -> List[DeploymentHealth]:
        """
        Get historical health check results for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            limit: Maximum number of historical entries to return; a value
                of zero or less yields an empty list

        Returns:
            List of DeploymentHealth objects, newest first
        """
        # Guard explicitly: history[-0:] would be the WHOLE list, not nothing.
        if limit <= 0:
            return []

        with self._lock:
            history = self._health_history.get(deployment_id, [])
            return list(reversed(history[-limit:]))

    def register_status_callback(
        self, callback: Callable[[str, HealthStatus, HealthStatus], None]
    ) -> None:
        """
        Register a callback for health status changes.

        Args:
            callback: Function called with (deployment_id, old_status, new_status)
        """
        with self._lock:
            self._status_callbacks.append(callback)
        self.log_debug(
            f"Registered status callback: {self._callback_name(callback)}"
        )

    def _monitor_loop(self, stop_event: Optional[threading.Event] = None) -> None:
        """
        Background monitoring loop.

        WHY: Runs in a separate thread to perform periodic health checks
        on all active deployments.

        Args:
            stop_event: Event that ends this loop when set. Defaults to the
                manager's current ``_stop_event`` when omitted.
        """
        if stop_event is None:
            stop_event = self._stop_event

        self.log_debug("Health monitoring loop started")

        while not stop_event.is_set():
            try:
                # Get all active deployments
                deployments = self.process_manager.state_manager.get_all_deployments()

                for deployment in deployments:
                    if stop_event.is_set():
                        break

                    try:
                        # Perform health check
                        self.check_health(deployment.deployment_id)
                    except Exception as e:
                        self.log_error(
                            f"Error checking health for {deployment.deployment_id}: {e}"
                        )

                # Sleep until next check interval
                # Use wait() instead of sleep() for faster shutdown response
                stop_event.wait(timeout=self.check_interval)

            except Exception as e:
                self.log_error(f"Error in health monitoring loop: {e}")
                # Don't crash the thread, just continue
                stop_event.wait(timeout=1.0)

        self.log_debug("Health monitoring loop stopped")

    def _aggregate_health_status(self, checks: List[HealthCheckResult]) -> HealthStatus:
        """
        Aggregate health status from multiple check results.

        WHY: Combines results from different check types using priority rules
        to determine overall deployment health.

        Priority Rules (first match wins):
        1. Process UNHEALTHY = Deployment UNHEALTHY (critical)
        2. Any check UNHEALTHY = Deployment DEGRADED (service issues but process alive)
        3. Any check DEGRADED = Deployment DEGRADED
        4. All checks HEALTHY = Deployment HEALTHY
        5. Otherwise (including no checks at all) = UNKNOWN

        Args:
            checks: List of health check results

        Returns:
            Aggregated HealthStatus
        """
        if not checks:
            return HealthStatus.UNKNOWN

        # Get process check result (most critical)
        process_check = next((c for c in checks if c.check_type == "process"), None)

        # Rule 1: Process UNHEALTHY = Deployment UNHEALTHY
        if process_check and process_check.status == HealthStatus.UNHEALTHY:
            return HealthStatus.UNHEALTHY

        # Rule 2: Any check UNHEALTHY (but process alive) = DEGRADED
        if any(c.status == HealthStatus.UNHEALTHY for c in checks):
            return HealthStatus.DEGRADED

        # Rule 3: A degraded check degrades the whole deployment
        if any(c.status == HealthStatus.DEGRADED for c in checks):
            return HealthStatus.DEGRADED

        # Rule 4: All checks HEALTHY = Deployment HEALTHY
        if all(c.status == HealthStatus.HEALTHY for c in checks):
            return HealthStatus.HEALTHY

        # Rule 5: Otherwise = UNKNOWN
        return HealthStatus.UNKNOWN

    def _trigger_status_callbacks(
        self, deployment_id: str, old_status: HealthStatus, new_status: HealthStatus
    ) -> None:
        """
        Trigger registered callbacks for status changes.

        Must be called WITHOUT holding ``self._lock``: the lock is taken
        briefly here to snapshot the callback list. An exception raised by one
        callback is logged and does not prevent the remaining callbacks from
        running.

        Args:
            deployment_id: Deployment that changed status
            old_status: Previous health status
            new_status: New health status
        """
        # Snapshot so callbacks may register further callbacks while we iterate.
        with self._lock:
            callbacks = list(self._status_callbacks)

        for callback in callbacks:
            try:
                callback(deployment_id, old_status, new_status)
            except Exception as e:
                self.log_error(
                    f"Error in status callback {self._callback_name(callback)}: {e}"
                )

    @staticmethod
    def _failed_check_result(check_type: str, error: Exception) -> HealthCheckResult:
        """Build the UNKNOWN result recorded when a health check raises."""
        return HealthCheckResult(
            status=HealthStatus.UNKNOWN,
            check_type=check_type,
            message=f"Check failed: {error}",
            details={"error": str(error)},
        )

    @staticmethod
    def _callback_name(callback: Callable) -> str:
        """Return a printable name for a callback, even one without ``__name__``."""
        # functools.partial objects and callable instances have no __name__.
        return getattr(callback, "__name__", None) or repr(callback)
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
# Public API of this module: only the manager class is exported.
__all__ = ["HealthCheckManager"]
|