claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_mpm/VERSION +1 -1
- claude_mpm/agents/BASE_ENGINEER.md +286 -0
- claude_mpm/agents/BASE_PM.md +48 -17
- claude_mpm/agents/OUTPUT_STYLE.md +329 -11
- claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
- claude_mpm/agents/agent_loader.py +17 -5
- claude_mpm/agents/frontmatter_validator.py +284 -253
- claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
- claude_mpm/agents/templates/api_qa.json +7 -1
- claude_mpm/agents/templates/clerk-ops.json +8 -1
- claude_mpm/agents/templates/code_analyzer.json +4 -1
- claude_mpm/agents/templates/dart_engineer.json +11 -1
- claude_mpm/agents/templates/data_engineer.json +11 -1
- claude_mpm/agents/templates/documentation.json +6 -1
- claude_mpm/agents/templates/engineer.json +18 -1
- claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
- claude_mpm/agents/templates/golang_engineer.json +11 -1
- claude_mpm/agents/templates/java_engineer.json +12 -2
- claude_mpm/agents/templates/local_ops_agent.json +1217 -6
- claude_mpm/agents/templates/nextjs_engineer.json +11 -1
- claude_mpm/agents/templates/ops.json +8 -1
- claude_mpm/agents/templates/php-engineer.json +11 -1
- claude_mpm/agents/templates/project_organizer.json +10 -3
- claude_mpm/agents/templates/prompt-engineer.json +5 -1
- claude_mpm/agents/templates/python_engineer.json +11 -1
- claude_mpm/agents/templates/qa.json +7 -1
- claude_mpm/agents/templates/react_engineer.json +11 -1
- claude_mpm/agents/templates/refactoring_engineer.json +8 -1
- claude_mpm/agents/templates/research.json +4 -1
- claude_mpm/agents/templates/ruby-engineer.json +11 -1
- claude_mpm/agents/templates/rust_engineer.json +11 -1
- claude_mpm/agents/templates/security.json +6 -1
- claude_mpm/agents/templates/svelte-engineer.json +225 -0
- claude_mpm/agents/templates/ticketing.json +6 -1
- claude_mpm/agents/templates/typescript_engineer.json +11 -1
- claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
- claude_mpm/agents/templates/version_control.json +8 -1
- claude_mpm/agents/templates/web_qa.json +7 -1
- claude_mpm/agents/templates/web_ui.json +11 -1
- claude_mpm/cli/__init__.py +34 -706
- claude_mpm/cli/commands/agent_manager.py +25 -12
- claude_mpm/cli/commands/agent_state_manager.py +186 -0
- claude_mpm/cli/commands/agents.py +204 -148
- claude_mpm/cli/commands/aggregate.py +7 -3
- claude_mpm/cli/commands/analyze.py +9 -4
- claude_mpm/cli/commands/analyze_code.py +7 -2
- claude_mpm/cli/commands/auto_configure.py +7 -9
- claude_mpm/cli/commands/config.py +47 -13
- claude_mpm/cli/commands/configure.py +294 -1788
- claude_mpm/cli/commands/configure_agent_display.py +261 -0
- claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
- claude_mpm/cli/commands/configure_hook_manager.py +225 -0
- claude_mpm/cli/commands/configure_models.py +18 -0
- claude_mpm/cli/commands/configure_navigation.py +167 -0
- claude_mpm/cli/commands/configure_paths.py +104 -0
- claude_mpm/cli/commands/configure_persistence.py +254 -0
- claude_mpm/cli/commands/configure_startup_manager.py +646 -0
- claude_mpm/cli/commands/configure_template_editor.py +497 -0
- claude_mpm/cli/commands/configure_validators.py +73 -0
- claude_mpm/cli/commands/local_deploy.py +537 -0
- claude_mpm/cli/commands/memory.py +54 -20
- claude_mpm/cli/commands/mpm_init.py +39 -25
- claude_mpm/cli/commands/mpm_init_handler.py +8 -3
- claude_mpm/cli/executor.py +202 -0
- claude_mpm/cli/helpers.py +105 -0
- claude_mpm/cli/interactive/__init__.py +3 -0
- claude_mpm/cli/interactive/skills_wizard.py +491 -0
- claude_mpm/cli/parsers/__init__.py +7 -1
- claude_mpm/cli/parsers/base_parser.py +98 -3
- claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
- claude_mpm/cli/shared/output_formatters.py +28 -19
- claude_mpm/cli/startup.py +481 -0
- claude_mpm/cli/utils.py +52 -1
- claude_mpm/commands/mpm-help.md +3 -0
- claude_mpm/commands/mpm-version.md +113 -0
- claude_mpm/commands/mpm.md +1 -0
- claude_mpm/config/agent_config.py +2 -2
- claude_mpm/config/model_config.py +428 -0
- claude_mpm/core/base_service.py +13 -12
- claude_mpm/core/enums.py +452 -0
- claude_mpm/core/factories.py +1 -1
- claude_mpm/core/instruction_reinforcement_hook.py +2 -1
- claude_mpm/core/interactive_session.py +9 -3
- claude_mpm/core/logging_config.py +6 -2
- claude_mpm/core/oneshot_session.py +8 -4
- claude_mpm/core/optimized_agent_loader.py +3 -3
- claude_mpm/core/output_style_manager.py +12 -192
- claude_mpm/core/service_registry.py +5 -1
- claude_mpm/core/types.py +2 -9
- claude_mpm/core/typing_utils.py +7 -6
- claude_mpm/dashboard/static/js/dashboard.js +0 -14
- claude_mpm/dashboard/templates/index.html +3 -41
- claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
- claude_mpm/hooks/instruction_reinforcement.py +7 -2
- claude_mpm/models/resume_log.py +340 -0
- claude_mpm/services/agents/auto_config_manager.py +10 -11
- claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
- claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
- claude_mpm/services/agents/deployment/agent_validator.py +17 -1
- claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
- claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
- claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
- claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
- claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
- claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
- claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
- claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
- claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
- claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
- claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
- claude_mpm/services/agents/local_template_manager.py +1 -1
- claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
- claude_mpm/services/agents/registry/modification_tracker.py +5 -2
- claude_mpm/services/command_handler_service.py +11 -5
- claude_mpm/services/core/interfaces/__init__.py +74 -2
- claude_mpm/services/core/interfaces/health.py +172 -0
- claude_mpm/services/core/interfaces/model.py +281 -0
- claude_mpm/services/core/interfaces/process.py +372 -0
- claude_mpm/services/core/interfaces/restart.py +307 -0
- claude_mpm/services/core/interfaces/stability.py +260 -0
- claude_mpm/services/core/models/__init__.py +33 -0
- claude_mpm/services/core/models/agent_config.py +12 -28
- claude_mpm/services/core/models/health.py +162 -0
- claude_mpm/services/core/models/process.py +235 -0
- claude_mpm/services/core/models/restart.py +302 -0
- claude_mpm/services/core/models/stability.py +264 -0
- claude_mpm/services/core/path_resolver.py +23 -7
- claude_mpm/services/diagnostics/__init__.py +2 -2
- claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
- claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
- claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
- claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
- claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
- claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
- claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
- claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
- claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
- claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
- claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
- claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
- claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
- claude_mpm/services/diagnostics/models.py +19 -24
- claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
- claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
- claude_mpm/services/infrastructure/monitoring/base.py +5 -13
- claude_mpm/services/infrastructure/monitoring/network.py +7 -6
- claude_mpm/services/infrastructure/monitoring/process.py +13 -12
- claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
- claude_mpm/services/infrastructure/monitoring/service.py +16 -15
- claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
- claude_mpm/services/local_ops/__init__.py +163 -0
- claude_mpm/services/local_ops/crash_detector.py +257 -0
- claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
- claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
- claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
- claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
- claude_mpm/services/local_ops/health_manager.py +430 -0
- claude_mpm/services/local_ops/log_monitor.py +396 -0
- claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
- claude_mpm/services/local_ops/process_manager.py +595 -0
- claude_mpm/services/local_ops/resource_monitor.py +331 -0
- claude_mpm/services/local_ops/restart_manager.py +401 -0
- claude_mpm/services/local_ops/restart_policy.py +387 -0
- claude_mpm/services/local_ops/state_manager.py +372 -0
- claude_mpm/services/local_ops/unified_manager.py +600 -0
- claude_mpm/services/mcp_config_manager.py +9 -4
- claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
- claude_mpm/services/mcp_gateway/core/base.py +18 -31
- claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
- claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
- claude_mpm/services/memory_hook_service.py +4 -1
- claude_mpm/services/model/__init__.py +147 -0
- claude_mpm/services/model/base_provider.py +365 -0
- claude_mpm/services/model/claude_provider.py +412 -0
- claude_mpm/services/model/model_router.py +453 -0
- claude_mpm/services/model/ollama_provider.py +415 -0
- claude_mpm/services/monitor/daemon_manager.py +3 -2
- claude_mpm/services/monitor/handlers/dashboard.py +2 -1
- claude_mpm/services/monitor/handlers/hooks.py +2 -1
- claude_mpm/services/monitor/management/lifecycle.py +3 -2
- claude_mpm/services/monitor/server.py +2 -1
- claude_mpm/services/session_management_service.py +3 -2
- claude_mpm/services/session_manager.py +205 -1
- claude_mpm/services/shared/async_service_base.py +16 -27
- claude_mpm/services/shared/lifecycle_service_base.py +1 -14
- claude_mpm/services/socketio/handlers/__init__.py +5 -2
- claude_mpm/services/socketio/handlers/hook.py +13 -2
- claude_mpm/services/socketio/handlers/registry.py +4 -2
- claude_mpm/services/socketio/server/main.py +10 -8
- claude_mpm/services/subprocess_launcher_service.py +14 -5
- claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
- claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
- claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
- claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
- claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
- claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
- claude_mpm/services/unified/deployment_strategies/local.py +6 -5
- claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
- claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
- claude_mpm/services/unified/interfaces.py +3 -1
- claude_mpm/services/unified/unified_analyzer.py +14 -10
- claude_mpm/services/unified/unified_config.py +2 -1
- claude_mpm/services/unified/unified_deployment.py +9 -4
- claude_mpm/services/version_service.py +104 -1
- claude_mpm/skills/__init__.py +21 -0
- claude_mpm/skills/bundled/__init__.py +6 -0
- claude_mpm/skills/bundled/api-documentation.md +393 -0
- claude_mpm/skills/bundled/async-testing.md +571 -0
- claude_mpm/skills/bundled/code-review.md +143 -0
- claude_mpm/skills/bundled/database-migration.md +199 -0
- claude_mpm/skills/bundled/docker-containerization.md +194 -0
- claude_mpm/skills/bundled/express-local-dev.md +1429 -0
- claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
- claude_mpm/skills/bundled/git-workflow.md +414 -0
- claude_mpm/skills/bundled/imagemagick.md +204 -0
- claude_mpm/skills/bundled/json-data-handling.md +223 -0
- claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
- claude_mpm/skills/bundled/pdf.md +141 -0
- claude_mpm/skills/bundled/performance-profiling.md +567 -0
- claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
- claude_mpm/skills/bundled/security-scanning.md +327 -0
- claude_mpm/skills/bundled/systematic-debugging.md +473 -0
- claude_mpm/skills/bundled/test-driven-development.md +378 -0
- claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
- claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
- claude_mpm/skills/bundled/xlsx.md +157 -0
- claude_mpm/skills/registry.py +286 -0
- claude_mpm/skills/skill_manager.py +310 -0
- claude_mpm/tools/code_tree_analyzer.py +177 -141
- claude_mpm/tools/code_tree_events.py +4 -2
- claude_mpm/utils/agent_dependency_loader.py +2 -2
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
- claude_mpm/dashboard/static/css/code-tree.css +0 -1639
- claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
- claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
- claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
- claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
- claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
- claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
- claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
- claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
- claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
- claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
- claude_mpm/services/project/analyzer_refactored.py +0 -450
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
- {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Process Health Check for Claude MPM Framework
|
|
3
|
+
==============================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides process-level health monitoring including existence validation,
|
|
6
|
+
status checking (running/zombie/stopped), and exit code detection.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses psutil for cross-platform process monitoring. Validates
|
|
9
|
+
process existence, status, and parent-child relationships.
|
|
10
|
+
|
|
11
|
+
ARCHITECTURE:
|
|
12
|
+
- Process existence verification with psutil.Process(pid)
|
|
13
|
+
- Process status checking (running, zombie, stopped, sleeping)
|
|
14
|
+
- Exit code detection for dead processes
|
|
15
|
+
- Parent-child relationship validation
|
|
16
|
+
- Process responsiveness checking (not hung)
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
process_check = ProcessHealthCheck(process_manager)
|
|
20
|
+
result = process_check.check(deployment_id="my-app")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import psutil
|
|
24
|
+
|
|
25
|
+
from claude_mpm.core.enums import HealthStatus
|
|
26
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
27
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheck
|
|
28
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
29
|
+
from claude_mpm.services.core.models.health import HealthCheckResult
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ProcessHealthCheck(SyncBaseService, IHealthCheck):
    """
    Process status health check implementation.

    WHY: Validates that the process is running properly and not in a
    degraded state (zombie, stopped, etc.).

    The check looks up the deployment's recorded PID through the process
    manager's state manager and inspects the process with psutil:

    1. the process must be running,
    2. it must not be a zombie, stopped, or dead,
    3. optionally, it must show CPU activity or be sleeping/idle.

    Thread Safety: Stateless, safe for concurrent execution.
    """

    def __init__(self, process_manager: ILocalProcessManager):
        """
        Initialize process health check.

        Args:
            process_manager: Process manager for deployment lookup
        """
        super().__init__("ProcessHealthCheck")
        self.process_manager = process_manager

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("Process health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "process"

    def _result(self, status, message: str, details: dict) -> HealthCheckResult:
        """Build a HealthCheckResult tagged with this check's type."""
        return HealthCheckResult(
            status=status,
            check_type=self.get_check_type(),
            message=message,
            details=details,
        )

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute process health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - check_responsiveness: Check if process is responsive (default: True)

        Returns:
            HealthCheckResult with check status and details:
                - UNHEALTHY: process missing, not running, zombie, stopped or dead
                - DEGRADED: process shows no CPU activity and is not sleeping/idle
                - UNKNOWN: process information could not be read (access denied
                  or unexpected error)
                - HEALTHY: otherwise

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        check_responsiveness = kwargs.get("check_responsiveness", True)
        pid = deployment.process_id

        try:
            process = psutil.Process(pid)

            # Check if process exists and is running
            if not process.is_running():
                return self._result(
                    HealthStatus.UNHEALTHY,
                    "Process is not running",
                    {"pid": pid, "deployment_id": deployment_id},
                )

            process_status = process.status()

            # Check for zombie process
            if process_status == psutil.STATUS_ZOMBIE:
                return self._result(
                    HealthStatus.UNHEALTHY,
                    "Process is a zombie",
                    {
                        "pid": pid,
                        "status": process_status,
                        "deployment_id": deployment_id,
                    },
                )

            # Check for stopped process
            if process_status in (psutil.STATUS_STOPPED, psutil.STATUS_DEAD):
                return self._result(
                    HealthStatus.UNHEALTHY,
                    f"Process is {process_status}",
                    {
                        "pid": pid,
                        "status": process_status,
                        "deployment_id": deployment_id,
                    },
                )

            # Check responsiveness (CPU activity)
            if check_responsiveness:
                try:
                    # Blocks for 0.1s to sample CPU usage over that interval.
                    cpu_percent = process.cpu_percent(interval=0.1)
                except psutil.NoSuchProcess:
                    # Process disappeared between the status probe and the
                    # CPU sample. AccessDenied is deliberately NOT caught
                    # here: a permission problem does not mean the process
                    # is gone, so it falls through to the outer handler and
                    # is reported as UNKNOWN like every other access error.
                    return self._result(
                        HealthStatus.UNHEALTHY,
                        "Process disappeared during check",
                        {"pid": pid, "deployment_id": deployment_id},
                    )

                # Process should have some CPU activity or be idle/sleeping.
                # NOTE(review): a process whose reported status is neither
                # sleeping nor idle but which used no CPU during the 0.1s
                # sample is flagged DEGRADED; presumably this can produce
                # false positives on platforms that report quiet processes
                # as "running" - confirm against psutil's per-OS status
                # mapping.
                is_responsive = cpu_percent > 0 or process_status in (
                    psutil.STATUS_SLEEPING,
                    psutil.STATUS_IDLE,
                )

                if not is_responsive:
                    return self._result(
                        HealthStatus.DEGRADED,
                        "Process may be unresponsive",
                        {
                            "pid": pid,
                            "status": process_status,
                            "cpu_percent": cpu_percent,
                            "deployment_id": deployment_id,
                        },
                    )

            # Process is healthy; name and thread count are best-effort
            # extras and are omitted if they cannot be read.
            process_info = {
                "pid": pid,
                "status": process_status,
                "deployment_id": deployment_id,
            }
            try:
                extras = {
                    "name": process.name(),
                    "num_threads": process.num_threads(),
                }
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                extras = {}
            process_info.update(extras)

            return self._result(
                HealthStatus.HEALTHY,
                "Process is running normally",
                process_info,
            )

        except psutil.NoSuchProcess:
            # Process does not exist
            return self._result(
                HealthStatus.UNHEALTHY,
                "Process no longer exists",
                {"pid": pid, "deployment_id": deployment_id},
            )

        except psutil.AccessDenied as e:
            # Cannot access process information
            return self._result(
                HealthStatus.UNKNOWN,
                "Cannot access process information",
                {"pid": pid, "deployment_id": deployment_id, "error": str(e)},
            )

        except Exception as e:
            # Unexpected error: log it and report UNKNOWN instead of raising
            self.log_error(f"Unexpected error in process health check: {e}")
            return self._result(
                HealthStatus.UNKNOWN,
                "Health check failed with error",
                {"pid": pid, "deployment_id": deployment_id, "error": str(e)},
            )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
# Public API of this module.
__all__ = ["ProcessHealthCheck"]
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resource Health Check for Claude MPM Framework
|
|
3
|
+
===============================================
|
|
4
|
+
|
|
5
|
+
WHY: Provides resource usage monitoring including CPU, memory, file descriptors,
|
|
6
|
+
threads, and network connections to detect resource exhaustion issues.
|
|
7
|
+
|
|
8
|
+
DESIGN DECISION: Uses psutil for cross-platform resource monitoring with
|
|
9
|
+
configurable thresholds for different resource types.
|
|
10
|
+
|
|
11
|
+
ARCHITECTURE:
|
|
12
|
+
- CPU usage monitoring (threshold: 80%)
|
|
13
|
+
- Memory usage monitoring (threshold: 500MB)
|
|
14
|
+
- File descriptor count (threshold: 1000, Unix only)
|
|
15
|
+
- Thread count monitoring
|
|
16
|
+
- Network connection count (open sockets)
|
|
17
|
+
|
|
18
|
+
USAGE:
|
|
19
|
+
resource_check = ResourceHealthCheck(process_manager)
|
|
20
|
+
result = resource_check.check(
|
|
21
|
+
deployment_id="my-app",
|
|
22
|
+
cpu_threshold=80.0,
|
|
23
|
+
memory_threshold_mb=500.0
|
|
24
|
+
)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import platform
|
|
28
|
+
|
|
29
|
+
import psutil
|
|
30
|
+
|
|
31
|
+
from claude_mpm.core.enums import HealthStatus
|
|
32
|
+
from claude_mpm.services.core.base import SyncBaseService
|
|
33
|
+
from claude_mpm.services.core.interfaces.health import IHealthCheck
|
|
34
|
+
from claude_mpm.services.core.interfaces.process import ILocalProcessManager
|
|
35
|
+
from claude_mpm.services.core.models.health import HealthCheckResult
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ResourceHealthCheck(SyncBaseService, IHealthCheck):
    """
    Resource usage health check implementation.

    WHY: Monitors resource consumption to detect issues before they
    cause service degradation or failures.

    Each metric (CPU, memory, file descriptors, threads, connections) is
    collected on a best-effort basis: a metric that cannot be read because
    access is denied is left out of the result details. A process that
    vanishes while metrics are being collected is reported as UNHEALTHY.

    Thread Safety: Stateless, safe for concurrent execution.
    """

    # Default thresholds
    DEFAULT_CPU_THRESHOLD = 80.0  # Percentage
    DEFAULT_MEMORY_THRESHOLD_MB = 500.0  # Megabytes
    DEFAULT_FD_THRESHOLD = 1000  # File descriptors (Unix only)
    DEFAULT_THREAD_THRESHOLD = 100  # Threads

    def __init__(self, process_manager: ILocalProcessManager):
        """
        Initialize resource health check.

        Args:
            process_manager: Process manager for deployment lookup
        """
        super().__init__("ResourceHealthCheck")
        self.process_manager = process_manager
        # File descriptor counting is skipped on Windows.
        self.is_windows = platform.system() == "Windows"

    def initialize(self) -> bool:
        """
        Initialize the health check.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info("Resource health check initialized")
        return True

    def shutdown(self) -> None:
        """Shutdown health check (no resources to clean up)."""
        self._shutdown = True

    def get_check_type(self) -> str:
        """Get the check type identifier."""
        return "resource"

    def _result(self, status, message: str, details: dict) -> HealthCheckResult:
        """Build a HealthCheckResult tagged with this check's type."""
        return HealthCheckResult(
            status=status,
            check_type=self.get_check_type(),
            message=message,
            details=details,
        )

    def _check_cpu(self, process, cpu_threshold, details: dict, issues: list) -> None:
        """Record CPU usage and flag it when above ``cpu_threshold``."""
        try:
            # Blocks for 0.1s to sample CPU usage over that interval.
            cpu_percent = process.cpu_percent(interval=0.1)
        except psutil.AccessDenied:
            return
        details["cpu_percent"] = round(cpu_percent, 2)
        details["cpu_threshold"] = cpu_threshold
        if cpu_percent > cpu_threshold:
            issues.append(
                f"High CPU usage: {cpu_percent:.1f}% (threshold: {cpu_threshold}%)"
            )

    def _check_memory(
        self, process, memory_threshold_mb, details: dict, issues: list
    ) -> None:
        """Record resident memory (MB) and flag it when above the threshold."""
        try:
            memory_mb = process.memory_info().rss / (1024 * 1024)
        except psutil.AccessDenied:
            return
        details["memory_mb"] = round(memory_mb, 2)
        details["memory_threshold_mb"] = memory_threshold_mb
        if memory_mb > memory_threshold_mb:
            issues.append(
                f"High memory usage: {memory_mb:.1f}MB (threshold: {memory_threshold_mb}MB)"
            )

    def _check_fds(self, process, fd_threshold, details: dict, issues: list) -> None:
        """Record the open file descriptor count (skipped on Windows)."""
        if self.is_windows:
            return
        try:
            num_fds = process.num_fds()
        except (psutil.AccessDenied, AttributeError):
            # num_fds() not available on all platforms
            return
        details["num_fds"] = num_fds
        details["fd_threshold"] = fd_threshold
        if num_fds > fd_threshold:
            issues.append(
                f"High file descriptor count: {num_fds} (threshold: {fd_threshold})"
            )

    def _check_threads(
        self, process, thread_threshold, details: dict, issues: list
    ) -> None:
        """Record the thread count and flag it when above the threshold."""
        try:
            num_threads = process.num_threads()
        except psutil.AccessDenied:
            return
        details["num_threads"] = num_threads
        details["thread_threshold"] = thread_threshold
        if num_threads > thread_threshold:
            issues.append(
                f"High thread count: {num_threads} (threshold: {thread_threshold})"
            )

    def _check_connections(self, process, details: dict) -> None:
        """Record the socket connection count and a per-state breakdown."""
        # psutil 6.0 renamed Process.connections() to net_connections();
        # fall back to the old name so older psutil releases still work
        # instead of failing the whole check with AttributeError.
        list_connections = getattr(process, "net_connections", None)
        if list_connections is None:
            list_connections = getattr(process, "connections", None)
        if list_connections is None:
            return
        try:
            connections = list_connections()
        except psutil.AccessDenied:
            return
        details["num_connections"] = len(connections)

        # Add connection breakdown by state
        connection_states = {}
        for conn in connections:
            connection_states[conn.status] = connection_states.get(conn.status, 0) + 1
        details["connection_states"] = connection_states

    def check(self, deployment_id: str, **kwargs) -> HealthCheckResult:
        """
        Execute resource health check for a deployment.

        Args:
            deployment_id: Unique deployment identifier
            **kwargs: Optional parameters:
                - cpu_threshold: CPU usage threshold percentage (default: 80.0)
                - memory_threshold_mb: Memory usage threshold in MB (default: 500.0)
                - fd_threshold: File descriptor threshold (default: 1000, Unix only)
                - thread_threshold: Thread count threshold (default: 100)

        Returns:
            HealthCheckResult with check status and details:
                - DEGRADED: at least one metric exceeded its threshold
                - HEALTHY: all readable metrics are within their thresholds
                - UNHEALTHY: the process does not exist or vanished mid-check
                - UNKNOWN: the process could not be accessed or an unexpected
                  error occurred

        Raises:
            ValueError: If deployment_id not found
        """
        # Validate deployment exists
        deployment = self.process_manager.state_manager.get_deployment(deployment_id)
        if not deployment:
            raise ValueError(f"Deployment not found: {deployment_id}")

        # Get thresholds from kwargs
        cpu_threshold = kwargs.get("cpu_threshold", self.DEFAULT_CPU_THRESHOLD)
        memory_threshold_mb = kwargs.get(
            "memory_threshold_mb", self.DEFAULT_MEMORY_THRESHOLD_MB
        )
        fd_threshold = kwargs.get("fd_threshold", self.DEFAULT_FD_THRESHOLD)
        thread_threshold = kwargs.get("thread_threshold", self.DEFAULT_THREAD_THRESHOLD)

        try:
            process = psutil.Process(deployment.process_id)

            # Collect resource metrics
            details = {
                "pid": deployment.process_id,
                "deployment_id": deployment_id,
            }
            issues = []

            # Each probe skips its metric on AccessDenied. NoSuchProcess is
            # intentionally not swallowed by the probes: a process that
            # dies mid-check must not be reported as healthy, so it is
            # handled once by the handler below.
            self._check_cpu(process, cpu_threshold, details, issues)
            self._check_memory(process, memory_threshold_mb, details, issues)
            self._check_fds(process, fd_threshold, details, issues)
            self._check_threads(process, thread_threshold, details, issues)
            self._check_connections(process, details)

            # Determine health status based on issues
            if issues:
                return self._result(
                    HealthStatus.DEGRADED,
                    f"Resource usage issues detected: {'; '.join(issues)}",
                    details,
                )
            return self._result(
                HealthStatus.HEALTHY,
                "Resource usage within normal limits",
                details,
            )

        except psutil.NoSuchProcess:
            # Process does not exist (or disappeared while being inspected)
            return self._result(
                HealthStatus.UNHEALTHY,
                "Process no longer exists",
                {
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                },
            )

        except psutil.AccessDenied as e:
            # Cannot access process information
            return self._result(
                HealthStatus.UNKNOWN,
                "Cannot access process resource information",
                {
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )

        except Exception as e:
            # Unexpected error: log it and report UNKNOWN instead of raising
            self.log_error(f"Unexpected error in resource health check: {e}")
            return self._result(
                HealthStatus.UNKNOWN,
                "Health check failed with error",
                {
                    "pid": deployment.process_id,
                    "deployment_id": deployment_id,
                    "error": str(e),
                },
            )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
# Public API of this module.
__all__ = ["ResourceHealthCheck"]
|