claude-mpm 4.13.2__py3-none-any.whl → 4.18.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (250) hide show
  1. claude_mpm/VERSION +1 -1
  2. claude_mpm/agents/BASE_ENGINEER.md +286 -0
  3. claude_mpm/agents/BASE_PM.md +48 -17
  4. claude_mpm/agents/OUTPUT_STYLE.md +329 -11
  5. claude_mpm/agents/PM_INSTRUCTIONS.md +227 -8
  6. claude_mpm/agents/agent_loader.py +17 -5
  7. claude_mpm/agents/frontmatter_validator.py +284 -253
  8. claude_mpm/agents/templates/agentic-coder-optimizer.json +9 -2
  9. claude_mpm/agents/templates/api_qa.json +7 -1
  10. claude_mpm/agents/templates/clerk-ops.json +8 -1
  11. claude_mpm/agents/templates/code_analyzer.json +4 -1
  12. claude_mpm/agents/templates/dart_engineer.json +11 -1
  13. claude_mpm/agents/templates/data_engineer.json +11 -1
  14. claude_mpm/agents/templates/documentation.json +6 -1
  15. claude_mpm/agents/templates/engineer.json +18 -1
  16. claude_mpm/agents/templates/gcp_ops_agent.json +8 -1
  17. claude_mpm/agents/templates/golang_engineer.json +11 -1
  18. claude_mpm/agents/templates/java_engineer.json +12 -2
  19. claude_mpm/agents/templates/local_ops_agent.json +1217 -6
  20. claude_mpm/agents/templates/nextjs_engineer.json +11 -1
  21. claude_mpm/agents/templates/ops.json +8 -1
  22. claude_mpm/agents/templates/php-engineer.json +11 -1
  23. claude_mpm/agents/templates/project_organizer.json +10 -3
  24. claude_mpm/agents/templates/prompt-engineer.json +5 -1
  25. claude_mpm/agents/templates/python_engineer.json +11 -1
  26. claude_mpm/agents/templates/qa.json +7 -1
  27. claude_mpm/agents/templates/react_engineer.json +11 -1
  28. claude_mpm/agents/templates/refactoring_engineer.json +8 -1
  29. claude_mpm/agents/templates/research.json +4 -1
  30. claude_mpm/agents/templates/ruby-engineer.json +11 -1
  31. claude_mpm/agents/templates/rust_engineer.json +11 -1
  32. claude_mpm/agents/templates/security.json +6 -1
  33. claude_mpm/agents/templates/svelte-engineer.json +225 -0
  34. claude_mpm/agents/templates/ticketing.json +6 -1
  35. claude_mpm/agents/templates/typescript_engineer.json +11 -1
  36. claude_mpm/agents/templates/vercel_ops_agent.json +8 -1
  37. claude_mpm/agents/templates/version_control.json +8 -1
  38. claude_mpm/agents/templates/web_qa.json +7 -1
  39. claude_mpm/agents/templates/web_ui.json +11 -1
  40. claude_mpm/cli/__init__.py +34 -706
  41. claude_mpm/cli/commands/agent_manager.py +25 -12
  42. claude_mpm/cli/commands/agent_state_manager.py +186 -0
  43. claude_mpm/cli/commands/agents.py +204 -148
  44. claude_mpm/cli/commands/aggregate.py +7 -3
  45. claude_mpm/cli/commands/analyze.py +9 -4
  46. claude_mpm/cli/commands/analyze_code.py +7 -2
  47. claude_mpm/cli/commands/auto_configure.py +7 -9
  48. claude_mpm/cli/commands/config.py +47 -13
  49. claude_mpm/cli/commands/configure.py +294 -1788
  50. claude_mpm/cli/commands/configure_agent_display.py +261 -0
  51. claude_mpm/cli/commands/configure_behavior_manager.py +204 -0
  52. claude_mpm/cli/commands/configure_hook_manager.py +225 -0
  53. claude_mpm/cli/commands/configure_models.py +18 -0
  54. claude_mpm/cli/commands/configure_navigation.py +167 -0
  55. claude_mpm/cli/commands/configure_paths.py +104 -0
  56. claude_mpm/cli/commands/configure_persistence.py +254 -0
  57. claude_mpm/cli/commands/configure_startup_manager.py +646 -0
  58. claude_mpm/cli/commands/configure_template_editor.py +497 -0
  59. claude_mpm/cli/commands/configure_validators.py +73 -0
  60. claude_mpm/cli/commands/local_deploy.py +537 -0
  61. claude_mpm/cli/commands/memory.py +54 -20
  62. claude_mpm/cli/commands/mpm_init.py +39 -25
  63. claude_mpm/cli/commands/mpm_init_handler.py +8 -3
  64. claude_mpm/cli/executor.py +202 -0
  65. claude_mpm/cli/helpers.py +105 -0
  66. claude_mpm/cli/interactive/__init__.py +3 -0
  67. claude_mpm/cli/interactive/skills_wizard.py +491 -0
  68. claude_mpm/cli/parsers/__init__.py +7 -1
  69. claude_mpm/cli/parsers/base_parser.py +98 -3
  70. claude_mpm/cli/parsers/local_deploy_parser.py +227 -0
  71. claude_mpm/cli/shared/output_formatters.py +28 -19
  72. claude_mpm/cli/startup.py +481 -0
  73. claude_mpm/cli/utils.py +52 -1
  74. claude_mpm/commands/mpm-help.md +3 -0
  75. claude_mpm/commands/mpm-version.md +113 -0
  76. claude_mpm/commands/mpm.md +1 -0
  77. claude_mpm/config/agent_config.py +2 -2
  78. claude_mpm/config/model_config.py +428 -0
  79. claude_mpm/core/base_service.py +13 -12
  80. claude_mpm/core/enums.py +452 -0
  81. claude_mpm/core/factories.py +1 -1
  82. claude_mpm/core/instruction_reinforcement_hook.py +2 -1
  83. claude_mpm/core/interactive_session.py +9 -3
  84. claude_mpm/core/logging_config.py +6 -2
  85. claude_mpm/core/oneshot_session.py +8 -4
  86. claude_mpm/core/optimized_agent_loader.py +3 -3
  87. claude_mpm/core/output_style_manager.py +12 -192
  88. claude_mpm/core/service_registry.py +5 -1
  89. claude_mpm/core/types.py +2 -9
  90. claude_mpm/core/typing_utils.py +7 -6
  91. claude_mpm/dashboard/static/js/dashboard.js +0 -14
  92. claude_mpm/dashboard/templates/index.html +3 -41
  93. claude_mpm/hooks/claude_hooks/response_tracking.py +35 -1
  94. claude_mpm/hooks/instruction_reinforcement.py +7 -2
  95. claude_mpm/models/resume_log.py +340 -0
  96. claude_mpm/services/agents/auto_config_manager.py +10 -11
  97. claude_mpm/services/agents/deployment/agent_configuration_manager.py +1 -1
  98. claude_mpm/services/agents/deployment/agent_record_service.py +1 -1
  99. claude_mpm/services/agents/deployment/agent_validator.py +17 -1
  100. claude_mpm/services/agents/deployment/async_agent_deployment.py +1 -1
  101. claude_mpm/services/agents/deployment/interface_adapter.py +3 -2
  102. claude_mpm/services/agents/deployment/local_template_deployment.py +1 -1
  103. claude_mpm/services/agents/deployment/pipeline/steps/agent_processing_step.py +7 -6
  104. claude_mpm/services/agents/deployment/pipeline/steps/base_step.py +7 -16
  105. claude_mpm/services/agents/deployment/pipeline/steps/configuration_step.py +4 -3
  106. claude_mpm/services/agents/deployment/pipeline/steps/target_directory_step.py +5 -3
  107. claude_mpm/services/agents/deployment/pipeline/steps/validation_step.py +6 -5
  108. claude_mpm/services/agents/deployment/refactored_agent_deployment_service.py +9 -6
  109. claude_mpm/services/agents/deployment/validation/__init__.py +3 -1
  110. claude_mpm/services/agents/deployment/validation/validation_result.py +1 -9
  111. claude_mpm/services/agents/local_template_manager.py +1 -1
  112. claude_mpm/services/agents/memory/agent_memory_manager.py +5 -2
  113. claude_mpm/services/agents/registry/modification_tracker.py +5 -2
  114. claude_mpm/services/command_handler_service.py +11 -5
  115. claude_mpm/services/core/interfaces/__init__.py +74 -2
  116. claude_mpm/services/core/interfaces/health.py +172 -0
  117. claude_mpm/services/core/interfaces/model.py +281 -0
  118. claude_mpm/services/core/interfaces/process.py +372 -0
  119. claude_mpm/services/core/interfaces/restart.py +307 -0
  120. claude_mpm/services/core/interfaces/stability.py +260 -0
  121. claude_mpm/services/core/models/__init__.py +33 -0
  122. claude_mpm/services/core/models/agent_config.py +12 -28
  123. claude_mpm/services/core/models/health.py +162 -0
  124. claude_mpm/services/core/models/process.py +235 -0
  125. claude_mpm/services/core/models/restart.py +302 -0
  126. claude_mpm/services/core/models/stability.py +264 -0
  127. claude_mpm/services/core/path_resolver.py +23 -7
  128. claude_mpm/services/diagnostics/__init__.py +2 -2
  129. claude_mpm/services/diagnostics/checks/agent_check.py +25 -24
  130. claude_mpm/services/diagnostics/checks/claude_code_check.py +24 -23
  131. claude_mpm/services/diagnostics/checks/common_issues_check.py +25 -24
  132. claude_mpm/services/diagnostics/checks/configuration_check.py +24 -23
  133. claude_mpm/services/diagnostics/checks/filesystem_check.py +18 -17
  134. claude_mpm/services/diagnostics/checks/installation_check.py +30 -29
  135. claude_mpm/services/diagnostics/checks/instructions_check.py +20 -19
  136. claude_mpm/services/diagnostics/checks/mcp_check.py +50 -36
  137. claude_mpm/services/diagnostics/checks/mcp_services_check.py +36 -31
  138. claude_mpm/services/diagnostics/checks/monitor_check.py +23 -22
  139. claude_mpm/services/diagnostics/checks/startup_log_check.py +9 -8
  140. claude_mpm/services/diagnostics/diagnostic_runner.py +6 -5
  141. claude_mpm/services/diagnostics/doctor_reporter.py +28 -25
  142. claude_mpm/services/diagnostics/models.py +19 -24
  143. claude_mpm/services/infrastructure/monitoring/__init__.py +1 -1
  144. claude_mpm/services/infrastructure/monitoring/aggregator.py +12 -12
  145. claude_mpm/services/infrastructure/monitoring/base.py +5 -13
  146. claude_mpm/services/infrastructure/monitoring/network.py +7 -6
  147. claude_mpm/services/infrastructure/monitoring/process.py +13 -12
  148. claude_mpm/services/infrastructure/monitoring/resources.py +7 -6
  149. claude_mpm/services/infrastructure/monitoring/service.py +16 -15
  150. claude_mpm/services/infrastructure/resume_log_generator.py +439 -0
  151. claude_mpm/services/local_ops/__init__.py +163 -0
  152. claude_mpm/services/local_ops/crash_detector.py +257 -0
  153. claude_mpm/services/local_ops/health_checks/__init__.py +28 -0
  154. claude_mpm/services/local_ops/health_checks/http_check.py +224 -0
  155. claude_mpm/services/local_ops/health_checks/process_check.py +236 -0
  156. claude_mpm/services/local_ops/health_checks/resource_check.py +255 -0
  157. claude_mpm/services/local_ops/health_manager.py +430 -0
  158. claude_mpm/services/local_ops/log_monitor.py +396 -0
  159. claude_mpm/services/local_ops/memory_leak_detector.py +294 -0
  160. claude_mpm/services/local_ops/process_manager.py +595 -0
  161. claude_mpm/services/local_ops/resource_monitor.py +331 -0
  162. claude_mpm/services/local_ops/restart_manager.py +401 -0
  163. claude_mpm/services/local_ops/restart_policy.py +387 -0
  164. claude_mpm/services/local_ops/state_manager.py +372 -0
  165. claude_mpm/services/local_ops/unified_manager.py +600 -0
  166. claude_mpm/services/mcp_config_manager.py +9 -4
  167. claude_mpm/services/mcp_gateway/core/__init__.py +1 -2
  168. claude_mpm/services/mcp_gateway/core/base.py +18 -31
  169. claude_mpm/services/mcp_gateway/tools/external_mcp_services.py +71 -24
  170. claude_mpm/services/mcp_gateway/tools/health_check_tool.py +30 -28
  171. claude_mpm/services/memory_hook_service.py +4 -1
  172. claude_mpm/services/model/__init__.py +147 -0
  173. claude_mpm/services/model/base_provider.py +365 -0
  174. claude_mpm/services/model/claude_provider.py +412 -0
  175. claude_mpm/services/model/model_router.py +453 -0
  176. claude_mpm/services/model/ollama_provider.py +415 -0
  177. claude_mpm/services/monitor/daemon_manager.py +3 -2
  178. claude_mpm/services/monitor/handlers/dashboard.py +2 -1
  179. claude_mpm/services/monitor/handlers/hooks.py +2 -1
  180. claude_mpm/services/monitor/management/lifecycle.py +3 -2
  181. claude_mpm/services/monitor/server.py +2 -1
  182. claude_mpm/services/session_management_service.py +3 -2
  183. claude_mpm/services/session_manager.py +205 -1
  184. claude_mpm/services/shared/async_service_base.py +16 -27
  185. claude_mpm/services/shared/lifecycle_service_base.py +1 -14
  186. claude_mpm/services/socketio/handlers/__init__.py +5 -2
  187. claude_mpm/services/socketio/handlers/hook.py +13 -2
  188. claude_mpm/services/socketio/handlers/registry.py +4 -2
  189. claude_mpm/services/socketio/server/main.py +10 -8
  190. claude_mpm/services/subprocess_launcher_service.py +14 -5
  191. claude_mpm/services/unified/analyzer_strategies/code_analyzer.py +8 -7
  192. claude_mpm/services/unified/analyzer_strategies/dependency_analyzer.py +6 -5
  193. claude_mpm/services/unified/analyzer_strategies/performance_analyzer.py +8 -7
  194. claude_mpm/services/unified/analyzer_strategies/security_analyzer.py +7 -6
  195. claude_mpm/services/unified/analyzer_strategies/structure_analyzer.py +5 -4
  196. claude_mpm/services/unified/config_strategies/validation_strategy.py +13 -9
  197. claude_mpm/services/unified/deployment_strategies/cloud_strategies.py +10 -3
  198. claude_mpm/services/unified/deployment_strategies/local.py +6 -5
  199. claude_mpm/services/unified/deployment_strategies/utils.py +6 -5
  200. claude_mpm/services/unified/deployment_strategies/vercel.py +7 -6
  201. claude_mpm/services/unified/interfaces.py +3 -1
  202. claude_mpm/services/unified/unified_analyzer.py +14 -10
  203. claude_mpm/services/unified/unified_config.py +2 -1
  204. claude_mpm/services/unified/unified_deployment.py +9 -4
  205. claude_mpm/services/version_service.py +104 -1
  206. claude_mpm/skills/__init__.py +21 -0
  207. claude_mpm/skills/bundled/__init__.py +6 -0
  208. claude_mpm/skills/bundled/api-documentation.md +393 -0
  209. claude_mpm/skills/bundled/async-testing.md +571 -0
  210. claude_mpm/skills/bundled/code-review.md +143 -0
  211. claude_mpm/skills/bundled/database-migration.md +199 -0
  212. claude_mpm/skills/bundled/docker-containerization.md +194 -0
  213. claude_mpm/skills/bundled/express-local-dev.md +1429 -0
  214. claude_mpm/skills/bundled/fastapi-local-dev.md +1199 -0
  215. claude_mpm/skills/bundled/git-workflow.md +414 -0
  216. claude_mpm/skills/bundled/imagemagick.md +204 -0
  217. claude_mpm/skills/bundled/json-data-handling.md +223 -0
  218. claude_mpm/skills/bundled/nextjs-local-dev.md +807 -0
  219. claude_mpm/skills/bundled/pdf.md +141 -0
  220. claude_mpm/skills/bundled/performance-profiling.md +567 -0
  221. claude_mpm/skills/bundled/refactoring-patterns.md +180 -0
  222. claude_mpm/skills/bundled/security-scanning.md +327 -0
  223. claude_mpm/skills/bundled/systematic-debugging.md +473 -0
  224. claude_mpm/skills/bundled/test-driven-development.md +378 -0
  225. claude_mpm/skills/bundled/vite-local-dev.md +1061 -0
  226. claude_mpm/skills/bundled/web-performance-optimization.md +2305 -0
  227. claude_mpm/skills/bundled/xlsx.md +157 -0
  228. claude_mpm/skills/registry.py +286 -0
  229. claude_mpm/skills/skill_manager.py +310 -0
  230. claude_mpm/tools/code_tree_analyzer.py +177 -141
  231. claude_mpm/tools/code_tree_events.py +4 -2
  232. claude_mpm/utils/agent_dependency_loader.py +2 -2
  233. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/METADATA +117 -8
  234. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/RECORD +238 -174
  235. claude_mpm/dashboard/static/css/code-tree.css +0 -1639
  236. claude_mpm/dashboard/static/js/components/code-tree/tree-breadcrumb.js +0 -353
  237. claude_mpm/dashboard/static/js/components/code-tree/tree-constants.js +0 -235
  238. claude_mpm/dashboard/static/js/components/code-tree/tree-search.js +0 -409
  239. claude_mpm/dashboard/static/js/components/code-tree/tree-utils.js +0 -435
  240. claude_mpm/dashboard/static/js/components/code-tree.js +0 -5869
  241. claude_mpm/dashboard/static/js/components/code-viewer.js +0 -1386
  242. claude_mpm/hooks/claude_hooks/hook_handler_eventbus.py +0 -425
  243. claude_mpm/hooks/claude_hooks/hook_handler_original.py +0 -1041
  244. claude_mpm/hooks/claude_hooks/hook_handler_refactored.py +0 -347
  245. claude_mpm/services/agents/deployment/agent_lifecycle_manager_refactored.py +0 -575
  246. claude_mpm/services/project/analyzer_refactored.py +0 -450
  247. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/WHEEL +0 -0
  248. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/entry_points.txt +0 -0
  249. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/licenses/LICENSE +0 -0
  250. {claude_mpm-4.13.2.dist-info → claude_mpm-4.18.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,396 @@
1
+ """
2
+ Log Monitor for Claude MPM Framework
3
+ ======================================
4
+
5
+ WHY: Provides real-time log file monitoring to detect error patterns that
6
+ indicate imminent crashes (OOM, exceptions, segfaults) BEFORE they occur.
7
+
8
+ DESIGN DECISION: Uses watchdog library for efficient OS-level file system
9
+ monitoring. Avoids polling by receiving file modification events from the OS.
10
+
11
+ ARCHITECTURE:
12
+ - Watchdog-based file system monitoring (OS-level events)
13
+ - Regex-based pattern matching for error detection
14
+ - Configurable error patterns with severity levels
15
+ - Rolling window of recent matches per deployment
16
+ - Callback system for pattern match alerts
17
+ - Thread-safe with proper locking
18
+
19
+ USAGE:
20
+ monitor = LogMonitor()
21
+ monitor.initialize()
22
+
23
+ # Add error patterns
24
+ monitor.add_pattern(r"OutOfMemoryError", severity="CRITICAL")
25
+ monitor.add_pattern(r"Exception:", severity="ERROR")
26
+
27
+ # Start monitoring a log file
28
+ monitor.start_monitoring(
29
+ log_file="/var/log/app.log",
30
+ deployment_id="my-app"
31
+ )
32
+
33
+ # Get recent matches
34
+ matches = monitor.get_recent_matches("my-app", limit=10)
35
+ """
36
+
37
+ import re
38
+ import threading
39
+ from collections import defaultdict
40
+ from pathlib import Path
41
+ from typing import Callable, Dict, List, Optional, Tuple
42
+
43
+ from watchdog.events import FileSystemEvent, FileSystemEventHandler
44
+ from watchdog.observers import Observer
45
+
46
+ from claude_mpm.core.logger import get_logger
47
+ from claude_mpm.services.core.base import SyncBaseService
48
+ from claude_mpm.services.core.interfaces.stability import ILogMonitor
49
+ from claude_mpm.services.core.models.stability import LogPatternMatch
50
+
51
+
52
class LogFileHandler(FileSystemEventHandler):
    """
    File system event handler for log file monitoring.

    WHY: Receives OS-level file events and triggers pattern matching on
    new log lines.

    The handler tracks the offset up to which the log file has already been
    read (``last_position``) and, on every relevant event, forwards only the
    lines written after that offset to the ``on_new_lines`` callback.
    """

    def __init__(
        self,
        log_file: str,
        deployment_id: str,
        on_new_lines: Callable[[str, List[str]], None],
    ):
        """
        Initialize log file handler.

        Args:
            log_file: Path to log file being monitored
            deployment_id: Deployment identifier
            on_new_lines: Callback for new lines: (deployment_id, lines)
        """
        super().__init__()
        self.log_file = Path(log_file).resolve()
        self.deployment_id = deployment_id
        self.on_new_lines = on_new_lines
        self.last_position = 0

        # Start at the current end of file so content that was already there
        # before monitoring began is not reported. A file that does not exist
        # yet is read from offset 0 once it appears.
        if self.log_file.exists():
            self.last_position = self.log_file.stat().st_size

    def on_created(self, event: FileSystemEvent) -> None:
        """
        Handle file creation events.

        WHY: A handler may be registered before its log file exists, and some
        writers create the file already populated; without this hook a
        create-only notification would be dropped. Offset tracking makes the
        shared handling safe even when a modification event follows.

        Args:
            event: File system event
        """
        self._handle_event(event)

    def on_modified(self, event: FileSystemEvent) -> None:
        """
        Handle file modification events.

        Args:
            event: File system event
        """
        self._handle_event(event)

    def _handle_event(self, event: FileSystemEvent) -> None:
        """
        Forward newly appended lines if the event concerns our log file.

        Args:
            event: File system event
        """
        if event.is_directory:
            return

        # The watch is placed on the parent directory, so events for sibling
        # files arrive here too and must be filtered out.
        if Path(event.src_path).resolve() != self.log_file:
            return

        try:
            new_lines = self._read_new_lines()
            if new_lines:
                self.on_new_lines(self.deployment_id, new_lines)
        except Exception as e:
            # Log error but don't crash the monitoring thread
            get_logger(__name__).error(f"Error reading new log lines: {e}")

    def _read_new_lines(self) -> List[str]:
        """
        Read new lines from log file since last read.

        If the file is now shorter than the remembered offset it has been
        truncated or replaced (e.g. log rotation); reading restarts from the
        beginning so the new content is not silently skipped.

        Returns:
            List of new lines with trailing newline characters removed;
            empty if the file does not exist
        """
        try:
            current_size = self.log_file.stat().st_size
        except FileNotFoundError:
            return []

        if current_size < self.last_position:
            self.last_position = 0

        with self.log_file.open(encoding="utf-8", errors="ignore") as f:
            f.seek(self.last_position)
            new_lines = [line.rstrip("\n\r") for line in f]
            # tell() is usable again once iteration has run to completion
            self.last_position = f.tell()

        return new_lines
132
+
133
+
134
class LogMonitor(SyncBaseService, ILogMonitor):
    """
    Real-time log file monitoring service.

    WHY: Provides early warning of critical errors by monitoring log files
    in real-time and detecting patterns that indicate imminent failures.

    Thread Safety: Shared state (patterns, handlers, watches, match history,
    callbacks) is only touched while holding ``_lock``. Callbacks and log
    calls for matches run outside the lock so a callback may safely call
    back into this service.
    """

    # Default error patterns, tried in this order; the first one that matches
    # a line determines the reported severity for that line.
    DEFAULT_PATTERNS = [
        (r"OutOfMemoryError", "CRITICAL"),
        (r"Segmentation fault", "CRITICAL"),
        (r"Exception:", "ERROR"),
        (r"Traceback", "ERROR"),
        (r"Error:", "ERROR"),
        (r"FATAL", "CRITICAL"),
        (r"Database connection failed", "ERROR"),
        (r"Connection refused", "WARNING"),
        (r"Connection timeout", "WARNING"),
    ]

    def __init__(self, match_history_limit: int = 100):
        """
        Initialize log monitor.

        Args:
            match_history_limit: Number of matches to keep per deployment (default: 100)
        """
        super().__init__("LogMonitor")
        self.match_history_limit = match_history_limit

        # Error patterns: List[(compiled pattern, severity)], seeded with defaults
        self._patterns: List[Tuple[re.Pattern, str]] = [
            (re.compile(pattern), severity)
            for pattern, severity in self.DEFAULT_PATTERNS
        ]

        # Watchdog observer and handlers
        self._observer: Optional[Observer] = None
        self._handlers: Dict[str, LogFileHandler] = {}  # deployment_id -> handler

        # deployment_id -> watch object returned by Observer.schedule(); kept
        # so a single handler can be detached without touching the others
        self._watches: Dict[str, object] = {}

        # Match history: deployment_id -> List[LogPatternMatch]
        self._match_history: Dict[str, List[LogPatternMatch]] = defaultdict(list)

        # Match callbacks
        self._match_callbacks: List[Callable[[str, LogPatternMatch], None]] = []

        # Thread safety
        self._lock = threading.Lock()

    def initialize(self) -> bool:
        """
        Initialize the log monitor.

        Creates and starts the watchdog observer. Calling this again while an
        observer is already held is a no-op, so no second observer thread is
        started.

        Returns:
            True if initialization successful
        """
        if self._observer is not None:
            self.log_debug("Log monitor already initialized")
            return True

        try:
            observer = Observer()
            observer.start()
        except Exception as e:
            # The observer is only stored once it has started, so shutdown()
            # never tries to join a thread that was never running.
            self.log_error(f"Failed to initialize: {e}")
            return False

        self._observer = observer
        self._initialized = True
        self.log_info(f"Log monitor initialized with {len(self._patterns)} patterns")
        return True

    def shutdown(self) -> None:
        """Shutdown log monitor and stop all monitoring."""
        # Snapshot the ids first: stop_monitoring() takes the lock itself
        with self._lock:
            deployment_ids = list(self._handlers)

        for deployment_id in deployment_ids:
            self.stop_monitoring(deployment_id)

        # Stop observer
        observer = self._observer
        if observer:
            observer.stop()
            observer.join(timeout=5.0)
            self._observer = None

        self._shutdown = True
        self.log_info("Log monitor shutdown complete")

    def start_monitoring(self, log_file: str, deployment_id: str) -> None:
        """
        Start monitoring a log file for error patterns.

        WHY: Begins watching the log file for new entries. Uses OS-level
        file system events for efficiency.

        Args:
            log_file: Path to log file to monitor
            deployment_id: Deployment identifier for callbacks
        """
        log_path = Path(log_file).resolve()

        if not log_path.exists():
            # Not fatal: the handler is still registered so the file is
            # picked up when it appears in the watched directory.
            self.log_warning(f"Log file does not exist: {log_file}")

        with self._lock:
            # Check if already monitoring
            if deployment_id in self._handlers:
                self.log_warning(f"Already monitoring logs for {deployment_id}")
                return

            handler = LogFileHandler(
                log_file=str(log_path),
                deployment_id=deployment_id,
                on_new_lines=self._process_new_lines,
            )

            if self._observer:
                # Watch the directory containing the log file; the handler
                # filters events down to its own file.
                self._watches[deployment_id] = self._observer.schedule(
                    handler, str(log_path.parent), recursive=False
                )
            else:
                self.log_warning(
                    f"Log monitor is not initialized; no file events will be "
                    f"received for {deployment_id}"
                )

            self._handlers[deployment_id] = handler

        self.log_info(f"Started monitoring log file for {deployment_id}: {log_file}")

    def stop_monitoring(self, deployment_id: str) -> None:
        """
        Stop monitoring a deployment's log file.

        Only the handler of the given deployment is detached; watches used by
        other deployments stay in place, so they do not miss events.

        Args:
            deployment_id: Deployment identifier
        """
        with self._lock:
            handler = self._handlers.pop(deployment_id, None)
            watch = self._watches.pop(deployment_id, None)

            if handler and watch is not None and self._observer:
                try:
                    self._observer.remove_handler_for_watch(handler, watch)
                    # Several deployments may log into the same directory and
                    # therefore share a watch; drop it only when unused.
                    if watch not in self._watches.values():
                        self._observer.unschedule(watch)
                except KeyError:
                    # The observer no longer knows this handler/watch
                    self.log_debug(
                        f"Watch for {deployment_id} was already removed from observer"
                    )

        if handler:
            self.log_info(f"Stopped monitoring logs for {deployment_id}")

    def add_pattern(self, pattern: str, severity: str = "ERROR") -> None:
        """
        Add an error pattern to monitor.

        Args:
            pattern: Regex pattern to match
            severity: Error severity (ERROR, CRITICAL, WARNING)

        Raises:
            re.error: If ``pattern`` is not a valid regular expression
        """
        # Compile before taking the lock: an invalid pattern fails without
        # having touched shared state.
        compiled_pattern = re.compile(pattern)

        with self._lock:
            self._patterns.append((compiled_pattern, severity))

        self.log_debug(f"Added pattern: {pattern} (severity: {severity})")

    def get_recent_matches(
        self, deployment_id: str, limit: int = 10
    ) -> List[LogPatternMatch]:
        """
        Get recent pattern matches for a deployment.

        Args:
            deployment_id: Deployment identifier
            limit: Maximum number of matches to return

        Returns:
            List of LogPatternMatch objects, newest first; empty when
            ``limit`` is zero or negative
        """
        # Guard explicitly: matches[-0:] would be the *entire* history
        if limit <= 0:
            return []

        with self._lock:
            # .get() avoids creating an empty history entry for unknown ids
            matches = self._match_history.get(deployment_id, [])
            return list(reversed(matches[-limit:]))

    def register_match_callback(
        self, callback: Callable[[str, LogPatternMatch], None]
    ) -> None:
        """
        Register callback for pattern matches.

        Args:
            callback: Function called with (deployment_id, match) when pattern detected
        """
        with self._lock:
            self._match_callbacks.append(callback)

        self.log_debug(f"Registered match callback: {self._callback_name(callback)}")

    @staticmethod
    def _callback_name(callback: Callable) -> str:
        """
        Return a printable name for a callback.

        Falls back to ``repr`` for callables without ``__name__`` (such as
        ``functools.partial`` objects or callable instances).
        """
        return getattr(callback, "__name__", repr(callback))

    def _process_new_lines(self, deployment_id: str, lines: List[str]) -> None:
        """
        Process new log lines for pattern matching.

        Each line is reported at most once, for the first pattern (in
        registration order) that matches it.

        Args:
            deployment_id: Deployment identifier
            lines: New log lines to process
        """
        # Work on a snapshot so add_pattern() from another thread cannot
        # change the list while it is being iterated.
        with self._lock:
            patterns = list(self._patterns)

        for line in lines:
            for pattern, severity in patterns:
                if not pattern.search(line):
                    continue

                match = LogPatternMatch(
                    deployment_id=deployment_id,
                    pattern=pattern.pattern,
                    line=line,
                    severity=severity,
                )
                self._record_match(deployment_id, match)

                self.log_warning(
                    f"Pattern matched in {deployment_id}: "
                    f"[{severity}] {pattern.pattern[:50]}"
                )

                self._trigger_match_callbacks(deployment_id, match)

                # Only match first pattern per line
                break

    def _record_match(self, deployment_id: str, match: LogPatternMatch) -> None:
        """
        Append a match to the deployment's history, dropping the oldest
        entries beyond ``match_history_limit``.

        Args:
            deployment_id: Deployment the match belongs to
            match: Match to store
        """
        with self._lock:
            history = self._match_history[deployment_id]
            history.append(match)

            # Computed as an explicit count so a limit of 0 keeps nothing
            # (a [-0:] slice would keep everything).
            overflow = len(history) - max(self.match_history_limit, 0)
            if overflow > 0:
                del history[:overflow]

    def _trigger_match_callbacks(
        self, deployment_id: str, match: LogPatternMatch
    ) -> None:
        """
        Trigger registered callbacks for pattern matches.

        A failing callback is logged and does not prevent the remaining
        callbacks from running.

        Args:
            deployment_id: Deployment that has a match
            match: LogPatternMatch with pattern details
        """
        # Snapshot under the lock, invoke outside it: the lock is not
        # reentrant and callbacks may call back into this service.
        with self._lock:
            callbacks = list(self._match_callbacks)

        for callback in callbacks:
            try:
                callback(deployment_id, match)
            except Exception as e:
                self.log_error(
                    f"Error in match callback {self._callback_name(callback)}: {e}"
                )
395
+
396
+ __all__ = ["LogMonitor"]
@@ -0,0 +1,294 @@
1
+ """
2
+ Memory Leak Detector for Claude MPM Framework
3
+ ==============================================
4
+
5
+ WHY: Detects memory leaks BEFORE they cause OOM crashes by analyzing memory
6
+ usage trends over time using a two-point (oldest-to-newest) slope over a rolling window.
7
+
8
+ DESIGN DECISION: Uses rolling window of memory measurements with configurable
9
+ size and threshold. Calculates slope to detect sustained memory growth patterns.
10
+
11
+ ARCHITECTURE:
12
+ - Rolling window of (timestamp, memory_mb) measurements per deployment
13
+ - Slope-based leak detection: MB/minute growth rate
14
+ - Configurable thresholds and window sizes
15
+ - Callback system for leak detection alerts
16
+ - Thread-safe with proper locking
17
+
18
+ USAGE:
19
+ detector = MemoryLeakDetector(
20
+ leak_threshold_mb_per_minute=10.0,
21
+ window_size=100,
22
+ )
23
+ detector.initialize()
24
+
25
+ # Record memory usage periodically
26
+ detector.record_memory_usage(deployment_id, memory_mb)
27
+
28
+ # Check for leaks
29
+ trend = detector.analyze_trend(deployment_id)
30
+ if trend.is_leaking:
31
+ print(f"Leak detected! Slope: {trend.slope_mb_per_minute} MB/min")
32
+ """
33
+
34
+ import threading
35
+ from collections import defaultdict
36
+ from datetime import datetime, timezone
37
+ from typing import Callable, Dict, List, Tuple
38
+
39
+ from claude_mpm.services.core.base import SyncBaseService
40
+ from claude_mpm.services.core.interfaces.stability import IMemoryLeakDetector
41
+ from claude_mpm.services.core.models.stability import MemoryTrend
42
+
43
+
44
class MemoryLeakDetector(SyncBaseService, IMemoryLeakDetector):
    """
    Memory leak detection service using trend analysis.

    WHY: Provides early warning of memory leaks by analyzing memory growth
    patterns over time, enabling preemptive restarts before OOM crashes.

    Algorithm:
        1. Maintain a rolling window of (timestamp, memory_mb) measurements
           per deployment
        2. Calculate the two-point slope (MB per minute) between the oldest
           and the newest measurement in the window
        3. Detect leak if slope exceeds threshold (default: 10 MB/min)

    Thread Safety: Shared state is only read or modified while holding the
    internal lock. Leak callbacks are invoked after the lock has been
    released, so a callback may call back into the detector (for example
    clear_measurements) without deadlocking on the non-reentrant lock.
    """

    def __init__(
        self,
        leak_threshold_mb_per_minute: float = 10.0,
        window_size: int = 100,
    ):
        """
        Initialize memory leak detector.

        Args:
            leak_threshold_mb_per_minute: Threshold for leak detection (default: 10.0)
            window_size: Number of measurements to keep in rolling window (default: 100)
        """
        super().__init__("MemoryLeakDetector")
        self.leak_threshold = leak_threshold_mb_per_minute
        self.window_size = window_size

        # Memory measurements: deployment_id -> List[(timestamp, memory_mb)]
        self._measurements: Dict[str, List[Tuple[datetime, float]]] = defaultdict(list)

        # Thread safety
        self._lock = threading.Lock()

        # Leak detection callbacks
        self._leak_callbacks: List[Callable[[str, MemoryTrend], None]] = []

    def initialize(self) -> bool:
        """
        Initialize the memory leak detector.

        Returns:
            True if initialization successful
        """
        self._initialized = True
        self.log_info(
            f"Memory leak detector initialized "
            f"(threshold={self.leak_threshold} MB/min, window={self.window_size})"
        )
        return True

    def shutdown(self) -> None:
        """Shutdown memory leak detector and clear data."""
        with self._lock:
            self._measurements.clear()
            self._leak_callbacks.clear()

        self._shutdown = True
        self.log_info("Memory leak detector shutdown complete")

    def record_memory_usage(self, deployment_id: str, memory_mb: float) -> None:
        """
        Record a memory usage measurement.

        WHY: Builds historical data for trend analysis. Should be called
        periodically (e.g., every 30s) to collect sufficient data points.

        Args:
            deployment_id: Deployment identifier
            memory_mb: Current memory usage in megabytes
        """
        with self._lock:
            # Take the timestamp under the lock so measurements are appended
            # in the same order in which they were stamped.
            timestamp = datetime.now(tz=timezone.utc)
            window = self._measurements[deployment_id]
            window.append((timestamp, memory_mb))

            # Trim to window size. Dropping the excess from the front
            # (rather than slicing with [-window_size:]) also handles a
            # window_size of 0: [-0:] is the whole list, which would leave
            # the window untrimmed and growing without bound.
            excess = len(window) - self.window_size
            if excess > 0:
                window = window[excess:]
                self._measurements[deployment_id] = window

            # Capture the count while still holding the lock; indexing the
            # defaultdict later could re-create a concurrently cleared entry.
            count = len(window)

        self.log_debug(
            f"Recorded memory usage for {deployment_id}: {memory_mb:.2f}MB "
            f"({count} measurements)"
        )

    def analyze_trend(self, deployment_id: str) -> MemoryTrend:
        """
        Analyze memory usage trend for leak detection.

        WHY: Computes slope of memory usage over time to detect sustained
        growth patterns characteristic of memory leaks.

        Args:
            deployment_id: Deployment identifier

        Returns:
            MemoryTrend with slope analysis and leak detection result. With
            fewer than two measurements the trend is empty (slope 0.0,
            is_leaking False, window_size 0).

        Algorithm:
            slope_mb_per_minute = (recent_memory - old_memory) / time_delta_minutes
            is_leaking = slope_mb_per_minute > threshold
        """
        with self._lock:
            # Work on a private copy: the stored list is appended to by
            # record_memory_usage, and the analysis plus callbacks below run
            # without the lock so callbacks can re-enter the detector.
            measurements = list(self._measurements.get(deployment_id, ()))

        # Need at least 2 measurements for trend analysis
        if len(measurements) < 2:
            return MemoryTrend(
                deployment_id=deployment_id,
                timestamps=[],
                memory_mb=[],
                slope_mb_per_minute=0.0,
                is_leaking=False,
                window_size=0,
                threshold_mb_per_minute=self.leak_threshold,
            )

        # Extract timestamps and memory values
        timestamps = [ts for ts, _ in measurements]
        memory_mb = [mem for _, mem in measurements]

        # Calculate slope between the oldest and newest measurement
        slope = self._calculate_slope(measurements)

        # Detect leak if slope exceeds threshold
        leaking = slope > self.leak_threshold

        trend = MemoryTrend(
            deployment_id=deployment_id,
            timestamps=timestamps,
            memory_mb=memory_mb,
            slope_mb_per_minute=slope,
            is_leaking=leaking,
            window_size=len(measurements),
            threshold_mb_per_minute=self.leak_threshold,
        )

        # Trigger callbacks if leak detected
        if leaking:
            self.log_warning(
                f"Memory leak detected for {deployment_id}: "
                f"{slope:.2f} MB/min (threshold: {self.leak_threshold} MB/min)"
            )
            self._trigger_leak_callbacks(deployment_id, trend)

        return trend

    def is_leaking(self, deployment_id: str) -> bool:
        """
        Check if deployment has a detected memory leak.

        This runs analyze_trend, so registered leak callbacks fire when a
        leak is detected.

        Args:
            deployment_id: Deployment identifier

        Returns:
            True if leak detected (sustained memory growth)
        """
        return self.analyze_trend(deployment_id).is_leaking

    def register_leak_callback(
        self, callback: Callable[[str, MemoryTrend], None]
    ) -> None:
        """
        Register callback for leak detection events.

        Args:
            callback: Function called with (deployment_id, trend) when leak detected
        """
        with self._lock:
            self._leak_callbacks.append(callback)
        self.log_debug(
            f"Registered leak callback: {self._callback_name(callback)}"
        )

    @staticmethod
    def _callback_name(callback: Callable[..., object]) -> str:
        """
        Return a printable name for a callback.

        functools.partial objects and callable instances do not have a
        __name__ attribute, so fall back to repr() instead of raising
        AttributeError while building a log message.
        """
        return getattr(callback, "__name__", None) or repr(callback)

    def _calculate_slope(self, measurements: List[Tuple[datetime, float]]) -> float:
        """
        Calculate memory growth slope between the oldest and newest measurement.

        WHY: A first-to-last slope over the rolling window is cheap to compute
        and reflects net growth across the whole window. It is not a
        least-squares fit, so a spike in the newest sample does influence
        the result.

        Args:
            measurements: List of (timestamp, memory_mb) tuples

        Returns:
            Slope in MB per minute, or 0.0 when there are fewer than two
            measurements or the elapsed time is not positive

        Algorithm:
            Simple two-point slope: (y2 - y1) / (x2 - x1)
            Where x is time in minutes, y is memory in MB
        """
        if len(measurements) < 2:
            return 0.0

        # Get first and last measurements
        first_timestamp, first_memory = measurements[0]
        last_timestamp, last_memory = measurements[-1]

        # Calculate time delta in minutes
        time_delta_seconds = (last_timestamp - first_timestamp).total_seconds()
        time_delta_minutes = time_delta_seconds / 60.0

        # Timestamps come from the wall clock, which can step backwards
        # (e.g. clock adjustment). A zero interval would divide by zero and
        # a negative one would flip the sign of the slope, turning shrinking
        # memory into an apparent leak, so report "no trend" for both.
        if time_delta_minutes <= 0:
            return 0.0

        # Calculate slope (MB per minute)
        memory_delta = last_memory - first_memory
        return memory_delta / time_delta_minutes

    def _trigger_leak_callbacks(self, deployment_id: str, trend: MemoryTrend) -> None:
        """
        Trigger registered callbacks for leak detection.

        An exception raised by one callback is reported via log_error and
        does not prevent the remaining callbacks from running.

        Args:
            deployment_id: Deployment that has a leak
            trend: MemoryTrend with leak analysis
        """
        # Snapshot under the lock, invoke without it: registration may happen
        # concurrently, and callbacks must be free to call back into the
        # detector without deadlocking.
        with self._lock:
            callbacks = list(self._leak_callbacks)

        for callback in callbacks:
            try:
                callback(deployment_id, trend)
            except Exception as e:
                self.log_error(
                    f"Error in leak callback {self._callback_name(callback)}: {e}"
                )

    def get_measurements(self, deployment_id: str) -> List[Tuple[datetime, float]]:
        """
        Get all measurements for a deployment (for testing/debugging).

        Args:
            deployment_id: Deployment identifier

        Returns:
            List of (timestamp, memory_mb) tuples (a copy; modifying it does
            not affect the detector)
        """
        with self._lock:
            return list(self._measurements.get(deployment_id, []))

    def clear_measurements(self, deployment_id: str) -> None:
        """
        Clear measurements for a deployment (e.g., after restart).

        Args:
            deployment_id: Deployment identifier
        """
        with self._lock:
            self._measurements.pop(deployment_id, None)
        self.log_debug(f"Cleared measurements for {deployment_id}")
292
+
293
+
294
+ __all__ = ["MemoryLeakDetector"]