crackerjack 0.31.10__py3-none-any.whl → 0.31.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crackerjack might be problematic. Click here for more details.
- crackerjack/CLAUDE.md +288 -705
- crackerjack/__main__.py +22 -8
- crackerjack/agents/__init__.py +0 -3
- crackerjack/agents/architect_agent.py +0 -43
- crackerjack/agents/base.py +1 -9
- crackerjack/agents/coordinator.py +2 -148
- crackerjack/agents/documentation_agent.py +109 -81
- crackerjack/agents/dry_agent.py +122 -97
- crackerjack/agents/formatting_agent.py +3 -16
- crackerjack/agents/import_optimization_agent.py +1174 -130
- crackerjack/agents/performance_agent.py +956 -188
- crackerjack/agents/performance_helpers.py +229 -0
- crackerjack/agents/proactive_agent.py +1 -48
- crackerjack/agents/refactoring_agent.py +516 -246
- crackerjack/agents/refactoring_helpers.py +282 -0
- crackerjack/agents/security_agent.py +393 -90
- crackerjack/agents/test_creation_agent.py +1776 -120
- crackerjack/agents/test_specialist_agent.py +59 -15
- crackerjack/agents/tracker.py +0 -102
- crackerjack/api.py +145 -37
- crackerjack/cli/handlers.py +48 -30
- crackerjack/cli/interactive.py +11 -11
- crackerjack/cli/options.py +66 -4
- crackerjack/code_cleaner.py +808 -148
- crackerjack/config/global_lock_config.py +110 -0
- crackerjack/config/hooks.py +43 -64
- crackerjack/core/async_workflow_orchestrator.py +247 -97
- crackerjack/core/autofix_coordinator.py +192 -109
- crackerjack/core/enhanced_container.py +46 -63
- crackerjack/core/file_lifecycle.py +549 -0
- crackerjack/core/performance.py +9 -8
- crackerjack/core/performance_monitor.py +395 -0
- crackerjack/core/phase_coordinator.py +281 -94
- crackerjack/core/proactive_workflow.py +9 -58
- crackerjack/core/resource_manager.py +501 -0
- crackerjack/core/service_watchdog.py +490 -0
- crackerjack/core/session_coordinator.py +4 -8
- crackerjack/core/timeout_manager.py +504 -0
- crackerjack/core/websocket_lifecycle.py +475 -0
- crackerjack/core/workflow_orchestrator.py +343 -209
- crackerjack/dynamic_config.py +50 -9
- crackerjack/errors.py +3 -4
- crackerjack/executors/async_hook_executor.py +63 -13
- crackerjack/executors/cached_hook_executor.py +14 -14
- crackerjack/executors/hook_executor.py +100 -37
- crackerjack/executors/hook_lock_manager.py +856 -0
- crackerjack/executors/individual_hook_executor.py +120 -86
- crackerjack/intelligence/__init__.py +0 -7
- crackerjack/intelligence/adaptive_learning.py +13 -86
- crackerjack/intelligence/agent_orchestrator.py +15 -78
- crackerjack/intelligence/agent_registry.py +12 -59
- crackerjack/intelligence/agent_selector.py +31 -92
- crackerjack/intelligence/integration.py +1 -41
- crackerjack/interactive.py +9 -9
- crackerjack/managers/async_hook_manager.py +25 -8
- crackerjack/managers/hook_manager.py +9 -9
- crackerjack/managers/publish_manager.py +57 -59
- crackerjack/managers/test_command_builder.py +6 -36
- crackerjack/managers/test_executor.py +9 -61
- crackerjack/managers/test_manager.py +17 -63
- crackerjack/managers/test_manager_backup.py +77 -127
- crackerjack/managers/test_progress.py +4 -23
- crackerjack/mcp/cache.py +5 -12
- crackerjack/mcp/client_runner.py +10 -10
- crackerjack/mcp/context.py +64 -6
- crackerjack/mcp/dashboard.py +14 -11
- crackerjack/mcp/enhanced_progress_monitor.py +55 -55
- crackerjack/mcp/file_monitor.py +72 -42
- crackerjack/mcp/progress_components.py +103 -84
- crackerjack/mcp/progress_monitor.py +122 -49
- crackerjack/mcp/rate_limiter.py +12 -12
- crackerjack/mcp/server_core.py +16 -22
- crackerjack/mcp/service_watchdog.py +26 -26
- crackerjack/mcp/state.py +15 -0
- crackerjack/mcp/tools/core_tools.py +95 -39
- crackerjack/mcp/tools/error_analyzer.py +6 -32
- crackerjack/mcp/tools/execution_tools.py +1 -56
- crackerjack/mcp/tools/execution_tools_backup.py +35 -131
- crackerjack/mcp/tools/intelligence_tool_registry.py +0 -36
- crackerjack/mcp/tools/intelligence_tools.py +2 -55
- crackerjack/mcp/tools/monitoring_tools.py +308 -145
- crackerjack/mcp/tools/proactive_tools.py +12 -42
- crackerjack/mcp/tools/progress_tools.py +23 -15
- crackerjack/mcp/tools/utility_tools.py +3 -40
- crackerjack/mcp/tools/workflow_executor.py +40 -60
- crackerjack/mcp/websocket/app.py +0 -3
- crackerjack/mcp/websocket/endpoints.py +206 -268
- crackerjack/mcp/websocket/jobs.py +213 -66
- crackerjack/mcp/websocket/server.py +84 -6
- crackerjack/mcp/websocket/websocket_handler.py +137 -29
- crackerjack/models/config_adapter.py +3 -16
- crackerjack/models/protocols.py +162 -3
- crackerjack/models/resource_protocols.py +454 -0
- crackerjack/models/task.py +3 -3
- crackerjack/monitoring/__init__.py +0 -0
- crackerjack/monitoring/ai_agent_watchdog.py +25 -71
- crackerjack/monitoring/regression_prevention.py +28 -87
- crackerjack/orchestration/advanced_orchestrator.py +44 -78
- crackerjack/orchestration/coverage_improvement.py +10 -60
- crackerjack/orchestration/execution_strategies.py +16 -16
- crackerjack/orchestration/test_progress_streamer.py +61 -53
- crackerjack/plugins/base.py +1 -1
- crackerjack/plugins/managers.py +22 -20
- crackerjack/py313.py +65 -21
- crackerjack/services/backup_service.py +467 -0
- crackerjack/services/bounded_status_operations.py +627 -0
- crackerjack/services/cache.py +7 -9
- crackerjack/services/config.py +35 -52
- crackerjack/services/config_integrity.py +5 -16
- crackerjack/services/config_merge.py +542 -0
- crackerjack/services/contextual_ai_assistant.py +17 -19
- crackerjack/services/coverage_ratchet.py +44 -73
- crackerjack/services/debug.py +25 -39
- crackerjack/services/dependency_monitor.py +52 -50
- crackerjack/services/enhanced_filesystem.py +14 -11
- crackerjack/services/file_hasher.py +1 -1
- crackerjack/services/filesystem.py +1 -12
- crackerjack/services/git.py +71 -47
- crackerjack/services/health_metrics.py +31 -27
- crackerjack/services/initialization.py +276 -428
- crackerjack/services/input_validator.py +760 -0
- crackerjack/services/log_manager.py +16 -16
- crackerjack/services/logging.py +7 -6
- crackerjack/services/metrics.py +43 -43
- crackerjack/services/pattern_cache.py +2 -31
- crackerjack/services/pattern_detector.py +26 -63
- crackerjack/services/performance_benchmarks.py +20 -45
- crackerjack/services/regex_patterns.py +2887 -0
- crackerjack/services/regex_utils.py +537 -0
- crackerjack/services/secure_path_utils.py +683 -0
- crackerjack/services/secure_status_formatter.py +534 -0
- crackerjack/services/secure_subprocess.py +605 -0
- crackerjack/services/security.py +47 -10
- crackerjack/services/security_logger.py +492 -0
- crackerjack/services/server_manager.py +109 -50
- crackerjack/services/smart_scheduling.py +8 -25
- crackerjack/services/status_authentication.py +603 -0
- crackerjack/services/status_security_manager.py +442 -0
- crackerjack/services/thread_safe_status_collector.py +546 -0
- crackerjack/services/tool_version_service.py +1 -23
- crackerjack/services/unified_config.py +36 -58
- crackerjack/services/validation_rate_limiter.py +269 -0
- crackerjack/services/version_checker.py +9 -40
- crackerjack/services/websocket_resource_limiter.py +572 -0
- crackerjack/slash_commands/__init__.py +52 -2
- crackerjack/tools/__init__.py +0 -0
- crackerjack/tools/validate_input_validator_patterns.py +262 -0
- crackerjack/tools/validate_regex_patterns.py +198 -0
- {crackerjack-0.31.10.dist-info → crackerjack-0.31.13.dist-info}/METADATA +197 -12
- crackerjack-0.31.13.dist-info/RECORD +178 -0
- crackerjack/cli/facade.py +0 -104
- crackerjack-0.31.10.dist-info/RECORD +0 -149
- {crackerjack-0.31.10.dist-info → crackerjack-0.31.13.dist-info}/WHEEL +0 -0
- {crackerjack-0.31.10.dist-info → crackerjack-0.31.13.dist-info}/entry_points.txt +0 -0
- {crackerjack-0.31.10.dist-info → crackerjack-0.31.13.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,490 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Service watchdog with timeout protection and automatic recovery.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive monitoring of crackerjack services
|
|
5
|
+
with automatic restart capabilities and hanging prevention.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import contextlib
|
|
10
|
+
import logging
|
|
11
|
+
import signal
|
|
12
|
+
import subprocess
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from enum import Enum
|
|
16
|
+
|
|
17
|
+
from rich.console import Console
|
|
18
|
+
from rich.table import Table
|
|
19
|
+
|
|
20
|
+
from ..services.security_logger import get_security_logger
|
|
21
|
+
from .timeout_manager import TimeoutStrategy, get_timeout_manager
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger("crackerjack.service_watchdog")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ServiceState(Enum):
    """Lifecycle states a monitored service can be in."""

    # Not running (initial state, or after a successful stop).
    STOPPED = "stopped"
    # Launch requested; the process has not yet been confirmed as up.
    STARTING = "starting"
    # Process launched and startup checks passed.
    RUNNING = "running"
    # Shutdown requested; termination in progress.
    STOPPING = "stopping"
    # Start/stop raised, the process exited, or a startup health check failed.
    FAILED = "failed"
    # Reserved for operations that exceeded their time budget.
    TIMEOUT = "timeout"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
|
|
38
|
+
class ServiceConfig:
|
|
39
|
+
"""Configuration for a monitored service."""
|
|
40
|
+
|
|
41
|
+
name: str
|
|
42
|
+
command: list[str]
|
|
43
|
+
health_check_url: str | None = None
|
|
44
|
+
health_check_timeout: float = 5.0
|
|
45
|
+
startup_timeout: float = 30.0
|
|
46
|
+
shutdown_timeout: float = 10.0
|
|
47
|
+
max_restarts: int = 5
|
|
48
|
+
restart_delay: float = 5.0
|
|
49
|
+
restart_backoff_multiplier: float = 2.0
|
|
50
|
+
max_restart_delay: float = 300.0
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class ServiceStatus:
    """Mutable runtime record for one monitored service."""

    # Launch/supervision settings for this service.
    config: ServiceConfig
    # Current lifecycle state.
    state: ServiceState = ServiceState.STOPPED
    # Handle of the launched subprocess, if any.
    process: subprocess.Popen[bytes] | None = None
    # ``time.time()`` stamp of the most recent start attempt (0.0 = never).
    last_start_time: float = 0.0
    last_health_check: float = 0.0
    restart_count: int = 0
    consecutive_failures: int = 0
    # Text of the most recent failure, empty when none was recorded.
    last_error: str = ""
    health_check_failures: int = 0

    @property
    def uptime(self) -> float:
        """Return seconds since the last start, or 0.0 when not running."""
        has_started = (
            self.state == ServiceState.RUNNING and self.last_start_time > 0
        )
        return time.time() - self.last_start_time if has_started else 0.0

    @property
    def is_healthy(self) -> bool:
        """Return True when running, the process is alive, and under 3 probe failures."""
        if self.state != ServiceState.RUNNING:
            return False

        proc = self.process
        if proc is None:
            return False

        # ``poll()`` returns None while the child is still alive.
        if proc.poll() is not None:
            return False

        return self.health_check_failures < 3
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class ServiceWatchdog:
    """Watchdog for monitoring and managing services with timeout protection.

    Services are registered as ``ServiceConfig`` entries, launched as
    subprocesses on request via ``start_service``, and polled by a background
    task that marks a service ``FAILED`` once its process has exited.

    NOTE(review): the monitor loop only detects dead processes. It neither
    restarts them nor runs periodic HTTP probes, so the restart-related
    ``ServiceConfig`` fields and ``ServiceStatus.health_check_failures`` are
    never updated by this class - confirm whether that is intended.
    """

    def __init__(self, console: Console | None = None) -> None:
        """Create a watchdog.

        Args:
            console: Rich console used for user-facing output; a new
                ``Console`` is created when omitted.
        """
        self.console = console or Console()
        self.timeout_manager = get_timeout_manager()
        self.services: dict[str, ServiceStatus] = {}
        self.is_running = False
        self.monitor_task: asyncio.Task[None] | None = None
        # The event loop keeps only weak references to tasks, so
        # fire-and-forget tasks are held here until they finish.
        self._background_tasks: set[asyncio.Task[object]] = set()

        # Default service configurations
        self.default_configs = {
            "mcp_server": ServiceConfig(
                name="MCP Server",
                command=["python", "-m", "crackerjack", "--start-mcp-server"],
                startup_timeout=30.0,
                shutdown_timeout=15.0,
            ),
            "websocket_server": ServiceConfig(
                name="WebSocket Server",
                command=["python", "-m", "crackerjack", "--start-websocket-server"],
                health_check_url="http://localhost:8675/",
                health_check_timeout=3.0,
                startup_timeout=20.0,
                shutdown_timeout=10.0,
            ),
        }

    def _track_task(self, task: asyncio.Task[object]) -> None:
        """Keep a strong reference to *task* until it completes."""
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

    def add_service(self, service_id: str, config: ServiceConfig) -> None:
        """Add a service to monitor, replacing any entry with the same id."""
        self.services[service_id] = ServiceStatus(config=config)
        logger.info(f"Added service {service_id} to watchdog")

    def remove_service(self, service_id: str) -> None:
        """Remove a service from monitoring and terminate its process.

        The entry is detached from ``self.services`` first and its process is
        then terminated directly. Scheduling ``stop_service`` and deleting the
        entry right away would leave the process running, because the
        scheduled call would no longer find the service.
        """
        service = self.services.pop(service_id, None)
        if service is None:
            return

        if service.process is not None:
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                # No event loop in this thread: best-effort synchronous stop.
                with contextlib.suppress(OSError):
                    service.process.terminate()
            else:
                self._track_task(
                    asyncio.create_task(self._terminate_process(service))
                )

        logger.info(f"Removed service {service_id} from watchdog")

    async def start_watchdog(self) -> None:
        """Start the watchdog monitoring (no-op when already running)."""
        if self.is_running:
            return

        self.is_running = True

        # Add default services, but never clobber one the caller registered
        # under the same id before starting the watchdog.
        for service_id, config in self.default_configs.items():
            if service_id not in self.services:
                self.add_service(service_id, config)

        # Start monitoring task with timeout protection
        self.monitor_task = asyncio.create_task(self._monitor_services())

        # Setup signal handlers for graceful shutdown
        self._setup_signal_handlers()

        self.console.print("[green]🐕 Service Watchdog started[/green]")
        logger.info("Service watchdog started")

    async def stop_watchdog(self) -> None:
        """Stop the watchdog and all monitored services."""
        if not self.is_running:
            return

        self.is_running = False

        # Cancel monitoring task
        if self.monitor_task and not self.monitor_task.done():
            self.monitor_task.cancel()
            try:
                await self.monitor_task
            except asyncio.CancelledError:
                pass

        # Stop all services concurrently; individual failures are collected
        # by gather instead of aborting the remaining shutdowns.
        stop_tasks = [self.stop_service(service_id) for service_id in self.services]
        if stop_tasks:
            await asyncio.gather(*stop_tasks, return_exceptions=True)

        self.console.print("[yellow]🐕 Service Watchdog stopped[/yellow]")
        logger.info("Service watchdog stopped")

    async def start_service(self, service_id: str) -> bool:
        """Start a specific service with timeout protection.

        Returns:
            True when the service was started; False when it is unknown,
            already running/starting, or startup failed.
        """
        if not self._validate_service_start_request(service_id):
            return False

        service = self.services[service_id]

        try:
            return await self._execute_service_startup(service_id, service)
        except Exception as e:
            return self._handle_service_start_failure(service, service_id, e)

    def _validate_service_start_request(self, service_id: str) -> bool:
        """Return True when the service exists and is not running or starting."""
        service = self.services.get(service_id)
        if service is None:
            return False

        return service.state not in (ServiceState.RUNNING, ServiceState.STARTING)

    async def _execute_service_startup(
        self, service_id: str, service: ServiceStatus
    ) -> bool:
        """Execute the service startup process with timeout protection."""
        async with self.timeout_manager.timeout_context(
            f"start_service_{service_id}",
            timeout=service.config.startup_timeout,
            strategy=TimeoutStrategy.FAIL_FAST,
        ):
            self._prepare_service_startup(service)

            if not await self._start_service_process(service):
                return False

            if not await self._verify_service_health(service):
                return False

            self._finalize_successful_startup(service, service_id)
            return True

    def _prepare_service_startup(self, service: ServiceStatus) -> None:
        """Mark the service as starting and stamp the start time."""
        service.state = ServiceState.STARTING
        service.last_start_time = time.time()

    async def _start_service_process(self, service: ServiceStatus) -> bool:
        """Launch the service process and verify it survives the first 2 seconds."""
        # Start the service process with security logging
        security_logger = get_security_logger()
        security_logger.log_subprocess_execution(
            command=service.config.command,
            purpose="service_watchdog_start",
        )

        # Nothing in this class reads the child's output. With PIPE the child
        # would block as soon as the OS pipe buffer filled up, so its output
        # is discarded instead.
        service.process = subprocess.Popen(
            service.config.command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            start_new_session=True,
        )

        # Wait for process to stabilize
        await asyncio.sleep(2)

        # Check if process is still running
        if service.process.poll() is not None:
            service.state = ServiceState.FAILED
            service.last_error = "Process exited immediately"
            return False

        return True

    async def _verify_service_health(self, service: ServiceStatus) -> bool:
        """Probe the health URL once, if configured; terminate on failure."""
        if not service.config.health_check_url:
            return True

        if await self._perform_health_check(service):
            return True

        await self._terminate_process(service)
        service.state = ServiceState.FAILED
        service.last_error = "Health check failed"
        return False

    def _finalize_successful_startup(
        self, service: ServiceStatus, service_id: str
    ) -> None:
        """Mark the service as running and reset its failure counters."""
        service.state = ServiceState.RUNNING
        service.consecutive_failures = 0
        service.health_check_failures = 0

        self.console.print(f"[green]✅ Started {service.config.name}[/green]")
        logger.info(f"Started service {service_id}")

    def _handle_service_start_failure(
        self, service: ServiceStatus, service_id: str, error: Exception
    ) -> bool:
        """Record a startup failure, clean up the process, and return False."""
        service.state = ServiceState.FAILED
        service.last_error = str(error)
        service.consecutive_failures += 1

        if service.process:
            # Called from within start_service, so a loop is running. The
            # task is tracked so it cannot be garbage-collected mid-run.
            self._track_task(asyncio.create_task(self._terminate_process(service)))

        self.console.print(
            f"[red]❌ Failed to start {service.config.name}: {error}[/red]"
        )
        logger.error(f"Failed to start service {service_id}: {error}")
        return False

    async def stop_service(self, service_id: str) -> bool:
        """Stop a specific service with timeout protection.

        Returns:
            True when the service is stopped (or already was); False when it
            is unknown or stopping raised.
        """
        if service_id not in self.services:
            return False

        service = self.services[service_id]

        if service.state == ServiceState.STOPPED:
            return True

        try:
            async with self.timeout_manager.timeout_context(
                f"stop_service_{service_id}",
                timeout=service.config.shutdown_timeout,
                strategy=TimeoutStrategy.FAIL_FAST,
            ):
                service.state = ServiceState.STOPPING

                if service.process:
                    await self._terminate_process(service)

                service.state = ServiceState.STOPPED
                service.process = None

                self.console.print(f"[yellow]⏹️ Stopped {service.config.name}[/yellow]")
                logger.info(f"Stopped service {service_id}")
                return True

        except Exception as e:
            service.state = ServiceState.FAILED
            service.last_error = str(e)

            self.console.print(
                f"[red]❌ Failed to stop {service.config.name}: {e}[/red]"
            )
            logger.error(f"Failed to stop service {service_id}: {e}")
            return False

    async def _monitor_services(self) -> None:
        """Main monitoring loop with timeout protection."""
        while self.is_running:
            try:
                async with self.timeout_manager.timeout_context(
                    "monitor_services",
                    timeout=30.0,  # Monitor cycle timeout
                    strategy=TimeoutStrategy.GRACEFUL_DEGRADATION,
                ):
                    # Iterate over a snapshot: services may be added or
                    # removed while a check is awaiting, which would break
                    # iteration over the live dict.
                    for service_id, service in list(self.services.items()):
                        if not self.is_running:  # Check if shutdown requested
                            break

                        try:
                            await self._check_service_health(service_id, service)
                        except Exception as e:
                            logger.error(f"Error checking service {service_id}: {e}")

                    # Wait before next check cycle
                    await asyncio.sleep(10)  # Check every 10 seconds

            except Exception as e:
                logger.error(f"Monitor services error: {e}")
                await asyncio.sleep(30)  # Longer delay on error

    async def _check_service_health(
        self, service_id: str, service: ServiceStatus
    ) -> None:
        """Mark a running service as failed when its process has exited."""
        if service.state != ServiceState.RUNNING:
            return

        process = service.process
        if process is None or process.poll() is None:
            # No process handle, or the process is still alive.
            return

        service.state = ServiceState.FAILED
        service.last_error = f"Process died with exit code {process.returncode}"
        service.consecutive_failures += 1

        self.console.print(f"[red]💀 {service.config.name} process died[/red]")

    async def _perform_health_check(self, service: ServiceStatus) -> bool:
        """Perform HTTP health check with timeout protection.

        Returns:
            True when no URL is configured or the endpoint answers 200;
            False on any other status or on any error (including a missing
            ``aiohttp`` installation or a timeout).
        """
        if not service.config.health_check_url:
            return True

        try:
            import aiohttp

            async with self.timeout_manager.timeout_context(
                "health_check",
                timeout=service.config.health_check_timeout,
                strategy=TimeoutStrategy.FAIL_FAST,
            ):
                async with aiohttp.ClientSession() as session:
                    async with session.get(service.config.health_check_url) as response:
                        return response.status == 200

        except Exception:
            # Any failure counts as "unhealthy"; the caller decides what to do.
            return False

    async def _terminate_process(self, service: ServiceStatus) -> None:
        """Terminate the service process: SIGTERM first, SIGKILL after 5 seconds."""
        if not service.process:
            return

        try:
            # Try graceful termination first
            service.process.terminate()

            # Wait for graceful shutdown
            try:
                await asyncio.wait_for(
                    self._wait_for_process_exit(service.process), timeout=5.0
                )
            except TimeoutError:
                # Force kill if graceful shutdown fails
                service.process.kill()
                await asyncio.wait_for(
                    self._wait_for_process_exit(service.process), timeout=2.0
                )

        except Exception as e:
            logger.warning(f"Error terminating process: {e}")
            # Last resort: force kill
            with contextlib.suppress(Exception):
                service.process.kill()

    async def _wait_for_process_exit(self, process: subprocess.Popen[bytes]) -> None:
        """Poll every 0.1 s until the process has exited."""
        while process.poll() is None:
            await asyncio.sleep(0.1)

    def _setup_signal_handlers(self) -> None:
        """Install SIGINT/SIGTERM handlers that stop the watchdog.

        Uses ``loop.add_signal_handler`` so the callback runs inside the
        event loop. Where the loop does not support it, falls back to
        ``signal.signal`` and hands the request over to the loop
        thread-safely. Does nothing when no event loop is running.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            logger.warning("No running event loop; signal handlers not installed")
            return

        def request_stop(signum: int) -> None:
            """Schedule watchdog shutdown; always runs inside the loop."""
            logger.info(f"Received signal {signum}, stopping watchdog...")
            self._track_task(asyncio.create_task(self.stop_watchdog()))

        def signal_handler(signum: int, frame: object) -> None:  # noqa: ARG001
            """Handle termination signals delivered via ``signal.signal``."""
            _ = frame  # Signal handler frame - required by signal API
            loop.call_soon_threadsafe(request_stop, signum)

        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                loop.add_signal_handler(sig, request_stop, int(sig))
            except NotImplementedError:
                # Event loops without signal support (e.g. on Windows).
                try:
                    signal.signal(sig, signal_handler)
                except ValueError as e:
                    # signal.signal only works in the main thread.
                    logger.warning(
                        f"Could not install handler for signal {int(sig)}: {e}"
                    )
            except (RuntimeError, ValueError) as e:
                logger.warning(f"Could not install handler for signal {int(sig)}: {e}")

    def get_service_status(self, service_id: str) -> ServiceStatus | None:
        """Get status of a specific service, or None when unknown."""
        return self.services.get(service_id)

    def get_all_services_status(self) -> dict[str, ServiceStatus]:
        """Get a shallow copy of the id -> status mapping."""
        return self.services.copy()

    @staticmethod
    def _format_state(service: ServiceStatus) -> str:
        """Return the Rich-markup status label for *service*."""
        state = service.state
        if state == ServiceState.RUNNING:
            if service.is_healthy:
                return "[green]🟢 Running[/green]"
            # Running on paper but the process is gone or probes failed.
            # This case used to fall through and be shown as "Stopped".
            return "[yellow]🟠 Unhealthy[/yellow]"
        if state == ServiceState.STARTING:
            return "[yellow]🟡 Starting[/yellow]"
        if state == ServiceState.STOPPING:
            return "[yellow]🟡 Stopping[/yellow]"
        if state == ServiceState.FAILED:
            return "[red]🔴 Failed[/red]"
        if state == ServiceState.TIMEOUT:
            return "[red]⏰ Timeout[/red]"
        return "[dim]⚫ Stopped[/dim]"

    @staticmethod
    def _format_uptime(uptime: float) -> str:
        """Render seconds as hours, minutes, or seconds; '-' when zero."""
        if uptime > 3600:
            return f"{uptime / 3600:.1f}h"
        if uptime > 60:
            return f"{uptime / 60:.1f}m"
        if uptime > 0:
            return f"{uptime:.0f}s"
        return "-"

    def print_status_report(self) -> None:
        """Print formatted status report."""
        self.console.print("\n[bold blue]🐕 Service Watchdog Status[/bold blue]")
        self.console.print("=" * 50)

        if not self.services:
            self.console.print("[dim]No services configured[/dim]")
            return

        table = Table()
        table.add_column("Service")
        table.add_column("Status")
        table.add_column("Uptime")

        for service in self.services.values():
            table.add_row(
                service.config.name,
                self._format_state(service),
                self._format_uptime(service.uptime),
            )

        self.console.print(table)
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
# Process-wide watchdog singleton, created lazily on first request.
_global_watchdog: ServiceWatchdog | None = None


def get_service_watchdog(console: Console | None = None) -> ServiceWatchdog:
    """Return the shared watchdog, creating it on the first call.

    ``console`` is only used when the instance is created; later calls
    return the existing instance unchanged.
    """
    global _global_watchdog
    watchdog = _global_watchdog
    if watchdog is None:
        watchdog = ServiceWatchdog(console)
        _global_watchdog = watchdog
    return watchdog
|
|
@@ -146,12 +146,12 @@ class SessionCoordinator:
|
|
|
146
146
|
if success:
|
|
147
147
|
self.complete_task(
|
|
148
148
|
"workflow",
|
|
149
|
-
f"Completed successfully in {total_time
|
|
149
|
+
f"Completed successfully in {total_time: .1f}s",
|
|
150
150
|
)
|
|
151
151
|
else:
|
|
152
152
|
self.complete_task(
|
|
153
153
|
"workflow",
|
|
154
|
-
f"Completed with issues in {total_time
|
|
154
|
+
f"Completed with issues in {total_time: .1f}s",
|
|
155
155
|
)
|
|
156
156
|
|
|
157
157
|
def register_cleanup(self, cleanup_handler: t.Callable[[], None]) -> None:
|
|
@@ -207,10 +207,9 @@ class SessionCoordinator:
|
|
|
207
207
|
|
|
208
208
|
def _cleanup_coverage_files(self, keep_recent: int = 10) -> None:
|
|
209
209
|
with suppress(Exception):
|
|
210
|
-
# Clean up coverage files from cache directory
|
|
211
210
|
cache_dir = Path.home() / ".cache" / "crackerjack" / "coverage"
|
|
212
211
|
if cache_dir.exists():
|
|
213
|
-
pattern = ".coverage*"
|
|
212
|
+
pattern = ".coverage *"
|
|
214
213
|
coverage_files = sorted(
|
|
215
214
|
cache_dir.glob(pattern),
|
|
216
215
|
key=lambda p: p.stat().st_mtime,
|
|
@@ -220,7 +219,6 @@ class SessionCoordinator:
|
|
|
220
219
|
with suppress(FileNotFoundError, PermissionError):
|
|
221
220
|
old_file.unlink()
|
|
222
221
|
|
|
223
|
-
# Also clean up any legacy coverage files from project root
|
|
224
222
|
pattern = ".coverage.*"
|
|
225
223
|
coverage_files = sorted(
|
|
226
224
|
self.pkg_path.glob(pattern),
|
|
@@ -232,11 +230,9 @@ class SessionCoordinator:
|
|
|
232
230
|
old_file.unlink()
|
|
233
231
|
|
|
234
232
|
def _cleanup_pycache_directories(self) -> None:
|
|
235
|
-
"""Remove __pycache__ directories from the package to keep repo clean."""
|
|
236
233
|
with suppress(Exception):
|
|
237
234
|
import shutil
|
|
238
235
|
|
|
239
|
-
# Clean __pycache__ directories in package
|
|
240
236
|
for pycache_dir in self.pkg_path.rglob("__pycache__"):
|
|
241
237
|
if pycache_dir.is_dir():
|
|
242
238
|
with suppress(FileNotFoundError, PermissionError):
|
|
@@ -281,7 +277,7 @@ class SessionCoordinator:
|
|
|
281
277
|
|
|
282
278
|
except Exception as e:
|
|
283
279
|
self.console.print(
|
|
284
|
-
f"[dim yellow]Warning: Could not update progress file: {e}[/dim yellow]",
|
|
280
|
+
f"[dim yellow]Warning: Could not update progress file: {e}[/ dim yellow]",
|
|
285
281
|
)
|
|
286
282
|
|
|
287
283
|
def update_stage(self, stage: str, status: str) -> None:
|