clonebox 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clonebox/backends/libvirt_backend.py +217 -0
- clonebox/backends/qemu_disk.py +52 -0
- clonebox/backends/subprocess_runner.py +56 -0
- clonebox/cli.py +227 -45
- clonebox/cloner.py +327 -189
- clonebox/di.py +176 -0
- clonebox/health/__init__.py +2 -1
- clonebox/health/manager.py +328 -0
- clonebox/health/probes.py +337 -0
- clonebox/interfaces/disk.py +40 -0
- clonebox/interfaces/hypervisor.py +89 -0
- clonebox/interfaces/network.py +33 -0
- clonebox/interfaces/process.py +46 -0
- clonebox/logging.py +125 -0
- clonebox/models.py +2 -2
- clonebox/monitor.py +1 -3
- clonebox/p2p.py +4 -2
- clonebox/resource_monitor.py +162 -0
- clonebox/resources.py +222 -0
- clonebox/rollback.py +172 -0
- clonebox/secrets.py +331 -0
- clonebox/snapshots/manager.py +3 -9
- clonebox/snapshots/models.py +2 -6
- clonebox/validator.py +34 -0
- {clonebox-1.1.4.dist-info → clonebox-1.1.6.dist-info}/METADATA +52 -2
- clonebox-1.1.6.dist-info/RECORD +42 -0
- clonebox-1.1.4.dist-info/RECORD +0 -27
- {clonebox-1.1.4.dist-info → clonebox-1.1.6.dist-info}/WHEEL +0 -0
- {clonebox-1.1.4.dist-info → clonebox-1.1.6.dist-info}/entry_points.txt +0 -0
- {clonebox-1.1.4.dist-info → clonebox-1.1.6.dist-info}/licenses/LICENSE +0 -0
- {clonebox-1.1.4.dist-info → clonebox-1.1.6.dist-info}/top_level.txt +0 -0
clonebox/di.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
"""IoC container for dependency injection in CloneBox."""
|
|
2
|
+
|
|
3
|
+
import inspect
|
|
4
|
+
import threading
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Callable, Dict, Optional, Type, TypeVar
|
|
7
|
+
|
|
8
|
+
T = TypeVar("T")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class ServiceRegistration:
    """Bookkeeping record the container keeps for one registered service."""

    # Callable (class or function) invoked to build the service.
    factory: Callable[..., Any]
    # When True the first built instance is cached and reused.
    singleton: bool = True
    # Cached (or pre-supplied) instance; None until one exists.
    instance: Optional[Any] = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DependencyContainer:
    """
    IoC container for dependency injection.

    Services are registered against an interface (usually a base class) and
    built on demand by ``resolve``. Constructor parameters whose annotations
    the container can resolve are injected automatically.

    Usage:
        container = DependencyContainer()

        # Register services
        container.register(HypervisorBackend, LibvirtBackend, singleton=True)
        container.register(DiskManager, QemuDiskManager)

        # Resolve dependencies
        cloner = container.resolve(SelectiveVMCloner)
    """

    def __init__(self):
        self._registrations: Dict[Type, ServiceRegistration] = {}
        # Must be re-entrant: resolve() holds the lock while
        # _create_instance() calls resolve() again for each constructor
        # dependency. A plain Lock deadlocks on the first nested resolve.
        self._lock = threading.RLock()

    def register(
        self,
        interface: Type[T],
        implementation: Type[T] = None,
        factory: Callable[..., T] = None,
        singleton: bool = True,
        instance: T = None,
    ) -> "DependencyContainer":
        """
        Register a service.

        Exactly one source is used, in this order of precedence:
        ``instance``, then ``factory``, then ``implementation``.

        Args:
            interface: The interface/base class
            implementation: Concrete implementation class
            factory: Factory function to create instance
            singleton: If True, reuse same instance
            instance: Pre-created instance to use (always a singleton)

        Returns:
            The container itself, so calls can be chained.

        Raises:
            ValueError: If none of implementation, factory or instance
                is provided.
        """
        if instance is not None:
            registration = ServiceRegistration(
                factory=lambda: instance,
                singleton=True,
                instance=instance,
            )
        elif factory is not None:
            registration = ServiceRegistration(
                factory=factory,
                singleton=singleton,
            )
        elif implementation is not None:
            registration = ServiceRegistration(
                factory=implementation,
                singleton=singleton,
            )
        else:
            raise ValueError("Must provide implementation, factory, or instance")

        # Mutate the registry under the same lock resolve()/reset() use.
        with self._lock:
            self._registrations[interface] = registration

        return self  # Enable chaining

    def resolve(self, interface: Type[T]) -> T:
        """Resolve a service instance.

        Unregistered classes are auto-constructed (with their own
        dependencies injected) but are never cached.

        Raises:
            KeyError: If ``interface`` is not registered and is not a class.
        """
        with self._lock:
            if interface not in self._registrations:
                # If it's a class and not an interface, try to auto-resolve it
                if inspect.isclass(interface):
                    return self._create_instance(interface)
                raise KeyError(f"No registration for {interface}")

            reg = self._registrations[interface]

            # Return existing instance for singletons
            if reg.singleton and reg.instance is not None:
                return reg.instance

            created = self._create_instance(reg.factory)

            # Store for singleton
            if reg.singleton:
                reg.instance = created

            return created

    def _create_instance(self, factory: Callable) -> Any:
        """Create instance, resolving constructor dependencies.

        Every annotated parameter is resolved through the container and
        passed by keyword. If resolution fails with KeyError/TypeError the
        parameter's own default is used; without a default the error
        propagates.
        """
        try:
            sig = inspect.signature(factory)
        except ValueError:
            # Handle cases where signature can't be inspected (e.g., some built-ins)
            return factory()

        empty = inspect.Parameter.empty
        variadic = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
        kwargs = {}

        for name, param in sig.parameters.items():
            # *args / **kwargs cannot be supplied by name; leave them empty.
            if param.kind in variadic:
                continue
            # Unannotated parameters are left to their defaults (or to fail
            # in the factory call if they are required).
            if param.annotation is empty:
                continue
            try:
                kwargs[name] = self.resolve(param.annotation)
            except (KeyError, TypeError):
                if param.default is empty:
                    raise
                # Otherwise fall back to the parameter's default.

        return factory(**kwargs)

    def has(self, interface: Type) -> bool:
        """Check if service is registered."""
        return interface in self._registrations

    def reset(self) -> None:
        """Reset all singleton instances (registrations are kept)."""
        with self._lock:
            for reg in self._registrations.values():
                reg.instance = None
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Process-wide container; stays None until first requested or explicitly set.
_container: Optional[DependencyContainer] = None


def get_container() -> DependencyContainer:
    """Return the global container, building the default one on first use."""
    global _container
    if _container is not None:
        return _container
    _container = create_default_container()
    return _container
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def set_container(container: DependencyContainer) -> None:
    """Install ``container`` as the global container (useful for testing).

    Subsequent ``get_container()`` calls return this object instead of
    building the default container.
    """
    global _container
    _container = container
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def create_default_container() -> DependencyContainer:
    """Build a container wired with CloneBox's default implementations."""
    # Imported here rather than at module import time.
    from .backends.libvirt_backend import LibvirtBackend
    from .backends.qemu_disk import QemuDiskManager
    from .backends.subprocess_runner import SubprocessRunner
    from .interfaces.disk import DiskManager
    from .interfaces.hypervisor import HypervisorBackend
    from .interfaces.process import ProcessRunner
    from .secrets import SecretsManager

    # register() returns the container, so the defaults can be chained.
    return (
        DependencyContainer()
        .register(HypervisorBackend, LibvirtBackend)
        .register(DiskManager, QemuDiskManager)
        .register(ProcessRunner, SubprocessRunner)
        .register(SecretsManager, SecretsManager)
    )
|
clonebox/health/__init__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Health check system for CloneBox VMs."""
|
|
2
2
|
|
|
3
|
-
from .models import HealthCheckResult, HealthStatus, ProbeConfig
|
|
3
|
+
from .models import HealthCheckResult, HealthStatus, ProbeConfig, ProbeType
|
|
4
4
|
from .probes import HTTPProbe, TCPProbe, CommandProbe, ScriptProbe
|
|
5
5
|
from .manager import HealthCheckManager
|
|
6
6
|
|
|
@@ -8,6 +8,7 @@ __all__ = [
|
|
|
8
8
|
"HealthCheckResult",
|
|
9
9
|
"HealthStatus",
|
|
10
10
|
"ProbeConfig",
|
|
11
|
+
"ProbeType",
|
|
11
12
|
"HTTPProbe",
|
|
12
13
|
"TCPProbe",
|
|
13
14
|
"CommandProbe",
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Health check manager for CloneBox VMs."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
|
|
10
|
+
from .models import HealthCheckResult, HealthStatus, ProbeConfig, ProbeType, VMHealthState
|
|
11
|
+
from .probes import get_probe
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HealthCheckManager:
    """Manage health checks for VMs.

    Runs the configured probes, folds the per-probe results into one
    overall status and keeps an in-memory ``VMHealthState`` per VM name
    for the lifetime of this manager.
    """

    def __init__(self, config_dir: Optional[Path] = None):
        """Create the manager and make sure its directory exists.

        Args:
            config_dir: Directory to create/use; defaults to
                ``~/.local/share/clonebox/health``.
        """
        self._config_dir = config_dir or Path.home() / ".local/share/clonebox/health"
        self._config_dir.mkdir(parents=True, exist_ok=True)
        self._vm_states: Dict[str, VMHealthState] = {}

    def check(
        self,
        vm_name: str,
        probes: List[ProbeConfig],
    ) -> VMHealthState:
        """Run health checks for a VM.

        Disabled probes are skipped. The stored state for ``vm_name`` is
        updated with the new results.

        Args:
            vm_name: Name of VM to check
            probes: List of probe configurations

        Returns:
            VMHealthState with aggregated results
        """
        results = []

        for config in probes:
            if not config.enabled:
                continue

            probe = get_probe(config.probe_type)
            results.append(self._run_probe_with_retry(probe, config))

        # Calculate overall status
        overall = self._calculate_overall_status(results)

        # Update state
        return self._update_vm_state(vm_name, overall, results)

    def check_single(self, config: ProbeConfig) -> HealthCheckResult:
        """Run a single health check.

        Args:
            config: Probe configuration

        Returns:
            HealthCheckResult
        """
        probe = get_probe(config.probe_type)
        return self._run_probe_with_retry(probe, config)

    def check_from_config(
        self,
        vm_name: str,
        config_path: Path,
    ) -> VMHealthState:
        """Run health checks from YAML config file.

        A missing file yields an UNKNOWN state without running anything.
        An empty file, a non-mapping document or a null ``health_checks``
        entry is treated as "no probes configured".

        Args:
            vm_name: Name of VM
            config_path: Path to .clonebox.yaml

        Returns:
            VMHealthState with results
        """
        import yaml

        if not config_path.exists():
            return VMHealthState(
                vm_name=vm_name,
                overall_status=HealthStatus.UNKNOWN,
                last_check=datetime.now(),
            )

        # safe_load returns None for an empty document and may return a
        # list/scalar for other shapes; only a mapping can hold the section.
        loaded = yaml.safe_load(config_path.read_text())
        config = loaded if isinstance(loaded, dict) else {}
        # "health_checks:" with no value parses to None, not a list.
        health_checks = config.get("health_checks") or []

        probes = [ProbeConfig.from_dict(hc) for hc in health_checks]

        return self.check(vm_name, probes)

    def get_state(self, vm_name: str) -> Optional[VMHealthState]:
        """Get current health state for a VM, or None if never checked."""
        return self._vm_states.get(vm_name)

    def wait_healthy(
        self,
        vm_name: str,
        probes: List[ProbeConfig],
        timeout: int = 300,
        check_interval: float = 5.0,
    ) -> bool:
        """Wait until VM becomes healthy.

        Args:
            vm_name: Name of VM
            probes: Probe configurations
            timeout: Maximum wait time in seconds
            check_interval: Time between checks

        Returns:
            True if healthy within timeout, False otherwise
        """
        start = time.time()

        while time.time() - start < timeout:
            state = self.check(vm_name, probes)

            if state.overall_status == HealthStatus.HEALTHY:
                return True

            time.sleep(check_interval)

        return False

    def create_default_probes(self, services: List[str]) -> List[ProbeConfig]:
        """Create default health probes for common services.

        Known service names (matched case-insensitively) get a tailored
        probe; any other name gets a generic ``pgrep`` process check.

        Args:
            services: List of service names (e.g., ["nginx", "postgres"])

        Returns:
            List of ProbeConfig
        """
        import shlex

        defaults = {
            "nginx": ProbeConfig(
                name="nginx",
                probe_type=ProbeType.HTTP,
                url="http://localhost:80/",
                expected_status=200,
                timeout_seconds=5.0,
            ),
            "apache2": ProbeConfig(
                name="apache2",
                probe_type=ProbeType.HTTP,
                url="http://localhost:80/",
                expected_status=200,
                timeout_seconds=5.0,
            ),
            "postgres": ProbeConfig(
                name="postgres",
                probe_type=ProbeType.TCP,
                host="localhost",
                port=5432,
                timeout_seconds=3.0,
            ),
            "postgresql": ProbeConfig(
                name="postgresql",
                probe_type=ProbeType.TCP,
                host="localhost",
                port=5432,
                timeout_seconds=3.0,
            ),
            "mysql": ProbeConfig(
                name="mysql",
                probe_type=ProbeType.TCP,
                host="localhost",
                port=3306,
                timeout_seconds=3.0,
            ),
            "redis": ProbeConfig(
                name="redis",
                probe_type=ProbeType.COMMAND,
                command="redis-cli ping",
                expected_output="PONG",
                timeout_seconds=3.0,
            ),
            "mongodb": ProbeConfig(
                name="mongodb",
                probe_type=ProbeType.TCP,
                host="localhost",
                port=27017,
                timeout_seconds=3.0,
            ),
            "docker": ProbeConfig(
                name="docker",
                probe_type=ProbeType.COMMAND,
                command="docker info",
                expected_exit_code=0,
                timeout_seconds=5.0,
            ),
            "ssh": ProbeConfig(
                name="ssh",
                probe_type=ProbeType.TCP,
                host="localhost",
                port=22,
                timeout_seconds=3.0,
            ),
        }

        probes = []
        for service in services:
            service_lower = service.lower()
            if service_lower in defaults:
                probes.append(defaults[service_lower])
            else:
                # Create generic process check. The name ends up inside a
                # shell command line ("||"), so quote it; plain names such
                # as "nginx" or "my-service" are left unchanged by quote().
                quoted = shlex.quote(service)
                probes.append(
                    ProbeConfig(
                        name=service,
                        probe_type=ProbeType.COMMAND,
                        command=f"pgrep -x {quoted} || pgrep -f {quoted}",
                        expected_exit_code=0,
                        timeout_seconds=3.0,
                    )
                )

        return probes

    def _run_probe_with_retry(
        self,
        probe,
        config: ProbeConfig,
    ) -> HealthCheckResult:
        """Run probe with retry logic.

        ``config.retries`` is the total number of attempts. The probe is
        always run at least once, so ``retries <= 0`` means a single attempt
        instead of silently skipping the check. Returns the first healthy
        result, otherwise the result of the last attempt.
        """
        attempts = max(1, config.retries)
        last_result = None

        for attempt in range(attempts):
            last_result = probe.check(config)

            if last_result.is_healthy:
                return last_result

            # No delay after the final attempt.
            if attempt < attempts - 1:
                time.sleep(config.retry_delay_seconds)

        return last_result

    def _calculate_overall_status(
        self,
        results: List[HealthCheckResult],
    ) -> HealthStatus:
        """Calculate overall health status from results.

        No results -> UNKNOWN; all healthy -> HEALTHY; any unhealthy ->
        UNHEALTHY; everything else (timeouts, unknowns, mixes) -> DEGRADED.
        """
        if not results:
            return HealthStatus.UNKNOWN

        statuses = [r.status for r in results]

        # All healthy = healthy
        if all(s == HealthStatus.HEALTHY for s in statuses):
            return HealthStatus.HEALTHY

        # Any unhealthy = unhealthy
        if any(s == HealthStatus.UNHEALTHY for s in statuses):
            return HealthStatus.UNHEALTHY

        # Timeouts, or a mix of healthy and unknown = degraded
        return HealthStatus.DEGRADED

    def _update_vm_state(
        self,
        vm_name: str,
        overall: HealthStatus,
        results: List[HealthCheckResult],
    ) -> VMHealthState:
        """Update VM health state with new results.

        Creates the state on first use, then records the latest results and
        updates the check/failure counters.
        """
        if vm_name not in self._vm_states:
            self._vm_states[vm_name] = VMHealthState(
                vm_name=vm_name,
                overall_status=overall,
                last_check=datetime.now(),
            )

        state = self._vm_states[vm_name]
        state.overall_status = overall
        state.last_check = datetime.now()
        state.check_results = results
        state.total_checks += 1

        if overall == HealthStatus.HEALTHY:
            state.consecutive_successes += 1
            state.consecutive_failures = 0
        else:
            # Anything other than HEALTHY counts as a failure.
            state.consecutive_failures += 1
            state.consecutive_successes = 0
            state.total_failures += 1

        return state

    def export_metrics(self, vm_name: str) -> Dict[str, Any]:
        """Export health metrics as a dict keyed by Prometheus-style names.

        Returns an empty dict when the VM has no recorded state. Hyphens in
        probe names are replaced with underscores in the metric names.
        """
        state = self._vm_states.get(vm_name)
        if not state:
            return {}

        metrics = {
            "clonebox_health_status": 1 if state.overall_status == HealthStatus.HEALTHY else 0,
            "clonebox_health_consecutive_failures": state.consecutive_failures,
            "clonebox_health_consecutive_successes": state.consecutive_successes,
            "clonebox_health_total_checks": state.total_checks,
            "clonebox_health_failure_rate": state.failure_rate,
        }

        for result in state.check_results:
            probe_name = result.probe_name.replace("-", "_")
            metrics[f"clonebox_probe_{probe_name}_healthy"] = 1 if result.is_healthy else 0
            metrics[f"clonebox_probe_{probe_name}_duration_ms"] = result.duration_ms

        return metrics
|