invarlock 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +33 -0
- invarlock/__main__.py +10 -0
- invarlock/_data/runtime/profiles/ci_cpu.yaml +15 -0
- invarlock/_data/runtime/profiles/release.yaml +23 -0
- invarlock/_data/runtime/tiers.yaml +76 -0
- invarlock/adapters/__init__.py +102 -0
- invarlock/adapters/_capabilities.py +45 -0
- invarlock/adapters/auto.py +99 -0
- invarlock/adapters/base.py +530 -0
- invarlock/adapters/base_types.py +85 -0
- invarlock/adapters/hf_bert.py +852 -0
- invarlock/adapters/hf_gpt2.py +403 -0
- invarlock/adapters/hf_llama.py +485 -0
- invarlock/adapters/hf_mixin.py +383 -0
- invarlock/adapters/hf_onnx.py +112 -0
- invarlock/adapters/hf_t5.py +137 -0
- invarlock/adapters/py.typed +1 -0
- invarlock/assurance/__init__.py +43 -0
- invarlock/cli/__init__.py +8 -0
- invarlock/cli/__main__.py +8 -0
- invarlock/cli/_evidence.py +25 -0
- invarlock/cli/_json.py +75 -0
- invarlock/cli/adapter_auto.py +162 -0
- invarlock/cli/app.py +287 -0
- invarlock/cli/commands/__init__.py +26 -0
- invarlock/cli/commands/certify.py +403 -0
- invarlock/cli/commands/doctor.py +1358 -0
- invarlock/cli/commands/explain_gates.py +151 -0
- invarlock/cli/commands/export_html.py +100 -0
- invarlock/cli/commands/plugins.py +1331 -0
- invarlock/cli/commands/report.py +354 -0
- invarlock/cli/commands/run.py +4146 -0
- invarlock/cli/commands/verify.py +1040 -0
- invarlock/cli/config.py +396 -0
- invarlock/cli/constants.py +68 -0
- invarlock/cli/device.py +92 -0
- invarlock/cli/doctor_helpers.py +74 -0
- invarlock/cli/errors.py +6 -0
- invarlock/cli/overhead_utils.py +60 -0
- invarlock/cli/provenance.py +66 -0
- invarlock/cli/utils.py +41 -0
- invarlock/config.py +56 -0
- invarlock/core/__init__.py +62 -0
- invarlock/core/abi.py +15 -0
- invarlock/core/api.py +274 -0
- invarlock/core/auto_tuning.py +317 -0
- invarlock/core/bootstrap.py +226 -0
- invarlock/core/checkpoint.py +221 -0
- invarlock/core/contracts.py +73 -0
- invarlock/core/error_utils.py +64 -0
- invarlock/core/events.py +298 -0
- invarlock/core/exceptions.py +95 -0
- invarlock/core/registry.py +481 -0
- invarlock/core/retry.py +146 -0
- invarlock/core/runner.py +2041 -0
- invarlock/core/types.py +154 -0
- invarlock/edits/__init__.py +12 -0
- invarlock/edits/_edit_utils.py +249 -0
- invarlock/edits/_external_utils.py +268 -0
- invarlock/edits/noop.py +47 -0
- invarlock/edits/py.typed +1 -0
- invarlock/edits/quant_rtn.py +801 -0
- invarlock/edits/registry.py +166 -0
- invarlock/eval/__init__.py +23 -0
- invarlock/eval/bench.py +1207 -0
- invarlock/eval/bootstrap.py +50 -0
- invarlock/eval/data.py +2052 -0
- invarlock/eval/metrics.py +2167 -0
- invarlock/eval/primary_metric.py +767 -0
- invarlock/eval/probes/__init__.py +24 -0
- invarlock/eval/probes/fft.py +139 -0
- invarlock/eval/probes/mi.py +213 -0
- invarlock/eval/probes/post_attention.py +323 -0
- invarlock/eval/providers/base.py +67 -0
- invarlock/eval/providers/seq2seq.py +111 -0
- invarlock/eval/providers/text_lm.py +113 -0
- invarlock/eval/providers/vision_text.py +93 -0
- invarlock/eval/py.typed +1 -0
- invarlock/guards/__init__.py +18 -0
- invarlock/guards/_contracts.py +9 -0
- invarlock/guards/invariants.py +640 -0
- invarlock/guards/policies.py +805 -0
- invarlock/guards/py.typed +1 -0
- invarlock/guards/rmt.py +2097 -0
- invarlock/guards/spectral.py +1419 -0
- invarlock/guards/tier_config.py +354 -0
- invarlock/guards/variance.py +3298 -0
- invarlock/guards_ref/__init__.py +15 -0
- invarlock/guards_ref/rmt_ref.py +40 -0
- invarlock/guards_ref/spectral_ref.py +135 -0
- invarlock/guards_ref/variance_ref.py +60 -0
- invarlock/model_profile.py +353 -0
- invarlock/model_utils.py +221 -0
- invarlock/observability/__init__.py +10 -0
- invarlock/observability/alerting.py +535 -0
- invarlock/observability/core.py +546 -0
- invarlock/observability/exporters.py +565 -0
- invarlock/observability/health.py +588 -0
- invarlock/observability/metrics.py +457 -0
- invarlock/observability/py.typed +1 -0
- invarlock/observability/utils.py +553 -0
- invarlock/plugins/__init__.py +12 -0
- invarlock/plugins/hello_guard.py +33 -0
- invarlock/plugins/hf_awq_adapter.py +82 -0
- invarlock/plugins/hf_bnb_adapter.py +79 -0
- invarlock/plugins/hf_gptq_adapter.py +78 -0
- invarlock/plugins/py.typed +1 -0
- invarlock/py.typed +1 -0
- invarlock/reporting/__init__.py +7 -0
- invarlock/reporting/certificate.py +3221 -0
- invarlock/reporting/certificate_schema.py +244 -0
- invarlock/reporting/dataset_hashing.py +215 -0
- invarlock/reporting/guards_analysis.py +948 -0
- invarlock/reporting/html.py +32 -0
- invarlock/reporting/normalizer.py +235 -0
- invarlock/reporting/policy_utils.py +517 -0
- invarlock/reporting/primary_metric_utils.py +265 -0
- invarlock/reporting/render.py +1442 -0
- invarlock/reporting/report.py +903 -0
- invarlock/reporting/report_types.py +278 -0
- invarlock/reporting/utils.py +175 -0
- invarlock/reporting/validate.py +631 -0
- invarlock/security.py +176 -0
- invarlock/sparsity_utils.py +323 -0
- invarlock/utils/__init__.py +150 -0
- invarlock/utils/digest.py +45 -0
- invarlock-0.2.0.dist-info/METADATA +586 -0
- invarlock-0.2.0.dist-info/RECORD +132 -0
- invarlock-0.2.0.dist-info/WHEEL +5 -0
- invarlock-0.2.0.dist-info/entry_points.txt +20 -0
- invarlock-0.2.0.dist-info/licenses/LICENSE +201 -0
- invarlock-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,588 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Health checking and status monitoring.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import time
|
|
7
|
+
import traceback
|
|
8
|
+
from collections.abc import Callable
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
import psutil
|
|
14
|
+
import torch
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HealthStatus(Enum):
    """Severity levels a health check can report.

    Each member's value is the lowercase string emitted when a status is
    serialized (for example by ``ComponentHealth.to_dict``).
    """

    # Reported when a check finds nothing wrong.
    HEALTHY = "healthy"
    # Reported when a check finds a degraded but non-fatal condition.
    WARNING = "warning"
    # Reported when a check fails or a hard threshold is exceeded.
    CRITICAL = "critical"
    # Reported when no result is available for a component.
    UNKNOWN = "unknown"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class ComponentHealth:
    """Result of a single health check for one named component."""

    # Identifier of the component the result belongs to.
    name: str
    # Severity reported by the check.
    status: HealthStatus
    # Human-readable one-line summary of the outcome.
    message: str
    # Free-form, check-specific measurements or error information.
    details: dict[str, Any]
    # Time the result was produced; the checks in this file use time.time().
    timestamp: float

    @property
    def healthy(self) -> bool:
        """Return True only when ``status`` equals ``HealthStatus.HEALTHY``."""
        is_ok = self.status == HealthStatus.HEALTHY
        return is_ok

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of this result.

        The status is flattened to its string value and the derived
        ``healthy`` flag is included. ``details`` is passed through as the
        same object (no copy is made).
        """
        snapshot: dict[str, Any] = {}
        snapshot["name"] = self.name
        snapshot["status"] = self.status.value
        snapshot["message"] = self.message
        snapshot["details"] = self.details
        snapshot["timestamp"] = self.timestamp
        snapshot["healthy"] = self.healthy
        return snapshot
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class HealthChecker:
    """Registry and runner for named health checks.

    A check is a zero-argument callable returning a ``ComponentHealth``.
    The latest outcome of every executed check is cached in
    ``last_results``; ``get_overall_status`` and ``get_summary`` read only
    that cache and never execute checks themselves.
    """

    def __init__(self):
        """Create empty registries and install the built-in system checks."""
        self.logger = logging.getLogger(__name__)
        self.health_checks: dict[str, Callable[[], ComponentHealth]] = {}
        self.last_results: dict[str, ComponentHealth] = {}

        # Built-in checks: memory, cpu, disk, gpu, pytorch.
        self._register_default_checks()

    def register_check(self, name: str, check_func: Callable[[], ComponentHealth]):
        """Store ``check_func`` under ``name`` (replacing any previous entry)."""
        self.health_checks[name] = check_func
        self.logger.info(f"Registered health check: {name}")

    def check_component(self, name: str) -> ComponentHealth:
        """Run the check registered under ``name`` and cache its outcome.

        Args:
            name: Key of the check to execute.

        Returns:
            The check's own result; an UNKNOWN result (not cached) when no
            check is registered under ``name``; or a CRITICAL result carrying
            the error text and traceback when the check raises.
        """
        try:
            check = self.health_checks[name]
        except KeyError:
            return ComponentHealth(
                name=name,
                status=HealthStatus.UNKNOWN,
                message=f"No health check registered for {name}",
                details={},
                timestamp=time.time(),
            )

        try:
            outcome = check()
        except Exception as exc:
            reason = str(exc)
            # format_exc() must run inside the handler to see the active exception.
            outcome = ComponentHealth(
                name=name,
                status=HealthStatus.CRITICAL,
                message=f"Health check failed: {reason}",
                details={"error": reason, "traceback": traceback.format_exc()},
                timestamp=time.time(),
            )

        self.last_results[name] = outcome
        return outcome

    def check_all(self) -> dict[str, ComponentHealth]:
        """Run every registered check and return the results keyed by name."""
        return {name: self.check_component(name) for name in self.health_checks}

    def get_overall_status(self) -> HealthStatus:
        """Collapse the cached results into a single status.

        Returns UNKNOWN when nothing has been checked yet. Otherwise CRITICAL
        takes precedence over WARNING, HEALTHY requires every cached result
        to be HEALTHY, and any other mix yields UNKNOWN.
        """
        if not self.last_results:
            return HealthStatus.UNKNOWN

        observed = [entry.status for entry in self.last_results.values()]

        # Worst status wins, in this order of precedence.
        for level in (HealthStatus.CRITICAL, HealthStatus.WARNING):
            if level in observed:
                return level

        if all(status == HealthStatus.HEALTHY for status in observed):
            return HealthStatus.HEALTHY
        return HealthStatus.UNKNOWN

    def get_summary(self) -> dict[str, Any]:
        """Return a dictionary report built from the cached results.

        The report holds the overall status, the number of registered checks,
        a per-status tally of cached results, the newest result timestamp
        (0 when the cache is empty) and each cached result as a dict.
        """
        overall = self.get_overall_status()
        cached = self.last_results

        tally = dict.fromkeys((status.value for status in HealthStatus), 0)
        for entry in cached.values():
            tally[entry.status.value] += 1

        if cached:
            newest = max(entry.timestamp for entry in cached.values())
        else:
            newest = 0

        return {
            "overall_status": overall.value,
            "total_components": len(self.health_checks),
            "status_counts": tally,
            "last_check": newest,
            "components": {key: entry.to_dict() for key, entry in cached.items()},
        }

    def _register_default_checks(self):
        """Define and register the memory, cpu, disk, gpu and pytorch checks."""
        gib = 1024**3

        def _result(component, status, message, details) -> ComponentHealth:
            """Build a result stamped with the current time."""
            return ComponentHealth(
                name=component,
                status=status,
                message=message,
                details=details,
                timestamp=time.time(),
            )

        def _failure(component, message, exc, status=HealthStatus.CRITICAL):
            """Build the result returned when a check itself raised."""
            return _result(component, status, message, {"error": str(exc)})

        def _classify(value, warn_above, critical_above, critical, warning, normal):
            """Grade ``value`` against two thresholds.

            Returns ``(status, message)`` where the message is the matching
            ``str.format`` template filled with ``value``.
            """
            if value > critical_above:
                return HealthStatus.CRITICAL, critical.format(value)
            if value > warn_above:
                return HealthStatus.WARNING, warning.format(value)
            return HealthStatus.HEALTHY, normal.format(value)

        def check_memory() -> ComponentHealth:
            """Grade system memory usage (warning above 80%, critical above 90%)."""
            try:
                vm = psutil.virtual_memory()
                used_pct = vm.percent
                status, message = _classify(
                    used_pct,
                    80,
                    90,
                    critical="Critical memory usage: {:.1f}%",
                    warning="High memory usage: {:.1f}%",
                    normal="Memory usage normal: {:.1f}%",
                )
                details = {
                    "percent": used_pct,
                    "available_gb": vm.available / gib,
                    "used_gb": vm.used / gib,
                    "total_gb": vm.total / gib,
                }
                return _result("memory", status, message, details)
            except Exception as e:
                return _failure("memory", f"Failed to check memory: {e}", e)

        def check_cpu() -> ComponentHealth:
            """Grade CPU usage (warning above 85%, critical above 95%)."""
            try:
                # NOTE: interval=1 asks psutil to sample over one second,
                # so this check takes about that long to return.
                busy_pct = psutil.cpu_percent(interval=1)
                status, message = _classify(
                    busy_pct,
                    85,
                    95,
                    critical="Critical CPU usage: {:.1f}%",
                    warning="High CPU usage: {:.1f}%",
                    normal="CPU usage normal: {:.1f}%",
                )
                details = {
                    "percent": busy_pct,
                    "core_count": psutil.cpu_count(),
                    # getloadavg may be missing from the installed psutil.
                    "load_avg": psutil.getloadavg()
                    if hasattr(psutil, "getloadavg")
                    else None,
                }
                return _result("cpu", status, message, details)
            except Exception as e:
                return _failure("cpu", f"Failed to check CPU: {e}", e)

        def check_disk() -> ComponentHealth:
            """Grade usage of the "/" filesystem (warning >85%, critical >95%)."""
            try:
                usage = psutil.disk_usage("/")
                used_pct = (usage.used / usage.total) * 100
                status, message = _classify(
                    used_pct,
                    85,
                    95,
                    critical="Critical disk usage: {:.1f}%",
                    warning="High disk usage: {:.1f}%",
                    normal="Disk usage normal: {:.1f}%",
                )
                details = {
                    "percent": used_pct,
                    "free_gb": usage.free / gib,
                    "used_gb": usage.used / gib,
                    "total_gb": usage.total / gib,
                }
                return _result("disk", status, message, details)
            except Exception as e:
                return _failure("disk", f"Failed to check disk: {e}", e)

        def check_gpu() -> ComponentHealth:
            """Grade CUDA memory usage by the most heavily used device.

            Reports HEALTHY when CUDA is unavailable. A failure of the check
            itself is reported as WARNING rather than CRITICAL.
            """
            try:
                if not torch.cuda.is_available():
                    return _result(
                        "gpu",
                        HealthStatus.HEALTHY,
                        "GPU not available (CPU-only mode)",
                        {"cuda_available": False},
                    )

                device_total = torch.cuda.device_count()
                per_device = {}
                peak_pct = 0

                for idx in range(device_total):
                    props = torch.cuda.get_device_properties(idx)
                    stats = torch.cuda.memory_stats(idx)

                    in_use = stats.get("allocated_bytes.all.current", 0)
                    capacity = props.total_memory
                    share = (in_use / capacity) * 100
                    peak_pct = max(peak_pct, share)

                    per_device[f"gpu_{idx}"] = {
                        "name": props.name,
                        "memory_allocated_gb": in_use / gib,
                        "memory_total_gb": capacity / gib,
                        "memory_percent": share,
                    }

                status, message = _classify(
                    peak_pct,
                    85,
                    95,
                    critical="Critical GPU memory usage: {:.1f}%",
                    warning="High GPU memory usage: {:.1f}%",
                    normal="GPU status normal: {:.1f}% memory used",
                )
                details = {
                    "cuda_available": True,
                    "device_count": device_total,
                    "max_memory_percent": peak_pct,
                    "devices": per_device,
                }
                return _result("gpu", status, message, details)
            except Exception as e:
                return _failure(
                    "gpu",
                    f"Failed to check GPU: {e}",
                    e,
                    status=HealthStatus.WARNING,
                )

        def check_pytorch() -> ComponentHealth:
            """Run a small matrix multiply and report PyTorch/CUDA details."""
            try:
                # Smoke test: a 10x10 random matrix times its transpose.
                probe = torch.randn(10, 10)
                torch.mm(probe, probe.t())

                has_cuda = torch.cuda.is_available()
                details = {
                    "version": torch.__version__,
                    "cuda_available": has_cuda,
                    "cuda_version": torch.version.cuda if has_cuda else None,
                    "device_count": torch.cuda.device_count() if has_cuda else 0,
                }

                # The key is only added when MPS (Apple Silicon) is usable.
                if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                    details["mps_available"] = True

                return _result(
                    "pytorch",
                    HealthStatus.HEALTHY,
                    "PyTorch working correctly",
                    details,
                )
            except Exception as e:
                return _failure("pytorch", f"PyTorch check failed: {e}", e)

        builtin_checks = (
            ("memory", check_memory),
            ("cpu", check_cpu),
            ("disk", check_disk),
            ("gpu", check_gpu),
            ("pytorch", check_pytorch),
        )
        for check_name, check_fn in builtin_checks:
            self.register_check(check_name, check_fn)
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
class InvarLockHealthChecker(HealthChecker):
    """Health checker that adds adapter, guard and dependency checks.

    The checks inherited from ``HealthChecker`` are registered first, then
    the InvarLock-specific ones.
    """

    def __init__(self):
        """Register the inherited checks followed by the InvarLock checks."""
        super().__init__()
        self._register_invarlock_checks()

    def _register_invarlock_checks(self):
        """Define and register the adapters, guards and dependencies checks."""

        def _result(component, status, message, details) -> ComponentHealth:
            """Build a result stamped with the current time."""
            return ComponentHealth(
                name=component,
                status=status,
                message=message,
                details=details,
                timestamp=time.time(),
            )

        def _failure(component, message, exc) -> ComponentHealth:
            """Build the CRITICAL result returned when a check itself raised."""
            return _result(
                component, HealthStatus.CRITICAL, message, {"error": str(exc)}
            )

        def _try_each(builders):
            """Call every zero-argument builder and sort names by outcome.

            Returns ``(working, broken)``: the names whose builder returned
            normally, and ``{"name", "error"}`` records for those that raised.
            """
            working = []
            broken = []
            for label, build in builders.items():
                try:
                    build()
                except Exception as e:
                    broken.append({"name": label, "error": str(e)})
                else:
                    working.append(label)
            return working, broken

        def check_adapters() -> ComponentHealth:
            """Verify that each known adapter class can be instantiated."""
            try:
                from invarlock.adapters import (
                    HF_BERT_Adapter,
                    HF_GPT2_Adapter,
                    HF_LLaMA_Adapter,
                )

                adapters = {
                    "hf_gpt2": HF_GPT2_Adapter,
                    "hf_llama": HF_LLaMA_Adapter,
                    "hf_bert": HF_BERT_Adapter,
                }
                working, broken = _try_each(adapters)

                if not working:
                    status = HealthStatus.CRITICAL
                    message = "No adapters available"
                elif broken:
                    failed_names = [entry["name"] for entry in broken]
                    status = HealthStatus.WARNING
                    message = f"Some adapters failed: {failed_names}"
                else:
                    status = HealthStatus.HEALTHY
                    message = f"All adapters available: {working}"

                details = {
                    "available": working,
                    "failed": broken,
                    "total_adapters": len(adapters),
                }
                return _result("adapters", status, message, details)
            except Exception as e:
                return _failure("adapters", f"Failed to check adapters: {e}", e)

        def check_guards() -> ComponentHealth:
            """Verify that each guard class can be instantiated."""
            try:
                from invarlock.guards import (
                    InvariantsGuard,
                    RMTGuard,
                    SpectralGuard,
                    VarianceGuard,
                )

                def _build_variance_guard():
                    # Unlike the other guards, the variance guard is built
                    # with a policy argument. The import stays inside the
                    # builder so an import failure is recorded against the
                    # variance guard only.
                    from invarlock.guards.policies import get_variance_policy

                    return VarianceGuard(get_variance_policy("balanced"))

                guards = {
                    "spectral": SpectralGuard,
                    "rmt": RMTGuard,
                    "invariants": InvariantsGuard,
                    "variance": _build_variance_guard,
                }
                working, broken = _try_each(guards)

                if not working:
                    status = HealthStatus.CRITICAL
                    message = "No guards available"
                elif broken:
                    failed_names = [entry["name"] for entry in broken]
                    status = HealthStatus.WARNING
                    message = f"Some guards failed: {failed_names}"
                else:
                    status = HealthStatus.HEALTHY
                    message = f"All guards available: {working}"

                details = {
                    "available": working,
                    "failed": broken,
                    "total_guards": len(guards),
                }
                return _result("guards", status, message, details)
            except Exception as e:
                return _failure("guards", f"Failed to check guards: {e}", e)

        def check_dependencies() -> ComponentHealth:
            """Verify that torch, transformers, numpy and psutil import.

            A missing ``torch`` is CRITICAL; any other missing module is
            reported as WARNING.
            """
            try:
                # Display name -> importable module name.
                modules = {
                    "torch": "torch",
                    "transformers": "transformers",
                    "numpy": "numpy",
                    "psutil": "psutil",
                }

                present = []
                absent = []
                for label, module_name in modules.items():
                    try:
                        __import__(module_name)
                    except ImportError:
                        absent.append(label)
                    else:
                        present.append(label)

                if not absent:
                    status = HealthStatus.HEALTHY
                    message = "All dependencies available"
                elif "torch" in absent:
                    status = HealthStatus.CRITICAL
                    message = f"Critical dependencies missing: {absent}"
                else:
                    status = HealthStatus.WARNING
                    message = f"Optional dependencies missing: {absent}"

                details = {
                    "available": present,
                    "missing": absent,
                    "total_checked": len(modules),
                }
                return _result("dependencies", status, message, details)
            except Exception as e:
                return _failure(
                    "dependencies", f"Failed to check dependencies: {e}", e
                )

        invarlock_checks = (
            ("adapters", check_adapters),
            ("guards", check_guards),
            ("dependencies", check_dependencies),
        )
        for check_name, check_fn in invarlock_checks:
            self.register_check(check_name, check_fn)
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def create_health_endpoint():
    """Build the pieces needed to expose health status over HTTP.

    A single ``InvarLockHealthChecker`` is created and shared by every
    request handled by the returned handler class.

    Returns:
        A ``(HTTPServer, HealthHandler)`` tuple; the caller instantiates the
        server with the handler, e.g. ``HTTPServer(addr, HealthHandler)``.
        ``(None, None)`` is returned if the required imports fail.

    The handler answers ``GET /health`` with the JSON health summary, using
    status 200 when the overall status is "healthy" or "warning" and 503
    otherwise. Every other path gets a 404.
    """
    try:
        import json
        from http.server import BaseHTTPRequestHandler, HTTPServer

        health_checker = InvarLockHealthChecker()

        class HealthHandler(BaseHTTPRequestHandler):
            """Request handler serving the shared checker's summary."""

            def do_GET(self):
                """Serve ``/health``; reply 404 for anything else."""
                if self.path != "/health":
                    self.send_response(404)
                    self.end_headers()
                    return

                # get_summary() only reads cached results. Without running
                # the checks first the cache is empty, the overall status is
                # always "unknown" and every request would be answered 503.
                # NOTE: the built-in CPU check samples with interval=1, so
                # each request takes roughly a second or more.
                health_checker.check_all()
                health_summary = health_checker.get_summary()

                # Serialize before sending any header so a serialization
                # error cannot leave a half-written 200/503 response.
                body = json.dumps(health_summary, indent=2).encode()

                # Warnings still count as "up"; everything else is 503.
                serving = health_summary["overall_status"] in ("healthy", "warning")
                self.send_response(200 if serving else 503)
                self.send_header("Content-type", "application/json")
                self.end_headers()
                self.wfile.write(body)

            def log_message(self, format, *args):
                """Suppress the default per-request logging."""
                pass

        return HTTPServer, HealthHandler
    except ImportError:
        return None, None
|