invarlock 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. invarlock/__init__.py +33 -0
  2. invarlock/__main__.py +10 -0
  3. invarlock/_data/runtime/profiles/ci_cpu.yaml +15 -0
  4. invarlock/_data/runtime/profiles/release.yaml +23 -0
  5. invarlock/_data/runtime/tiers.yaml +76 -0
  6. invarlock/adapters/__init__.py +102 -0
  7. invarlock/adapters/_capabilities.py +45 -0
  8. invarlock/adapters/auto.py +99 -0
  9. invarlock/adapters/base.py +530 -0
  10. invarlock/adapters/base_types.py +85 -0
  11. invarlock/adapters/hf_bert.py +852 -0
  12. invarlock/adapters/hf_gpt2.py +403 -0
  13. invarlock/adapters/hf_llama.py +485 -0
  14. invarlock/adapters/hf_mixin.py +383 -0
  15. invarlock/adapters/hf_onnx.py +112 -0
  16. invarlock/adapters/hf_t5.py +137 -0
  17. invarlock/adapters/py.typed +1 -0
  18. invarlock/assurance/__init__.py +43 -0
  19. invarlock/cli/__init__.py +8 -0
  20. invarlock/cli/__main__.py +8 -0
  21. invarlock/cli/_evidence.py +25 -0
  22. invarlock/cli/_json.py +75 -0
  23. invarlock/cli/adapter_auto.py +162 -0
  24. invarlock/cli/app.py +287 -0
  25. invarlock/cli/commands/__init__.py +26 -0
  26. invarlock/cli/commands/certify.py +403 -0
  27. invarlock/cli/commands/doctor.py +1358 -0
  28. invarlock/cli/commands/explain_gates.py +151 -0
  29. invarlock/cli/commands/export_html.py +100 -0
  30. invarlock/cli/commands/plugins.py +1331 -0
  31. invarlock/cli/commands/report.py +354 -0
  32. invarlock/cli/commands/run.py +4146 -0
  33. invarlock/cli/commands/verify.py +1040 -0
  34. invarlock/cli/config.py +396 -0
  35. invarlock/cli/constants.py +68 -0
  36. invarlock/cli/device.py +92 -0
  37. invarlock/cli/doctor_helpers.py +74 -0
  38. invarlock/cli/errors.py +6 -0
  39. invarlock/cli/overhead_utils.py +60 -0
  40. invarlock/cli/provenance.py +66 -0
  41. invarlock/cli/utils.py +41 -0
  42. invarlock/config.py +56 -0
  43. invarlock/core/__init__.py +62 -0
  44. invarlock/core/abi.py +15 -0
  45. invarlock/core/api.py +274 -0
  46. invarlock/core/auto_tuning.py +317 -0
  47. invarlock/core/bootstrap.py +226 -0
  48. invarlock/core/checkpoint.py +221 -0
  49. invarlock/core/contracts.py +73 -0
  50. invarlock/core/error_utils.py +64 -0
  51. invarlock/core/events.py +298 -0
  52. invarlock/core/exceptions.py +95 -0
  53. invarlock/core/registry.py +481 -0
  54. invarlock/core/retry.py +146 -0
  55. invarlock/core/runner.py +2041 -0
  56. invarlock/core/types.py +154 -0
  57. invarlock/edits/__init__.py +12 -0
  58. invarlock/edits/_edit_utils.py +249 -0
  59. invarlock/edits/_external_utils.py +268 -0
  60. invarlock/edits/noop.py +47 -0
  61. invarlock/edits/py.typed +1 -0
  62. invarlock/edits/quant_rtn.py +801 -0
  63. invarlock/edits/registry.py +166 -0
  64. invarlock/eval/__init__.py +23 -0
  65. invarlock/eval/bench.py +1207 -0
  66. invarlock/eval/bootstrap.py +50 -0
  67. invarlock/eval/data.py +2052 -0
  68. invarlock/eval/metrics.py +2167 -0
  69. invarlock/eval/primary_metric.py +767 -0
  70. invarlock/eval/probes/__init__.py +24 -0
  71. invarlock/eval/probes/fft.py +139 -0
  72. invarlock/eval/probes/mi.py +213 -0
  73. invarlock/eval/probes/post_attention.py +323 -0
  74. invarlock/eval/providers/base.py +67 -0
  75. invarlock/eval/providers/seq2seq.py +111 -0
  76. invarlock/eval/providers/text_lm.py +113 -0
  77. invarlock/eval/providers/vision_text.py +93 -0
  78. invarlock/eval/py.typed +1 -0
  79. invarlock/guards/__init__.py +18 -0
  80. invarlock/guards/_contracts.py +9 -0
  81. invarlock/guards/invariants.py +640 -0
  82. invarlock/guards/policies.py +805 -0
  83. invarlock/guards/py.typed +1 -0
  84. invarlock/guards/rmt.py +2097 -0
  85. invarlock/guards/spectral.py +1419 -0
  86. invarlock/guards/tier_config.py +354 -0
  87. invarlock/guards/variance.py +3298 -0
  88. invarlock/guards_ref/__init__.py +15 -0
  89. invarlock/guards_ref/rmt_ref.py +40 -0
  90. invarlock/guards_ref/spectral_ref.py +135 -0
  91. invarlock/guards_ref/variance_ref.py +60 -0
  92. invarlock/model_profile.py +353 -0
  93. invarlock/model_utils.py +221 -0
  94. invarlock/observability/__init__.py +10 -0
  95. invarlock/observability/alerting.py +535 -0
  96. invarlock/observability/core.py +546 -0
  97. invarlock/observability/exporters.py +565 -0
  98. invarlock/observability/health.py +588 -0
  99. invarlock/observability/metrics.py +457 -0
  100. invarlock/observability/py.typed +1 -0
  101. invarlock/observability/utils.py +553 -0
  102. invarlock/plugins/__init__.py +12 -0
  103. invarlock/plugins/hello_guard.py +33 -0
  104. invarlock/plugins/hf_awq_adapter.py +82 -0
  105. invarlock/plugins/hf_bnb_adapter.py +79 -0
  106. invarlock/plugins/hf_gptq_adapter.py +78 -0
  107. invarlock/plugins/py.typed +1 -0
  108. invarlock/py.typed +1 -0
  109. invarlock/reporting/__init__.py +7 -0
  110. invarlock/reporting/certificate.py +3221 -0
  111. invarlock/reporting/certificate_schema.py +244 -0
  112. invarlock/reporting/dataset_hashing.py +215 -0
  113. invarlock/reporting/guards_analysis.py +948 -0
  114. invarlock/reporting/html.py +32 -0
  115. invarlock/reporting/normalizer.py +235 -0
  116. invarlock/reporting/policy_utils.py +517 -0
  117. invarlock/reporting/primary_metric_utils.py +265 -0
  118. invarlock/reporting/render.py +1442 -0
  119. invarlock/reporting/report.py +903 -0
  120. invarlock/reporting/report_types.py +278 -0
  121. invarlock/reporting/utils.py +175 -0
  122. invarlock/reporting/validate.py +631 -0
  123. invarlock/security.py +176 -0
  124. invarlock/sparsity_utils.py +323 -0
  125. invarlock/utils/__init__.py +150 -0
  126. invarlock/utils/digest.py +45 -0
  127. invarlock-0.2.0.dist-info/METADATA +586 -0
  128. invarlock-0.2.0.dist-info/RECORD +132 -0
  129. invarlock-0.2.0.dist-info/WHEEL +5 -0
  130. invarlock-0.2.0.dist-info/entry_points.txt +20 -0
  131. invarlock-0.2.0.dist-info/licenses/LICENSE +201 -0
  132. invarlock-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,546 @@
1
+ """
2
+ Core monitoring and telemetry infrastructure.
3
+ """
4
+
5
+ import logging
6
+ import threading
7
+ import time
8
+ from collections import defaultdict, deque
9
+ from dataclasses import dataclass, field
10
+ from typing import Any
11
+
12
+ import psutil
13
+ import torch
14
+
15
+ from .alerting import AlertManager, AlertSeverity
16
+ from .health import HealthChecker
17
+ from .metrics import MetricsRegistry
18
+
19
+
20
@dataclass
class MonitoringConfig:
    """Settings bundle for the monitoring subsystem.

    Every field has a default, so ``MonitoringConfig()`` is a usable
    configuration. Field order is part of the generated ``__init__``
    signature and must not be rearranged.
    """

    # -- Loop periods, in seconds --------------------------------------
    metrics_interval: float = 10.0
    health_check_interval: float = 30.0
    resource_check_interval: float = 5.0

    # -- Data retention (not read anywhere in this module) --------------
    metrics_retention_hours: int = 24
    max_events: int = 10_000

    # -- Alerting -------------------------------------------------------
    # When False, no default alert rules are installed.
    enable_alerting: bool = True
    # Fresh list per instance (never shared between configs).
    alert_channels: list[str] = field(default_factory=list)

    # -- Export settings ------------------------------------------------
    prometheus_enabled: bool = False
    prometheus_port: int = 9090
    json_export_enabled: bool = True
    json_export_path: str = "./monitoring"

    # -- Resource thresholds, in percent --------------------------------
    cpu_threshold: float = 80.0
    memory_threshold: float = 85.0
    gpu_memory_threshold: float = 90.0

    # -- Performance monitoring -----------------------------------------
    # Fresh list per instance; values are percentile ranks.
    latency_percentiles: list[float] = field(
        default_factory=lambda: [50, 90, 95, 99]
    )
    # Threshold (seconds) used for the default "slow_operations" alert.
    slow_request_threshold: float = 30.0
51
+
52
+
53
class MonitoringManager:
    """Central monitoring manager for InvarLock operations.

    Owns the metrics registry, health checker and alert manager together
    with the performance and resource monitors, and drives them from three
    daemon threads (metrics collection, health checks, resource checks)
    between ``start()`` and ``stop()``.
    """

    def __init__(self, config: "MonitoringConfig | None" = None):
        """Build all components and register the default metrics and alerts.

        Args:
            config: Monitoring settings; a default ``MonitoringConfig`` is
                created when omitted.
        """
        self.config = config or MonitoringConfig()
        self.logger = logging.getLogger(__name__)

        # Core components
        self.metrics = MetricsRegistry()
        self.health_checker = HealthChecker()
        self.alert_manager = AlertManager()

        # Monitoring threads
        self._monitoring_threads: list = []
        self._stop_event = threading.Event()
        # Wall-clock time of the most recent start(); None until started.
        self._start_time: float | None = None

        # Performance tracking
        self.performance_monitor = PerformanceMonitor(self.metrics)
        self.resource_monitor = ResourceMonitor(self.metrics, self.config)

        # Initialize default metrics
        self._setup_default_metrics()

        # Setup alerting rules
        self._setup_default_alerts()

    def start(self):
        """Start all monitoring components.

        Spawns the three daemon worker threads and records the start time
        used by ``_get_uptime``. The call is ignored (with a warning) while
        worker threads from a previous ``start()`` are still alive, so
        workers are never duplicated. The stop flag is cleared first so the
        manager can be restarted after ``stop()``.
        """
        if any(thread.is_alive() for thread in self._monitoring_threads):
            self.logger.warning(
                "Monitoring threads are still running; start() ignored"
            )
            return

        self.logger.info("Starting InvarLock monitoring system")

        # Without this a restart after stop() would launch loops that see
        # the stop flag already set and exit immediately.
        self._stop_event.clear()
        self._monitoring_threads = []
        self._start_time = time.time()

        workers = (
            (self._metrics_collection_loop, "MetricsCollector"),
            (self._health_check_loop, "HealthChecker"),
            (self._resource_monitoring_loop, "ResourceMonitor"),
        )
        for target, name in workers:
            thread = threading.Thread(target=target, name=name, daemon=True)
            thread.start()
            self._monitoring_threads.append(thread)

        self.logger.info("Monitoring system started successfully")

    def stop(self):
        """Stop all monitoring components.

        Sets the stop flag, waits up to 5 seconds for each worker thread,
        then performs a final metrics export.
        """
        self.logger.info("Stopping InvarLock monitoring system")

        self._stop_event.set()

        # Wait for threads to finish
        for thread in self._monitoring_threads:
            thread.join(timeout=5.0)

        # Export final metrics
        self._export_metrics()

        self.logger.info("Monitoring system stopped")

    def record_operation(self, operation: str, duration: float, **metadata):
        """Record an operation with timing and metadata.

        Delegates to the performance monitor.
        """
        self.performance_monitor.record_operation(operation, duration, **metadata)

    def record_error(self, error_type: str, error_msg: str, **context):
        """Record an error event.

        Increments ``invarlock.errors.total`` labelled with ``error_type``,
        logs the error with ``context`` attached to the log record, and
        hands the error to the alert manager.

        Args:
            error_type: Category used as the ``type`` label.
            error_msg: Human-readable error description.
            **context: Arbitrary extra fields. Keys that would clash with
                built-in ``LogRecord`` attributes are attached to the log
                record under a ``ctx_`` prefix; the alert manager receives
                the context unchanged.
        """
        self.metrics.get_counter("invarlock.errors.total").inc(
            labels={"type": error_type}
        )

        # Log error with context. The logging module raises KeyError when
        # ``extra`` overrides a LogRecord attribute (e.g. "name", "module"),
        # which previously aborted this method before the alert check.
        self.logger.error(
            f"Error recorded: {error_type} - {error_msg}",
            extra=self._safe_log_extra(context),
        )

        # Check if alert should be triggered
        self.alert_manager.check_error_alerts(error_type, error_msg, context)

    @staticmethod
    def _safe_log_extra(context: dict[str, Any]) -> dict[str, Any]:
        """Return ``context`` with LogRecord-reserved keys prefixed by ``ctx_``."""
        reserved = set(logging.makeLogRecord({}).__dict__) | {"message", "asctime"}
        return {
            (f"ctx_{key}" if key in reserved else key): value
            for key, value in context.items()
        }

    def get_status(self) -> dict[str, Any]:
        """Get current monitoring status.

        Returns:
            A dict with the keys ``monitoring_active``, ``metrics_count``,
            ``health_status``, ``active_alerts``, ``resource_usage``,
            ``performance_stats`` and ``uptime``. ``monitoring_active`` is
            True only between ``start()`` and ``stop()``.
        """
        started = getattr(self, "_start_time", None) is not None
        return {
            "monitoring_active": started and not self._stop_event.is_set(),
            "metrics_count": len(self.metrics._metrics),
            "health_status": self.health_checker.get_overall_status(),
            "active_alerts": self.alert_manager.get_active_alerts(),
            "resource_usage": self.resource_monitor.get_current_usage(),
            "performance_stats": self.performance_monitor.get_summary(),
            "uptime": self._get_uptime(),
        }

    def _setup_default_metrics(self):
        """Setup default InvarLock metrics."""
        # Operation counters
        self.metrics.register_counter(
            "invarlock.operations.total", "Total InvarLock operations"
        )
        self.metrics.register_counter("invarlock.errors.total", "Total errors")
        self.metrics.register_counter("invarlock.edits.applied", "Total edits applied")
        self.metrics.register_counter("invarlock.guards.triggered", "Guard triggers")

        # Performance metrics
        self.metrics.register_histogram(
            "invarlock.operation.duration", "Operation duration"
        )
        self.metrics.register_histogram(
            "invarlock.edit.duration", "Edit operation duration"
        )
        self.metrics.register_histogram(
            "invarlock.guard.duration", "Guard execution duration"
        )

        # Resource metrics
        self.metrics.register_gauge("invarlock.memory.usage", "Memory usage")
        self.metrics.register_gauge("invarlock.gpu.memory.usage", "GPU memory usage")
        self.metrics.register_gauge("invarlock.cpu.usage", "CPU usage")

        # Model metrics
        self.metrics.register_gauge(
            "invarlock.model.parameters", "Model parameter count"
        )
        self.metrics.register_gauge("invarlock.model.size_mb", "Model size in MB")
        self.metrics.register_counter("invarlock.model.loads", "Model loads")

    def _setup_default_alerts(self):
        """Setup default alerting rules.

        Does nothing when ``config.enable_alerting`` is False.
        """
        if not self.config.enable_alerting:
            return

        from .alerting import AlertRule

        # High error rate alert
        self.alert_manager.add_rule(
            AlertRule(
                name="high_error_rate",
                metric="invarlock.errors.total",
                threshold=10,
                window_minutes=5,
                severity=AlertSeverity.WARNING,
                message="High error rate detected",
            )
        )

        # Resource usage alerts
        self.alert_manager.add_rule(
            AlertRule(
                name="high_memory_usage",
                metric="invarlock.memory.usage",
                threshold=self.config.memory_threshold,
                severity=AlertSeverity.WARNING,
                message="High memory usage detected",
            )
        )

        # Performance alerts
        self.alert_manager.add_rule(
            AlertRule(
                name="slow_operations",
                metric="invarlock.operation.duration",
                threshold=self.config.slow_request_threshold,
                percentile=95,
                severity=AlertSeverity.WARNING,
                message="Slow operations detected",
            )
        )

    def _metrics_collection_loop(self):
        """Main metrics collection loop.

        Runs until the stop flag is set, sleeping ``metrics_interval``
        seconds between passes. Errors are logged and the loop continues.
        """
        while not self._stop_event.is_set():
            try:
                # Update resource metrics
                self.resource_monitor.update_metrics()

                # Update performance metrics
                self.performance_monitor.update_metrics()

                # Export metrics if needed
                if self.config.json_export_enabled:
                    self._export_metrics()

            except Exception as e:
                self.logger.error(f"Error in metrics collection: {e}")

            # Event.wait doubles as an interruptible sleep.
            self._stop_event.wait(self.config.metrics_interval)

    def _health_check_loop(self):
        """Health check monitoring loop.

        Runs until the stop flag is set, sleeping ``health_check_interval``
        seconds between passes. Errors are logged and the loop continues.
        """
        while not self._stop_event.is_set():
            try:
                # Run health checks
                health_status = self.health_checker.check_all()

                # Update health metrics
                # NOTE(review): "invarlock.health.status" is not registered
                # in _setup_default_metrics; this relies on get_gauge
                # accepting unregistered names - confirm.
                for component, status in health_status.items():
                    self.metrics.get_gauge("invarlock.health.status").set(
                        1 if status.healthy else 0, labels={"component": component}
                    )

                # Check for health-based alerts
                self.alert_manager.check_health_alerts(health_status)

            except Exception as e:
                self.logger.error(f"Error in health checking: {e}")

            self._stop_event.wait(self.config.health_check_interval)

    def _resource_monitoring_loop(self):
        """Resource monitoring loop.

        Runs until the stop flag is set, sleeping ``resource_check_interval``
        seconds between passes. Errors are logged and the loop continues.
        """
        while not self._stop_event.is_set():
            try:
                # Monitor resource usage
                usage = self.resource_monitor.collect_usage()

                # Check resource-based alerts
                self.alert_manager.check_resource_alerts(usage)

            except Exception as e:
                self.logger.error(f"Error in resource monitoring: {e}")

            self._stop_event.wait(self.config.resource_check_interval)

    def _export_metrics(self):
        """Export metrics to configured outputs.

        Only the JSON exporter is handled here, and only when
        ``config.json_export_enabled`` is set. Export failures are logged,
        never raised.
        """
        try:
            if self.config.json_export_enabled:
                from .exporters import JSONExporter

                exporter = JSONExporter(self.config.json_export_path)
                exporter.export(self.metrics.get_all_metrics())

        except Exception as e:
            self.logger.error(f"Error exporting metrics: {e}")

    def _get_uptime(self) -> float:
        """Get monitoring system uptime in seconds.

        Returns:
            Seconds elapsed since the most recent ``start()``, or ``0.0``
            if the manager has never been started.
        """
        started_at = getattr(self, "_start_time", None)
        if started_at is None:
            return 0.0
        return time.time() - started_at
297
+
298
+
299
class TelemetryCollector:
    """Collects telemetry data for InvarLock operations.

    Tracks in-flight operations by id and keeps the 1000 most recently
    completed operation records in ``operation_history``.
    """

    def __init__(self, monitoring_manager: "MonitoringManager"):
        """Attach the collector to a monitoring manager.

        Args:
            monitoring_manager: Manager whose metrics registry and
                ``record_operation`` receive the collected data.
        """
        self.monitoring = monitoring_manager
        self.logger = logging.getLogger(__name__)

        # Operation tracking
        self.active_operations: dict = {}
        self.operation_history: deque = deque(maxlen=1000)

    def start_operation(
        self, operation_id: str, operation_type: str, **metadata
    ) -> str:
        """Start tracking an operation.

        Args:
            operation_id: Caller-chosen identifier; an existing active
                entry with the same id is replaced.
            operation_type: Category used as the ``type`` metric label.
            **metadata: Extra fields stored with the operation.

        Returns:
            ``operation_id``, unchanged.
        """
        self.active_operations[operation_id] = {
            "id": operation_id,
            "type": operation_type,
            "start_time": time.time(),
            "metadata": metadata,
        }

        # Record operation start
        self.monitoring.metrics.get_counter("invarlock.operations.total").inc(
            labels={"type": operation_type, "status": "started"}
        )

        self.logger.info(f"Operation started: {operation_id} ({operation_type})")
        return operation_id

    def end_operation(
        self, operation_id: str, status: str = "success", **result_metadata
    ):
        """End tracking an operation.

        Moves the operation from ``active_operations`` to
        ``operation_history`` and forwards its duration and metadata to
        the monitoring manager. Unknown ids are logged and ignored.

        Args:
            operation_id: Identifier passed to ``start_operation``.
            status: Final status stored on the record.
            **result_metadata: Extra fields describing the outcome.
        """
        operation_data = self.active_operations.pop(operation_id, None)
        if operation_data is None:
            self.logger.warning(f"Unknown operation ID: {operation_id}")
            return

        end_time = time.time()
        duration = end_time - operation_data["start_time"]

        # Complete operation record
        self.operation_history.append(
            {
                **operation_data,
                "end_time": end_time,
                "duration": duration,
                "status": status,
                "result_metadata": result_metadata,
            }
        )

        # Record metrics. The keyword arguments are merged up front:
        # splatting both metadata dicts next to ``status=`` raised
        # TypeError whenever they shared a key.
        self.monitoring.record_operation(
            operation_data["type"],
            duration,
            **self._build_record_kwargs(
                status, operation_data["metadata"], result_metadata
            ),
        )

        self.logger.info(
            f"Operation completed: {operation_id} ({operation_data['type']}) "
            f"- {status} in {duration:.2f}s"
        )

    @staticmethod
    def _build_record_kwargs(
        status: str, start_metadata: dict, result_metadata: dict
    ) -> dict[str, Any]:
        """Merge metadata into collision-free kwargs for ``record_operation``.

        Result metadata overrides start metadata, the explicit ``status``
        overrides both, and keys named like the positional parameters of
        ``record_operation`` are renamed with a ``meta_`` prefix.
        """
        merged = {**start_metadata, **result_metadata}
        for reserved in ("operation", "duration"):
            if reserved in merged:
                merged[f"meta_{reserved}"] = merged.pop(reserved)
        merged["status"] = status
        return merged

    def get_operation_stats(self) -> dict[str, Any]:
        """Get operation statistics.

        Returns:
            An empty dict when no operation has completed yet; otherwise
            totals, the mean duration, per-status and per-type counts and
            the fraction of records whose status is ``"success"``, all
            computed over ``operation_history``.
        """
        operations = list(self.operation_history)
        if not operations:
            return {}

        total_ops = len(operations)
        status_counts: dict = defaultdict(int)
        type_counts: dict = defaultdict(int)
        for op in operations:
            status_counts[op["status"]] += 1
            type_counts[op["type"]] += 1

        return {
            "total_operations": total_ops,
            "active_operations": len(self.active_operations),
            "average_duration": sum(op["duration"] for op in operations) / total_ops,
            "status_distribution": dict(status_counts),
            "type_distribution": dict(type_counts),
            "success_rate": status_counts.get("success", 0) / total_ops,
        }
399
+
400
+
401
class PerformanceMonitor:
    """Monitors InvarLock performance metrics.

    Keeps up to ``_MAX_SAMPLES`` recent durations per operation name and
    derives count/mean/min/max and percentile statistics from them.
    """

    # Number of most recent durations retained per operation.
    _MAX_SAMPLES = 1000

    def __init__(self, metrics_registry: "MetricsRegistry"):
        """Create a monitor that reports into ``metrics_registry``."""
        self.metrics = metrics_registry
        self.operation_times: dict = defaultdict(list)
        self.performance_data: dict = defaultdict(dict)

    def record_operation(self, operation: str, duration: float, **metadata):
        """Record an operation's performance.

        Args:
            operation: Operation name, also used as the ``operation`` label.
            duration: Elapsed time of the operation.
            **metadata: Extra fields merged into ``performance_data`` for
                this operation (later values overwrite earlier ones).
        """
        # Store timing data, keeping only the most recent measurements.
        samples = self.operation_times[operation]
        samples.append(duration)
        overflow = len(samples) - self._MAX_SAMPLES
        if overflow > 0:
            del samples[:overflow]

        # Update histogram metric
        self.metrics.get_histogram("invarlock.operation.duration").observe(
            duration, labels={"operation": operation}
        )

        # Store metadata
        if metadata:
            self.performance_data[operation].update(metadata)

    def get_operation_stats(self, operation: str) -> dict[str, float]:
        """Get statistics for a specific operation.

        Returns:
            An empty dict when nothing was recorded for ``operation``;
            otherwise ``count``, ``mean``, ``min``, ``max`` and the
            ``p50``/``p90``/``p95``/``p99`` values, where percentile ``q``
            is the element at index ``int(count * q)`` of the sorted
            samples.
        """
        # Work on a private copy so that count, sum and percentiles agree
        # even if another thread appends while statistics are computed.
        samples = list(self.operation_times.get(operation, ()))
        if not samples:
            return {}

        ordered = sorted(samples)
        count = len(samples)

        def rank(fraction: float) -> float:
            # Clamp defensively; int(count * fraction) is < count for
            # every fraction below 1.
            return ordered[min(int(count * fraction), count - 1)]

        return {
            "count": count,
            "mean": sum(samples) / count,
            "min": min(samples),
            "max": max(samples),
            "p50": rank(0.5),
            "p90": rank(0.9),
            "p95": rank(0.95),
            "p99": rank(0.99),
        }

    def get_summary(self) -> dict[str, Any]:
        """Get performance summary for all operations.

        Returns:
            Mapping of operation name to its ``get_operation_stats`` dict.
        """
        # Iterate over a snapshot of the keys: record_operation may insert
        # a new operation from another thread, and iterating the live dict
        # would then raise "dictionary changed size during iteration".
        return {
            operation: self.get_operation_stats(operation)
            for operation in list(self.operation_times)
        }

    def update_metrics(self):
        """Update performance metrics.

        Publishes the p95 and mean duration of every operation that has
        samples to the corresponding gauges, labelled by operation.
        """
        for operation, stats in self.get_summary().items():
            if not stats:
                continue
            labels = {"operation": operation}
            self.metrics.get_gauge("invarlock.operation.p95_duration").set(
                stats["p95"], labels=labels
            )
            self.metrics.get_gauge("invarlock.operation.mean_duration").set(
                stats["mean"], labels={"operation": operation}
            )
466
+
467
+
468
class ResourceMonitor:
    """Monitors system resource usage.

    Reads CPU, memory and disk figures through ``psutil`` and per-device
    GPU memory figures through ``torch.cuda``.
    """

    _BYTES_PER_GIB = 1024**3

    def __init__(self, metrics_registry: "MetricsRegistry", config: "MonitoringConfig"):
        """Create a monitor.

        Args:
            metrics_registry: Registry whose gauges receive the readings.
            config: Supplies the CPU, memory and GPU-memory thresholds.
        """
        self.metrics = metrics_registry
        self.config = config
        self.logger = logging.getLogger(__name__)

    def collect_usage(self) -> dict[str, float]:
        """Collect current resource usage.

        Each probe (CPU, memory, GPU, disk) runs in isolation: a failing
        probe is logged and skipped, and the remaining probes still
        contribute their readings.

        Note:
            The CPU probe samples over a 1 second interval, so this call
            blocks for about that long.

        Returns:
            Mapping of reading name to value; readings from failed probes
            are absent.
        """
        usage: dict[str, float] = {}
        probes = (
            self._probe_cpu,
            self._probe_memory,
            self._probe_gpu,
            self._probe_disk,
        )
        for probe in probes:
            try:
                probe(usage)
            except Exception as e:
                self.logger.error(f"Error collecting resource usage: {e}")
        return usage

    def _probe_cpu(self, usage: dict[str, float]) -> None:
        """Add the CPU utilisation percentage to ``usage``."""
        usage["cpu_percent"] = psutil.cpu_percent(interval=1)

    def _probe_memory(self, usage: dict[str, float]) -> None:
        """Add system memory percentage and available/used GiB to ``usage``."""
        memory = psutil.virtual_memory()
        usage["memory_percent"] = memory.percent
        usage["memory_available_gb"] = memory.available / self._BYTES_PER_GIB
        usage["memory_used_gb"] = memory.used / self._BYTES_PER_GIB

    def _probe_gpu(self, usage: dict[str, float]) -> None:
        """Add per-device CUDA memory readings to ``usage`` (if CUDA is available)."""
        if not torch.cuda.is_available():
            return
        for i in range(torch.cuda.device_count()):
            gpu_memory = torch.cuda.memory_stats(i)
            allocated = gpu_memory.get("allocated_bytes.all.current", 0)
            reserved = gpu_memory.get("reserved_bytes.all.current", 0)

            usage[f"gpu_{i}_memory_allocated_gb"] = allocated / self._BYTES_PER_GIB
            usage[f"gpu_{i}_memory_reserved_gb"] = reserved / self._BYTES_PER_GIB

            # Allocated bytes as a percentage of the device's total memory.
            total_memory = torch.cuda.get_device_properties(i).total_memory
            usage[f"gpu_{i}_memory_percent"] = (allocated / total_memory) * 100

    def _probe_disk(self, usage: dict[str, float]) -> None:
        """Add usage percentage and free GiB of the ``/`` filesystem to ``usage``."""
        disk = psutil.disk_usage("/")
        usage["disk_percent"] = (disk.used / disk.total) * 100
        usage["disk_free_gb"] = disk.free / self._BYTES_PER_GIB

    def update_metrics(self):
        """Update resource metrics.

        Writes every collected reading to the gauge named
        ``invarlock.resource.<reading name>``.
        """
        for metric_name, value in self.collect_usage().items():
            self.metrics.get_gauge(f"invarlock.resource.{metric_name}").set(value)

    def get_current_usage(self) -> dict[str, float]:
        """Get current resource usage (same as ``collect_usage``)."""
        return self.collect_usage()

    def check_thresholds(self) -> list[str]:
        """Check if any resource usage exceeds thresholds.

        Returns:
            Human-readable warnings for CPU, system memory and each GPU
            whose memory percentage is above its configured threshold;
            empty when nothing is exceeded.
        """
        usage = self.collect_usage()
        warnings = []

        if usage.get("cpu_percent", 0) > self.config.cpu_threshold:
            warnings.append(f"High CPU usage: {usage['cpu_percent']:.1f}%")

        if usage.get("memory_percent", 0) > self.config.memory_threshold:
            warnings.append(f"High memory usage: {usage['memory_percent']:.1f}%")

        # Check GPU memory. The leading underscore in the suffix keeps the
        # system-wide "memory_percent" reading out of this check.
        for key, value in usage.items():
            if (
                key.endswith("_memory_percent")
                and value > self.config.gpu_memory_threshold
            ):
                warnings.append(f"High GPU memory usage: {key} = {value:.1f}%")

        return warnings