invarlock 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- invarlock/__init__.py +33 -0
- invarlock/__main__.py +10 -0
- invarlock/_data/runtime/profiles/ci_cpu.yaml +15 -0
- invarlock/_data/runtime/profiles/release.yaml +23 -0
- invarlock/_data/runtime/tiers.yaml +76 -0
- invarlock/adapters/__init__.py +102 -0
- invarlock/adapters/_capabilities.py +45 -0
- invarlock/adapters/auto.py +99 -0
- invarlock/adapters/base.py +530 -0
- invarlock/adapters/base_types.py +85 -0
- invarlock/adapters/hf_bert.py +852 -0
- invarlock/adapters/hf_gpt2.py +403 -0
- invarlock/adapters/hf_llama.py +485 -0
- invarlock/adapters/hf_mixin.py +383 -0
- invarlock/adapters/hf_onnx.py +112 -0
- invarlock/adapters/hf_t5.py +137 -0
- invarlock/adapters/py.typed +1 -0
- invarlock/assurance/__init__.py +43 -0
- invarlock/cli/__init__.py +8 -0
- invarlock/cli/__main__.py +8 -0
- invarlock/cli/_evidence.py +25 -0
- invarlock/cli/_json.py +75 -0
- invarlock/cli/adapter_auto.py +162 -0
- invarlock/cli/app.py +287 -0
- invarlock/cli/commands/__init__.py +26 -0
- invarlock/cli/commands/certify.py +403 -0
- invarlock/cli/commands/doctor.py +1358 -0
- invarlock/cli/commands/explain_gates.py +151 -0
- invarlock/cli/commands/export_html.py +100 -0
- invarlock/cli/commands/plugins.py +1331 -0
- invarlock/cli/commands/report.py +354 -0
- invarlock/cli/commands/run.py +4146 -0
- invarlock/cli/commands/verify.py +1040 -0
- invarlock/cli/config.py +396 -0
- invarlock/cli/constants.py +68 -0
- invarlock/cli/device.py +92 -0
- invarlock/cli/doctor_helpers.py +74 -0
- invarlock/cli/errors.py +6 -0
- invarlock/cli/overhead_utils.py +60 -0
- invarlock/cli/provenance.py +66 -0
- invarlock/cli/utils.py +41 -0
- invarlock/config.py +56 -0
- invarlock/core/__init__.py +62 -0
- invarlock/core/abi.py +15 -0
- invarlock/core/api.py +274 -0
- invarlock/core/auto_tuning.py +317 -0
- invarlock/core/bootstrap.py +226 -0
- invarlock/core/checkpoint.py +221 -0
- invarlock/core/contracts.py +73 -0
- invarlock/core/error_utils.py +64 -0
- invarlock/core/events.py +298 -0
- invarlock/core/exceptions.py +95 -0
- invarlock/core/registry.py +481 -0
- invarlock/core/retry.py +146 -0
- invarlock/core/runner.py +2041 -0
- invarlock/core/types.py +154 -0
- invarlock/edits/__init__.py +12 -0
- invarlock/edits/_edit_utils.py +249 -0
- invarlock/edits/_external_utils.py +268 -0
- invarlock/edits/noop.py +47 -0
- invarlock/edits/py.typed +1 -0
- invarlock/edits/quant_rtn.py +801 -0
- invarlock/edits/registry.py +166 -0
- invarlock/eval/__init__.py +23 -0
- invarlock/eval/bench.py +1207 -0
- invarlock/eval/bootstrap.py +50 -0
- invarlock/eval/data.py +2052 -0
- invarlock/eval/metrics.py +2167 -0
- invarlock/eval/primary_metric.py +767 -0
- invarlock/eval/probes/__init__.py +24 -0
- invarlock/eval/probes/fft.py +139 -0
- invarlock/eval/probes/mi.py +213 -0
- invarlock/eval/probes/post_attention.py +323 -0
- invarlock/eval/providers/base.py +67 -0
- invarlock/eval/providers/seq2seq.py +111 -0
- invarlock/eval/providers/text_lm.py +113 -0
- invarlock/eval/providers/vision_text.py +93 -0
- invarlock/eval/py.typed +1 -0
- invarlock/guards/__init__.py +18 -0
- invarlock/guards/_contracts.py +9 -0
- invarlock/guards/invariants.py +640 -0
- invarlock/guards/policies.py +805 -0
- invarlock/guards/py.typed +1 -0
- invarlock/guards/rmt.py +2097 -0
- invarlock/guards/spectral.py +1419 -0
- invarlock/guards/tier_config.py +354 -0
- invarlock/guards/variance.py +3298 -0
- invarlock/guards_ref/__init__.py +15 -0
- invarlock/guards_ref/rmt_ref.py +40 -0
- invarlock/guards_ref/spectral_ref.py +135 -0
- invarlock/guards_ref/variance_ref.py +60 -0
- invarlock/model_profile.py +353 -0
- invarlock/model_utils.py +221 -0
- invarlock/observability/__init__.py +10 -0
- invarlock/observability/alerting.py +535 -0
- invarlock/observability/core.py +546 -0
- invarlock/observability/exporters.py +565 -0
- invarlock/observability/health.py +588 -0
- invarlock/observability/metrics.py +457 -0
- invarlock/observability/py.typed +1 -0
- invarlock/observability/utils.py +553 -0
- invarlock/plugins/__init__.py +12 -0
- invarlock/plugins/hello_guard.py +33 -0
- invarlock/plugins/hf_awq_adapter.py +82 -0
- invarlock/plugins/hf_bnb_adapter.py +79 -0
- invarlock/plugins/hf_gptq_adapter.py +78 -0
- invarlock/plugins/py.typed +1 -0
- invarlock/py.typed +1 -0
- invarlock/reporting/__init__.py +7 -0
- invarlock/reporting/certificate.py +3221 -0
- invarlock/reporting/certificate_schema.py +244 -0
- invarlock/reporting/dataset_hashing.py +215 -0
- invarlock/reporting/guards_analysis.py +948 -0
- invarlock/reporting/html.py +32 -0
- invarlock/reporting/normalizer.py +235 -0
- invarlock/reporting/policy_utils.py +517 -0
- invarlock/reporting/primary_metric_utils.py +265 -0
- invarlock/reporting/render.py +1442 -0
- invarlock/reporting/report.py +903 -0
- invarlock/reporting/report_types.py +278 -0
- invarlock/reporting/utils.py +175 -0
- invarlock/reporting/validate.py +631 -0
- invarlock/security.py +176 -0
- invarlock/sparsity_utils.py +323 -0
- invarlock/utils/__init__.py +150 -0
- invarlock/utils/digest.py +45 -0
- invarlock-0.2.0.dist-info/METADATA +586 -0
- invarlock-0.2.0.dist-info/RECORD +132 -0
- invarlock-0.2.0.dist-info/WHEEL +5 -0
- invarlock-0.2.0.dist-info/entry_points.txt +20 -0
- invarlock-0.2.0.dist-info/licenses/LICENSE +201 -0
- invarlock-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core monitoring and telemetry infrastructure.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import threading
|
|
7
|
+
import time
|
|
8
|
+
from collections import defaultdict, deque
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import psutil
|
|
13
|
+
import torch
|
|
14
|
+
|
|
15
|
+
from .alerting import AlertManager, AlertSeverity
|
|
16
|
+
from .health import HealthChecker
|
|
17
|
+
from .metrics import MetricsRegistry
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class MonitoringConfig:
    """Configuration for monitoring system.

    Plain value object consumed by MonitoringManager and ResourceMonitor.
    Intervals are expressed in seconds; the resource thresholds
    (cpu/memory/gpu) are percentages.
    """

    # Collection intervals (seconds) for the three background loops
    metrics_interval: float = 10.0  # seconds
    health_check_interval: float = 30.0  # seconds
    resource_check_interval: float = 5.0  # seconds

    # Data retention limits
    metrics_retention_hours: int = 24
    max_events: int = 10000

    # Alerting: master switch plus delivery channels
    # (channel semantics are defined by AlertManager — not visible here)
    enable_alerting: bool = True
    alert_channels: list[str] = field(default_factory=list)

    # Export settings: optional Prometheus endpoint and JSON file export
    prometheus_enabled: bool = False
    prometheus_port: int = 9090
    json_export_enabled: bool = True
    json_export_path: str = "./monitoring"

    # Resource monitoring alert thresholds (percent of capacity)
    cpu_threshold: float = 80.0  # percent
    memory_threshold: float = 85.0  # percent
    gpu_memory_threshold: float = 90.0  # percent

    # Performance monitoring: reported percentiles and the latency (seconds)
    # above which an operation is considered slow
    latency_percentiles: list[float] = field(default_factory=lambda: [50, 90, 95, 99])
    slow_request_threshold: float = 30.0  # seconds
|
|
223
|
+
message="Slow operations detected",
|
|
224
|
+
)
|
|
225
|
+
)
|
|
226
|
+
|
|
227
|
+
def _metrics_collection_loop(self):
|
|
228
|
+
"""Main metrics collection loop."""
|
|
229
|
+
while not self._stop_event.is_set():
|
|
230
|
+
try:
|
|
231
|
+
# Update resource metrics
|
|
232
|
+
self.resource_monitor.update_metrics()
|
|
233
|
+
|
|
234
|
+
# Update performance metrics
|
|
235
|
+
self.performance_monitor.update_metrics()
|
|
236
|
+
|
|
237
|
+
# Export metrics if needed
|
|
238
|
+
if self.config.json_export_enabled:
|
|
239
|
+
self._export_metrics()
|
|
240
|
+
|
|
241
|
+
except Exception as e:
|
|
242
|
+
self.logger.error(f"Error in metrics collection: {e}")
|
|
243
|
+
|
|
244
|
+
self._stop_event.wait(self.config.metrics_interval)
|
|
245
|
+
|
|
246
|
+
def _health_check_loop(self):
|
|
247
|
+
"""Health check monitoring loop."""
|
|
248
|
+
while not self._stop_event.is_set():
|
|
249
|
+
try:
|
|
250
|
+
# Run health checks
|
|
251
|
+
health_status = self.health_checker.check_all()
|
|
252
|
+
|
|
253
|
+
# Update health metrics
|
|
254
|
+
for component, status in health_status.items():
|
|
255
|
+
self.metrics.get_gauge("invarlock.health.status").set(
|
|
256
|
+
1 if status.healthy else 0, labels={"component": component}
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
# Check for health-based alerts
|
|
260
|
+
self.alert_manager.check_health_alerts(health_status)
|
|
261
|
+
|
|
262
|
+
except Exception as e:
|
|
263
|
+
self.logger.error(f"Error in health checking: {e}")
|
|
264
|
+
|
|
265
|
+
self._stop_event.wait(self.config.health_check_interval)
|
|
266
|
+
|
|
267
|
+
def _resource_monitoring_loop(self):
|
|
268
|
+
"""Resource monitoring loop."""
|
|
269
|
+
while not self._stop_event.is_set():
|
|
270
|
+
try:
|
|
271
|
+
# Monitor resource usage
|
|
272
|
+
usage = self.resource_monitor.collect_usage()
|
|
273
|
+
|
|
274
|
+
# Check resource-based alerts
|
|
275
|
+
self.alert_manager.check_resource_alerts(usage)
|
|
276
|
+
|
|
277
|
+
except Exception as e:
|
|
278
|
+
self.logger.error(f"Error in resource monitoring: {e}")
|
|
279
|
+
|
|
280
|
+
self._stop_event.wait(self.config.resource_check_interval)
|
|
281
|
+
|
|
282
|
+
def _export_metrics(self):
|
|
283
|
+
"""Export metrics to configured outputs."""
|
|
284
|
+
try:
|
|
285
|
+
if self.config.json_export_enabled:
|
|
286
|
+
from .exporters import JSONExporter
|
|
287
|
+
|
|
288
|
+
exporter = JSONExporter(self.config.json_export_path)
|
|
289
|
+
exporter.export(self.metrics.get_all_metrics())
|
|
290
|
+
|
|
291
|
+
except Exception as e:
|
|
292
|
+
self.logger.error(f"Error exporting metrics: {e}")
|
|
293
|
+
|
|
294
|
+
def _get_uptime(self) -> float:
|
|
295
|
+
"""Get monitoring system uptime in seconds."""
|
|
296
|
+
return time.time() - getattr(self, "_start_time", time.time())
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class TelemetryCollector:
    """Tracks the lifecycle of InvarLock operations.

    Operations are opened with ``start_operation`` and closed with
    ``end_operation``; completed records land in a bounded history that
    feeds ``get_operation_stats``.
    """

    def __init__(self, monitoring_manager: MonitoringManager):
        self.monitoring = monitoring_manager
        self.logger = logging.getLogger(__name__)

        # In-flight operations keyed by id, plus a bounded history of
        # completed records (deque evicts the oldest beyond 1000 entries).
        self.active_operations: dict = {}
        self.operation_history: deque = deque(maxlen=1000)

    def start_operation(
        self, operation_id: str, operation_type: str, **metadata
    ) -> str:
        """Begin tracking an operation; returns the same operation_id."""
        self.active_operations[operation_id] = {
            "id": operation_id,
            "type": operation_type,
            "start_time": time.time(),
            "metadata": metadata,
        }

        # Count the start in the shared operations counter.
        self.monitoring.metrics.get_counter("invarlock.operations.total").inc(
            labels={"type": operation_type, "status": "started"}
        )

        self.logger.info(f"Operation started: {operation_id} ({operation_type})")
        return operation_id

    def end_operation(
        self, operation_id: str, status: str = "success", **result_metadata
    ):
        """Close out a previously started operation and record its metrics."""
        if operation_id not in self.active_operations:
            self.logger.warning(f"Unknown operation ID: {operation_id}")
            return

        entry = self.active_operations.pop(operation_id)
        finished_at = time.time()
        elapsed = finished_at - entry["start_time"]

        # Append the completed record to the bounded history.
        self.operation_history.append(
            {
                **entry,
                "end_time": finished_at,
                "duration": elapsed,
                "status": status,
                "result_metadata": result_metadata,
            }
        )

        # Forward timing plus merged metadata to the monitoring manager.
        self.monitoring.record_operation(
            entry["type"],
            elapsed,
            status=status,
            **entry["metadata"],
            **result_metadata,
        )

        self.logger.info(
            f"Operation completed: {operation_id} ({entry['type']}) "
            f"- {status} in {elapsed:.2f}s"
        )

    def get_operation_stats(self) -> dict[str, Any]:
        """Summarize the completed-operation history."""
        if not self.operation_history:
            return {}

        records = list(self.operation_history)
        count = len(records)

        # Single pass: total duration plus status/type frequency tables.
        total_duration = 0.0
        by_status: dict = defaultdict(int)
        by_type: dict = defaultdict(int)
        for record in records:
            total_duration += record["duration"]
            by_status[record["status"]] += 1
            by_type[record["type"]] += 1

        return {
            "total_operations": count,
            "active_operations": len(self.active_operations),
            "average_duration": total_duration / count,
            "status_distribution": dict(by_status),
            "type_distribution": dict(by_type),
            "success_rate": by_status["success"] / count if count > 0 else 0,
        }
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
class PerformanceMonitor:
    """Aggregates per-operation timing samples into summary metrics."""

    def __init__(self, metrics_registry: MetricsRegistry):
        self.metrics = metrics_registry
        # Rolling timing samples per operation name (bounded to 1000 each).
        self.operation_times: dict = defaultdict(list)
        # Last-seen metadata per operation name.
        self.performance_data: dict = defaultdict(dict)

    def record_operation(self, operation: str, duration: float, **metadata):
        """Record one timed operation and feed the duration histogram."""
        samples = self.operation_times[operation]
        samples.append(duration)

        # Bound memory usage: keep only the most recent 1000 samples.
        if len(samples) > 1000:
            self.operation_times[operation] = samples[-1000:]

        # Update the shared histogram metric.
        self.metrics.get_histogram("invarlock.operation.duration").observe(
            duration, labels={"operation": operation}
        )

        # Remember the most recent metadata for this operation.
        if metadata:
            self.performance_data[operation].update(metadata)

    def get_operation_stats(self, operation: str) -> dict[str, float]:
        """Count/mean/min/max plus nearest-rank percentiles for one operation."""
        samples = self.operation_times.get(operation, [])
        if not samples:
            return {}

        ordered = sorted(samples)
        n = len(ordered)

        stats: dict[str, float] = {
            "count": n,
            "mean": sum(samples) / n,
            "min": ordered[0],
            "max": ordered[-1],
        }
        # Nearest-rank percentiles over the sorted samples.
        for label, fraction in (
            ("p50", 0.5),
            ("p90", 0.9),
            ("p95", 0.95),
            ("p99", 0.99),
        ):
            stats[label] = ordered[int(n * fraction)]
        return stats

    def get_summary(self) -> dict[str, Any]:
        """Per-operation statistics for every tracked operation."""
        return {
            operation: self.get_operation_stats(operation)
            for operation in self.operation_times
        }

    def update_metrics(self):
        """Push p95/mean gauges for every operation that has samples."""
        for operation, stats in self.get_summary().items():
            if not stats:
                continue
            self.metrics.get_gauge("invarlock.operation.p95_duration").set(
                stats["p95"], labels={"operation": operation}
            )
            self.metrics.get_gauge("invarlock.operation.mean_duration").set(
                stats["mean"], labels={"operation": operation}
            )
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
class ResourceMonitor:
    """Samples system resource usage (CPU, memory, GPU, disk)."""

    def __init__(self, metrics_registry: MetricsRegistry, config: MonitoringConfig):
        self.metrics = metrics_registry
        self.config = config
        self.logger = logging.getLogger(__name__)

    def collect_usage(self) -> dict[str, float]:
        """Snapshot current resource usage as a flat metric-name → value dict.

        Best-effort: any collection failure is logged and a partial (possibly
        empty) snapshot is returned.
        """
        snapshot: dict[str, float] = {}

        try:
            # CPU utilization.
            # NOTE(review): interval=1 blocks the caller for a full second
            # per call — worth confirming this is acceptable for every
            # call site (it is invoked from multiple loops and get_status).
            snapshot["cpu_percent"] = psutil.cpu_percent(interval=1)

            # System memory
            vm = psutil.virtual_memory()
            snapshot["memory_percent"] = vm.percent
            snapshot["memory_available_gb"] = vm.available / (1024**3)
            snapshot["memory_used_gb"] = vm.used / (1024**3)

            # Per-device GPU memory, when CUDA is present
            if torch.cuda.is_available():
                for idx in range(torch.cuda.device_count()):
                    cuda_stats = torch.cuda.memory_stats(idx)
                    allocated = cuda_stats.get("allocated_bytes.all.current", 0)
                    reserved = cuda_stats.get("reserved_bytes.all.current", 0)

                    snapshot[f"gpu_{idx}_memory_allocated_gb"] = allocated / (1024**3)
                    snapshot[f"gpu_{idx}_memory_reserved_gb"] = reserved / (1024**3)

                    # Allocated bytes as a fraction of the device's capacity
                    capacity = torch.cuda.get_device_properties(idx).total_memory
                    snapshot[f"gpu_{idx}_memory_percent"] = (
                        allocated / capacity
                    ) * 100

            # Root-filesystem disk usage
            disk = psutil.disk_usage("/")
            snapshot["disk_percent"] = (disk.used / disk.total) * 100
            snapshot["disk_free_gb"] = disk.free / (1024**3)

        except Exception as e:
            self.logger.error(f"Error collecting resource usage: {e}")

        return snapshot

    def update_metrics(self):
        """Mirror the current usage snapshot into per-resource gauges."""
        for name, value in self.collect_usage().items():
            self.metrics.get_gauge(f"invarlock.resource.{name}").set(value)

    def get_current_usage(self) -> dict[str, float]:
        """Alias for collect_usage(): take a fresh snapshot."""
        return self.collect_usage()

    def check_thresholds(self) -> list[str]:
        """Return human-readable warnings for any exceeded usage thresholds."""
        usage = self.collect_usage()
        warnings = []

        cpu = usage.get("cpu_percent", 0)
        if cpu > self.config.cpu_threshold:
            warnings.append(f"High CPU usage: {cpu:.1f}%")

        mem = usage.get("memory_percent", 0)
        if mem > self.config.memory_threshold:
            warnings.append(f"High memory usage: {mem:.1f}%")

        # GPU memory: the per-device keys end with "_memory_percent"
        gpu_limit = self.config.gpu_memory_threshold
        for key, value in usage.items():
            if key.endswith("_memory_percent") and value > gpu_limit:
                warnings.append(f"High GPU memory usage: {key} = {value:.1f}%")

        return warnings
|