alma-memory 0.5.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alma/__init__.py +33 -1
- alma/core.py +124 -16
- alma/extraction/auto_learner.py +4 -3
- alma/graph/__init__.py +26 -1
- alma/graph/backends/__init__.py +14 -0
- alma/graph/backends/kuzu.py +624 -0
- alma/graph/backends/memgraph.py +432 -0
- alma/integration/claude_agents.py +22 -10
- alma/learning/protocols.py +3 -3
- alma/mcp/tools.py +9 -11
- alma/observability/__init__.py +84 -0
- alma/observability/config.py +302 -0
- alma/observability/logging.py +424 -0
- alma/observability/metrics.py +583 -0
- alma/observability/tracing.py +440 -0
- alma/retrieval/engine.py +65 -4
- alma/storage/__init__.py +29 -0
- alma/storage/azure_cosmos.py +343 -132
- alma/storage/base.py +58 -0
- alma/storage/constants.py +103 -0
- alma/storage/file_based.py +3 -8
- alma/storage/migrations/__init__.py +21 -0
- alma/storage/migrations/base.py +321 -0
- alma/storage/migrations/runner.py +323 -0
- alma/storage/migrations/version_stores.py +337 -0
- alma/storage/migrations/versions/__init__.py +11 -0
- alma/storage/migrations/versions/v1_0_0.py +373 -0
- alma/storage/postgresql.py +185 -78
- alma/storage/sqlite_local.py +149 -50
- alma/testing/__init__.py +46 -0
- alma/testing/factories.py +301 -0
- alma/testing/mocks.py +389 -0
- {alma_memory-0.5.0.dist-info → alma_memory-0.5.1.dist-info}/METADATA +42 -8
- {alma_memory-0.5.0.dist-info → alma_memory-0.5.1.dist-info}/RECORD +36 -19
- {alma_memory-0.5.0.dist-info → alma_memory-0.5.1.dist-info}/WHEEL +0 -0
- {alma_memory-0.5.0.dist-info → alma_memory-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,583 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ALMA Metrics Collection.
|
|
3
|
+
|
|
4
|
+
Provides metrics collection using OpenTelemetry with fallback
|
|
5
|
+
to in-memory collection when OTel is not available.
|
|
6
|
+
|
|
7
|
+
Metrics tracked:
|
|
8
|
+
- Memory operation latency (retrieve, learn, forget)
|
|
9
|
+
- Embedding generation time
|
|
10
|
+
- Cache hit/miss rates
|
|
11
|
+
- Storage backend query times
|
|
12
|
+
- Memory counts by type
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import threading
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any, Dict, List, Optional
|
|
19
|
+
|
|
20
|
+
# Try to import OpenTelemetry
|
|
21
|
+
_otel_available = False
|
|
22
|
+
try:
|
|
23
|
+
from opentelemetry import metrics
|
|
24
|
+
|
|
25
|
+
_otel_available = True
|
|
26
|
+
except ImportError:
|
|
27
|
+
pass
|
|
28
|
+
|
|
29
|
+
# Global metrics instance
|
|
30
|
+
_metrics_instance: Optional["ALMAMetrics"] = None
|
|
31
|
+
_metrics_lock = threading.Lock()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class MetricValue:
    """Container for a single recorded metric value with metadata."""

    # Metric name, e.g. "memory.retrieve.latency".
    name: str
    # The recorded numeric value (count, duration, size, ...).
    value: float
    # Time the value was recorded; presumably time.time() seconds — confirm at call sites.
    timestamp: float
    # Optional key/value labels attached to this sample.
    labels: Dict[str, str] = field(default_factory=dict)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class InMemoryMetricsCollector:
    """
    In-memory metrics collection for when OpenTelemetry is not available.

    Thread-safe store of counters, histogram samples, and gauges, keyed
    by metric name plus a canonical label suffix (see _make_key). Values
    are kept in memory for later retrieval via the get_* accessors.
    """

    def __init__(self, max_samples: int = 10000):
        """
        Initialize in-memory collector.

        Args:
            max_samples: Maximum histogram samples retained per metric key;
                the oldest samples are discarded once this is exceeded.
        """
        self._counters: Dict[str, float] = {}
        self._histograms: Dict[str, List[float]] = {}
        self._gauges: Dict[str, float] = {}
        # RLock so get_all_metrics() can aggregate while already holding the lock.
        self._lock = threading.RLock()
        self._max_samples = max_samples

    def increment_counter(
        self,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Increment a counter metric by `value` (default 1.0)."""
        key = self._make_key(name, labels)
        with self._lock:
            self._counters[key] = self._counters.get(key, 0.0) + value

    def record_histogram(
        self,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Record a histogram sample, trimming to the newest max_samples."""
        key = self._make_key(name, labels)
        with self._lock:
            if key not in self._histograms:
                self._histograms[key] = []
            self._histograms[key].append(value)
            # Trim if needed, keeping only the most recent samples.
            if len(self._histograms[key]) > self._max_samples:
                self._histograms[key] = self._histograms[key][-self._max_samples :]

    def set_gauge(
        self,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Set a gauge to an absolute value."""
        key = self._make_key(name, labels)
        with self._lock:
            self._gauges[key] = value

    def increment_gauge(
        self,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Increment a gauge (up-down counter); negative values decrement."""
        key = self._make_key(name, labels)
        with self._lock:
            self._gauges[key] = self._gauges.get(key, 0.0) + value

    def _make_key(
        self,
        name: str,
        labels: Optional[Dict[str, str]] = None,
    ) -> str:
        """Create a unique key for the metric: name, plus sorted labels in braces."""
        if not labels:
            return name
        label_str = ",".join(f"{k}={v}" for k, v in sorted(labels.items()))
        return f"{name}{{{label_str}}}"

    @staticmethod
    def _summarize(values: List[float]) -> Dict[str, float]:
        """Compute count/sum/min/max/avg/percentile stats for a sample list."""
        if not values:
            return {
                "count": 0,
                "sum": 0,
                "min": 0,
                "max": 0,
                "avg": 0,
                "p50": 0,
                "p95": 0,
                "p99": 0,
            }
        sorted_values = sorted(values)
        count = len(sorted_values)
        total = sum(sorted_values)
        return {
            "count": count,
            "sum": total,
            "min": sorted_values[0],
            "max": sorted_values[-1],
            "avg": total / count,
            # Nearest-rank percentiles; upper indices clamped to the last sample.
            "p50": sorted_values[int(count * 0.5)],
            "p95": sorted_values[min(int(count * 0.95), count - 1)],
            "p99": sorted_values[min(int(count * 0.99), count - 1)],
        }

    def get_counter(self, name: str, labels: Optional[Dict[str, str]] = None) -> float:
        """Get counter value (0.0 if never incremented)."""
        key = self._make_key(name, labels)
        with self._lock:
            return self._counters.get(key, 0.0)

    def get_histogram_stats(
        self,
        name: str,
        labels: Optional[Dict[str, str]] = None,
    ) -> Dict[str, float]:
        """Get histogram statistics for the given name/labels (zeros if empty)."""
        key = self._make_key(name, labels)
        with self._lock:
            return self._summarize(self._histograms.get(key, []))

    def get_gauge(self, name: str, labels: Optional[Dict[str, str]] = None) -> float:
        """Get gauge value (0.0 if never set)."""
        key = self._make_key(name, labels)
        with self._lock:
            return self._gauges.get(key, 0.0)

    def get_all_metrics(self) -> Dict[str, Any]:
        """Get all metrics as a dictionary of counters, histogram stats, and gauges."""
        with self._lock:
            return {
                "counters": dict(self._counters),
                # BUGFIX: summarize each stored key's own samples. Previously this
                # re-looked-up stats by the label-stripped name ("k.split('{')[0]"),
                # so every labeled histogram reported all-zero statistics here.
                "histograms": {
                    k: self._summarize(v) for k, v in self._histograms.items()
                },
                "gauges": dict(self._gauges),
            }

    def reset(self):
        """Reset all metrics (clears counters, histograms, and gauges)."""
        with self._lock:
            self._counters.clear()
            self._histograms.clear()
            self._gauges.clear()
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class MetricsCollector:
    """
    Unified metrics collector that uses OpenTelemetry when available,
    falling back to in-memory collection otherwise.

    Gauges are always mirrored into the in-memory fallback so their
    current values can be read back via get_stats().
    """

    def __init__(
        self,
        service_name: str = "alma-memory",
        use_otel: bool = True,
    ):
        """
        Initialize metrics collector.

        Args:
            service_name: Service name for metrics (used as the OTel meter name)
            use_otel: Whether to use OpenTelemetry (if available)
        """
        self.service_name = service_name
        self._use_otel = use_otel and _otel_available
        self._fallback = InMemoryMetricsCollector()

        # OpenTelemetry instruments, created lazily and keyed by metric name.
        self._otel_counters: Dict[str, Any] = {}
        self._otel_histograms: Dict[str, Any] = {}
        self._otel_gauges: Dict[str, Any] = {}

        if self._use_otel:
            self._meter = metrics.get_meter(service_name)

    def counter(
        self,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Increment a counter metric."""
        if self._use_otel:
            if name not in self._otel_counters:
                self._otel_counters[name] = self._meter.create_counter(
                    name=f"alma.{name}",
                    description=f"ALMA counter: {name}",
                )
            self._otel_counters[name].add(value, labels or {})
        else:
            self._fallback.increment_counter(name, value, labels)

    def histogram(
        self,
        name: str,
        value: float,
        unit: str = "ms",
        labels: Optional[Dict[str, str]] = None,
    ):
        """Record a histogram value (typically latency)."""
        if self._use_otel:
            if name not in self._otel_histograms:
                self._otel_histograms[name] = self._meter.create_histogram(
                    name=f"alma.{name}",
                    unit=unit,
                    description=f"ALMA histogram: {name}",
                )
            self._otel_histograms[name].record(value, labels or {})
        else:
            self._fallback.record_histogram(name, value, labels)

    def gauge(
        self,
        name: str,
        value: float,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Set a gauge to an absolute value."""
        if self._use_otel:
            # OTel gauges require callbacks, so we use an up-down counter.
            if name not in self._otel_gauges:
                self._otel_gauges[name] = self._meter.create_up_down_counter(
                    name=f"alma.{name}",
                    description=f"ALMA gauge: {name}",
                )
            # BUGFIX: the instrument was previously created but never written
            # (dead `pass`), so gauge values were silently dropped from OTel.
            # Up-down counters only accept deltas, so add the difference from
            # the last absolute value, which the fallback always tracks.
            previous = self._fallback.get_gauge(name, labels)
            self._otel_gauges[name].add(value - previous, labels or {})
        # Always use fallback for gauges to support get operations.
        self._fallback.set_gauge(name, value, labels)

    def gauge_increment(
        self,
        name: str,
        value: float = 1.0,
        labels: Optional[Dict[str, str]] = None,
    ):
        """Increment (or decrement if negative) a gauge."""
        if self._use_otel:
            if name not in self._otel_gauges:
                self._otel_gauges[name] = self._meter.create_up_down_counter(
                    name=f"alma.{name}",
                    description=f"ALMA gauge: {name}",
                )
            self._otel_gauges[name].add(value, labels or {})
        # Mirror into the fallback so the current value stays readable.
        self._fallback.increment_gauge(name, value, labels)

    def get_stats(self) -> Dict[str, Any]:
        """Get all metrics as a dictionary (from the fallback collector)."""
        return self._fallback.get_all_metrics()

    def timer(self, name: str, labels: Optional[Dict[str, str]] = None) -> "Timer":
        """Create a timer context manager for measuring duration."""
        return Timer(self, name, labels)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
class Timer:
    """Context manager that records elapsed wall time as a histogram.

    On exit, the duration in milliseconds is stored on `duration_ms`
    and recorded on the owning collector with a "success" label that
    reflects whether the body raised.
    """

    def __init__(
        self,
        collector: MetricsCollector,
        name: str,
        labels: Optional[Dict[str, str]] = None,
    ):
        self._collector = collector
        self._name = name
        self._labels = labels
        self._t0: Optional[float] = None
        self.duration_ms: float = 0

    def __enter__(self) -> "Timer":
        self._t0 = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        started = self._t0
        # Guard: only record if __enter__ actually ran.
        if started is not None:
            self.duration_ms = (time.time() - started) * 1000
            tags = {} if self._labels is None else dict(self._labels)
            tags["success"] = "true" if exc_type is None else "false"
            self._collector.histogram(self._name, self.duration_ms, "ms", tags)
        # Never suppress the exception.
        return False
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class ALMAMetrics:
    """
    High-level metrics interface for ALMA operations.

    Wraps a MetricsCollector and exposes semantic recording helpers for
    memory operations, embeddings, caching, storage, and sessions.
    """

    def __init__(self, collector: Optional[MetricsCollector] = None):
        """Initialize ALMA metrics, creating a default collector if none given."""
        self._collector = collector or MetricsCollector()

    @property
    def collector(self) -> MetricsCollector:
        """Get underlying metrics collector."""
        return self._collector

    # ==================== Memory Operations ====================

    def record_retrieve_latency(
        self,
        duration_ms: float,
        agent: str,
        project_id: str,
        cache_hit: bool,
        items_returned: int,
    ):
        """Record latency, call count, and item count for one memory retrieval."""
        scope = {"agent": agent, "project_id": project_id}
        self._collector.histogram(
            "memory.retrieve.latency",
            duration_ms,
            "ms",
            {**scope, "cache_hit": str(cache_hit).lower()},
        )
        self._collector.counter("memory.retrieve.count", 1, dict(scope))
        self._collector.counter("memory.retrieve.items", items_returned, dict(scope))

    def record_learn_operation(
        self,
        duration_ms: float,
        agent: str,
        project_id: str,
        memory_type: str,
        success: bool,
    ):
        """Record latency and count for one learning (write) operation."""
        ok = str(success).lower()
        self._collector.histogram(
            "memory.learn.latency",
            duration_ms,
            "ms",
            {
                "agent": agent,
                "project_id": project_id,
                "memory_type": memory_type,
                "success": ok,
            },
        )
        self._collector.counter(
            "memory.learn.count",
            1,
            {"agent": agent, "memory_type": memory_type, "success": ok},
        )

    def record_forget_operation(
        self,
        duration_ms: float,
        agent: Optional[str],
        project_id: str,
        items_removed: int,
    ):
        """Record one forget (pruning) operation and how many items it removed."""
        self._collector.histogram(
            "memory.forget.latency",
            duration_ms,
            "ms",
            {"project_id": project_id},
        )
        self._collector.counter(
            "memory.forget.items",
            items_removed,
            {"project_id": project_id, "agent": agent or "all"},
        )

    # ==================== Embedding Operations ====================

    def record_embedding_latency(
        self,
        duration_ms: float,
        provider: str,
        batch_size: int = 1,
    ):
        """Record embedding generation latency and the number of items embedded."""
        self._collector.histogram(
            "embedding.latency",
            duration_ms,
            "ms",
            {"provider": provider, "batch_size": str(batch_size)},
        )
        self._collector.counter("embedding.count", batch_size, {"provider": provider})

    # ==================== Cache Operations ====================

    def record_cache_hit(self, cache_type: str = "retrieval"):
        """Count one cache hit."""
        self._collector.counter("cache.hit", 1, {"cache_type": cache_type})

    def record_cache_miss(self, cache_type: str = "retrieval"):
        """Count one cache miss."""
        self._collector.counter("cache.miss", 1, {"cache_type": cache_type})

    def record_cache_eviction(self, cache_type: str = "retrieval", count: int = 1):
        """Count cache evictions."""
        self._collector.counter("cache.eviction", count, {"cache_type": cache_type})

    def set_cache_size(self, size: int, cache_type: str = "retrieval"):
        """Publish the current cache size as a gauge."""
        self._collector.gauge("cache.size", size, {"cache_type": cache_type})

    # ==================== Storage Operations ====================

    def record_storage_query_latency(
        self,
        duration_ms: float,
        operation: str,
        backend: str,
        success: bool = True,
    ):
        """Record latency and count for one storage backend query."""
        self._collector.histogram(
            "storage.query.latency",
            duration_ms,
            "ms",
            {
                "operation": operation,
                "backend": backend,
                "success": str(success).lower(),
            },
        )
        self._collector.counter(
            "storage.query.count",
            1,
            {"operation": operation, "backend": backend},
        )

    def record_storage_error(self, backend: str, operation: str, error_type: str):
        """Count one storage error, labeled by backend, operation, and error type."""
        self._collector.counter(
            "storage.error.count",
            1,
            {"backend": backend, "operation": operation, "error_type": error_type},
        )

    # ==================== Memory Counts ====================

    def set_memory_count(
        self,
        count: int,
        memory_type: str,
        agent: Optional[str] = None,
        project_id: Optional[str] = None,
    ):
        """Publish the number of stored memory items as a gauge."""
        tags = {"memory_type": memory_type}
        if agent:
            tags["agent"] = agent
        if project_id:
            tags["project_id"] = project_id
        self._collector.gauge("memory.count", count, tags)

    # ==================== Session Operations ====================

    def record_session_start(self, agent: str, project_id: str):
        """Count one session start."""
        self._collector.counter(
            "session.start",
            1,
            {"agent": agent, "project_id": project_id},
        )

    def record_session_end(
        self,
        agent: str,
        project_id: str,
        duration_ms: float,
        outcome: str,
    ):
        """Record one session end with its duration and outcome."""
        tags = {"agent": agent, "project_id": project_id, "outcome": outcome}
        self._collector.histogram("session.duration", duration_ms, "ms", tags)
        self._collector.counter("session.end", 1, dict(tags))

    # ==================== Utility ====================

    def get_all_metrics(self) -> Dict[str, Any]:
        """Get all collected metrics (from the fallback collector)."""
        return self._collector.get_stats()

    def timer(self, name: str, labels: Optional[Dict[str, str]] = None) -> Timer:
        """Create a timer for measuring operation duration."""
        return self._collector.timer(name, labels)
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
def get_meter(name: str = "alma"):
    """
    Get an OpenTelemetry meter by name.

    Returns None when OpenTelemetry is not installed, so callers must
    handle the no-op case themselves.
    """
    return metrics.get_meter(name) if _otel_available else None
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def get_metrics() -> ALMAMetrics:
    """
    Get the global ALMAMetrics instance.

    The instance is created lazily on first access; creation and
    publication happen under a lock so concurrent callers share one.
    """
    global _metrics_instance

    with _metrics_lock:
        instance = _metrics_instance
        if instance is None:
            instance = ALMAMetrics()
            _metrics_instance = instance
        return instance
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def set_metrics(metrics_instance: ALMAMetrics):
    """Set the global ALMAMetrics instance, replacing any existing one.

    Args:
        metrics_instance: The instance that get_metrics() will return
            from now on.
    """
    global _metrics_instance

    # Same lock as get_metrics() so replacement is atomic w.r.t. lazy creation.
    with _metrics_lock:
        _metrics_instance = metrics_instance
|