agmem 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registry.
- {agmem-0.2.0.dist-info → agmem-0.3.0.dist-info}/METADATA +338 -26
- {agmem-0.2.0.dist-info → agmem-0.3.0.dist-info}/RECORD +32 -16
- memvcs/__init__.py +1 -1
- memvcs/cli.py +1 -1
- memvcs/coordinator/server.py +18 -2
- memvcs/core/agents.py +411 -0
- memvcs/core/archaeology.py +410 -0
- memvcs/core/collaboration.py +435 -0
- memvcs/core/compliance.py +427 -0
- memvcs/core/compression_metrics.py +248 -0
- memvcs/core/confidence.py +379 -0
- memvcs/core/daemon.py +735 -0
- memvcs/core/delta.py +45 -23
- memvcs/core/distiller.py +3 -12
- memvcs/core/fast_similarity.py +404 -0
- memvcs/core/federated.py +13 -2
- memvcs/core/gardener.py +8 -68
- memvcs/core/pack.py +1 -1
- memvcs/core/privacy_validator.py +187 -0
- memvcs/core/private_search.py +327 -0
- memvcs/core/protocol_builder.py +198 -0
- memvcs/core/search_index.py +538 -0
- memvcs/core/semantic_graph.py +388 -0
- memvcs/core/session.py +520 -0
- memvcs/core/timetravel.py +430 -0
- memvcs/integrations/mcp_server.py +775 -4
- memvcs/integrations/web_ui/server.py +424 -0
- memvcs/integrations/web_ui/websocket.py +223 -0
- {agmem-0.2.0.dist-info → agmem-0.3.0.dist-info}/WHEEL +0 -0
- {agmem-0.2.0.dist-info → agmem-0.3.0.dist-info}/entry_points.txt +0 -0
- {agmem-0.2.0.dist-info → agmem-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.2.0.dist-info → agmem-0.3.0.dist-info}/top_level.txt +0 -0

memvcs/core/compliance.py (new file)

@@ -0,0 +1,427 @@
+"""
+Compliance Dashboard - Privacy, Encryption, and Audit verification.
+
+This module provides compliance monitoring capabilities:
+- Privacy budget tracking (ε/δ for differential privacy)
+- Encryption status verification
+- Tamper detection via Merkle tree verification
+- Audit trail analysis
+"""
+
+import hashlib
+import json
+import os
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+
+@dataclass
+class PrivacyBudget:
+    """Tracks differential privacy budget consumption."""
+
+    epsilon: float  # Privacy loss parameter
+    delta: float  # Failure probability
+    queries_made: int = 0
+    budget_consumed: float = 0.0
+    budget_limit: float = 1.0
+    last_query: Optional[str] = None
+
+    def consume(self, epsilon_cost: float) -> bool:
+        """Consume privacy budget. Returns True if within limit."""
+        if self.budget_consumed + epsilon_cost > self.budget_limit:
+            return False
+        self.budget_consumed += epsilon_cost
+        self.queries_made += 1
+        self.last_query = datetime.now(timezone.utc).isoformat()
+        return True
+
+    def remaining(self) -> float:
+        """Get remaining privacy budget."""
+        return max(0, self.budget_limit - self.budget_consumed)
+
+    def is_exhausted(self) -> bool:
+        """Check if budget is exhausted."""
+        return self.budget_consumed >= self.budget_limit
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "epsilon": self.epsilon,
+            "delta": self.delta,
+            "queries_made": self.queries_made,
+            "budget_consumed": self.budget_consumed,
+            "budget_limit": self.budget_limit,
+            "remaining": self.remaining(),
+            "exhausted": self.is_exhausted(),
+            "last_query": self.last_query,
+        }
+
+
+class PrivacyManager:
+    """Manages privacy budgets for different data sources."""
+
+    def __init__(self, mem_dir: Path):
+        self.mem_dir = Path(mem_dir)
+        self.privacy_file = self.mem_dir / "privacy.json"
+        self._budgets: Dict[str, PrivacyBudget] = {}
+        self._load()
+
+    def _load(self) -> None:
+        """Load privacy budgets from disk."""
+        if self.privacy_file.exists():
+            try:
+                data = json.loads(self.privacy_file.read_text())
+                for name, budget_data in data.get("budgets", {}).items():
+                    self._budgets[name] = PrivacyBudget(
+                        epsilon=budget_data["epsilon"],
+                        delta=budget_data["delta"],
+                        queries_made=budget_data.get("queries_made", 0),
+                        budget_consumed=budget_data.get("budget_consumed", 0.0),
+                        budget_limit=budget_data.get("budget_limit", 1.0),
+                        last_query=budget_data.get("last_query"),
+                    )
+            except Exception:
+                pass
+
+    def _save(self) -> None:
+        """Save privacy budgets to disk."""
+        self.mem_dir.mkdir(parents=True, exist_ok=True)
+        data = {"budgets": {name: b.to_dict() for name, b in self._budgets.items()}}
+        self.privacy_file.write_text(json.dumps(data, indent=2))
+
+    def create_budget(
+        self, name: str, epsilon: float = 0.1, delta: float = 1e-5, limit: float = 1.0
+    ) -> PrivacyBudget:
+        """Create a new privacy budget."""
+        budget = PrivacyBudget(epsilon=epsilon, delta=delta, budget_limit=limit)
+        self._budgets[name] = budget
+        self._save()
+        return budget
+
+    def consume(self, name: str, epsilon_cost: float) -> Tuple[bool, Optional[PrivacyBudget]]:
+        """Consume budget for a data source. Returns (success, budget)."""
+        budget = self._budgets.get(name)
+        if not budget:
+            return False, None
+        success = budget.consume(epsilon_cost)
+        self._save()
+        return success, budget
+
+    def get_budget(self, name: str) -> Optional[PrivacyBudget]:
+        """Get a privacy budget by name."""
+        return self._budgets.get(name)
+
+    def get_all_budgets(self) -> Dict[str, PrivacyBudget]:
+        """Get all privacy budgets."""
+        return self._budgets.copy()
+
+    def get_dashboard_data(self) -> Dict[str, Any]:
+        """Get data for privacy dashboard."""
+        return {
+            "budgets": [
+                {"name": name, **budget.to_dict()} for name, budget in self._budgets.items()
+            ],
+            "total_queries": sum(b.queries_made for b in self._budgets.values()),
+            "total_consumed": sum(b.budget_consumed for b in self._budgets.values()),
+        }
+
+
+@dataclass
+class EncryptionStatus:
+    """Status of an encrypted file."""
+
+    path: str
+    is_encrypted: bool
+    algorithm: Optional[str] = None
+    key_id: Optional[str] = None
+    encrypted_at: Optional[str] = None
+    can_decrypt: bool = False
+
+
+class EncryptionVerifier:
+    """Verifies encryption status of memory files."""
+
+    ENCRYPTION_MARKERS = [b"-----BEGIN ENCRYPTED", b"$ENCRYPTED$", b"\x00AGMEM-ENC"]
+
+    def __init__(self, mem_dir: Path, current_dir: Path):
+        self.mem_dir = Path(mem_dir)
+        self.current_dir = Path(current_dir)
+        self.key_file = self.mem_dir / "encryption_keys.json"
+
+    def check_file(self, filepath: Path) -> EncryptionStatus:
+        """Check encryption status of a file."""
+        if not filepath.exists():
+            return EncryptionStatus(path=str(filepath), is_encrypted=False)
+
+        try:
+            content = filepath.read_bytes()[:100]
+            is_encrypted = any(marker in content for marker in self.ENCRYPTION_MARKERS)
+
+            if is_encrypted:
+                algorithm = self._detect_algorithm(content)
+                return EncryptionStatus(
+                    path=str(filepath),
+                    is_encrypted=True,
+                    algorithm=algorithm,
+                    can_decrypt=self._can_decrypt(filepath),
+                )
+            else:
+                return EncryptionStatus(
+                    path=str(filepath),
+                    is_encrypted=False,
+                )
+        except Exception:
+            return EncryptionStatus(path=str(filepath), is_encrypted=False)
+
+    def _detect_algorithm(self, content: bytes) -> str:
+        """Detect encryption algorithm from header."""
+        if b"AES-256" in content:
+            return "AES-256-GCM"
+        elif b"CHACHA20" in content:
+            return "ChaCha20-Poly1305"
+        elif b"FERNET" in content:
+            return "Fernet"
+        return "Unknown"
+
+    def _can_decrypt(self, filepath: Path) -> bool:
+        """Check if we have the key to decrypt."""
+        if not self.key_file.exists():
+            return False
+        # Simplified check - just verify key file exists
+        return True
+
+    def scan_directory(self) -> Dict[str, Any]:
+        """Scan current directory for encryption status."""
+        results = {"encrypted": [], "unencrypted": [], "errors": []}
+
+        for filepath in self.current_dir.rglob("*"):
+            if filepath.is_file():
+                try:
+                    status = self.check_file(filepath)
+                    if status.is_encrypted:
+                        results["encrypted"].append(status)
+                    else:
+                        results["unencrypted"].append(status)
+                except Exception as e:
+                    results["errors"].append({"path": str(filepath), "error": str(e)})
+
+        return {
+            "total": len(results["encrypted"]) + len(results["unencrypted"]),
+            "encrypted_count": len(results["encrypted"]),
+            "unencrypted_count": len(results["unencrypted"]),
+            "error_count": len(results["errors"]),
+            "encrypted_files": [e.path for e in results["encrypted"]],
+            "encryption_coverage": len(results["encrypted"])
+            / max(1, len(results["encrypted"]) + len(results["unencrypted"]))
+            * 100,
+        }
+
+
+class TamperDetector:
+    """Detects tampering via Merkle tree verification."""
+
+    def __init__(self, mem_dir: Path):
+        self.mem_dir = Path(mem_dir)
+        self.merkle_file = self.mem_dir / "merkle_root.json"
+
+    def compute_file_hash(self, filepath: Path) -> str:
+        """Compute SHA-256 hash of a file."""
+        hasher = hashlib.sha256()
+        try:
+            with open(filepath, "rb") as f:
+                for chunk in iter(lambda: f.read(8192), b""):
+                    hasher.update(chunk)
+            return hasher.hexdigest()
+        except Exception:
+            return ""
+
+    def compute_merkle_root(self, file_hashes: List[str]) -> str:
+        """Compute Merkle root from file hashes."""
+        if not file_hashes:
+            return hashlib.sha256(b"").hexdigest()
+
+        # Pad to power of 2
+        while len(file_hashes) & (len(file_hashes) - 1):
+            file_hashes.append(file_hashes[-1])
+
+        # Build tree
+        level = file_hashes
+        while len(level) > 1:
+            next_level = []
+            for i in range(0, len(level), 2):
+                combined = level[i] + level[i + 1]
+                next_level.append(hashlib.sha256(combined.encode()).hexdigest())
+            level = next_level
+
+        return level[0]
+
+    def store_merkle_state(self, directory: Path) -> Dict[str, Any]:
+        """Store current Merkle state for later verification."""
+        file_hashes = []
+        file_paths = []
+
+        for filepath in sorted(directory.rglob("*")):
+            if filepath.is_file():
+                file_hash = self.compute_file_hash(filepath)
+                if file_hash:
+                    file_hashes.append(file_hash)
+                    file_paths.append(str(filepath.relative_to(directory)))
+
+        merkle_root = self.compute_merkle_root(file_hashes)
+
+        state = {
+            "merkle_root": merkle_root,
+            "file_count": len(file_hashes),
+            "computed_at": datetime.now(timezone.utc).isoformat(),
+            "file_hashes": dict(zip(file_paths, file_hashes)),
+        }
+
+        self.mem_dir.mkdir(parents=True, exist_ok=True)
+        self.merkle_file.write_text(json.dumps(state, indent=2))
+
+        return state
+
+    def verify_integrity(self, directory: Path) -> Dict[str, Any]:
+        """Verify current state against stored Merkle root."""
+        if not self.merkle_file.exists():
+            return {"verified": False, "error": "No stored Merkle state found"}
+
+        stored = json.loads(self.merkle_file.read_text())
+        stored_hashes = stored.get("file_hashes", {})
+
+        current_hashes = {}
+        for filepath in sorted(directory.rglob("*")):
+            if filepath.is_file():
+                rel_path = str(filepath.relative_to(directory))
+                current_hashes[rel_path] = self.compute_file_hash(filepath)
+
+        # Compare
+        modified = []
+        added = []
+        deleted = []
+
+        for path, hash_value in current_hashes.items():
+            if path not in stored_hashes:
+                added.append(path)
+            elif stored_hashes[path] != hash_value:
+                modified.append(path)
+
+        for path in stored_hashes:
+            if path not in current_hashes:
+                deleted.append(path)
+
+        current_root = self.compute_merkle_root(list(current_hashes.values()))
+
+        return {
+            "verified": len(modified) == 0 and len(added) == 0 and len(deleted) == 0,
+            "stored_root": stored.get("merkle_root"),
+            "current_root": current_root,
+            "roots_match": stored.get("merkle_root") == current_root,
+            "modified_files": modified,
+            "added_files": added,
+            "deleted_files": deleted,
+            "stored_at": stored.get("computed_at"),
+        }
+
+
+class AuditAnalyzer:
+    """Analyzes audit trail for compliance."""
+
+    def __init__(self, mem_dir: Path):
+        self.mem_dir = Path(mem_dir)
+        self.audit_file = self.mem_dir / "audit.log"
+
+    def load_audit_entries(self) -> List[Dict[str, Any]]:
+        """Load audit log entries."""
+        if not self.audit_file.exists():
+            return []
+
+        entries = []
+        try:
+            for line in self.audit_file.read_text().strip().split("\n"):
+                if line:
+                    try:
+                        entries.append(json.loads(line))
+                    except Exception:
+                        pass
+        except Exception:
+            pass
+
+        return entries
+
+    def verify_chain(self) -> Dict[str, Any]:
+        """Verify audit chain integrity."""
+        entries = self.load_audit_entries()
+        if not entries:
+            return {"valid": True, "entries": 0, "message": "No audit entries"}
+
+        valid = True
+        errors = []
+        prev_hash = None
+
+        for i, entry in enumerate(entries):
+            # Verify hash chain
+            entry_hash = entry.get("hash")
+            entry_prev = entry.get("prev_hash")
+
+            if i > 0 and entry_prev != prev_hash:
+                valid = False
+                errors.append(f"Chain break at entry {i}")
+
+            prev_hash = entry_hash
+
+        return {
+            "valid": valid,
+            "entries": len(entries),
+            "errors": errors,
+            "first_entry": entries[0].get("timestamp") if entries else None,
+            "last_entry": entries[-1].get("timestamp") if entries else None,
+        }
+
+    def get_statistics(self) -> Dict[str, Any]:
+        """Get audit statistics."""
+        entries = self.load_audit_entries()
+
+        operations = {}
+        agents = {}
+        by_day = {}
+
+        for entry in entries:
+            op = entry.get("operation", "unknown")
+            operations[op] = operations.get(op, 0) + 1
+
+            agent = entry.get("agent", "unknown")
+            agents[agent] = agents.get(agent, 0) + 1
+
+            ts = entry.get("timestamp", "")[:10]
+            if ts:
+                by_day[ts] = by_day.get(ts, 0) + 1
+
+        return {
+            "total_entries": len(entries),
+            "operations": operations,
+            "agents": agents,
+            "by_day": by_day,
+        }
+
+
+# --- Dashboard Helper ---
+
+
+def get_compliance_dashboard(mem_dir: Path, current_dir: Path) -> Dict[str, Any]:
+    """Get data for compliance dashboard."""
+    privacy_mgr = PrivacyManager(mem_dir)
+    encryption_verifier = EncryptionVerifier(mem_dir, current_dir)
+    tamper_detector = TamperDetector(mem_dir)
+    audit_analyzer = AuditAnalyzer(mem_dir)
+
+    return {
+        "privacy": privacy_mgr.get_dashboard_data(),
+        "encryption": encryption_verifier.scan_directory(),
+        "integrity": tamper_detector.verify_integrity(current_dir),
+        "audit": {
+            "chain_valid": audit_analyzer.verify_chain(),
+            "statistics": audit_analyzer.get_statistics(),
+        },
+    }
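The hunk above ties four concerns (privacy budgets, encryption scanning, Merkle-based tamper detection, audit-chain checks) into a single entry point, `get_compliance_dashboard`. A rough usage sketch follows; it is not code from the package, and the `.mem` and `current` directory names are assumptions, while the import path and API names come from the diff above:

```python
# Illustrative sketch, not shipped with agmem: drives the compliance module added in 0.3.0.
# Assumed layout: ".mem" holds repository metadata, "current" is the tree being verified.
from pathlib import Path

from memvcs.core.compliance import PrivacyManager, TamperDetector, get_compliance_dashboard

mem_dir = Path(".mem")         # assumed metadata directory
current_dir = Path("current")  # assumed working directory

# Register a differential-privacy budget for one data source, then spend part of it.
privacy = PrivacyManager(mem_dir)
privacy.create_budget("episodic-search", epsilon=0.1, delta=1e-5, limit=1.0)
allowed, budget = privacy.consume("episodic-search", epsilon_cost=0.05)
print("query allowed:", allowed, "budget left:", budget.remaining() if budget else None)

# Snapshot a Merkle root now so a later run can flag modified/added/deleted files.
TamperDetector(mem_dir).store_merkle_state(current_dir)

# Aggregate privacy, encryption, integrity, and audit status in one call.
dashboard = get_compliance_dashboard(mem_dir, current_dir)
print("integrity verified:", dashboard["integrity"]["verified"])
print("encryption coverage: %.1f%%" % dashboard["encryption"]["encryption_coverage"])
```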
memvcs/core/compression_metrics.py (new file)

@@ -0,0 +1,248 @@
+"""
+Delta compression metrics and observability.
+
+Tracks compression effectiveness across object types to enable future
+optimization and auto-tuning of delta encoding parameters.
+
+Provides:
+- DeltaCompressionMetrics: Tracks compression ratio, object types, benefits
+- CompressionHeatmap: Visualizes which types compress best
+- Statistics reporting for gc --repack operations
+"""
+
+from dataclasses import dataclass, field
+from typing import Dict, List, Any, Optional, Tuple
+from collections import defaultdict
+
+
+@dataclass
+class ObjectCompressionStats:
+    """Statistics for a single object's compression."""
+
+    object_id: str
+    object_type: str  # "semantic", "episodic", "procedural"
+    original_size: int  # bytes
+    compressed_size: int  # bytes after delta encoding
+    compression_ratio: float  # compressed_size / original_size (0.0 = 100% compression)
+    delta_used: bool  # Whether delta encoding was applied
+    compression_benefit: float  # original_size - compressed_size
+
+
+@dataclass
+class TypeCompressionStats:
+    """Aggregated statistics for an object type."""
+
+    object_type: str
+    count: int = 0
+    total_original_size: int = 0
+    total_compressed_size: int = 0
+    avg_compression_ratio: float = 0.0
+    total_benefit: int = 0  # Total bytes saved
+    objects_with_delta: int = 0  # How many used delta encoding
+    min_ratio: float = 1.0
+    max_ratio: float = 0.0
+
+    def update_from_object(self, obj_stats: ObjectCompressionStats) -> None:
+        """Update type stats with a single object's stats."""
+        self.count += 1
+        self.total_original_size += obj_stats.original_size
+        self.total_compressed_size += obj_stats.compressed_size
+        self.total_benefit += int(obj_stats.compression_benefit)
+        if obj_stats.delta_used:
+            self.objects_with_delta += 1
+        self.min_ratio = min(self.min_ratio, obj_stats.compression_ratio)
+        self.max_ratio = max(self.max_ratio, obj_stats.compression_ratio)
+
+        # Recalculate average
+        if self.total_original_size > 0:
+            self.avg_compression_ratio = self.total_compressed_size / self.total_original_size
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict for reporting."""
+        savings_pct = 0.0
+        if self.total_original_size > 0:
+            savings_pct = (self.total_benefit / self.total_original_size) * 100
+
+        return {
+            "object_type": self.object_type,
+            "count": self.count,
+            "total_original_bytes": self.total_original_size,
+            "total_compressed_bytes": self.total_compressed_size,
+            "avg_compression_ratio": round(self.avg_compression_ratio, 3),
+            "compression_range": f"{self.min_ratio:.1%} - {self.max_ratio:.1%}",
+            "total_bytes_saved": self.total_benefit,
+            "savings_percentage": round(savings_pct, 1),
+            "objects_using_delta": self.objects_with_delta,
+            "delta_adoption_rate": (
+                round((self.objects_with_delta / self.count * 100), 1) if self.count > 0 else 0
+            ),
+        }
+
+
+class DeltaCompressionMetrics:
+    """Tracks delta compression statistics across all objects.
+
+    Usage:
+        metrics = DeltaCompressionMetrics()
+        # ... during packing ...
+        metrics.record_object(ObjectCompressionStats(...))
+        # ... after packing ...
+        report = metrics.get_report()
+    """
+
+    def __init__(self):
+        self.objects: List[ObjectCompressionStats] = []
+        self.type_stats: Dict[str, TypeCompressionStats] = {}
+        self.total_original_size: int = 0
+        self.total_compressed_size: int = 0
+
+    def record_object(self, obj_stats: ObjectCompressionStats) -> None:
+        """Record compression stats for a single object."""
+        self.objects.append(obj_stats)
+        self.total_original_size += obj_stats.original_size
+        self.total_compressed_size += obj_stats.compressed_size
+
+        # Update type-specific stats
+        if obj_stats.object_type not in self.type_stats:
+            self.type_stats[obj_stats.object_type] = TypeCompressionStats(
+                object_type=obj_stats.object_type
+            )
+        self.type_stats[obj_stats.object_type].update_from_object(obj_stats)
+
+    def get_type_stats(self, object_type: str) -> Optional[TypeCompressionStats]:
+        """Get stats for a specific object type."""
+        return self.type_stats.get(object_type)
+
+    def get_overall_ratio(self) -> float:
+        """Get overall compression ratio across all objects."""
+        if self.total_original_size == 0:
+            return 0.0
+        return self.total_compressed_size / self.total_original_size
+
+    def get_overall_savings(self) -> int:
+        """Get total bytes saved across all objects."""
+        return self.total_original_size - self.total_compressed_size
+
+    def get_report(self) -> Dict[str, Any]:
+        """Generate a comprehensive compression report."""
+        overall_ratio = self.get_overall_ratio()
+        overall_savings = self.get_overall_savings()
+        savings_pct = (
+            (overall_savings / self.total_original_size * 100)
+            if self.total_original_size > 0
+            else 0
+        )
+
+        return {
+            "timestamp": None,  # Set by caller if needed
+            "total_objects": len(self.objects),
+            "total_original_bytes": self.total_original_size,
+            "total_compressed_bytes": self.total_compressed_size,
+            "overall_compression_ratio": round(overall_ratio, 3),
+            "total_bytes_saved": overall_savings,
+            "compression_percentage": round(savings_pct, 1),
+            "type_statistics": {otype: stats.to_dict() for otype, stats in self.type_stats.items()},
+            "recommendations": self._generate_recommendations(),
+        }
+
+    def _generate_recommendations(self) -> List[str]:
+        """Generate optimization recommendations based on compression stats."""
+        recommendations = []
+
+        # Check if delta encoding is worth it
+        objects_with_delta = sum(s.objects_with_delta for s in self.type_stats.values())
+        if objects_with_delta == 0:
+            recommendations.append("No objects used delta encoding. Check similarity thresholds.")
+
+        # Check for types with poor compression
+        for otype, stats in self.type_stats.items():
+            if stats.count > 0 and stats.avg_compression_ratio > 0.9:
+                recommendations.append(
+                    f"Type '{otype}' compresses poorly (ratio: {stats.avg_compression_ratio:.1%}). "
+                    f"Consider increasing similarity threshold or reducing delta cost."
+                )
+
+        # Check for types with excellent compression
+        for otype, stats in self.type_stats.items():
+            if stats.count > 0 and stats.avg_compression_ratio < 0.5:
+                recommendations.append(
+                    f"Type '{otype}' compresses very well (ratio: {stats.avg_compression_ratio:.1%}). "
+                    f"Consider aggressive delta encoding or reduced threshold."
+                )
+
+        if not recommendations:
+            recommendations.append("Compression is operating normally.")
+
+        return recommendations
+
+    def get_heatmap(self) -> str:
+        """Generate a text-based compression heatmap."""
+        lines = ["Delta Compression Heatmap", "=" * 50]
+
+        if not self.type_stats:
+            lines.append("No compression data available")
+            return "\n".join(lines)
+
+        # Sort by compression ratio
+        sorted_types = sorted(
+            self.type_stats.values(),
+            key=lambda s: s.avg_compression_ratio,
+        )
+
+        for stats in sorted_types:
+            if stats.count == 0:
+                continue
+            ratio = stats.avg_compression_ratio
+            # Create a simple bar chart
+            bar_width = 30
+            filled = int(bar_width * ratio)
+            bar = "█" * filled + "░" * (bar_width - filled)
+            saved_pct = (
+                (stats.total_benefit / stats.total_original_size * 100)
+                if stats.total_original_size > 0
+                else 0
+            )
+            lines.append(
+                f"{stats.object_type:12} {bar} {saved_pct:5.1f}% saved ({stats.objects_with_delta}/{stats.count} using delta)"
+            )
+
+        return "\n".join(lines)
+
+    def log_report(self, logger: Any = None) -> None:
+        """Log the compression report."""
+        report = self.get_report()
+        heatmap = self.get_heatmap()
+
+        output = [
+            "=" * 70,
+            "Delta Compression Report",
+            "=" * 70,
+            f"Total Objects: {report['total_objects']}",
+            f"Total Original: {report['total_original_bytes']:,} bytes",
+            f"Total Compressed: {report['total_compressed_bytes']:,} bytes",
+            f"Overall Ratio: {report['overall_compression_ratio']:.1%}",
+            f"Bytes Saved: {report['total_bytes_saved']:,} ({report['compression_percentage']:.1f}%)",
+            "",
+            heatmap,
+            "",
+            "Type Breakdown:",
+        ]
+
+        for otype, stats in sorted(report["type_statistics"].items()):
+            output.append(f" {otype}:")
+            output.append(f" Count: {stats['count']}")
+            output.append(f" Compression: {stats['avg_compression_ratio']:.1%}")
+            output.append(f" Saved: {stats['total_bytes_saved']:,} bytes")
+            output.append(f" Delta adoption: {stats['delta_adoption_rate']:.0f}%")
+
+        output.extend(["", "Recommendations:"])
+        for rec in report["recommendations"]:
+            output.append(f" - {rec}")
+
+        output.append("=" * 70)
+
+        full_output = "\n".join(output)
+        if logger:
+            logger.info(full_output)
+        else:
+            print(full_output)
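The `Usage` note in the class docstring above describes the intended flow; filled in with concrete numbers it looks roughly like the sketch below. The measurements and object IDs are invented for illustration; only the class, field, and method names come from the diff:

```python
# Illustrative sketch, not shipped with agmem: the sizes and types below are made up.
from memvcs.core.compression_metrics import DeltaCompressionMetrics, ObjectCompressionStats

metrics = DeltaCompressionMetrics()
samples = [
    # (object_id, object_type, original_size, compressed_size, delta_used)
    ("a1", "episodic", 4096, 1024, True),
    ("b2", "semantic", 2048, 1900, False),
    ("c3", "procedural", 8192, 3500, True),
]
for oid, otype, original, compressed, used in samples:
    metrics.record_object(
        ObjectCompressionStats(
            object_id=oid,
            object_type=otype,
            original_size=original,
            compressed_size=compressed,
            compression_ratio=compressed / original,
            delta_used=used,
            compression_benefit=original - compressed,
        )
    )

print(metrics.get_heatmap())   # per-type bars, best-compressing types first
report = metrics.get_report()  # totals plus tuning recommendations
print(report["overall_compression_ratio"], report["recommendations"])
```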