agmem 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/METADATA +5 -4
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/RECORD +17 -13
- memvcs/__init__.py +1 -1
- memvcs/cli.py +1 -1
- memvcs/coordinator/server.py +18 -2
- memvcs/core/compression_metrics.py +248 -0
- memvcs/core/distiller.py +3 -12
- memvcs/core/fast_similarity.py +404 -0
- memvcs/core/federated.py +13 -2
- memvcs/core/gardener.py +8 -68
- memvcs/core/pack.py +1 -1
- memvcs/core/privacy_validator.py +187 -0
- memvcs/core/protocol_builder.py +198 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/WHEEL +0 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/entry_points.txt +0 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: agmem
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: Agentic Memory Version Control System - Git for AI agent memories
|
|
5
5
|
Home-page: https://github.com/vivek-tiwari-vt/agmem
|
|
6
6
|
Author: agmem Team
|
|
@@ -137,14 +137,15 @@ agmem solves all of these problems with a familiar Git-like interface.
|
|
|
137
137
|
- ✅ **Tamper-evident audit trail** — Append-only hash-chained log (init, add, commit, checkout, merge, push, pull, config); `agmem audit` and `agmem audit --verify`
|
|
138
138
|
- ✅ **Multi-agent trust** — Trust store (full / conditional / untrusted) per public key; applied on pull/merge; clone copies remote keys
|
|
139
139
|
- ✅ **Conflict resolution** — `agmem resolve` with ours/theirs/both; conflicts persisted in `.mem/merge/`; path-safe
|
|
140
|
-
- ✅ **Differential privacy** — Epsilon/delta budget in `.mem/privacy_budget.json`; `--private` on `agmem distill` and `agmem garden`; noise
|
|
140
|
+
- ✅ **Differential privacy** — Epsilon/delta budget in `.mem/privacy_budget.json`; `--private` on `agmem distill` and `agmem garden`; noise applies to fact-level data only (metadata fields excluded)
|
|
141
141
|
- ✅ **Pack files & GC** — `agmem gc [--repack]` (reachable from refs, prune loose, optional pack file + index); ObjectStore reads from pack when loose missing
|
|
142
142
|
- ✅ **Multi-provider LLM** — OpenAI and Anthropic via `memvcs.core.llm`; config/repo or env; used by gardener, distiller, consistency, merge
|
|
143
143
|
- ✅ **Temporal querying** — Point-in-time and range queries in temporal index; frontmatter timestamps
|
|
144
|
-
- ✅ **Federated collaboration** — `agmem federated push|pull`;
|
|
144
|
+
- ✅ **Federated collaboration** — `agmem federated push|pull`; protocol-compliant summaries (agent_id, timestamp, topic_counts, fact_hashes); optional DP on outbound; coordinator API in docs/FEDERATED.md
|
|
145
145
|
- ✅ **Zero-knowledge proofs** — `agmem prove` (hash/signature-based): keyword containment (Merkle set membership), memory freshness (signed timestamp). **Note:** Current implementation is proof-of-knowledge with known limitations; see docs for migration to true zk-SNARKs.
|
|
146
146
|
- ✅ **Daemon health** — 4-point health monitoring (storage, redundancy, staleness, graph consistency) with periodic checks; visible warnings and JSON reports
|
|
147
|
-
- ✅ **Delta encoding** — 5-10x compression for similar objects using Levenshtein distance and SequenceMatcher;
|
|
147
|
+
- ✅ **Delta encoding** — 5-10x compression for similar objects using Levenshtein distance and SequenceMatcher; enabled in GC repack with multi-tier similarity filtering
|
|
148
|
+
- ✅ **Performance safeguards** — Multi-tier similarity filter (length ratio + SimHash) avoids O(n²×m²) worst-case comparisons
|
|
148
149
|
- ✅ **GPU acceleration** — Vector store detects GPU for embedding model when available
|
|
149
150
|
- ✅ **Optional** — `serve`, `daemon` (watch + auto-commit), `garden` (episode archival), MCP server; install extras as needed
|
|
150
151
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
agmem-0.2.0.dist-info/licenses/LICENSE,sha256=X_S6RBErW-F0IDbM3FAEoDB-zxExFnl2m8640rTXphM,1067
|
|
2
|
-
memvcs/__init__.py,sha256=
|
|
3
|
-
memvcs/cli.py,sha256=
|
|
1
|
+
agmem-0.2.1.dist-info/licenses/LICENSE,sha256=X_S6RBErW-F0IDbM3FAEoDB-zxExFnl2m8640rTXphM,1067
|
|
2
|
+
memvcs/__init__.py,sha256=PwF2IkjOfw5nZCDcZdsNKns-h-FEvRahAqNd37Ti8_8,193
|
|
3
|
+
memvcs/cli.py,sha256=WPjhbevcOc_w_7SEXV5oitbEA5kYY5lHWgyTOq6x8sU,6075
|
|
4
4
|
memvcs/commands/__init__.py,sha256=A2D6xWaO6epU7iV4QSvqvF5TspnwRyDN7NojmGatPrE,510
|
|
5
5
|
memvcs/commands/add.py,sha256=k9eM7qf2NFvneiJkFQNiAYFB2GgKmyPw_NXmkCxblQE,8736
|
|
6
6
|
memvcs/commands/audit.py,sha256=E6m54B726tqDQR3rrgRXWrjE-seu2UocqrFxN1aHkY4,1680
|
|
@@ -47,10 +47,11 @@ memvcs/commands/tree.py,sha256=vdULq4vIXA_4gNfMnHn_Y78BwE0sJoeTBOnFJR3WsZ4,4927
|
|
|
47
47
|
memvcs/commands/verify.py,sha256=04CVW5NYWkUlPJ5z1Kci6dfQFM6UmPTGZh9ZextFLMc,3887
|
|
48
48
|
memvcs/commands/when.py,sha256=bxG_tEYnZNBTl2IPkoxpc2LUEbO_5ev1hRvEzxQQDmc,4773
|
|
49
49
|
memvcs/coordinator/__init__.py,sha256=XJEXEXJFvvhtRInPeyAC9bFNXGbshSrtuK6wZo3wS6g,139
|
|
50
|
-
memvcs/coordinator/server.py,sha256
|
|
50
|
+
memvcs/coordinator/server.py,sha256=M0wnww0EbtxuDaunP29LJDCnsTm1mcOn7h_fqZbQy5c,7550
|
|
51
51
|
memvcs/core/__init__.py,sha256=dkIC-4tS0GhwV2mZIbofEe8xR8uiFwrxslGf1aXwhYg,493
|
|
52
52
|
memvcs/core/access_index.py,sha256=HhacnzSUASzRV2jhDHkwRFoPS3rtqh9n9yE1VV7JXpk,5596
|
|
53
53
|
memvcs/core/audit.py,sha256=8APkm9Spl_-1rIdyRQz1elyxOeK3nlpwm0CLkpLlhTE,3732
|
|
54
|
+
memvcs/core/compression_metrics.py,sha256=0JrbkCGr0hnaKlmPLqv5WVLwO3emOEz2iFhdMTDNTNY,9835
|
|
54
55
|
memvcs/core/compression_pipeline.py,sha256=Vzr5v_0pgAG20C8znC0-Ho5fEwBoaTOLddxMTldd64M,5564
|
|
55
56
|
memvcs/core/config_loader.py,sha256=j-jgLDp2TRzWN9ZEZebfWSfatevBNYs0FEb3ud1SIR8,8277
|
|
56
57
|
memvcs/core/consistency.py,sha256=YOG8xhqZLKZCLbai2rdcP0KxYPNGFv5RRMwrQ6qCeyc,7462
|
|
@@ -59,18 +60,21 @@ memvcs/core/crypto_verify.py,sha256=DTuC7Kfx6z2b8UWOWziBTqP633LrjXbdtGmBBqrJTF0,
|
|
|
59
60
|
memvcs/core/decay.py,sha256=ROGwnqngs7eJNkbKmwyOdij607m73vpmoJqzrIDLBzk,6581
|
|
60
61
|
memvcs/core/delta.py,sha256=obXzojUSc2HaEUqH3L_1LF-GcJ63Wr_yYvIPM8iyeSg,7865
|
|
61
62
|
memvcs/core/diff.py,sha256=koEHTLciIUxYKVJVuvmY0GDXMgDgGZP_qg5RayhF-iE,13226
|
|
62
|
-
memvcs/core/distiller.py,sha256=
|
|
63
|
+
memvcs/core/distiller.py,sha256=wwY3xQVRBjVfxnOUIwMsQCSeQ2tlG68w2-KiCwkF9yo,13844
|
|
63
64
|
memvcs/core/encryption.py,sha256=epny_nlW6ylllv1qxs1mAcFq-PrLIisgfot4llOoAqw,5289
|
|
64
|
-
memvcs/core/
|
|
65
|
-
memvcs/core/
|
|
65
|
+
memvcs/core/fast_similarity.py,sha256=phgjxkSchJg7om9AFFSMbtP6bSidyRy-vVrR3XyMmDQ,13934
|
|
66
|
+
memvcs/core/federated.py,sha256=qwvfhNgga-lHadbinAfKPI4oAl0RMn5ab01ChmQTP1s,5863
|
|
67
|
+
memvcs/core/gardener.py,sha256=bpoJbK6PJ6nvK3ytj23jpMUBUB7Nn_fB80Ap1E7-Nv8,17041
|
|
66
68
|
memvcs/core/hooks.py,sha256=XF9z8J5sWjAcuOyWQ2nuvEzK0UV8s4ThrcltaBZttzw,5448
|
|
67
69
|
memvcs/core/ipfs_remote.py,sha256=xmEO14bn_7Ej-W5jhx2QJyBd-ljj9S2COOxMmcZBiTs,6643
|
|
68
70
|
memvcs/core/knowledge_graph.py,sha256=GY27e1rgraF2zMpz_jsumdUtpgTRk48yH5CAEQ3TDl4,16416
|
|
69
71
|
memvcs/core/merge.py,sha256=x2eSaxr4f63Eq00FCJ6DDe2TZU8H5yHQpzKzMhYsaFw,19871
|
|
70
72
|
memvcs/core/objects.py,sha256=Xgw1IpQnJLCG5o_7gDHVQ-TNGR9CSpDYWRXzLgLSuec,11006
|
|
71
|
-
memvcs/core/pack.py,sha256=
|
|
73
|
+
memvcs/core/pack.py,sha256=jtbeBh625K6nshPgBGf7zelU-BhvK5-t5NYBJPoYfgs,15961
|
|
72
74
|
memvcs/core/pii_scanner.py,sha256=T6gQ1APFrSDk980fjnv4ZMF-UztbJgmUFSwGrwWixEw,10802
|
|
73
75
|
memvcs/core/privacy_budget.py,sha256=fOPlxoKEAmsKtda-OJCrSaKjTyw7ekcqdN7KfRBw1CY,2113
|
|
76
|
+
memvcs/core/privacy_validator.py,sha256=g3l1zxSIxkjMYJMwL5yfuDY5FFjmkm6HZ2Wo4xBiEkQ,6795
|
|
77
|
+
memvcs/core/protocol_builder.py,sha256=b_5FphgmMdp7qP34ws3U2agXEoeYzTBjSgsQqd2Jx6Y,7713
|
|
74
78
|
memvcs/core/refs.py,sha256=4Nx2ZVRa_DzfUZ4O1AwzOHEjoGAEICJKqSd9GxaiD_g,16754
|
|
75
79
|
memvcs/core/remote.py,sha256=sZbAO9JEaDJM96PylB0CjpmR5UxWYdoXlq86sj3R2gU,22228
|
|
76
80
|
memvcs/core/repository.py,sha256=NzC2UFPv6ePxi5lfiSKyZFLclH4bJpWJz88pY7tDiv4,20605
|
|
@@ -104,8 +108,8 @@ memvcs/retrieval/recaller.py,sha256=8KY-XjMUz5_vcKf46zI64uk1DEM__u7wM92ShukOtsY,
|
|
|
104
108
|
memvcs/retrieval/strategies.py,sha256=26yxQQubQfjxWQXknfVMxuzPHf2EcZxJg_B99BEdl5c,11458
|
|
105
109
|
memvcs/utils/__init__.py,sha256=8psUzz4Ntv2GzbRebkeVsoyC6Ck-FIwi0_lfYdj5oho,185
|
|
106
110
|
memvcs/utils/helpers.py,sha256=37zg_DcQ2y99b9NSLqxFkglHe13rJXKhFDpEbQ7iLhM,4121
|
|
107
|
-
agmem-0.2.
|
|
108
|
-
agmem-0.2.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
109
|
-
agmem-0.2.0.dist-info/entry_points.txt,sha256=at7eWycgjqOo1wbUMECnXUsNo3gpCkJTU71OzrGLHu0,42
|
|
110
|
-
agmem-0.2.0.dist-info/top_level.txt,sha256=HtMMsKuwLKLOdgF1GxqQztqFM54tTJctVdJuOec6B-4,7
|
|
111
|
-
agmem-0.2.0.dist-info/RECORD,,
|
|
111
|
+
agmem-0.2.1.dist-info/METADATA,sha256=6UV86NAOpGnnqpRJJE_9XkU-7j2aoLSIf3TB1oQ3dC0,42320
|
|
112
|
+
agmem-0.2.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
113
|
+
agmem-0.2.1.dist-info/entry_points.txt,sha256=at7eWycgjqOo1wbUMECnXUsNo3gpCkJTU71OzrGLHu0,42
|
|
114
|
+
agmem-0.2.1.dist-info/top_level.txt,sha256=HtMMsKuwLKLOdgF1GxqQztqFM54tTJctVdJuOec6B-4,7
|
|
115
|
+
agmem-0.2.1.dist-info/RECORD,,
|
memvcs/__init__.py
CHANGED
memvcs/cli.py
CHANGED
|
@@ -141,7 +141,7 @@ For more information: https://github.com/vivek-tiwari-vt/agmem
|
|
|
141
141
|
""",
|
|
142
142
|
)
|
|
143
143
|
|
|
144
|
-
parser.add_argument("--version", "-v", action="version", version="%(prog)s 0.1
|
|
144
|
+
parser.add_argument("--version", "-v", action="version", version="%(prog)s 0.2.1")
|
|
145
145
|
|
|
146
146
|
parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
|
|
147
147
|
|
memvcs/coordinator/server.py
CHANGED
|
@@ -21,6 +21,7 @@ from typing import Dict, List, Optional, Any
|
|
|
21
21
|
from pathlib import Path
|
|
22
22
|
import json
|
|
23
23
|
import hashlib
|
|
24
|
+
import re
|
|
24
25
|
|
|
25
26
|
try:
|
|
26
27
|
from fastapi import FastAPI, HTTPException, Request
|
|
@@ -39,10 +40,25 @@ except ImportError:
|
|
|
39
40
|
return None
|
|
40
41
|
|
|
41
42
|
|
|
43
|
+
def _get_version() -> str:
|
|
44
|
+
"""Get agmem version from pyproject.toml. Falls back to 0.2.1 if not found."""
|
|
45
|
+
try:
|
|
46
|
+
pyproject_path = Path(__file__).parent.parent.parent / "pyproject.toml"
|
|
47
|
+
if pyproject_path.exists():
|
|
48
|
+
content = pyproject_path.read_text()
|
|
49
|
+
match = re.search(r'version\s*=\s*"([^"]+)"', content)
|
|
50
|
+
if match:
|
|
51
|
+
return match.group(1)
|
|
52
|
+
except Exception:
|
|
53
|
+
pass
|
|
54
|
+
return "0.2.1"
|
|
55
|
+
|
|
56
|
+
|
|
42
57
|
# Storage: In-memory for simplicity (use Redis/PostgreSQL for production)
|
|
43
58
|
summaries_store: Dict[str, List[Dict[str, Any]]] = {}
|
|
59
|
+
_version = _get_version()
|
|
44
60
|
metadata_store: Dict[str, Any] = {
|
|
45
|
-
"coordinator_version":
|
|
61
|
+
"coordinator_version": _version,
|
|
46
62
|
"started_at": datetime.now(timezone.utc).isoformat(),
|
|
47
63
|
"total_pushes": 0,
|
|
48
64
|
"total_agents": 0,
|
|
@@ -79,7 +95,7 @@ if FASTAPI_AVAILABLE:
|
|
|
79
95
|
app = FastAPI(
|
|
80
96
|
title="agmem Federated Coordinator",
|
|
81
97
|
description="Minimal coordinator for federated agent memory collaboration",
|
|
82
|
-
version=
|
|
98
|
+
version=_version,
|
|
83
99
|
)
|
|
84
100
|
|
|
85
101
|
@app.get("/")
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Delta compression metrics and observability.
|
|
3
|
+
|
|
4
|
+
Tracks compression effectiveness across object types to enable future
|
|
5
|
+
optimization and auto-tuning of delta encoding parameters.
|
|
6
|
+
|
|
7
|
+
Provides:
|
|
8
|
+
- DeltaCompressionMetrics: Tracks compression ratio, object types, benefits
|
|
9
|
+
- CompressionHeatmap: Visualizes which types compress best
|
|
10
|
+
- Statistics reporting for gc --repack operations
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from typing import Dict, List, Any, Optional, Tuple
|
|
15
|
+
from collections import defaultdict
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
|
|
19
|
+
class ObjectCompressionStats:
|
|
20
|
+
"""Statistics for a single object's compression."""
|
|
21
|
+
|
|
22
|
+
object_id: str
|
|
23
|
+
object_type: str # "semantic", "episodic", "procedural"
|
|
24
|
+
original_size: int # bytes
|
|
25
|
+
compressed_size: int # bytes after delta encoding
|
|
26
|
+
compression_ratio: float # compressed_size / original_size (0.0 = 100% compression)
|
|
27
|
+
delta_used: bool # Whether delta encoding was applied
|
|
28
|
+
compression_benefit: float # original_size - compressed_size
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
|
|
32
|
+
class TypeCompressionStats:
|
|
33
|
+
"""Aggregated statistics for an object type."""
|
|
34
|
+
|
|
35
|
+
object_type: str
|
|
36
|
+
count: int = 0
|
|
37
|
+
total_original_size: int = 0
|
|
38
|
+
total_compressed_size: int = 0
|
|
39
|
+
avg_compression_ratio: float = 0.0
|
|
40
|
+
total_benefit: int = 0 # Total bytes saved
|
|
41
|
+
objects_with_delta: int = 0 # How many used delta encoding
|
|
42
|
+
min_ratio: float = 1.0
|
|
43
|
+
max_ratio: float = 0.0
|
|
44
|
+
|
|
45
|
+
def update_from_object(self, obj_stats: ObjectCompressionStats) -> None:
|
|
46
|
+
"""Update type stats with a single object's stats."""
|
|
47
|
+
self.count += 1
|
|
48
|
+
self.total_original_size += obj_stats.original_size
|
|
49
|
+
self.total_compressed_size += obj_stats.compressed_size
|
|
50
|
+
self.total_benefit += int(obj_stats.compression_benefit)
|
|
51
|
+
if obj_stats.delta_used:
|
|
52
|
+
self.objects_with_delta += 1
|
|
53
|
+
self.min_ratio = min(self.min_ratio, obj_stats.compression_ratio)
|
|
54
|
+
self.max_ratio = max(self.max_ratio, obj_stats.compression_ratio)
|
|
55
|
+
|
|
56
|
+
# Recalculate average
|
|
57
|
+
if self.total_original_size > 0:
|
|
58
|
+
self.avg_compression_ratio = self.total_compressed_size / self.total_original_size
|
|
59
|
+
|
|
60
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
61
|
+
"""Convert to dict for reporting."""
|
|
62
|
+
savings_pct = 0.0
|
|
63
|
+
if self.total_original_size > 0:
|
|
64
|
+
savings_pct = (self.total_benefit / self.total_original_size) * 100
|
|
65
|
+
|
|
66
|
+
return {
|
|
67
|
+
"object_type": self.object_type,
|
|
68
|
+
"count": self.count,
|
|
69
|
+
"total_original_bytes": self.total_original_size,
|
|
70
|
+
"total_compressed_bytes": self.total_compressed_size,
|
|
71
|
+
"avg_compression_ratio": round(self.avg_compression_ratio, 3),
|
|
72
|
+
"compression_range": f"{self.min_ratio:.1%} - {self.max_ratio:.1%}",
|
|
73
|
+
"total_bytes_saved": self.total_benefit,
|
|
74
|
+
"savings_percentage": round(savings_pct, 1),
|
|
75
|
+
"objects_using_delta": self.objects_with_delta,
|
|
76
|
+
"delta_adoption_rate": (
|
|
77
|
+
round((self.objects_with_delta / self.count * 100), 1) if self.count > 0 else 0
|
|
78
|
+
),
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class DeltaCompressionMetrics:
|
|
83
|
+
"""Tracks delta compression statistics across all objects.
|
|
84
|
+
|
|
85
|
+
Usage:
|
|
86
|
+
metrics = DeltaCompressionMetrics()
|
|
87
|
+
# ... during packing ...
|
|
88
|
+
metrics.record_object(ObjectCompressionStats(...))
|
|
89
|
+
# ... after packing ...
|
|
90
|
+
report = metrics.get_report()
|
|
91
|
+
"""
|
|
92
|
+
|
|
93
|
+
def __init__(self):
|
|
94
|
+
self.objects: List[ObjectCompressionStats] = []
|
|
95
|
+
self.type_stats: Dict[str, TypeCompressionStats] = {}
|
|
96
|
+
self.total_original_size: int = 0
|
|
97
|
+
self.total_compressed_size: int = 0
|
|
98
|
+
|
|
99
|
+
def record_object(self, obj_stats: ObjectCompressionStats) -> None:
|
|
100
|
+
"""Record compression stats for a single object."""
|
|
101
|
+
self.objects.append(obj_stats)
|
|
102
|
+
self.total_original_size += obj_stats.original_size
|
|
103
|
+
self.total_compressed_size += obj_stats.compressed_size
|
|
104
|
+
|
|
105
|
+
# Update type-specific stats
|
|
106
|
+
if obj_stats.object_type not in self.type_stats:
|
|
107
|
+
self.type_stats[obj_stats.object_type] = TypeCompressionStats(
|
|
108
|
+
object_type=obj_stats.object_type
|
|
109
|
+
)
|
|
110
|
+
self.type_stats[obj_stats.object_type].update_from_object(obj_stats)
|
|
111
|
+
|
|
112
|
+
def get_type_stats(self, object_type: str) -> Optional[TypeCompressionStats]:
|
|
113
|
+
"""Get stats for a specific object type."""
|
|
114
|
+
return self.type_stats.get(object_type)
|
|
115
|
+
|
|
116
|
+
def get_overall_ratio(self) -> float:
|
|
117
|
+
"""Get overall compression ratio across all objects."""
|
|
118
|
+
if self.total_original_size == 0:
|
|
119
|
+
return 0.0
|
|
120
|
+
return self.total_compressed_size / self.total_original_size
|
|
121
|
+
|
|
122
|
+
def get_overall_savings(self) -> int:
|
|
123
|
+
"""Get total bytes saved across all objects."""
|
|
124
|
+
return self.total_original_size - self.total_compressed_size
|
|
125
|
+
|
|
126
|
+
def get_report(self) -> Dict[str, Any]:
|
|
127
|
+
"""Generate a comprehensive compression report."""
|
|
128
|
+
overall_ratio = self.get_overall_ratio()
|
|
129
|
+
overall_savings = self.get_overall_savings()
|
|
130
|
+
savings_pct = (
|
|
131
|
+
(overall_savings / self.total_original_size * 100)
|
|
132
|
+
if self.total_original_size > 0
|
|
133
|
+
else 0
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
return {
|
|
137
|
+
"timestamp": None, # Set by caller if needed
|
|
138
|
+
"total_objects": len(self.objects),
|
|
139
|
+
"total_original_bytes": self.total_original_size,
|
|
140
|
+
"total_compressed_bytes": self.total_compressed_size,
|
|
141
|
+
"overall_compression_ratio": round(overall_ratio, 3),
|
|
142
|
+
"total_bytes_saved": overall_savings,
|
|
143
|
+
"compression_percentage": round(savings_pct, 1),
|
|
144
|
+
"type_statistics": {otype: stats.to_dict() for otype, stats in self.type_stats.items()},
|
|
145
|
+
"recommendations": self._generate_recommendations(),
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
def _generate_recommendations(self) -> List[str]:
|
|
149
|
+
"""Generate optimization recommendations based on compression stats."""
|
|
150
|
+
recommendations = []
|
|
151
|
+
|
|
152
|
+
# Check if delta encoding is worth it
|
|
153
|
+
objects_with_delta = sum(s.objects_with_delta for s in self.type_stats.values())
|
|
154
|
+
if objects_with_delta == 0:
|
|
155
|
+
recommendations.append("No objects used delta encoding. Check similarity thresholds.")
|
|
156
|
+
|
|
157
|
+
# Check for types with poor compression
|
|
158
|
+
for otype, stats in self.type_stats.items():
|
|
159
|
+
if stats.count > 0 and stats.avg_compression_ratio > 0.9:
|
|
160
|
+
recommendations.append(
|
|
161
|
+
f"Type '{otype}' compresses poorly (ratio: {stats.avg_compression_ratio:.1%}). "
|
|
162
|
+
f"Consider increasing similarity threshold or reducing delta cost."
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
# Check for types with excellent compression
|
|
166
|
+
for otype, stats in self.type_stats.items():
|
|
167
|
+
if stats.count > 0 and stats.avg_compression_ratio < 0.5:
|
|
168
|
+
recommendations.append(
|
|
169
|
+
f"Type '{otype}' compresses very well (ratio: {stats.avg_compression_ratio:.1%}). "
|
|
170
|
+
f"Consider aggressive delta encoding or reduced threshold."
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
if not recommendations:
|
|
174
|
+
recommendations.append("Compression is operating normally.")
|
|
175
|
+
|
|
176
|
+
return recommendations
|
|
177
|
+
|
|
178
|
+
def get_heatmap(self) -> str:
|
|
179
|
+
"""Generate a text-based compression heatmap."""
|
|
180
|
+
lines = ["Delta Compression Heatmap", "=" * 50]
|
|
181
|
+
|
|
182
|
+
if not self.type_stats:
|
|
183
|
+
lines.append("No compression data available")
|
|
184
|
+
return "\n".join(lines)
|
|
185
|
+
|
|
186
|
+
# Sort by compression ratio
|
|
187
|
+
sorted_types = sorted(
|
|
188
|
+
self.type_stats.values(),
|
|
189
|
+
key=lambda s: s.avg_compression_ratio,
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
for stats in sorted_types:
|
|
193
|
+
if stats.count == 0:
|
|
194
|
+
continue
|
|
195
|
+
ratio = stats.avg_compression_ratio
|
|
196
|
+
# Create a simple bar chart
|
|
197
|
+
bar_width = 30
|
|
198
|
+
filled = int(bar_width * ratio)
|
|
199
|
+
bar = "█" * filled + "░" * (bar_width - filled)
|
|
200
|
+
saved_pct = (
|
|
201
|
+
(stats.total_benefit / stats.total_original_size * 100)
|
|
202
|
+
if stats.total_original_size > 0
|
|
203
|
+
else 0
|
|
204
|
+
)
|
|
205
|
+
lines.append(
|
|
206
|
+
f"{stats.object_type:12} {bar} {saved_pct:5.1f}% saved ({stats.objects_with_delta}/{stats.count} using delta)"
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
return "\n".join(lines)
|
|
210
|
+
|
|
211
|
+
def log_report(self, logger: Any = None) -> None:
|
|
212
|
+
"""Log the compression report."""
|
|
213
|
+
report = self.get_report()
|
|
214
|
+
heatmap = self.get_heatmap()
|
|
215
|
+
|
|
216
|
+
output = [
|
|
217
|
+
"=" * 70,
|
|
218
|
+
"Delta Compression Report",
|
|
219
|
+
"=" * 70,
|
|
220
|
+
f"Total Objects: {report['total_objects']}",
|
|
221
|
+
f"Total Original: {report['total_original_bytes']:,} bytes",
|
|
222
|
+
f"Total Compressed: {report['total_compressed_bytes']:,} bytes",
|
|
223
|
+
f"Overall Ratio: {report['overall_compression_ratio']:.1%}",
|
|
224
|
+
f"Bytes Saved: {report['total_bytes_saved']:,} ({report['compression_percentage']:.1f}%)",
|
|
225
|
+
"",
|
|
226
|
+
heatmap,
|
|
227
|
+
"",
|
|
228
|
+
"Type Breakdown:",
|
|
229
|
+
]
|
|
230
|
+
|
|
231
|
+
for otype, stats in sorted(report["type_statistics"].items()):
|
|
232
|
+
output.append(f" {otype}:")
|
|
233
|
+
output.append(f" Count: {stats['count']}")
|
|
234
|
+
output.append(f" Compression: {stats['avg_compression_ratio']:.1%}")
|
|
235
|
+
output.append(f" Saved: {stats['total_bytes_saved']:,} bytes")
|
|
236
|
+
output.append(f" Delta adoption: {stats['delta_adoption_rate']:.0f}%")
|
|
237
|
+
|
|
238
|
+
output.extend(["", "Recommendations:"])
|
|
239
|
+
for rec in report["recommendations"]:
|
|
240
|
+
output.append(f" - {rec}")
|
|
241
|
+
|
|
242
|
+
output.append("=" * 70)
|
|
243
|
+
|
|
244
|
+
full_output = "\n".join(output)
|
|
245
|
+
if logger:
|
|
246
|
+
logger.info(full_output)
|
|
247
|
+
else:
|
|
248
|
+
print(full_output)
|
memvcs/core/distiller.py
CHANGED
|
@@ -211,7 +211,6 @@ class Distiller:
|
|
|
211
211
|
# Sample facts with noise - prevents any single episode from dominating
|
|
212
212
|
import random
|
|
213
213
|
|
|
214
|
-
random.seed(42) # Deterministic but different per cluster due to content
|
|
215
214
|
sampled = random.sample(facts, min(noisy_count, len(facts)))
|
|
216
215
|
|
|
217
216
|
# Optional: Add slight noise to fact embeddings if vector store available
|
|
@@ -233,17 +232,9 @@ class Distiller:
|
|
|
233
232
|
out_path = self.target_dir / f"consolidated-{ts}.md"
|
|
234
233
|
|
|
235
234
|
confidence_score = self.config.extraction_confidence_threshold
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
and self.config.dp_delta is not None
|
|
240
|
-
):
|
|
241
|
-
from .privacy_budget import add_noise
|
|
242
|
-
|
|
243
|
-
confidence_score = add_noise(
|
|
244
|
-
confidence_score, 0.1, self.config.dp_epsilon, self.config.dp_delta
|
|
245
|
-
)
|
|
246
|
-
confidence_score = max(0.0, min(1.0, confidence_score))
|
|
235
|
+
# Metadata noise removed: confidence_score is a metadata field (threshold setting),
|
|
236
|
+
# not an individual fact. Adding noise to metadata doesn't provide meaningful
|
|
237
|
+
# privacy guarantees. See privacy_validator.py for the distinction.
|
|
247
238
|
frontmatter = {
|
|
248
239
|
"schema_version": "1.0",
|
|
249
240
|
"last_updated": datetime.utcnow().isoformat() + "Z",
|