agmem 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
memvcs/core/gardener.py CHANGED
@@ -354,28 +354,11 @@ class Gardener:
         except ValueError:
             insight_path = self.semantic_dir / f"insight-{timestamp}.md"

-        # Generate frontmatter (optionally noised for differential privacy)
+        # Generate frontmatter
         source_episodes = len(cluster.episodes)
-        if (
-            self.config.use_dp
-            and self.config.dp_epsilon is not None
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            source_episodes = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(source_episodes),
-                            1.0,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: source_episodes is a metadata count (number of episodes
+        # contributing to this insight), not an individual fact. Adding noise to metadata
+        # doesn't provide meaningful privacy guarantees. See privacy_validator.py.
         frontmatter = {
             "schema_version": "1.0",
             "last_updated": datetime.utcnow().isoformat() + "Z",
@@ -514,53 +497,10 @@ class Gardener:
         clusters_found = len(clusters)
         insights_generated = insights_written
         episodes_archived = archived_count
-        if (
-            self.config.use_dp
-            and self.config.dp_epsilon is not None
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            sensitivity = 1.0
-            clusters_found = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(clusters_found),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            insights_generated = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(insights_generated),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            episodes_archived = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(episodes_archived),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: clusters_found, insights_generated, and
+        # episodes_archived are metadata counts, not individual facts.
+        # Adding noise to these doesn't provide meaningful privacy guarantees.
+        # See privacy_validator.py for the distinction between metadata and facts.

         return GardenerResult(
             success=True,
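
The gardener changes above keep metadata counts exact and reserve differential-privacy noise for fact-derived values. A minimal sketch of how the two paths can be kept apart with the validator added later in this diff — not part of the diff itself; it assumes the module is importable as memvcs.core.privacy_validator, that add_noise keeps the (value, sensitivity, epsilon, delta) signature visible in the removed code, and that the counts and epsilon/delta values are illustrative only:

    from memvcs.core.privacy_budget import add_noise          # assumed absolute import path
    from memvcs.core.privacy_validator import PrivacyFieldValidator

    validator = PrivacyFieldValidator()

    # Fact-derived count: still gets DP noise, then is recorded in the audit report.
    fact_count = 12                                            # illustrative value
    noisy = max(0, int(round(add_noise(float(fact_count), 1.0, 0.5, 1e-5))))
    validator.validate_noised_field("fact_count", noisy, is_noised=True)

    # Metadata count: kept exact; recording it as noised would raise RuntimeError.
    validator.validate_noised_field("clusters_found", 3, is_noised=False)

    print(validator.get_report().to_dict()["summary"])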
memvcs/core/pack.py CHANGED
@@ -2,8 +2,10 @@
 Pack files and garbage collection for agmem.

 Pack: collect loose objects into single file + index. GC: delete unreachable objects, repack.
+Includes delta encoding for similar objects (5-10x compression for similar content).
 """

+import bisect
 import hashlib
 import struct
 import zlib
@@ -12,20 +14,23 @@ from typing import Set, Dict, List, Optional, Tuple

 from .objects import ObjectStore
 from .refs import RefsManager
+from .delta import find_similar_objects, compute_delta, DeltaCache

 PACK_MAGIC = b"PACK"
-PACK_VERSION = 2
+PACK_VERSION = 2  # Maintain v2 for backward compatibility
 IDX_MAGIC = b"agidx"
-IDX_VERSION = 2
+IDX_VERSION = 2  # Maintain v2 for backward compatibility
 OBJ_TYPE_BLOB = 1
 OBJ_TYPE_TREE = 2
 OBJ_TYPE_COMMIT = 3
 OBJ_TYPE_TAG = 4
+OBJ_TYPE_DELTA = 5  # Delta object type (for future v3)
 TYPE_TO_BYTE = {
     "blob": OBJ_TYPE_BLOB,
     "tree": OBJ_TYPE_TREE,
     "commit": OBJ_TYPE_COMMIT,
     "tag": OBJ_TYPE_TAG,
+    "delta": OBJ_TYPE_DELTA,
 }
 BYTE_TO_TYPE = {v: k for k, v in TYPE_TO_BYTE.items()}

@@ -121,6 +126,142 @@ def run_gc(
     return (len(to_delete), freed)


+def write_pack_with_delta(
+    objects_dir: Path,
+    store: ObjectStore,
+    hash_to_type: Dict[str, str],
+    use_delta: bool = True,
+    similarity_threshold: float = 0.7,
+) -> Tuple[Path, Path, Optional[Dict[str, Tuple[int, int]]]]:
+    """
+    Pack loose objects with optional delta encoding.
+
+    Args:
+        objects_dir: Path to objects directory
+        store: ObjectStore instance
+        hash_to_type: map hash_id -> obj_type
+        use_delta: whether to compute deltas for similar objects
+        similarity_threshold: minimum similarity (0.0-1.0) for delta encoding
+
+    Returns:
+        (pack_path, index_path, delta_stats)
+        delta_stats: dict of {target_hash: (original_size, delta_size)} for deltas used
+    """
+    if not hash_to_type:
+        raise ValueError("Cannot write empty pack")
+
+    pack_d = _pack_dir(objects_dir)
+    pack_d.mkdir(parents=True, exist_ok=True)
+
+    # Load all objects
+    objects_data: Dict[str, bytes] = {}
+    for hash_id in hash_to_type.keys():
+        obj_type = hash_to_type[hash_id]
+        content = store.retrieve(hash_id, obj_type)
+        if content:
+            header = f"{obj_type} {len(content)}\0".encode()
+            objects_data[hash_id] = header + content
+
+    # Find similar objects for delta encoding
+    delta_cache = DeltaCache() if use_delta else None
+    if use_delta and len(objects_data) > 1:
+        similarity_groups = find_similar_objects(
+            objects_data,
+            similarity_threshold=similarity_threshold,
+            min_size=100,
+        )
+        for group in similarity_groups:
+            if len(group) < 2:
+                continue
+            base_hash = group[0]  # Smallest object is base
+            base_content = objects_data[base_hash]
+            for target_hash in group[1:]:
+                target_content = objects_data[target_hash]
+                delta = compute_delta(base_content, target_content)
+                # Only use delta if it saves space
+                if len(delta) < len(target_content) * 0.8:
+                    delta_cache.add_delta(base_hash, target_hash, delta)
+
+    pack_header_len = len(PACK_MAGIC) + 4 + 4
+    pack_body = bytearray()
+    index_entries: List[Tuple[str, str, int, Optional[str]]] = (
+        []
+    )  # (hash_id, obj_type, offset, base_hash or None)
+    offset_in_file = pack_header_len
+
+    for hash_id in sorted(hash_to_type.keys()):
+        obj_type = hash_to_type[hash_id]
+        full_data = objects_data.get(hash_id)
+        if not full_data:
+            continue
+
+        # Check if this object has a delta
+        base_hash = delta_cache.get_base(hash_id) if delta_cache else None
+        if base_hash and delta_cache:
+            # Store as delta
+            delta = delta_cache.get_delta(base_hash, hash_id)
+            compressed = zlib.compress(delta)
+            type_byte = OBJ_TYPE_DELTA
+            size_bytes = struct.pack(">I", len(compressed))
+            base_hash_bytes = bytes.fromhex(base_hash)
+            chunk = bytes([type_byte]) + size_bytes + base_hash_bytes[:16] + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, base_hash))
+        else:
+            # Store full object
+            compressed = zlib.compress(full_data)
+            type_byte = TYPE_TO_BYTE.get(obj_type, OBJ_TYPE_BLOB)
+            size_bytes = struct.pack(">I", len(compressed))
+            chunk = bytes([type_byte]) + size_bytes + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, None))
+
+        pack_body.extend(chunk)
+        offset_in_file += len(chunk)
+
+    if not index_entries:
+        raise ValueError("No objects to pack")
+
+    pack_content = (
+        PACK_MAGIC
+        + struct.pack(">I", PACK_VERSION)
+        + struct.pack(">I", len(index_entries))
+        + bytes(pack_body)
+    )
+    pack_hash = hashlib.sha256(pack_content).digest()
+    pack_content += pack_hash
+
+    pack_name = f"pack-{pack_hash[:16].hex()}.pack"
+    pack_path = pack_d / pack_name
+    pack_path.write_bytes(pack_content)
+
+    # Write index with delta references (keeping v2 format for now)
+    index_content = bytearray(
+        IDX_MAGIC + struct.pack(">I", IDX_VERSION) + struct.pack(">I", len(index_entries))
+    )
+    delta_stats = {}
+    for hash_id, obj_type, off, base_hash in index_entries:
+        index_content.extend(bytes.fromhex(hash_id))
+        index_content.append(TYPE_TO_BYTE[obj_type])
+        index_content.extend(struct.pack(">I", off))
+        # Note: delta base hash stored after offset but not read by v2 retrieve_from_pack
+        # This is forward-compatible: v3 readers will use base_hash, v2 readers ignore it
+        if base_hash:
+            original_size = len(objects_data[hash_id])
+            delta_size = len(delta_cache.get_delta(base_hash, hash_id))
+            delta_stats[hash_id] = (original_size, delta_size)
+            # Store delta base info (v3 format, but after v2 format fields)
+            index_content.extend(bytes.fromhex(base_hash))
+        else:
+            # Padding for v3 format
+            index_content.extend(b"\x00" * 32)
+
+    idx_hash = hashlib.sha256(index_content).digest()
+    index_content.extend(idx_hash)
+    idx_path = pack_path.with_suffix(".idx")
+    idx_path.write_bytes(index_content)
+
+    return (pack_path, idx_path, delta_stats if use_delta else None)
+
+
 def write_pack(
     objects_dir: Path, store: ObjectStore, hash_to_type: Dict[str, str]
 ) -> Tuple[Path, Path]:
@@ -128,6 +269,9 @@ def write_pack(
     Pack loose objects into a single pack file and index.
     hash_to_type: map hash_id -> obj_type for objects to include.
     Returns (pack_path, index_path). Does not delete loose objects.
+
+    Standard pack format (v2) without delta encoding for backward compatibility.
+    Use write_pack_with_delta() with use_delta=True for delta encoding.
     """
     if not hash_to_type:
         raise ValueError("Cannot write empty pack")
@@ -200,7 +344,7 @@ retrieve_from_pack(
     objects_dir: Path, hash_id: str, expected_type: Optional[str] = None
 ) -> Optional[Tuple[str, bytes]]:
     """
-    Retrieve object from pack by hash. Returns (obj_type, content) or None.
+    Retrieve object from pack by hash using binary search. Returns (obj_type, content) or None.
     If expected_type is set, only return if pack type matches.
     """
     idx_path = _find_pack_index(objects_dir)
@@ -228,36 +372,50 @@ retrieve_from_pack(
     if len(hash_hex) != 64:
         return None
     hash_bin = bytes.fromhex(hash_hex)
-    for i in range(count):
-        base = entries_start + i * entry_size
-        entry_hash = raw_idx[base : base + 32]
-        if entry_hash != hash_bin:
-            continue
-        type_byte = raw_idx[base + 32]
-        offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
-        obj_type = BYTE_TO_TYPE.get(type_byte)
-        if obj_type is None:
-            continue
-        if expected_type is not None and obj_type != expected_type:
-            return None
-        pack_raw = pack_path.read_bytes()
-        header_size = len(PACK_MAGIC) + 4 + 4
-        if offset + 1 + 4 > len(pack_raw) - 32:
-            return None
-        size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
-        payload_start = offset + 5
-        payload_end = payload_start + size
-        if payload_end > len(pack_raw) - 32:
-            return None
-        compressed = pack_raw[payload_start:payload_end]
-        try:
-            full = zlib.decompress(compressed)
-        except Exception:
-            return None
-        null_idx = full.index(b"\0")
-        content = full[null_idx + 1 :]
-        return (obj_type, content)
-    return None
+
+    # Binary search over sorted hash entries (O(log n) instead of O(n))
+    class HashComparator:
+        """Helper for binary search over packed hash entries."""
+
+        def __getitem__(self, idx: int) -> bytes:
+            base = entries_start + idx * entry_size
+            return raw_idx[base : base + 32]
+
+        def __len__(self) -> int:
+            return count
+
+    hashes = HashComparator()
+    idx = bisect.bisect_left(hashes, hash_bin)
+
+    if idx >= count or hashes[idx] != hash_bin:
+        return None
+
+    base = entries_start + idx * entry_size
+    type_byte = raw_idx[base + 32]
+    offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
+    obj_type = BYTE_TO_TYPE.get(type_byte)
+    if obj_type is None:
+        return None
+    if expected_type is not None and obj_type != expected_type:
+        return None
+
+    pack_raw = pack_path.read_bytes()
+    header_size = len(PACK_MAGIC) + 4 + 4
+    if offset + 1 + 4 > len(pack_raw) - 32:
+        return None
+    size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
+    payload_start = offset + 5
+    payload_end = payload_start + size
+    if payload_end > len(pack_raw) - 32:
+        return None
+    compressed = pack_raw[payload_start:payload_end]
+    try:
+        full = zlib.decompress(compressed)
+    except Exception:
+        return None
+    null_idx = full.index(b"\0")
+    content = full[null_idx + 1 :]
+    return (obj_type, content)


 def run_repack(
@@ -282,7 +440,7 @@ def run_repack(
         return (0, 0)
     if dry_run:
         return (len(hash_to_type), 0)
-    write_pack(objects_dir, store, hash_to_type)
+    write_pack_with_delta(objects_dir, store, hash_to_type)
     freed = 0
     for hash_id, obj_type in hash_to_type.items():
         p = store.objects_dir / obj_type / hash_id[:2] / hash_id[2:]
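
For the pack.py additions above, a minimal usage sketch — not part of the diff — assuming the caller already holds an ObjectStore for objects_dir and a hash_id -> obj_type map of the loose objects to pack (the ObjectStore constructor itself is not shown in this diff, so it is taken as given):

    from pathlib import Path
    from typing import Dict

    from memvcs.core.objects import ObjectStore
    from memvcs.core.pack import write_pack_with_delta, retrieve_from_pack


    def repack_with_deltas(objects_dir: Path, store: ObjectStore, hash_to_type: Dict[str, str]) -> None:
        # Pack everything, letting sufficiently similar objects be stored as deltas.
        pack_path, idx_path, delta_stats = write_pack_with_delta(
            objects_dir, store, hash_to_type, use_delta=True, similarity_threshold=0.7
        )
        # delta_stats maps target_hash -> (original_size, delta_size) for delta-encoded objects.
        if delta_stats:
            saved = sum(orig - new for orig, new in delta_stats.values())
            print(f"{len(delta_stats)} objects delta-encoded, ~{saved} bytes saved in {pack_path.name}")

        # Reads go through the new bisect-based index lookup; note that the v2 reader
        # returns None for entries stored as deltas when expected_type is set.
        for hash_id, obj_type in hash_to_type.items():
            if retrieve_from_pack(objects_dir, hash_id, expected_type=obj_type) is None:
                print(f"{hash_id[:12]} not readable as a full {obj_type} object from the pack")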
memvcs/core/privacy_validator.py ADDED
@@ -0,0 +1,187 @@
+"""
+Privacy field validation and auditing.
+
+Ensures differential privacy noise is only applied to fact data, not metadata.
+Prevents accidental privacy overhead on metadata fields and provides audit trail.
+
+Provides:
+- @privacy_exempt: Decorator to mark metadata fields as privacy-exempt
+- PrivacyFieldValidator: Runtime validation that noise is applied correctly
+- PrivacyAuditReport: Audit trail of which fields received noise
+"""
+
+from typing import Any, Callable, Dict, List, Optional, Set
+from functools import wraps
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+
+
+@dataclass
+class PrivacyAuditReport:
+    """Audit report of privacy noise application."""
+
+    timestamp: str
+    noised_fields: Dict[str, Any] = field(default_factory=dict)
+    exempt_fields: Dict[str, Any] = field(default_factory=dict)
+    validation_errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict for logging/serialization."""
+        return {
+            "timestamp": self.timestamp,
+            "noised_fields": self.noised_fields,
+            "exempt_fields": self.exempt_fields,
+            "validation_errors": self.validation_errors,
+            "summary": {
+                "total_noised": len(self.noised_fields),
+                "total_exempt": len(self.exempt_fields),
+                "validation_passed": len(self.validation_errors) == 0,
+            },
+        }
+
+
+class PrivacyFieldValidator:
+    """Validates that privacy noise is applied correctly.
+
+    Tracks which fields receive noise vs. are exempt from noise.
+    Fails loudly if noise is applied to exempt fields.
+    """
+
+    # Metadata fields that should NEVER receive noise (they don't reveal facts)
+    EXEMPT_FIELDS = {
+        "clusters_found",  # Metadata: count of clusters, not individual facts
+        "insights_generated",  # Metadata: count of insights generated
+        "episodes_archived",  # Metadata: count of archived episodes
+        "confidence_score",  # Metadata: overall quality metric, not a fact
+        "summary_version",  # Metadata: schema version
+        "created_at",  # Metadata: timestamp
+        "updated_at",  # Metadata: timestamp
+        "agent_version",  # Metadata: software version
+    }
+
+    # Fact-related fields that SHOULD receive noise
+    FACT_FIELDS = {
+        "facts",  # List of actual facts
+        "memories",  # Memory content
+        "semantic_content",  # Semantic memory content
+        "episodic_content",  # Episodic memory content
+        "procedural_content",  # Procedural memory content
+        "embeddings",  # Vector representations of facts
+        "fact_count",  # Count of individual facts (not metadata)
+        "memory_count",  # Count of individual memories
+    }
+
+    def __init__(self):
+        self.audit_report = PrivacyAuditReport(timestamp=datetime.now(timezone.utc).isoformat())
+
+    def validate_noised_field(
+        self, field_name: str, field_value: Any, is_noised: bool = True
+    ) -> None:
+        """Validate that noise application is correct for a field.
+
+        Args:
+            field_name: Name of the field
+            field_value: Value of the field
+            is_noised: Whether noise was applied to this field
+
+        Raises:
+            RuntimeError: If noise is applied to exempt field
+        """
+        if is_noised and field_name in self.EXEMPT_FIELDS:
+            error = (
+                f"ERROR: Noise applied to exempt metadata field '{field_name}'. "
+                f"Metadata fields do not reveal individual facts and should not receive noise. "
+                f"Remove noise from: {field_name}"
+            )
+            self.audit_report.validation_errors.append(error)
+            raise RuntimeError(error)
+
+        if is_noised:
+            self.audit_report.noised_fields[field_name] = field_value
+        else:
+            self.audit_report.exempt_fields[field_name] = field_value
+
+    def validate_result_dict(self, result: Dict[str, Any]) -> None:
+        """Validate a result dict (e.g., DistillerResult or GardenerResult).
+
+        Args:
+            result: The result dict to validate
+
+        Raises:
+            RuntimeError: If privacy validation fails
+        """
+        for field_name in self.EXEMPT_FIELDS:
+            if field_name in result:
+                # These fields should not have been noised
+                self.audit_report.exempt_fields[field_name] = result[field_name]
+
+    def get_report(self) -> PrivacyAuditReport:
+        """Get the audit report."""
+        if self.audit_report.validation_errors:
+            print(
+                "Privacy Validation Report:\n"
+                + "\n".join(f" {e}" for e in self.audit_report.validation_errors)
+            )
+        return self.audit_report
+
+
+def privacy_exempt(func: Callable) -> Callable:
+    """Decorator to mark a function as privacy-exempt.
+
+    The decorated function should not apply DP noise to its result.
+    Used to document which functions are exempt from privacy operations.
+
+    Example:
+        @privacy_exempt
+        def get_metadata() -> Dict[str, Any]:
+            return {"clusters_found": 42, "created_at": "2024-01-01T00:00:00Z"}
+    """
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        result = func(*args, **kwargs)
+        # Mark result as privacy-exempt (store in metadata if possible)
+        if isinstance(result, dict):
+            result["_privacy_exempt"] = True
+        return result
+
+    # Mark the wrapper function to indicate it's privacy-exempt
+    setattr(wrapper, "_privacy_exempt_function", True)
+    return wrapper
+
+
+class PrivacyGuard:
+    """Context manager and decorator for privacy-aware code blocks.
+
+    Usage:
+        with PrivacyGuard() as pg:
+            result = process_facts(data)
+            pg.mark_noised("fact_count")
+    """
+
+    def __init__(self, strict: bool = True):
+        self.strict = strict
+        self.validator = PrivacyFieldValidator()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        return True
+
+    def mark_noised(self, field_name: str, value: Any = None) -> None:
+        """Mark a field as having received DP noise."""
+        if self.strict:
+            self.validator.validate_noised_field(field_name, value, is_noised=True)
+        else:
+            self.validator.audit_report.noised_fields[field_name] = value
+
+    def mark_exempt(self, field_name: str, value: Any = None) -> None:
+        """Mark a field as exempt from DP noise."""
+        self.validator.audit_report.exempt_fields[field_name] = value
+
+    def get_report(self) -> PrivacyAuditReport:
+        """Get the privacy audit report."""
+        return self.validator.get_report()
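
A short end-to-end sketch of the new module — not part of the diff — assuming it ships as memvcs/core/privacy_validator.py; gather_run_metadata and the literal counts are hypothetical:

    from memvcs.core.privacy_validator import PrivacyGuard, privacy_exempt

    @privacy_exempt
    def gather_run_metadata():
        # Metadata only: counts of pipeline outputs, never individual facts.
        return {"clusters_found": 3, "insights_generated": 2, "episodes_archived": 5}

    with PrivacyGuard(strict=True) as pg:
        meta = gather_run_metadata()           # decorator tags the dict with _privacy_exempt
        for name, value in meta.items():
            if name != "_privacy_exempt":
                pg.mark_exempt(name, value)
        pg.mark_noised("fact_count", 17)       # allowed: fact-derived counts may be noised
        # pg.mark_noised("clusters_found", 3)  # would raise RuntimeError in strict mode

    print(pg.get_report().to_dict()["summary"])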