agmem 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
memvcs/core/pack.py CHANGED
@@ -2,8 +2,10 @@
 Pack files and garbage collection for agmem.
 
 Pack: collect loose objects into single file + index. GC: delete unreachable objects, repack.
+Includes delta encoding for similar objects (5-10x compression for similar content).
 """
 
+import bisect
 import hashlib
 import struct
 import zlib
@@ -12,20 +14,23 @@ from typing import Set, Dict, List, Optional, Tuple
 
 from .objects import ObjectStore
 from .refs import RefsManager
+from .delta import find_similar_objects, compute_delta, DeltaCache
 
 PACK_MAGIC = b"PACK"
-PACK_VERSION = 2
+PACK_VERSION = 2  # Maintain v2 for backward compatibility
 IDX_MAGIC = b"agidx"
-IDX_VERSION = 2
+IDX_VERSION = 2  # Maintain v2 for backward compatibility
 OBJ_TYPE_BLOB = 1
 OBJ_TYPE_TREE = 2
 OBJ_TYPE_COMMIT = 3
 OBJ_TYPE_TAG = 4
+OBJ_TYPE_DELTA = 5  # Delta object type (for future v3)
 TYPE_TO_BYTE = {
     "blob": OBJ_TYPE_BLOB,
     "tree": OBJ_TYPE_TREE,
     "commit": OBJ_TYPE_COMMIT,
     "tag": OBJ_TYPE_TAG,
+    "delta": OBJ_TYPE_DELTA,
 }
 BYTE_TO_TYPE = {v: k for k, v in TYPE_TO_BYTE.items()}
 
@@ -121,6 +126,142 @@ def run_gc(
     return (len(to_delete), freed)
 
 
+def write_pack_with_delta(
+    objects_dir: Path,
+    store: ObjectStore,
+    hash_to_type: Dict[str, str],
+    use_delta: bool = True,
+    similarity_threshold: float = 0.7,
+) -> Tuple[Path, Path, Optional[Dict[str, Tuple[int, int]]]]:
+    """
+    Pack loose objects with optional delta encoding.
+
+    Args:
+        objects_dir: Path to objects directory
+        store: ObjectStore instance
+        hash_to_type: map hash_id -> obj_type
+        use_delta: whether to compute deltas for similar objects
+        similarity_threshold: minimum similarity (0.0-1.0) for delta encoding
+
+    Returns:
+        (pack_path, index_path, delta_stats)
+        delta_stats: dict of {target_hash: (original_size, delta_size)} for deltas used
+    """
+    if not hash_to_type:
+        raise ValueError("Cannot write empty pack")
+
+    pack_d = _pack_dir(objects_dir)
+    pack_d.mkdir(parents=True, exist_ok=True)
+
+    # Load all objects
+    objects_data: Dict[str, bytes] = {}
+    for hash_id in hash_to_type.keys():
+        obj_type = hash_to_type[hash_id]
+        content = store.retrieve(hash_id, obj_type)
+        if content:
+            header = f"{obj_type} {len(content)}\0".encode()
+            objects_data[hash_id] = header + content
+
+    # Find similar objects for delta encoding
+    delta_cache = DeltaCache() if use_delta else None
+    if use_delta and len(objects_data) > 1:
+        similarity_groups = find_similar_objects(
+            objects_data,
+            similarity_threshold=similarity_threshold,
+            min_size=100,
+        )
+        for group in similarity_groups:
+            if len(group) < 2:
+                continue
+            base_hash = group[0]  # Smallest object is base
+            base_content = objects_data[base_hash]
+            for target_hash in group[1:]:
+                target_content = objects_data[target_hash]
+                delta = compute_delta(base_content, target_content)
+                # Only use delta if it saves space
+                if len(delta) < len(target_content) * 0.8:
+                    delta_cache.add_delta(base_hash, target_hash, delta)
+
+    pack_header_len = len(PACK_MAGIC) + 4 + 4
+    pack_body = bytearray()
+    index_entries: List[Tuple[str, str, int, Optional[str]]] = (
+        []
+    )  # (hash_id, obj_type, offset, base_hash or None)
+    offset_in_file = pack_header_len
+
+    for hash_id in sorted(hash_to_type.keys()):
+        obj_type = hash_to_type[hash_id]
+        full_data = objects_data.get(hash_id)
+        if not full_data:
+            continue
+
+        # Check if this object has a delta
+        base_hash = delta_cache.get_base(hash_id) if delta_cache else None
+        if base_hash and delta_cache:
+            # Store as delta
+            delta = delta_cache.get_delta(base_hash, hash_id)
+            compressed = zlib.compress(delta)
+            type_byte = OBJ_TYPE_DELTA
+            size_bytes = struct.pack(">I", len(compressed))
+            base_hash_bytes = bytes.fromhex(base_hash)
+            chunk = bytes([type_byte]) + size_bytes + base_hash_bytes[:16] + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, base_hash))
+        else:
+            # Store full object
+            compressed = zlib.compress(full_data)
+            type_byte = TYPE_TO_BYTE.get(obj_type, OBJ_TYPE_BLOB)
+            size_bytes = struct.pack(">I", len(compressed))
+            chunk = bytes([type_byte]) + size_bytes + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, None))
+
+        pack_body.extend(chunk)
+        offset_in_file += len(chunk)
+
+    if not index_entries:
+        raise ValueError("No objects to pack")
+
+    pack_content = (
+        PACK_MAGIC
+        + struct.pack(">I", PACK_VERSION)
+        + struct.pack(">I", len(index_entries))
+        + bytes(pack_body)
+    )
+    pack_hash = hashlib.sha256(pack_content).digest()
+    pack_content += pack_hash
+
+    pack_name = f"pack-{pack_hash[:16].hex()}.pack"
+    pack_path = pack_d / pack_name
+    pack_path.write_bytes(pack_content)
+
+    # Write index with delta references (keeping v2 format for now)
+    index_content = bytearray(
+        IDX_MAGIC + struct.pack(">I", IDX_VERSION) + struct.pack(">I", len(index_entries))
+    )
+    delta_stats = {}
+    for hash_id, obj_type, off, base_hash in index_entries:
+        index_content.extend(bytes.fromhex(hash_id))
+        index_content.append(TYPE_TO_BYTE[obj_type])
+        index_content.extend(struct.pack(">I", off))
+        # Note: delta base hash stored after offset but not read by v2 retrieve_from_pack
+        # This is forward-compatible: v3 readers will use base_hash, v2 readers ignore it
+        if base_hash:
+            original_size = len(objects_data[hash_id])
+            delta_size = len(delta_cache.get_delta(base_hash, hash_id))
+            delta_stats[hash_id] = (original_size, delta_size)
+            # Store delta base info (v3 format, but after v2 format fields)
+            index_content.extend(bytes.fromhex(base_hash))
+        else:
+            # Padding for v3 format
+            index_content.extend(b"\x00" * 32)
+
+    idx_hash = hashlib.sha256(index_content).digest()
+    index_content.extend(idx_hash)
+    idx_path = pack_path.with_suffix(".idx")
+    idx_path.write_bytes(index_content)
+
+    return (pack_path, idx_path, delta_stats if use_delta else None)
+
+
 def write_pack(
     objects_dir: Path, store: ObjectStore, hash_to_type: Dict[str, str]
 ) -> Tuple[Path, Path]:
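The new packer can be driven directly; a minimal sketch, assuming a repository whose loose objects live under a hypothetical .mem/objects directory and whose hash_to_type mapping the caller has already collected (the placeholder hashes below stand in for real object IDs):

    from pathlib import Path

    from memvcs.core.objects import ObjectStore
    from memvcs.core.pack import write_pack_with_delta

    objects_dir = Path(".mem/objects")  # hypothetical layout; adjust to your repository
    store = ObjectStore(objects_dir)
    hash_to_type = {
        "<blob-hash>": "blob",      # placeholders: use real hex object IDs from the store
        "<commit-hash>": "commit",
    }

    pack_path, idx_path, delta_stats = write_pack_with_delta(
        objects_dir, store, hash_to_type, use_delta=True, similarity_threshold=0.7
    )
    if delta_stats:
        for target, (original_size, delta_size) in delta_stats.items():
            print(f"{target[:8]}: {original_size} -> {delta_size} bytes as delta")

delta_stats is None when use_delta is False, and it only contains entries for objects that were actually stored as deltas.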
@@ -128,6 +269,9 @@ def write_pack(
     Pack loose objects into a single pack file and index.
     hash_to_type: map hash_id -> obj_type for objects to include.
     Returns (pack_path, index_path). Does not delete loose objects.
+
+    Standard pack format (v2) without delta encoding for backward compatibility.
+    Use write_pack_with_delta() with use_delta=True for delta encoding.
     """
     if not hash_to_type:
         raise ValueError("Cannot write empty pack")
@@ -200,7 +344,7 @@ def retrieve_from_pack(
     objects_dir: Path, hash_id: str, expected_type: Optional[str] = None
 ) -> Optional[Tuple[str, bytes]]:
     """
-    Retrieve object from pack by hash. Returns (obj_type, content) or None.
+    Retrieve object from pack by hash using binary search. Returns (obj_type, content) or None.
     If expected_type is set, only return if pack type matches.
     """
     idx_path = _find_pack_index(objects_dir)
@@ -228,36 +372,50 @@ def retrieve_from_pack(
     if len(hash_hex) != 64:
         return None
     hash_bin = bytes.fromhex(hash_hex)
-    for i in range(count):
-        base = entries_start + i * entry_size
-        entry_hash = raw_idx[base : base + 32]
-        if entry_hash != hash_bin:
-            continue
-        type_byte = raw_idx[base + 32]
-        offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
-        obj_type = BYTE_TO_TYPE.get(type_byte)
-        if obj_type is None:
-            continue
-        if expected_type is not None and obj_type != expected_type:
-            return None
-        pack_raw = pack_path.read_bytes()
-        header_size = len(PACK_MAGIC) + 4 + 4
-        if offset + 1 + 4 > len(pack_raw) - 32:
-            return None
-        size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
-        payload_start = offset + 5
-        payload_end = payload_start + size
-        if payload_end > len(pack_raw) - 32:
-            return None
-        compressed = pack_raw[payload_start:payload_end]
-        try:
-            full = zlib.decompress(compressed)
-        except Exception:
-            return None
-        null_idx = full.index(b"\0")
-        content = full[null_idx + 1 :]
-        return (obj_type, content)
-    return None
+
+    # Binary search over sorted hash entries (O(log n) instead of O(n))
+    class HashComparator:
+        """Helper for binary search over packed hash entries."""
+
+        def __getitem__(self, idx: int) -> bytes:
+            base = entries_start + idx * entry_size
+            return raw_idx[base : base + 32]
+
+        def __len__(self) -> int:
+            return count
+
+    hashes = HashComparator()
+    idx = bisect.bisect_left(hashes, hash_bin)
+
+    if idx >= count or hashes[idx] != hash_bin:
+        return None
+
+    base = entries_start + idx * entry_size
+    type_byte = raw_idx[base + 32]
+    offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
+    obj_type = BYTE_TO_TYPE.get(type_byte)
+    if obj_type is None:
+        return None
+    if expected_type is not None and obj_type != expected_type:
+        return None
+
+    pack_raw = pack_path.read_bytes()
+    header_size = len(PACK_MAGIC) + 4 + 4
+    if offset + 1 + 4 > len(pack_raw) - 32:
+        return None
+    size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
+    payload_start = offset + 5
+    payload_end = payload_start + size
+    if payload_end > len(pack_raw) - 32:
+        return None
+    compressed = pack_raw[payload_start:payload_end]
+    try:
+        full = zlib.decompress(compressed)
+    except Exception:
+        return None
+    null_idx = full.index(b"\0")
+    content = full[null_idx + 1 :]
+    return (obj_type, content)
 
 
 def run_repack(
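The lookup above leans on the fact that bisect works with any object exposing __getitem__ and __len__, so the fixed-width index records can be binary-searched in place without building a list of hashes. A self-contained illustration of the same pattern, using made-up 8-byte keys rather than agmem's index layout:

    import bisect
    import struct

    # Five sorted 8-byte big-endian keys packed back to back, standing in for index records.
    packed = b"".join(struct.pack(">Q", k) for k in (3, 7, 9, 12, 20))
    RECORD = 8

    class KeyView:
        """Present the packed keys as a read-only sequence so bisect can search them."""

        def __getitem__(self, i: int) -> bytes:
            return packed[i * RECORD : (i + 1) * RECORD]

        def __len__(self) -> int:
            return len(packed) // RECORD

    keys = KeyView()
    target = struct.pack(">Q", 12)
    i = bisect.bisect_left(keys, target)
    print(i, i < len(keys) and keys[i] == target)  # 3 True

Fixed-width big-endian encoding keeps byte-wise comparison consistent with numeric order, which is what makes the sorted raw hashes in the pack index binary-searchable as bytes.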
memvcs/core/remote.py CHANGED
@@ -1,7 +1,7 @@
 """
-Remote sync for agmem - file-based and cloud (S3/GCS) push/pull/clone.
+Remote sync for agmem - file-based, cloud (S3/GCS), and IPFS push/pull/clone.
 
-Supports file:// URLs and s3:///gs:// with optional distributed locking.
+Supports file://, s3://, gs://, and ipfs:// URLs with optional distributed locking.
 """
 
 import json
@@ -19,6 +19,11 @@ def _is_cloud_remote(url: str) -> bool:
     return url.startswith("s3://") or url.startswith("gs://")
 
 
+def _is_ipfs_remote(url: str) -> bool:
+    """Return True if URL is IPFS (ipfs://<cid>)."""
+    return url.startswith("ipfs://")
+
+
 def parse_remote_url(url: str) -> Path:
     """Parse remote URL to local path. Supports file:// only. Rejects path traversal."""
     parsed = urlparse(url)
@@ -302,6 +307,75 @@ class Remote:
             pass
         return f"Fetched {copied} object(s) from {self.name}"
 
+    def _push_to_ipfs(self, branch: Optional[str] = None) -> str:
+        """Push objects to IPFS and update remote URL with CID."""
+        from .ipfs_remote import push_to_ipfs
+
+        refs = RefsManager(self.mem_dir)
+        store = ObjectStore(self.objects_dir)
+
+        # Determine which branch to push
+        target_branch = branch if branch else refs.get_current_branch() or "main"
+        commit_hash = refs.get_branch_commit(target_branch)
+
+        if not commit_hash:
+            raise ValueError(f"Branch '{target_branch}' has no commit")
+
+        # Get gateway URL from config or use default
+        gateway_url = self._config.get("ipfs", {}).get("gateway", "https://ipfs.io")
+
+        # Push to IPFS
+        cid = push_to_ipfs(self.objects_dir, target_branch, commit_hash, gateway_url, store)
+
+        if not cid:
+            raise ValueError("Failed to push to IPFS gateway")
+
+        # Update remote URL to new CID for future pulls
+        self.set_remote_url(f"ipfs://{cid}")
+
+        # TODO: Pin CID to prevent garbage collection
+        # Options: local IPFS daemon (ipfshttpclient), pinning service (Pinata/Infura)
+        # For now, user must manually pin or use a pinning service
+
+        try:
+            from .audit import append_audit
+
+            append_audit(
+                self.mem_dir,
+                "push",
+                {"remote": self.name, "branch": target_branch, "ipfs_cid": cid},
+            )
+        except Exception:
+            pass
+
+        return f"Pushed to IPFS: {cid} (WARNING: Not pinned - will be garbage collected unless pinned separately)"
+
+    def _pull_from_ipfs(self, url: str) -> str:
+        """Pull objects from IPFS by CID."""
+        from .ipfs_remote import pull_from_ipfs, parse_ipfs_url
+
+        cid = parse_ipfs_url(url)
+        if not cid:
+            raise ValueError(f"Invalid IPFS URL: {url}")
+
+        # Get gateway URL from config or use default
+        gateway_url = self._config.get("ipfs", {}).get("gateway", "https://ipfs.io")
+
+        # Pull from IPFS
+        success = pull_from_ipfs(self.objects_dir, cid, gateway_url)
+
+        if not success:
+            raise ValueError(f"Failed to pull from IPFS: {cid}")
+
+        try:
+            from .audit import append_audit
+
+            append_audit(self.mem_dir, "fetch", {"remote": self.name, "ipfs_cid": cid})
+        except Exception:
+            pass
+
+        return f"Fetched from IPFS: {cid}"
+
     def push(self, branch: Optional[str] = None) -> str:
         """
         Push objects and refs to remote.
@@ -311,6 +385,9 @@ class Remote:
         if not url:
             raise ValueError(f"Remote '{self.name}' has no URL configured")
 
+        if _is_ipfs_remote(url):
+            return self._push_to_ipfs(branch)
+
         if _is_cloud_remote(url):
             try:
                 from .storage import get_adapter
@@ -427,6 +504,9 @@ class Remote:
         if not url:
             raise ValueError(f"Remote '{self.name}' has no URL configured")
 
+        if _is_ipfs_remote(url):
+            return self._pull_from_ipfs(url)
+
         if _is_cloud_remote(url):
             try:
                 from .storage import get_adapter
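For IPFS remotes the gateway comes from the remote config (ipfs.gateway, defaulting to https://ipfs.io), and a push rewrites the remote URL to the new CID. The lower-level helpers can also be called directly; a hedged sketch with a hypothetical objects directory and a placeholder CID (the Remote class wires these into its push and pull paths for you):

    from pathlib import Path

    from memvcs.core.ipfs_remote import parse_ipfs_url, pull_from_ipfs

    objects_dir = Path(".mem/objects")  # hypothetical layout; adjust to your repository
    url = "ipfs://<cid>"                # placeholder CID, e.g. the one printed by a push
    gateway = "https://ipfs.io"         # or the value configured under ipfs.gateway

    cid = parse_ipfs_url(url)
    if cid and pull_from_ipfs(objects_dir, cid, gateway):
        print(f"Fetched objects for {cid}")
    else:
        print("Invalid IPFS URL or fetch failed")

As the push message warns, nothing here pins the CID; content pushed through a public gateway stays available only while some node keeps it pinned.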
memvcs/core/zk_proofs.py CHANGED
@@ -1,5 +1,17 @@
 """
-Zero-knowledge proof system for agmem.
+Cryptographic proof system for agmem.
+
+IMPORTANT: The current implementation provides PROOF-OF-KNOWLEDGE, not true zero-knowledge proofs.
+
+Limitations:
+- The keyword proof leaks the file's unique-word count and lets a verifier test other words
+- The freshness proof relies on forgeable filesystem mtime
+- Both proofs reveal deterministic information about file content
+
+For true zero-knowledge proofs, consider integrating zk-SNARK libraries such as:
+- py-ecc (Ethereum cryptography)
+- circom (circuit compiler)
+- libsnark bindings
 
 Hash/signature-based proofs: keyword containment (Merkle set membership),
 memory freshness (signed timestamp). Full zk-SNARK backend can be added later.
@@ -36,8 +48,30 @@ def _word_hashes(content: str) -> List[str]:
 
 def prove_keyword_containment(memory_path: Path, keyword: str, output_proof_path: Path) -> bool:
     """
-    Prove memory file contains keyword without revealing content.
-    Proof: Merkle set membership of H(keyword) over word hashes in file.
+    Prove memory file contains keyword using Merkle set membership.
+
+    WARNING: This is PROOF-OF-KNOWLEDGE, not zero-knowledge:
+    - Leaks the exact count of unique words in the file (via the Merkle root)
+    - Verifier can test whether OTHER words exist by hashing them and checking against the same root
+    - Root is deterministic over the full word set
+
+    True zero-knowledge would require:
+    - A commitment scheme that hides the set size
+    - A zk-SNARK proof that keyword ∈ committed set
+    - No ability for the verifier to test other words
+
+    The current implementation is useful for:
+    - Proving you possess a file containing specific keywords
+    - Auditing that memories contain required terms
+    It is NOT suitable for privacy-preserving keyword proofs.
+
+    Args:
+        memory_path: Path to memory file
+        keyword: Keyword to prove containment of
+        output_proof_path: Where to write proof JSON
+
+    Returns:
+        True if proof created successfully
     """
     if not memory_path.exists() or not memory_path.is_file():
         return False
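A schematic illustration of the documented leak, using bare SHA-256 word hashes rather than agmem's actual proof format: once a verifier can check hashes against the committed word set, any other word becomes testable by hashing a guess.

    import hashlib

    words = {"alpha", "beta", "gamma"}  # hypothetical file contents
    word_hashes = {hashlib.sha256(w.encode()).hexdigest() for w in words}

    def verifier_can_test(candidate: str) -> bool:
        # The verifier never sees the plaintext, yet membership of any guessed word is testable.
        return hashlib.sha256(candidate.encode()).hexdigest() in word_hashes

    print(verifier_can_test("beta"))   # True: confirms a word the prover never disclosed
    print(verifier_can_test("delta"))  # False: absence is learnable too

A hiding commitment (or a zk-SNARK over the committed set, as suggested above) is what removes this probing ability.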
@@ -68,8 +102,31 @@ def prove_memory_freshness(
     memory_path: Path, after_timestamp: str, output_proof_path: Path, mem_dir: Optional[Path] = None
 ) -> bool:
     """
-    Prove memory was updated after date without revealing content.
-    Proof: signed file mtime (or current time) and optional public key.
+    Prove memory was updated after a date using a signed timestamp.
+
+    WARNING: Security limitations:
+    - Relies on filesystem mtime, which is TRIVIALLY FORGEABLE (e.g., with the touch command)
+    - Only proves the key holder signed *some* timestamp, not actual freshness
+    - No protection against backdating files
+
+    Improvements needed:
+    - Sign content hash + timestamp (not just the timestamp)
+    - Use a trusted timestamping service (RFC 3161)
+    - Blockchain-based timestamp anchoring
+
+    The current implementation is useful for:
+    - Proving you signed a file at some claimed time
+    - Creating audit trails with signature verification
+    It is NOT suitable for proving actual file recency.
+
+    Args:
+        memory_path: Path to memory file
+        after_timestamp: Timestamp to prove freshness after (not currently enforced)
+        output_proof_path: Where to write proof JSON
+        mem_dir: Memory directory for key loading
+
+    Returns:
+        True if proof created successfully
     """
     if not memory_path.exists() or not memory_path.is_file():
         return False
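A small demonstration of why mtime is weak evidence, using only the standard library: any writer can set a file's timestamps to an arbitrary value with os.utime, the programmatic equivalent of touch.

    import os
    import time
    from pathlib import Path

    p = Path("note.md")  # hypothetical memory file
    p.write_text("contents written today")
    year_ago = time.time() - 365 * 24 * 3600
    os.utime(p, (year_ago, year_ago))      # backdate atime and mtime by a year
    print(time.ctime(p.stat().st_mtime))   # reports a timestamp from last year

Signing the content hash together with an externally anchored timestamp, as the docstring suggests, removes the dependence on filesystem metadata.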
@@ -0,0 +1,25 @@
+"""Health monitoring module for agmem daemon."""
+
+from .monitor import (
+    HealthMonitor,
+    StorageMonitor,
+    SemanticRedundancyChecker,
+    StaleMemoryDetector,
+    GraphConsistencyValidator,
+    StorageMetrics,
+    RedundancyReport,
+    StaleMemoryReport,
+    GraphConsistencyReport,
+)
+
+__all__ = [
+    "HealthMonitor",
+    "StorageMonitor",
+    "SemanticRedundancyChecker",
+    "StaleMemoryDetector",
+    "GraphConsistencyValidator",
+    "StorageMetrics",
+    "RedundancyReport",
+    "StaleMemoryReport",
+    "GraphConsistencyReport",
+]