agmem 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
memvcs/core/pack.py CHANGED
@@ -2,8 +2,10 @@
 Pack files and garbage collection for agmem.
 
 Pack: collect loose objects into single file + index. GC: delete unreachable objects, repack.
+Includes delta encoding for similar objects (5-10x compression for similar content).
 """
 
+import bisect
 import hashlib
 import struct
 import zlib
@@ -12,20 +14,23 @@ from typing import Set, Dict, List, Optional, Tuple
 
 from .objects import ObjectStore
 from .refs import RefsManager
+from .delta import find_similar_objects, compute_delta, DeltaCache
 
 PACK_MAGIC = b"PACK"
-PACK_VERSION = 2
+PACK_VERSION = 2  # Maintain v2 for backward compatibility
 IDX_MAGIC = b"agidx"
-IDX_VERSION = 2
+IDX_VERSION = 2  # Maintain v2 for backward compatibility
 OBJ_TYPE_BLOB = 1
 OBJ_TYPE_TREE = 2
 OBJ_TYPE_COMMIT = 3
 OBJ_TYPE_TAG = 4
+OBJ_TYPE_DELTA = 5  # Delta object type (for future v3)
 TYPE_TO_BYTE = {
     "blob": OBJ_TYPE_BLOB,
     "tree": OBJ_TYPE_TREE,
     "commit": OBJ_TYPE_COMMIT,
     "tag": OBJ_TYPE_TAG,
+    "delta": OBJ_TYPE_DELTA,
 }
 BYTE_TO_TYPE = {v: k for k, v in TYPE_TO_BYTE.items()}
 
@@ -121,6 +126,142 @@ def run_gc(
     return (len(to_delete), freed)
 
 
+def write_pack_with_delta(
+    objects_dir: Path,
+    store: ObjectStore,
+    hash_to_type: Dict[str, str],
+    use_delta: bool = True,
+    similarity_threshold: float = 0.7,
+) -> Tuple[Path, Path, Optional[Dict[str, Tuple[int, int]]]]:
+    """
+    Pack loose objects with optional delta encoding.
+
+    Args:
+        objects_dir: Path to objects directory
+        store: ObjectStore instance
+        hash_to_type: map hash_id -> obj_type
+        use_delta: whether to compute deltas for similar objects
+        similarity_threshold: minimum similarity (0.0-1.0) for delta encoding
+
+    Returns:
+        (pack_path, index_path, delta_stats)
+        delta_stats: dict of {target_hash: (original_size, delta_size)} for deltas used
+    """
+    if not hash_to_type:
+        raise ValueError("Cannot write empty pack")
+
+    pack_d = _pack_dir(objects_dir)
+    pack_d.mkdir(parents=True, exist_ok=True)
+
+    # Load all objects
+    objects_data: Dict[str, bytes] = {}
+    for hash_id in hash_to_type.keys():
+        obj_type = hash_to_type[hash_id]
+        content = store.retrieve(hash_id, obj_type)
+        if content:
+            header = f"{obj_type} {len(content)}\0".encode()
+            objects_data[hash_id] = header + content
+
+    # Find similar objects for delta encoding
+    delta_cache = DeltaCache() if use_delta else None
+    if use_delta and len(objects_data) > 1:
+        similarity_groups = find_similar_objects(
+            objects_data,
+            similarity_threshold=similarity_threshold,
+            min_size=100,
+        )
+        for group in similarity_groups:
+            if len(group) < 2:
+                continue
+            base_hash = group[0]  # Smallest object is base
+            base_content = objects_data[base_hash]
+            for target_hash in group[1:]:
+                target_content = objects_data[target_hash]
+                delta = compute_delta(base_content, target_content)
+                # Only use delta if it saves space
+                if len(delta) < len(target_content) * 0.8:
+                    delta_cache.add_delta(base_hash, target_hash, delta)
+
+    pack_header_len = len(PACK_MAGIC) + 4 + 4
+    pack_body = bytearray()
+    index_entries: List[Tuple[str, str, int, Optional[str]]] = (
+        []
+    )  # (hash_id, obj_type, offset, base_hash or None)
+    offset_in_file = pack_header_len
+
+    for hash_id in sorted(hash_to_type.keys()):
+        obj_type = hash_to_type[hash_id]
+        full_data = objects_data.get(hash_id)
+        if not full_data:
+            continue
+
+        # Check if this object has a delta
+        base_hash = delta_cache.get_base(hash_id) if delta_cache else None
+        if base_hash and delta_cache:
+            # Store as delta
+            delta = delta_cache.get_delta(base_hash, hash_id)
+            compressed = zlib.compress(delta)
+            type_byte = OBJ_TYPE_DELTA
+            size_bytes = struct.pack(">I", len(compressed))
+            base_hash_bytes = bytes.fromhex(base_hash)
+            chunk = bytes([type_byte]) + size_bytes + base_hash_bytes[:16] + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, base_hash))
+        else:
+            # Store full object
+            compressed = zlib.compress(full_data)
+            type_byte = TYPE_TO_BYTE.get(obj_type, OBJ_TYPE_BLOB)
+            size_bytes = struct.pack(">I", len(compressed))
+            chunk = bytes([type_byte]) + size_bytes + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, None))
+
+        pack_body.extend(chunk)
+        offset_in_file += len(chunk)
+
+    if not index_entries:
+        raise ValueError("No objects to pack")
+
+    pack_content = (
+        PACK_MAGIC
+        + struct.pack(">I", PACK_VERSION)
+        + struct.pack(">I", len(index_entries))
+        + bytes(pack_body)
+    )
+    pack_hash = hashlib.sha256(pack_content).digest()
+    pack_content += pack_hash
+
+    pack_name = f"pack-{pack_hash[:16].hex()}.pack"
+    pack_path = pack_d / pack_name
+    pack_path.write_bytes(pack_content)
+
+    # Write index with delta references (keeping v2 format for now)
+    index_content = bytearray(
+        IDX_MAGIC + struct.pack(">I", IDX_VERSION) + struct.pack(">I", len(index_entries))
+    )
+    delta_stats = {}
+    for hash_id, obj_type, off, base_hash in index_entries:
+        index_content.extend(bytes.fromhex(hash_id))
+        index_content.append(TYPE_TO_BYTE[obj_type])
+        index_content.extend(struct.pack(">I", off))
+        # Note: delta base hash stored after offset but not read by v2 retrieve_from_pack
+        # This is forward-compatible: v3 readers will use base_hash, v2 readers ignore it
+        if base_hash:
+            original_size = len(objects_data[hash_id])
+            delta_size = len(delta_cache.get_delta(base_hash, hash_id))
+            delta_stats[hash_id] = (original_size, delta_size)
+            # Store delta base info (v3 format, but after v2 format fields)
+            index_content.extend(bytes.fromhex(base_hash))
+        else:
+            # Padding for v3 format
+            index_content.extend(b"\x00" * 32)
+
+    idx_hash = hashlib.sha256(index_content).digest()
+    index_content.extend(idx_hash)
+    idx_path = pack_path.with_suffix(".idx")
+    idx_path.write_bytes(index_content)
+
+    return (pack_path, idx_path, delta_stats if use_delta else None)
+
+
 def write_pack(
     objects_dir: Path, store: ObjectStore, hash_to_type: Dict[str, str]
 ) -> Tuple[Path, Path]:
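The new packer can be driven directly; a minimal sketch, assuming a repository whose loose objects live under a hypothetical .mem/objects directory and whose hash_to_type mapping the caller has already collected (the placeholder hashes below stand in for real object IDs):

    from pathlib import Path

    from memvcs.core.objects import ObjectStore
    from memvcs.core.pack import write_pack_with_delta

    objects_dir = Path(".mem/objects")  # hypothetical layout; adjust to your repository
    store = ObjectStore(objects_dir)
    hash_to_type = {
        "<blob-hash>": "blob",      # placeholders: use real hex object IDs from the store
        "<commit-hash>": "commit",
    }

    pack_path, idx_path, delta_stats = write_pack_with_delta(
        objects_dir, store, hash_to_type, use_delta=True, similarity_threshold=0.7
    )
    if delta_stats:
        for target, (original_size, delta_size) in delta_stats.items():
            print(f"{target[:8]}: {original_size} -> {delta_size} bytes as delta")

delta_stats is None when use_delta is False, and it only contains entries for objects that were actually stored as deltas.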
@@ -128,6 +269,9 @@ def write_pack(
     Pack loose objects into a single pack file and index.
     hash_to_type: map hash_id -> obj_type for objects to include.
     Returns (pack_path, index_path). Does not delete loose objects.
+
+    Standard pack format (v2) without delta encoding for backward compatibility.
+    Use write_pack_with_delta() with use_delta=True for delta encoding.
     """
     if not hash_to_type:
         raise ValueError("Cannot write empty pack")
@@ -200,7 +344,7 @@ def retrieve_from_pack(
     objects_dir: Path, hash_id: str, expected_type: Optional[str] = None
 ) -> Optional[Tuple[str, bytes]]:
     """
-    Retrieve object from pack by hash. Returns (obj_type, content) or None.
+    Retrieve object from pack by hash using binary search. Returns (obj_type, content) or None.
     If expected_type is set, only return if pack type matches.
     """
     idx_path = _find_pack_index(objects_dir)
@@ -228,36 +372,50 @@ def retrieve_from_pack(
     if len(hash_hex) != 64:
         return None
     hash_bin = bytes.fromhex(hash_hex)
-    for i in range(count):
-        base = entries_start + i * entry_size
-        entry_hash = raw_idx[base : base + 32]
-        if entry_hash != hash_bin:
-            continue
-        type_byte = raw_idx[base + 32]
-        offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
-        obj_type = BYTE_TO_TYPE.get(type_byte)
-        if obj_type is None:
-            continue
-        if expected_type is not None and obj_type != expected_type:
-            return None
-        pack_raw = pack_path.read_bytes()
-        header_size = len(PACK_MAGIC) + 4 + 4
-        if offset + 1 + 4 > len(pack_raw) - 32:
-            return None
-        size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
-        payload_start = offset + 5
-        payload_end = payload_start + size
-        if payload_end > len(pack_raw) - 32:
-            return None
-        compressed = pack_raw[payload_start:payload_end]
-        try:
-            full = zlib.decompress(compressed)
-        except Exception:
-            return None
-        null_idx = full.index(b"\0")
-        content = full[null_idx + 1 :]
-        return (obj_type, content)
-    return None
+
+    # Binary search over sorted hash entries (O(log n) instead of O(n))
+    class HashComparator:
+        """Helper for binary search over packed hash entries."""
+
+        def __getitem__(self, idx: int) -> bytes:
+            base = entries_start + idx * entry_size
+            return raw_idx[base : base + 32]
+
+        def __len__(self) -> int:
+            return count
+
+    hashes = HashComparator()
+    idx = bisect.bisect_left(hashes, hash_bin)
+
+    if idx >= count or hashes[idx] != hash_bin:
+        return None
+
+    base = entries_start + idx * entry_size
+    type_byte = raw_idx[base + 32]
+    offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
+    obj_type = BYTE_TO_TYPE.get(type_byte)
+    if obj_type is None:
+        return None
+    if expected_type is not None and obj_type != expected_type:
+        return None
+
+    pack_raw = pack_path.read_bytes()
+    header_size = len(PACK_MAGIC) + 4 + 4
+    if offset + 1 + 4 > len(pack_raw) - 32:
+        return None
+    size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
+    payload_start = offset + 5
+    payload_end = payload_start + size
+    if payload_end > len(pack_raw) - 32:
+        return None
+    compressed = pack_raw[payload_start:payload_end]
+    try:
+        full = zlib.decompress(compressed)
+    except Exception:
+        return None
+    null_idx = full.index(b"\0")
+    content = full[null_idx + 1 :]
+    return (obj_type, content)
 
 
 def run_repack(
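The lookup above leans on the fact that bisect works with any object exposing __getitem__ and __len__, so the fixed-width index records can be binary-searched in place without building a list of hashes. A self-contained illustration of the same pattern, using made-up 8-byte keys rather than agmem's index layout:

    import bisect
    import struct

    # Five sorted 8-byte big-endian keys packed back to back, standing in for index records.
    packed = b"".join(struct.pack(">Q", k) for k in (3, 7, 9, 12, 20))
    RECORD = 8

    class KeyView:
        """Present the packed keys as a read-only sequence so bisect can search them."""

        def __getitem__(self, i: int) -> bytes:
            return packed[i * RECORD : (i + 1) * RECORD]

        def __len__(self) -> int:
            return len(packed) // RECORD

    keys = KeyView()
    target = struct.pack(">Q", 12)
    i = bisect.bisect_left(keys, target)
    print(i, i < len(keys) and keys[i] == target)  # 3 True

Fixed-width big-endian encoding keeps byte-wise comparison consistent with numeric order, which is what makes the sorted raw hashes in the pack index binary-searchable as bytes.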
memvcs/core/remote.py CHANGED
@@ -1,7 +1,7 @@
 """
-Remote sync for agmem - file-based and cloud (S3/GCS) push/pull/clone.
+Remote sync for agmem - file-based, cloud (S3/GCS), and IPFS push/pull/clone.
 
-Supports file:// URLs and s3:///gs:// with optional distributed locking.
+Supports file://, s3://, gs://, and ipfs:// URLs with optional distributed locking.
 """
 
 import json
@@ -19,6 +19,11 @@ def _is_cloud_remote(url: str) -> bool:
     return url.startswith("s3://") or url.startswith("gs://")
 
 
+def _is_ipfs_remote(url: str) -> bool:
+    """Return True if URL is IPFS (ipfs://<cid>)."""
+    return url.startswith("ipfs://")
+
+
 def parse_remote_url(url: str) -> Path:
     """Parse remote URL to local path. Supports file:// only. Rejects path traversal."""
     parsed = urlparse(url)
@@ -302,6 +307,75 @@ class Remote:
             pass
         return f"Fetched {copied} object(s) from {self.name}"
 
+    def _push_to_ipfs(self, branch: Optional[str] = None) -> str:
+        """Push objects to IPFS and update remote URL with CID."""
+        from .ipfs_remote import push_to_ipfs
+
+        refs = RefsManager(self.mem_dir)
+        store = ObjectStore(self.objects_dir)
+
+        # Determine which branch to push
+        target_branch = branch if branch else refs.get_current_branch() or "main"
+        commit_hash = refs.get_branch_commit(target_branch)
+
+        if not commit_hash:
+            raise ValueError(f"Branch '{target_branch}' has no commit")
+
+        # Get gateway URL from config or use default
+        gateway_url = self._config.get("ipfs", {}).get("gateway", "https://ipfs.io")
+
+        # Push to IPFS
+        cid = push_to_ipfs(self.objects_dir, target_branch, commit_hash, gateway_url, store)
+
+        if not cid:
+            raise ValueError("Failed to push to IPFS gateway")
+
+        # Update remote URL to new CID for future pulls
+        self.set_remote_url(f"ipfs://{cid}")
+
+        # TODO: Pin CID to prevent garbage collection
+        # Options: local IPFS daemon (ipfshttpclient), pinning service (Pinata/Infura)
+        # For now, user must manually pin or use a pinning service
+
+        try:
+            from .audit import append_audit
+
+            append_audit(
+                self.mem_dir,
+                "push",
+                {"remote": self.name, "branch": target_branch, "ipfs_cid": cid},
+            )
+        except Exception:
+            pass
+
+        return f"Pushed to IPFS: {cid} (WARNING: Not pinned - will be garbage collected unless pinned separately)"
+
+    def _pull_from_ipfs(self, url: str) -> str:
+        """Pull objects from IPFS by CID."""
+        from .ipfs_remote import pull_from_ipfs, parse_ipfs_url
+
+        cid = parse_ipfs_url(url)
+        if not cid:
+            raise ValueError(f"Invalid IPFS URL: {url}")
+
+        # Get gateway URL from config or use default
+        gateway_url = self._config.get("ipfs", {}).get("gateway", "https://ipfs.io")
+
+        # Pull from IPFS
+        success = pull_from_ipfs(self.objects_dir, cid, gateway_url)
+
+        if not success:
+            raise ValueError(f"Failed to pull from IPFS: {cid}")
+
+        try:
+            from .audit import append_audit
+
+            append_audit(self.mem_dir, "fetch", {"remote": self.name, "ipfs_cid": cid})
+        except Exception:
+            pass
+
+        return f"Fetched from IPFS: {cid}"
+
     def push(self, branch: Optional[str] = None) -> str:
         """
         Push objects and refs to remote.
@@ -311,6 +385,9 @@ class Remote:
         if not url:
             raise ValueError(f"Remote '{self.name}' has no URL configured")
 
+        if _is_ipfs_remote(url):
+            return self._push_to_ipfs(branch)
+
         if _is_cloud_remote(url):
             try:
                 from .storage import get_adapter
@@ -427,6 +504,9 @@ class Remote:
         if not url:
             raise ValueError(f"Remote '{self.name}' has no URL configured")
 
+        if _is_ipfs_remote(url):
+            return self._pull_from_ipfs(url)
+
         if _is_cloud_remote(url):
             try:
                 from .storage import get_adapter
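For IPFS remotes the gateway comes from the remote config (ipfs.gateway, defaulting to https://ipfs.io), and a push rewrites the remote URL to the new CID. The lower-level helpers can also be called directly; a hedged sketch with a hypothetical objects directory and a placeholder CID (the Remote class wires these into its push and pull paths for you):

    from pathlib import Path

    from memvcs.core.ipfs_remote import parse_ipfs_url, pull_from_ipfs

    objects_dir = Path(".mem/objects")  # hypothetical layout; adjust to your repository
    url = "ipfs://<cid>"                # placeholder CID, e.g. the one printed by a push
    gateway = "https://ipfs.io"         # or the value configured under ipfs.gateway

    cid = parse_ipfs_url(url)
    if cid and pull_from_ipfs(objects_dir, cid, gateway):
        print(f"Fetched objects for {cid}")
    else:
        print("Invalid IPFS URL or fetch failed")

As the push message warns, nothing here pins the CID; content pushed through a public gateway stays available only while some node keeps it pinned.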
memvcs/core/zk_proofs.py CHANGED
@@ -1,5 +1,17 @@
 """
-Zero-knowledge proof system for agmem.
+Cryptographic proof system for agmem.
+
+IMPORTANT: The current implementation provides PROOF-OF-KNOWLEDGE, not true zero-knowledge proofs.
+
+Limitations:
+- The keyword proof leaks the file's unique-word count and lets a verifier test other words
+- The freshness proof relies on forgeable filesystem mtime
+- Both proofs reveal deterministic information about file content
+
+For true zero-knowledge proofs, consider integrating zk-SNARK libraries such as:
+- py-ecc (Ethereum cryptography)
+- circom (circuit compiler)
+- libsnark bindings
 
 Hash/signature-based proofs: keyword containment (Merkle set membership),
 memory freshness (signed timestamp). Full zk-SNARK backend can be added later.
@@ -36,8 +48,30 @@ def _word_hashes(content: str) -> List[str]:
 
 def prove_keyword_containment(memory_path: Path, keyword: str, output_proof_path: Path) -> bool:
     """
-    Prove memory file contains keyword without revealing content.
-    Proof: Merkle set membership of H(keyword) over word hashes in file.
+    Prove memory file contains keyword using Merkle set membership.
+
+    WARNING: This is PROOF-OF-KNOWLEDGE, not zero-knowledge:
+    - Leaks the exact count of unique words in the file (via the Merkle root)
+    - Verifier can test whether OTHER words exist by hashing them and checking against the same root
+    - Root is deterministic over the full word set
+
+    True zero-knowledge would require:
+    - A commitment scheme that hides the set size
+    - A zk-SNARK proof that keyword ∈ committed set
+    - No ability for the verifier to test other words
+
+    The current implementation is useful for:
+    - Proving you possess a file containing specific keywords
+    - Auditing that memories contain required terms
+    It is NOT suitable for privacy-preserving keyword proofs.
+
+    Args:
+        memory_path: Path to memory file
+        keyword: Keyword to prove containment of
+        output_proof_path: Where to write proof JSON
+
+    Returns:
+        True if proof created successfully
     """
     if not memory_path.exists() or not memory_path.is_file():
         return False
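A schematic illustration of the documented leak, using bare SHA-256 word hashes rather than agmem's actual proof format: once a verifier can check hashes against the committed word set, any other word becomes testable by hashing a guess.

    import hashlib

    words = {"alpha", "beta", "gamma"}  # hypothetical file contents
    word_hashes = {hashlib.sha256(w.encode()).hexdigest() for w in words}

    def verifier_can_test(candidate: str) -> bool:
        # The verifier never sees the plaintext, yet membership of any guessed word is testable.
        return hashlib.sha256(candidate.encode()).hexdigest() in word_hashes

    print(verifier_can_test("beta"))   # True: confirms a word the prover never disclosed
    print(verifier_can_test("delta"))  # False: absence is learnable too

A hiding commitment (or a zk-SNARK over the committed set, as suggested above) is what removes this probing ability.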
@@ -68,8 +102,31 @@ def prove_memory_freshness(
     memory_path: Path, after_timestamp: str, output_proof_path: Path, mem_dir: Optional[Path] = None
 ) -> bool:
     """
-    Prove memory was updated after date without revealing content.
-    Proof: signed file mtime (or current time) and optional public key.
+    Prove memory was updated after a date using a signed timestamp.
+
+    WARNING: Security limitations:
+    - Relies on filesystem mtime, which is TRIVIALLY FORGEABLE (e.g., with the touch command)
+    - Only proves the key holder signed *some* timestamp, not actual freshness
+    - No protection against backdating files
+
+    Improvements needed:
+    - Sign content hash + timestamp (not just the timestamp)
+    - Use a trusted timestamping service (RFC 3161)
+    - Blockchain-based timestamp anchoring
+
+    The current implementation is useful for:
+    - Proving you signed a file at some claimed time
+    - Creating audit trails with signature verification
+    It is NOT suitable for proving actual file recency.
+
+    Args:
+        memory_path: Path to memory file
+        after_timestamp: Timestamp to prove freshness after (not currently enforced)
+        output_proof_path: Where to write proof JSON
+        mem_dir: Memory directory for key loading
+
+    Returns:
+        True if proof created successfully
     """
     if not memory_path.exists() or not memory_path.is_file():
         return False
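A small demonstration of why mtime is weak evidence, using only the standard library: any writer can set a file's timestamps to an arbitrary value with os.utime, the programmatic equivalent of touch.

    import os
    import time
    from pathlib import Path

    p = Path("note.md")  # hypothetical memory file
    p.write_text("contents written today")
    year_ago = time.time() - 365 * 24 * 3600
    os.utime(p, (year_ago, year_ago))      # backdate atime and mtime by a year
    print(time.ctime(p.stat().st_mtime))   # reports a timestamp from last year

Signing the content hash together with an externally anchored timestamp, as the docstring suggests, removes the dependence on filesystem metadata.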
@@ -0,0 +1,25 @@
+"""Health monitoring module for agmem daemon."""
+
+from .monitor import (
+    HealthMonitor,
+    StorageMonitor,
+    SemanticRedundancyChecker,
+    StaleMemoryDetector,
+    GraphConsistencyValidator,
+    StorageMetrics,
+    RedundancyReport,
+    StaleMemoryReport,
+    GraphConsistencyReport,
+)
+
+__all__ = [
+    "HealthMonitor",
+    "StorageMonitor",
+    "SemanticRedundancyChecker",
+    "StaleMemoryDetector",
+    "GraphConsistencyValidator",
+    "StorageMetrics",
+    "RedundancyReport",
+    "StaleMemoryReport",
+    "GraphConsistencyReport",
+]