agmem 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
memvcs/core/gardener.py CHANGED
@@ -354,28 +354,11 @@ class Gardener:
         except ValueError:
             insight_path = self.semantic_dir / f"insight-{timestamp}.md"

-        # Generate frontmatter (optionally noised for differential privacy)
+        # Generate frontmatter
         source_episodes = len(cluster.episodes)
-        if (
-            self.config.use_dp
-            and self.config.dp_epsilon is not None
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            source_episodes = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(source_episodes),
-                            1.0,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: source_episodes is a metadata count (number of episodes
+        # contributing to this insight), not an individual fact. Adding noise to metadata
+        # doesn't provide meaningful privacy guarantees. See privacy_validator.py.
         frontmatter = {
             "schema_version": "1.0",
             "last_updated": datetime.utcnow().isoformat() + "Z",
@@ -514,53 +497,10 @@ class Gardener:
         clusters_found = len(clusters)
         insights_generated = insights_written
         episodes_archived = archived_count
-        if (
-            self.config.use_dp
-            and self.config.dp_epsilon is not None
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            sensitivity = 1.0
-            clusters_found = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(clusters_found),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            insights_generated = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(insights_generated),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            episodes_archived = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(episodes_archived),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: clusters_found, insights_generated, and
+        # episodes_archived are metadata counts, not individual facts.
+        # Adding noise to these doesn't provide meaningful privacy guarantees.
+        # See privacy_validator.py for the distinction between metadata and facts.

         return GardenerResult(
             success=True,
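
The gardener changes above keep metadata counts exact and reserve differential-privacy noise for fact-derived values. A minimal sketch of how the two paths can be kept apart with the validator added later in this diff — not part of the diff itself; it assumes the module is importable as memvcs.core.privacy_validator, that add_noise keeps the (value, sensitivity, epsilon, delta) signature visible in the removed code, and that the counts and epsilon/delta values are illustrative only:

    from memvcs.core.privacy_budget import add_noise          # assumed absolute import path
    from memvcs.core.privacy_validator import PrivacyFieldValidator

    validator = PrivacyFieldValidator()

    # Fact-derived count: still gets DP noise, then is recorded in the audit report.
    fact_count = 12                                            # illustrative value
    noisy = max(0, int(round(add_noise(float(fact_count), 1.0, 0.5, 1e-5))))
    validator.validate_noised_field("fact_count", noisy, is_noised=True)

    # Metadata count: kept exact; recording it as noised would raise RuntimeError.
    validator.validate_noised_field("clusters_found", 3, is_noised=False)

    print(validator.get_report().to_dict()["summary"])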
memvcs/core/pack.py CHANGED
@@ -2,8 +2,10 @@
 Pack files and garbage collection for agmem.

 Pack: collect loose objects into single file + index. GC: delete unreachable objects, repack.
+Includes delta encoding for similar objects (5-10x compression for similar content).
 """

+import bisect
 import hashlib
 import struct
 import zlib
@@ -12,20 +14,23 @@ from typing import Set, Dict, List, Optional, Tuple

 from .objects import ObjectStore
 from .refs import RefsManager
+from .delta import find_similar_objects, compute_delta, DeltaCache

 PACK_MAGIC = b"PACK"
-PACK_VERSION = 2
+PACK_VERSION = 2  # Maintain v2 for backward compatibility
 IDX_MAGIC = b"agidx"
-IDX_VERSION = 2
+IDX_VERSION = 2  # Maintain v2 for backward compatibility
 OBJ_TYPE_BLOB = 1
 OBJ_TYPE_TREE = 2
 OBJ_TYPE_COMMIT = 3
 OBJ_TYPE_TAG = 4
+OBJ_TYPE_DELTA = 5  # Delta object type (for future v3)
 TYPE_TO_BYTE = {
     "blob": OBJ_TYPE_BLOB,
     "tree": OBJ_TYPE_TREE,
     "commit": OBJ_TYPE_COMMIT,
     "tag": OBJ_TYPE_TAG,
+    "delta": OBJ_TYPE_DELTA,
 }
 BYTE_TO_TYPE = {v: k for k, v in TYPE_TO_BYTE.items()}

@@ -121,6 +126,142 @@ def run_gc(
     return (len(to_delete), freed)


+def write_pack_with_delta(
+    objects_dir: Path,
+    store: ObjectStore,
+    hash_to_type: Dict[str, str],
+    use_delta: bool = True,
+    similarity_threshold: float = 0.7,
+) -> Tuple[Path, Path, Optional[Dict[str, Tuple[int, int]]]]:
+    """
+    Pack loose objects with optional delta encoding.
+
+    Args:
+        objects_dir: Path to objects directory
+        store: ObjectStore instance
+        hash_to_type: map hash_id -> obj_type
+        use_delta: whether to compute deltas for similar objects
+        similarity_threshold: minimum similarity (0.0-1.0) for delta encoding
+
+    Returns:
+        (pack_path, index_path, delta_stats)
+        delta_stats: dict of {target_hash: (original_size, delta_size)} for deltas used
+    """
+    if not hash_to_type:
+        raise ValueError("Cannot write empty pack")
+
+    pack_d = _pack_dir(objects_dir)
+    pack_d.mkdir(parents=True, exist_ok=True)
+
+    # Load all objects
+    objects_data: Dict[str, bytes] = {}
+    for hash_id in hash_to_type.keys():
+        obj_type = hash_to_type[hash_id]
+        content = store.retrieve(hash_id, obj_type)
+        if content:
+            header = f"{obj_type} {len(content)}\0".encode()
+            objects_data[hash_id] = header + content
+
+    # Find similar objects for delta encoding
+    delta_cache = DeltaCache() if use_delta else None
+    if use_delta and len(objects_data) > 1:
+        similarity_groups = find_similar_objects(
+            objects_data,
+            similarity_threshold=similarity_threshold,
+            min_size=100,
+        )
+        for group in similarity_groups:
+            if len(group) < 2:
+                continue
+            base_hash = group[0]  # Smallest object is base
+            base_content = objects_data[base_hash]
+            for target_hash in group[1:]:
+                target_content = objects_data[target_hash]
+                delta = compute_delta(base_content, target_content)
+                # Only use delta if it saves space
+                if len(delta) < len(target_content) * 0.8:
+                    delta_cache.add_delta(base_hash, target_hash, delta)
+
+    pack_header_len = len(PACK_MAGIC) + 4 + 4
+    pack_body = bytearray()
+    index_entries: List[Tuple[str, str, int, Optional[str]]] = (
+        []
+    )  # (hash_id, obj_type, offset, base_hash or None)
+    offset_in_file = pack_header_len
+
+    for hash_id in sorted(hash_to_type.keys()):
+        obj_type = hash_to_type[hash_id]
+        full_data = objects_data.get(hash_id)
+        if not full_data:
+            continue
+
+        # Check if this object has a delta
+        base_hash = delta_cache.get_base(hash_id) if delta_cache else None
+        if base_hash and delta_cache:
+            # Store as delta
+            delta = delta_cache.get_delta(base_hash, hash_id)
+            compressed = zlib.compress(delta)
+            type_byte = OBJ_TYPE_DELTA
+            size_bytes = struct.pack(">I", len(compressed))
+            base_hash_bytes = bytes.fromhex(base_hash)
+            chunk = bytes([type_byte]) + size_bytes + base_hash_bytes[:16] + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, base_hash))
+        else:
+            # Store full object
+            compressed = zlib.compress(full_data)
+            type_byte = TYPE_TO_BYTE.get(obj_type, OBJ_TYPE_BLOB)
+            size_bytes = struct.pack(">I", len(compressed))
+            chunk = bytes([type_byte]) + size_bytes + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, None))
+
+        pack_body.extend(chunk)
+        offset_in_file += len(chunk)
+
+    if not index_entries:
+        raise ValueError("No objects to pack")
+
+    pack_content = (
+        PACK_MAGIC
+        + struct.pack(">I", PACK_VERSION)
+        + struct.pack(">I", len(index_entries))
+        + bytes(pack_body)
+    )
+    pack_hash = hashlib.sha256(pack_content).digest()
+    pack_content += pack_hash
+
+    pack_name = f"pack-{pack_hash[:16].hex()}.pack"
+    pack_path = pack_d / pack_name
+    pack_path.write_bytes(pack_content)
+
+    # Write index with delta references (keeping v2 format for now)
+    index_content = bytearray(
+        IDX_MAGIC + struct.pack(">I", IDX_VERSION) + struct.pack(">I", len(index_entries))
+    )
+    delta_stats = {}
+    for hash_id, obj_type, off, base_hash in index_entries:
+        index_content.extend(bytes.fromhex(hash_id))
+        index_content.append(TYPE_TO_BYTE[obj_type])
+        index_content.extend(struct.pack(">I", off))
+        # Note: delta base hash stored after offset but not read by v2 retrieve_from_pack
+        # This is forward-compatible: v3 readers will use base_hash, v2 readers ignore it
+        if base_hash:
+            original_size = len(objects_data[hash_id])
+            delta_size = len(delta_cache.get_delta(base_hash, hash_id))
+            delta_stats[hash_id] = (original_size, delta_size)
+            # Store delta base info (v3 format, but after v2 format fields)
+            index_content.extend(bytes.fromhex(base_hash))
+        else:
+            # Padding for v3 format
+            index_content.extend(b"\x00" * 32)
+
+    idx_hash = hashlib.sha256(index_content).digest()
+    index_content.extend(idx_hash)
+    idx_path = pack_path.with_suffix(".idx")
+    idx_path.write_bytes(index_content)
+
+    return (pack_path, idx_path, delta_stats if use_delta else None)
+
+
 def write_pack(
     objects_dir: Path, store: ObjectStore, hash_to_type: Dict[str, str]
 ) -> Tuple[Path, Path]:
@@ -128,6 +269,9 @@ def write_pack(
     Pack loose objects into a single pack file and index.
     hash_to_type: map hash_id -> obj_type for objects to include.
     Returns (pack_path, index_path). Does not delete loose objects.
+
+    Standard pack format (v2) without delta encoding for backward compatibility.
+    Use write_pack_with_delta() with use_delta=True for delta encoding.
     """
     if not hash_to_type:
         raise ValueError("Cannot write empty pack")
@@ -200,7 +344,7 @@ retrieve_from_pack(
     objects_dir: Path, hash_id: str, expected_type: Optional[str] = None
 ) -> Optional[Tuple[str, bytes]]:
     """
-    Retrieve object from pack by hash. Returns (obj_type, content) or None.
+    Retrieve object from pack by hash using binary search. Returns (obj_type, content) or None.
     If expected_type is set, only return if pack type matches.
     """
     idx_path = _find_pack_index(objects_dir)
@@ -228,36 +372,50 @@ retrieve_from_pack(
     if len(hash_hex) != 64:
         return None
     hash_bin = bytes.fromhex(hash_hex)
-    for i in range(count):
-        base = entries_start + i * entry_size
-        entry_hash = raw_idx[base : base + 32]
-        if entry_hash != hash_bin:
-            continue
-        type_byte = raw_idx[base + 32]
-        offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
-        obj_type = BYTE_TO_TYPE.get(type_byte)
-        if obj_type is None:
-            continue
-        if expected_type is not None and obj_type != expected_type:
-            return None
-        pack_raw = pack_path.read_bytes()
-        header_size = len(PACK_MAGIC) + 4 + 4
-        if offset + 1 + 4 > len(pack_raw) - 32:
-            return None
-        size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
-        payload_start = offset + 5
-        payload_end = payload_start + size
-        if payload_end > len(pack_raw) - 32:
-            return None
-        compressed = pack_raw[payload_start:payload_end]
-        try:
-            full = zlib.decompress(compressed)
-        except Exception:
-            return None
-        null_idx = full.index(b"\0")
-        content = full[null_idx + 1 :]
-        return (obj_type, content)
-    return None
+
+    # Binary search over sorted hash entries (O(log n) instead of O(n))
+    class HashComparator:
+        """Helper for binary search over packed hash entries."""
+
+        def __getitem__(self, idx: int) -> bytes:
+            base = entries_start + idx * entry_size
+            return raw_idx[base : base + 32]
+
+        def __len__(self) -> int:
+            return count
+
+    hashes = HashComparator()
+    idx = bisect.bisect_left(hashes, hash_bin)
+
+    if idx >= count or hashes[idx] != hash_bin:
+        return None
+
+    base = entries_start + idx * entry_size
+    type_byte = raw_idx[base + 32]
+    offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
+    obj_type = BYTE_TO_TYPE.get(type_byte)
+    if obj_type is None:
+        return None
+    if expected_type is not None and obj_type != expected_type:
+        return None
+
+    pack_raw = pack_path.read_bytes()
+    header_size = len(PACK_MAGIC) + 4 + 4
+    if offset + 1 + 4 > len(pack_raw) - 32:
+        return None
+    size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
+    payload_start = offset + 5
+    payload_end = payload_start + size
+    if payload_end > len(pack_raw) - 32:
+        return None
+    compressed = pack_raw[payload_start:payload_end]
+    try:
+        full = zlib.decompress(compressed)
+    except Exception:
+        return None
+    null_idx = full.index(b"\0")
+    content = full[null_idx + 1 :]
+    return (obj_type, content)


 def run_repack(
@@ -282,7 +440,7 @@ def run_repack(
         return (0, 0)
     if dry_run:
         return (len(hash_to_type), 0)
-    write_pack(objects_dir, store, hash_to_type)
+    write_pack_with_delta(objects_dir, store, hash_to_type)
     freed = 0
     for hash_id, obj_type in hash_to_type.items():
         p = store.objects_dir / obj_type / hash_id[:2] / hash_id[2:]
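
For the pack.py additions above, a minimal usage sketch — not part of the diff — assuming the caller already holds an ObjectStore for objects_dir and a hash_id -> obj_type map of the loose objects to pack (the ObjectStore constructor itself is not shown in this diff, so it is taken as given):

    from pathlib import Path
    from typing import Dict

    from memvcs.core.objects import ObjectStore
    from memvcs.core.pack import write_pack_with_delta, retrieve_from_pack


    def repack_with_deltas(objects_dir: Path, store: ObjectStore, hash_to_type: Dict[str, str]) -> None:
        # Pack everything, letting sufficiently similar objects be stored as deltas.
        pack_path, idx_path, delta_stats = write_pack_with_delta(
            objects_dir, store, hash_to_type, use_delta=True, similarity_threshold=0.7
        )
        # delta_stats maps target_hash -> (original_size, delta_size) for delta-encoded objects.
        if delta_stats:
            saved = sum(orig - new for orig, new in delta_stats.values())
            print(f"{len(delta_stats)} objects delta-encoded, ~{saved} bytes saved in {pack_path.name}")

        # Reads go through the new bisect-based index lookup; note that the v2 reader
        # returns None for entries stored as deltas when expected_type is set.
        for hash_id, obj_type in hash_to_type.items():
            if retrieve_from_pack(objects_dir, hash_id, expected_type=obj_type) is None:
                print(f"{hash_id[:12]} not readable as a full {obj_type} object from the pack")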
memvcs/core/privacy_validator.py ADDED
@@ -0,0 +1,187 @@
+"""
+Privacy field validation and auditing.
+
+Ensures differential privacy noise is only applied to fact data, not metadata.
+Prevents accidental privacy overhead on metadata fields and provides audit trail.
+
+Provides:
+- @privacy_exempt: Decorator to mark metadata fields as privacy-exempt
+- PrivacyFieldValidator: Runtime validation that noise is applied correctly
+- PrivacyAuditReport: Audit trail of which fields received noise
+"""
+
+from typing import Any, Callable, Dict, List, Optional, Set
+from functools import wraps
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+
+
+@dataclass
+class PrivacyAuditReport:
+    """Audit report of privacy noise application."""
+
+    timestamp: str
+    noised_fields: Dict[str, Any] = field(default_factory=dict)
+    exempt_fields: Dict[str, Any] = field(default_factory=dict)
+    validation_errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict for logging/serialization."""
+        return {
+            "timestamp": self.timestamp,
+            "noised_fields": self.noised_fields,
+            "exempt_fields": self.exempt_fields,
+            "validation_errors": self.validation_errors,
+            "summary": {
+                "total_noised": len(self.noised_fields),
+                "total_exempt": len(self.exempt_fields),
+                "validation_passed": len(self.validation_errors) == 0,
+            },
+        }
+
+
+class PrivacyFieldValidator:
+    """Validates that privacy noise is applied correctly.
+
+    Tracks which fields receive noise vs. are exempt from noise.
+    Fails loudly if noise is applied to exempt fields.
+    """
+
+    # Metadata fields that should NEVER receive noise (they don't reveal facts)
+    EXEMPT_FIELDS = {
+        "clusters_found",  # Metadata: count of clusters, not individual facts
+        "insights_generated",  # Metadata: count of insights generated
+        "episodes_archived",  # Metadata: count of archived episodes
+        "confidence_score",  # Metadata: overall quality metric, not a fact
+        "summary_version",  # Metadata: schema version
+        "created_at",  # Metadata: timestamp
+        "updated_at",  # Metadata: timestamp
+        "agent_version",  # Metadata: software version
+    }
+
+    # Fact-related fields that SHOULD receive noise
+    FACT_FIELDS = {
+        "facts",  # List of actual facts
+        "memories",  # Memory content
+        "semantic_content",  # Semantic memory content
+        "episodic_content",  # Episodic memory content
+        "procedural_content",  # Procedural memory content
+        "embeddings",  # Vector representations of facts
+        "fact_count",  # Count of individual facts (not metadata)
+        "memory_count",  # Count of individual memories
+    }
+
+    def __init__(self):
+        self.audit_report = PrivacyAuditReport(timestamp=datetime.now(timezone.utc).isoformat())
+
+    def validate_noised_field(
+        self, field_name: str, field_value: Any, is_noised: bool = True
+    ) -> None:
+        """Validate that noise application is correct for a field.
+
+        Args:
+            field_name: Name of the field
+            field_value: Value of the field
+            is_noised: Whether noise was applied to this field
+
+        Raises:
+            RuntimeError: If noise is applied to exempt field
+        """
+        if is_noised and field_name in self.EXEMPT_FIELDS:
+            error = (
+                f"ERROR: Noise applied to exempt metadata field '{field_name}'. "
+                f"Metadata fields do not reveal individual facts and should not receive noise. "
+                f"Remove noise from: {field_name}"
+            )
+            self.audit_report.validation_errors.append(error)
+            raise RuntimeError(error)
+
+        if is_noised:
+            self.audit_report.noised_fields[field_name] = field_value
+        else:
+            self.audit_report.exempt_fields[field_name] = field_value
+
+    def validate_result_dict(self, result: Dict[str, Any]) -> None:
+        """Validate a result dict (e.g., DistillerResult or GardenerResult).
+
+        Args:
+            result: The result dict to validate
+
+        Raises:
+            RuntimeError: If privacy validation fails
+        """
+        for field_name in self.EXEMPT_FIELDS:
+            if field_name in result:
+                # These fields should not have been noised
+                self.audit_report.exempt_fields[field_name] = result[field_name]
+
+    def get_report(self) -> PrivacyAuditReport:
+        """Get the audit report."""
+        if self.audit_report.validation_errors:
+            print(
+                "Privacy Validation Report:\n"
+                + "\n".join(f" {e}" for e in self.audit_report.validation_errors)
+            )
+        return self.audit_report
+
+
+def privacy_exempt(func: Callable) -> Callable:
+    """Decorator to mark a function as privacy-exempt.
+
+    The decorated function should not apply DP noise to its result.
+    Used to document which functions are exempt from privacy operations.
+
+    Example:
+        @privacy_exempt
+        def get_metadata() -> Dict[str, Any]:
+            return {"clusters_found": 42, "created_at": "2024-01-01T00:00:00Z"}
+    """
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        result = func(*args, **kwargs)
+        # Mark result as privacy-exempt (store in metadata if possible)
+        if isinstance(result, dict):
+            result["_privacy_exempt"] = True
+        return result
+
+    # Mark the wrapper function to indicate it's privacy-exempt
+    setattr(wrapper, "_privacy_exempt_function", True)
+    return wrapper
+
+
+class PrivacyGuard:
+    """Context manager and decorator for privacy-aware code blocks.
+
+    Usage:
+        with PrivacyGuard() as pg:
+            result = process_facts(data)
+            pg.mark_noised("fact_count")
+    """
+
+    def __init__(self, strict: bool = True):
+        self.strict = strict
+        self.validator = PrivacyFieldValidator()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        return True
+
+    def mark_noised(self, field_name: str, value: Any = None) -> None:
+        """Mark a field as having received DP noise."""
+        if self.strict:
+            self.validator.validate_noised_field(field_name, value, is_noised=True)
+        else:
+            self.validator.audit_report.noised_fields[field_name] = value
+
+    def mark_exempt(self, field_name: str, value: Any = None) -> None:
+        """Mark a field as exempt from DP noise."""
+        self.validator.audit_report.exempt_fields[field_name] = value
+
+    def get_report(self) -> PrivacyAuditReport:
+        """Get the privacy audit report."""
+        return self.validator.get_report()
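
A short end-to-end sketch of the new module — not part of the diff — assuming it ships as memvcs/core/privacy_validator.py; gather_run_metadata and the literal counts are hypothetical:

    from memvcs.core.privacy_validator import PrivacyGuard, privacy_exempt

    @privacy_exempt
    def gather_run_metadata():
        # Metadata only: counts of pipeline outputs, never individual facts.
        return {"clusters_found": 3, "insights_generated": 2, "episodes_archived": 5}

    with PrivacyGuard(strict=True) as pg:
        meta = gather_run_metadata()           # decorator tags the dict with _privacy_exempt
        for name, value in meta.items():
            if name != "_privacy_exempt":
                pg.mark_exempt(name, value)
        pg.mark_noised("fact_count", 17)       # allowed: fact-derived counts may be noised
        # pg.mark_noised("clusters_found", 3)  # would raise RuntimeError in strict mode

    print(pg.get_report().to_dict()["summary"])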