agmem 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/METADATA +15 -8
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/RECORD +25 -16
- memvcs/__init__.py +1 -1
- memvcs/cli.py +1 -1
- memvcs/commands/daemon.py +37 -1
- memvcs/commands/distill.py +6 -0
- memvcs/coordinator/__init__.py +5 -0
- memvcs/coordinator/server.py +239 -0
- memvcs/core/compression_metrics.py +248 -0
- memvcs/core/delta.py +258 -0
- memvcs/core/distiller.py +76 -61
- memvcs/core/fast_similarity.py +404 -0
- memvcs/core/federated.py +13 -2
- memvcs/core/gardener.py +8 -68
- memvcs/core/pack.py +192 -34
- memvcs/core/privacy_validator.py +187 -0
- memvcs/core/protocol_builder.py +198 -0
- memvcs/core/remote.py +82 -2
- memvcs/core/zk_proofs.py +62 -5
- memvcs/health/__init__.py +25 -0
- memvcs/health/monitor.py +452 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/WHEEL +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/entry_points.txt +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/top_level.txt +0 -0
memvcs/core/gardener.py
CHANGED
@@ -354,28 +354,11 @@ class Gardener:
         except ValueError:
             insight_path = self.semantic_dir / f"insight-{timestamp}.md"

-        # Generate frontmatter
+        # Generate frontmatter
         source_episodes = len(cluster.episodes)
-
-
-
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            source_episodes = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(source_episodes),
-                            1.0,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: source_episodes is a metadata count (number of episodes
+        # contributing to this insight), not an individual fact. Adding noise to metadata
+        # doesn't provide meaningful privacy guarantees. See privacy_validator.py.
         frontmatter = {
             "schema_version": "1.0",
             "last_updated": datetime.utcnow().isoformat() + "Z",

@@ -514,53 +497,10 @@ class Gardener:
         clusters_found = len(clusters)
         insights_generated = insights_written
         episodes_archived = archived_count
-
-
-
-
-        ):
-            from .privacy_budget import add_noise
-
-            sensitivity = 1.0
-            clusters_found = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(clusters_found),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            insights_generated = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(insights_generated),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            episodes_archived = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(episodes_archived),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: clusters_found, insights_generated, and
+        # episodes_archived are metadata counts, not individual facts.
+        # Adding noise to these doesn't provide meaningful privacy guarantees.
+        # See privacy_validator.py for the distinction between metadata and facts.

         return GardenerResult(
             success=True,
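The change above keeps run metadata exact while leaving DP noise for fact-level values. A minimal sketch of that split follows; it is illustrative only, not code from the package: add_noise is assumed to keep the (value, sensitivity, epsilon, delta) signature visible in the removed block, and PrivacyGuard comes from the new privacy_validator.py added later in this diff.

# Illustrative sketch: noise fact-level counts, record metadata counts as exempt.
from memvcs.core.privacy_budget import add_noise  # assumed signature: (value, sensitivity, epsilon, delta)
from memvcs.core.privacy_validator import PrivacyGuard

def build_report(fact_count: int, clusters_found: int, epsilon: float, delta: float) -> dict:
    with PrivacyGuard(strict=True) as guard:
        # fact_count describes individual memories, so it receives DP noise
        noisy_fact_count = max(0, int(round(add_noise(float(fact_count), 1.0, epsilon, delta))))
        guard.mark_noised("fact_count", noisy_fact_count)
        # clusters_found is run metadata; it stays exact and is only recorded as exempt
        guard.mark_exempt("clusters_found", clusters_found)
    return {
        "fact_count": noisy_fact_count,
        "clusters_found": clusters_found,
        "audit": guard.get_report().to_dict(),
    }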
memvcs/core/pack.py
CHANGED
@@ -2,8 +2,10 @@
 Pack files and garbage collection for agmem.

 Pack: collect loose objects into single file + index. GC: delete unreachable objects, repack.
+Includes delta encoding for similar objects (5-10x compression for similar content).
 """

+import bisect
 import hashlib
 import struct
 import zlib

@@ -12,20 +14,23 @@ from typing import Set, Dict, List, Optional, Tuple

 from .objects import ObjectStore
 from .refs import RefsManager
+from .delta import find_similar_objects, compute_delta, DeltaCache

 PACK_MAGIC = b"PACK"
-PACK_VERSION = 2
+PACK_VERSION = 2  # Maintain v2 for backward compatibility
 IDX_MAGIC = b"agidx"
-IDX_VERSION = 2
+IDX_VERSION = 2  # Maintain v2 for backward compatibility
 OBJ_TYPE_BLOB = 1
 OBJ_TYPE_TREE = 2
 OBJ_TYPE_COMMIT = 3
 OBJ_TYPE_TAG = 4
+OBJ_TYPE_DELTA = 5  # Delta object type (for future v3)
 TYPE_TO_BYTE = {
     "blob": OBJ_TYPE_BLOB,
     "tree": OBJ_TYPE_TREE,
     "commit": OBJ_TYPE_COMMIT,
     "tag": OBJ_TYPE_TAG,
+    "delta": OBJ_TYPE_DELTA,
 }
 BYTE_TO_TYPE = {v: k for k, v in TYPE_TO_BYTE.items()}

@@ -121,6 +126,142 @@ def run_gc(
     return (len(to_delete), freed)


+def write_pack_with_delta(
+    objects_dir: Path,
+    store: ObjectStore,
+    hash_to_type: Dict[str, str],
+    use_delta: bool = True,
+    similarity_threshold: float = 0.7,
+) -> Tuple[Path, Path, Optional[Dict[str, Tuple[int, int]]]]:
+    """
+    Pack loose objects with optional delta encoding.
+
+    Args:
+        objects_dir: Path to objects directory
+        store: ObjectStore instance
+        hash_to_type: map hash_id -> obj_type
+        use_delta: whether to compute deltas for similar objects
+        similarity_threshold: minimum similarity (0.0-1.0) for delta encoding
+
+    Returns:
+        (pack_path, index_path, delta_stats)
+        delta_stats: dict of {target_hash: (original_size, delta_size)} for deltas used
+    """
+    if not hash_to_type:
+        raise ValueError("Cannot write empty pack")
+
+    pack_d = _pack_dir(objects_dir)
+    pack_d.mkdir(parents=True, exist_ok=True)
+
+    # Load all objects
+    objects_data: Dict[str, bytes] = {}
+    for hash_id in hash_to_type.keys():
+        obj_type = hash_to_type[hash_id]
+        content = store.retrieve(hash_id, obj_type)
+        if content:
+            header = f"{obj_type} {len(content)}\0".encode()
+            objects_data[hash_id] = header + content
+
+    # Find similar objects for delta encoding
+    delta_cache = DeltaCache() if use_delta else None
+    if use_delta and len(objects_data) > 1:
+        similarity_groups = find_similar_objects(
+            objects_data,
+            similarity_threshold=similarity_threshold,
+            min_size=100,
+        )
+        for group in similarity_groups:
+            if len(group) < 2:
+                continue
+            base_hash = group[0]  # Smallest object is base
+            base_content = objects_data[base_hash]
+            for target_hash in group[1:]:
+                target_content = objects_data[target_hash]
+                delta = compute_delta(base_content, target_content)
+                # Only use delta if it saves space
+                if len(delta) < len(target_content) * 0.8:
+                    delta_cache.add_delta(base_hash, target_hash, delta)
+
+    pack_header_len = len(PACK_MAGIC) + 4 + 4
+    pack_body = bytearray()
+    index_entries: List[Tuple[str, str, int, Optional[str]]] = (
+        []
+    )  # (hash_id, obj_type, offset, base_hash or None)
+    offset_in_file = pack_header_len
+
+    for hash_id in sorted(hash_to_type.keys()):
+        obj_type = hash_to_type[hash_id]
+        full_data = objects_data.get(hash_id)
+        if not full_data:
+            continue
+
+        # Check if this object has a delta
+        base_hash = delta_cache.get_base(hash_id) if delta_cache else None
+        if base_hash and delta_cache:
+            # Store as delta
+            delta = delta_cache.get_delta(base_hash, hash_id)
+            compressed = zlib.compress(delta)
+            type_byte = OBJ_TYPE_DELTA
+            size_bytes = struct.pack(">I", len(compressed))
+            base_hash_bytes = bytes.fromhex(base_hash)
+            chunk = bytes([type_byte]) + size_bytes + base_hash_bytes[:16] + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, base_hash))
+        else:
+            # Store full object
+            compressed = zlib.compress(full_data)
+            type_byte = TYPE_TO_BYTE.get(obj_type, OBJ_TYPE_BLOB)
+            size_bytes = struct.pack(">I", len(compressed))
+            chunk = bytes([type_byte]) + size_bytes + compressed
+            index_entries.append((hash_id, obj_type, offset_in_file, None))
+
+        pack_body.extend(chunk)
+        offset_in_file += len(chunk)
+
+    if not index_entries:
+        raise ValueError("No objects to pack")
+
+    pack_content = (
+        PACK_MAGIC
+        + struct.pack(">I", PACK_VERSION)
+        + struct.pack(">I", len(index_entries))
+        + bytes(pack_body)
+    )
+    pack_hash = hashlib.sha256(pack_content).digest()
+    pack_content += pack_hash
+
+    pack_name = f"pack-{pack_hash[:16].hex()}.pack"
+    pack_path = pack_d / pack_name
+    pack_path.write_bytes(pack_content)
+
+    # Write index with delta references (keeping v2 format for now)
+    index_content = bytearray(
+        IDX_MAGIC + struct.pack(">I", IDX_VERSION) + struct.pack(">I", len(index_entries))
+    )
+    delta_stats = {}
+    for hash_id, obj_type, off, base_hash in index_entries:
+        index_content.extend(bytes.fromhex(hash_id))
+        index_content.append(TYPE_TO_BYTE[obj_type])
+        index_content.extend(struct.pack(">I", off))
+        # Note: delta base hash stored after offset but not read by v2 retrieve_from_pack
+        # This is forward-compatible: v3 readers will use base_hash, v2 readers ignore it
+        if base_hash:
+            original_size = len(objects_data[hash_id])
+            delta_size = len(delta_cache.get_delta(base_hash, hash_id))
+            delta_stats[hash_id] = (original_size, delta_size)
+            # Store delta base info (v3 format, but after v2 format fields)
+            index_content.extend(bytes.fromhex(base_hash))
+        else:
+            # Padding for v3 format
+            index_content.extend(b"\x00" * 32)
+
+    idx_hash = hashlib.sha256(index_content).digest()
+    index_content.extend(idx_hash)
+    idx_path = pack_path.with_suffix(".idx")
+    idx_path.write_bytes(index_content)
+
+    return (pack_path, idx_path, delta_stats if use_delta else None)
+
+
 def write_pack(
     objects_dir: Path, store: ObjectStore, hash_to_type: Dict[str, str]
 ) -> Tuple[Path, Path]:
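A short usage sketch of the new delta-aware packer, based only on the signature and return value shown in the hunk above; how objects_dir, store, and hash_to_type are obtained in real callers is not shown here and is assumed.

# Hypothetical caller: repack loose objects and report delta savings.
from memvcs.core.pack import write_pack_with_delta

def repack_with_report(objects_dir, store, hash_to_type):
    pack_path, idx_path, delta_stats = write_pack_with_delta(
        objects_dir, store, hash_to_type, use_delta=True, similarity_threshold=0.7
    )
    if delta_stats:
        saved = sum(orig - delta for orig, delta in delta_stats.values())
        print(f"{len(delta_stats)} objects stored as deltas, ~{saved} bytes saved")
    return pack_path, idx_path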
@@ -128,6 +269,9 @@ def write_pack(
     Pack loose objects into a single pack file and index.
     hash_to_type: map hash_id -> obj_type for objects to include.
     Returns (pack_path, index_path). Does not delete loose objects.
+
+    Standard pack format (v2) without delta encoding for backward compatibility.
+    Use write_pack_with_delta() with use_delta=True for delta encoding.
     """
     if not hash_to_type:
         raise ValueError("Cannot write empty pack")

@@ -200,7 +344,7 @@ def retrieve_from_pack(
     objects_dir: Path, hash_id: str, expected_type: Optional[str] = None
 ) -> Optional[Tuple[str, bytes]]:
     """
-    Retrieve object from pack by hash. Returns (obj_type, content) or None.
+    Retrieve object from pack by hash using binary search. Returns (obj_type, content) or None.
     If expected_type is set, only return if pack type matches.
     """
     idx_path = _find_pack_index(objects_dir)

@@ -228,36 +372,50 @@ def retrieve_from_pack(
     if len(hash_hex) != 64:
         return None
     hash_bin = bytes.fromhex(hash_hex)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    # Binary search over sorted hash entries (O(log n) instead of O(n))
+    class HashComparator:
+        """Helper for binary search over packed hash entries."""
+
+        def __getitem__(self, idx: int) -> bytes:
+            base = entries_start + idx * entry_size
+            return raw_idx[base : base + 32]
+
+        def __len__(self) -> int:
+            return count
+
+    hashes = HashComparator()
+    idx = bisect.bisect_left(hashes, hash_bin)
+
+    if idx >= count or hashes[idx] != hash_bin:
+        return None
+
+    base = entries_start + idx * entry_size
+    type_byte = raw_idx[base + 32]
+    offset = struct.unpack(">I", raw_idx[base + 33 : base + 37])[0]
+    obj_type = BYTE_TO_TYPE.get(type_byte)
+    if obj_type is None:
+        return None
+    if expected_type is not None and obj_type != expected_type:
+        return None
+
+    pack_raw = pack_path.read_bytes()
+    header_size = len(PACK_MAGIC) + 4 + 4
+    if offset + 1 + 4 > len(pack_raw) - 32:
+        return None
+    size = struct.unpack(">I", pack_raw[offset + 1 : offset + 5])[0]
+    payload_start = offset + 5
+    payload_end = payload_start + size
+    if payload_end > len(pack_raw) - 32:
+        return None
+    compressed = pack_raw[payload_start:payload_end]
+    try:
+        full = zlib.decompress(compressed)
+    except Exception:
+        return None
+    null_idx = full.index(b"\0")
+    content = full[null_idx + 1 :]
+    return (obj_type, content)


 def run_repack(

@@ -282,7 +440,7 @@ def run_repack(
         return (0, 0)
     if dry_run:
         return (len(hash_to_type), 0)
-
+    write_pack_with_delta(objects_dir, store, hash_to_type)
    freed = 0
     for hash_id, obj_type in hash_to_type.items():
         p = store.objects_dir / obj_type / hash_id[:2] / hash_id[2:]
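The lookup above leans on a detail of Python's bisect module: bisect_left accepts any object that implements __getitem__ and __len__, so it can binary-search fixed-width records in a bytes buffer without building a list of hashes. A minimal standalone sketch of that technique follows; it uses a synthetic record layout (32-byte hash plus 5-byte payload), not the agmem index format.

# Toy demonstration of bisect over fixed-width records in a bytes buffer.
import bisect
import hashlib

ENTRY_SIZE = 37  # 32-byte hash + 5-byte payload per record
keys = sorted(hashlib.sha256(bytes([i])).digest() for i in range(100))
raw_idx = b"".join(k + b"\x00" * 5 for k in keys)

class Records:
    """Expose the packed records to bisect as a read-only sequence of hashes."""

    def __getitem__(self, i: int) -> bytes:
        return raw_idx[i * ENTRY_SIZE : i * ENTRY_SIZE + 32]

    def __len__(self) -> int:
        return len(raw_idx) // ENTRY_SIZE

needle = hashlib.sha256(bytes([42])).digest()
records = Records()
pos = bisect.bisect_left(records, needle)
print(pos < len(records) and records[pos] == needle)  # True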
memvcs/core/privacy_validator.py
ADDED
@@ -0,0 +1,187 @@
+"""
+Privacy field validation and auditing.
+
+Ensures differential privacy noise is only applied to fact data, not metadata.
+Prevents accidental privacy overhead on metadata fields and provides audit trail.
+
+Provides:
+- @privacy_exempt: Decorator to mark metadata fields as privacy-exempt
+- PrivacyFieldValidator: Runtime validation that noise is applied correctly
+- PrivacyAuditReport: Audit trail of which fields received noise
+"""
+
+from typing import Any, Callable, Dict, List, Optional, Set
+from functools import wraps
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+
+
+@dataclass
+class PrivacyAuditReport:
+    """Audit report of privacy noise application."""
+
+    timestamp: str
+    noised_fields: Dict[str, Any] = field(default_factory=dict)
+    exempt_fields: Dict[str, Any] = field(default_factory=dict)
+    validation_errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert to dict for logging/serialization."""
+        return {
+            "timestamp": self.timestamp,
+            "noised_fields": self.noised_fields,
+            "exempt_fields": self.exempt_fields,
+            "validation_errors": self.validation_errors,
+            "summary": {
+                "total_noised": len(self.noised_fields),
+                "total_exempt": len(self.exempt_fields),
+                "validation_passed": len(self.validation_errors) == 0,
+            },
+        }
+
+
+class PrivacyFieldValidator:
+    """Validates that privacy noise is applied correctly.
+
+    Tracks which fields receive noise vs. are exempt from noise.
+    Fails loudly if noise is applied to exempt fields.
+    """
+
+    # Metadata fields that should NEVER receive noise (they don't reveal facts)
+    EXEMPT_FIELDS = {
+        "clusters_found",  # Metadata: count of clusters, not individual facts
+        "insights_generated",  # Metadata: count of insights generated
+        "episodes_archived",  # Metadata: count of archived episodes
+        "confidence_score",  # Metadata: overall quality metric, not a fact
+        "summary_version",  # Metadata: schema version
+        "created_at",  # Metadata: timestamp
+        "updated_at",  # Metadata: timestamp
+        "agent_version",  # Metadata: software version
+    }
+
+    # Fact-related fields that SHOULD receive noise
+    FACT_FIELDS = {
+        "facts",  # List of actual facts
+        "memories",  # Memory content
+        "semantic_content",  # Semantic memory content
+        "episodic_content",  # Episodic memory content
+        "procedural_content",  # Procedural memory content
+        "embeddings",  # Vector representations of facts
+        "fact_count",  # Count of individual facts (not metadata)
+        "memory_count",  # Count of individual memories
+    }
+
+    def __init__(self):
+        self.audit_report = PrivacyAuditReport(timestamp=datetime.now(timezone.utc).isoformat())
+
+    def validate_noised_field(
+        self, field_name: str, field_value: Any, is_noised: bool = True
+    ) -> None:
+        """Validate that noise application is correct for a field.
+
+        Args:
+            field_name: Name of the field
+            field_value: Value of the field
+            is_noised: Whether noise was applied to this field
+
+        Raises:
+            RuntimeError: If noise is applied to exempt field
+        """
+        if is_noised and field_name in self.EXEMPT_FIELDS:
+            error = (
+                f"ERROR: Noise applied to exempt metadata field '{field_name}'. "
+                f"Metadata fields do not reveal individual facts and should not receive noise. "
+                f"Remove noise from: {field_name}"
+            )
+            self.audit_report.validation_errors.append(error)
+            raise RuntimeError(error)
+
+        if is_noised:
+            self.audit_report.noised_fields[field_name] = field_value
+        else:
+            self.audit_report.exempt_fields[field_name] = field_value
+
+    def validate_result_dict(self, result: Dict[str, Any]) -> None:
+        """Validate a result dict (e.g., DistillerResult or GardenerResult).
+
+        Args:
+            result: The result dict to validate
+
+        Raises:
+            RuntimeError: If privacy validation fails
+        """
+        for field_name in self.EXEMPT_FIELDS:
+            if field_name in result:
+                # These fields should not have been noised
+                self.audit_report.exempt_fields[field_name] = result[field_name]
+
+    def get_report(self) -> PrivacyAuditReport:
+        """Get the audit report."""
+        if self.audit_report.validation_errors:
+            print(
+                "Privacy Validation Report:\n"
+                + "\n".join(f"  {e}" for e in self.audit_report.validation_errors)
+            )
+        return self.audit_report
+
+
+def privacy_exempt(func: Callable) -> Callable:
+    """Decorator to mark a function as privacy-exempt.
+
+    The decorated function should not apply DP noise to its result.
+    Used to document which functions are exempt from privacy operations.
+
+    Example:
+        @privacy_exempt
+        def get_metadata() -> Dict[str, Any]:
+            return {"clusters_found": 42, "created_at": "2024-01-01T00:00:00Z"}
+    """
+
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        result = func(*args, **kwargs)
+        # Mark result as privacy-exempt (store in metadata if possible)
+        if isinstance(result, dict):
+            result["_privacy_exempt"] = True
+        return result
+
+    # Mark the wrapper function to indicate it's privacy-exempt
+    setattr(wrapper, "_privacy_exempt_function", True)
+    return wrapper
+
+
+class PrivacyGuard:
+    """Context manager and decorator for privacy-aware code blocks.
+
+    Usage:
+        with PrivacyGuard() as pg:
+            result = process_facts(data)
+            pg.mark_noised("fact_count")
+    """
+
+    def __init__(self, strict: bool = True):
+        self.strict = strict
+        self.validator = PrivacyFieldValidator()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        return True
+
+    def mark_noised(self, field_name: str, value: Any = None) -> None:
+        """Mark a field as having received DP noise."""
+        if self.strict:
+            self.validator.validate_noised_field(field_name, value, is_noised=True)
+        else:
+            self.validator.audit_report.noised_fields[field_name] = value
+
+    def mark_exempt(self, field_name: str, value: Any = None) -> None:
+        """Mark a field as exempt from DP noise."""
+        self.validator.audit_report.exempt_fields[field_name] = value
+
+    def get_report(self) -> PrivacyAuditReport:
+        """Get the privacy audit report."""
+        return self.validator.get_report()
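Taken together, the decorator, validator, and guard are intended to be used along the following lines. This is an illustrative sketch, not code from the package: the calling function and field values are hypothetical, while the field names follow EXEMPT_FIELDS and FACT_FIELDS above.

# Hypothetical usage of the privacy_validator API.
from memvcs.core.privacy_validator import PrivacyFieldValidator, privacy_exempt

@privacy_exempt
def gardener_run_metadata() -> dict:
    # Run metadata: exact counts, never noised; the decorator tags the dict with _privacy_exempt=True.
    return {"clusters_found": 3, "insights_generated": 2, "episodes_archived": 17}

validator = PrivacyFieldValidator()
validator.validate_noised_field("fact_count", 42, is_noised=True)  # OK: fact-level field
validator.validate_result_dict(gardener_run_metadata())  # records exempt metadata fields
# validator.validate_noised_field("clusters_found", 3, is_noised=True)  # would raise RuntimeError

report = validator.get_report()
print(report.to_dict()["summary"])  # {'total_noised': 1, 'total_exempt': 3, 'validation_passed': True}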