agmem 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,187 @@
1
+ """
2
+ Privacy field validation and auditing.
3
+
4
+ Ensures differential privacy noise is only applied to fact data, not metadata.
5
+ Prevents accidental privacy overhead on metadata fields and provides audit trail.
6
+
7
+ Provides:
8
+ - @privacy_exempt: Decorator to mark metadata fields as privacy-exempt
9
+ - PrivacyFieldValidator: Runtime validation that noise is applied correctly
10
+ - PrivacyAuditReport: Audit trail of which fields received noise
11
+ """
12
+
13
+ from typing import Any, Callable, Dict, List, Optional, Set
14
+ from functools import wraps
15
+ from dataclasses import dataclass, field
16
+ from datetime import datetime, timezone
17
+
18
+
19
@dataclass
class PrivacyAuditReport:
    """Audit trail recording which fields received DP noise and which were exempt."""

    # ISO-8601 timestamp of when the audit began.
    timestamp: str
    # Field name -> value, for fields that received noise.
    noised_fields: Dict[str, Any] = field(default_factory=dict)
    # Field name -> value, for fields exempt from noise.
    exempt_fields: Dict[str, Any] = field(default_factory=dict)
    # Human-readable error messages collected during validation.
    validation_errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the report, plus a computed summary, for logging."""
        summary = {
            "total_noised": len(self.noised_fields),
            "total_exempt": len(self.exempt_fields),
            "validation_passed": not self.validation_errors,
        }
        return {
            "timestamp": self.timestamp,
            "noised_fields": self.noised_fields,
            "exempt_fields": self.exempt_fields,
            "validation_errors": self.validation_errors,
            "summary": summary,
        }
41
+
42
+
43
class PrivacyFieldValidator:
    """Runtime checker that DP noise lands only on fact fields.

    Records every reported field as noised or exempt in a
    PrivacyAuditReport, and fails loudly (RuntimeError) if noise was
    applied to a metadata field that should never receive it.
    """

    # Metadata fields that must NEVER receive noise: they describe the
    # summarization process itself, not individual facts.
    EXEMPT_FIELDS = {
        "clusters_found",      # count of clusters, not individual facts
        "insights_generated",  # count of insights generated
        "episodes_archived",   # count of archived episodes
        "confidence_score",    # overall quality metric, not a fact
        "summary_version",     # schema version
        "created_at",          # timestamp
        "updated_at",          # timestamp
        "agent_version",       # software version
    }

    # Fact-bearing fields that SHOULD receive noise.
    FACT_FIELDS = {
        "facts",               # list of actual facts
        "memories",            # memory content
        "semantic_content",    # semantic memory content
        "episodic_content",    # episodic memory content
        "procedural_content",  # procedural memory content
        "embeddings",          # vector representations of facts
        "fact_count",          # count of individual facts (not metadata)
        "memory_count",        # count of individual memories
    }

    def __init__(self) -> None:
        now = datetime.now(timezone.utc).isoformat()
        self.audit_report = PrivacyAuditReport(timestamp=now)

    def validate_noised_field(
        self, field_name: str, field_value: Any, is_noised: bool = True
    ) -> None:
        """Record one field's noise status; reject noise on exempt fields.

        Args:
            field_name: Name of the field.
            field_value: Value of the field.
            is_noised: Whether noise was applied to this field.

        Raises:
            RuntimeError: If noise was applied to an exempt metadata field.
        """
        if not is_noised:
            self.audit_report.exempt_fields[field_name] = field_value
            return

        if field_name in self.EXEMPT_FIELDS:
            error = (
                f"ERROR: Noise applied to exempt metadata field '{field_name}'. "
                f"Metadata fields do not reveal individual facts and should not receive noise. "
                f"Remove noise from: {field_name}"
            )
            # Record the failure in the report before raising so the audit
            # trail reflects it even if the caller catches the exception.
            self.audit_report.validation_errors.append(error)
            raise RuntimeError(error)

        self.audit_report.noised_fields[field_name] = field_value

    def validate_result_dict(self, result: Dict[str, Any]) -> None:
        """Record which known-exempt fields are present in *result*.

        Args:
            result: The result dict to validate (e.g. DistillerResult or
                GardenerResult).
        """
        # Only the exempt fields actually present in the result are recorded.
        for name in self.EXEMPT_FIELDS & result.keys():
            self.audit_report.exempt_fields[name] = result[name]

    def get_report(self) -> PrivacyAuditReport:
        """Return the audit report, printing any validation errors first."""
        errors = self.audit_report.validation_errors
        if errors:
            body = "\n".join(f"  {e}" for e in errors)
            print(f"Privacy Validation Report:\n{body}")
        return self.audit_report
126
+
127
+
128
def privacy_exempt(func: Callable) -> Callable:
    """Mark a function as exempt from differential-privacy noise.

    The wrapped function's result is passed through unchanged, except
    that dict results gain a ``"_privacy_exempt": True`` marker key.
    The wrapper itself carries a ``_privacy_exempt_function`` attribute
    so tooling can discover exempt functions.

    Example:
        @privacy_exempt
        def get_metadata() -> Dict[str, Any]:
            return {"clusters_found": 42, "created_at": "2024-01-01T00:00:00Z"}
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        outcome = func(*args, **kwargs)
        if isinstance(outcome, dict):
            # Tag dict results in place so downstream consumers can tell
            # no noise was (or should be) applied.
            outcome["_privacy_exempt"] = True
        return outcome

    # Discoverable marker on the wrapper function itself.
    wrapper._privacy_exempt_function = True
    return wrapper
151
+
152
+
153
class PrivacyGuard:
    """Context manager for privacy-aware code blocks.

    Usage:
        with PrivacyGuard() as pg:
            result = process_facts(data)
            pg.mark_noised("fact_count")
    """

    def __init__(self, strict: bool = True):
        # strict=True routes mark_noised through full validation (which
        # raises on exempt fields); strict=False records without checking.
        self.strict = strict
        self.validator = PrivacyFieldValidator()

    def __enter__(self) -> "PrivacyGuard":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        # Never suppress an exception raised inside the block.
        return exc_type is None

    def mark_noised(self, field_name: str, value: Any = None) -> None:
        """Record that *field_name* received DP noise (validated when strict)."""
        if not self.strict:
            self.validator.audit_report.noised_fields[field_name] = value
        else:
            self.validator.validate_noised_field(field_name, value, is_noised=True)

    def mark_exempt(self, field_name: str, value: Any = None) -> None:
        """Record that *field_name* was exempt from DP noise."""
        self.validator.audit_report.exempt_fields[field_name] = value

    def get_report(self) -> PrivacyAuditReport:
        """Return the underlying validator's audit report."""
        return self.validator.get_report()
@@ -0,0 +1,327 @@
1
+ """
2
+ Privacy-Preserving Search - Secure search with encryption and differential privacy.
3
+
4
+ This module provides:
5
+ - Encrypted search indices
6
+ - Differential privacy for queries
7
+ - Access control integration
8
+ - Secure search result handling
9
+ """
10
+
11
+ import hashlib
12
+ import hmac
13
+ import json
14
+ import os
15
+ import secrets
16
+ from dataclasses import dataclass, field
17
+ from datetime import datetime, timezone
18
+ from pathlib import Path
19
+ from typing import Any, Dict, List, Optional, Set, Tuple
20
+
21
+
22
@dataclass
class SearchQuery:
    """A search request together with its privacy metadata."""

    query: str                     # raw query text
    requester_id: str              # identity of the caller, used for ACL checks
    # One of "public", "normal", "sensitive", "secret".
    privacy_level: str = "normal"
    max_results: int = 10
    include_content: bool = False  # whether snippets may be returned

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the query for logging/transport."""
        names = ("query", "requester_id", "privacy_level", "max_results", "include_content")
        return {name: getattr(self, name) for name in names}
40
+
41
+
42
@dataclass
class SecureSearchResult:
    """A single search hit with privacy handling applied."""

    path: str
    score: float
    snippet: Optional[str] = None
    accessed_at: Optional[str] = None  # set only for non-redacted hits
    privacy_level: str = "normal"
    redacted: bool = False             # True when access control denied content

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the result for output.

        NOTE(review): accessed_at is deliberately absent from this dict —
        presumably to keep access timestamps out of user-facing payloads;
        confirm intent before adding it.
        """
        return {
            "path": self.path,
            "score": self.score,
            "snippet": self.snippet,
            "privacy_level": self.privacy_level,
            "redacted": self.redacted,
        }
61
+
62
+
63
class SearchTokenizer:
    """Tokenizes search terms and hashes them under a secret key.

    Keyed hashes allow "blind" index matching without storing plaintext terms.
    """

    def __init__(self, secret_key: Optional[bytes] = None):
        # A fresh random 32-byte key is generated when none is supplied.
        self.secret_key = secret_key or secrets.token_bytes(32)

    def tokenize(self, text: str) -> List[str]:
        """Lower-case *text* and return word tokens at least 3 chars long."""
        import re

        return [w for w in re.findall(r"\b\w+\b", text.lower()) if len(w) >= 3]

    def hash_token(self, token: str) -> str:
        """Return a 16-hex-char keyed HMAC-SHA256 digest of *token*."""
        mac = hmac.new(self.secret_key, token.encode(), hashlib.sha256)
        return mac.hexdigest()[:16]

    def tokenize_and_hash(self, text: str) -> List[str]:
        """Tokenize *text*, then hash every resulting term."""
        return [self.hash_token(tok) for tok in self.tokenize(text)]
85
+
86
+
87
class AccessControl:
    """Controls access to search results based on per-file ACL entries.

    ACLs are persisted as JSON in ``<mem_dir>/search_acl.json``; a missing
    or unreadable file is treated as an empty ACL (everything public).
    """

    # Clearance ordering from least to most privileged.
    _LEVEL_ORDER = ["public", "normal", "sensitive", "secret"]

    def __init__(self, mem_dir: Path):
        self.mem_dir = Path(mem_dir)
        self.acl_file = self.mem_dir / "search_acl.json"
        # path -> {"allowed_users": [...], "privacy_level": str, "updated_at": str}
        self._acl: Dict[str, Dict[str, Any]] = {}
        self._load()

    def _load(self) -> None:
        """Load the ACL from disk; corrupt/unreadable files leave it empty.

        Narrowed from a bare ``except Exception`` so programming errors are
        no longer swallowed; ValueError also covers json.JSONDecodeError and
        UnicodeDecodeError. A JSON payload that is not an object (e.g. a
        list) is ignored rather than poisoning later dict operations.
        """
        if not self.acl_file.exists():
            return
        try:
            data = json.loads(self.acl_file.read_text())
        except (OSError, ValueError):
            # Best-effort load: treat an unreadable/corrupt ACL as empty.
            return
        if isinstance(data, dict):
            self._acl = data

    def _save(self) -> None:
        """Persist the ACL to disk, creating the directory if needed."""
        self.mem_dir.mkdir(parents=True, exist_ok=True)
        self.acl_file.write_text(json.dumps(self._acl, indent=2))

    def set_file_access(
        self,
        path: str,
        allowed_users: List[str],
        privacy_level: str = "normal",
    ) -> None:
        """Set (and immediately persist) access control for *path*."""
        self._acl[path] = {
            "allowed_users": allowed_users,
            "privacy_level": privacy_level,
            "updated_at": datetime.now(timezone.utc).isoformat(),
        }
        self._save()

    def can_access(self, path: str, user_id: str, user_level: str = "normal") -> bool:
        """Return True if *user_id* at *user_level* clearance may access *path*.

        Files without an ACL entry are treated as publicly accessible.
        Unknown privacy levels deny access (fail closed).
        """
        acl = self._acl.get(path)
        if not acl:
            return True  # no ACL entry == public access

        # A non-empty allowed_users list is an explicit allow-list.
        allowed = acl.get("allowed_users")
        if allowed and user_id not in allowed:
            return False

        file_level = acl.get("privacy_level", "normal")
        try:
            return self._LEVEL_ORDER.index(user_level) >= self._LEVEL_ORDER.index(file_level)
        except ValueError:
            return False  # unrecognized level on either side: fail closed

    def get_file_acl(self, path: str) -> Optional[Dict[str, Any]]:
        """Return the raw ACL entry for *path*, or None if absent."""
        return self._acl.get(path)
148
+
149
+
150
class DifferentialPrivacyNoise:
    """Adds differential-privacy noise and obfuscation to search results.

    Args:
        epsilon: DP privacy budget; smaller epsilon means more noise.
    """

    def __init__(self, epsilon: float = 0.1):
        self.epsilon = epsilon

    def add_laplace_noise(self, value: float, sensitivity: float = 1.0) -> float:
        """Return *value* plus Laplace(0, sensitivity/epsilon) noise.

        BUGFIX: the previous implementation dropped the logarithm from the
        Laplace inverse-CDF (``-scale*sign(u)*(1 - 2|u|)`` instead of
        ``-scale*sign(u)*ln(1 - 2|u|)``), so its output was uniformly
        bounded by the scale rather than Laplace-distributed. The noise is
        now drawn as the difference of two Exp(1) variates scaled by
        ``sensitivity/epsilon``, which is exactly Laplace(0, scale).

        NOTE(review): ``random`` is not a cryptographically secure source;
        for adversarial settings consider ``random.SystemRandom``.
        """
        import random

        scale = sensitivity / self.epsilon
        # X - Y with X, Y ~ Exp(1) is Laplace(0, 1); scale it up.
        noise = scale * (random.expovariate(1.0) - random.expovariate(1.0))
        return value + noise

    def randomize_order(self, results: List[Any], threshold: float = 0.8) -> List[Any]:
        """Shuffle runs of similar-scoring results to blur exact ranking.

        Consecutive results whose scores differ by less than *threshold*
        are grouped together, and each group is shuffled before the groups
        are concatenated back in order.
        """
        import random

        groups: List[List[Any]] = []
        current_group: List[Any] = []
        prev_score = None

        for r in results:
            # Support both attribute-style and dict-style result objects.
            score = getattr(r, "score", 0) if hasattr(r, "score") else r.get("score", 0)
            if prev_score is None or abs(score - prev_score) < threshold:
                current_group.append(r)
            else:
                if current_group:
                    groups.append(current_group)
                current_group = [r]
            prev_score = score

        if current_group:
            groups.append(current_group)

        # Shuffle within groups only, preserving coarse score ordering.
        reordered = []
        for group in groups:
            random.shuffle(group)
            reordered.extend(group)

        return reordered

    def truncate_snippets(self, snippet: str, max_len: int = 100) -> str:
        """Truncate *snippet* to roughly *max_len* chars to limit leakage.

        Tries to break on a word boundary within the last 20 characters
        before *max_len*; falls back to a hard cut.
        """
        if len(snippet) <= max_len:
            return snippet

        break_point = snippet.rfind(" ", max_len - 20, max_len)
        if break_point == -1:
            break_point = max_len

        return snippet[:break_point] + "..."
206
+
207
+
208
class PrivateSearchEngine:
    """Privacy-preserving search engine over files under *current_dir*.

    Combines token matching, per-file ACL checks, snippet truncation,
    and in-group rank randomization.
    """

    def __init__(self, mem_dir: Path, current_dir: Path):
        self.mem_dir = Path(mem_dir)
        self.current_dir = Path(current_dir)
        self.tokenizer = SearchTokenizer()
        self.access_control = AccessControl(mem_dir)
        self.dp_noise = DifferentialPrivacyNoise(epsilon=0.1)
        # In-memory audit log of hashed queries; see _log_query.
        self.query_log: List[Dict[str, Any]] = []

    def search(self, query: SearchQuery) -> List[SecureSearchResult]:
        """Perform a privacy-preserving search.

        Returns at most ``query.max_results`` results. Files the requester
        may not access appear as redacted entries (path and score only);
        files that cannot be read are skipped (best-effort scan).
        """
        results: List[SecureSearchResult] = []
        query_tokens = self.tokenizer.tokenize(query.query)

        for filepath in self.current_dir.rglob("*"):
            if not filepath.is_file():
                continue

            rel_path = str(filepath.relative_to(self.current_dir))

            try:
                content = filepath.read_text(encoding="utf-8", errors="replace")
                # PERF: membership testing was O(n) per token against a
                # list; a set makes each lookup O(1).
                content_tokens = set(self.tokenizer.tokenize(content))

                matches = sum(1 for t in query_tokens if t in content_tokens)
                if matches == 0:
                    # Also covers the empty-query case: zero matches, skip.
                    continue

                score = matches / len(query_tokens)

                # NOTE(review): the query's privacy_level is used as the
                # requester's clearance here — confirm that is intended.
                if self.access_control.can_access(
                    rel_path, query.requester_id, query.privacy_level
                ):
                    results.append(self._build_result(query, rel_path, score, content))
                else:
                    # Caller learns the file exists and matched, but not
                    # its content.
                    results.append(
                        SecureSearchResult(
                            path=rel_path,
                            score=score,
                            snippet=None,
                            privacy_level=query.privacy_level,
                            redacted=True,
                        )
                    )
            except Exception:
                # Best-effort scan: unreadable/malformed files are skipped.
                pass

        results.sort(key=lambda r: r.score, reverse=True)

        # Randomize ordering within similar-score groups for privacy,
        # working on the top 2*max_results candidates.
        results = self.dp_noise.randomize_order(results[: query.max_results * 2])

        self._log_query(query, len(results))

        return results[: query.max_results]

    def _build_result(
        self, query: SearchQuery, rel_path: str, score: float, content: str
    ) -> SecureSearchResult:
        """Build a non-redacted result, with a truncated snippet if requested."""
        snippet = None
        if query.include_content:
            # Snippet is a +/-50 char window around the first exact match.
            idx = content.lower().find(query.query.lower())
            if idx >= 0:
                start = max(0, idx - 50)
                end = min(len(content), idx + len(query.query) + 50)
                snippet = self.dp_noise.truncate_snippets(content[start:end])
        return SecureSearchResult(
            path=rel_path,
            score=score,
            snippet=snippet,
            privacy_level=query.privacy_level,
            redacted=False,
            accessed_at=datetime.now(timezone.utc).isoformat(),
        )

    def _log_query(self, query: SearchQuery, result_count: int) -> None:
        """Log a query for auditing, storing only a short hash of its text."""
        self.query_log.append(
            {
                "query_hash": hashlib.sha256(query.query.encode()).hexdigest()[:8],
                "requester": query.requester_id,
                "result_count": result_count,
                "timestamp": datetime.now(timezone.utc).isoformat(),
            }
        )

    def get_query_stats(self) -> Dict[str, Any]:
        """Return total query count and the ten most recent log entries."""
        return {
            "total_queries": len(self.query_log),
            "recent_queries": self.query_log[-10:],
        }
314
+
315
+
316
+ # --- Dashboard Helper ---
317
+
318
+
319
def get_private_search_stats(mem_dir: Path, current_dir: Path) -> Dict[str, Any]:
    """Get private-search statistics for dashboard display.

    Reuses the engine's own AccessControl instance; the previous version
    constructed a second AccessControl and re-read search_acl.json for no
    benefit.
    """
    engine = PrivateSearchEngine(mem_dir, current_dir)

    return {
        "query_stats": engine.get_query_stats(),
        # NOTE(review): reaches into the private _acl mapping; consider
        # adding a public AccessControl.entry_count() accessor.
        "acl_count": len(engine.access_control._acl),
    }