agmem 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
memvcs/core/delta.py CHANGED
@@ -9,8 +9,11 @@ This can achieve 5-10x compression improvement for highly similar content
  """

  import hashlib
+ from collections import defaultdict
  from typing import List, Tuple, Dict, Optional

+ from memvcs.core.fast_similarity import FastSimilarityMatcher
+

  def levenshtein_distance(s1: bytes, s2: bytes) -> int:
      """
@@ -75,34 +78,53 @@ def find_similar_objects(
      """
      candidates = {h: content for h, content in objects.items() if len(content) >= min_size}

-     if not candidates:
+     if len(candidates) < 2:
          return []

-     grouped = {}
-     used = set()
-
-     for hash_id, content in candidates.items():
-         if hash_id in used:
-             continue
-
-         group = [hash_id]
-         used.add(hash_id)
-
-         for other_id, other_content in candidates.items():
-             if other_id in used:
-                 continue
+     use_parallel = len(candidates) > 10
+     max_len = max(len(content) for content in candidates.values())
+     simhash_threshold = 64 if max_len < 256 else 15
+     matcher = FastSimilarityMatcher(
+         length_ratio_threshold=0.5,
+         simhash_threshold=simhash_threshold,
+         min_similarity=similarity_threshold,
+         use_parallel=use_parallel,
+         max_workers=None,
+     )
+
+     similar_pairs = matcher.find_similar_pairs(candidates)
+     if not similar_pairs:
+         return []

-             similarity = content_similarity(content, other_content)
-             if similarity >= similarity_threshold:
-                 group.append(other_id)
-                 used.add(other_id)
+     graph: Dict[str, set] = defaultdict(set)
+     for id1, id2, _score in similar_pairs:
+         graph[id1].add(id2)
+         graph[id2].add(id1)

-         if len(group) > 1:
-             # Sort by size ascending (smallest first = best base)
-             group.sort(key=lambda h: len(candidates[h]))
-             grouped[group[0]] = group
+     groups: List[List[str]] = []
+     visited = set()

-     return list(grouped.values())
+     for node in graph:
+         if node in visited:
+             continue
+         stack = [node]
+         component = []
+         visited.add(node)
+
+         while stack:
+             current = stack.pop()
+             component.append(current)
+             for neighbor in graph[current]:
+                 if neighbor in visited:
+                     continue
+                 visited.add(neighbor)
+                 stack.append(neighbor)
+
+         if len(component) > 1:
+             component.sort(key=lambda h: len(candidates[h]))
+             groups.append(component)
+
+     return groups


  def compute_delta(base: bytes, target: bytes) -> bytes:
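
The rewritten find_similar_objects no longer compares every candidate against every other candidate with content_similarity. It asks FastSimilarityMatcher.find_similar_pairs for (id1, id2, score) triples, then merges those pairs into groups by walking connected components with an iterative depth-first search, and finally sorts each group by content size so the smallest object comes first as the delta base. Below is a minimal standalone sketch of just the grouping step; the ids and scores in the example pair list are illustrative and not taken from the package:

    from collections import defaultdict
    from typing import Dict, List, Set, Tuple

    def group_pairs(pairs: List[Tuple[str, str, float]]) -> List[List[str]]:
        """Union (id1, id2, score) pairs into connected components via iterative DFS."""
        graph: Dict[str, Set[str]] = defaultdict(set)
        for id1, id2, _score in pairs:
            graph[id1].add(id2)
            graph[id2].add(id1)

        groups: List[List[str]] = []
        visited: Set[str] = set()
        for node in graph:
            if node in visited:
                continue
            stack, component = [node], []
            visited.add(node)
            while stack:
                current = stack.pop()
                component.append(current)
                # graph[current] - visited builds a fresh set, so it is safe
                # to mark neighbors visited while iterating over it.
                for neighbor in graph[current] - visited:
                    visited.add(neighbor)
                    stack.append(neighbor)
            if len(component) > 1:
                groups.append(component)
        return groups

    # Hypothetical pairs: a-b and b-c chain into one group, d-e forms another.
    pairs = [("a", "b", 0.9), ("b", "c", 0.85), ("d", "e", 0.95)]
    print(group_pairs(pairs))  # [['a', 'b', 'c'], ['d', 'e']]

Using an explicit stack rather than recursion keeps the traversal safe from Python's recursion limit when a long chain of similar objects links many candidates into one component.
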
@@ -0,0 +1,327 @@
+ """
+ Privacy-Preserving Search - Secure search with encryption and differential privacy.
+
+ This module provides:
+ - Encrypted search indices
+ - Differential privacy for queries
+ - Access control integration
+ - Secure search result handling
+ """
+
+ import hashlib
+ import hmac
+ import json
+ import os
+ import secrets
+ from dataclasses import dataclass, field
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Set, Tuple
+
+
+ @dataclass
+ class SearchQuery:
+     """A search query with privacy metadata."""
+
+     query: str
+     requester_id: str
+     privacy_level: str = "normal"  # "public", "normal", "sensitive", "secret"
+     max_results: int = 10
+     include_content: bool = False
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "query": self.query,
+             "requester_id": self.requester_id,
+             "privacy_level": self.privacy_level,
+             "max_results": self.max_results,
+             "include_content": self.include_content,
+         }
+
+
+ @dataclass
+ class SecureSearchResult:
+     """A search result with privacy handling."""
+
+     path: str
+     score: float
+     snippet: Optional[str] = None
+     accessed_at: Optional[str] = None
+     privacy_level: str = "normal"
+     redacted: bool = False
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "path": self.path,
+             "score": self.score,
+             "snippet": self.snippet,
+             "privacy_level": self.privacy_level,
+             "redacted": self.redacted,
+         }
+
+
+ class SearchTokenizer:
+     """Tokenizes and hashes search terms for privacy."""
+
+     def __init__(self, secret_key: Optional[bytes] = None):
+         self.secret_key = secret_key or secrets.token_bytes(32)
+
+     def tokenize(self, text: str) -> List[str]:
+         """Tokenize text into search terms."""
+         # Simple tokenization
+         import re
+
+         words = re.findall(r"\b\w+\b", text.lower())
+         return [w for w in words if len(w) >= 3]
+
+     def hash_token(self, token: str) -> str:
+         """Create a keyed hash of a token for blind search."""
+         return hmac.new(self.secret_key, token.encode(), hashlib.sha256).hexdigest()[:16]
+
+     def tokenize_and_hash(self, text: str) -> List[str]:
+         """Tokenize and hash all terms."""
+         tokens = self.tokenize(text)
+         return [self.hash_token(t) for t in tokens]
+
+
+ class AccessControl:
+     """Controls access to search results based on permissions."""
+
+     def __init__(self, mem_dir: Path):
+         self.mem_dir = Path(mem_dir)
+         self.acl_file = self.mem_dir / "search_acl.json"
+         self._acl: Dict[str, Dict[str, Any]] = {}
+         self._load()
+
+     def _load(self) -> None:
+         """Load ACL from disk."""
+         if self.acl_file.exists():
+             try:
+                 self._acl = json.loads(self.acl_file.read_text())
+             except Exception:
+                 pass
+
+     def _save(self) -> None:
+         """Save ACL to disk."""
+         self.mem_dir.mkdir(parents=True, exist_ok=True)
+         self.acl_file.write_text(json.dumps(self._acl, indent=2))
+
+     def set_file_access(
+         self,
+         path: str,
+         allowed_users: List[str],
+         privacy_level: str = "normal",
+     ) -> None:
+         """Set access control for a file."""
+         self._acl[path] = {
+             "allowed_users": allowed_users,
+             "privacy_level": privacy_level,
+             "updated_at": datetime.now(timezone.utc).isoformat(),
+         }
+         self._save()
+
+     def can_access(self, path: str, user_id: str, user_level: str = "normal") -> bool:
+         """Check if a user can access a file."""
+         acl = self._acl.get(path)
+         if not acl:
+             return True  # No ACL = public access
+
+         # Check explicit user list
+         if acl.get("allowed_users"):
+             if user_id not in acl["allowed_users"]:
+                 return False
+
+         # Check privacy level
+         level_order = ["public", "normal", "sensitive", "secret"]
+         file_level = acl.get("privacy_level", "normal")
+
+         try:
+             file_idx = level_order.index(file_level)
+             user_idx = level_order.index(user_level)
+             return user_idx >= file_idx
+         except ValueError:
+             return False
+
+     def get_file_acl(self, path: str) -> Optional[Dict[str, Any]]:
+         """Get ACL for a file."""
+         return self._acl.get(path)
+
+
+ class DifferentialPrivacyNoise:
+     """Adds differential privacy noise to search results."""
+
+     def __init__(self, epsilon: float = 0.1):
+         self.epsilon = epsilon
+
+     def add_laplace_noise(self, value: float, sensitivity: float = 1.0) -> float:
+         """Add Laplace noise for differential privacy."""
+         import random
+
+         scale = sensitivity / self.epsilon
+         u = random.random() - 0.5
+         noise = -scale * (1 if u >= 0 else -1) * (1 - 2 * abs(u))
+         return value + noise
+
+     def randomize_order(self, results: List[Any], threshold: float = 0.8) -> List[Any]:
+         """Randomly reorder similar results to add privacy."""
+         import random
+
+         # Group by similar scores
+         groups: List[List[Any]] = []
+         current_group: List[Any] = []
+         prev_score = None
+
+         for r in results:
+             score = getattr(r, "score", 0) if hasattr(r, "score") else r.get("score", 0)
+             if prev_score is None or abs(score - prev_score) < threshold:
+                 current_group.append(r)
+             else:
+                 if current_group:
+                     groups.append(current_group)
+                 current_group = [r]
+             prev_score = score
+
+         if current_group:
+             groups.append(current_group)
+
+         # Shuffle within groups
+         reordered = []
+         for group in groups:
+             random.shuffle(group)
+             reordered.extend(group)
+
+         return reordered
+
+     def truncate_snippets(self, snippet: str, max_len: int = 100) -> str:
+         """Truncate snippets to limit information leakage."""
+         if len(snippet) <= max_len:
+             return snippet
+
+         # Find a good break point
+         break_point = snippet.rfind(" ", max_len - 20, max_len)
+         if break_point == -1:
+             break_point = max_len
+
+         return snippet[:break_point] + "..."
+
+
+ class PrivateSearchEngine:
+     """Privacy-preserving search engine."""
+
+     def __init__(self, mem_dir: Path, current_dir: Path):
+         self.mem_dir = Path(mem_dir)
+         self.current_dir = Path(current_dir)
+         self.tokenizer = SearchTokenizer()
+         self.access_control = AccessControl(mem_dir)
+         self.dp_noise = DifferentialPrivacyNoise(epsilon=0.1)
+         self.query_log: List[Dict[str, Any]] = []
+
+     def search(self, query: SearchQuery) -> List[SecureSearchResult]:
+         """Perform a privacy-preserving search."""
+         results = []
+
+         # Token-based search
+         query_tokens = self.tokenizer.tokenize(query.query)
+
+         # Search through files
+         for filepath in self.current_dir.rglob("*"):
+             if not filepath.is_file():
+                 continue
+
+             rel_path = str(filepath.relative_to(self.current_dir))
+
+             try:
+                 content = filepath.read_text(encoding="utf-8", errors="replace")
+                 content_tokens = self.tokenizer.tokenize(content)
+
+                 # Simple scoring
+                 matches = sum(1 for t in query_tokens if t in content_tokens)
+                 if matches == 0:
+                     continue
+
+                 score = matches / len(query_tokens)
+
+                 # Check access control
+                 can_access = self.access_control.can_access(
+                     rel_path, query.requester_id, query.privacy_level
+                 )
+
+                 if not can_access:
+                     # Include redacted result
+                     results.append(
+                         SecureSearchResult(
+                             path=rel_path,
+                             score=score,
+                             snippet=None,
+                             privacy_level=query.privacy_level,
+                             redacted=True,
+                         )
+                     )
+                 else:
+                     # Include full result
+                     snippet = None
+                     if query.include_content:
+                         # Find snippet around first match
+                         query_lower = query.query.lower()
+                         idx = content.lower().find(query_lower)
+                         if idx >= 0:
+                             start = max(0, idx - 50)
+                             end = min(len(content), idx + len(query.query) + 50)
+                             snippet = content[start:end]
+                             snippet = self.dp_noise.truncate_snippets(snippet)
+
+                     results.append(
+                         SecureSearchResult(
+                             path=rel_path,
+                             score=score,
+                             snippet=snippet,
+                             privacy_level=query.privacy_level,
+                             redacted=False,
+                             accessed_at=datetime.now(timezone.utc).isoformat(),
+                         )
+                     )
+             except Exception:
+                 pass
+
+         # Sort by score
+         results.sort(key=lambda r: r.score, reverse=True)
+
+         # Apply differential privacy
+         results = self.dp_noise.randomize_order(results[: query.max_results * 2])
+
+         # Log query
+         self._log_query(query, len(results))
+
+         return results[: query.max_results]
+
+     def _log_query(self, query: SearchQuery, result_count: int) -> None:
+         """Log query for auditing (without preserving full query)."""
+         self.query_log.append(
+             {
+                 "query_hash": hashlib.sha256(query.query.encode()).hexdigest()[:8],
+                 "requester": query.requester_id,
+                 "result_count": result_count,
+                 "timestamp": datetime.now(timezone.utc).isoformat(),
+             }
+         )
+
+     def get_query_stats(self) -> Dict[str, Any]:
+         """Get query statistics."""
+         return {
+             "total_queries": len(self.query_log),
+             "recent_queries": self.query_log[-10:],
+         }
+
+
+ # --- Dashboard Helper ---
+
+
+ def get_private_search_stats(mem_dir: Path, current_dir: Path) -> Dict[str, Any]:
+     """Get private search statistics."""
+     engine = PrivateSearchEngine(mem_dir, current_dir)
+     access_control = AccessControl(mem_dir)
+
+     return {
+         "query_stats": engine.get_query_stats(),
+         "acl_count": len(access_control._acl),
+     }