agmem 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/METADATA +5 -4
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/RECORD +17 -13
- memvcs/__init__.py +1 -1
- memvcs/cli.py +1 -1
- memvcs/coordinator/server.py +18 -2
- memvcs/core/compression_metrics.py +248 -0
- memvcs/core/distiller.py +3 -12
- memvcs/core/fast_similarity.py +404 -0
- memvcs/core/federated.py +13 -2
- memvcs/core/gardener.py +8 -68
- memvcs/core/pack.py +1 -1
- memvcs/core/privacy_validator.py +187 -0
- memvcs/core/protocol_builder.py +198 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/WHEEL +0 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/entry_points.txt +0 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.2.0.dist-info → agmem-0.2.1.dist-info}/top_level.txt +0 -0
memvcs/core/fast_similarity.py
ADDED
@@ -0,0 +1,404 @@
"""
Fast similarity matching with tiered filtering.

Solves O(n²×m²) performance bottleneck in delta encoding by filtering
candidates before expensive Levenshtein distance computation.

Three-tier approach:
1. Length-ratio filter: O(1) - skip if sizes differ >50%
2. SimHash filter: O(n) - skip if approximate similarity below threshold
3. Levenshtein distance: O(n×m) - only for candidates passing tiers 1-2
4. Parallel processing: Multiprocessing for tier 3 across multiple cores

With 100 objects × 2KB each, filters typically eliminate 90%+ of pairs
before expensive distance computation, reducing 40B operations to <100M.
"""

import hashlib
from typing import Dict, List, Tuple, Optional, Set, Any
from multiprocessing import Pool, cpu_count
from functools import partial
import math


class SimHashFilter:
    """Fast approximate similarity using SimHash.

    SimHash creates a 64-bit fingerprint of content that:
    - Changes minimally for similar content
    - Computes in O(n) time
    - Allows Hamming distance for approximate matching

    Papers: "Detecting Near-Duplicates for Web Crawling" (Charikar, 2002)
    """

    @staticmethod
    def compute_hash(content: bytes, hash_bits: int = 64) -> int:
        """Compute SimHash fingerprint for content.

        Args:
            content: Bytes to hash
            hash_bits: Number of bits in fingerprint (default 64)

        Returns:
            SimHash fingerprint as integer
        """
        if not content:
            return 0

        # Initialize fingerprint vector
        fingerprint = [0] * hash_bits

        # Process content in 64-byte chunks
        chunk_size = 64
        for i in range(0, len(content), chunk_size):
            chunk = content[i : i + chunk_size]
            # Hash each chunk
            h = hashlib.sha256(chunk).digest()
            # Map hash bits to fingerprint
            for bit_idx in range(hash_bits):
                byte_idx = bit_idx // 8
                bit_pos = bit_idx % 8
                if byte_idx < len(h):
                    if (h[byte_idx] >> bit_pos) & 1:
                        fingerprint[bit_idx] += 1
                    else:
                        fingerprint[bit_idx] -= 1

        # Convert fingerprint to integer
        result = 0
        for i, v in enumerate(fingerprint):
            if v > 0:
                result |= 1 << i

        return result

    @staticmethod
    def hamming_distance(hash1: int, hash2: int) -> int:
        """Compute Hamming distance between two SimHash fingerprints.

        Args:
            hash1: First SimHash fingerprint
            hash2: Second SimHash fingerprint

        Returns:
            Hamming distance (0-64)
        """
        xor = hash1 ^ hash2
        distance = 0
        while xor:
            distance += xor & 1
            xor >>= 1
        return distance


class FastSimilarityMatcher:
    """Multi-tier similarity matching with progressive filtering.

    Tiers:
    1. Length-ratio filter (O(1)): Skip if object sizes differ >50%
    2. SimHash filter (O(n)): Skip if Hamming distance indicates dissimilarity
    3. Levenshtein distance (O(n×m)): Only for candidates passing tiers 1-2
    4. Parallel processing: Use multiprocessing for tier 3 across CPU cores

    Usage:
        matcher = FastSimilarityMatcher(
            length_ratio_threshold=0.5,
            simhash_threshold=15,  # Hamming distance
            min_similarity=0.8
        )
        similar_pairs = matcher.find_similar_pairs(objects_dict)
    """

    def __init__(
        self,
        length_ratio_threshold: float = 0.5,
        simhash_threshold: int = 15,
        min_similarity: float = 0.8,
        use_parallel: bool = True,
        max_workers: Optional[int] = None,
    ):
        """Initialize the similarity matcher.

        Args:
            length_ratio_threshold: Skip if |len(a) - len(b)| / max(len(a), len(b)) > threshold
            simhash_threshold: Skip if SimHash Hamming distance > threshold
            min_similarity: Minimum Levenshtein similarity required (0.0-1.0)
            use_parallel: Whether to use multiprocessing for tier 3
            max_workers: Max worker processes (defaults to CPU count)
        """
        self.length_ratio_threshold = length_ratio_threshold
        self.simhash_threshold = simhash_threshold
        self.min_similarity = min_similarity
        self.use_parallel = use_parallel
        self.max_workers = max_workers or cpu_count()

        # Statistics for debugging/reporting
        self.stats = {
            "total_pairs": 0,
            "filtered_tier1": 0,  # Length ratio
            "filtered_tier2": 0,  # SimHash
            "evaluated_tier3": 0,  # Levenshtein
            "matches_found": 0,
        }

    def find_similar_pairs(self, objects: Dict[str, bytes]) -> List[Tuple[str, str, float]]:
        """Find similar object pairs using tiered filtering.

        Args:
            objects: Dict mapping object_id -> content (bytes)

        Returns:
            List of (id1, id2, similarity_score) tuples, sorted by similarity (descending)
        """
        self.stats = {
            "total_pairs": 0,
            "filtered_tier1": 0,
            "filtered_tier2": 0,
            "evaluated_tier3": 0,
            "matches_found": 0,
        }

        if len(objects) < 2:
            return []

        object_ids = list(objects.keys())
        similar_pairs: List[Tuple[str, str, float]] = []

        # Pre-compute SimHash for all objects (tier 2 pre-computation)
        simhash_cache = {oid: SimHashFilter.compute_hash(objects[oid]) for oid in object_ids}

        # Generate candidate pairs
        candidates_for_tier3 = []

        for i in range(len(object_ids)):
            for j in range(i + 1, len(object_ids)):
                id1, id2 = object_ids[i], object_ids[j]
                content1, content2 = objects[id1], objects[id2]

                self.stats["total_pairs"] += 1

                # Tier 1: Length-ratio filter
                if not self._pass_length_filter(len(content1), len(content2)):
                    self.stats["filtered_tier1"] += 1
                    continue

                # Tier 2: SimHash filter
                hash1 = simhash_cache[id1]
                hash2 = simhash_cache[id2]
                if not self._pass_simhash_filter(hash1, hash2):
                    self.stats["filtered_tier2"] += 1
                    continue

                # Tier 3: These candidates need Levenshtein distance
                candidates_for_tier3.append((id1, id2, content1, content2))

        # Tier 3: Levenshtein distance (parallel if enabled)
        self.stats["evaluated_tier3"] = len(candidates_for_tier3)

        if not candidates_for_tier3:
            return []

        if self.use_parallel and len(candidates_for_tier3) > 1:
            similar_pairs = self._evaluate_tier3_parallel(candidates_for_tier3)
        else:
            similar_pairs = self._evaluate_tier3_serial(candidates_for_tier3)

        # Sort by similarity (highest first)
        similar_pairs.sort(key=lambda x: x[2], reverse=True)
        self.stats["matches_found"] = len(similar_pairs)

        return similar_pairs

    def _pass_length_filter(self, len1: int, len2: int) -> bool:
        """Check if two objects pass the length-ratio filter (tier 1).

        Args:
            len1: Length of first object
            len2: Length of second object

        Returns:
            True if objects should be compared further, False if filtered out
        """
        if len1 == 0 or len2 == 0:
            return len1 == len2

        max_len = max(len1, len2)
        min_len = min(len1, len2)
        ratio = 1.0 - (min_len / max_len)

        return ratio <= self.length_ratio_threshold

    def _pass_simhash_filter(self, hash1: int, hash2: int) -> bool:
        """Check if two objects pass the SimHash filter (tier 2).

        Args:
            hash1: SimHash fingerprint of first object
            hash2: SimHash fingerprint of second object

        Returns:
            True if objects should be compared further, False if filtered out
        """
        distance = SimHashFilter.hamming_distance(hash1, hash2)
        # Lower Hamming distance = more similar
        return distance <= self.simhash_threshold

    def _evaluate_tier3_serial(
        self, candidates: List[Tuple[str, str, bytes, bytes]]
    ) -> List[Tuple[str, str, float]]:
        """Evaluate candidates using Levenshtein distance (serial).

        Args:
            candidates: List of (id1, id2, content1, content2) tuples

        Returns:
            List of (id1, id2, similarity_score) tuples where similarity >= min_similarity
        """
        results = []
        for id1, id2, content1, content2 in candidates:
            similarity = self._levenshtein_similarity(content1, content2)
            if similarity >= self.min_similarity:
                results.append((id1, id2, similarity))
        return results

    def _evaluate_tier3_parallel(
        self, candidates: List[Tuple[str, str, bytes, bytes]]
    ) -> List[Tuple[str, str, float]]:
        """Evaluate candidates using Levenshtein distance (parallel).

        Args:
            candidates: List of (id1, id2, content1, content2) tuples

        Returns:
            List of (id1, id2, similarity_score) tuples where similarity >= min_similarity
        """
        # Process pairs in parallel
        with Pool(processes=self.max_workers) as pool:
            results = pool.map(
                partial(
                    _compute_similarity_worker,
                    min_similarity=self.min_similarity,
                ),
                candidates,
            )

        # Filter out None results (pairs that didn't meet minimum similarity)
        return [r for r in results if r is not None]

    @staticmethod
    def _levenshtein_similarity(s1: bytes, s2: bytes) -> float:
        """Compute Levenshtein similarity (0.0-1.0).

        Similarity = 1.0 - (distance / max_length)

        Args:
            s1: First byte sequence
            s2: Second byte sequence

        Returns:
            Similarity score (0.0 = completely different, 1.0 = identical)
        """
        distance = _levenshtein_distance(s1, s2)
        max_len = max(len(s1), len(s2))
        if max_len == 0:
            return 1.0
        return 1.0 - (distance / max_len)

    def get_statistics(self) -> Dict[str, Any]:
        """Get filtering statistics.

        Returns:
            Dict with tier-by-tier breakdown of filtering effectiveness
        """
        total = self.stats["total_pairs"]
        tier1_pct = (self.stats["filtered_tier1"] / total * 100) if total > 0 else 0
        tier2_pct = (self.stats["filtered_tier2"] / total * 100) if total > 0 else 0

        return {
            "total_pairs_evaluated": total,
            "filtered_tier1_length": {
                "count": self.stats["filtered_tier1"],
                "percentage": tier1_pct,
            },
            "filtered_tier2_simhash": {
                "count": self.stats["filtered_tier2"],
                "percentage": tier2_pct,
            },
            "evaluated_tier3_levenshtein": {
                "count": self.stats["evaluated_tier3"],
                "percentage": ((self.stats["evaluated_tier3"] / total * 100) if total > 0 else 0),
            },
            "matches_found": self.stats["matches_found"],
        }

    def log_statistics(self, logger=None) -> None:
        """Log filtering statistics for debugging."""
        stats = self.get_statistics()
        output = [
            "Similarity Matching Statistics",
            "=" * 50,
            f"Total pairs evaluated: {stats['total_pairs_evaluated']}",
            f"Filtered (Tier 1 - Length): {stats['filtered_tier1_length']['count']} ({stats['filtered_tier1_length']['percentage']:.1f}%)",
            f"Filtered (Tier 2 - SimHash): {stats['filtered_tier2_simhash']['count']} ({stats['filtered_tier2_simhash']['percentage']:.1f}%)",
            f"Evaluated (Tier 3 - Levenshtein): {stats['evaluated_tier3_levenshtein']['count']} ({stats['evaluated_tier3_levenshtein']['percentage']:.1f}%)",
            f"Similar pairs found: {stats['matches_found']}",
            "=" * 50,
        ]
        full_output = "\n".join(output)
        if logger:
            logger.info(full_output)
        else:
            print(full_output)


def _levenshtein_distance(s1: bytes, s2: bytes) -> int:
    """Compute Levenshtein distance between two byte sequences.

    O(n×m) time complexity. Optimized for common cases.

    Args:
        s1: First byte sequence
        s2: Second byte sequence

    Returns:
        Edit distance (minimum edits to transform s1 to s2)
    """
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    # Use only two rows for space optimization
    prev = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        curr = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = prev[j + 1] + 1
            deletions = curr[j] + 1
            substitutions = prev[j] + (c1 != c2)
            curr.append(min(insertions, deletions, substitutions))
        prev = curr

    return prev[-1]


def _compute_similarity_worker(
    candidate: Tuple[str, str, bytes, bytes],
    min_similarity: float,
) -> Optional[Tuple[str, str, float]]:
    """Worker function for parallel Levenshtein computation.

    Args:
        candidate: (id1, id2, content1, content2) tuple
        min_similarity: Minimum similarity threshold

    Returns:
        (id1, id2, similarity) if similarity >= min_similarity, else None
    """
    id1, id2, content1, content2 = candidate
    similarity = FastSimilarityMatcher._levenshtein_similarity(content1, content2)

    if similarity >= min_similarity:
        return (id1, id2, similarity)
    return None
memvcs/core/federated.py
CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
 from typing import Optional, List, Dict, Any
 
 from .config_loader import load_agmem_config
+from .protocol_builder import ClientSummaryBuilder
 
 
 def get_federated_config(repo_root: Path) -> Optional[Dict[str, Any]]:
@@ -120,17 +121,27 @@ def produce_local_summary(
 
 
 def push_updates(repo_root: Path, summary: Dict[str, Any]) -> str:
-    """Send local summary to coordinator
+    """Send local summary to coordinator using protocol-compliant schema.
+
+    Uses ClientSummaryBuilder to ensure the summary conforms to the
+    server's PushRequest schema before transmission.
+
+    Returns status message."""
     cfg = get_federated_config(repo_root)
     if not cfg:
         return "Federated collaboration not configured"
     url = cfg["coordinator_url"] + "/push"
     try:
+        from .protocol_builder import ClientSummaryBuilder
+
+        # Build protocol-compliant summary
+        compliant_summary = ClientSummaryBuilder.build(repo_root, summary, strict_mode=False)
+
        import urllib.request

        req = urllib.request.Request(
            url,
-            data=json.dumps(
+            data=json.dumps(compliant_summary).encode(),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
memvcs/core/gardener.py
CHANGED
@@ -354,28 +354,11 @@ class Gardener:
         except ValueError:
             insight_path = self.semantic_dir / f"insight-{timestamp}.md"
 
-        # Generate frontmatter
+        # Generate frontmatter
         source_episodes = len(cluster.episodes)
-
-
-
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            source_episodes = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(source_episodes),
-                            1.0,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: source_episodes is a metadata count (number of episodes
+        # contributing to this insight), not an individual fact. Adding noise to metadata
+        # doesn't provide meaningful privacy guarantees. See privacy_validator.py.
         frontmatter = {
             "schema_version": "1.0",
             "last_updated": datetime.utcnow().isoformat() + "Z",
@@ -514,53 +497,10 @@ class Gardener:
         clusters_found = len(clusters)
         insights_generated = insights_written
         episodes_archived = archived_count
-
-
-
-
-        ):
-            from .privacy_budget import add_noise
-
-            sensitivity = 1.0
-            clusters_found = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(clusters_found),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            insights_generated = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(insights_generated),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            episodes_archived = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(episodes_archived),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Metadata noise removed: clusters_found, insights_generated, and
+        # episodes_archived are metadata counts, not individual facts.
+        # Adding noise to these doesn't provide meaningful privacy guarantees.
+        # See privacy_validator.py for the distinction between metadata and facts.
 
         return GardenerResult(
             success=True,
memvcs/core/pack.py
CHANGED
@@ -440,7 +440,7 @@ def run_repack(
         return (0, 0)
     if dry_run:
         return (len(hash_to_type), 0)
-
+    write_pack_with_delta(objects_dir, store, hash_to_type)
    freed = 0
    for hash_id, obj_type in hash_to_type.items():
        p = store.objects_dir / obj_type / hash_id[:2] / hash_id[2:]