agmem 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
memvcs/core/delta.py CHANGED
@@ -9,8 +9,11 @@ This can achieve 5-10x compression improvement for highly similar content
      """

  import hashlib
+ from collections import defaultdict
  from typing import List, Tuple, Dict, Optional

+ from memvcs.core.fast_similarity import FastSimilarityMatcher
+


  def levenshtein_distance(s1: bytes, s2: bytes) -> int:
      """
@@ -75,34 +78,53 @@ def find_similar_objects(
      """
      candidates = {h: content for h, content in objects.items() if len(content) >= min_size}

-     if not candidates:
+     if len(candidates) < 2:
          return []

-     grouped = {}
-     used = set()
-
-     for hash_id, content in candidates.items():
-         if hash_id in used:
-             continue
-
-         group = [hash_id]
-         used.add(hash_id)
-
-         for other_id, other_content in candidates.items():
-             if other_id in used:
-                 continue
+     use_parallel = len(candidates) > 10
+     max_len = max(len(content) for content in candidates.values())
+     simhash_threshold = 64 if max_len < 256 else 15
+     matcher = FastSimilarityMatcher(
+         length_ratio_threshold=0.5,
+         simhash_threshold=simhash_threshold,
+         min_similarity=similarity_threshold,
+         use_parallel=use_parallel,
+         max_workers=None,
+     )
+
+     similar_pairs = matcher.find_similar_pairs(candidates)
+     if not similar_pairs:
+         return []

-             similarity = content_similarity(content, other_content)
-             if similarity >= similarity_threshold:
-                 group.append(other_id)
-                 used.add(other_id)
+     graph: Dict[str, set] = defaultdict(set)
+     for id1, id2, _score in similar_pairs:
+         graph[id1].add(id2)
+         graph[id2].add(id1)

-         if len(group) > 1:
-             # Sort by size ascending (smallest first = best base)
-             group.sort(key=lambda h: len(candidates[h]))
-             grouped[group[0]] = group
+     groups: List[List[str]] = []
+     visited = set()

-     return list(grouped.values())
+     for node in graph:
+         if node in visited:
+             continue
+         stack = [node]
+         component = []
+         visited.add(node)
+
+         while stack:
+             current = stack.pop()
+             component.append(current)
+             for neighbor in graph[current]:
+                 if neighbor in visited:
+                     continue
+                 visited.add(neighbor)
+                 stack.append(neighbor)
+
+         if len(component) > 1:
+             component.sort(key=lambda h: len(candidates[h]))
+             groups.append(component)
+
+     return groups


  def compute_delta(base: bytes, target: bytes) -> bytes:
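For orientation, a minimal usage sketch of the rewritten find_similar_objects (toy data invented for illustration; it assumes min_size and similarity_threshold are keyword parameters, as the function body above suggests):

from memvcs.core.delta import find_similar_objects

# Two near-duplicate memory records plus one blob too small to qualify.
shared = b"episode: agent resolved ticket and updated the summary\n" * 40
objects = {
    "hash-a": shared + b"outcome: success",
    "hash-b": shared + b"outcome: retried",
    "hash-c": b"\x00\x01\x02" * 10,
}

# Each group is a list of object ids with the smallest content first, so the
# first entry is the intended delta base for the rest of the group.
for group in find_similar_objects(objects, min_size=64, similarity_threshold=0.8):
    base_id, *delta_ids = group
    print(base_id, "->", delta_ids)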
memvcs/core/distiller.py CHANGED
@@ -211,7 +211,6 @@ class Distiller:
          # Sample facts with noise - prevents any single episode from dominating
          import random

-         random.seed(42)  # Deterministic but different per cluster due to content
          sampled = random.sample(facts, min(noisy_count, len(facts)))

          # Optional: Add slight noise to fact embeddings if vector store available
@@ -233,17 +232,9 @@ class Distiller:
          out_path = self.target_dir / f"consolidated-{ts}.md"

          confidence_score = self.config.extraction_confidence_threshold
-         if (
-             self.config.use_dp
-             and self.config.dp_epsilon is not None
-             and self.config.dp_delta is not None
-         ):
-             from .privacy_budget import add_noise
-
-             confidence_score = add_noise(
-                 confidence_score, 0.1, self.config.dp_epsilon, self.config.dp_delta
-             )
-             confidence_score = max(0.0, min(1.0, confidence_score))
+         # Metadata noise removed: confidence_score is a metadata field (threshold setting),
+         # not an individual fact. Adding noise to metadata doesn't provide meaningful
+         # privacy guarantees. See privacy_validator.py for the distinction.

          frontmatter = {
              "schema_version": "1.0",
memvcs/core/fast_similarity.py ADDED
@@ -0,0 +1,404 @@
+ """
+ Fast similarity matching with tiered filtering.
+
+ Solves O(n²×m²) performance bottleneck in delta encoding by filtering
+ candidates before expensive Levenshtein distance computation.
+
+ Three-tier approach:
+ 1. Length-ratio filter: O(1) - skip if sizes differ >50%
+ 2. SimHash filter: O(n) - skip if approximate similarity below threshold
+ 3. Levenshtein distance: O(n×m) - only for candidates passing tiers 1-2
+ 4. Parallel processing: Multiprocessing for tier 3 across multiple cores
+
+ With 100 objects × 2KB each, filters typically eliminate 90%+ of pairs
+ before expensive distance computation, reducing 40B operations to <100M.
+ """
+
+ import hashlib
+ from typing import Dict, List, Tuple, Optional, Set, Any
+ from multiprocessing import Pool, cpu_count
+ from functools import partial
+ import math
+
+
+ class SimHashFilter:
+     """Fast approximate similarity using SimHash.
+
+     SimHash creates a 64-bit fingerprint of content that:
+     - Changes minimally for similar content
+     - Computes in O(n) time
+     - Allows Hamming distance for approximate matching
+
+     Papers: "Detecting Near-Duplicates for Web Crawling" (Charikar, 2002)
+     """
+
+     @staticmethod
+     def compute_hash(content: bytes, hash_bits: int = 64) -> int:
+         """Compute SimHash fingerprint for content.
+
+         Args:
+             content: Bytes to hash
+             hash_bits: Number of bits in fingerprint (default 64)
+
+         Returns:
+             SimHash fingerprint as integer
+         """
+         if not content:
+             return 0
+
+         # Initialize fingerprint vector
+         fingerprint = [0] * hash_bits
+
+         # Process content in 64-byte chunks
+         chunk_size = 64
+         for i in range(0, len(content), chunk_size):
+             chunk = content[i : i + chunk_size]
+             # Hash each chunk
+             h = hashlib.sha256(chunk).digest()
+             # Map hash bits to fingerprint
+             for bit_idx in range(hash_bits):
+                 byte_idx = bit_idx // 8
+                 bit_pos = bit_idx % 8
+                 if byte_idx < len(h):
+                     if (h[byte_idx] >> bit_pos) & 1:
+                         fingerprint[bit_idx] += 1
+                     else:
+                         fingerprint[bit_idx] -= 1
+
+         # Convert fingerprint to integer
+         result = 0
+         for i, v in enumerate(fingerprint):
+             if v > 0:
+                 result |= 1 << i
+
+         return result
+
+     @staticmethod
+     def hamming_distance(hash1: int, hash2: int) -> int:
+         """Compute Hamming distance between two SimHash fingerprints.
+
+         Args:
+             hash1: First SimHash fingerprint
+             hash2: Second SimHash fingerprint
+
+         Returns:
+             Hamming distance (0-64)
+         """
+         xor = hash1 ^ hash2
+         distance = 0
+         while xor:
+             distance += xor & 1
+             xor >>= 1
+         return distance
+
+
+ class FastSimilarityMatcher:
+     """Multi-tier similarity matching with progressive filtering.
+
+     Tiers:
+     1. Length-ratio filter (O(1)): Skip if object sizes differ >50%
+     2. SimHash filter (O(n)): Skip if Hamming distance indicates dissimilarity
+     3. Levenshtein distance (O(n×m)): Only for candidates passing tiers 1-2
+     4. Parallel processing: Use multiprocessing for tier 3 across CPU cores
+
+     Usage:
+         matcher = FastSimilarityMatcher(
+             length_ratio_threshold=0.5,
+             simhash_threshold=15,  # Hamming distance
+             min_similarity=0.8
+         )
+         similar_pairs = matcher.find_similar_pairs(objects_dict)
+     """
+
+     def __init__(
+         self,
+         length_ratio_threshold: float = 0.5,
+         simhash_threshold: int = 15,
+         min_similarity: float = 0.8,
+         use_parallel: bool = True,
+         max_workers: Optional[int] = None,
+     ):
+         """Initialize the similarity matcher.
+
+         Args:
+             length_ratio_threshold: Skip if |len(a) - len(b)| / max(len(a), len(b)) > threshold
+             simhash_threshold: Skip if SimHash Hamming distance > threshold
+             min_similarity: Minimum Levenshtein similarity required (0.0-1.0)
+             use_parallel: Whether to use multiprocessing for tier 3
+             max_workers: Max worker processes (defaults to CPU count)
+         """
+         self.length_ratio_threshold = length_ratio_threshold
+         self.simhash_threshold = simhash_threshold
+         self.min_similarity = min_similarity
+         self.use_parallel = use_parallel
+         self.max_workers = max_workers or cpu_count()
+
+         # Statistics for debugging/reporting
+         self.stats = {
+             "total_pairs": 0,
+             "filtered_tier1": 0,  # Length ratio
+             "filtered_tier2": 0,  # SimHash
+             "evaluated_tier3": 0,  # Levenshtein
+             "matches_found": 0,
+         }
+
+     def find_similar_pairs(self, objects: Dict[str, bytes]) -> List[Tuple[str, str, float]]:
+         """Find similar object pairs using tiered filtering.
+
+         Args:
+             objects: Dict mapping object_id -> content (bytes)
+
+         Returns:
+             List of (id1, id2, similarity_score) tuples, sorted by similarity (descending)
+         """
+         self.stats = {
+             "total_pairs": 0,
+             "filtered_tier1": 0,
+             "filtered_tier2": 0,
+             "evaluated_tier3": 0,
+             "matches_found": 0,
+         }
+
+         if len(objects) < 2:
+             return []
+
+         object_ids = list(objects.keys())
+         similar_pairs: List[Tuple[str, str, float]] = []
+
+         # Pre-compute SimHash for all objects (tier 2 pre-computation)
+         simhash_cache = {oid: SimHashFilter.compute_hash(objects[oid]) for oid in object_ids}
+
+         # Generate candidate pairs
+         candidates_for_tier3 = []
+
+         for i in range(len(object_ids)):
+             for j in range(i + 1, len(object_ids)):
+                 id1, id2 = object_ids[i], object_ids[j]
+                 content1, content2 = objects[id1], objects[id2]
+
+                 self.stats["total_pairs"] += 1
+
+                 # Tier 1: Length-ratio filter
+                 if not self._pass_length_filter(len(content1), len(content2)):
+                     self.stats["filtered_tier1"] += 1
+                     continue
+
+                 # Tier 2: SimHash filter
+                 hash1 = simhash_cache[id1]
+                 hash2 = simhash_cache[id2]
+                 if not self._pass_simhash_filter(hash1, hash2):
+                     self.stats["filtered_tier2"] += 1
+                     continue
+
+                 # Tier 3: These candidates need Levenshtein distance
+                 candidates_for_tier3.append((id1, id2, content1, content2))
+
+         # Tier 3: Levenshtein distance (parallel if enabled)
+         self.stats["evaluated_tier3"] = len(candidates_for_tier3)
+
+         if not candidates_for_tier3:
+             return []
+
+         if self.use_parallel and len(candidates_for_tier3) > 1:
+             similar_pairs = self._evaluate_tier3_parallel(candidates_for_tier3)
+         else:
+             similar_pairs = self._evaluate_tier3_serial(candidates_for_tier3)
+
+         # Sort by similarity (highest first)
+         similar_pairs.sort(key=lambda x: x[2], reverse=True)
+         self.stats["matches_found"] = len(similar_pairs)
+
+         return similar_pairs
+
+     def _pass_length_filter(self, len1: int, len2: int) -> bool:
+         """Check if two objects pass the length-ratio filter (tier 1).
+
+         Args:
+             len1: Length of first object
+             len2: Length of second object
+
+         Returns:
+             True if objects should be compared further, False if filtered out
+         """
+         if len1 == 0 or len2 == 0:
+             return len1 == len2
+
+         max_len = max(len1, len2)
+         min_len = min(len1, len2)
+         ratio = 1.0 - (min_len / max_len)
+
+         return ratio <= self.length_ratio_threshold
+
+     def _pass_simhash_filter(self, hash1: int, hash2: int) -> bool:
+         """Check if two objects pass the SimHash filter (tier 2).
+
+         Args:
+             hash1: SimHash fingerprint of first object
+             hash2: SimHash fingerprint of second object
+
+         Returns:
+             True if objects should be compared further, False if filtered out
+         """
+         distance = SimHashFilter.hamming_distance(hash1, hash2)
+         # Lower Hamming distance = more similar
+         return distance <= self.simhash_threshold
+
+     def _evaluate_tier3_serial(
+         self, candidates: List[Tuple[str, str, bytes, bytes]]
+     ) -> List[Tuple[str, str, float]]:
+         """Evaluate candidates using Levenshtein distance (serial).
+
+         Args:
+             candidates: List of (id1, id2, content1, content2) tuples
+
+         Returns:
+             List of (id1, id2, similarity_score) tuples where similarity >= min_similarity
+         """
+         results = []
+         for id1, id2, content1, content2 in candidates:
+             similarity = self._levenshtein_similarity(content1, content2)
+             if similarity >= self.min_similarity:
+                 results.append((id1, id2, similarity))
+         return results
+
+     def _evaluate_tier3_parallel(
+         self, candidates: List[Tuple[str, str, bytes, bytes]]
+     ) -> List[Tuple[str, str, float]]:
+         """Evaluate candidates using Levenshtein distance (parallel).
+
+         Args:
+             candidates: List of (id1, id2, content1, content2) tuples
+
+         Returns:
+             List of (id1, id2, similarity_score) tuples where similarity >= min_similarity
+         """
+         # Process pairs in parallel
+         with Pool(processes=self.max_workers) as pool:
+             results = pool.map(
+                 partial(
+                     _compute_similarity_worker,
+                     min_similarity=self.min_similarity,
+                 ),
+                 candidates,
+             )
+
+         # Filter out None results (pairs that didn't meet minimum similarity)
+         return [r for r in results if r is not None]
+
+     @staticmethod
+     def _levenshtein_similarity(s1: bytes, s2: bytes) -> float:
+         """Compute Levenshtein similarity (0.0-1.0).
+
+         Similarity = 1.0 - (distance / max_length)
+
+         Args:
+             s1: First byte sequence
+             s2: Second byte sequence
+
+         Returns:
+             Similarity score (0.0 = completely different, 1.0 = identical)
+         """
+         distance = _levenshtein_distance(s1, s2)
+         max_len = max(len(s1), len(s2))
+         if max_len == 0:
+             return 1.0
+         return 1.0 - (distance / max_len)
+
+     def get_statistics(self) -> Dict[str, Any]:
+         """Get filtering statistics.
+
+         Returns:
+             Dict with tier-by-tier breakdown of filtering effectiveness
+         """
+         total = self.stats["total_pairs"]
+         tier1_pct = (self.stats["filtered_tier1"] / total * 100) if total > 0 else 0
+         tier2_pct = (self.stats["filtered_tier2"] / total * 100) if total > 0 else 0
+
+         return {
+             "total_pairs_evaluated": total,
+             "filtered_tier1_length": {
+                 "count": self.stats["filtered_tier1"],
+                 "percentage": tier1_pct,
+             },
+             "filtered_tier2_simhash": {
+                 "count": self.stats["filtered_tier2"],
+                 "percentage": tier2_pct,
+             },
+             "evaluated_tier3_levenshtein": {
+                 "count": self.stats["evaluated_tier3"],
+                 "percentage": ((self.stats["evaluated_tier3"] / total * 100) if total > 0 else 0),
+             },
+             "matches_found": self.stats["matches_found"],
+         }
+
+     def log_statistics(self, logger=None) -> None:
+         """Log filtering statistics for debugging."""
+         stats = self.get_statistics()
+         output = [
+             "Similarity Matching Statistics",
+             "=" * 50,
+             f"Total pairs evaluated: {stats['total_pairs_evaluated']}",
+             f"Filtered (Tier 1 - Length): {stats['filtered_tier1_length']['count']} ({stats['filtered_tier1_length']['percentage']:.1f}%)",
+             f"Filtered (Tier 2 - SimHash): {stats['filtered_tier2_simhash']['count']} ({stats['filtered_tier2_simhash']['percentage']:.1f}%)",
+             f"Evaluated (Tier 3 - Levenshtein): {stats['evaluated_tier3_levenshtein']['count']} ({stats['evaluated_tier3_levenshtein']['percentage']:.1f}%)",
+             f"Similar pairs found: {stats['matches_found']}",
+             "=" * 50,
+         ]
+         full_output = "\n".join(output)
+         if logger:
+             logger.info(full_output)
+         else:
+             print(full_output)
+
+
+ def _levenshtein_distance(s1: bytes, s2: bytes) -> int:
+     """Compute Levenshtein distance between two byte sequences.
+
+     O(n×m) time complexity. Optimized for common cases.
+
+     Args:
+         s1: First byte sequence
+         s2: Second byte sequence
+
+     Returns:
+         Edit distance (minimum edits to transform s1 to s2)
+     """
+     if len(s1) < len(s2):
+         s1, s2 = s2, s1
+
+     if len(s2) == 0:
+         return len(s1)
+
+     # Use only two rows for space optimization
+     prev = list(range(len(s2) + 1))
+     for i, c1 in enumerate(s1):
+         curr = [i + 1]
+         for j, c2 in enumerate(s2):
+             insertions = prev[j + 1] + 1
+             deletions = curr[j] + 1
+             substitutions = prev[j] + (c1 != c2)
+             curr.append(min(insertions, deletions, substitutions))
+         prev = curr
+
+     return prev[-1]
+
+
+ def _compute_similarity_worker(
+     candidate: Tuple[str, str, bytes, bytes],
+     min_similarity: float,
+ ) -> Optional[Tuple[str, str, float]]:
+     """Worker function for parallel Levenshtein computation.
+
+     Args:
+         candidate: (id1, id2, content1, content2) tuple
+         min_similarity: Minimum similarity threshold
+
+     Returns:
+         (id1, id2, similarity) if similarity >= min_similarity, else None
+     """
+     id1, id2, content1, content2 = candidate
+     similarity = FastSimilarityMatcher._levenshtein_similarity(content1, content2)
+
+     if similarity >= min_similarity:
+         return (id1, id2, similarity)
+     return None
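To make the tier arithmetic and the API above concrete, a small usage sketch (toy data invented for illustration; only names defined in this new module are used):

from memvcs.core.fast_similarity import FastSimilarityMatcher, SimHashFilter

# Back-of-envelope for the docstring's claim: 100 objects give 100*99/2 = 4,950
# candidate pairs; a full Levenshtein pass on ~2 KB objects costs roughly
# 2048*2048 ≈ 4.2M cell updates per pair, i.e. tens of billions of operations
# if nothing is filtered first.

shared = b"agmem memory object payload " * 70   # ~2 KB of common content
objects = {
    "obj1": shared + b"revision 1",
    "obj2": shared + b"revision 2",   # near-duplicate of obj1
    "obj3": b"\x00" * 100,            # much smaller: should fail the length-ratio filter
}

matcher = FastSimilarityMatcher(min_similarity=0.8, use_parallel=False)
pairs = matcher.find_similar_pairs(objects)   # expected to report ("obj1", "obj2", ~1.0)
matcher.log_statistics()

# Tier 2 in isolation: near-duplicates should land well under the default
# Hamming-distance threshold of 15.
h1 = SimHashFilter.compute_hash(objects["obj1"])
h2 = SimHashFilter.compute_hash(objects["obj2"])
print(SimHashFilter.hamming_distance(h1, h2))

use_parallel=False keeps the sketch single-process; the parallel path spawns worker processes and, on platforms that use the spawn start method, needs the usual if __name__ == "__main__" guard.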
memvcs/core/federated.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
  from typing import Optional, List, Dict, Any

  from .config_loader import load_agmem_config
+ from .protocol_builder import ClientSummaryBuilder


  def get_federated_config(repo_root: Path) -> Optional[Dict[str, Any]]:
@@ -120,17 +121,27 @@ def produce_local_summary(


  def push_updates(repo_root: Path, summary: Dict[str, Any]) -> str:
-     """Send local summary to coordinator. Returns status message."""
+     """Send local summary to coordinator using protocol-compliant schema.
+
+     Uses ClientSummaryBuilder to ensure the summary conforms to the
+     server's PushRequest schema before transmission.
+
+     Returns status message."""
      cfg = get_federated_config(repo_root)
      if not cfg:
          return "Federated collaboration not configured"
      url = cfg["coordinator_url"] + "/push"
      try:
+         from .protocol_builder import ClientSummaryBuilder
+
+         # Build protocol-compliant summary
+         compliant_summary = ClientSummaryBuilder.build(repo_root, summary, strict_mode=False)
+
          import urllib.request

          req = urllib.request.Request(
              url,
-             data=json.dumps(summary).encode(),
+             data=json.dumps(compliant_summary).encode(),
              headers={"Content-Type": "application/json"},
              method="POST",
          )
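A hedged sketch of the client-side call after this change (the payload shape below is invented for illustration, and produce_local_summary's exact signature is not shown in this diff):

from pathlib import Path
from memvcs.core.federated import push_updates

repo_root = Path(".")
summary = {"facts": [], "stats": {"episodes": 3}}   # illustrative local summary

# push_updates now runs the summary through ClientSummaryBuilder.build(...,
# strict_mode=False) so the POSTed body matches the coordinator's PushRequest
# schema; without federated configuration it simply returns a status message.
status = push_updates(repo_root, summary)
print(status)   # e.g. "Federated collaboration not configured"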
memvcs/core/gardener.py CHANGED
@@ -354,28 +354,11 @@ class Gardener:
          except ValueError:
              insight_path = self.semantic_dir / f"insight-{timestamp}.md"

-         # Generate frontmatter (optionally noised for differential privacy)
+         # Generate frontmatter
          source_episodes = len(cluster.episodes)
-         if (
-             self.config.use_dp
-             and self.config.dp_epsilon is not None
-             and self.config.dp_delta is not None
-         ):
-             from .privacy_budget import add_noise
-
-             source_episodes = max(
-                 0,
-                 int(
-                     round(
-                         add_noise(
-                             float(source_episodes),
-                             1.0,
-                             self.config.dp_epsilon,
-                             self.config.dp_delta,
-                         )
-                     )
-                 ),
-             )
+         # Metadata noise removed: source_episodes is a metadata count (number of episodes
+         # contributing to this insight), not an individual fact. Adding noise to metadata
+         # doesn't provide meaningful privacy guarantees. See privacy_validator.py.
          frontmatter = {
              "schema_version": "1.0",
              "last_updated": datetime.utcnow().isoformat() + "Z",
@@ -514,53 +497,10 @@ class Gardener:
          clusters_found = len(clusters)
          insights_generated = insights_written
          episodes_archived = archived_count
-         if (
-             self.config.use_dp
-             and self.config.dp_epsilon is not None
-             and self.config.dp_delta is not None
-         ):
-             from .privacy_budget import add_noise
-
-             sensitivity = 1.0
-             clusters_found = max(
-                 0,
-                 int(
-                     round(
-                         add_noise(
-                             float(clusters_found),
-                             sensitivity,
-                             self.config.dp_epsilon,
-                             self.config.dp_delta,
-                         )
-                     )
-                 ),
-             )
-             insights_generated = max(
-                 0,
-                 int(
-                     round(
-                         add_noise(
-                             float(insights_generated),
-                             sensitivity,
-                             self.config.dp_epsilon,
-                             self.config.dp_delta,
-                         )
-                     )
-                 ),
-             )
-             episodes_archived = max(
-                 0,
-                 int(
-                     round(
-                         add_noise(
-                             float(episodes_archived),
-                             sensitivity,
-                             self.config.dp_epsilon,
-                             self.config.dp_delta,
-                         )
-                     )
-                 ),
-             )
+         # Metadata noise removed: clusters_found, insights_generated, and
+         # episodes_archived are metadata counts, not individual facts.
+         # Adding noise to these doesn't provide meaningful privacy guarantees.
+         # See privacy_validator.py for the distinction between metadata and facts.

          return GardenerResult(
              success=True,
memvcs/core/pack.py CHANGED
@@ -440,7 +440,7 @@ def run_repack(
          return (0, 0)
      if dry_run:
          return (len(hash_to_type), 0)
-     write_pack(objects_dir, store, hash_to_type)
+     write_pack_with_delta(objects_dir, store, hash_to_type)
      freed = 0
      for hash_id, obj_type in hash_to_type.items():
          p = store.objects_dir / obj_type / hash_id[:2] / hash_id[2:]
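run_repack now routes through write_pack_with_delta, whose body is not part of this diff. As a hedged sketch of how the delta.py helpers shown earlier could be combined for that purpose (delta_encode_groups is a hypothetical name, not package API):

from memvcs.core.delta import find_similar_objects, compute_delta

def delta_encode_groups(objects):
    """Hypothetical illustration: encode each group member as a delta against
    the group's smallest object, matching the smallest-first group ordering."""
    encoded = {}
    for group in find_similar_objects(objects, min_size=64, similarity_threshold=0.8):
        base_id, *member_ids = group
        for member_id in member_ids:
            encoded[member_id] = (base_id, compute_delta(objects[base_id], objects[member_id]))
    return encoded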