agmem 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/METADATA +15 -8
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/RECORD +25 -16
- memvcs/__init__.py +1 -1
- memvcs/cli.py +1 -1
- memvcs/commands/daemon.py +37 -1
- memvcs/commands/distill.py +6 -0
- memvcs/coordinator/__init__.py +5 -0
- memvcs/coordinator/server.py +239 -0
- memvcs/core/compression_metrics.py +248 -0
- memvcs/core/delta.py +258 -0
- memvcs/core/distiller.py +76 -61
- memvcs/core/fast_similarity.py +404 -0
- memvcs/core/federated.py +13 -2
- memvcs/core/gardener.py +8 -68
- memvcs/core/pack.py +192 -34
- memvcs/core/privacy_validator.py +187 -0
- memvcs/core/protocol_builder.py +198 -0
- memvcs/core/remote.py +82 -2
- memvcs/core/zk_proofs.py +62 -5
- memvcs/health/__init__.py +25 -0
- memvcs/health/monitor.py +452 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/WHEEL +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/entry_points.txt +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/licenses/LICENSE +0 -0
- {agmem-0.1.6.dist-info → agmem-0.2.1.dist-info}/top_level.txt +0 -0
memvcs/core/distiller.py
CHANGED
@@ -20,6 +20,7 @@ except ImportError:
     YAML_AVAILABLE = False
 
 from .gardener import Gardener, GardenerConfig, EpisodeCluster
+from .compression_pipeline import CompressionPipeline
 
 
 @dataclass
@@ -35,6 +36,7 @@ class DistillerConfig:
     llm_provider: Optional[str] = None
     llm_model: Optional[str] = None
     create_safety_branch: bool = True
+    use_compression_pipeline: bool = True  # Enable compression preprocessing
     use_dp: bool = False
     dp_epsilon: Optional[float] = None
     dp_delta: Optional[float] = None
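For orientation, a minimal sketch of how the new flag sits alongside the other config fields visible in this hunk. The import path and the example values are assumptions; fields not shown here keep their defaults.

```python
# Minimal sketch, assuming the import path memvcs.core.distiller; example values only.
from memvcs.core.distiller import DistillerConfig

config = DistillerConfig(
    llm_provider="openai",            # optional; heuristic extraction is used when unset
    llm_model="gpt-4o-mini",          # example value, not prescribed by the package
    create_safety_branch=True,
    use_compression_pipeline=True,    # new in 0.2.1: pre-compress episodes before extraction
    use_dp=True,                      # apply differential privacy to extracted facts
    dp_epsilon=1.0,
    dp_delta=1e-5,
)
```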
@@ -82,6 +84,19 @@ class Distiller:
                 llm_model=self.config.llm_model,
             ),
         )
+        # Initialize compression pipeline for pre-processing
+        self.compression_pipeline = (
+            CompressionPipeline(
+                chunk_size=512,
+                use_sentences=True,
+                extract_facts=True,
+                dedup_hash=True,
+                vector_store=None,  # Can be wired to repo's vector store if available
+                tier_by_recency=True,
+            )
+            if self.config.use_compression_pipeline
+            else None
+        )
 
     def load_episodes_from(self, source_path: Path) -> List[Tuple[Path, str]]:
         """Load episodes from source directory."""
@@ -104,7 +119,7 @@ class Distiller:
         return self.gardener.cluster_episodes(episodes)
 
     def extract_facts(self, cluster: EpisodeCluster) -> List[str]:
-        """Extract factual statements from cluster via LLM or heuristics."""
+        """Extract factual statements from cluster via LLM or heuristics with optional compression."""
         contents = []
         for ep_path in cluster.episodes[:10]:
             try:
@@ -113,6 +128,15 @@ class Distiller:
                 continue
         combined = "\n---\n".join(contents)
 
+        # Apply compression pipeline if enabled (pre-processing before LLM)
+        if self.compression_pipeline:
+            try:
+                compressed_chunks = self.compression_pipeline.run(combined)
+                # Extract content from (content, hash, tier) tuples
+                combined = "\n".join([chunk[0] for chunk in compressed_chunks[:20]])
+            except Exception:
+                pass  # Fall back to uncompressed content
+
         if self.config.llm_provider and self.config.llm_model:
             try:
                 from .llm import get_provider
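The hunk above only relies on the shape of CompressionPipeline.run(): an iterable of (content, hash, tier) tuples whose first element is kept. A stand-in sketch of that contract, with hypothetical tier names, since compression_pipeline.py itself is not part of this diff:

```python
# Stand-in for CompressionPipeline.run(); mirrors only the tuple shape extract_facts() consumes.
import hashlib
from typing import List, Tuple

def fake_pipeline_run(text: str, chunk_size: int = 512) -> List[Tuple[str, str, str]]:
    chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
    out, seen = [], set()
    for idx, chunk in enumerate(chunks):
        digest = hashlib.sha256(chunk.encode()).hexdigest()
        if digest in seen:                    # mirrors dedup_hash=True: drop exact duplicates
            continue
        seen.add(digest)
        tier = "hot" if idx < 5 else "cold"   # mirrors tier_by_recency=True (tier names assumed)
        out.append((chunk, digest, tier))
    return out

# extract_facts() keeps only the content element of the first 20 tuples:
compressed_chunks = fake_pipeline_run("episode one\n---\nepisode two")
combined = "\n".join(chunk[0] for chunk in compressed_chunks[:20])
```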
@@ -136,9 +160,15 @@ class Distiller:
                 ],
                 max_tokens=500,
             )
-            return [
+            facts = [
                 line.strip() for line in text.splitlines() if line.strip().startswith("-")
             ][:15]
+
+            # Apply DP to actual facts (not metadata) if enabled
+            if self.config.use_dp and self.config.dp_epsilon and self.config.dp_delta:
+                facts = self._apply_dp_to_facts(facts)
+
+            return facts
         except Exception:
             pass
 
@@ -149,7 +179,45 @@ class Distiller:
             if len(line) > 20 and not line.startswith("#") and not line.startswith("-"):
                 if any(w in line.lower() for w in ["prefers", "likes", "uses", "learned", "user"]):
                     facts.append(f"- {line[:200]}")
-        return facts[:10] if facts else [f"- Learned about {cluster.topic}"]
+
+        result = facts[:10] if facts else [f"- Learned about {cluster.topic}"]
+
+        # Apply DP to fallback facts as well
+        if self.config.use_dp and self.config.dp_epsilon and self.config.dp_delta:
+            result = self._apply_dp_to_facts(result)
+
+        return result
+
+    def _apply_dp_to_facts(self, facts: List[str]) -> List[str]:
+        """
+        Apply differential privacy to actual facts (not metadata).
+        This ensures removing one episode produces statistically similar output.
+        Uses fact sampling with noise to limit individual episode influence.
+        """
+        if not facts:
+            return facts
+
+        from .privacy_budget import add_noise
+
+        # Add noise to fact count (sample with DP)
+        noisy_count = add_noise(
+            float(len(facts)),
+            sensitivity=1.0,
+            epsilon=self.config.dp_epsilon,
+            delta=self.config.dp_delta,
+        )
+        noisy_count = max(1, min(len(facts), int(round(noisy_count))))
+
+        # Sample facts with noise - prevents any single episode from dominating
+        import random
+
+        sampled = random.sample(facts, min(noisy_count, len(facts)))
+
+        # Optional: Add slight noise to fact embeddings if vector store available
+        # This would further obscure individual episode contributions
+        # For now, sampling provides basic DP guarantee
+
+        return sampled
 
     def write_consolidated(self, cluster: EpisodeCluster, facts: List[str]) -> Path:
         """Write consolidated semantic file."""
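To make the sampling step concrete, here is a hedged illustration of the noisy-count idea. The real add_noise lives in privacy_budget.py, which this diff does not show, so a plain Laplace mechanism stands in for it; the facts list is made-up sample data.

```python
# Illustration only: not the package's privacy_budget implementation.
import math
import random

def laplace_noise(value: float, sensitivity: float, epsilon: float) -> float:
    """Return value plus Laplace noise with scale = sensitivity / epsilon."""
    u = random.random() - 0.5                      # uniform in [-0.5, 0.5)
    scale = sensitivity / epsilon
    return value - scale * math.copysign(1.0, u) * math.log(max(1.0 - 2.0 * abs(u), 1e-12))

facts = [
    "- user prefers dark mode",
    "- user uses Python 3.12",
    "- learned the user likes terse answers",
]

# Noise the fact count, clamp it to a valid range, then sample that many facts.
noisy_count = laplace_noise(float(len(facts)), sensitivity=1.0, epsilon=1.0)
noisy_count = max(1, min(len(facts), int(round(noisy_count))))
sampled = random.sample(facts, noisy_count)        # bounds any single episode's influence
print(sampled)
```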
@@ -164,17 +232,9 @@ class Distiller:
         out_path = self.target_dir / f"consolidated-{ts}.md"
 
         confidence_score = self.config.extraction_confidence_threshold
-        if (
-            self.config.use_dp
-            and self.config.dp_epsilon is not None
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            confidence_score = add_noise(
-                confidence_score, 0.1, self.config.dp_epsilon, self.config.dp_delta
-            )
-            confidence_score = max(0.0, min(1.0, confidence_score))
+        # Metadata noise removed: confidence_score is a metadata field (threshold setting),
+        # not an individual fact. Adding noise to metadata doesn't provide meaningful
+        # privacy guarantees. See privacy_validator.py for the distinction.
         frontmatter = {
             "schema_version": "1.0",
             "last_updated": datetime.utcnow().isoformat() + "Z",
@@ -284,53 +344,8 @@ class Distiller:
         clusters_processed = len(clusters)
         facts_extracted = facts_count
         episodes_archived = archived
-        if (
-            self.config.use_dp
-            and self.config.dp_epsilon is not None
-            and self.config.dp_delta is not None
-        ):
-            from .privacy_budget import add_noise
-
-            sensitivity = 1.0
-            clusters_processed = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(clusters_processed),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            facts_extracted = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(facts_extracted),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
-            episodes_archived = max(
-                0,
-                int(
-                    round(
-                        add_noise(
-                            float(episodes_archived),
-                            sensitivity,
-                            self.config.dp_epsilon,
-                            self.config.dp_delta,
-                        )
-                    )
-                ),
-            )
+        # Note: DP is now applied to actual facts during extraction, not metadata.
+        # Metadata noise removed as it doesn't provide meaningful privacy guarantees.
 
         return DistillerResult(
             success=True,
memvcs/core/fast_similarity.py
ADDED
@@ -0,0 +1,404 @@
+"""
+Fast similarity matching with tiered filtering.
+
+Solves O(n²×m²) performance bottleneck in delta encoding by filtering
+candidates before expensive Levenshtein distance computation.
+
+Three-tier approach:
+1. Length-ratio filter: O(1) - skip if sizes differ >50%
+2. SimHash filter: O(n) - skip if approximate similarity below threshold
+3. Levenshtein distance: O(n×m) - only for candidates passing tiers 1-2
+4. Parallel processing: Multiprocessing for tier 3 across multiple cores
+
+With 100 objects × 2KB each, filters typically eliminate 90%+ of pairs
+before expensive distance computation, reducing 40B operations to <100M.
+"""
+
+import hashlib
+from typing import Dict, List, Tuple, Optional, Set, Any
+from multiprocessing import Pool, cpu_count
+from functools import partial
+import math
+
+
+class SimHashFilter:
+    """Fast approximate similarity using SimHash.
+
+    SimHash creates a 64-bit fingerprint of content that:
+    - Changes minimally for similar content
+    - Computes in O(n) time
+    - Allows Hamming distance for approximate matching
+
+    Papers: "Detecting Near-Duplicates for Web Crawling" (Charikar, 2002)
+    """
+
+    @staticmethod
+    def compute_hash(content: bytes, hash_bits: int = 64) -> int:
+        """Compute SimHash fingerprint for content.
+
+        Args:
+            content: Bytes to hash
+            hash_bits: Number of bits in fingerprint (default 64)
+
+        Returns:
+            SimHash fingerprint as integer
+        """
+        if not content:
+            return 0
+
+        # Initialize fingerprint vector
+        fingerprint = [0] * hash_bits
+
+        # Process content in 64-byte chunks
+        chunk_size = 64
+        for i in range(0, len(content), chunk_size):
+            chunk = content[i : i + chunk_size]
+            # Hash each chunk
+            h = hashlib.sha256(chunk).digest()
+            # Map hash bits to fingerprint
+            for bit_idx in range(hash_bits):
+                byte_idx = bit_idx // 8
+                bit_pos = bit_idx % 8
+                if byte_idx < len(h):
+                    if (h[byte_idx] >> bit_pos) & 1:
+                        fingerprint[bit_idx] += 1
+                    else:
+                        fingerprint[bit_idx] -= 1
+
+        # Convert fingerprint to integer
+        result = 0
+        for i, v in enumerate(fingerprint):
+            if v > 0:
+                result |= 1 << i
+
+        return result
+
+    @staticmethod
+    def hamming_distance(hash1: int, hash2: int) -> int:
+        """Compute Hamming distance between two SimHash fingerprints.
+
+        Args:
+            hash1: First SimHash fingerprint
+            hash2: Second SimHash fingerprint
+
+        Returns:
+            Hamming distance (0-64)
+        """
+        xor = hash1 ^ hash2
+        distance = 0
+        while xor:
+            distance += xor & 1
+            xor >>= 1
+        return distance
+
+
+class FastSimilarityMatcher:
+    """Multi-tier similarity matching with progressive filtering.
+
+    Tiers:
+    1. Length-ratio filter (O(1)): Skip if object sizes differ >50%
+    2. SimHash filter (O(n)): Skip if Hamming distance indicates dissimilarity
+    3. Levenshtein distance (O(n×m)): Only for candidates passing tiers 1-2
+    4. Parallel processing: Use multiprocessing for tier 3 across CPU cores
+
+    Usage:
+        matcher = FastSimilarityMatcher(
+            length_ratio_threshold=0.5,
+            simhash_threshold=15,  # Hamming distance
+            min_similarity=0.8
+        )
+        similar_pairs = matcher.find_similar_pairs(objects_dict)
+    """
+
+    def __init__(
+        self,
+        length_ratio_threshold: float = 0.5,
+        simhash_threshold: int = 15,
+        min_similarity: float = 0.8,
+        use_parallel: bool = True,
+        max_workers: Optional[int] = None,
+    ):
+        """Initialize the similarity matcher.
+
+        Args:
+            length_ratio_threshold: Skip if |len(a) - len(b)| / max(len(a), len(b)) > threshold
+            simhash_threshold: Skip if SimHash Hamming distance > threshold
+            min_similarity: Minimum Levenshtein similarity required (0.0-1.0)
+            use_parallel: Whether to use multiprocessing for tier 3
+            max_workers: Max worker processes (defaults to CPU count)
+        """
+        self.length_ratio_threshold = length_ratio_threshold
+        self.simhash_threshold = simhash_threshold
+        self.min_similarity = min_similarity
+        self.use_parallel = use_parallel
+        self.max_workers = max_workers or cpu_count()
+
+        # Statistics for debugging/reporting
+        self.stats = {
+            "total_pairs": 0,
+            "filtered_tier1": 0,  # Length ratio
+            "filtered_tier2": 0,  # SimHash
+            "evaluated_tier3": 0,  # Levenshtein
+            "matches_found": 0,
+        }
+
+    def find_similar_pairs(self, objects: Dict[str, bytes]) -> List[Tuple[str, str, float]]:
+        """Find similar object pairs using tiered filtering.
+
+        Args:
+            objects: Dict mapping object_id -> content (bytes)
+
+        Returns:
+            List of (id1, id2, similarity_score) tuples, sorted by similarity (descending)
+        """
+        self.stats = {
+            "total_pairs": 0,
+            "filtered_tier1": 0,
+            "filtered_tier2": 0,
+            "evaluated_tier3": 0,
+            "matches_found": 0,
+        }
+
+        if len(objects) < 2:
+            return []
+
+        object_ids = list(objects.keys())
+        similar_pairs: List[Tuple[str, str, float]] = []
+
+        # Pre-compute SimHash for all objects (tier 2 pre-computation)
+        simhash_cache = {oid: SimHashFilter.compute_hash(objects[oid]) for oid in object_ids}
+
+        # Generate candidate pairs
+        candidates_for_tier3 = []
+
+        for i in range(len(object_ids)):
+            for j in range(i + 1, len(object_ids)):
+                id1, id2 = object_ids[i], object_ids[j]
+                content1, content2 = objects[id1], objects[id2]
+
+                self.stats["total_pairs"] += 1
+
+                # Tier 1: Length-ratio filter
+                if not self._pass_length_filter(len(content1), len(content2)):
+                    self.stats["filtered_tier1"] += 1
+                    continue
+
+                # Tier 2: SimHash filter
+                hash1 = simhash_cache[id1]
+                hash2 = simhash_cache[id2]
+                if not self._pass_simhash_filter(hash1, hash2):
+                    self.stats["filtered_tier2"] += 1
+                    continue
+
+                # Tier 3: These candidates need Levenshtein distance
+                candidates_for_tier3.append((id1, id2, content1, content2))
+
+        # Tier 3: Levenshtein distance (parallel if enabled)
+        self.stats["evaluated_tier3"] = len(candidates_for_tier3)
+
+        if not candidates_for_tier3:
+            return []
+
+        if self.use_parallel and len(candidates_for_tier3) > 1:
+            similar_pairs = self._evaluate_tier3_parallel(candidates_for_tier3)
+        else:
+            similar_pairs = self._evaluate_tier3_serial(candidates_for_tier3)
+
+        # Sort by similarity (highest first)
+        similar_pairs.sort(key=lambda x: x[2], reverse=True)
+        self.stats["matches_found"] = len(similar_pairs)
+
+        return similar_pairs
+
+    def _pass_length_filter(self, len1: int, len2: int) -> bool:
+        """Check if two objects pass the length-ratio filter (tier 1).
+
+        Args:
+            len1: Length of first object
+            len2: Length of second object
+
+        Returns:
+            True if objects should be compared further, False if filtered out
+        """
+        if len1 == 0 or len2 == 0:
+            return len1 == len2
+
+        max_len = max(len1, len2)
+        min_len = min(len1, len2)
+        ratio = 1.0 - (min_len / max_len)
+
+        return ratio <= self.length_ratio_threshold
+
+    def _pass_simhash_filter(self, hash1: int, hash2: int) -> bool:
+        """Check if two objects pass the SimHash filter (tier 2).
+
+        Args:
+            hash1: SimHash fingerprint of first object
+            hash2: SimHash fingerprint of second object
+
+        Returns:
+            True if objects should be compared further, False if filtered out
+        """
+        distance = SimHashFilter.hamming_distance(hash1, hash2)
+        # Lower Hamming distance = more similar
+        return distance <= self.simhash_threshold
+
+    def _evaluate_tier3_serial(
+        self, candidates: List[Tuple[str, str, bytes, bytes]]
+    ) -> List[Tuple[str, str, float]]:
+        """Evaluate candidates using Levenshtein distance (serial).
+
+        Args:
+            candidates: List of (id1, id2, content1, content2) tuples
+
+        Returns:
+            List of (id1, id2, similarity_score) tuples where similarity >= min_similarity
+        """
+        results = []
+        for id1, id2, content1, content2 in candidates:
+            similarity = self._levenshtein_similarity(content1, content2)
+            if similarity >= self.min_similarity:
+                results.append((id1, id2, similarity))
+        return results
+
+    def _evaluate_tier3_parallel(
+        self, candidates: List[Tuple[str, str, bytes, bytes]]
+    ) -> List[Tuple[str, str, float]]:
+        """Evaluate candidates using Levenshtein distance (parallel).
+
+        Args:
+            candidates: List of (id1, id2, content1, content2) tuples
+
+        Returns:
+            List of (id1, id2, similarity_score) tuples where similarity >= min_similarity
+        """
+        # Process pairs in parallel
+        with Pool(processes=self.max_workers) as pool:
+            results = pool.map(
+                partial(
+                    _compute_similarity_worker,
+                    min_similarity=self.min_similarity,
+                ),
+                candidates,
+            )
+
+        # Filter out None results (pairs that didn't meet minimum similarity)
+        return [r for r in results if r is not None]
+
+    @staticmethod
+    def _levenshtein_similarity(s1: bytes, s2: bytes) -> float:
+        """Compute Levenshtein similarity (0.0-1.0).
+
+        Similarity = 1.0 - (distance / max_length)
+
+        Args:
+            s1: First byte sequence
+            s2: Second byte sequence
+
+        Returns:
+            Similarity score (0.0 = completely different, 1.0 = identical)
+        """
+        distance = _levenshtein_distance(s1, s2)
+        max_len = max(len(s1), len(s2))
+        if max_len == 0:
+            return 1.0
+        return 1.0 - (distance / max_len)
+
+    def get_statistics(self) -> Dict[str, Any]:
+        """Get filtering statistics.
+
+        Returns:
+            Dict with tier-by-tier breakdown of filtering effectiveness
+        """
+        total = self.stats["total_pairs"]
+        tier1_pct = (self.stats["filtered_tier1"] / total * 100) if total > 0 else 0
+        tier2_pct = (self.stats["filtered_tier2"] / total * 100) if total > 0 else 0
+
+        return {
+            "total_pairs_evaluated": total,
+            "filtered_tier1_length": {
+                "count": self.stats["filtered_tier1"],
+                "percentage": tier1_pct,
+            },
+            "filtered_tier2_simhash": {
+                "count": self.stats["filtered_tier2"],
+                "percentage": tier2_pct,
+            },
+            "evaluated_tier3_levenshtein": {
+                "count": self.stats["evaluated_tier3"],
+                "percentage": ((self.stats["evaluated_tier3"] / total * 100) if total > 0 else 0),
+            },
+            "matches_found": self.stats["matches_found"],
+        }
+
+    def log_statistics(self, logger=None) -> None:
+        """Log filtering statistics for debugging."""
+        stats = self.get_statistics()
+        output = [
+            "Similarity Matching Statistics",
+            "=" * 50,
+            f"Total pairs evaluated: {stats['total_pairs_evaluated']}",
+            f"Filtered (Tier 1 - Length): {stats['filtered_tier1_length']['count']} ({stats['filtered_tier1_length']['percentage']:.1f}%)",
+            f"Filtered (Tier 2 - SimHash): {stats['filtered_tier2_simhash']['count']} ({stats['filtered_tier2_simhash']['percentage']:.1f}%)",
+            f"Evaluated (Tier 3 - Levenshtein): {stats['evaluated_tier3_levenshtein']['count']} ({stats['evaluated_tier3_levenshtein']['percentage']:.1f}%)",
+            f"Similar pairs found: {stats['matches_found']}",
+            "=" * 50,
+        ]
+        full_output = "\n".join(output)
+        if logger:
+            logger.info(full_output)
+        else:
+            print(full_output)
+
+
+def _levenshtein_distance(s1: bytes, s2: bytes) -> int:
+    """Compute Levenshtein distance between two byte sequences.
+
+    O(n×m) time complexity. Optimized for common cases.
+
+    Args:
+        s1: First byte sequence
+        s2: Second byte sequence
+
+    Returns:
+        Edit distance (minimum edits to transform s1 to s2)
+    """
+    if len(s1) < len(s2):
+        s1, s2 = s2, s1
+
+    if len(s2) == 0:
+        return len(s1)
+
+    # Use only two rows for space optimization
+    prev = list(range(len(s2) + 1))
+    for i, c1 in enumerate(s1):
+        curr = [i + 1]
+        for j, c2 in enumerate(s2):
+            insertions = prev[j + 1] + 1
+            deletions = curr[j] + 1
+            substitutions = prev[j] + (c1 != c2)
+            curr.append(min(insertions, deletions, substitutions))
+        prev = curr
+
+    return prev[-1]
+
+
+def _compute_similarity_worker(
+    candidate: Tuple[str, str, bytes, bytes],
+    min_similarity: float,
+) -> Optional[Tuple[str, str, float]]:
+    """Worker function for parallel Levenshtein computation.
+
+    Args:
+        candidate: (id1, id2, content1, content2) tuple
+        min_similarity: Minimum similarity threshold
+
+    Returns:
+        (id1, id2, similarity) if similarity >= min_similarity, else None
+    """
+    id1, id2, content1, content2 = candidate
+    similarity = FastSimilarityMatcher._levenshtein_similarity(content1, content2)
+
+    if similarity >= min_similarity:
+        return (id1, id2, similarity)
+    return None
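A usage sketch for the new matcher, following the API defined in this file; the object ids and byte payloads are made-up sample data.

```python
from memvcs.core.fast_similarity import FastSimilarityMatcher

objects = {
    "obj-a": b"alpha beta gamma delta " * 40,
    "obj-b": b"alpha beta gamma delta " * 40 + b"epsilon",   # near-duplicate of obj-a
    "obj-c": b"completely unrelated payload " * 55,
}

matcher = FastSimilarityMatcher(
    length_ratio_threshold=0.5,   # tier 1: skip pairs whose sizes differ by more than 50%
    simhash_threshold=15,         # tier 2: skip pairs with Hamming distance above 15
    min_similarity=0.8,           # tier 3: keep pairs with Levenshtein similarity >= 0.8
    use_parallel=False,           # the serial path keeps this small example simple
)

pairs = matcher.find_similar_pairs(objects)   # [(id1, id2, similarity), ...], best match first
matcher.log_statistics()                      # prints the tier-by-tier filtering breakdown
```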
memvcs/core/federated.py
CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
 from typing import Optional, List, Dict, Any
 
 from .config_loader import load_agmem_config
+from .protocol_builder import ClientSummaryBuilder
 
 
 def get_federated_config(repo_root: Path) -> Optional[Dict[str, Any]]:
@@ -120,17 +121,27 @@ def produce_local_summary(
 
 
 def push_updates(repo_root: Path, summary: Dict[str, Any]) -> str:
-    """Send local summary to coordinator. Returns status message."""
+    """Send local summary to coordinator using protocol-compliant schema.
+
+    Uses ClientSummaryBuilder to ensure the summary conforms to the
+    server's PushRequest schema before transmission.
+
+    Returns status message."""
     cfg = get_federated_config(repo_root)
     if not cfg:
         return "Federated collaboration not configured"
     url = cfg["coordinator_url"] + "/push"
     try:
+        from .protocol_builder import ClientSummaryBuilder
+
+        # Build protocol-compliant summary
+        compliant_summary = ClientSummaryBuilder.build(repo_root, summary, strict_mode=False)
+
         import urllib.request
 
         req = urllib.request.Request(
             url,
-            data=json.dumps(summary).encode(),
+            data=json.dumps(compliant_summary).encode(),
             headers={"Content-Type": "application/json"},
             method="POST",
         )